diff options
Diffstat (limited to 'fs')
418 files changed, 14223 insertions, 10473 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index 141a856c50e7..a7749c126b8e 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -240,6 +240,21 @@ config HUGETLBFS config HUGETLB_PAGE def_bool HUGETLBFS +config HUGETLB_PAGE_FREE_VMEMMAP + def_bool HUGETLB_PAGE + depends on X86_64 + depends on SPARSEMEM_VMEMMAP + +config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON + bool "Default freeing vmemmap pages of HugeTLB to on" + default n + depends on HUGETLB_PAGE_FREE_VMEMMAP + help + When using HUGETLB_PAGE_FREE_VMEMMAP, the freeing unused vmemmap + pages associated with each HugeTLB page is default off. Say Y here + to enable freeing vmemmap pages of HugeTLB by default. It can then + be disabled on the command line via hugetlb_free_vmemmap=off. + config MEMFD_CREATE def_bool TMPFS || HUGETLBFS diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index fb7ee026d101..adbb3a1edcbf 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -73,6 +73,7 @@ static sector_t _adfs_bmap(struct address_space *mapping, sector_t block) } static const struct address_space_operations adfs_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = adfs_readpage, .writepage = adfs_writepage, .write_begin = adfs_write_begin, diff --git a/fs/affs/file.c b/fs/affs/file.c index d91b0133d95d..75ebd2b576ca 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -453,6 +453,7 @@ static sector_t _affs_bmap(struct address_space *mapping, sector_t block) } const struct address_space_operations affs_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = affs_readpage, .writepage = affs_writepage, .write_begin = affs_write_begin, @@ -833,6 +834,7 @@ err_bh: } const struct address_space_operations affs_aops_ofs = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = affs_readpage_ofs, //.writepage = affs_writepage_ofs, .write_begin = affs_write_begin_ofs, diff --git a/fs/afs/main.c b/fs/afs/main.c index b2975256dadb..179004b15566 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -203,8 +203,8 @@ static int 
__init afs_init(void) goto error_fs; afs_proc_symlink = proc_symlink("fs/afs", NULL, "../self/net/afs"); - if (IS_ERR(afs_proc_symlink)) { - ret = PTR_ERR(afs_proc_symlink); + if (!afs_proc_symlink) { + ret = -ENOMEM; goto error_proc; } diff --git a/fs/afs/write.c b/fs/afs/write.c index 3edb6204b937..3104b62c2082 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -118,6 +118,15 @@ int afs_write_end(struct file *file, struct address_space *mapping, _enter("{%llx:%llu},{%lx}", vnode->fid.vid, vnode->fid.vnode, page->index); + if (!PageUptodate(page)) { + if (copied < len) { + copied = 0; + goto out; + } + + SetPageUptodate(page); + } + if (copied == 0) goto out; @@ -132,8 +141,6 @@ int afs_write_end(struct file *file, struct address_space *mapping, write_sequnlock(&vnode->cb_lock); } - ASSERT(PageUptodate(page)); - if (PagePrivate(page)) { priv = page_private(page); f = afs_page_dirty_from(page, priv); @@ -730,7 +737,7 @@ static int afs_writepages_region(struct address_space *mapping, return ret; } - start += ret * PAGE_SIZE; + start += ret; cond_resched(); } while (wbc->nr_to_write > 0); @@ -837,6 +844,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) struct inode *inode = file_inode(file); struct afs_vnode *vnode = AFS_FS_I(inode); unsigned long priv; + vm_fault_t ret = VM_FAULT_RETRY; _enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, page->index); @@ -848,14 +856,14 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) #ifdef CONFIG_AFS_FSCACHE if (PageFsCache(page) && wait_on_page_fscache_killable(page) < 0) - return VM_FAULT_RETRY; + goto out; #endif if (wait_on_page_writeback_killable(page)) - return VM_FAULT_RETRY; + goto out; if (lock_page_killable(page) < 0) - return VM_FAULT_RETRY; + goto out; /* We mustn't change page->private until writeback is complete as that * details the portion of the page we need to write back and we might @@ -863,7 +871,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) */ if 
(wait_on_page_writeback_killable(page) < 0) { unlock_page(page); - return VM_FAULT_RETRY; + goto out; } priv = afs_page_dirty(page, 0, thp_size(page)); @@ -877,8 +885,10 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) } file_update_time(file); + ret = VM_FAULT_LOCKED; +out: sb_end_pagefault(inode->i_sb); - return VM_FAULT_LOCKED; + return ret; } /* diff --git a/fs/bfs/file.c b/fs/bfs/file.c index 0dceefc54b48..7f8544abf636 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -188,6 +188,7 @@ static sector_t bfs_bmap(struct address_space *mapping, sector_t block) } const struct address_space_operations bfs_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = bfs_readpage, .writepage = bfs_writepage, .write_begin = bfs_write_begin, diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 3e84e9bb9084..145917f734fe 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -222,7 +222,7 @@ static int load_aout_binary(struct linux_binprm * bprm) error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, PROT_READ | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE, + MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, fd_offset); if (error != N_TXTADDR(ex)) @@ -230,7 +230,7 @@ static int load_aout_binary(struct linux_binprm * bprm) error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data, PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE, + MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, fd_offset + ex.a_text); if (error != N_DATADDR(ex)) return error; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 187b3f2b9202..439ed81e755a 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1070,7 +1070,7 @@ out_free_interp: elf_prot = make_prot(elf_ppnt->p_flags, &arch_state, !!interpreter, false); - elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE; + elf_flags = MAP_PRIVATE | MAP_DENYWRITE; vaddr = elf_ppnt->p_vaddr; /* @@ -1537,7 +1537,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, { 
const struct cred *cred; unsigned int i, len; - + unsigned int state; + /* first copy the parameters from user space */ memset(psinfo, 0, sizeof(struct elf_prpsinfo)); @@ -1559,7 +1560,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_pgrp = task_pgrp_vnr(p); psinfo->pr_sid = task_session_vnr(p); - i = p->state ? ffz(~p->state) + 1 : 0; + state = READ_ONCE(p->__state); + i = state ? ffz(~state) + 1 : 0; psinfo->pr_state = i; psinfo->pr_sname = (i > 5) ? '.' : "RSDTZW"[i]; psinfo->pr_zomb = psinfo->pr_sname == 'Z'; @@ -1571,7 +1573,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid)); rcu_read_unlock(); strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); - + return 0; } diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 2c99b102c860..cf4028487dcc 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -928,7 +928,7 @@ static int elf_fdpic_map_file_constdisp_on_uclinux( { struct elf32_fdpic_loadseg *seg; struct elf32_phdr *phdr; - unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0, mflags; + unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0; int loop, ret; load_addr = params->load_addr; @@ -948,12 +948,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux( } /* allocate one big anon block for everything */ - mflags = MAP_PRIVATE; - if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE) - mflags |= MAP_EXECUTABLE; - maddr = vm_mmap(NULL, load_addr, top - base, - PROT_READ | PROT_WRITE | PROT_EXEC, mflags, 0); + PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE, 0); if (IS_ERR_VALUE(maddr)) return (int) maddr; @@ -1046,9 +1042,6 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, if (phdr->p_flags & PF_X) prot |= PROT_EXEC; flags = MAP_PRIVATE | MAP_DENYWRITE; - if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE) - flags |= MAP_EXECUTABLE; - maddr = 0; switch 
(params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) { @@ -1331,6 +1324,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, { const struct cred *cred; unsigned int i, len; + unsigned int state; /* first copy the parameters from user space */ memset(psinfo, 0, sizeof(struct elf_prpsinfo)); @@ -1353,7 +1347,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_pgrp = task_pgrp_vnr(p); psinfo->pr_sid = task_session_vnr(p); - i = p->state ? ffz(~p->state) + 1 : 0; + state = READ_ONCE(p->__state); + i = state ? ffz(~state) + 1 : 0; psinfo->pr_state = i; psinfo->pr_sname = (i > 5) ? '.' : "RSDTZW"[i]; psinfo->pr_zomb = psinfo->pr_sname == 'Z'; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index a1072c6a2341..5d776f80ee50 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -573,7 +573,7 @@ static int load_flat_file(struct linux_binprm *bprm, pr_debug("ROM mapping of file (we hope)\n"); textpos = vm_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, - MAP_PRIVATE|MAP_EXECUTABLE, 0); + MAP_PRIVATE, 0); if (!textpos || IS_ERR_VALUE(textpos)) { ret = textpos; if (!textpos) diff --git a/fs/block_dev.c b/fs/block_dev.c index e215da6d49b4..0c424a0cadaa 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -895,7 +895,6 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) mapping_set_gfp_mask(&inode->i_data, GFP_USER); bdev = I_BDEV(inode); - mutex_init(&bdev->bd_mutex); mutex_init(&bdev->bd_fsfreeze_mutex); spin_lock_init(&bdev->bd_size_lock); bdev->bd_disk = disk; @@ -1154,7 +1153,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) struct bd_holder_disk *holder; int ret = 0; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&bdev->bd_disk->open_mutex); WARN_ON_ONCE(!bdev->bd_holder); @@ -1199,7 +1198,7 @@ out_del: out_free: kfree(holder); out_unlock: - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); return ret; } EXPORT_SYMBOL_GPL(bd_link_disk_holder); @@ 
-1218,7 +1217,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { struct bd_holder_disk *holder; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&bdev->bd_disk->open_mutex); holder = bd_find_holder_disk(bdev, disk); @@ -1230,138 +1229,97 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) kfree(holder); } - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); } EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); #endif -static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); +static void blkdev_flush_mapping(struct block_device *bdev) +{ + WARN_ON_ONCE(bdev->bd_holders); + sync_blockdev(bdev); + kill_bdev(bdev); + bdev_write_inode(bdev); +} -int bdev_disk_changed(struct block_device *bdev, bool invalidate) +static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) { struct gendisk *disk = bdev->bd_disk; int ret = 0; - lockdep_assert_held(&bdev->bd_mutex); - - if (!(disk->flags & GENHD_FL_UP)) - return -ENXIO; - -rescan: - if (bdev->bd_part_count) - return -EBUSY; - sync_blockdev(bdev); - invalidate_bdev(bdev); - blk_drop_partitions(disk); - - clear_bit(GD_NEED_PART_SCAN, &disk->state); - - /* - * Historically we only set the capacity to zero for devices that - * support partitions (independ of actually having partitions created). - * Doing that is rather inconsistent, but changing it broke legacy - * udisks polling for legacy ide-cdrom devices. Use the crude check - * below to get the sane behavior for most device while not breaking - * userspace for this particular setup. 
- */ - if (invalidate) { - if (disk_part_scan_enabled(disk) || - !(disk->flags & GENHD_FL_REMOVABLE)) - set_capacity(disk, 0); + if (disk->fops->open) { + ret = disk->fops->open(bdev, mode); + if (ret) { + /* avoid ghost partitions on a removed medium */ + if (ret == -ENOMEDIUM && + test_bit(GD_NEED_PART_SCAN, &disk->state)) + bdev_disk_changed(disk, true); + return ret; + } } - if (get_capacity(disk)) { - ret = blk_add_partitions(disk, bdev); - if (ret == -EAGAIN) - goto rescan; - } else if (invalidate) { - /* - * Tell userspace that the media / partition table may have - * changed. - */ - kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); + if (!bdev->bd_openers) { + set_init_blocksize(bdev); + if (bdev->bd_bdi == &noop_backing_dev_info) + bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); } + if (test_bit(GD_NEED_PART_SCAN, &disk->state)) + bdev_disk_changed(disk, false); + bdev->bd_openers++; + return 0;; +} - return ret; +static void blkdev_put_whole(struct block_device *bdev, fmode_t mode) +{ + if (!--bdev->bd_openers) + blkdev_flush_mapping(bdev); + if (bdev->bd_disk->fops->release) + bdev->bd_disk->fops->release(bdev->bd_disk, mode); } -/* - * Only exported for loop and dasd for historic reasons. Don't use in new - * code! 
- */ -EXPORT_SYMBOL_GPL(bdev_disk_changed); -/* - * bd_mutex locking: - * - * mutex_lock(part->bd_mutex) - * mutex_lock_nested(whole->bd_mutex, 1) - */ -static int __blkdev_get(struct block_device *bdev, fmode_t mode) +static int blkdev_get_part(struct block_device *part, fmode_t mode) { - struct gendisk *disk = bdev->bd_disk; - int ret = 0; + struct gendisk *disk = part->bd_disk; + struct block_device *whole; + int ret; - if (!(disk->flags & GENHD_FL_UP)) - return -ENXIO; + if (part->bd_openers) + goto done; - if (!bdev->bd_openers) { - if (!bdev_is_partition(bdev)) { - ret = 0; - if (disk->fops->open) - ret = disk->fops->open(bdev, mode); + whole = bdgrab(disk->part0); + ret = blkdev_get_whole(whole, mode); + if (ret) + goto out_put_whole; - if (!ret) - set_init_blocksize(bdev); + ret = -ENXIO; + if (!bdev_nr_sectors(part)) + goto out_blkdev_put; - /* - * If the device is invalidated, rescan partition - * if open succeeded or failed with -ENOMEDIUM. - * The latter is necessary to prevent ghost - * partitions on a removed medium. 
- */ - if (test_bit(GD_NEED_PART_SCAN, &disk->state) && - (!ret || ret == -ENOMEDIUM)) - bdev_disk_changed(bdev, ret == -ENOMEDIUM); + disk->open_partitions++; + set_init_blocksize(part); + if (part->bd_bdi == &noop_backing_dev_info) + part->bd_bdi = bdi_get(disk->queue->backing_dev_info); +done: + part->bd_openers++; + return 0; - if (ret) - return ret; - } else { - struct block_device *whole = bdgrab(disk->part0); - - mutex_lock_nested(&whole->bd_mutex, 1); - ret = __blkdev_get(whole, mode); - if (ret) { - mutex_unlock(&whole->bd_mutex); - bdput(whole); - return ret; - } - whole->bd_part_count++; - mutex_unlock(&whole->bd_mutex); +out_blkdev_put: + blkdev_put_whole(whole, mode); +out_put_whole: + bdput(whole); + return ret; +} - if (!bdev_nr_sectors(bdev)) { - __blkdev_put(whole, mode, 1); - bdput(whole); - return -ENXIO; - } - set_init_blocksize(bdev); - } +static void blkdev_put_part(struct block_device *part, fmode_t mode) +{ + struct block_device *whole = bdev_whole(part); - if (bdev->bd_bdi == &noop_backing_dev_info) - bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); - } else { - if (!bdev_is_partition(bdev)) { - if (bdev->bd_disk->fops->open) - ret = bdev->bd_disk->fops->open(bdev, mode); - /* the same as first opener case, read comment there */ - if (test_bit(GD_NEED_PART_SCAN, &disk->state) && - (!ret || ret == -ENOMEDIUM)) - bdev_disk_changed(bdev, ret == -ENOMEDIUM); - if (ret) - return ret; - } - } - bdev->bd_openers++; - return 0; + if (--part->bd_openers) + return; + blkdev_flush_mapping(part); + whole->bd_disk->open_partitions--; + blkdev_put_whole(whole, mode); + bdput(whole); } struct block_device *blkdev_get_no_open(dev_t dev) @@ -1447,8 +1405,14 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) disk_block_events(disk); - mutex_lock(&bdev->bd_mutex); - ret =__blkdev_get(bdev, mode); + mutex_lock(&disk->open_mutex); + ret = -ENXIO; + if (!(disk->flags & GENHD_FL_UP)) + goto abort_claiming; + if 
(bdev_is_partition(bdev)) + ret = blkdev_get_part(bdev, mode); + else + ret = blkdev_get_whole(bdev, mode); if (ret) goto abort_claiming; if (mode & FMODE_EXCL) { @@ -1467,7 +1431,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) unblock_events = false; } } - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&disk->open_mutex); if (unblock_events) disk_unblock_events(disk); @@ -1476,7 +1440,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) abort_claiming: if (mode & FMODE_EXCL) bd_abort_claiming(bdev, holder); - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&disk->open_mutex); disk_unblock_events(disk); put_blkdev: blkdev_put_no_open(bdev); @@ -1551,10 +1515,9 @@ static int blkdev_open(struct inode * inode, struct file * filp) return 0; } -static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) +void blkdev_put(struct block_device *bdev, fmode_t mode) { struct gendisk *disk = bdev->bd_disk; - struct block_device *victim = NULL; /* * Sync early if it looks like we're the last one. 
If someone else @@ -1566,41 +1529,14 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) if (bdev->bd_openers == 1) sync_blockdev(bdev); - mutex_lock_nested(&bdev->bd_mutex, for_part); - if (for_part) - bdev->bd_part_count--; - - if (!--bdev->bd_openers) { - WARN_ON_ONCE(bdev->bd_holders); - sync_blockdev(bdev); - kill_bdev(bdev); - bdev_write_inode(bdev); - if (bdev_is_partition(bdev)) - victim = bdev_whole(bdev); - } - - if (!bdev_is_partition(bdev) && disk->fops->release) - disk->fops->release(disk, mode); - mutex_unlock(&bdev->bd_mutex); - if (victim) { - __blkdev_put(victim, mode, 1); - bdput(victim); - } -} - -void blkdev_put(struct block_device *bdev, fmode_t mode) -{ - struct gendisk *disk = bdev->bd_disk; - - mutex_lock(&bdev->bd_mutex); - + mutex_lock(&disk->open_mutex); if (mode & FMODE_EXCL) { struct block_device *whole = bdev_whole(bdev); bool bdev_free; /* * Release a claim on the device. The holder fields - * are protected with bdev_lock. bd_mutex is to + * are protected with bdev_lock. open_mutex is to * synchronize disk_holder unlinking. */ spin_lock(&bdev_lock); @@ -1631,9 +1567,13 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) * from userland - e.g. eject(1). */ disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE); - mutex_unlock(&bdev->bd_mutex); - __blkdev_put(bdev, mode, 0); + if (bdev_is_partition(bdev)) + blkdev_put_part(bdev, mode); + else + blkdev_put_whole(bdev, mode); + mutex_unlock(&disk->open_mutex); + blkdev_put_no_open(bdev); } EXPORT_SYMBOL(blkdev_put); @@ -1669,7 +1609,7 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) * Does not take i_mutex for the write and thus is not for general purpose * use. 
*/ -ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) +static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *bd_inode = bdev_file_inode(file); @@ -1707,9 +1647,8 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) blk_finish_plug(&plug); return ret; } -EXPORT_SYMBOL_GPL(blkdev_write_iter); -ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) +static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct inode *bd_inode = bdev_file_inode(file); @@ -1731,7 +1670,6 @@ ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) iov_iter_reexpand(to, iov_iter_count(to) + shorted); return ret; } -EXPORT_SYMBOL_GPL(blkdev_read_iter); static int blkdev_writepages(struct address_space *mapping, struct writeback_control *wbc) @@ -1740,6 +1678,7 @@ static int blkdev_writepages(struct address_space *mapping, } static const struct address_space_operations def_blk_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = blkdev_readpage, .readahead = blkdev_readahead, .writepage = blkdev_writepage, @@ -1925,10 +1864,10 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) old_inode = inode; bdev = I_BDEV(inode); - mutex_lock(&bdev->bd_mutex); + mutex_lock(&bdev->bd_disk->open_mutex); if (bdev->bd_openers) func(bdev, arg); - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); spin_lock(&blockdev_superblock->s_inode_list_lock); } diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 68b95ad82126..520a0f6a7d9e 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -18,6 +18,8 @@ config BTRFS_FS select RAID6_PQ select XOR_BLOCKS select SRCU + depends on !PPC_256K_PAGES # powerpc + depends on !PAGE_SIZE_256KB # hexagon help Btrfs is a general purpose copy-on-write filesystem with extents, diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c 
index 117d423fdb93..7a8a2fc19533 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -2675,7 +2675,7 @@ static int handle_direct_tree_backref(struct btrfs_backref_cache *cache, * * @ref_key: The same as @ref_key in handle_direct_tree_backref() * @tree_key: The first key of this tree block. - * @path: A clean (released) path, to avoid allocating path everytime + * @path: A clean (released) path, to avoid allocating path every time * the function get called. */ static int handle_indirect_tree_backref(struct btrfs_backref_cache *cache, diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index aa57bdc8fc89..38b127b9edfc 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1399,7 +1399,6 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) btrfs_space_info_update_bytes_pinned(fs_info, space_info, -block_group->pinned); space_info->bytes_readonly += block_group->pinned; - __btrfs_mod_total_bytes_pinned(space_info, -block_group->pinned); block_group->pinned = 0; spin_unlock(&block_group->lock); @@ -1491,7 +1490,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) container_of(work, struct btrfs_fs_info, reclaim_bgs_work); struct btrfs_block_group *bg; struct btrfs_space_info *space_info; - int ret; + LIST_HEAD(again_list); if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; @@ -1502,6 +1501,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) mutex_lock(&fs_info->reclaim_bgs_lock); spin_lock(&fs_info->unused_bgs_lock); while (!list_empty(&fs_info->reclaim_bgs)) { + int ret = 0; + bg = list_first_entry(&fs_info->reclaim_bgs, struct btrfs_block_group, bg_list); @@ -1547,9 +1548,13 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) bg->start); next: - btrfs_put_block_group(bg); spin_lock(&fs_info->unused_bgs_lock); + if (ret == -EAGAIN && list_empty(&bg->bg_list)) + list_add_tail(&bg->bg_list, &again_list); + else + btrfs_put_block_group(bg); } + list_splice_tail(&again_list, &fs_info->reclaim_bgs); 
spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_exclop_finish(fs_info); @@ -2442,16 +2447,16 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) spin_lock(&sinfo->lock); spin_lock(&cache->lock); if (!--cache->ro) { - num_bytes = cache->length - cache->reserved - - cache->pinned - cache->bytes_super - - cache->zone_unusable - cache->used; - sinfo->bytes_readonly -= num_bytes; if (btrfs_is_zoned(cache->fs_info)) { /* Migrate zone_unusable bytes back */ cache->zone_unusable = cache->alloc_offset - cache->used; sinfo->bytes_zone_unusable += cache->zone_unusable; sinfo->bytes_readonly -= cache->zone_unusable; } + num_bytes = cache->length - cache->reserved - + cache->pinned - cache->bytes_super - + cache->zone_unusable - cache->used; + sinfo->bytes_readonly -= num_bytes; list_del_init(&cache->ro_list); } spin_unlock(&cache->lock); @@ -2505,7 +2510,7 @@ static int cache_save_setup(struct btrfs_block_group *block_group, struct extent_changeset *data_reserved = NULL; u64 alloc_hint = 0; int dcs = BTRFS_DC_ERROR; - u64 num_pages = 0; + u64 cache_size = 0; int retries = 0; int ret = 0; @@ -2617,20 +2622,20 @@ again: * taking up quite a bit since it's not folded into the other space * cache. 
*/ - num_pages = div_u64(block_group->length, SZ_256M); - if (!num_pages) - num_pages = 1; + cache_size = div_u64(block_group->length, SZ_256M); + if (!cache_size) + cache_size = 1; - num_pages *= 16; - num_pages *= PAGE_SIZE; + cache_size *= 16; + cache_size *= fs_info->sectorsize; ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0, - num_pages); + cache_size); if (ret) goto out_put; - ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, - num_pages, num_pages, + ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size, + cache_size, cache_size, &alloc_hint); /* * Our cache requires contiguous chunks so that we don't modify a bunch @@ -3062,8 +3067,6 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - __btrfs_mod_total_bytes_pinned(cache->space_info, - num_bytes); set_extent_dirty(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 1346d698463a..9a023ae0f98b 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -149,7 +149,7 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, const u32 csum_size = fs_info->csum_size; const u32 sectorsize = fs_info->sectorsize; struct page *page; - unsigned long i; + unsigned int i; char *kaddr; u8 csum[BTRFS_CSUM_SIZE]; struct compressed_bio *cb = bio->bi_private; @@ -208,7 +208,7 @@ static void end_compressed_bio_read(struct bio *bio) struct compressed_bio *cb = bio->bi_private; struct inode *inode; struct page *page; - unsigned long index; + unsigned int index; unsigned int mirror = btrfs_io_bio(bio)->mirror_num; int ret = 0; @@ -334,7 +334,7 @@ static void end_compressed_bio_write(struct bio *bio) struct compressed_bio *cb = bio->bi_private; struct inode *inode; struct page *page; - unsigned long index; + unsigned int index; if (bio->bi_status) 
cb->errors = 1; @@ -349,12 +349,10 @@ static void end_compressed_bio_write(struct bio *bio) * call back into the FS and do all the end_io operations */ inode = cb->inode; - cb->compressed_pages[0]->mapping = cb->inode->i_mapping; btrfs_record_physical_zoned(inode, cb->start, bio); - btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0], + btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL, cb->start, cb->start + cb->len - 1, bio->bi_status == BLK_STS_OK); - cb->compressed_pages[0]->mapping = NULL; end_compressed_writeback(inode, cb); /* note, our inode could be gone now */ @@ -387,10 +385,10 @@ out: * the end io hooks. */ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, - unsigned long len, u64 disk_start, - unsigned long compressed_len, + unsigned int len, u64 disk_start, + unsigned int compressed_len, struct page **compressed_pages, - unsigned long nr_pages, + unsigned int nr_pages, unsigned int write_flags, struct cgroup_subsys_state *blkcg_css) { @@ -427,24 +425,16 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, bio->bi_end_io = end_compressed_bio_write; if (use_append) { - struct extent_map *em; - struct map_lookup *map; - struct block_device *bdev; + struct btrfs_device *device; - em = btrfs_get_chunk_map(fs_info, disk_start, PAGE_SIZE); - if (IS_ERR(em)) { + device = btrfs_zoned_get_device(fs_info, disk_start, PAGE_SIZE); + if (IS_ERR(device)) { kfree(cb); bio_put(bio); return BLK_STS_NOTSUPP; } - map = em->map_lookup; - /* We only support single profile for now */ - ASSERT(map->num_stripes == 1); - bdev = map->stripes[0].dev->bdev; - - bio_set_dev(bio, bdev); - free_extent_map(em); + bio_set_dev(bio, device->bdev); } if (blkcg_css) { @@ -515,7 +505,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, } if (bytes_left < PAGE_SIZE) { btrfs_info(fs_info, - "bytes left %lu compress len %lu nr %lu", + "bytes left %lu compress len %u nr %u", 
bytes_left, cb->compressed_len, cb->nr_pages); } bytes_left -= PAGE_SIZE; @@ -677,9 +667,9 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map_tree *em_tree; struct compressed_bio *cb; - unsigned long compressed_len; - unsigned long nr_pages; - unsigned long pg_index; + unsigned int compressed_len; + unsigned int nr_pages; + unsigned int pg_index; struct page *page; struct bio *comp_bio; u64 cur_disk_byte = bio->bi_iter.bi_sector << 9; @@ -1202,9 +1192,6 @@ static unsigned int btrfs_compress_set_level(int type, unsigned level) * * @total_out is an in/out parameter, must be set to the input length and will * be also used to return the total number of compressed bytes - * - * @max_out tells us the max number of bytes that we're allowed to - * stuff into pages */ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, u64 start, struct page **pages, @@ -1225,20 +1212,6 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, return ret; } -/* - * pages_in is an array of pages with compressed data. - * - * disk_start is the starting logical offset of this array in the file - * - * orig_bio contains the pages from the file that we want to decompress into - * - * srclen is the number of bytes in pages_in - * - * The basic idea is that we have a bio that was created by readpages. - * The pages in the bio are for the uncompressed data, and they may not - * be contiguous. They all correspond to the range of bytes covered by - * the compressed extent. 
- */ static int btrfs_decompress_bio(struct compressed_bio *cb) { struct list_head *workspace; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 8001b700ea3a..c359f20920d0 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -31,6 +31,9 @@ struct compressed_bio { /* number of bios pending for this compressed extent */ refcount_t pending_bios; + /* Number of compressed pages in the array */ + unsigned int nr_pages; + /* the pages with the compressed data on them */ struct page **compressed_pages; @@ -40,20 +43,17 @@ struct compressed_bio { /* starting offset in the inode for our pages */ u64 start; - /* number of bytes in the inode we're working on */ - unsigned long len; - - /* number of bytes on disk */ - unsigned long compressed_len; + /* Number of bytes in the inode we're working on */ + unsigned int len; - /* the compression algorithm for this bio */ - int compress_type; + /* Number of bytes on disk */ + unsigned int compressed_len; - /* number of compressed pages in the array */ - unsigned long nr_pages; + /* The compression algorithm for this bio */ + u8 compress_type; /* IO errors */ - int errors; + u8 errors; int mirror_num; /* for reads, this is the bio we are copying the data into */ @@ -91,10 +91,10 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, struct bio *bio); blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, - unsigned long len, u64 disk_start, - unsigned long compressed_len, + unsigned int len, u64 disk_start, + unsigned int compressed_len, struct page **compressed_pages, - unsigned long nr_pages, + unsigned int nr_pages, unsigned int write_flags, struct cgroup_subsys_state *blkcg_css); blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a484fb72a01f..4bc3ca2cbd7d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -596,7 +596,6 @@ noinline int btrfs_cow_block(struct 
btrfs_trans_handle *trans, trans->transid, fs_info->generation); if (!should_cow_block(trans, root, buf)) { - trans->dirty = true; *cow_ret = buf; return 0; } @@ -1788,10 +1787,8 @@ again: * then we don't want to set the path blocking, * so we test it here */ - if (!should_cow_block(trans, root, b)) { - trans->dirty = true; + if (!should_cow_block(trans, root, b)) goto cow_done; - } /* * must have write locks on this node and the diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9fb76829a281..e5e53e592d4f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -561,10 +561,16 @@ enum { /* * Indicate that balance has been set up from the ioctl and is in the * main phase. The fs_info::balance_ctl is initialized. - * Set and cleared while holding fs_info::balance_mutex. */ BTRFS_FS_BALANCE_RUNNING, + /* + * Indicate that relocation of a chunk has started, it's set per chunk + * and is toggled between chunks. + * Set, tested and cleared while holding fs_info::send_reloc_lock. + */ + BTRFS_FS_RELOC_RUNNING, + /* Indicate that the cleaner thread is awake and doing something. */ BTRFS_FS_CLEANER_RUNNING, @@ -817,8 +823,6 @@ struct btrfs_fs_info { struct kobject *space_info_kobj; struct kobject *qgroups_kobj; - u64 total_pinned; - /* used to keep from writing metadata until there is a nice batch */ struct percpu_counter dirty_metadata_bytes; struct percpu_counter delalloc_bytes; @@ -871,6 +875,9 @@ struct btrfs_fs_info { struct btrfs_balance_control *balance_ctl; wait_queue_head_t balance_wait_q; + /* Cancellation requests for chunk relocation */ + atomic_t reloc_cancel_req; + u32 data_chunk_allocations; u32 metadata_ratio; @@ -986,14 +993,15 @@ struct btrfs_fs_info { struct crypto_shash *csum_shash; + spinlock_t send_reloc_lock; /* * Number of send operations in progress. - * Updated while holding fs_info::balance_mutex. + * Updated while holding fs_info::send_reloc_lock. 
*/ int send_in_progress; - /* Type of exclusive operation running */ - unsigned long exclusive_operation; + /* Type of exclusive operation running, protected by super_lock */ + enum btrfs_exclusive_operation exclusive_operation; /* * Zone size > 0 when in ZONED mode, otherwise it's used for a check @@ -1375,38 +1383,39 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) * * Note: don't forget to add new options to btrfs_show_options() */ -#define BTRFS_MOUNT_NODATASUM (1 << 0) -#define BTRFS_MOUNT_NODATACOW (1 << 1) -#define BTRFS_MOUNT_NOBARRIER (1 << 2) -#define BTRFS_MOUNT_SSD (1 << 3) -#define BTRFS_MOUNT_DEGRADED (1 << 4) -#define BTRFS_MOUNT_COMPRESS (1 << 5) -#define BTRFS_MOUNT_NOTREELOG (1 << 6) -#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) -#define BTRFS_MOUNT_SSD_SPREAD (1 << 8) -#define BTRFS_MOUNT_NOSSD (1 << 9) -#define BTRFS_MOUNT_DISCARD_SYNC (1 << 10) -#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11) -#define BTRFS_MOUNT_SPACE_CACHE (1 << 12) -#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13) -#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) -#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) -#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) -/* bit 17 is free */ -#define BTRFS_MOUNT_USEBACKUPROOT (1 << 18) -#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19) -#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20) -#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) -#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) -#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23) -#define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24) -#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25) -#define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26) -#define BTRFS_MOUNT_NOLOGREPLAY (1 << 27) -#define BTRFS_MOUNT_REF_VERIFY (1 << 28) -#define BTRFS_MOUNT_DISCARD_ASYNC (1 << 29) -#define BTRFS_MOUNT_IGNOREBADROOTS (1 << 30) -#define BTRFS_MOUNT_IGNOREDATACSUMS (1 << 31) +enum { + BTRFS_MOUNT_NODATASUM = (1UL << 0), + BTRFS_MOUNT_NODATACOW = (1UL << 1), + BTRFS_MOUNT_NOBARRIER = (1UL << 2), + 
BTRFS_MOUNT_SSD = (1UL << 3), + BTRFS_MOUNT_DEGRADED = (1UL << 4), + BTRFS_MOUNT_COMPRESS = (1UL << 5), + BTRFS_MOUNT_NOTREELOG = (1UL << 6), + BTRFS_MOUNT_FLUSHONCOMMIT = (1UL << 7), + BTRFS_MOUNT_SSD_SPREAD = (1UL << 8), + BTRFS_MOUNT_NOSSD = (1UL << 9), + BTRFS_MOUNT_DISCARD_SYNC = (1UL << 10), + BTRFS_MOUNT_FORCE_COMPRESS = (1UL << 11), + BTRFS_MOUNT_SPACE_CACHE = (1UL << 12), + BTRFS_MOUNT_CLEAR_CACHE = (1UL << 13), + BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1UL << 14), + BTRFS_MOUNT_ENOSPC_DEBUG = (1UL << 15), + BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16), + BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17), + BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18), + BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19), + BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20), + BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21), + BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22), + BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23), + BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24), + BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25), + BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26), + BTRFS_MOUNT_REF_VERIFY = (1UL << 27), + BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28), + BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29), + BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30), +}; #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (2048) @@ -2216,11 +2225,13 @@ BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item, static inline bool btrfs_root_readonly(const struct btrfs_root *root) { + /* Byte-swap the constant at compile time, root_item::flags is LE */ return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; } static inline bool btrfs_root_dead(const struct btrfs_root *root) { + /* Byte-swap the constant at compile time, root_item::flags is LE */ return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; } @@ -2746,9 +2757,9 @@ enum btrfs_reserve_flush_enum { /* * Flush space by above mentioned methods and by: * - Running delayed iputs - * - Commiting transaction + * - Committing transaction * - * Can be 
interruped by fatal signal. + * Can be interrupted by a fatal signal. */ BTRFS_RESERVE_FLUSH_DATA, BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE, @@ -2758,7 +2769,7 @@ enum btrfs_reserve_flush_enum { * Pretty much the same as FLUSH_ALL, but can also steal space from * global rsv. * - * Can be interruped by fatal signal. + * Can be interrupted by a fatal signal. */ BTRFS_RESERVE_FLUSH_ALL_STEAL, }; @@ -2774,7 +2785,6 @@ enum btrfs_flush_state { ALLOC_CHUNK_FORCE = 8, RUN_DELAYED_IPUTS = 9, COMMIT_TRANS = 10, - FORCE_COMMIT_TRANS = 11, }; int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, @@ -3100,8 +3110,8 @@ u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); -int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, - struct page *page, u64 start, u64 end); +unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, + struct page *page, u64 start, u64 end); struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, u64 start, u64 len); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, @@ -3125,7 +3135,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, u64 new_size, - u32 min_type); + u32 min_type, u64 *extents_found); int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, @@ -3146,9 +3156,7 @@ void btrfs_split_delalloc_extent(struct inode *inode, struct extent_state *orig, u64 split); int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, unsigned long bio_flags); -bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio, - unsigned int size); -void 
btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end); +void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); @@ -3187,7 +3195,8 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page u64 start, u64 end, int *page_started, unsigned long *nr_written, struct writeback_control *wbc); int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end); -void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, +void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, + struct page *page, u64 start, u64 end, int uptodate); extern const struct dentry_operations btrfs_dentry_operations; extern const struct iomap_ops btrfs_dio_iomap_ops; @@ -3222,6 +3231,9 @@ void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type); +bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type); +void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info); void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); /* file.c */ @@ -3786,4 +3798,14 @@ static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) return fs_info->zoned != 0; } +/* + * We use page status Private2 to indicate there is an ordered extent with + * unfinished IO. + * + * Rename the Private2 accessors to Ordered, to improve readability. 
+ */ +#define PageOrdered(page) PagePrivate2(page) +#define SetPageOrdered(page) SetPagePrivate2(page) +#define ClearPageOrdered(page) ClearPagePrivate2(page) + #endif diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 56642ca7af10..2059d1504149 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -89,7 +89,7 @@ * ->outstanding_extents += 1 (current value is 1) * * -> set_delalloc - * ->outstanding_extents += 1 (currrent value is 2) + * ->outstanding_extents += 1 (current value is 2) * * -> btrfs_delalloc_release_extents() * ->outstanding_extents -= 1 (current value is 1) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 1a88f6214ebc..257c1e18abd4 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -681,7 +681,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, { struct btrfs_delayed_item *curr, *next; int free_space; - int total_data_size = 0, total_size = 0; + int total_size = 0; struct extent_buffer *leaf; char *data_ptr; struct btrfs_key *keys; @@ -706,7 +706,6 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, */ while (total_size + next->data_len + sizeof(struct btrfs_item) <= free_space) { - total_data_size += next->data_len; total_size += next->data_len + sizeof(struct btrfs_item); list_add_tail(&next->tree_list, &head); nitems++; @@ -974,14 +973,16 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node) { - struct btrfs_delayed_root *delayed_root; - ASSERT(delayed_node->root); - clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags); - delayed_node->count--; + if (test_and_clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) { + struct btrfs_delayed_root *delayed_root; - delayed_root = delayed_node->root->fs_info->delayed_root; - finish_one_item(delayed_root); + ASSERT(delayed_node->root); + delayed_node->count--; + + 
delayed_root = delayed_node->root->fs_info->delayed_root; + finish_one_item(delayed_root); + } } static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, @@ -1009,12 +1010,10 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, nofs_flag = memalloc_nofs_save(); ret = btrfs_lookup_inode(trans, root, path, &key, mod); memalloc_nofs_restore(nofs_flag); - if (ret > 0) { - btrfs_release_path(path); - return -ENOENT; - } else if (ret < 0) { - return ret; - } + if (ret > 0) + ret = -ENOENT; + if (ret < 0) + goto out; leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], @@ -1024,7 +1023,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) - goto no_iref; + goto out; path->slots[0]++; if (path->slots[0] >= btrfs_header_nritems(leaf)) @@ -1046,12 +1045,19 @@ again: btrfs_del_item(trans, root, path); out: btrfs_release_delayed_iref(node); -no_iref: btrfs_release_path(path); err_out: btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0)); btrfs_release_delayed_inode(node); + /* + * If we fail to update the delayed inode we need to abort the + * transaction, because we could leave the inode with the improper + * counts behind. 
+ */ + if (ret && ret != -ENOENT) + btrfs_abort_transaction(trans, ret); + return ret; search: @@ -1898,8 +1904,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) btrfs_release_delayed_item(prev_item); } - if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) - btrfs_release_delayed_iref(delayed_node); + btrfs_release_delayed_iref(delayed_node); if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { btrfs_delayed_inode_release_metadata(fs_info, delayed_node, false); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index c92d9d4f5f46..06bc842ecdb3 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -641,7 +641,6 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs = &trans->transaction->delayed_refs; struct btrfs_fs_info *fs_info = trans->fs_info; - u64 flags = btrfs_ref_head_to_space_flags(existing); int old_ref_mod; BUG_ON(existing->is_data != update->is_data); @@ -711,26 +710,6 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, } } - /* - * This handles the following conditions: - * - * 1. We had a ref mod of 0 or more and went negative, indicating that - * we may be freeing space, so add our space to the - * total_bytes_pinned counter. - * 2. We were negative and went to 0 or positive, so no longer can say - * that the space would be pinned, decrement our counter from the - * total_bytes_pinned counter. - * 3. We are now at 0 and have ->must_insert_reserved set, which means - * this was a new allocation and then we dropped it, and thus must - * add our space to the total_bytes_pinned counter. 
- */ - if (existing->total_ref_mod < 0 && old_ref_mod >= 0) - btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes); - else if (existing->total_ref_mod >= 0 && old_ref_mod < 0) - btrfs_mod_total_bytes_pinned(fs_info, flags, -existing->num_bytes); - else if (existing->total_ref_mod == 0 && existing->must_insert_reserved) - btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes); - spin_unlock(&existing->lock); } @@ -835,17 +814,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); head_ref = existing; } else { - u64 flags = btrfs_ref_head_to_space_flags(head_ref); - if (head_ref->is_data && head_ref->ref_mod < 0) { delayed_refs->pending_csums += head_ref->num_bytes; trans->delayed_ref_updates += btrfs_csum_bytes_to_leaves(trans->fs_info, head_ref->num_bytes); } - if (head_ref->ref_mod < 0) - btrfs_mod_total_bytes_pinned(trans->fs_info, flags, - head_ref->num_bytes); delayed_refs->num_heads++; delayed_refs->num_heads_ready++; atomic_inc(&delayed_refs->num_entries); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index d05f73530af7..d029be40ea6f 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -37,7 +37,7 @@ * - Write duplication * * All new writes will be written to both target and source devices, so even - * if replace gets canceled, sources device still contans up-to-date data. + * if replace gets canceled, sources device still contains up-to-date data. * * Location: handle_ops_on_dev_replace() from __btrfs_map_block() * Start: btrfs_dev_replace_start() diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 306ff20af70f..e1b7bd927d69 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -624,7 +624,7 @@ void btrfs_discard_update_discardable(struct btrfs_block_group *block_group) * @fs_info: fs_info of interest * * The unused_bgs list needs to be punted to the discard lists because the - * order of operations is changed. 
In the normal sychronous discard path, the + * order of operations is changed. In the normal synchronous discard path, the * block groups are trimmed via a single large trim in transaction commit. This * is ultimately what we are trying to avoid with asynchronous discard. Thus, * it must be done before going down the unused_bgs path. diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c9a3036c23bf..b117dd3b8172 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -241,7 +241,6 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, { struct extent_state *cached_state = NULL; int ret; - bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB); if (!parent_transid || btrfs_header_generation(eb) == parent_transid) return 0; @@ -249,9 +248,6 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, if (atomic) return -EAGAIN; - if (need_lock) - btrfs_tree_read_lock(eb); - lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); if (extent_buffer_uptodate(eb) && @@ -264,22 +260,10 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, eb->start, parent_transid, btrfs_header_generation(eb)); ret = 1; - - /* - * Things reading via commit roots that don't have normal protection, - * like send, can have a really old block in cache that may point at a - * block that has been freed and re-allocated. So don't clear uptodate - * if we find an eb that is under IO (dirty/writeback) because we could - * end up reading in the stale data and then writing it back out and - * making everybody very sad. 
- */ - if (!extent_buffer_under_io(eb)) - clear_extent_buffer_uptodate(eb); + clear_extent_buffer_uptodate(eb); out: unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); - if (need_lock) - btrfs_tree_read_unlock(eb); return ret; } @@ -584,6 +568,7 @@ static int validate_extent_buffer(struct extent_buffer *eb) const u32 csum_size = fs_info->csum_size; u8 found_level; u8 result[BTRFS_CSUM_SIZE]; + const u8 *header_csum; int ret = 0; found_start = btrfs_header_bytenr(eb); @@ -608,15 +593,14 @@ static int validate_extent_buffer(struct extent_buffer *eb) } csum_tree_block(eb, result); + header_csum = page_address(eb->pages[0]) + + get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum)); - if (memcmp_extent_buffer(eb, result, 0, csum_size)) { - u8 val[BTRFS_CSUM_SIZE] = { 0 }; - - read_extent_buffer(eb, &val, 0, csum_size); + if (memcmp(result, header_csum, csum_size) != 0) { btrfs_warn_rl(fs_info, - "%s checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d", - fs_info->sb->s_id, eb->start, - CSUM_FMT_VALUE(csum_size, val), + "checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d", + eb->start, + CSUM_FMT_VALUE(csum_size, header_csum), CSUM_FMT_VALUE(csum_size, result), btrfs_header_level(eb)); ret = -EUCLEAN; @@ -917,23 +901,22 @@ static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, return btree_csum_one_bio(bio); } -static int check_async_write(struct btrfs_fs_info *fs_info, +static bool should_async_write(struct btrfs_fs_info *fs_info, struct btrfs_inode *bi) { if (btrfs_is_zoned(fs_info)) - return 0; + return false; if (atomic_read(&bi->sync_writers)) - return 0; + return false; if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) - return 0; - return 1; + return false; + return true; } blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags) { struct btrfs_fs_info *fs_info = 
btrfs_sb(inode->i_sb); - int async = check_async_write(fs_info, BTRFS_I(inode)); blk_status_t ret; if (btrfs_op(bio) != BTRFS_MAP_WRITE) { @@ -946,7 +929,7 @@ blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, if (ret) goto out_w_error; ret = btrfs_map_bio(fs_info, bio, mirror_num); - } else if (!async) { + } else if (!should_async_write(fs_info, BTRFS_I(inode))) { ret = btree_csum_one_bio(bio); if (ret) goto out_w_error; @@ -2252,6 +2235,7 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info) atomic_set(&fs_info->balance_cancel_req, 0); fs_info->balance_ctl = NULL; init_waitqueue_head(&fs_info->balance_wait_q); + atomic_set(&fs_info->reloc_cancel_req, 0); } static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) @@ -2648,6 +2632,24 @@ static int validate_super(struct btrfs_fs_info *fs_info, ret = -EINVAL; } + if (memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid, + BTRFS_FSID_SIZE)) { + btrfs_err(fs_info, + "superblock fsid doesn't match fsid of fs_devices: %pU != %pU", + fs_info->super_copy->fsid, fs_info->fs_devices->fsid); + ret = -EINVAL; + } + + if (btrfs_fs_incompat(fs_info, METADATA_UUID) && + memcmp(fs_info->fs_devices->metadata_uuid, + fs_info->super_copy->metadata_uuid, BTRFS_FSID_SIZE)) { + btrfs_err(fs_info, +"superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU", + fs_info->super_copy->metadata_uuid, + fs_info->fs_devices->metadata_uuid); + ret = -EINVAL; + } + if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, @@ -2981,6 +2983,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->swapfile_pins_lock); fs_info->swapfile_pins = RB_ROOT; + spin_lock_init(&fs_info->send_reloc_lock); fs_info->send_in_progress = 0; fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH; @@ -3279,14 +3282,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device 
disk_super = fs_info->super_copy; - ASSERT(!memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid, - BTRFS_FSID_SIZE)); - - if (btrfs_fs_incompat(fs_info, METADATA_UUID)) { - ASSERT(!memcmp(fs_info->fs_devices->metadata_uuid, - fs_info->super_copy->metadata_uuid, - BTRFS_FSID_SIZE)); - } features = btrfs_super_flags(disk_super); if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) { @@ -3461,7 +3456,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device * At this point we know all the devices that make this filesystem, * including the seed devices but we don't know yet if the replace * target is required. So free devices that are not part of this - * filesystem but skip the replace traget device which is checked + * filesystem but skip the replace target device which is checked * below in btrfs_init_dev_replace(). */ btrfs_free_extra_devids(fs_devices); @@ -3588,8 +3583,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) { ret = btrfsic_mount(fs_info, fs_devices, btrfs_test_opt(fs_info, - CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ? - 1 : 0, + CHECK_INTEGRITY_DATA) ? 1 : 0, fs_info->check_integrity_print_mask); if (ret) btrfs_warn(fs_info, @@ -4686,9 +4680,6 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, cache->space_info->bytes_reserved -= head->num_bytes; spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - percpu_counter_add_batch( - &cache->space_info->total_bytes_pinned, - head->num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); btrfs_put_block_group(cache); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3d5c35e4cb76..d296483d148f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1425,7 +1425,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, * bytenr of the parent block. 
Since new extents are always * created with indirect references, this will only be the case * when relocating a shared extent. In that case, root_objectid - * will be BTRFS_TREE_RELOC_OBJECTID. Otheriwse, parent must + * will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must * be 0 * * @root_objectid: The id of the root where this modification has originated, @@ -1804,19 +1804,6 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes); } - /* - * We were dropping refs, or had a new ref and dropped it, and thus must - * adjust down our total_bytes_pinned, the space may or may not have - * been pinned and so is accounted for properly in the pinned space by - * now. - */ - if (head->total_ref_mod < 0 || - (head->total_ref_mod == 0 && head->must_insert_reserved)) { - u64 flags = btrfs_ref_head_to_space_flags(head); - - btrfs_mod_total_bytes_pinned(fs_info, flags, -head->num_bytes); - } - btrfs_delayed_refs_rsv_release(fs_info, nr_items); } @@ -2551,7 +2538,6 @@ static int pin_down_extent(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - __btrfs_mod_total_bytes_pinned(cache->space_info, num_bytes); set_extent_dirty(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); return 0; @@ -2762,7 +2748,6 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, cache->pinned -= len; btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); space_info->max_extent_size = 0; - __btrfs_mod_total_bytes_pinned(space_info, -len); if (cache->ro) { space_info->bytes_readonly += len; readonly = true; @@ -4784,7 +4769,6 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, set_extent_dirty(&trans->transaction->dirty_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); } - trans->dirty = true; /* this returns a buffer locked for blocking */ return buf; } diff --git 
a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index dee2dafbc872..9e81d25dea70 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -136,7 +136,7 @@ struct tree_entry { }; struct extent_page_data { - struct bio *bio; + struct btrfs_bio_ctrl bio_ctrl; /* tells writepage not to lock the state bits for this range * it still does the unlocking */ @@ -185,10 +185,12 @@ int __must_check submit_one_bio(struct bio *bio, int mirror_num, /* Cleanup unsubmitted bios */ static void end_write_bio(struct extent_page_data *epd, int ret) { - if (epd->bio) { - epd->bio->bi_status = errno_to_blk_status(ret); - bio_endio(epd->bio); - epd->bio = NULL; + struct bio *bio = epd->bio_ctrl.bio; + + if (bio) { + bio->bi_status = errno_to_blk_status(ret); + bio_endio(bio); + epd->bio_ctrl.bio = NULL; } } @@ -201,9 +203,10 @@ static void end_write_bio(struct extent_page_data *epd, int ret) static int __must_check flush_write_bio(struct extent_page_data *epd) { int ret = 0; + struct bio *bio = epd->bio_ctrl.bio; - if (epd->bio) { - ret = submit_one_bio(epd->bio, 0, 0); + if (bio) { + ret = submit_one_bio(bio, 0, 0); /* * Clean up of epd->bio is handled by its endio function. * And endio is either triggered by successful bio execution @@ -211,7 +214,7 @@ static int __must_check flush_write_bio(struct extent_page_data *epd) * So at this point, no matter what happened, we don't need * to clean up epd->bio. */ - epd->bio = NULL; + epd->bio_ctrl.bio = NULL; } return ret; } @@ -1805,10 +1808,130 @@ out: return found; } +/* + * Process one page for __process_pages_contig(). + * + * Return >0 if we hit @page == @locked_page. + * Return 0 if we updated the page status. + * Return -EGAIN if the we need to try again. 
+ * (For PAGE_LOCK case but got dirty page or page not belong to mapping) + */ +static int process_one_page(struct btrfs_fs_info *fs_info, + struct address_space *mapping, + struct page *page, struct page *locked_page, + unsigned long page_ops, u64 start, u64 end) +{ + u32 len; + + ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX); + len = end + 1 - start; + + if (page_ops & PAGE_SET_ORDERED) + btrfs_page_clamp_set_ordered(fs_info, page, start, len); + if (page_ops & PAGE_SET_ERROR) + btrfs_page_clamp_set_error(fs_info, page, start, len); + if (page_ops & PAGE_START_WRITEBACK) { + btrfs_page_clamp_clear_dirty(fs_info, page, start, len); + btrfs_page_clamp_set_writeback(fs_info, page, start, len); + } + if (page_ops & PAGE_END_WRITEBACK) + btrfs_page_clamp_clear_writeback(fs_info, page, start, len); + + if (page == locked_page) + return 1; + + if (page_ops & PAGE_LOCK) { + int ret; + + ret = btrfs_page_start_writer_lock(fs_info, page, start, len); + if (ret) + return ret; + if (!PageDirty(page) || page->mapping != mapping) { + btrfs_page_end_writer_lock(fs_info, page, start, len); + return -EAGAIN; + } + } + if (page_ops & PAGE_UNLOCK) + btrfs_page_end_writer_lock(fs_info, page, start, len); + return 0; +} + static int __process_pages_contig(struct address_space *mapping, struct page *locked_page, - pgoff_t start_index, pgoff_t end_index, - unsigned long page_ops, pgoff_t *index_ret); + u64 start, u64 end, unsigned long page_ops, + u64 *processed_end) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); + pgoff_t start_index = start >> PAGE_SHIFT; + pgoff_t end_index = end >> PAGE_SHIFT; + pgoff_t index = start_index; + unsigned long nr_pages = end_index - start_index + 1; + unsigned long pages_processed = 0; + struct page *pages[16]; + int err = 0; + int i; + + if (page_ops & PAGE_LOCK) { + ASSERT(page_ops == PAGE_LOCK); + ASSERT(processed_end && *processed_end == start); + } + + if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) + 
mapping_set_error(mapping, -EIO); + + while (nr_pages > 0) { + int found_pages; + + found_pages = find_get_pages_contig(mapping, index, + min_t(unsigned long, + nr_pages, ARRAY_SIZE(pages)), pages); + if (found_pages == 0) { + /* + * Only if we're going to lock these pages, we can find + * nothing at @index. + */ + ASSERT(page_ops & PAGE_LOCK); + err = -EAGAIN; + goto out; + } + + for (i = 0; i < found_pages; i++) { + int process_ret; + + process_ret = process_one_page(fs_info, mapping, + pages[i], locked_page, page_ops, + start, end); + if (process_ret < 0) { + for (; i < found_pages; i++) + put_page(pages[i]); + err = -EAGAIN; + goto out; + } + put_page(pages[i]); + pages_processed++; + } + nr_pages -= found_pages; + index += found_pages; + cond_resched(); + } +out: + if (err && processed_end) { + /* + * Update @processed_end. I know this is awful since it has + * two different return value patterns (inclusive vs exclusive). + * + * But the exclusive pattern is necessary if @start is 0, or we + * underflow and check against processed_end won't work as + * expected. 
+ */ + if (pages_processed) + *processed_end = min(end, + ((u64)(start_index + pages_processed) << PAGE_SHIFT) - 1); + else + *processed_end = start; + } + return err; +} static noinline void __unlock_for_delalloc(struct inode *inode, struct page *locked_page, @@ -1821,7 +1944,7 @@ static noinline void __unlock_for_delalloc(struct inode *inode, if (index == locked_page->index && end_index == index) return; - __process_pages_contig(inode->i_mapping, locked_page, index, end_index, + __process_pages_contig(inode->i_mapping, locked_page, start, end, PAGE_UNLOCK, NULL); } @@ -1831,19 +1954,19 @@ static noinline int lock_delalloc_pages(struct inode *inode, u64 delalloc_end) { unsigned long index = delalloc_start >> PAGE_SHIFT; - unsigned long index_ret = index; unsigned long end_index = delalloc_end >> PAGE_SHIFT; + u64 processed_end = delalloc_start; int ret; ASSERT(locked_page); if (index == locked_page->index && index == end_index) return 0; - ret = __process_pages_contig(inode->i_mapping, locked_page, index, - end_index, PAGE_LOCK, &index_ret); - if (ret == -EAGAIN) + ret = __process_pages_contig(inode->i_mapping, locked_page, delalloc_start, + delalloc_end, PAGE_LOCK, &processed_end); + if (ret == -EAGAIN && processed_end > delalloc_start) __unlock_for_delalloc(inode, locked_page, delalloc_start, - (u64)index_ret << PAGE_SHIFT); + processed_end); return ret; } @@ -1936,84 +2059,6 @@ out_failed: return found; } -static int __process_pages_contig(struct address_space *mapping, - struct page *locked_page, - pgoff_t start_index, pgoff_t end_index, - unsigned long page_ops, pgoff_t *index_ret) -{ - unsigned long nr_pages = end_index - start_index + 1; - unsigned long pages_processed = 0; - pgoff_t index = start_index; - struct page *pages[16]; - unsigned ret; - int err = 0; - int i; - - if (page_ops & PAGE_LOCK) { - ASSERT(page_ops == PAGE_LOCK); - ASSERT(index_ret && *index_ret == start_index); - } - - if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) - 
mapping_set_error(mapping, -EIO); - - while (nr_pages > 0) { - ret = find_get_pages_contig(mapping, index, - min_t(unsigned long, - nr_pages, ARRAY_SIZE(pages)), pages); - if (ret == 0) { - /* - * Only if we're going to lock these pages, - * can we find nothing at @index. - */ - ASSERT(page_ops & PAGE_LOCK); - err = -EAGAIN; - goto out; - } - - for (i = 0; i < ret; i++) { - if (page_ops & PAGE_SET_PRIVATE2) - SetPagePrivate2(pages[i]); - - if (locked_page && pages[i] == locked_page) { - put_page(pages[i]); - pages_processed++; - continue; - } - if (page_ops & PAGE_START_WRITEBACK) { - clear_page_dirty_for_io(pages[i]); - set_page_writeback(pages[i]); - } - if (page_ops & PAGE_SET_ERROR) - SetPageError(pages[i]); - if (page_ops & PAGE_END_WRITEBACK) - end_page_writeback(pages[i]); - if (page_ops & PAGE_UNLOCK) - unlock_page(pages[i]); - if (page_ops & PAGE_LOCK) { - lock_page(pages[i]); - if (!PageDirty(pages[i]) || - pages[i]->mapping != mapping) { - unlock_page(pages[i]); - for (; i < ret; i++) - put_page(pages[i]); - err = -EAGAIN; - goto out; - } - } - put_page(pages[i]); - pages_processed++; - } - nr_pages -= ret; - index += ret; - cond_resched(); - } -out: - if (err && index_ret) - *index_ret = start_index + pages_processed - 1; - return err; -} - void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, u32 clear_bits, unsigned long page_ops) @@ -2021,8 +2066,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL); __process_pages_contig(inode->vfs_inode.i_mapping, locked_page, - start >> PAGE_SHIFT, end >> PAGE_SHIFT, - page_ops, NULL); + start, end, page_ops, NULL); } /* @@ -2381,13 +2425,6 @@ int clean_io_failure(struct btrfs_fs_info *fs_info, BUG_ON(!failrec->this_mirror); - if (failrec->in_validation) { - /* there was no real error, just free the record */ - btrfs_debug(fs_info, - "clean_io_failure: freeing 
dummy error at %llu", - failrec->start); - goto out; - } if (sb_rdonly(fs_info->sb)) goto out; @@ -2449,7 +2486,7 @@ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) } static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode, - u64 start, u64 end) + u64 start) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct io_failure_record *failrec; @@ -2457,15 +2494,15 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + const u32 sectorsize = fs_info->sectorsize; int ret; u64 logical; failrec = get_state_failrec(failure_tree, start); if (!IS_ERR(failrec)) { btrfs_debug(fs_info, - "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d", - failrec->logical, failrec->start, failrec->len, - failrec->in_validation); + "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu", + failrec->logical, failrec->start, failrec->len); /* * when data can be on disk more than twice, add to failrec here * (e.g. 
with a list for failed_mirror) to make @@ -2480,10 +2517,9 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode return ERR_PTR(-ENOMEM); failrec->start = start; - failrec->len = end - start + 1; + failrec->len = sectorsize; failrec->this_mirror = 0; failrec->bio_flags = 0; - failrec->in_validation = 0; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, failrec->len); @@ -2519,12 +2555,13 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode free_extent_map(em); /* Set the bits in the private failure tree */ - ret = set_extent_bits(failure_tree, start, end, + ret = set_extent_bits(failure_tree, start, start + sectorsize - 1, EXTENT_LOCKED | EXTENT_DIRTY); if (ret >= 0) { ret = set_state_failrec(failure_tree, start, failrec); /* Set the bits in the inode's tree */ - ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED); + ret = set_extent_bits(tree, start, start + sectorsize - 1, + EXTENT_DAMAGED); } else if (ret < 0) { kfree(failrec); return ERR_PTR(ret); @@ -2533,7 +2570,7 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode return failrec; } -static bool btrfs_check_repairable(struct inode *inode, bool needs_validation, +static bool btrfs_check_repairable(struct inode *inode, struct io_failure_record *failrec, int failed_mirror) { @@ -2553,39 +2590,22 @@ static bool btrfs_check_repairable(struct inode *inode, bool needs_validation, return false; } + /* The failure record should only contain one sector */ + ASSERT(failrec->len == fs_info->sectorsize); + /* - * there are two premises: - * a) deliver good data to the caller - * b) correct the bad sectors on disk + * There are two premises: + * a) deliver good data to the caller + * b) correct the bad sectors on disk + * + * Since we're only doing repair for one sector, we only need to get + * a good copy of the failed sector and if we succeed, we have setup + * everything for repair_io_failure to do the 
rest for us. */ - if (needs_validation) { - /* - * to fulfill b), we need to know the exact failing sectors, as - * we don't want to rewrite any more than the failed ones. thus, - * we need separate read requests for the failed bio - * - * if the following BUG_ON triggers, our validation request got - * merged. we need separate requests for our algorithm to work. - */ - BUG_ON(failrec->in_validation); - failrec->in_validation = 1; - failrec->this_mirror = failed_mirror; - } else { - /* - * we're ready to fulfill a) and b) alongside. get a good copy - * of the failed sector and if we succeed, we have setup - * everything for repair_io_failure to do the rest for us. - */ - if (failrec->in_validation) { - BUG_ON(failrec->this_mirror != failed_mirror); - failrec->in_validation = 0; - failrec->this_mirror = 0; - } - failrec->failed_mirror = failed_mirror; + failrec->failed_mirror = failed_mirror; + failrec->this_mirror++; + if (failrec->this_mirror == failed_mirror) failrec->this_mirror++; - if (failrec->this_mirror == failed_mirror) - failrec->this_mirror++; - } if (failrec->this_mirror > num_copies) { btrfs_debug(fs_info, @@ -2597,53 +2617,11 @@ static bool btrfs_check_repairable(struct inode *inode, bool needs_validation, return true; } -static bool btrfs_io_needs_validation(struct inode *inode, struct bio *bio) -{ - u64 len = 0; - const u32 blocksize = inode->i_sb->s_blocksize; - - /* - * If bi_status is BLK_STS_OK, then this was a checksum error, not an - * I/O error. In this case, we already know exactly which sector was - * bad, so we don't need to validate. - */ - if (bio->bi_status == BLK_STS_OK) - return false; - - /* - * We need to validate each sector individually if the failed I/O was - * for multiple sectors. - * - * There are a few possible bios that can end up here: - * 1. A buffered read bio, which is not cloned. - * 2. A direct I/O read bio, which is cloned. - * 3. A (buffered or direct) repair bio, which is not cloned. 
- * - * For cloned bios (case 2), we can get the size from - * btrfs_io_bio->iter; for non-cloned bios (cases 1 and 3), we can get - * it from the bvecs. - */ - if (bio_flagged(bio, BIO_CLONED)) { - if (btrfs_io_bio(bio)->iter.bi_size > blocksize) - return true; - } else { - struct bio_vec *bvec; - int i; - - bio_for_each_bvec_all(bvec, bio, i) { - len += bvec->bv_len; - if (len > blocksize) - return true; - } - } - return false; -} - -blk_status_t btrfs_submit_read_repair(struct inode *inode, - struct bio *failed_bio, u32 bio_offset, - struct page *page, unsigned int pgoff, - u64 start, u64 end, int failed_mirror, - submit_bio_hook_t *submit_bio_hook) +int btrfs_repair_one_sector(struct inode *inode, + struct bio *failed_bio, u32 bio_offset, + struct page *page, unsigned int pgoff, + u64 start, int failed_mirror, + submit_bio_hook_t *submit_bio_hook) { struct io_failure_record *failrec; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2651,7 +2629,6 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode, struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio); const int icsum = bio_offset >> fs_info->sectorsize_bits; - bool need_validation; struct bio *repair_bio; struct btrfs_io_bio *repair_io_bio; blk_status_t status; @@ -2661,23 +2638,19 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode, BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); - failrec = btrfs_get_io_failure_record(inode, start, end); + failrec = btrfs_get_io_failure_record(inode, start); if (IS_ERR(failrec)) - return errno_to_blk_status(PTR_ERR(failrec)); + return PTR_ERR(failrec); - need_validation = btrfs_io_needs_validation(inode, failed_bio); - if (!btrfs_check_repairable(inode, need_validation, failrec, - failed_mirror)) { + if (!btrfs_check_repairable(inode, failrec, failed_mirror)) { free_io_failure(failure_tree, tree, failrec); - return BLK_STS_IOERR; + return -EIO; } repair_bio = 
btrfs_io_bio_alloc(1); repair_io_bio = btrfs_io_bio(repair_bio); repair_bio->bi_opf = REQ_OP_READ; - if (need_validation) - repair_bio->bi_opf |= REQ_FAILFAST_DEV; repair_bio->bi_end_io = failed_bio->bi_end_io; repair_bio->bi_iter.bi_sector = failrec->logical >> 9; repair_bio->bi_private = failed_bio->bi_private; @@ -2695,8 +2668,8 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode, repair_io_bio->iter = repair_bio->bi_iter; btrfs_debug(btrfs_sb(inode->i_sb), -"repair read error: submitting new read to mirror %d, in_validation=%d", - failrec->this_mirror, failrec->in_validation); + "repair read error: submitting new read to mirror %d", + failrec->this_mirror); status = submit_bio_hook(inode, repair_bio, failrec->this_mirror, failrec->bio_flags); @@ -2704,17 +2677,114 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode, free_io_failure(failure_tree, tree, failrec); bio_put(repair_bio); } - return status; + return blk_status_to_errno(status); +} + +static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + + ASSERT(page_offset(page) <= start && + start + len <= page_offset(page) + PAGE_SIZE); + + if (uptodate) { + btrfs_page_set_uptodate(fs_info, page, start, len); + } else { + btrfs_page_clear_uptodate(fs_info, page, start, len); + btrfs_page_set_error(fs_info, page, start, len); + } + + if (fs_info->sectorsize == PAGE_SIZE) + unlock_page(page); + else + btrfs_subpage_end_reader(fs_info, page, start, len); +} + +static blk_status_t submit_read_repair(struct inode *inode, + struct bio *failed_bio, u32 bio_offset, + struct page *page, unsigned int pgoff, + u64 start, u64 end, int failed_mirror, + unsigned int error_bitmap, + submit_bio_hook_t *submit_bio_hook) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + const u32 sectorsize = fs_info->sectorsize; + const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits; + int error = 0; + int 
i; + + BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); + + /* We're here because we had some read errors or csum mismatch */ + ASSERT(error_bitmap); + + /* + * We only get called on buffered IO, thus page must be mapped and bio + * must not be cloned. + */ + ASSERT(page->mapping && !bio_flagged(failed_bio, BIO_CLONED)); + + /* Iterate through all the sectors in the range */ + for (i = 0; i < nr_bits; i++) { + const unsigned int offset = i * sectorsize; + struct extent_state *cached = NULL; + bool uptodate = false; + int ret; + + if (!(error_bitmap & (1U << i))) { + /* + * This sector has no error, just end the page read + * and unlock the range. + */ + uptodate = true; + goto next; + } + + ret = btrfs_repair_one_sector(inode, failed_bio, + bio_offset + offset, + page, pgoff + offset, start + offset, + failed_mirror, submit_bio_hook); + if (!ret) { + /* + * We have submitted the read repair, the page release + * will be handled by the endio function of the + * submitted repair bio. + * Thus we don't need to do any thing here. + */ + continue; + } + /* + * Repair failed, just record the error but still continue. + * Or the remaining sectors will not be properly unlocked. 
+ */ + if (!error) + error = ret; +next: + end_page_read(page, uptodate, start + offset, sectorsize); + if (uptodate) + set_extent_uptodate(&BTRFS_I(inode)->io_tree, + start + offset, + start + offset + sectorsize - 1, + &cached, GFP_ATOMIC); + unlock_extent_cached_atomic(&BTRFS_I(inode)->io_tree, + start + offset, + start + offset + sectorsize - 1, + &cached); + } + return errno_to_blk_status(error); } /* lots and lots of room for performance fixes in the end_bio funcs */ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) { + struct btrfs_inode *inode; int uptodate = (err == 0); int ret = 0; - btrfs_writepage_endio_finish_ordered(page, start, end, uptodate); + ASSERT(page && page->mapping); + inode = BTRFS_I(page->mapping->host); + btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate); if (!uptodate) { ClearPageUptodate(page); @@ -2747,25 +2817,20 @@ static void end_bio_extent_writepage(struct bio *bio) struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + const u32 sectorsize = fs_info->sectorsize; - /* We always issue full-page reads, but if some block - * in a page fails to read, blk_update_request() will - * advance bv_offset and adjust bv_len to compensate. - * Print a warning for nonzero offsets, and an error - * if they don't add up to a full page. */ - if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) { - if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE) - btrfs_err(fs_info, - "partial page write in btrfs with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); - else - btrfs_info(fs_info, - "incomplete page write in btrfs with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); - } + /* Our read/write should always be sector aligned. 
*/ + if (!IS_ALIGNED(bvec->bv_offset, sectorsize)) + btrfs_err(fs_info, + "partial page write in btrfs with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); + else if (!IS_ALIGNED(bvec->bv_len, sectorsize)) + btrfs_info(fs_info, + "incomplete page write with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); - start = page_offset(page); - end = start + bvec->bv_offset + bvec->bv_len - 1; + start = page_offset(page) + bvec->bv_offset; + end = start + bvec->bv_len - 1; if (first_bvec) { btrfs_record_physical_zoned(inode, start, bio); @@ -2773,7 +2838,8 @@ static void end_bio_extent_writepage(struct bio *bio) } end_extent_writepage(page, error, start, end); - end_page_writeback(page); + + btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len); } bio_put(bio); @@ -2862,30 +2928,6 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE); } -static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); - - ASSERT(page_offset(page) <= start && - start + len <= page_offset(page) + PAGE_SIZE); - - if (uptodate) { - btrfs_page_set_uptodate(fs_info, page, start, len); - } else { - btrfs_page_clear_uptodate(fs_info, page, start, len); - btrfs_page_set_error(fs_info, page, start, len); - } - - if (fs_info->sectorsize == PAGE_SIZE) - unlock_page(page); - else if (is_data_inode(page->mapping->host)) - /* - * For subpage data, unlock the page if we're the last reader. - * For subpage metadata, page lock is not utilized for read. - */ - btrfs_subpage_end_reader(fs_info, page, start, len); -} - /* * Find extent buffer for a givne bytenr. 
* @@ -2929,7 +2971,6 @@ static struct extent_buffer *find_extent_buffer_readpage( static void end_bio_extent_readpage(struct bio *bio) { struct bio_vec *bvec; - int uptodate = !bio->bi_status; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct extent_io_tree *tree, *failure_tree; struct processed_extent processed = { 0 }; @@ -2944,10 +2985,12 @@ static void end_bio_extent_readpage(struct bio *bio) ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { + bool uptodate = !bio->bi_status; struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u32 sectorsize = fs_info->sectorsize; + unsigned int error_bitmap = (unsigned int)-1; u64 start; u64 end; u32 len; @@ -2982,14 +3025,16 @@ static void end_bio_extent_readpage(struct bio *bio) mirror = io_bio->mirror_num; if (likely(uptodate)) { - if (is_data_inode(inode)) - ret = btrfs_verify_data_csum(io_bio, + if (is_data_inode(inode)) { + error_bitmap = btrfs_verify_data_csum(io_bio, bio_offset, page, start, end); - else + ret = error_bitmap; + } else { ret = btrfs_validate_metadata_buffer(io_bio, page, start, end, mirror); + } if (ret) - uptodate = 0; + uptodate = false; else clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree, tree, start, @@ -3001,27 +3046,18 @@ static void end_bio_extent_readpage(struct bio *bio) goto readpage_ok; if (is_data_inode(inode)) { - /* - * The generic bio_readpage_error handles errors the - * following way: If possible, new read requests are - * created and submitted and will end up in - * end_bio_extent_readpage as well (if we're lucky, - * not in the !uptodate case). In that case it returns - * 0 and we just go on with the next page in our bio. - * If it can't handle the error it will return -EIO and - * we remain responsible for that page. + * btrfs_submit_read_repair() will handle all the good + * and bad sectors, we just continue to the next bvec. 
*/ - if (!btrfs_submit_read_repair(inode, bio, bio_offset, - page, - start - page_offset(page), - start, end, mirror, - btrfs_submit_data_bio)) { - uptodate = !bio->bi_status; - ASSERT(bio_offset + len > bio_offset); - bio_offset += len; - continue; - } + submit_read_repair(inode, bio, bio_offset, page, + start - page_offset(page), start, + end, mirror, error_bitmap, + btrfs_submit_data_bio); + + ASSERT(bio_offset + len > bio_offset); + bio_offset += len; + continue; } else { struct extent_buffer *eb; @@ -3151,42 +3187,99 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) * * Return true if successfully page added. Otherwise, return false. */ -static bool btrfs_bio_add_page(struct bio *bio, struct page *page, +static bool btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, + struct page *page, u64 disk_bytenr, unsigned int size, unsigned int pg_offset, - unsigned long prev_bio_flags, unsigned long bio_flags) { + struct bio *bio = bio_ctrl->bio; + u32 bio_size = bio->bi_iter.bi_size; const sector_t sector = disk_bytenr >> SECTOR_SHIFT; bool contig; int ret; - if (prev_bio_flags != bio_flags) + ASSERT(bio); + /* The limit should be calculated when bio_ctrl->bio is allocated */ + ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary); + if (bio_ctrl->bio_flags != bio_flags) return false; - if (prev_bio_flags & EXTENT_BIO_COMPRESSED) + if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) contig = bio->bi_iter.bi_sector == sector; else contig = bio_end_sector(bio) == sector; if (!contig) return false; - if (btrfs_bio_fits_in_stripe(page, size, bio, bio_flags)) + if (bio_size + size > bio_ctrl->len_to_oe_boundary || + bio_size + size > bio_ctrl->len_to_stripe_boundary) return false; - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct page *first_page = bio_first_bvec_all(bio)->bv_page; - - if (!btrfs_bio_fits_in_ordered_extent(first_page, bio, size)) - return false; + if (bio_op(bio) == REQ_OP_ZONE_APPEND) ret = 
bio_add_zone_append_page(bio, page, size, pg_offset); - } else { + else ret = bio_add_page(bio, page, size, pg_offset); - } return ret == size; } +static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, + struct btrfs_inode *inode) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_io_geometry geom; + struct btrfs_ordered_extent *ordered; + struct extent_map *em; + u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT); + int ret; + + /* + * Pages for compressed extent are never submitted to disk directly, + * thus it has no real boundary, just set them to U32_MAX. + * + * The split happens for real compressed bio, which happens in + * btrfs_submit_compressed_read/write(). + */ + if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) { + bio_ctrl->len_to_oe_boundary = U32_MAX; + bio_ctrl->len_to_stripe_boundary = U32_MAX; + return 0; + } + em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize); + if (IS_ERR(em)) + return PTR_ERR(em); + ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio), + logical, &geom); + free_extent_map(em); + if (ret < 0) { + return ret; + } + if (geom.len > U32_MAX) + bio_ctrl->len_to_stripe_boundary = U32_MAX; + else + bio_ctrl->len_to_stripe_boundary = (u32)geom.len; + + if (!btrfs_is_zoned(fs_info) || + bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) { + bio_ctrl->len_to_oe_boundary = U32_MAX; + return 0; + } + + ASSERT(fs_info->max_zone_append_size > 0); + /* Ordered extent not yet created, so we're good */ + ordered = btrfs_lookup_ordered_extent(inode, logical); + if (!ordered) { + bio_ctrl->len_to_oe_boundary = U32_MAX; + return 0; + } + + bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, + ordered->disk_bytenr + ordered->disk_num_bytes - logical); + btrfs_put_ordered_extent(ordered); + return 0; +} + /* * @opf: bio REQ_OP_* and REQ_* flags as one value * @wbc: optional writeback control for io accounting @@ -3203,12 +3296,11 @@ static bool btrfs_bio_add_page(struct bio *bio, struct 
page *page, */ static int submit_extent_page(unsigned int opf, struct writeback_control *wbc, + struct btrfs_bio_ctrl *bio_ctrl, struct page *page, u64 disk_bytenr, size_t size, unsigned long pg_offset, - struct bio **bio_ret, bio_end_io_t end_io_func, int mirror_num, - unsigned long prev_bio_flags, unsigned long bio_flags, bool force_bio_submit) { @@ -3219,19 +3311,19 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree = &inode->io_tree; struct btrfs_fs_info *fs_info = inode->root->fs_info; - ASSERT(bio_ret); + ASSERT(bio_ctrl); - if (*bio_ret) { - bio = *bio_ret; + ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE && + pg_offset + size <= PAGE_SIZE); + if (bio_ctrl->bio) { + bio = bio_ctrl->bio; if (force_bio_submit || - !btrfs_bio_add_page(bio, page, disk_bytenr, io_size, - pg_offset, prev_bio_flags, bio_flags)) { - ret = submit_one_bio(bio, mirror_num, prev_bio_flags); - if (ret < 0) { - *bio_ret = NULL; + !btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, io_size, + pg_offset, bio_flags)) { + ret = submit_one_bio(bio, mirror_num, bio_ctrl->bio_flags); + bio_ctrl->bio = NULL; + if (ret < 0) return ret; - } - bio = NULL; } else { if (wbc) wbc_account_cgroup_owner(wbc, page, io_size); @@ -3254,22 +3346,18 @@ static int submit_extent_page(unsigned int opf, wbc_account_cgroup_owner(wbc, page, io_size); } if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct extent_map *em; - struct map_lookup *map; + struct btrfs_device *device; - em = btrfs_get_chunk_map(fs_info, disk_bytenr, io_size); - if (IS_ERR(em)) - return PTR_ERR(em); + device = btrfs_zoned_get_device(fs_info, disk_bytenr, io_size); + if (IS_ERR(device)) + return PTR_ERR(device); - map = em->map_lookup; - /* We only support single profile for now */ - ASSERT(map->num_stripes == 1); - btrfs_io_bio(bio)->device = map->stripes[0].dev; - - free_extent_map(em); + btrfs_io_bio(bio)->device = device; } - *bio_ret = bio; + bio_ctrl->bio = bio; + bio_ctrl->bio_flags = 
bio_flags; + ret = calc_bio_boundaries(bio_ctrl, inode); return ret; } @@ -3382,7 +3470,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * return 0 on success, otherwise return error */ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, - struct bio **bio, unsigned long *bio_flags, + struct btrfs_bio_ctrl *bio_ctrl, unsigned int read_flags, u64 *prev_em_start) { struct inode *inode = page->mapping->host; @@ -3558,15 +3646,13 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, } ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, - page, disk_bytenr, iosize, - pg_offset, bio, + bio_ctrl, page, disk_bytenr, iosize, + pg_offset, end_bio_extent_readpage, 0, - *bio_flags, this_bio_flag, force_bio_submit); if (!ret) { nr++; - *bio_flags = this_bio_flag; } else { unlock_extent(tree, cur, cur + iosize - 1); end_page_read(page, false, cur, iosize); @@ -3580,11 +3666,10 @@ out: } static inline void contiguous_readpages(struct page *pages[], int nr_pages, - u64 start, u64 end, - struct extent_map **em_cached, - struct bio **bio, - unsigned long *bio_flags, - u64 *prev_em_start) + u64 start, u64 end, + struct extent_map **em_cached, + struct btrfs_bio_ctrl *bio_ctrl, + u64 *prev_em_start) { struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host); int index; @@ -3592,7 +3677,7 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); for (index = 0; index < nr_pages; index++) { - btrfs_do_readpage(pages[index], em_cached, bio, bio_flags, + btrfs_do_readpage(pages[index], em_cached, bio_ctrl, REQ_RAHEAD, prev_em_start); put_page(pages[index]); } @@ -3680,6 +3765,54 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, } /* + * Find the first byte we need to write. 
+ * + * For subpage, one page can contain several sectors, and + * __extent_writepage_io() will just grab all extent maps in the page + * range and try to submit all non-inline/non-compressed extents. + * + * This is a big problem for subpage, we shouldn't re-submit already written + * data at all. + * This function will lookup subpage dirty bit to find which range we really + * need to submit. + * + * Return the next dirty range in [@start, @end). + * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE. + */ +static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, + struct page *page, u64 *start, u64 *end) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + u64 orig_start = *start; + /* Declare as unsigned long so we can use bitmap ops */ + unsigned long dirty_bitmap; + unsigned long flags; + int nbits = (orig_start - page_offset(page)) >> fs_info->sectorsize_bits; + int range_start_bit = nbits; + int range_end_bit; + + /* + * For regular sector size == page size case, since one page only + * contains one sector, we return the page offset directly. + */ + if (fs_info->sectorsize == PAGE_SIZE) { + *start = page_offset(page); + *end = page_offset(page) + PAGE_SIZE; + return; + } + + /* We should have the page locked, but just in case */ + spin_lock_irqsave(&subpage->lock, flags); + dirty_bitmap = subpage->dirty_bitmap; + spin_unlock_irqrestore(&subpage->lock, flags); + + bitmap_next_set_region(&dirty_bitmap, &range_start_bit, &range_end_bit, + BTRFS_SUBPAGE_BITMAP_SIZE); + *start = page_offset(page) + range_start_bit * fs_info->sectorsize; + *end = page_offset(page) + range_end_bit * fs_info->sectorsize; +} + +/* * helper for __extent_writepage. This calls the writepage start hooks, * and does the loop to map the page into extents and bios. 
* @@ -3696,7 +3829,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, int *nr_ret) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct extent_io_tree *tree = &inode->io_tree; u64 start = page_offset(page); u64 end = start + PAGE_SIZE - 1; u64 cur = start; @@ -3727,15 +3859,26 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, while (cur <= end) { u64 disk_bytenr; u64 em_end; + u64 dirty_range_start = cur; + u64 dirty_range_end; u32 iosize; if (cur >= i_size) { - btrfs_writepage_endio_finish_ordered(page, cur, end, 1); + btrfs_writepage_endio_finish_ordered(inode, page, cur, + end, 1); break; } + + find_next_dirty_byte(fs_info, page, &dirty_range_start, + &dirty_range_end); + if (cur < dirty_range_start) { + cur = dirty_range_start; + continue; + } + em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1); if (IS_ERR_OR_NULL(em)) { - SetPageError(page); + btrfs_page_set_error(fs_info, page, cur, end - cur + 1); ret = PTR_ERR_OR_ZERO(em); break; } @@ -3750,8 +3893,11 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); disk_bytenr = em->block_start + extent_offset; - /* Note that em_end from extent_map_end() is exclusive */ - iosize = min(em_end, end + 1) - cur; + /* + * Note that em_end from extent_map_end() and dirty_range_end from + * find_next_dirty_byte() are all exclusive + */ + iosize = min(min(em_end, end + 1), dirty_range_end) - cur; if (btrfs_use_zone_append(inode, em->block_start)) opf = REQ_OP_ZONE_APPEND; @@ -3768,28 +3914,38 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, if (compressed) nr++; else - btrfs_writepage_endio_finish_ordered(page, cur, - cur + iosize - 1, 1); + btrfs_writepage_endio_finish_ordered(inode, + page, cur, cur + iosize - 1, 1); cur += iosize; continue; } - btrfs_set_range_writeback(tree, cur, cur + iosize - 1); + 
btrfs_set_range_writeback(inode, cur, cur + iosize - 1); if (!PageWriteback(page)) { btrfs_err(inode->root->fs_info, "page %lu not writeback, cur %llu end %llu", page->index, cur, end); } - ret = submit_extent_page(opf | write_flags, wbc, page, + /* + * Although the PageDirty bit is cleared before entering this + * function, subpage dirty bit is not cleared. + * So clear subpage dirty bit here so next time we won't submit + * page for range already written to disk. + */ + btrfs_page_clear_dirty(fs_info, page, cur, iosize); + + ret = submit_extent_page(opf | write_flags, wbc, + &epd->bio_ctrl, page, disk_bytenr, iosize, - cur - page_offset(page), &epd->bio, + cur - page_offset(page), end_bio_extent_writepage, - 0, 0, 0, false); + 0, 0, false); if (ret) { - SetPageError(page); + btrfs_page_set_error(fs_info, page, cur, iosize); if (PageWriteback(page)) - end_page_writeback(page); + btrfs_page_clear_writeback(fs_info, page, cur, + iosize); } cur += iosize; @@ -4098,12 +4254,15 @@ static struct extent_buffer *find_extent_buffer_nolock( * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback() * after all extent buffers in the page has finished their writeback. 
*/ -static void end_bio_subpage_eb_writepage(struct btrfs_fs_info *fs_info, - struct bio *bio) +static void end_bio_subpage_eb_writepage(struct bio *bio) { + struct btrfs_fs_info *fs_info; struct bio_vec *bvec; struct bvec_iter_all iter_all; + fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb); + ASSERT(fs_info->sectorsize < PAGE_SIZE); + ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; @@ -4154,16 +4313,11 @@ static void end_bio_subpage_eb_writepage(struct btrfs_fs_info *fs_info, static void end_bio_extent_buffer_writepage(struct bio *bio) { - struct btrfs_fs_info *fs_info; struct bio_vec *bvec; struct extent_buffer *eb; int done; struct bvec_iter_all iter_all; - fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb); - if (fs_info->sectorsize < PAGE_SIZE) - return end_bio_subpage_eb_writepage(fs_info, bio); - ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; @@ -4189,12 +4343,34 @@ static void end_bio_extent_buffer_writepage(struct bio *bio) bio_put(bio); } +static void prepare_eb_write(struct extent_buffer *eb) +{ + u32 nritems; + unsigned long start; + unsigned long end; + + clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); + atomic_set(&eb->io_pages, num_extent_pages(eb)); + + /* Set btree blocks beyond nritems with 0 to avoid stale content */ + nritems = btrfs_header_nritems(eb); + if (btrfs_header_level(eb) > 0) { + end = btrfs_node_key_ptr_offset(nritems); + memzero_extent_buffer(eb, end, eb->len - end); + } else { + /* + * Leaf: + * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 + */ + start = btrfs_item_nr_offset(nritems); + end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb); + memzero_extent_buffer(eb, start, end - start); + } +} + /* * Unlike the work in write_one_eb(), we rely completely on extent locking. * Page locking is only utilized at minimum to keep the VMM code happy. 
- * - * Caller should still call write_one_eb() other than this function directly. - * As write_one_eb() has extra preparation before submitting the extent buffer. */ static int write_one_subpage_eb(struct extent_buffer *eb, struct writeback_control *wbc, @@ -4206,6 +4382,8 @@ static int write_one_subpage_eb(struct extent_buffer *eb, bool no_dirty_ebs = false; int ret; + prepare_eb_write(eb); + /* clear_page_dirty_for_io() in subpage helper needs page locked */ lock_page(page); btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len); @@ -4216,10 +4394,10 @@ static int write_one_subpage_eb(struct extent_buffer *eb, if (no_dirty_ebs) clear_page_dirty_for_io(page); - ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, page, - eb->start, eb->len, eb->start - page_offset(page), - &epd->bio, end_bio_extent_buffer_writepage, 0, 0, 0, - false); + ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, + &epd->bio_ctrl, page, eb->start, eb->len, + eb->start - page_offset(page), + end_bio_subpage_eb_writepage, 0, 0, false); if (ret) { btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len); set_btree_ioerr(page, eb); @@ -4244,45 +4422,23 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, struct extent_page_data *epd) { u64 disk_bytenr = eb->start; - u32 nritems; int i, num_pages; - unsigned long start, end; unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; int ret = 0; - clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); - num_pages = num_extent_pages(eb); - atomic_set(&eb->io_pages, num_pages); - - /* set btree blocks beyond nritems with 0 to avoid stale content. */ - nritems = btrfs_header_nritems(eb); - if (btrfs_header_level(eb) > 0) { - end = btrfs_node_key_ptr_offset(nritems); - - memzero_extent_buffer(eb, end, eb->len - end); - } else { - /* - * leaf: - * header 0 1 2 .. N ... data_N .. 
data_2 data_1 data_0 - */ - start = btrfs_item_nr_offset(nritems); - end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb); - memzero_extent_buffer(eb, start, end - start); - } - - if (eb->fs_info->sectorsize < PAGE_SIZE) - return write_one_subpage_eb(eb, wbc, epd); + prepare_eb_write(eb); + num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i]; clear_page_dirty_for_io(p); set_page_writeback(p); ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - p, disk_bytenr, PAGE_SIZE, 0, - &epd->bio, + &epd->bio_ctrl, p, disk_bytenr, + PAGE_SIZE, 0, end_bio_extent_buffer_writepage, - 0, 0, 0, false); + 0, 0, false); if (ret) { set_btree_ioerr(p, eb); if (PageWriteback(p)) @@ -4386,7 +4542,7 @@ static int submit_eb_subpage(struct page *page, free_extent_buffer(eb); goto cleanup; } - ret = write_one_eb(eb, wbc, epd); + ret = write_one_subpage_eb(eb, wbc, epd); free_extent_buffer(eb); if (ret < 0) goto cleanup; @@ -4498,7 +4654,7 @@ int btree_write_cache_pages(struct address_space *mapping, { struct extent_buffer *eb_context = NULL; struct extent_page_data epd = { - .bio = NULL, + .bio_ctrl = { 0 }, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; @@ -4780,7 +4936,7 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc) { int ret; struct extent_page_data epd = { - .bio = NULL, + .bio_ctrl = { 0 }, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; @@ -4807,7 +4963,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, PAGE_SHIFT; struct extent_page_data epd = { - .bio = NULL, + .bio_ctrl = { 0 }, .extent_locked = 1, .sync_io = mode == WB_SYNC_ALL, }; @@ -4827,8 +4983,8 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, if (clear_page_dirty_for_io(page)) ret = __extent_writepage(page, &wbc_writepages, &epd); else { - btrfs_writepage_endio_finish_ordered(page, start, - start + PAGE_SIZE - 1, 1); + 
btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), + page, start, start + PAGE_SIZE - 1, 1); unlock_page(page); } put_page(page); @@ -4850,7 +5006,7 @@ int extent_writepages(struct address_space *mapping, { int ret = 0; struct extent_page_data epd = { - .bio = NULL, + .bio_ctrl = { 0 }, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; @@ -4867,8 +5023,7 @@ int extent_writepages(struct address_space *mapping, void extent_readahead(struct readahead_control *rac) { - struct bio *bio = NULL; - unsigned long bio_flags = 0; + struct btrfs_bio_ctrl bio_ctrl = { 0 }; struct page *pagepool[16]; struct extent_map *em_cached = NULL; u64 prev_em_start = (u64)-1; @@ -4879,14 +5034,14 @@ void extent_readahead(struct readahead_control *rac) u64 contig_end = contig_start + readahead_batch_length(rac) - 1; contiguous_readpages(pagepool, nr, contig_start, contig_end, - &em_cached, &bio, &bio_flags, &prev_em_start); + &em_cached, &bio_ctrl, &prev_em_start); } if (em_cached) free_extent_map(em_cached); - if (bio) { - if (submit_one_bio(bio, 0, bio_flags)) + if (bio_ctrl.bio) { + if (submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags)) return; } } @@ -5429,6 +5584,12 @@ static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page) subpage = (struct btrfs_subpage *)page->private; if (atomic_read(&subpage->eb_refs)) return true; + /* + * Even there is no eb refs here, we may still have + * end_page_read() call relying on page::private. + */ + if (atomic_read(&subpage->readers)) + return true; } return false; } @@ -5489,7 +5650,7 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag /* * We can only detach the page private if there are no other ebs in the - * page range. + * page range and no unfinished IO. 
*/ if (!page_range_has_eb(fs_info, page)) btrfs_detach_subpage(fs_info, page); @@ -6176,7 +6337,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, struct btrfs_fs_info *fs_info = eb->fs_info; struct extent_io_tree *io_tree; struct page *page = eb->pages[0]; - struct bio *bio = NULL; + struct btrfs_bio_ctrl bio_ctrl = { 0 }; int ret = 0; ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); @@ -6184,10 +6345,8 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; if (wait == WAIT_NONE) { - ret = try_lock_extent(io_tree, eb->start, - eb->start + eb->len - 1); - if (ret <= 0) - return ret; + if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1)) + return -EAGAIN; } else { ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1); if (ret < 0) @@ -6209,9 +6368,11 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, check_buffer_tree_ref(eb); btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len); - ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, page, eb->start, - eb->len, eb->start - page_offset(page), &bio, - end_bio_extent_readpage, mirror_num, 0, 0, + btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len); + ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, &bio_ctrl, + page, eb->start, eb->len, + eb->start - page_offset(page), + end_bio_extent_readpage, mirror_num, 0, true); if (ret) { /* @@ -6221,10 +6382,11 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, */ atomic_dec(&eb->io_pages); } - if (bio) { + if (bio_ctrl.bio) { int tmp; - tmp = submit_one_bio(bio, mirror_num, 0); + tmp = submit_one_bio(bio_ctrl.bio, mirror_num, 0); + bio_ctrl.bio = NULL; if (tmp < 0) return tmp; } @@ -6247,8 +6409,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) int all_uptodate = 1; int num_pages; unsigned long num_reads = 0; - struct bio *bio = NULL; - 
unsigned long bio_flags = 0; + struct btrfs_bio_ctrl bio_ctrl = { 0 }; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; @@ -6312,9 +6473,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) ClearPageError(page); err = submit_extent_page(REQ_OP_READ | REQ_META, NULL, - page, page_offset(page), PAGE_SIZE, 0, - &bio, end_bio_extent_readpage, - mirror_num, 0, 0, false); + &bio_ctrl, page, page_offset(page), + PAGE_SIZE, 0, end_bio_extent_readpage, + mirror_num, 0, false); if (err) { /* * We failed to submit the bio so it's the @@ -6331,8 +6492,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } } - if (bio) { - err = submit_one_bio(bio, mirror_num, bio_flags); + if (bio_ctrl.bio) { + err = submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.bio_flags); + bio_ctrl.bio = NULL; if (err) return err; } @@ -6515,9 +6677,10 @@ void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb, char *kaddr; assert_eb_page_uptodate(eb, eb->pages[0]); - kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0); - memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv, - BTRFS_FSID_SIZE); + kaddr = page_address(eb->pages[0]) + + get_eb_offset_in_page(eb, offsetof(struct btrfs_header, + chunk_tree_uuid)); + memcpy(kaddr, srcv, BTRFS_FSID_SIZE); } void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv) @@ -6525,9 +6688,9 @@ void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv) char *kaddr; assert_eb_page_uptodate(eb, eb->pages[0]); - kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0); - memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv, - BTRFS_FSID_SIZE); + kaddr = page_address(eb->pages[0]) + + get_eb_offset_in_page(eb, offsetof(struct btrfs_header, fsid)); + memcpy(kaddr, srcv, BTRFS_FSID_SIZE); } void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, diff --git a/fs/btrfs/extent_io.h 
b/fs/btrfs/extent_io.h index 227215a5722c..62027f551b44 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -39,7 +39,7 @@ enum { /* Page starts writeback, clear dirty bit and set writeback bit */ #define PAGE_START_WRITEBACK (1 << 1) #define PAGE_END_WRITEBACK (1 << 2) -#define PAGE_SET_PRIVATE2 (1 << 3) +#define PAGE_SET_ORDERED (1 << 3) #define PAGE_SET_ERROR (1 << 4) #define PAGE_LOCK (1 << 5) @@ -102,6 +102,17 @@ struct extent_buffer { }; /* + * Structure to record info about the bio being assembled, and other info like + * how many bytes are there before stripe/ordered extent boundary. + */ +struct btrfs_bio_ctrl { + struct bio *bio; + unsigned long bio_flags; + u32 len_to_stripe_boundary; + u32 len_to_oe_boundary; +}; + +/* * Structure to record how many bytes and which ranges are set/cleared */ struct extent_changeset { @@ -169,7 +180,7 @@ int try_release_extent_buffer(struct page *page); int __must_check submit_one_bio(struct bio *bio, int mirror_num, unsigned long bio_flags); int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, - struct bio **bio, unsigned long *bio_flags, + struct btrfs_bio_ctrl *bio_ctrl, unsigned int read_flags, u64 *prev_em_start); int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end, @@ -281,7 +292,7 @@ int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num); * When IO fails, either with EIO or csum verification fails, we * try other mirrors that might have a good copy of the data. This * io_failure_record is used to record state as we go through all the - * mirrors. If another mirror has good data, the page is set up to date + * mirrors. If another mirror has good data, the sector is set up to date * and things continue. If a good mirror can't be found, the original * bio end_io callback is called to indicate things have failed. 
*/ @@ -293,15 +304,13 @@ struct io_failure_record { unsigned long bio_flags; int this_mirror; int failed_mirror; - int in_validation; }; - -blk_status_t btrfs_submit_read_repair(struct inode *inode, - struct bio *failed_bio, u32 bio_offset, - struct page *page, unsigned int pgoff, - u64 start, u64 end, int failed_mirror, - submit_bio_hook_t *submit_bio_hook); +int btrfs_repair_one_sector(struct inode *inode, + struct bio *failed_bio, u32 bio_offset, + struct page *page, unsigned int pgoff, + u64 start, int failed_mirror, + submit_bio_hook_t *submit_bio_hook); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 441cee7fbb62..df6631eefc65 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -618,7 +618,7 @@ fail: * @file_start: offset in file this bio begins to describe * @contig: Boolean. If true/1 means all bio vecs in this bio are * contiguous and they begin at @file_start in the file. False/0 - * means this bio can contains potentially discontigous bio vecs + * means this bio can contain potentially discontiguous bio vecs * so the logical offset of each should be calculated separately. 
*/ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 3b10d98b4ebb..ee34497500e1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -28,6 +28,7 @@ #include "compression.h" #include "delalloc-space.h" #include "reflink.h" +#include "subpage.h" static struct kmem_cache *btrfs_inode_defrag_cachep; /* @@ -398,7 +399,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, /* * Copy data from userspace to the current page */ - copied = iov_iter_copy_from_user_atomic(page, i, offset, count); + copied = copy_page_from_iter_atomic(page, offset, count, i); /* Flush processor's dcache for this page */ flush_dcache_page(page); @@ -412,20 +413,19 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, * The rest of the btrfs_file_write code will fall * back to page at a time copies after we return 0. */ - if (!PageUptodate(page) && copied < count) - copied = 0; + if (unlikely(copied < count)) { + if (!PageUptodate(page)) { + iov_iter_revert(i, copied); + copied = 0; + } + if (!copied) + break; + } - iov_iter_advance(i, copied); write_bytes -= copied; total_copied += copied; - - /* Return to btrfs_file_write_iter to fault page */ - if (unlikely(copied == 0)) - break; - - if (copied < PAGE_SIZE - offset) { - offset += copied; - } else { + offset += copied; + if (offset == PAGE_SIZE) { pg++; offset = 0; } @@ -482,6 +482,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, start_pos = round_down(pos, fs_info->sectorsize); num_bytes = round_up(write_bytes + pos - start_pos, fs_info->sectorsize); + ASSERT(num_bytes <= U32_MAX); end_of_last_block = start_pos + num_bytes - 1; @@ -500,9 +501,10 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, for (i = 0; i < num_pages; i++) { struct page *p = pages[i]; - SetPageUptodate(p); + + btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes); ClearPageChecked(p); - 
set_page_dirty(p); + btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes); } /* @@ -1094,7 +1096,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, int del_nr = 0; int del_slot = 0; int recow; - int ret; + int ret = 0; u64 ino = btrfs_ino(inode); path = btrfs_alloc_path(); @@ -1315,7 +1317,7 @@ again: } out: btrfs_free_path(path); - return 0; + return ret; } /* @@ -2483,6 +2485,17 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, const u64 lockend, struct extent_state **cached_state) { + /* + * For subpage case, if the range is not at page boundary, we could + * have pages at the leading/tailing part of the range. + * This could lead to dead loop since filemap_range_has_page() + * will always return true. + * So here we need to do extra page alignment for + * filemap_range_has_page(). + */ + const u64 page_lockstart = round_up(lockstart, PAGE_SIZE); + const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1; + while (1) { struct btrfs_ordered_extent *ordered; int ret; @@ -2503,7 +2516,7 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, (ordered->file_offset + ordered->num_bytes <= lockstart || ordered->file_offset > lockend)) && !filemap_range_has_page(inode->i_mapping, - lockstart, lockend)) { + page_lockstart, page_lockend)) { if (ordered) btrfs_put_ordered_extent(ordered); break; @@ -3034,22 +3047,20 @@ struct falloc_range { */ static int add_falloc_range(struct list_head *head, u64 start, u64 len) { - struct falloc_range *prev = NULL; struct falloc_range *range = NULL; - if (list_empty(head)) - goto insert; - - /* - * As fallocate iterate by bytenr order, we only need to check - * the last range. - */ - prev = list_entry(head->prev, struct falloc_range, list); - if (prev->start + prev->len == start) { - prev->len += len; - return 0; + if (!list_empty(head)) { + /* + * As fallocate iterates by bytenr order, we only need to check + * the last range. 
+ */ + range = list_last_entry(head, struct falloc_range, list); + if (range->start + range->len == start) { + range->len += len; + return 0; + } } -insert: + range = kmalloc(sizeof(*range), GFP_KERNEL); if (!range) return -ENOMEM; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 4806295116d8..2131ae5b9ed7 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -327,7 +327,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, * need to check for -EAGAIN. */ ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), - 0, BTRFS_EXTENT_DATA_KEY); + 0, BTRFS_EXTENT_DATA_KEY, NULL); if (ret) goto fail; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 46f392943f4d..e6eb20987351 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -51,6 +51,7 @@ #include "block-group.h" #include "space-info.h" #include "zoned.h" +#include "subpage.h" struct btrfs_iget_args { u64 ino; @@ -166,22 +167,47 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, struct page *page; while (index <= end_index) { + /* + * For locked page, we will call end_extent_writepage() on it + * in run_delalloc_range() for the error handling. That + * end_extent_writepage() function will call + * btrfs_mark_ordered_io_finished() to clear page Ordered and + * run the ordered extent accounting. + * + * Here we can't just clear the Ordered bit, or + * btrfs_mark_ordered_io_finished() would skip the accounting + * for the page range, and the ordered extent will never finish. + */ + if (index == (page_offset(locked_page) >> PAGE_SHIFT)) { + index++; + continue; + } page = find_get_page(inode->vfs_inode.i_mapping, index); index++; if (!page) continue; - ClearPagePrivate2(page); + + /* + * Here we just clear all Ordered bits for every page in the + * range, then __endio_write_update_ordered() will handle + * the ordered extent accounting for the range. 
+ */ + btrfs_page_clamp_clear_ordered(inode->root->fs_info, page, + offset, bytes); put_page(page); } + /* The locked page covers the full range, nothing needs to be done */ + if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE) + return; /* * In case this page belongs to the delalloc range being instantiated * then skip it, since the first page of a range is going to be * properly cleaned up by the caller of run_delalloc_range */ if (page_start >= offset && page_end <= (offset + bytes - 1)) { - offset += PAGE_SIZE; - bytes -= PAGE_SIZE; + bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; + offset = page_offset(locked_page) + PAGE_SIZE; } return __endio_write_update_ordered(inode, offset, bytes, false); @@ -603,7 +629,7 @@ again: * inode has not been flagged as nocompress. This flag can * change at any time if we discover bad compression ratios. */ - if (inode_need_compress(BTRFS_I(inode), start, end)) { + if (nr_pages > 1 && inode_need_compress(BTRFS_I(inode), start, end)) { WARN_ON(pages); pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) { @@ -946,7 +972,8 @@ retry: const u64 end = start + async_extent->ram_size - 1; p->mapping = inode->vfs_inode.i_mapping; - btrfs_writepage_endio_finish_ordered(p, start, end, 0); + btrfs_writepage_endio_finish_ordered(inode, p, start, + end, 0); p->mapping = NULL; extent_clear_unlock_delalloc(inode, start, end, NULL, 0, @@ -1064,7 +1091,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * our outstanding extent for clearing delalloc for this * range. 
*/ - extent_clear_unlock_delalloc(inode, start, end, NULL, + extent_clear_unlock_delalloc(inode, start, end, + locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | @@ -1072,6 +1100,19 @@ static noinline int cow_file_range(struct btrfs_inode *inode, *nr_written = *nr_written + (end - start + PAGE_SIZE) / PAGE_SIZE; *page_started = 1; + /* + * locked_page is locked by the caller of + * writepage_delalloc(), not locked by + * __process_pages_contig(). + * + * We can't let __process_pages_contig() to unlock it, + * as it doesn't have any subpage::writers recorded. + * + * Here we manually unlock the page, since the caller + * can't use page_started to determine if it's an + * inline extent or a compressed extent. + */ + unlock_page(locked_page); goto out; } else if (ret < 0) { goto out_unlock; @@ -1150,15 +1191,16 @@ static noinline int cow_file_range(struct btrfs_inode *inode, btrfs_dec_block_group_reservations(fs_info, ins.objectid); - /* we're not doing compressed IO, don't unlock the first - * page (which the caller expects to stay locked), don't - * clear any dirty bits and don't set any writeback bits + /* + * We're not doing compressed IO, don't unlock the first page + * (which the caller expects to stay locked), don't clear any + * dirty bits and don't set any writeback bits * - * Do set the Private2 bit so we know this page was properly - * setup for writepage + * Do set the Ordered (Private2) bit so we know this page was + * properly setup for writepage. */ page_ops = unlock ? 
PAGE_UNLOCK : 0; - page_ops |= PAGE_SET_PRIVATE2; + page_ops |= PAGE_SET_ORDERED; extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, locked_page, @@ -1822,7 +1864,7 @@ out_check: locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, - PAGE_UNLOCK | PAGE_SET_PRIVATE2); + PAGE_UNLOCK | PAGE_SET_ORDERED); cur_offset = extent_end; @@ -2193,26 +2235,22 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 logical = bio->bi_iter.bi_sector << 9; + u32 bio_len = bio->bi_iter.bi_size; struct extent_map *em; - u64 length = 0; - u64 map_length; int ret = 0; struct btrfs_io_geometry geom; if (bio_flags & EXTENT_BIO_COMPRESSED) return 0; - length = bio->bi_iter.bi_size; - map_length = length; - em = btrfs_get_chunk_map(fs_info, logical, map_length); + em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize); if (IS_ERR(em)) return PTR_ERR(em); - ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, - map_length, &geom); + ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, &geom); if (ret < 0) goto out; - if (geom.len < length + size) + if (geom.len < bio_len + size) ret = 1; out: free_extent_map(em); @@ -2233,33 +2271,6 @@ static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); } -bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio, - unsigned int size) -{ - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_ordered_extent *ordered; - u64 len = bio->bi_iter.bi_size + size; - bool ret = true; - - ASSERT(btrfs_is_zoned(fs_info)); - ASSERT(fs_info->max_zone_append_size > 0); - ASSERT(bio_op(bio) == REQ_OP_ZONE_APPEND); - - /* Ordered extent not yet created, so we're good */ - ordered = 
btrfs_lookup_ordered_extent(inode, page_offset(page)); - if (!ordered) - return ret; - - if ((bio->bi_iter.bi_sector << SECTOR_SHIFT) + len > - ordered->disk_bytenr + ordered->disk_num_bytes) - ret = false; - - btrfs_put_ordered_extent(ordered); - - return ret; -} - static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, struct bio *bio, loff_t file_offset) { @@ -2601,7 +2612,7 @@ again: lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ - if (PagePrivate2(page)) + if (PageOrdered(page)) goto out_reserved; ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); @@ -2676,8 +2687,8 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_writepage_fixup *fixup; - /* this page is properly in the ordered list */ - if (TestClearPagePrivate2(page)) + /* This page has ordered extent covering it already */ + if (PageOrdered(page)) return 0; /* @@ -2773,7 +2784,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, /* * If we dropped an inline extent here, we know the range where it is * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the - * number of bytes only for that range contaning the inline extent. + * number of bytes only for that range containing the inline extent. * The remaining of the range will be processed when clearning the * EXTENT_DELALLOC_BIT bit through the ordered extent completion. 
*/ @@ -3069,28 +3080,14 @@ static void finish_ordered_fn(struct btrfs_work *work) btrfs_finish_ordered_io(ordered_extent); } -void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, +void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, + struct page *page, u64 start, u64 end, int uptodate) { - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_ordered_extent *ordered_extent = NULL; - struct btrfs_workqueue *wq; - - trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); - - ClearPagePrivate2(page); - if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, - end - start + 1, uptodate)) - return; + trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate); - if (btrfs_is_free_space_inode(inode)) - wq = fs_info->endio_freespace_worker; - else - wq = fs_info->endio_write_workers; - - btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL); - btrfs_queue_work(wq, &ordered_extent->work); + btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, + finish_ordered_fn, uptodate); } /* @@ -3152,15 +3149,19 @@ zeroit: * @bio_offset: offset to the beginning of the bio (in bytes) * @start: file offset of the range start * @end: file offset of the range end (inclusive) + * + * Return a bitmap where bit set means a csum mismatch, and bit not set means + * csum match. 
*/ -int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, - struct page *page, u64 start, u64 end) +unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, + struct page *page, u64 start, u64 end) { struct inode *inode = page->mapping->host; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_root *root = BTRFS_I(inode)->root; const u32 sectorsize = root->fs_info->sectorsize; u32 pg_off; + unsigned int result = 0; if (PageChecked(page)) { ClearPageChecked(page); @@ -3188,10 +3189,14 @@ int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off, page_offset(page) + pg_off); - if (ret < 0) - return -EIO; + if (ret < 0) { + const int nr_bit = (pg_off - offset_in_page(start)) >> + root->fs_info->sectorsize_bits; + + result |= (1U << nr_bit); + } } - return 0; + return result; } /* @@ -4109,7 +4114,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, * This is a placeholder inode for a subvolume we didn't have a * reference to at the time of the snapshot creation. In the meantime * we could have renamed the real subvol link into our snapshot, so - * depending on btrfs_del_root_ref to return -ENOENT here is incorret. + * depending on btrfs_del_root_ref to return -ENOENT here is incorrect. * Instead simply lookup the dir_index_item for this entry so we can * remove it. Otherwise we know we have a ref to the root and we can * call btrfs_del_root_ref, and it _shouldn't_ fail. @@ -4464,20 +4469,36 @@ out: #define NEED_TRUNCATE_BLOCK 1 /* - * this can truncate away extent items, csum items and directory items. - * It starts at a high offset and removes keys until it can't find - * any higher than new_size + * Remove inode items from a given root. * - * csum items that cross the new i_size are truncated to the new size - * as well. + * @trans: A transaction handle. + * @root: The root from which to remove items. 
+ * @inode: The inode whose items we want to remove. + * @new_size: The new i_size for the inode. This is only applicable when + * @min_type is BTRFS_EXTENT_DATA_KEY, must be 0 otherwise. + * @min_type: The minimum key type to remove. All keys with a type + * greater than this value are removed and all keys with + * this type are removed only if their offset is >= @new_size. + * @extents_found: Output parameter that will contain the number of file + * extent items that were removed or adjusted to the new + * inode i_size. The caller is responsible for initializing + * the counter. Also, it can be NULL if the caller does not + * need this counter. * - * min_type is the minimum key type to truncate down to. If set to 0, this - * will kill all the items on this inode, including the INODE_ITEM_KEY. + * Remove all keys associated with the inode from the given root that have a key + * with a type greater than or equals to @min_type. When @min_type has a value of + * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value + * greater than or equals to @new_size. If a file extent item that starts before + * @new_size and ends after it is found, its length is adjusted. + * + * Returns: 0 on success, < 0 on error and NEED_TRUNCATE_BLOCK when @min_type is + * BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block. 
*/ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, - u64 new_size, u32 min_type) + u64 new_size, u32 min_type, + u64 *extents_found) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; @@ -4623,6 +4644,9 @@ search_again: if (found_type != BTRFS_EXTENT_DATA_KEY) goto delete; + if (extents_found != NULL) + (*extents_found)++; + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { u64 num_dec; @@ -4941,7 +4965,7 @@ again: flush_dcache_page(page); } ClearPageChecked(page); - set_page_dirty(page); + btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start); unlock_extent_cached(io_tree, block_start, block_end, &cached_state); if (only_release_metadata) @@ -5455,7 +5479,7 @@ void btrfs_evict_inode(struct inode *inode) trans->block_rsv = rsv; ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), - 0, 0); + 0, 0, NULL); trans->block_rsv = &fs_info->trans_block_rsv; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); @@ -7937,19 +7961,17 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, btrfs_ino(BTRFS_I(inode)), pgoff); } else { - blk_status_t status; + int ret; ASSERT((start - io_bio->logical) < UINT_MAX); - status = btrfs_submit_read_repair(inode, - &io_bio->bio, - start - io_bio->logical, - bvec.bv_page, pgoff, - start, - start + sectorsize - 1, - io_bio->mirror_num, - submit_dio_repair_bio); - if (status) - err = status; + ret = btrfs_repair_one_sector(inode, + &io_bio->bio, + start - io_bio->logical, + bvec.bv_page, pgoff, + start, io_bio->mirror_num, + submit_dio_repair_bio); + if (ret) + err = errno_to_blk_status(ret); } start += sectorsize; ASSERT(bio_offset + sectorsize > bio_offset); @@ -7964,41 +7986,8 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode, const u64 offset, const u64 bytes, const bool uptodate) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_ordered_extent 
*ordered = NULL; - struct btrfs_workqueue *wq; - u64 ordered_offset = offset; - u64 ordered_bytes = bytes; - u64 last_offset; - - if (btrfs_is_free_space_inode(inode)) - wq = fs_info->endio_freespace_worker; - else - wq = fs_info->endio_write_workers; - - while (ordered_offset < offset + bytes) { - last_offset = ordered_offset; - if (btrfs_dec_test_first_ordered_pending(inode, &ordered, - &ordered_offset, - ordered_bytes, - uptodate)) { - btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, - NULL); - btrfs_queue_work(wq, &ordered->work); - } - - /* No ordered extent found in the range, exit */ - if (ordered_offset == last_offset) - return; - /* - * Our bio might span multiple ordered extents. In this case - * we keep going until we have accounted the whole dio. - */ - if (ordered_offset < offset + bytes) { - ordered_bytes = offset + bytes - ordered_offset; - ordered = NULL; - } - } + btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, + finish_ordered_fn, uptodate); } static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, @@ -8172,7 +8161,7 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, goto out_err_em; } ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio), - logical, submit_len, &geom); + logical, &geom); if (ret) { status = errno_to_blk_status(ret); goto out_err_em; @@ -8276,15 +8265,14 @@ int btrfs_readpage(struct file *file, struct page *page) struct btrfs_inode *inode = BTRFS_I(page->mapping->host); u64 start = page_offset(page); u64 end = start + PAGE_SIZE - 1; - unsigned long bio_flags = 0; - struct bio *bio = NULL; + struct btrfs_bio_ctrl bio_ctrl = { 0 }; int ret; btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - ret = btrfs_do_readpage(page, NULL, &bio, &bio_flags, 0, NULL); - if (bio) - ret = submit_one_bio(bio, 0, bio_flags); + ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL); + if (bio_ctrl.bio) + ret = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags); return 
ret; } @@ -8353,9 +8341,9 @@ static int btrfs_migratepage(struct address_space *mapping, if (page_has_private(page)) attach_page_private(newpage, detach_page_private(page)); - if (PagePrivate2(page)) { - ClearPagePrivate2(page); - SetPagePrivate2(newpage); + if (PageOrdered(page)) { + ClearPageOrdered(page); + SetPageOrdered(newpage); } if (mode != MIGRATE_SYNC_NO_COPY) @@ -8370,27 +8358,42 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *tree = &inode->io_tree; - struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; u64 page_start = page_offset(page); u64 page_end = page_start + PAGE_SIZE - 1; - u64 start; - u64 end; + u64 cur; int inode_evicting = inode->vfs_inode.i_state & I_FREEING; - bool found_ordered = false; - bool completed_ordered = false; /* - * we have the page locked, so new writeback can't start, - * and the dirty bit won't be cleared while we are here. + * We have page locked so no new ordered extent can be created on this + * page, nor bio can be submitted for this page. + * + * But already submitted bio can still be finished on this page. + * Furthermore, endio function won't skip page which has Ordered + * (Private2) already cleared, so it's possible for endio and + * invalidatepage to do the same ordered extent accounting twice + * on one page. * - * Wait for IO on this page so that we can safely clear - * the PagePrivate2 bit and do ordered accounting + * So here we wait for any submitted bios to finish, so that we won't + * do double ordered extent accounting on the same page. */ wait_on_page_writeback(page); - if (offset) { + /* + * For subpage case, we have call sites like + * btrfs_punch_hole_lock_range() which passes range not aligned to + * sectorsize. 
+ * If the range doesn't cover the full page, we don't need to and + * shouldn't clear page extent mapped, as page->private can still + * record subpage dirty bits for other parts of the range. + * + * For cases that can invalidate the full page even if the range doesn't + * cover the full page, like invalidating the last page, we're + * still safe to wait for ordered extent to finish. + */ + if (!(offset == 0 && length == PAGE_SIZE)) { btrfs_releasepage(page, GFP_NOFS); return; } @@ -8398,89 +8401,123 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, if (!inode_evicting) lock_extent_bits(tree, page_start, page_end, &cached_state); - start = page_start; -again: - ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1); - if (ordered) { - found_ordered = true; - end = min(page_end, - ordered->file_offset + ordered->num_bytes - 1); + cur = page_start; + while (cur < page_end) { + struct btrfs_ordered_extent *ordered; + bool delete_states; + u64 range_end; + u32 range_len; + + ordered = btrfs_lookup_first_ordered_range(inode, cur, + page_end + 1 - cur); + if (!ordered) { + range_end = page_end; + /* + * No ordered extent covering this range, we are safe + * to delete all extent states in the range. + */ + delete_states = true; + goto next; + } + if (ordered->file_offset > cur) { + /* + * There is a range between [cur, oe->file_offset) not + * covered by any ordered extent. + * We are safe to delete all extent states, and handle + * the ordered extent in the next iteration. + */ + range_end = ordered->file_offset - 1; + delete_states = true; + goto next; + } + + range_end = min(ordered->file_offset + ordered->num_bytes - 1, + page_end); + ASSERT(range_end + 1 - cur < U32_MAX); + range_len = range_end + 1 - cur; + if (!btrfs_page_test_ordered(fs_info, page, cur, range_len)) { + /* + * If Ordered (Private2) is cleared, it means endio has + * already been executed for the range. 
+ * We can't delete the extent states as + * btrfs_finish_ordered_io() may still use some of them. + */ + delete_states = false; + goto next; + } + btrfs_page_clear_ordered(fs_info, page, cur, range_len); + /* * IO on this page will never be started, so we need to account * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW * here, must leave that up for the ordered extent completion. + * + * This will also unlock the range for incoming + * btrfs_finish_ordered_io(). */ if (!inode_evicting) - clear_extent_bit(tree, start, end, + clear_extent_bit(tree, cur, range_end, EXTENT_DELALLOC | EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 0, &cached_state); + + spin_lock_irq(&inode->ordered_tree.lock); + set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); + ordered->truncated_len = min(ordered->truncated_len, + cur - ordered->file_offset); + spin_unlock_irq(&inode->ordered_tree.lock); + + if (btrfs_dec_test_ordered_pending(inode, &ordered, + cur, range_end + 1 - cur, 1)) { + btrfs_finish_ordered_io(ordered); + /* + * The ordered extent has finished, now we're again + * safe to delete all extent states of the range. + */ + delete_states = true; + } else { + /* + * btrfs_finish_ordered_io() will get executed by endio + * of other pages, thus we can't delete extent states + * anymore + */ + delete_states = false; + } +next: + if (ordered) + btrfs_put_ordered_extent(ordered); /* - * whoever cleared the private bit is responsible - * for the finish_ordered_io + * Qgroup reserved space handler + * Sector(s) here will be either: + * + * 1) Already written to disk or bio already finished + * Then its QGROUP_RESERVED bit in io_tree is already cleared. + * Qgroup will be handled by its qgroup_record then. + * btrfs_qgroup_free_data() call will do nothing here. + * + * 2) Not written to disk yet + * Then btrfs_qgroup_free_data() call will clear the + * QGROUP_RESERVED bit of its io_tree, and free the qgroup + * reserved data space. 
+ * Since the IO will never happen for this page. */ - if (TestClearPagePrivate2(page)) { - spin_lock_irq(&inode->ordered_tree.lock); - set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); - ordered->truncated_len = min(ordered->truncated_len, - start - ordered->file_offset); - spin_unlock_irq(&inode->ordered_tree.lock); - - if (btrfs_dec_test_ordered_pending(inode, &ordered, - start, - end - start + 1, 1)) { - btrfs_finish_ordered_io(ordered); - completed_ordered = true; - } - } - btrfs_put_ordered_extent(ordered); + btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur); if (!inode_evicting) { - cached_state = NULL; - lock_extent_bits(tree, start, end, - &cached_state); + clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | + EXTENT_DELALLOC | EXTENT_UPTODATE | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, + delete_states, &cached_state); } - - start = end + 1; - if (start < page_end) - goto again; + cur = range_end + 1; } - /* - * Qgroup reserved space handler - * Page here will be either - * 1) Already written to disk or ordered extent already submitted - * Then its QGROUP_RESERVED bit in io_tree is already cleaned. - * Qgroup will be handled by its qgroup_record then. - * btrfs_qgroup_free_data() call will do nothing here. - * - * 2) Not written to disk yet - * Then btrfs_qgroup_free_data() call will clear the QGROUP_RESERVED - * bit of its io_tree, and free the qgroup reserved data space. - * Since the IO will never happen for this page. + * We have iterated through all ordered extents of the page, the page + * should not have Ordered (Private2) anymore, or the above iteration + * did something wrong. */ - btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); - if (!inode_evicting) { - bool delete = true; - - /* - * If there's an ordered extent for this range and we have not - * finished it ourselves, we must leave EXTENT_DELALLOC_NEW set - * in the range for the ordered extent completion. 
We must also - * not delete the range, otherwise we would lose that bit (and - * any other bits set in the range). Make sure EXTENT_UPTODATE - * is cleared if we don't delete, otherwise it can lead to - * corruptions if the i_size is extented later. - */ - if (found_ordered && !completed_ordered) - delete = false; - clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | - EXTENT_DELALLOC | EXTENT_UPTODATE | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, - delete, &cached_state); - + ASSERT(!PageOrdered(page)); + if (!inode_evicting) __btrfs_releasepage(page, GFP_NOFS); - } - ClearPageChecked(page); clear_page_extent_mapped(page); } @@ -8626,8 +8663,8 @@ again: flush_dcache_page(page); } ClearPageChecked(page); - set_page_dirty(page); - SetPageUptodate(page); + btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start); + btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start); btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); @@ -8661,6 +8698,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) struct btrfs_trans_handle *trans; u64 mask = fs_info->sectorsize - 1; u64 min_size = btrfs_calc_metadata_size(fs_info, 1); + u64 extents_found = 0; if (!skip_writeback) { ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), @@ -8718,20 +8756,13 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) min_size, false); BUG_ON(ret); - /* - * So if we truncate and then write and fsync we normally would just - * write the extents that changed, which is a problem if we need to - * first truncate that entire inode. So set this flag so we write out - * all of the extents in the inode to the sync log so we're completely - * safe. 
- */ - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); trans->block_rsv = rsv; while (1) { ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), inode->i_size, - BTRFS_EXTENT_DATA_KEY); + BTRFS_EXTENT_DATA_KEY, + &extents_found); trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) break; @@ -8793,6 +8824,22 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) } out: btrfs_free_block_rsv(fs_info, rsv); + /* + * So if we truncate and then write and fsync we normally would just + * write the extents that changed, which is a problem if we need to + * first truncate that entire inode. So set this flag so we write out + * all of the extents in the inode to the sync log so we're completely + * safe. + * + * If no extents were dropped or trimmed we don't need to force the next + * fsync to truncate all the inode's items from the log and re-log them + * all. This means the truncate operation did not change the file size, + * or changed it to a smaller size but there was only an implicit hole + * between the old i_size and the new i_size, and there were no prealloc + * extents beyond i_size to drop. 
+ */ + if (extents_found > 0) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); return ret; } @@ -10199,17 +10246,21 @@ out: return ret; } -void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) +void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) { - struct inode *inode = tree->private_data; + struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; struct page *page; + u32 len; + ASSERT(end + 1 - start <= U32_MAX); + len = end + 1 - start; while (index <= end_index) { - page = find_get_page(inode->i_mapping, index); + page = find_get_page(inode->vfs_inode.i_mapping, index); ASSERT(page); /* Pages should be in the extent_io_tree */ - set_page_writeback(page); + + btrfs_page_set_writeback(fs_info, page, start, len); put_page(page); index++; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5dc2fd843ae3..0ba98e08a029 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -353,15 +353,55 @@ update_flags: return ret; } +/* + * Start exclusive operation @type, return true on success + */ bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type) { - return !cmpxchg(&fs_info->exclusive_operation, BTRFS_EXCLOP_NONE, type); + bool ret = false; + + spin_lock(&fs_info->super_lock); + if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) { + fs_info->exclusive_operation = type; + ret = true; + } + spin_unlock(&fs_info->super_lock); + + return ret; +} + +/* + * Conditionally allow to enter the exclusive operation in case it's compatible + * with the running one. This must be paired with btrfs_exclop_start_unlock and + * btrfs_exclop_finish. 
+ * + * Compatibility: + * - the same type is already running + * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller + * must check the condition first that would allow none -> @type + */ +bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type) +{ + spin_lock(&fs_info->super_lock); + if (fs_info->exclusive_operation == type) + return true; + + spin_unlock(&fs_info->super_lock); + return false; +} + +void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info) +{ + spin_unlock(&fs_info->super_lock); } void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) { + spin_lock(&fs_info->super_lock); WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE); + spin_unlock(&fs_info->super_lock); sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); } @@ -1455,7 +1495,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (btrfs_defrag_cancelled(fs_info)) { btrfs_debug(fs_info, "defrag_file cancelled"); ret = -EAGAIN; - break; + goto error; } if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT, @@ -1533,6 +1573,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, } } + ret = defrag_count; +error: if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) { filemap_flush(inode->i_mapping); if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, @@ -1546,8 +1588,6 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); } - ret = defrag_count; - out_ra: if (do_compress) { btrfs_inode_lock(inode, 0); @@ -1560,6 +1600,48 @@ out_ra: return ret; } +/* + * Try to start exclusive operation @type or cancel it if it's running. 
+ * + * Return: + * 0 - normal mode, newly claimed op started + * >0 - normal mode, something else is running, + * return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS to user space + * ECANCELED - cancel mode, successful cancel + * ENOTCONN - cancel mode, operation not running anymore + */ +static int exclop_start_or_cancel_reloc(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type, bool cancel) +{ + if (!cancel) { + /* Start normal op */ + if (!btrfs_exclop_start(fs_info, type)) + return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + /* Exclusive operation is now claimed */ + return 0; + } + + /* Cancel running op */ + if (btrfs_exclop_start_try_lock(fs_info, type)) { + /* + * This blocks any exclop finish from setting it to NONE, so we + * request cancellation. Either it runs and we will wait for it, + * or it has finished and no waiting will happen. + */ + atomic_inc(&fs_info->reloc_cancel_req); + btrfs_exclop_start_unlock(fs_info); + + if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) + wait_on_bit(&fs_info->flags, BTRFS_FS_RELOC_RUNNING, + TASK_INTERRUPTIBLE); + + return -ECANCELED; + } + + /* Something else is running or none */ + return -ENOTCONN; +} + static noinline int btrfs_ioctl_resize(struct file *file, void __user *arg) { @@ -1577,6 +1659,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, char *devstr = NULL; int ret = 0; int mod = 0; + bool cancel; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1585,20 +1668,23 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (ret) return ret; - if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_RESIZE)) { - mnt_drop_write_file(file); - return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; - } - + /* + * Read the arguments before checking exclusivity to be able to + * distinguish regular resize and cancel + */ vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); - goto out; + goto out_drop; } - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - sizestr = 
vol_args->name; + cancel = (strcmp("cancel", sizestr) == 0); + ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_RESIZE, cancel); + if (ret) + goto out_free; + /* Exclusive operation is now claimed */ + devstr = strchr(sizestr, ':'); if (devstr) { sizestr = devstr + 1; @@ -1606,10 +1692,10 @@ static noinline int btrfs_ioctl_resize(struct file *file, devstr = vol_args->name; ret = kstrtoull(devstr, 10, &devid); if (ret) - goto out_free; + goto out_finish; if (!devid) { ret = -EINVAL; - goto out_free; + goto out_finish; } btrfs_info(fs_info, "resizing devid %llu", devid); } @@ -1619,7 +1705,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, btrfs_info(fs_info, "resizer unable to find device %llu", devid); ret = -ENODEV; - goto out_free; + goto out_finish; } if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { @@ -1627,7 +1713,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, "resizer unable to apply on readonly device %llu", devid); ret = -EPERM; - goto out_free; + goto out_finish; } if (!strcmp(sizestr, "max")) @@ -1643,13 +1729,13 @@ static noinline int btrfs_ioctl_resize(struct file *file, new_size = memparse(sizestr, &retptr); if (*retptr != '\0' || new_size == 0) { ret = -EINVAL; - goto out_free; + goto out_finish; } } if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { ret = -EPERM; - goto out_free; + goto out_finish; } old_size = btrfs_device_get_total_bytes(device); @@ -1657,24 +1743,24 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (mod < 0) { if (new_size > old_size) { ret = -EINVAL; - goto out_free; + goto out_finish; } new_size = old_size - new_size; } else if (mod > 0) { if (new_size > ULLONG_MAX - old_size) { ret = -ERANGE; - goto out_free; + goto out_finish; } new_size = old_size + new_size; } if (new_size < SZ_256M) { ret = -EINVAL; - goto out_free; + goto out_finish; } if (new_size > device->bdev->bd_inode->i_size) { ret = -EFBIG; - goto out_free; + goto out_finish; } new_size = 
round_down(new_size, fs_info->sectorsize); @@ -1683,7 +1769,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - goto out_free; + goto out_finish; } ret = btrfs_grow_device(trans, device, new_size); btrfs_commit_transaction(trans); @@ -1696,10 +1782,11 @@ static noinline int btrfs_ioctl_resize(struct file *file, "resize device %s (devid %llu) from %llu to %llu", rcu_str_deref(device->name), device->devid, old_size, new_size); +out_finish: + btrfs_exclop_finish(fs_info); out_free: kfree(vol_args); -out: - btrfs_exclop_finish(fs_info); +out_drop: mnt_drop_write_file(file); return ret; } @@ -2897,7 +2984,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, err = PTR_ERR(subvol_name_ptr); goto free_parent; } - /* subvol_name_ptr is already NULL termined */ + /* subvol_name_ptr is already nul terminated */ subvol_name = (char *)kbasename(subvol_name_ptr); } } else { @@ -3119,6 +3206,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args_v2 *vol_args; int ret; + bool cancel = false; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3137,18 +3225,22 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) ret = -EOPNOTSUPP; goto out; } + vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + if (!(vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) && + strcmp("cancel", vol_args->name) == 0) + cancel = true; - if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) { - ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, + cancel); + if (ret) goto out; - } + /* Exclusive operation is now claimed */ - if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) { + if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) ret = btrfs_rm_device(fs_info, NULL, vol_args->devid); - } else { - 
vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + else ret = btrfs_rm_device(fs_info, vol_args->name, 0); - } + btrfs_exclop_finish(fs_info); if (!ret) { @@ -3172,6 +3264,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args *vol_args; int ret; + bool cancel; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3180,25 +3273,24 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (ret) return ret; - if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) { - ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; - goto out_drop_write; - } - vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); - goto out; + goto out_drop_write; } - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - ret = btrfs_rm_device(fs_info, vol_args->name, 0); + cancel = (strcmp("cancel", vol_args->name) == 0); + + ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, + cancel); + if (ret == 0) { + ret = btrfs_rm_device(fs_info, vol_args->name, 0); + if (!ret) + btrfs_info(fs_info, "disk deleted %s", vol_args->name); + btrfs_exclop_finish(fs_info); + } - if (!ret) - btrfs_info(fs_info, "disk deleted %s", vol_args->name); kfree(vol_args); -out: - btrfs_exclop_finish(fs_info); out_drop_write: mnt_drop_write_file(file); @@ -3551,7 +3643,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, goto out; } transid = trans->transid; - ret = btrfs_commit_transaction_async(trans, 0); + ret = btrfs_commit_transaction_async(trans); if (ret) { btrfs_end_transaction(trans); return ret; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 5fafc5e89bb7..313d9d685adb 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -57,7 +57,7 @@ void btrfs_tree_read_lock(struct extent_buffer *eb) /* * Try-lock for read. 
* - * Retrun 1 if the rwlock has been taken, 0 otherwise + * Return 1 if the rwlock has been taken, 0 otherwise */ int btrfs_try_tree_read_lock(struct extent_buffer *eb) { @@ -72,7 +72,7 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb) /* * Try-lock for write. * - * Retrun 1 if the rwlock has been taken, 0 otherwise + * Return 1 if the rwlock has been taken, 0 otherwise */ int btrfs_try_tree_write_lock(struct extent_buffer *eb) { diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 6c413bb451a3..6eb41b7c0c84 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -16,6 +16,7 @@ #include "compression.h" #include "delalloc-space.h" #include "qgroup.h" +#include "subpage.h" static struct kmem_cache *btrfs_ordered_extent_cache; @@ -300,81 +301,142 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, } /* - * Finish IO for one ordered extent across a given range. The range can - * contain several ordered extents. + * Mark all ordered extents io inside the specified range finished. * - * @found_ret: Return the finished ordered extent - * @file_offset: File offset for the finished IO - * Will also be updated to one byte past the range that is - * recordered as finished. This allows caller to walk forward. - * @io_size: Length of the finish IO range - * @uptodate: If the IO finished without problem - * - * Return true if any ordered extent is finished in the range, and update - * @found_ret and @file_offset. - * Return false otherwise. + * @page: The involved page for the operation. + * For uncompressed buffered IO, the page status also needs to be + * updated to indicate whether the pending ordered io is finished. + * Can be NULL for direct IO and compressed write. + * For these cases, callers are ensured they won't execute the + * endio function twice. + * @finish_func: The function to be executed when all the IO of an ordered + * extent are finished. 
* - * NOTE: Although The range can cross multiple ordered extents, only one - * ordered extent will be updated during one call. The caller is responsible to - * iterate all ordered extents in the range. + * This function is called for endio, thus the range must have ordered + * extent(s) covering it. */ -bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, - struct btrfs_ordered_extent **finished_ret, - u64 *file_offset, u64 io_size, int uptodate) +void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, + struct page *page, u64 file_offset, + u64 num_bytes, btrfs_func_t finish_func, + bool uptodate) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_workqueue *wq; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - bool finished = false; unsigned long flags; - u64 dec_end; - u64 dec_start; - u64 to_dec; + u64 cur = file_offset; + + if (btrfs_is_free_space_inode(inode)) + wq = fs_info->endio_freespace_worker; + else + wq = fs_info->endio_write_workers; + + if (page) + ASSERT(page->mapping && page_offset(page) <= file_offset && + file_offset + num_bytes <= page_offset(page) + PAGE_SIZE); spin_lock_irqsave(&tree->lock, flags); - node = tree_search(tree, *file_offset); - if (!node) - goto out; + while (cur < file_offset + num_bytes) { + u64 entry_end; + u64 end; + u32 len; - entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (!in_range(*file_offset, entry->file_offset, entry->num_bytes)) - goto out; + node = tree_search(tree, cur); + /* No ordered extents at all */ + if (!node) + break; - dec_start = max(*file_offset, entry->file_offset); - dec_end = min(*file_offset + io_size, - entry->file_offset + entry->num_bytes); - *file_offset = dec_end; - if (dec_start > dec_end) { - btrfs_crit(fs_info, "bad ordering dec_start %llu end %llu", - dec_start, dec_end); - } - to_dec = dec_end - 
dec_start; - if (to_dec > entry->bytes_left) { - btrfs_crit(fs_info, - "bad ordered accounting left %llu size %llu", - entry->bytes_left, to_dec); - } - entry->bytes_left -= to_dec; - if (!uptodate) - set_bit(BTRFS_ORDERED_IOERR, &entry->flags); + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + entry_end = entry->file_offset + entry->num_bytes; + /* + * |<-- OE --->| | + * cur + * Go to next OE. + */ + if (cur >= entry_end) { + node = rb_next(node); + /* No more ordered extents, exit */ + if (!node) + break; + entry = rb_entry(node, struct btrfs_ordered_extent, + rb_node); + + /* Go to next ordered extent and continue */ + cur = entry->file_offset; + continue; + } + /* + * | |<--- OE --->| + * cur + * Go to the start of OE. + */ + if (cur < entry->file_offset) { + cur = entry->file_offset; + continue; + } - if (entry->bytes_left == 0) { /* - * Ensure only one caller can set the flag and finished_ret - * accordingly + * Now we are definitely inside one ordered extent. + * + * |<--- OE --->| + * | + * cur */ - finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); - /* test_and_set_bit implies a barrier */ - cond_wake_up_nomb(&entry->wait); - } -out: - if (finished && finished_ret && entry) { - *finished_ret = entry; - refcount_inc(&entry->refs); + end = min(entry->file_offset + entry->num_bytes, + file_offset + num_bytes) - 1; + ASSERT(end + 1 - cur < U32_MAX); + len = end + 1 - cur; + + if (page) { + /* + * Ordered (Private2) bit indicates whether we still + * have pending io unfinished for the ordered extent. + * + * If there's no such bit, we need to skip to next range. 
+ */ + if (!btrfs_page_test_ordered(fs_info, page, cur, len)) { + cur += len; + continue; + } + btrfs_page_clear_ordered(fs_info, page, cur, len); + } + + /* Now we're fine to update the accounting */ + if (unlikely(len > entry->bytes_left)) { + WARN_ON(1); + btrfs_crit(fs_info, +"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%u left=%llu", + inode->root->root_key.objectid, + btrfs_ino(inode), + entry->file_offset, + entry->num_bytes, + len, entry->bytes_left); + entry->bytes_left = 0; + } else { + entry->bytes_left -= len; + } + + if (!uptodate) + set_bit(BTRFS_ORDERED_IOERR, &entry->flags); + + /* + * All the IO of the ordered extent is finished, we need to queue + * the finish_func to be executed. + */ + if (entry->bytes_left == 0) { + set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + cond_wake_up(&entry->wait); + refcount_inc(&entry->refs); + spin_unlock_irqrestore(&tree->lock, flags); + btrfs_init_work(&entry->work, finish_func, NULL, NULL); + btrfs_queue_work(wq, &entry->work); + spin_lock_irqsave(&tree->lock, flags); + } + cur += len; } spin_unlock_irqrestore(&tree->lock, flags); - return finished; } /* @@ -870,6 +932,81 @@ out: } /* + * Lookup the first ordered extent that overlaps the range + * [@file_offset, @file_offset + @len). + * + * The difference between this and btrfs_lookup_first_ordered_extent() is + * that this one won't return any ordered extent that does not overlap the range. + * And the difference against btrfs_lookup_ordered_extent() is, this function + * ensures the first ordered extent gets returned. 
+ */ +struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range( + struct btrfs_inode *inode, u64 file_offset, u64 len) +{ + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; + struct rb_node *node; + struct rb_node *cur; + struct rb_node *prev; + struct rb_node *next; + struct btrfs_ordered_extent *entry = NULL; + + spin_lock_irq(&tree->lock); + node = tree->tree.rb_node; + /* + * Here we don't want to use tree_search() which will use tree->last + * and screw up the search order. + * And __tree_search() can't return the adjacent ordered extents + * either, thus here we do our own search. + */ + while (node) { + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + + if (file_offset < entry->file_offset) { + node = node->rb_left; + } else if (file_offset >= entry_end(entry)) { + node = node->rb_right; + } else { + /* + * Direct hit, got an ordered extent that starts at + * @file_offset + */ + goto out; + } + } + if (!entry) { + /* Empty tree */ + goto out; + } + + cur = &entry->rb_node; + /* We got an entry around @file_offset, check adjacent entries */ + if (entry->file_offset < file_offset) { + prev = cur; + next = rb_next(cur); + } else { + prev = rb_prev(cur); + next = cur; + } + if (prev) { + entry = rb_entry(prev, struct btrfs_ordered_extent, rb_node); + if (range_overlaps(entry, file_offset, len)) + goto out; + } + if (next) { + entry = rb_entry(next, struct btrfs_ordered_extent, rb_node); + if (range_overlaps(entry, file_offset, len)) + goto out; + } + /* No ordered extent in the range */ + entry = NULL; +out: + if (entry) + refcount_inc(&entry->refs); + spin_unlock_irq(&tree->lock); + return entry; +} + +/* * btrfs_flush_ordered_range - Lock the passed range and ensures all pending * ordered extents in it are run to completion. 
* diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index e60c07f36427..566472004edd 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -172,13 +172,13 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry); +void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, + struct page *page, u64 file_offset, + u64 num_bytes, btrfs_func_t finish_func, + bool uptodate); bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size, int uptodate); -bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, - struct btrfs_ordered_extent **finished_ret, - u64 *file_offset, u64 io_size, - int uptodate); int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type); @@ -196,6 +196,8 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait); int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); +struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range( + struct btrfs_inode *inode, u64 file_offset, u64 len); struct btrfs_ordered_extent *btrfs_lookup_ordered_range( struct btrfs_inode *inode, u64 file_offset, diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 2dcb1cb21634..b1cb5a8c2999 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -260,6 +260,10 @@ static int prop_compression_validate(const char *value, size_t len) if (btrfs_compress_is_valid_type(value, len)) return 0; + if ((len == 2 && strncmp("no", value, 2) == 0) || + (len == 4 && strncmp("none", value, 4) == 0)) + return 0; + return -EINVAL; } @@ -269,7 +273,17 @@ static int 
prop_compression_apply(struct inode *inode, const char *value, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int type; + /* Reset to defaults */ if (len == 0) { + BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE; + return 0; + } + + /* Set NOCOMPRESS flag */ + if ((len == 2 && strncmp("no", value, 2) == 0) || + (len == 4 && strncmp("none", value, 4) == 0)) { BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE; @@ -348,7 +362,7 @@ static int inherit_props(struct btrfs_trans_handle *trans, /* * This is not strictly necessary as the property should be - * valid, but in case it isn't, don't propagate it futher. + * valid, but in case it isn't, don't propagate it further. */ ret = h->validate(value, strlen(value)); if (ret) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 3ded812f522c..07ec06d4e972 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2521,7 +2521,7 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, int ret = 0; /* - * If quotas get disabled meanwhile, the resouces need to be freed and + * If quotas get disabled meanwhile, the resources need to be freed and * we can't just exit here. */ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) @@ -3545,13 +3545,7 @@ static int try_flush_qgroup(struct btrfs_root *root) struct btrfs_trans_handle *trans; int ret; - /* - * Can't hold an open transaction or we run the risk of deadlocking, - * and can't either be under the context of a send operation (where - * current->journal_info is set to BTRFS_SEND_TRANS_STUB), as that - * would result in a crash when starting a transaction and does not - * make sense either (send is a read-only operation). - */ + /* Can't hold an open transaction or we run the risk of deadlocking. 
*/ ASSERT(current->journal_info == NULL); if (WARN_ON(current->journal_info)) return 0; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 9178da07cc9c..9b0814318e72 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -7,6 +7,7 @@ #include "delalloc-space.h" #include "reflink.h" #include "transaction.h" +#include "subpage.h" #define BTRFS_MAX_DEDUPE_LEN SZ_16M @@ -52,7 +53,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode, const u64 datal, const u8 comp_type) { - const u64 block_size = btrfs_inode_sectorsize(inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u32 block_size = fs_info->sectorsize; const u64 range_end = file_offset + block_size - 1; const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0); char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0); @@ -106,10 +108,12 @@ static int copy_inline_to_page(struct btrfs_inode *inode, set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags); if (comp_type == BTRFS_COMPRESS_NONE) { - memcpy_to_page(page, 0, data_start, datal); + memcpy_to_page(page, offset_in_page(file_offset), data_start, + datal); flush_dcache_page(page); } else { - ret = btrfs_decompress(comp_type, data_start, page, 0, + ret = btrfs_decompress(comp_type, data_start, page, + offset_in_page(file_offset), inline_size, datal); if (ret) goto out_unlock; @@ -133,9 +137,9 @@ static int copy_inline_to_page(struct btrfs_inode *inode, flush_dcache_page(page); } - SetPageUptodate(page); + btrfs_page_set_uptodate(fs_info, page, file_offset, block_size); ClearPageChecked(page); - set_page_dirty(page); + btrfs_page_set_dirty(fs_info, page, file_offset, block_size); out_unlock: if (page) { unlock_page(page); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b70be2ac2e9e..fc831597cb22 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2876,11 +2876,12 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end, } /* - * Allow error 
injection to test balance cancellation + * Allow error injection to test balance/relocation cancellation */ noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) { return atomic_read(&fs_info->balance_cancel_req) || + atomic_read(&fs_info->reloc_cancel_req) || fatal_signal_pending(current); } ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE); @@ -3780,6 +3781,60 @@ out: return inode; } +/* + * Mark start of chunk relocation that is cancellable. Check if the cancellation + * has been requested meanwhile and don't start in that case. + * + * Return: + * 0 success + * -EINPROGRESS operation is already in progress, that's probably a bug + * -ECANCELED cancellation request was set before the operation started + * -EAGAIN can not start because there are ongoing send operations + */ +static int reloc_chunk_start(struct btrfs_fs_info *fs_info) +{ + spin_lock(&fs_info->send_reloc_lock); + if (fs_info->send_in_progress) { + btrfs_warn_rl(fs_info, +"cannot run relocation while send operations are in progress (%d in progress)", + fs_info->send_in_progress); + spin_unlock(&fs_info->send_reloc_lock); + return -EAGAIN; + } + if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { + /* This should not happen */ + spin_unlock(&fs_info->send_reloc_lock); + btrfs_err(fs_info, "reloc already running, cannot start"); + return -EINPROGRESS; + } + spin_unlock(&fs_info->send_reloc_lock); + + if (atomic_read(&fs_info->reloc_cancel_req) > 0) { + btrfs_info(fs_info, "chunk relocation canceled on start"); + /* + * On cancel, clear all requests but let the caller mark + * the end after cleanup operations. + */ + atomic_set(&fs_info->reloc_cancel_req, 0); + return -ECANCELED; + } + return 0; +} + +/* + * Mark end of chunk relocation that is cancellable and wake any waiters. 
+ */ +static void reloc_chunk_end(struct btrfs_fs_info *fs_info) +{ + /* Requested after start, clear bit first so any waiters can continue */ + if (atomic_read(&fs_info->reloc_cancel_req) > 0) + btrfs_info(fs_info, "chunk relocation canceled during operation"); + spin_lock(&fs_info->send_reloc_lock); + clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags); + spin_unlock(&fs_info->send_reloc_lock); + atomic_set(&fs_info->reloc_cancel_req, 0); +} + static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) { struct reloc_control *rc; @@ -3862,6 +3917,12 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) return -ENOMEM; } + ret = reloc_chunk_start(fs_info); + if (ret < 0) { + err = ret; + goto out_put_bg; + } + rc->extent_root = extent_root; rc->block_group = bg; @@ -3952,7 +4013,9 @@ out: if (err && rw) btrfs_dec_block_group_ro(rc->block_group); iput(rc->data_inode); - btrfs_put_block_group(rc->block_group); +out_put_bg: + btrfs_put_block_group(bg); + reloc_chunk_end(fs_info); free_reloc_control(rc); return err; } @@ -4073,6 +4136,12 @@ int btrfs_recover_relocation(struct btrfs_root *root) goto out; } + ret = reloc_chunk_start(fs_info); + if (ret < 0) { + err = ret; + goto out_end; + } + rc->extent_root = fs_info->extent_root; set_reloc_control(rc); @@ -4137,6 +4206,8 @@ out_clean: err = ret; out_unset: unset_reloc_control(rc); +out_end: + reloc_chunk_end(fs_info); free_reloc_control(rc); out: free_reloc_roots(&reloc_roots); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 485cda3eb8d7..088641ba7a8e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -165,6 +165,10 @@ struct scrub_ctx { int readonly; int pages_per_rd_bio; + /* State of IO submission throttling affecting the associated device */ + ktime_t throttle_deadline; + u64 throttle_sent; + int is_dev_replace; u64 write_pointer; @@ -605,6 +609,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( spin_lock_init(&sctx->list_lock); 
spin_lock_init(&sctx->stat_lock); init_waitqueue_head(&sctx->list_wait); + sctx->throttle_deadline = 0; WARN_ON(sctx->wr_curr_bio != NULL); mutex_init(&sctx->wr_lock); @@ -626,7 +631,6 @@ nomem: static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *warn_ctx) { - u64 isize; u32 nlink; int ret; int i; @@ -662,7 +666,6 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, eb = swarn->path->nodes[0]; inode_item = btrfs_item_ptr(eb, swarn->path->slots[0], struct btrfs_inode_item); - isize = btrfs_inode_size(eb, inode_item); nlink = btrfs_inode_nlink(eb, inode_item); btrfs_release_path(swarn->path); @@ -691,12 +694,12 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) btrfs_warn_in_rcu(fs_info, -"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)", +"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), swarn->physical, root, inum, offset, - min(isize - offset, (u64)PAGE_SIZE), nlink, + fs_info->sectorsize, nlink, (char *)(unsigned long)ipath->fspath->val[i]); btrfs_put_root(local_root); @@ -885,25 +888,25 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * read all mirrors one after the other. This includes to * re-read the extent or metadata block that failed (that was * the cause that this fixup code is called) another time, - * page by page this time in order to know which pages + * sector by sector this time in order to know which sectors * caused I/O errors and which ones are good (for all mirrors). * It is the goal to handle the situation when more than one * mirror contains I/O errors, but the errors do not * overlap, i.e. the data can be repaired by selecting the - * pages from those mirrors without I/O error on the - * particular pages. 
One example (with blocks >= 2 * PAGE_SIZE) - * would be that mirror #1 has an I/O error on the first page, - * the second page is good, and mirror #2 has an I/O error on - * the second page, but the first page is good. - * Then the first page of the first mirror can be repaired by - * taking the first page of the second mirror, and the - * second page of the second mirror can be repaired by - * copying the contents of the 2nd page of the 1st mirror. - * One more note: if the pages of one mirror contain I/O + * sectors from those mirrors without I/O error on the + * particular sectors. One example (with blocks >= 2 * sectorsize) + * would be that mirror #1 has an I/O error on the first sector, + * the second sector is good, and mirror #2 has an I/O error on + * the second sector, but the first sector is good. + * Then the first sector of the first mirror can be repaired by + * taking the first sector of the second mirror, and the + * second sector of the second mirror can be repaired by + * copying the contents of the 2nd sector of the 1st mirror. + * One more note: if the sectors of one mirror contain I/O * errors, the checksum cannot be verified. In order to get * the best data for repairing, the first attempt is to find * a mirror without I/O errors and with a validated checksum. - * Only if this is not possible, the pages are picked from + * Only if this is not possible, the sectors are picked from * mirrors with I/O errors without considering the checksum. * If the latter is the case, at the end, the checksum of the * repaired area is verified in order to correctly maintain @@ -1060,26 +1063,26 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) /* * In case of I/O errors in the area that is supposed to be - * repaired, continue by picking good copies of those pages. - * Select the good pages from mirrors to rewrite bad pages from + * repaired, continue by picking good copies of those sectors. 
+ * Select the good sectors from mirrors to rewrite bad sectors from * the area to fix. Afterwards verify the checksum of the block * that is supposed to be repaired. This verification step is * only done for the purpose of statistic counting and for the * final scrub report, whether errors remain. * A perfect algorithm could make use of the checksum and try - * all possible combinations of pages from the different mirrors + * all possible combinations of sectors from the different mirrors * until the checksum verification succeeds. For example, when - * the 2nd page of mirror #1 faces I/O errors, and the 2nd page + * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector * of mirror #2 is readable but the final checksum test fails, - * then the 2nd page of mirror #3 could be tried, whether now + * then the 2nd sector of mirror #3 could be tried, whether now * the final checksum succeeds. But this would be a rare * exception and is therefore not implemented. At least it is * avoided that the good copy is overwritten. * A more useful improvement would be to pick the sectors * without I/O error based on sector sizes (512 bytes on legacy - * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one + * disks) instead of on sectorsize. Then maybe 512 byte of one * mirror could be repaired by taking 512 byte of a different - * mirror, even if other 512 byte sectors in the same PAGE_SIZE + * mirror, even if other 512 byte sectors in the same sectorsize * area are unreadable. 
*/ success = 1; @@ -1260,7 +1263,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, { struct scrub_ctx *sctx = original_sblock->sctx; struct btrfs_fs_info *fs_info = sctx->fs_info; - u64 length = original_sblock->page_count * PAGE_SIZE; + u64 length = original_sblock->page_count * fs_info->sectorsize; u64 logical = original_sblock->pagev[0]->logical; u64 generation = original_sblock->pagev[0]->generation; u64 flags = original_sblock->pagev[0]->flags; @@ -1283,13 +1286,13 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, */ while (length > 0) { - sublen = min_t(u64, length, PAGE_SIZE); + sublen = min_t(u64, length, fs_info->sectorsize); mapped_length = sublen; bbio = NULL; /* - * with a length of PAGE_SIZE, each returned stripe - * represents one mirror + * With a length of sectorsize, each returned stripe represents + * one mirror */ btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, @@ -1480,7 +1483,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, bio = btrfs_io_bio_alloc(1); bio_set_dev(bio, spage->dev->bdev); - bio_add_page(bio, spage->page, PAGE_SIZE, 0); + bio_add_page(bio, spage->page, fs_info->sectorsize, 0); bio->bi_iter.bi_sector = spage->physical >> 9; bio->bi_opf = REQ_OP_READ; @@ -1544,6 +1547,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, struct scrub_page *spage_bad = sblock_bad->pagev[page_num]; struct scrub_page *spage_good = sblock_good->pagev[page_num]; struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info; + const u32 sectorsize = fs_info->sectorsize; BUG_ON(spage_bad->page == NULL); BUG_ON(spage_good->page == NULL); @@ -1563,8 +1567,8 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, bio->bi_iter.bi_sector = spage_bad->physical >> 9; bio->bi_opf = REQ_OP_WRITE; - ret = bio_add_page(bio, spage_good->page, PAGE_SIZE, 0); - if (PAGE_SIZE != ret) { + ret = 
bio_add_page(bio, spage_good->page, sectorsize, 0); + if (ret != sectorsize) { bio_put(bio); return -EIO; } @@ -1642,6 +1646,7 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, { struct scrub_bio *sbio; int ret; + const u32 sectorsize = sctx->fs_info->sectorsize; mutex_lock(&sctx->wr_lock); again: @@ -1681,16 +1686,16 @@ again: bio->bi_iter.bi_sector = sbio->physical >> 9; bio->bi_opf = REQ_OP_WRITE; sbio->status = 0; - } else if (sbio->physical + sbio->page_count * PAGE_SIZE != + } else if (sbio->physical + sbio->page_count * sectorsize != spage->physical_for_dev_replace || - sbio->logical + sbio->page_count * PAGE_SIZE != + sbio->logical + sbio->page_count * sectorsize != spage->logical) { scrub_wr_submit(sctx); goto again; } - ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); - if (ret != PAGE_SIZE) { + ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0); + if (ret != sectorsize) { if (sbio->page_count < 1) { bio_put(sbio->bio); sbio->bio = NULL; @@ -1729,7 +1734,8 @@ static void scrub_wr_submit(struct scrub_ctx *sctx) btrfsic_submit_bio(sbio->bio); if (btrfs_is_zoned(sctx->fs_info)) - sctx->write_pointer = sbio->physical + sbio->page_count * PAGE_SIZE; + sctx->write_pointer = sbio->physical + sbio->page_count * + sctx->fs_info->sectorsize; } static void scrub_wr_bio_end_io(struct bio *bio) @@ -1988,6 +1994,65 @@ static void scrub_page_put(struct scrub_page *spage) } } +/* + * Throttling of IO submission, bandwidth-limit based, the timeslice is 1 + * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max. 
+ */ +static void scrub_throttle(struct scrub_ctx *sctx) +{ + const int time_slice = 1000; + struct scrub_bio *sbio; + struct btrfs_device *device; + s64 delta; + ktime_t now; + u32 div; + u64 bwlimit; + + sbio = sctx->bios[sctx->curr]; + device = sbio->dev; + bwlimit = READ_ONCE(device->scrub_speed_max); + if (bwlimit == 0) + return; + + /* + * Slice is divided into intervals when the IO is submitted, adjust by + * bwlimit and maximum of 64 intervals. + */ + div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024))); + div = min_t(u32, 64, div); + + /* Start new epoch, set deadline */ + now = ktime_get(); + if (sctx->throttle_deadline == 0) { + sctx->throttle_deadline = ktime_add_ms(now, time_slice / div); + sctx->throttle_sent = 0; + } + + /* Still in the time to send? */ + if (ktime_before(now, sctx->throttle_deadline)) { + /* If current bio is within the limit, send it */ + sctx->throttle_sent += sbio->bio->bi_iter.bi_size; + if (sctx->throttle_sent <= div_u64(bwlimit, div)) + return; + + /* We're over the limit, sleep until the rest of the slice */ + delta = ktime_ms_delta(sctx->throttle_deadline, now); + } else { + /* New request after deadline, start new epoch */ + delta = 0; + } + + if (delta) { + long timeout; + + timeout = div_u64(delta * HZ, 1000); + schedule_timeout_interruptible(timeout); + } + + /* Next call will start the deadline period */ + sctx->throttle_deadline = 0; +} + static void scrub_submit(struct scrub_ctx *sctx) { struct scrub_bio *sbio; @@ -1995,6 +2060,8 @@ static void scrub_submit(struct scrub_ctx *sctx) if (sctx->curr == -1) return; + scrub_throttle(sctx); + sbio = sctx->bios[sctx->curr]; sctx->curr = -1; scrub_pending_bio_inc(sctx); @@ -2006,6 +2073,7 @@ static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, { struct scrub_block *sblock = spage->sblock; struct scrub_bio *sbio; + const u32 sectorsize = sctx->fs_info->sectorsize; int ret; again: @@ -2044,9 +2112,9 @@ again: bio->bi_iter.bi_sector = sbio->physical >> 9; bio->bi_opf = 
REQ_OP_READ; sbio->status = 0; - } else if (sbio->physical + sbio->page_count * PAGE_SIZE != + } else if (sbio->physical + sbio->page_count * sectorsize != spage->physical || - sbio->logical + sbio->page_count * PAGE_SIZE != + sbio->logical + sbio->page_count * sectorsize != spage->logical || sbio->dev != spage->dev) { scrub_submit(sctx); @@ -2054,8 +2122,8 @@ again: } sbio->pagev[sbio->page_count] = spage; - ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); - if (ret != PAGE_SIZE) { + ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0); + if (ret != sectorsize) { if (sbio->page_count < 1) { bio_put(sbio->bio); sbio->bio = NULL; @@ -2398,7 +2466,7 @@ static void scrub_block_complete(struct scrub_block *sblock) if (sblock->sparity && corrupted && !sblock->data_corrected) { u64 start = sblock->pagev[0]->logical; u64 end = sblock->pagev[sblock->page_count - 1]->logical + - PAGE_SIZE; + sblock->sctx->fs_info->sectorsize; ASSERT(end - start <= U32_MAX); scrub_parity_mark_sectors_error(sblock->sparity, @@ -2418,7 +2486,7 @@ static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *su * the csum into @csum. * * The search source is sctx->csum_list, which is a pre-populated list - * storing bytenr ordered csum ranges. We're reponsible to cleanup any range + * storing bytenr ordered csum ranges. We're responsible to cleanup any range * that is before @logical. * * Return 0 if there is no csum for the range. 
@@ -3138,28 +3206,23 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, physical = map->stripes[num].physical; offset = 0; nstripes = div64_u64(length, map->stripe_len); + mirror_num = 1; + increment = map->stripe_len; if (map->type & BTRFS_BLOCK_GROUP_RAID0) { offset = map->stripe_len * num; increment = map->stripe_len * map->num_stripes; - mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { int factor = map->num_stripes / map->sub_stripes; offset = map->stripe_len * (num / map->sub_stripes); increment = map->stripe_len * factor; mirror_num = num % map->sub_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { - increment = map->stripe_len; mirror_num = num % map->num_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - increment = map->stripe_len; mirror_num = num % map->num_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { get_raid56_logic_offset(physical, num, map, &offset, NULL); increment = map->stripe_len * nr_data_stripes(map); - mirror_num = 1; - } else { - increment = map->stripe_len; - mirror_num = 1; } path = btrfs_alloc_path(); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index bd69db72acc5..6ac37ae6c811 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2078,16 +2078,6 @@ static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, } /* - * Removes the entry from the list and adds it back to the end. This marks the - * entry as recently used so that name_cache_clean_unused does not remove it. - */ -static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce) -{ - list_del(&nce->list); - list_add_tail(&nce->list, &sctx->name_cache_list); -} - -/* * Remove some entries from the beginning of name_cache_list. 
*/ static void name_cache_clean_unused(struct send_ctx *sctx) @@ -2147,7 +2137,13 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, kfree(nce); nce = NULL; } else { - name_cache_used(sctx, nce); + /* + * Removes the entry from the list and adds it back to + * the end. This marks the entry as recently used so + * that name_cache_clean_unused does not remove it. + */ + list_move_tail(&nce->list, &sctx->name_cache_list); + *parent_ino = nce->parent_ino; *parent_gen = nce->parent_gen; ret = fs_path_add(dest, nce->name, nce->name_len); @@ -4064,6 +4060,17 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) if (ret < 0) goto out; } else { + /* + * If we previously orphanized a directory that + * collided with a new reference that we already + * processed, recompute the current path because + * that directory may be part of the path. + */ + if (orphanized_dir) { + ret = refresh_ref_path(sctx, cur); + if (ret < 0) + goto out; + } ret = send_unlink(sctx, cur->full_path); if (ret < 0) goto out; @@ -6507,7 +6514,7 @@ static int changed_extent(struct send_ctx *sctx, * updates the inode item, but it only changes the iversion (sequence * field in the inode item) of the inode, so if a file is deduplicated * the same amount of times in both the parent and send snapshots, its - * iversion becames the same in both snapshots, whence the inode item is + * iversion becomes the same in both snapshots, whence the inode item is * the same on both snapshots. 
*/ if (sctx->cur_ino != sctx->cmp_key->objectid) @@ -7409,23 +7416,21 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) if (ret) goto out; - mutex_lock(&fs_info->balance_mutex); - if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { - mutex_unlock(&fs_info->balance_mutex); + spin_lock(&fs_info->send_reloc_lock); + if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { + spin_unlock(&fs_info->send_reloc_lock); btrfs_warn_rl(fs_info, - "cannot run send because a balance operation is in progress"); + "cannot run send because a relocation operation is in progress"); ret = -EAGAIN; goto out; } fs_info->send_in_progress++; - mutex_unlock(&fs_info->balance_mutex); + spin_unlock(&fs_info->send_reloc_lock); - current->journal_info = BTRFS_SEND_TRANS_STUB; ret = send_subvol(sctx); - current->journal_info = NULL; - mutex_lock(&fs_info->balance_mutex); + spin_lock(&fs_info->send_reloc_lock); fs_info->send_in_progress--; - mutex_unlock(&fs_info->balance_mutex); + spin_unlock(&fs_info->send_reloc_lock); if (ret < 0) goto out; diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 2dc674b7c3b1..f79bf85f2439 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -133,18 +133,13 @@ * operations, however they won't be usable until the transaction commits. * * COMMIT_TRANS - * may_commit_transaction() is the ultimate arbiter on whether we commit the - * transaction or not. In order to avoid constantly churning we do all the - * above flushing first and then commit the transaction as the last resort. - * However we need to take into account things like pinned space that would - * be freed, plus any delayed work we may not have gotten rid of in the case - * of metadata. - * - * FORCE_COMMIT_TRANS - * For use by the preemptive flusher. 
We use this to bypass the ticketing - * checks in may_commit_transaction, as we have more information about the - * overall state of the system and may want to commit the transaction ahead - * of actual ENOSPC conditions. + * This will commit the transaction. Historically we had a lot of logic + * surrounding whether or not we'd commit the transaction, but this waits born + * out of a pre-tickets era where we could end up committing the transaction + * thousands of times in a row without making progress. Now thanks to our + * ticketing system we know if we're not making progress and can error + * everybody out after a few commits rather than burning the disk hoping for + * a different answer. * * OVERCOMMIT * @@ -197,13 +192,6 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) if (!space_info) return -ENOMEM; - ret = percpu_counter_init(&space_info->total_bytes_pinned, 0, - GFP_KERNEL); - if (ret) { - kfree(space_info); - return ret; - } - for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) INIT_LIST_HEAD(&space_info->block_groups[i]); init_rwsem(&space_info->groups_sem); @@ -389,7 +377,7 @@ again: ticket = list_first_entry(head, struct reserve_ticket, list); - /* Check and see if our ticket can be satisified now. */ + /* Check and see if our ticket can be satisfied now. */ if ((used + ticket->bytes <= space_info->total_bytes) || btrfs_can_overcommit(fs_info, space_info, ticket->bytes, flush)) { @@ -495,7 +483,8 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, */ static void shrink_delalloc(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, - u64 to_reclaim, bool wait_ordered) + u64 to_reclaim, bool wait_ordered, + bool for_preempt) { struct btrfs_trans_handle *trans; u64 delalloc_bytes; @@ -532,7 +521,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, * ordered extents, otherwise we'll waste time trying to flush delalloc * that likely won't give us the space back we need. 
*/ - if (ordered_bytes > delalloc_bytes) + if (ordered_bytes > delalloc_bytes && !for_preempt) wait_ordered = true; loops = 0; @@ -551,6 +540,14 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, break; } + /* + * If we are for preemption we just want a one-shot of delalloc + * flushing so we can stop flushing if we decide we don't need + * to anymore. + */ + if (for_preempt) + break; + spin_lock(&space_info->lock); if (list_empty(&space_info->tickets) && list_empty(&space_info->priority_tickets)) { @@ -566,109 +563,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, } } -/** - * Possibly commit the transaction if its ok to - * - * @fs_info: the filesystem - * @space_info: space_info we are checking for commit, either data or metadata - * - * This will check to make sure that committing the transaction will actually - * get us somewhere and then commit the transaction if it does. Otherwise it - * will return -ENOSPC. - */ -static int may_commit_transaction(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) -{ - struct reserve_ticket *ticket = NULL; - struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; - struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; - struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv; - struct btrfs_trans_handle *trans; - u64 reclaim_bytes = 0; - u64 bytes_needed = 0; - u64 cur_free_bytes = 0; - - trans = (struct btrfs_trans_handle *)current->journal_info; - if (trans) - return -EAGAIN; - - spin_lock(&space_info->lock); - cur_free_bytes = btrfs_space_info_used(space_info, true); - if (cur_free_bytes < space_info->total_bytes) - cur_free_bytes = space_info->total_bytes - cur_free_bytes; - else - cur_free_bytes = 0; - - if (!list_empty(&space_info->priority_tickets)) - ticket = list_first_entry(&space_info->priority_tickets, - struct reserve_ticket, list); - else if (!list_empty(&space_info->tickets)) - ticket = list_first_entry(&space_info->tickets, - struct 
reserve_ticket, list); - if (ticket) - bytes_needed = ticket->bytes; - - if (bytes_needed > cur_free_bytes) - bytes_needed -= cur_free_bytes; - else - bytes_needed = 0; - spin_unlock(&space_info->lock); - - if (!bytes_needed) - return 0; - - trans = btrfs_join_transaction(fs_info->extent_root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - /* - * See if there is enough pinned space to make this reservation, or if - * we have block groups that are going to be freed, allowing us to - * possibly do a chunk allocation the next loop through. - */ - if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) || - __percpu_counter_compare(&space_info->total_bytes_pinned, - bytes_needed, - BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) - goto commit; - - /* - * See if there is some space in the delayed insertion reserve for this - * reservation. If the space_info's don't match (like for DATA or - * SYSTEM) then just go enospc, reclaiming this space won't recover any - * space to satisfy those reservations. - */ - if (space_info != delayed_rsv->space_info) - goto enospc; - - spin_lock(&delayed_rsv->lock); - reclaim_bytes += delayed_rsv->reserved; - spin_unlock(&delayed_rsv->lock); - - spin_lock(&delayed_refs_rsv->lock); - reclaim_bytes += delayed_refs_rsv->reserved; - spin_unlock(&delayed_refs_rsv->lock); - - spin_lock(&trans_rsv->lock); - reclaim_bytes += trans_rsv->reserved; - spin_unlock(&trans_rsv->lock); - - if (reclaim_bytes >= bytes_needed) - goto commit; - bytes_needed -= reclaim_bytes; - - if (__percpu_counter_compare(&space_info->total_bytes_pinned, - bytes_needed, - BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) - goto enospc; - -commit: - return btrfs_commit_transaction(trans); -enospc: - btrfs_end_transaction(trans); - return -ENOSPC; -} - /* * Try to flush some data based on policy set by @state. This is only advisory * and may fail for various reasons. 
The caller is supposed to examine the @@ -702,7 +596,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: shrink_delalloc(fs_info, space_info, num_bytes, - state == FLUSH_DELALLOC_WAIT); + state == FLUSH_DELALLOC_WAIT, for_preempt); break; case FLUSH_DELAYED_REFS_NR: case FLUSH_DELAYED_REFS: @@ -743,9 +637,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, btrfs_wait_on_delayed_iputs(fs_info); break; case COMMIT_TRANS: - ret = may_commit_transaction(fs_info, space_info); - break; - case FORCE_COMMIT_TRANS: + ASSERT(current->journal_info == NULL); trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -792,12 +684,14 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info) { + u64 global_rsv_size = fs_info->global_block_rsv.reserved; u64 ordered, delalloc; u64 thresh = div_factor_fine(space_info->total_bytes, 98); u64 used; /* If we're just plain full then async reclaim just slows us down. */ - if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) + if ((space_info->bytes_used + space_info->bytes_reserved + + global_rsv_size) >= thresh) return false; /* @@ -838,8 +732,10 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, thresh = calc_available_free_space(fs_info, space_info, BTRFS_RESERVE_FLUSH_ALL); - thresh += (space_info->total_bytes - space_info->bytes_used - - space_info->bytes_reserved - space_info->bytes_readonly); + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_readonly + global_rsv_size; + if (used < space_info->total_bytes) + thresh += space_info->total_bytes - used; thresh >>= space_info->clamp; used = space_info->bytes_pinned; @@ -860,14 +756,20 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, * clearly be heavy enough to warrant preemptive flushing. 
In the case * of heavy DIO or ordered reservations, preemptive flushing will just * waste time and cause us to slow down. + * + * We want to make sure we truly are maxed out on ordered however, so + * cut ordered in half, and if it's still higher than delalloc then we + * can keep flushing. This is to avoid the case where we start + * flushing, and now delalloc == ordered and we stop preemptively + * flushing when we could still have several gigs of delalloc to flush. */ - ordered = percpu_counter_read_positive(&fs_info->ordered_bytes); + ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1; delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes); if (ordered >= delalloc) used += fs_info->delayed_refs_rsv.reserved + fs_info->delayed_block_rsv.reserved; else - used += space_info->bytes_may_use; + used += space_info->bytes_may_use - global_rsv_size; return (used >= thresh && !btrfs_fs_closing(fs_info) && !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); @@ -921,7 +823,6 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, { struct reserve_ticket *ticket; u64 tickets_id = space_info->tickets_id; - u64 first_ticket_bytes = 0; if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { btrfs_info(fs_info, "cannot satisfy tickets, dumping space info"); @@ -937,21 +838,6 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, steal_from_global_rsv(fs_info, space_info, ticket)) return true; - /* - * may_commit_transaction will avoid committing the transaction - * if it doesn't feel like the space reclaimed by the commit - * would result in the ticket succeeding. However if we have a - * smaller ticket in the queue it may be small enough to be - * satisified by committing the transaction, so if any - * subsequent ticket is smaller than the first ticket go ahead - * and send us back for another loop through the enospc flushing - * code. 
- */ - if (first_ticket_bytes == 0) - first_ticket_bytes = ticket->bytes; - else if (first_ticket_bytes > ticket->bytes) - return true; - if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) btrfs_info(fs_info, "failing ticket with %llu bytes", ticket->bytes); @@ -1117,7 +1003,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) (delayed_block_rsv->reserved + delayed_refs_rsv->reserved)) { to_reclaim = space_info->bytes_pinned; - flush = FORCE_COMMIT_TRANS; + flush = COMMIT_TRANS; } else if (delayed_block_rsv->reserved > delayed_refs_rsv->reserved) { to_reclaim = delayed_block_rsv->reserved; @@ -1171,28 +1057,9 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) * immediately re-usable, it comes in the form of a delayed ref, which must be * run and then the transaction must be committed. * - * FLUSH_DELAYED_REFS - * The above two cases generate delayed refs that will affect - * ->total_bytes_pinned. However this counter can be inconsistent with - * reality if there are outstanding delayed refs. This is because we adjust - * the counter based solely on the current set of delayed refs and disregard - * any on-disk state which might include more refs. So for example, if we - * have an extent with 2 references, but we only drop 1, we'll see that there - * is a negative delayed ref count for the extent and assume that the space - * will be freed, and thus increase ->total_bytes_pinned. - * - * Running the delayed refs gives us the actual real view of what will be - * freed at the transaction commit time. This stage will not actually free - * space for us, it just makes sure that may_commit_transaction() has all of - * the information it needs to make the right decision. - * * COMMIT_TRANS - * This is where we reclaim all of the pinned space generated by the previous - * two stages. 
We will not commit the transaction if we don't think we're - * likely to satisfy our request, which means if our current free space + - * total_bytes_pinned < reservation we will not commit. This is why the - * previous states are actually important, to make sure we know for sure - * whether committing the transaction will allow us to make progress. + * This is where we reclaim all of the pinned space generated by running the + * iputs * * ALLOC_CHUNK_FORCE * For data we start with alloc chunk force, however we could have been full @@ -1202,7 +1069,6 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) static const enum btrfs_flush_state data_flush_states[] = { FLUSH_DELALLOC_WAIT, RUN_DELAYED_IPUTS, - FLUSH_DELAYED_REFS, COMMIT_TRANS, ALLOC_CHUNK_FORCE, }; @@ -1561,6 +1427,15 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, flush == BTRFS_RESERVE_FLUSH_DATA) { list_add_tail(&ticket.list, &space_info->tickets); if (!space_info->flush) { + /* + * We were forced to add a reserve ticket, so + * our preemptive flushing is unable to keep + * up. Clamp down on the threshold for the + * preemptive flushing in order to keep up with + * the workload. + */ + maybe_clamp_preempt(fs_info, space_info); + space_info->flush = 1; trace_btrfs_trigger_flush(fs_info, space_info->flags, @@ -1572,14 +1447,6 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, list_add_tail(&ticket.list, &space_info->priority_tickets); } - - /* - * We were forced to add a reserve ticket, so our preemptive - * flushing is unable to keep up. Clamp down on the threshold - * for the preemptive flushing in order to keep up with the - * workload. - */ - maybe_clamp_preempt(fs_info, space_info); } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { used += orig_bytes; /* @@ -1588,8 +1455,8 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * the async reclaim as we will panic. 
*/ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && - need_preemptive_reclaim(fs_info, space_info) && - !work_busy(&fs_info->preempt_reclaim_work)) { + !work_busy(&fs_info->preempt_reclaim_work) && + need_preemptive_reclaim(fs_info, space_info)) { trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, "preempt"); queue_work(system_unbound_wq, diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index b1a8ffb03b3e..cb5056472e79 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -43,18 +43,6 @@ struct btrfs_space_info { u64 flags; - /* - * bytes_pinned is kept in line with what is actually pinned, as in - * we've called update_block_group and dropped the bytes_used counter - * and increased the bytes_pinned counter. However this means that - * bytes_pinned does not reflect the bytes that will be pinned once the - * delayed refs are flushed, so this counter is inc'ed every time we - * call btrfs_free_extent so it is a realtime count of what will be - * freed once the transaction is committed. It will be zeroed every - * time the transaction commits. - */ - struct percpu_counter total_bytes_pinned; - struct list_head list; /* Protected by the spinlock 'lock'. 
*/ struct list_head ro_bgs; @@ -157,22 +145,4 @@ static inline void btrfs_space_info_free_bytes_may_use( } int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, enum btrfs_reserve_flush_enum flush); - -static inline void __btrfs_mod_total_bytes_pinned( - struct btrfs_space_info *space_info, - s64 mod) -{ - percpu_counter_add_batch(&space_info->total_bytes_pinned, mod, - BTRFS_TOTAL_BYTES_PINNED_BATCH); -} - -static inline void btrfs_mod_total_bytes_pinned(struct btrfs_fs_info *fs_info, - u64 flags, s64 mod) -{ - struct btrfs_space_info *space_info = btrfs_find_space_info(fs_info, flags); - - ASSERT(space_info); - __btrfs_mod_total_bytes_pinned(space_info, mod); -} - #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 2d19089ab625..640bcd21bf28 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -3,6 +3,7 @@ #include <linux/slab.h> #include "ctree.h" #include "subpage.h" +#include "btrfs_inode.h" /* * Subpage (sectorsize < PAGE_SIZE) support overview: @@ -110,10 +111,12 @@ int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, if (!*ret) return -ENOMEM; spin_lock_init(&(*ret)->lock); - if (type == BTRFS_SUBPAGE_METADATA) + if (type == BTRFS_SUBPAGE_METADATA) { atomic_set(&(*ret)->eb_refs, 0); - else + } else { atomic_set(&(*ret)->readers, 0); + atomic_set(&(*ret)->writers, 0); + } return 0; } @@ -183,12 +186,10 @@ void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; const int nbits = len >> fs_info->sectorsize_bits; - int ret; btrfs_subpage_assert(fs_info, page, start, len); - ret = atomic_add_return(nbits, &subpage->readers); - ASSERT(ret == nbits); + atomic_add(nbits, &subpage->readers); } void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, @@ -196,10 +197,95 @@ void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, { struct btrfs_subpage *subpage = (struct btrfs_subpage 
*)page->private; const int nbits = len >> fs_info->sectorsize_bits; + bool is_data; + bool last; btrfs_subpage_assert(fs_info, page, start, len); + is_data = is_data_inode(page->mapping->host); ASSERT(atomic_read(&subpage->readers) >= nbits); - if (atomic_sub_and_test(nbits, &subpage->readers)) + last = atomic_sub_and_test(nbits, &subpage->readers); + + /* + * For data we need to unlock the page if the last read has finished. + * + * And please don't replace @last with atomic_sub_and_test() call + * inside if () condition. + * As we want the atomic_sub_and_test() to be always executed. + */ + if (is_data && last) + unlock_page(page); +} + +static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len) +{ + u64 orig_start = *start; + u32 orig_len = *len; + + *start = max_t(u64, page_offset(page), orig_start); + *len = min_t(u64, page_offset(page) + PAGE_SIZE, + orig_start + orig_len) - *start; +} + +void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const int nbits = (len >> fs_info->sectorsize_bits); + int ret; + + btrfs_subpage_assert(fs_info, page, start, len); + + ASSERT(atomic_read(&subpage->readers) == 0); + ret = atomic_add_return(nbits, &subpage->writers); + ASSERT(ret == nbits); +} + +bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const int nbits = (len >> fs_info->sectorsize_bits); + + btrfs_subpage_assert(fs_info, page, start, len); + + ASSERT(atomic_read(&subpage->writers) >= nbits); + return atomic_sub_and_test(nbits, &subpage->writers); +} + +/* + * Lock a page for delalloc page writeback. + * + * Return -EAGAIN if the page is not properly initialized. + * Return 0 with the page locked, and writer counter updated. 
+ * + * Even with 0 returned, the page still needs an extra check to make sure + * it's really the correct page, as the caller is using + * find_get_pages_contig(), which can race with page invalidating. + */ +int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { + lock_page(page); + return 0; + } + lock_page(page); + if (!PagePrivate(page) || !page->private) { + unlock_page(page); + return -EAGAIN; + } + btrfs_subpage_clamp_range(page, &start, &len); + btrfs_subpage_start_writer(fs_info, page, start, len); + return 0; +} + +void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) + return unlock_page(page); + btrfs_subpage_clamp_range(page, &start, &len); + if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len)) unlock_page(page); } @@ -354,6 +440,32 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info, spin_unlock_irqrestore(&subpage->lock, flags); } +void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->ordered_bitmap |= tmp; + SetPageOrdered(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->ordered_bitmap &= ~tmp; + if (subpage->ordered_bitmap == 0) + ClearPageOrdered(page);
+ spin_unlock_irqrestore(&subpage->lock, flags); +} /* * Unlike set/clear which is dependent on each page status, for test all bits * are tested in the same way. @@ -376,6 +488,7 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback); +IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered); /* * Note that, in selftests (extent-io-tests), we can have empty fs_info passed @@ -408,6 +521,34 @@ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \ return test_page_func(page); \ return btrfs_subpage_test_##name(fs_info, page, start, len); \ +} \ +void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + set_page_func(page); \ + return; \ + } \ + btrfs_subpage_clamp_range(page, &start, &len); \ + btrfs_subpage_set_##name(fs_info, page, start, len); \ +} \ +void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + clear_page_func(page); \ + return; \ + } \ + btrfs_subpage_clamp_range(page, &start, &len); \ + btrfs_subpage_clear_##name(fs_info, page, start, len); \ +} \ +bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \ + return test_page_func(page); \ + btrfs_subpage_clamp_range(page, &start, &len); \ + return btrfs_subpage_test_##name(fs_info, page, start, len); \ } IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate, PageUptodate); @@ -416,3 +557,5 @@ IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io, PageDirty); IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, 
end_page_writeback, PageWriteback); +IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered, + PageOrdered); diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index bfd626e955be..4d7aca85d915 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -22,6 +22,14 @@ struct btrfs_subpage { u16 error_bitmap; u16 dirty_bitmap; u16 writeback_bitmap; + /* + * Both data and metadata need to track how many readers are for the + * page. + * Data relies on @readers to unlock the page when last reader finished. + * While metadata doesn't need page unlock, it needs to prevent + * page::private from getting cleared before the last end_page_read(). + */ + atomic_t readers; union { /* * Structures only used by metadata @@ -32,7 +40,10 @@ struct btrfs_subpage { atomic_t eb_refs; /* Structures only used by data */ struct { - atomic_t readers; + atomic_t writers; + + /* Track pending ordered extent in this sector */ + u16 ordered_bitmap; }; }; }; @@ -63,6 +74,15 @@ void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len); +void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len); +bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len); +int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len); +void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len); + /* * Template for subpage related operations. * @@ -72,6 +92,10 @@ void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, * btrfs_page_*() are for call sites where the page can either be subpage * specific or regular page. The function will handle both cases. * But the range still needs to be inside the page.
+ * + * btrfs_page_clamp_*() are similar to btrfs_page_*(), except the range doesn't + * need to be inside the page. Those functions will truncate the range + * automatically. */ #define DECLARE_BTRFS_SUBPAGE_OPS(name) \ void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \ @@ -85,12 +109,19 @@ void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len); \ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len); DECLARE_BTRFS_SUBPAGE_OPS(uptodate); DECLARE_BTRFS_SUBPAGE_OPS(error); DECLARE_BTRFS_SUBPAGE_OPS(dirty); DECLARE_BTRFS_SUBPAGE_OPS(writeback); +DECLARE_BTRFS_SUBPAGE_OPS(ordered); bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 4a396c1147f1..d07b18b2b250 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -299,17 +299,6 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = trans->fs_info; WRITE_ONCE(trans->aborted, errno); - /* Nothing used. The other threads that have joined this - * transaction may be able to continue. 
*/ - if (!trans->dirty && list_empty(&trans->new_bgs)) { - const char *errstr; - - errstr = btrfs_decode_error(errno); - btrfs_warn(fs_info, - "%s:%d: Aborting unused transaction(%s).", - function, line, errstr); - return; - } WRITE_ONCE(trans->transaction->aborted, errno); /* Wake up anybody who may be waiting on this transaction */ wake_up(&fs_info->transaction_wait); @@ -945,8 +934,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_check_integrity_including_extent_data: btrfs_info(info, "enabling check integrity including extent data"); - btrfs_set_opt(info->mount_opt, - CHECK_INTEGRITY_INCLUDING_EXTENT_DATA); + btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY_DATA); btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); break; case Opt_check_integrity: @@ -1527,7 +1515,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) if (btrfs_test_opt(info, SKIP_BALANCE)) seq_puts(seq, ",skip_balance"); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(info, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA)) + if (btrfs_test_opt(info, CHECK_INTEGRITY_DATA)) seq_puts(seq, ",check_int_data"); else if (btrfs_test_opt(info, CHECK_INTEGRITY)) seq_puts(seq, ",check_int"); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 436ac7b4b334..9d1d140118ff 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -429,7 +429,7 @@ static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%lld\n", + return scnprintf(buf, PAGE_SIZE, "%llu\n", fs_info->discard_ctl.discard_bitmap_bytes); } BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show); @@ -451,7 +451,7 @@ static ssize_t btrfs_discard_extent_bytes_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%lld\n", + return scnprintf(buf, PAGE_SIZE, "%llu\n", 
fs_info->discard_ctl.discard_extent_bytes); } BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show); @@ -665,15 +665,6 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \ } \ BTRFS_ATTR(space_info, field, btrfs_space_info_show_##field) -static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj, - struct kobj_attribute *a, - char *buf) -{ - struct btrfs_space_info *sinfo = to_space_info(kobj); - s64 val = percpu_counter_sum(&sinfo->total_bytes_pinned); - return scnprintf(buf, PAGE_SIZE, "%lld\n", val); -} - SPACE_INFO_ATTR(flags); SPACE_INFO_ATTR(total_bytes); SPACE_INFO_ATTR(bytes_used); @@ -684,8 +675,6 @@ SPACE_INFO_ATTR(bytes_readonly); SPACE_INFO_ATTR(bytes_zone_unusable); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); -BTRFS_ATTR(space_info, total_bytes_pinned, - btrfs_space_info_show_total_bytes_pinned); static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, flags), @@ -698,7 +687,6 @@ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, bytes_zone_unusable), BTRFS_ATTR_PTR(space_info, disk_used), BTRFS_ATTR_PTR(space_info, disk_total), - BTRFS_ATTR_PTR(space_info, total_bytes_pinned), NULL, }; ATTRIBUTE_GROUPS(space_info); @@ -706,7 +694,6 @@ ATTRIBUTE_GROUPS(space_info); static void space_info_release(struct kobject *kobj) { struct btrfs_space_info *sinfo = to_space_info(kobj); - percpu_counter_destroy(&sinfo->total_bytes_pinned); kfree(sinfo); } @@ -1455,6 +1442,33 @@ static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj, } BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show); +static ssize_t btrfs_devinfo_scrub_speed_max_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", + READ_ONCE(device->scrub_speed_max)); +} + +static ssize_t 
btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + char *endptr; + unsigned long long limit; + + limit = memparse(buf, &endptr); + WRITE_ONCE(device->scrub_speed_max, limit); + return len; +} +BTRFS_ATTR_RW(devid, scrub_speed_max, btrfs_devinfo_scrub_speed_max_show, + btrfs_devinfo_scrub_speed_max_store); + static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { @@ -1468,10 +1482,40 @@ static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj, } BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show); +static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + + if (!device->dev_stats_valid) + return scnprintf(buf, PAGE_SIZE, "invalid\n"); + + /* + * Print all at once so we get a snapshot of all values from the same + * time. Keep them in sync and in order of definition of + * btrfs_dev_stat_values. 
+ */ + return scnprintf(buf, PAGE_SIZE, + "write_errs %d\n" + "read_errs %d\n" + "flush_errs %d\n" + "corruption_errs %d\n" + "generation_errs %d\n", + btrfs_dev_stat_read(device, BTRFS_DEV_STAT_WRITE_ERRS), + btrfs_dev_stat_read(device, BTRFS_DEV_STAT_READ_ERRS), + btrfs_dev_stat_read(device, BTRFS_DEV_STAT_FLUSH_ERRS), + btrfs_dev_stat_read(device, BTRFS_DEV_STAT_CORRUPTION_ERRS), + btrfs_dev_stat_read(device, BTRFS_DEV_STAT_GENERATION_ERRS)); +} +BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show); + static struct attribute *devid_attrs[] = { + BTRFS_ATTR_PTR(devid, error_stats), BTRFS_ATTR_PTR(devid, in_fs_metadata), BTRFS_ATTR_PTR(devid, missing), BTRFS_ATTR_PTR(devid, replace_target), + BTRFS_ATTR_PTR(devid, scrub_speed_max), BTRFS_ATTR_PTR(devid, writeable), NULL }; diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index c0aefe6dee0b..319fed82d741 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -557,7 +557,7 @@ int btrfs_test_extent_map(void) { /* * Test a chunk with 2 data stripes one of which - * interesects the physical address of the super block + * intersects the physical address of the super block * is correctly recognised. */ .raid_type = BTRFS_BLOCK_GROUP_RAID1, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f75de9f6c0ad..50318231c1a8 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -583,9 +583,6 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, bool do_chunk_alloc = false; int ret; - /* Send isn't supposed to start transactions. 
*/ - ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB); - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return ERR_PTR(-EROFS); @@ -1406,8 +1403,10 @@ int btrfs_defrag_root(struct btrfs_root *root) while (1) { trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } ret = btrfs_defrag_leaves(trans, root); @@ -1476,7 +1475,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); if (ret) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } /* @@ -1869,31 +1868,6 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info) } /* - * wait for the current transaction commit to start and block subsequent - * transaction joins - */ -static void wait_current_trans_commit_start(struct btrfs_fs_info *fs_info, - struct btrfs_transaction *trans) -{ - wait_event(fs_info->transaction_blocked_wait, - trans->state >= TRANS_STATE_COMMIT_START || - TRANS_ABORTED(trans)); -} - -/* - * wait for the current transaction to start and then become unblocked. - * caller holds ref. - */ -static void wait_current_trans_commit_start_and_unblock( - struct btrfs_fs_info *fs_info, - struct btrfs_transaction *trans) -{ - wait_event(fs_info->transaction_wait, - trans->state >= TRANS_STATE_UNBLOCKED || - TRANS_ABORTED(trans)); -} - -/* * commit transactions asynchronously. once btrfs_commit_transaction_async * returns, any subsequent transaction will not be allowed to join. 
*/ @@ -1920,8 +1894,7 @@ static void do_async_commit(struct work_struct *work) kfree(ac); } -int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, - int wait_for_unblock) +int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_async_commit *ac; @@ -1953,13 +1926,13 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, __sb_writers_release(fs_info->sb, SB_FREEZE_FS); schedule_work(&ac->work); - - /* wait for transaction to start and unblock */ - if (wait_for_unblock) - wait_current_trans_commit_start_and_unblock(fs_info, cur_trans); - else - wait_current_trans_commit_start(fs_info, cur_trans); - + /* + * Wait for the current transaction commit to start and block + * subsequent transaction joins + */ + wait_event(fs_info->transaction_blocked_wait, + cur_trans->state >= TRANS_STATE_COMMIT_START || + TRANS_ABORTED(cur_trans)); if (current->journal_info == trans) current->journal_info = NULL; @@ -2074,14 +2047,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) ASSERT(refcount_read(&trans->use_count) == 1); - /* - * Some places just start a transaction to commit it. We need to make - * sure that if this commit fails that the abort code actually marks the - * transaction as failed, so set trans->dirty to make the abort code do - * the right thing. 
- */ - trans->dirty = true; - /* Stop the commit early if ->aborted is set */ if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 364cfbb4c5c5..07d76029f598 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -122,8 +122,6 @@ struct btrfs_transaction { #define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH) -#define BTRFS_SEND_TRANS_STUB ((void *)1) - struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; @@ -143,7 +141,6 @@ struct btrfs_trans_handle { bool allocating_chunk; bool can_flush_pending_bgs; bool reloc_reserved; - bool dirty; bool in_fsync; struct btrfs_root *root; struct btrfs_fs_info *fs_info; @@ -227,8 +224,7 @@ void btrfs_add_dead_root(struct btrfs_root *root); int btrfs_defrag_root(struct btrfs_root *root); int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans); -int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, - int wait_for_unblock); +int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans); int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans); bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans); void btrfs_throttle(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 362d14db1e38..cab451d19547 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3302,6 +3302,22 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * begins and releases it only after writing its superblock. */ mutex_lock(&fs_info->tree_log_mutex); + + /* + * The previous transaction writeout phase could have failed, and thus + * marked the fs in an error state. We must not commit here, as we + * could have updated our generation in the super_for_commit and + * writing the super here would result in transid mismatches. If there + * is an error here just bail. 
+ */ + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + ret = -EIO; + btrfs_set_log_full_commit(trans); + btrfs_abort_transaction(trans, ret); + mutex_unlock(&fs_info->tree_log_mutex); + goto out_wake_log_root; + } + btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start); btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level); ret = write_all_supers(fs_info, 1); @@ -4452,7 +4468,8 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, ret = btrfs_truncate_inode_items(trans, root->log_root, inode, truncate_offset, - BTRFS_EXTENT_DATA_KEY); + BTRFS_EXTENT_DATA_KEY, + NULL); } while (ret == -EAGAIN); if (ret) goto out; @@ -5400,7 +5417,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, &inode->runtime_flags); while(1) { ret = btrfs_truncate_inode_items(trans, - log, inode, 0, 0); + log, inode, 0, 0, NULL); if (ret != -EAGAIN) break; } @@ -5450,13 +5467,23 @@ log_extents: btrfs_release_path(dst_path); if (need_log_inode_item) { err = log_inode_item(trans, log, dst_path, inode); - if (!err && !xattrs_logged) { + if (err) + goto out_unlock; + /* + * If we are doing a fast fsync and the inode was logged before + * in this transaction, we don't need to log the xattrs because + * they were logged before. If xattrs were added, changed or + * deleted since the last time we logged the inode, then we have + * already logged them because the inode had the runtime flag + * BTRFS_INODE_COPY_EVERYTHING set. 
+ */ + if (!xattrs_logged && inode->logged_trans < trans->transid) { err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); + if (err) + goto out_unlock; btrfs_release_path(path); } - if (err) - goto out_unlock; } if (fast_search) { ret = btrfs_log_changed_extents(trans, root, inode, dst_path, @@ -6355,6 +6382,7 @@ next: error: if (wc.trans) btrfs_end_transaction(wc.trans); + clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 47d27059d064..807502cd6510 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -717,7 +717,7 @@ static struct btrfs_fs_devices *find_fsid_changed( /* * Handles the case where scanned device is part of an fs that had - * multiple successful changes of FSID but curently device didn't + * multiple successful changes of FSID but currently device didn't * observe it. Meaning our fsid will be different than theirs. We need * to handle two subcases : * 1 - The fs still continues to have different METADATA/FSID uuids. @@ -1247,7 +1247,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, lockdep_assert_held(&uuid_mutex); /* * The device_list_mutex cannot be taken here in case opening the - * underlying device takes further locks like bd_mutex. + * underlying device takes further locks like open_mutex. * * We also don't need the lock here as this is called during mount and * exclusion is provided by uuid_mutex @@ -1550,7 +1550,7 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, * check to ensure dev extents are not double allocated. * This makes the function safe to allocate dev extents but may not report * correct usable device space, as device extent freed in current transaction - * is not reported as avaiable. + * is not reported as available. 
*/ static int find_free_dev_extent_start(struct btrfs_device *device, u64 num_bytes, u64 search_start, u64 *start, @@ -4217,14 +4217,6 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, btrfs_bg_type_to_raid_name(data_target)); } - if (fs_info->send_in_progress) { - btrfs_warn_rl(fs_info, -"cannot run balance while send operations are in progress (%d in progress)", - fs_info->send_in_progress); - ret = -EAGAIN; - goto out; - } - ret = insert_balance_item(fs_info, bctl); if (ret && ret != -EEXIST) goto out; @@ -6127,17 +6119,17 @@ static bool need_full_stripe(enum btrfs_map_op op) * @em: mapping containing the logical extent * @op: type of operation - write or read * @logical: address that we want to figure out the geometry of - * @len: the length of IO we are going to perform, starting at @logical * @io_geom: pointer used to return values * * Returns < 0 in case a chunk for the given logical address cannot be found, * usually shouldn't happen unless @logical is corrupted, 0 otherwise. */ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, - enum btrfs_map_op op, u64 logical, u64 len, + enum btrfs_map_op op, u64 logical, struct btrfs_io_geometry *io_geom) { struct map_lookup *map; + u64 len; u64 offset; u64 stripe_offset; u64 stripe_nr; @@ -6152,7 +6144,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, offset = logical - em->start; /* Len of a stripe in a chunk */ stripe_len = map->stripe_len; - /* Stripe wher this block falls in */ + /* Stripe where this block falls in */ stripe_nr = div64_u64(offset, stripe_len); /* Offset of stripe in the chunk */ stripe_offset = stripe_nr * stripe_len; @@ -6243,7 +6235,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, em = btrfs_get_chunk_map(fs_info, logical, *length); ASSERT(!IS_ERR(em)); - ret = btrfs_get_io_geometry(fs_info, em, op, logical, *length, &geom); + ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom); if (ret < 0) return ret; 
@@ -6670,8 +6662,6 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, * * If devid and uuid are both specified, the match must be exact, otherwise * only devid is used. - * - * If @seed is true, traverse through the seed devices. */ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, u64 devid, u8 *uuid, u8 *fsid) @@ -7865,7 +7855,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, ret = -EUCLEAN; } - /* Make sure no dev extent is beyond device bondary */ + /* Make sure no dev extent is beyond device boundary */ dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); if (!dev) { btrfs_err(fs_info, "failed to find devid %llu", devid); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 9c0d84e5ec06..c7fc7caf575c 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -143,6 +143,9 @@ struct btrfs_device { struct completion kobj_unregister; /* For sysfs/FSID/devinfo/devid/ */ struct kobject devid_kobj; + + /* Bandwidth limit for scrub, in bytes */ + u64 scrub_speed_max; }; /* @@ -443,7 +446,7 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret); int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map, - enum btrfs_map_op op, u64 logical, u64 len, + enum btrfs_map_op op, u64 logical, struct btrfs_io_geometry *io_geom); int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 1bb8ee97aae0..297c0b1c0634 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -81,7 +81,7 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, * *: Special case, no superblock is written * 0: Use write pointer of zones[0] * 1: Use write pointer of zones[1] - * C: Compare super blcoks from zones[0] and zones[1], use the latest + * C: Compare super blocks from 
zones[0] and zones[1], use the latest * one determined by generation * x: Invalid state */ @@ -150,6 +150,18 @@ static inline u32 sb_zone_number(int shift, int mirror) return (u32)zone; } +static inline sector_t zone_start_sector(u32 zone_number, + struct block_device *bdev) +{ + return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev)); +} + +static inline u64 zone_start_physical(u32 zone_number, + struct btrfs_zoned_device_info *zone_info) +{ + return (u64)zone_number << zone_info->zone_size_shift; +} + /* * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block * device into static sized chunks and fake a conventional zone on each of @@ -405,8 +417,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) if (sb_zone + 1 >= zone_info->nr_zones) continue; - sector = sb_zone << (zone_info->zone_size_shift - SECTOR_SHIFT); - ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, + ret = btrfs_get_dev_zones(device, + zone_start_physical(sb_zone, zone_info), &zone_info->sb_zones[sb_pos], &nr_zones); if (ret) @@ -421,7 +433,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) } /* - * If zones[0] is conventional, always use the beggining of the + * If zones[0] is conventional, always use the beginning of the * zone to record superblock. No need to validate in that case. 
*/ if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type == @@ -721,7 +733,7 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, if (sb_zone + 1 >= nr_zones) return -ENOENT; - ret = blkdev_report_zones(bdev, sb_zone << zone_sectors_shift, + ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev), BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb, zones); if (ret < 0) @@ -826,7 +838,7 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) return -ENOENT; return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, - sb_zone << zone_sectors_shift, + zone_start_sector(sb_zone, bdev), zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS); } @@ -878,7 +890,8 @@ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, if (!(end <= sb_zone || sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) { have_sb = true; - pos = ((u64)sb_zone + BTRFS_NR_SB_LOG_ZONES) << shift; + pos = zone_start_physical( + sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo); break; } @@ -1127,6 +1140,10 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) { + btrfs_err_in_rcu(fs_info, + "zoned: unexpected conventional zone %llu on device %s (devid %llu)", + zone.start << SECTOR_SHIFT, + rcu_str_deref(device->name), device->devid); ret = -EIO; goto out; } @@ -1187,6 +1204,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { case 0: /* single */ + if (alloc_offsets[0] == WP_MISSING_DEV) { + btrfs_err(fs_info, + "zoned: cannot recover write pointer for zone %llu", + physical); + ret = -EIO; + goto out; + } cache->alloc_offset = alloc_offsets[0]; break; case BTRFS_BLOCK_GROUP_DUP: @@ -1204,6 +1228,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } out: + if (cache->alloc_offset > fs_info->zone_size) { + btrfs_err(fs_info, + "zoned: invalid write pointer %llu in block group 
%llu", + cache->alloc_offset, cache->start); + ret = -EIO; + } + /* An extent is allocated after the write pointer */ if (!ret && num_conventional && last_alloc > cache->alloc_offset) { btrfs_err(fs_info, @@ -1502,3 +1533,24 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, length = wp - physical_pos; return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length); } + +struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) +{ + struct btrfs_device *device; + struct extent_map *em; + struct map_lookup *map; + + em = btrfs_get_chunk_map(fs_info, logical, length); + if (IS_ERR(em)) + return ERR_CAST(em); + + map = em->map_lookup; + /* We only support single profile for now */ + ASSERT(map->num_stripes == 1); + device = map->stripes[0].dev; + + free_extent_map(em); + + return device; +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index e55d32595c2c..b0ae2608cb6b 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -65,6 +65,8 @@ void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length); int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, u64 physical_start, u64 physical_pos); +struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, + u64 logical, u64 length); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -191,6 +193,13 @@ static inline int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, return -EOPNOTSUPP; } +static inline struct btrfs_device *btrfs_zoned_get_device( + struct btrfs_fs_info *fs_info, + u64 logical, u64 length) +{ + return ERR_PTR(-EOPNOTSUPP); +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) diff --git a/fs/buffer.c b/fs/buffer.c index ea48c01fb76b..6290c3afdba4 100644 --- a/fs/buffer.c 
+++ b/fs/buffer.c @@ -589,31 +589,6 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) EXPORT_SYMBOL(mark_buffer_dirty_inode); /* - * Mark the page dirty, and set it dirty in the page cache, and mark the inode - * dirty. - * - * If warn is true, then emit a warning if the page is not uptodate and has - * not been truncated. - * - * The caller must hold lock_page_memcg(). - */ -void __set_page_dirty(struct page *page, struct address_space *mapping, - int warn) -{ - unsigned long flags; - - xa_lock_irqsave(&mapping->i_pages, flags); - if (page->mapping) { /* Race with truncate? */ - WARN_ON_ONCE(warn && !PageUptodate(page)); - account_page_dirtied(page, mapping); - __xa_set_mark(&mapping->i_pages, page_index(page), - PAGECACHE_TAG_DIRTY); - } - xa_unlock_irqrestore(&mapping->i_pages, flags); -} -EXPORT_SYMBOL_GPL(__set_page_dirty); - -/* * Add a page to the dirty page list. * * It is a sad fact of life that this function is called from several places diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c1570fada3d8..a1e2813731d1 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -82,10 +82,6 @@ static int ceph_set_page_dirty(struct page *page) struct inode *inode; struct ceph_inode_info *ci; struct ceph_snap_context *snapc; - int ret; - - if (unlikely(!mapping)) - return !TestSetPageDirty(page); if (PageDirty(page)) { dout("%p set_page_dirty %p idx %lu -- already dirty\n", @@ -130,11 +126,7 @@ static int ceph_set_page_dirty(struct page *page) BUG_ON(PagePrivate(page)); attach_page_private(page, snapc); - ret = __set_page_dirty_nobuffers(page); - WARN_ON(!PageLocked(page)); - WARN_ON(!page->mapping); - - return ret; + return __set_page_dirty_nobuffers(page); } /* @@ -226,7 +218,7 @@ static void finish_netfs_read(struct ceph_osd_request *req) int err = req->r_result; ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency, - req->r_end_latency, err); + req->r_end_latency, osd_data->length, err); dout("%s: result %d subreq->len=%zu 
i_size=%lld\n", __func__, req->r_result, subreq->len, i_size_read(req->r_inode)); @@ -313,7 +305,7 @@ static void ceph_readahead_cleanup(struct address_space *mapping, void *priv) ceph_put_cap_refs(ci, got); } -const struct netfs_read_request_ops ceph_netfs_read_ops = { +static const struct netfs_read_request_ops ceph_netfs_read_ops = { .init_rreq = ceph_init_rreq, .is_cache_enabled = ceph_is_cache_enabled, .begin_cache_operation = ceph_begin_cache_operation, @@ -560,7 +552,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) err = ceph_osdc_wait_request(osdc, req); ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, - req->r_end_latency, err); + req->r_end_latency, len, err); ceph_osdc_put_request(req); if (err == 0) @@ -635,6 +627,7 @@ static void writepages_finish(struct ceph_osd_request *req) struct ceph_snap_context *snapc = req->r_snapc; struct address_space *mapping = inode->i_mapping; struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + unsigned int len = 0; bool remove_page; dout("writepages_finish %p rc %d\n", inode, rc); @@ -647,9 +640,6 @@ static void writepages_finish(struct ceph_osd_request *req) ceph_clear_error_write(ci); } - ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, - req->r_end_latency, rc); - /* * We lost the cache cap, need to truncate the page before * it is unlocked, otherwise we'd truncate it later in the @@ -666,6 +656,7 @@ static void writepages_finish(struct ceph_osd_request *req) osd_data = osd_req_op_extent_osd_data(req, i); BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); + len += osd_data->length; num_pages = calc_pages_for((u64)osd_data->alignment, (u64)osd_data->length); total_pages += num_pages; @@ -696,6 +687,9 @@ static void writepages_finish(struct ceph_osd_request *req) release_pages(osd_data->pages, num_pages); } + ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, len, rc); + ceph_put_wrbuffer_cap_refs(ci, 
total_pages, snapc); osd_data = osd_req_op_extent_osd_data(req, 0); @@ -1711,7 +1705,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) err = ceph_osdc_wait_request(&fsc->client->osdc, req); ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, - req->r_end_latency, err); + req->r_end_latency, len, err); out_put: ceph_osdc_put_request(req); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index a5e93b185515..7bdefd0c789a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -645,9 +645,7 @@ void ceph_add_cap(struct inode *inode, dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, session->s_mds, cap_id, ceph_cap_string(issued), seq); - spin_lock(&session->s_gen_ttl_lock); - gen = session->s_cap_gen; - spin_unlock(&session->s_gen_ttl_lock); + gen = atomic_read(&session->s_cap_gen); cap = __get_cap_for_mds(ci, mds); if (!cap) { @@ -785,10 +783,8 @@ static int __cap_is_valid(struct ceph_cap *cap) unsigned long ttl; u32 gen; - spin_lock(&cap->session->s_gen_ttl_lock); - gen = cap->session->s_cap_gen; + gen = atomic_read(&cap->session->s_cap_gen); ttl = cap->session->s_cap_ttl; - spin_unlock(&cap->session->s_gen_ttl_lock); if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) { dout("__cap_is_valid %p cap %p issued %s " @@ -1182,7 +1178,8 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) * s_cap_gen while session is in the reconnect state. */ if (queue_release && - (!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) { + (!session->s_cap_reconnect || + cap->cap_gen == atomic_read(&session->s_cap_gen))) { cap->queue_release = 1; if (removed) { __ceph_queue_cap_release(session, cap); @@ -1534,7 +1531,7 @@ static inline int __send_flush_snap(struct inode *inode, * asynchronously back to the MDS once sync writes complete and dirty * data is written out. * - * Called under i_ceph_lock. Takes s_mutex as needed. + * Called under i_ceph_lock. 
*/ static void __ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session *session) @@ -1656,7 +1653,6 @@ retry: mds = ci->i_auth_cap->session->s_mds; if (session && session->s_mds != mds) { dout(" oops, wrong session %p mutex\n", session); - mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); session = NULL; } @@ -1665,10 +1661,6 @@ retry: mutex_lock(&mdsc->mutex); session = __ceph_lookup_mds_session(mdsc, mds); mutex_unlock(&mdsc->mutex); - if (session) { - dout(" inverting session/ino locks on %p\n", session); - mutex_lock(&session->s_mutex); - } goto retry; } @@ -1680,12 +1672,10 @@ retry: out: spin_unlock(&ci->i_ceph_lock); - if (psession) { + if (psession) *psession = session; - } else if (session) { - mutex_unlock(&session->s_mutex); + else ceph_put_mds_session(session); - } /* we flushed them all; remove this inode from the queue */ spin_lock(&mdsc->snap_flush_lock); list_del_init(&ci->i_snap_flush_item); @@ -1915,7 +1905,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_cap *cap; u64 flush_tid, oldest_flush_tid; int file_wanted, used, cap_used; - int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ int issued, implemented, want, retain, revoking, flushing = 0; int mds = -1; /* keep track of how far we've gone through i_caps list to avoid an infinite loop on retry */ @@ -1923,14 +1912,13 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, bool queue_invalidate = false; bool tried_invalidate = false; + if (session) + ceph_get_mds_session(session); + spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_FLUSH) flags |= CHECK_CAPS_FLUSH; - - goto retry_locked; retry: - spin_lock(&ci->i_ceph_lock); -retry_locked: /* Caps wanted by virtue of active open files. 
*/ file_wanted = __ceph_caps_file_wanted(ci); @@ -2010,7 +1998,7 @@ retry_locked: ci->i_rdcache_revoking = ci->i_rdcache_gen; } tried_invalidate = true; - goto retry_locked; + goto retry; } for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { @@ -2024,8 +2012,6 @@ retry_locked: ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap)) continue; - /* NOTE: no side-effects allowed, until we take s_mutex */ - /* * If we have an auth cap, we don't need to consider any * overlapping caps as used. @@ -2088,37 +2074,8 @@ retry_locked: continue; /* nope, all good */ ack: - if (session && session != cap->session) { - dout("oops, wrong session %p mutex\n", session); - mutex_unlock(&session->s_mutex); - session = NULL; - } - if (!session) { - session = cap->session; - if (mutex_trylock(&session->s_mutex) == 0) { - dout("inverting session/ino locks on %p\n", - session); - session = ceph_get_mds_session(session); - spin_unlock(&ci->i_ceph_lock); - if (took_snap_rwsem) { - up_read(&mdsc->snap_rwsem); - took_snap_rwsem = 0; - } - if (session) { - mutex_lock(&session->s_mutex); - ceph_put_mds_session(session); - } else { - /* - * Because we take the reference while - * holding the i_ceph_lock, it should - * never be NULL. Throw a warning if it - * ever is. 
- */ - WARN_ON_ONCE(true); - } - goto retry; - } - } + ceph_put_mds_session(session); + session = ceph_get_mds_session(cap->session); /* kick flushing and flush snaps before sending normal * cap message */ @@ -2130,20 +2087,7 @@ ack: if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) __ceph_flush_snaps(ci, session); - goto retry_locked; - } - - /* take snap_rwsem after session mutex */ - if (!took_snap_rwsem) { - if (down_read_trylock(&mdsc->snap_rwsem) == 0) { - dout("inverting snap/in locks on %p\n", - inode); - spin_unlock(&ci->i_ceph_lock); - down_read(&mdsc->snap_rwsem); - took_snap_rwsem = 1; - goto retry; - } - took_snap_rwsem = 1; + goto retry; } if (cap == ci->i_auth_cap && ci->i_dirty_caps) { @@ -2165,9 +2109,10 @@ ack: __prep_cap(&arg, cap, CEPH_CAP_OP_UPDATE, mflags, cap_used, want, retain, flushing, flush_tid, oldest_flush_tid); - spin_unlock(&ci->i_ceph_lock); + spin_unlock(&ci->i_ceph_lock); __send_cap(&arg, ci); + spin_lock(&ci->i_ceph_lock); goto retry; /* retake i_ceph_lock and restart our cap scan. 
*/ } @@ -2182,13 +2127,9 @@ ack: spin_unlock(&ci->i_ceph_lock); + ceph_put_mds_session(session); if (queue_invalidate) ceph_queue_invalidate(inode); - - if (session) - mutex_unlock(&session->s_mutex); - if (took_snap_rwsem) - up_read(&mdsc->snap_rwsem); } /* @@ -2198,26 +2139,17 @@ static int try_flush_caps(struct inode *inode, u64 *ptid) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_mds_session *session = NULL; int flushing = 0; u64 flush_tid = 0, oldest_flush_tid = 0; -retry: spin_lock(&ci->i_ceph_lock); retry_locked: if (ci->i_dirty_caps && ci->i_auth_cap) { struct ceph_cap *cap = ci->i_auth_cap; struct cap_msg_args arg; + struct ceph_mds_session *session = cap->session; - if (session != cap->session) { - spin_unlock(&ci->i_ceph_lock); - if (session) - mutex_unlock(&session->s_mutex); - session = cap->session; - mutex_lock(&session->s_mutex); - goto retry; - } - if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) { + if (session->s_state < CEPH_MDS_SESSION_OPEN) { spin_unlock(&ci->i_ceph_lock); goto out; } @@ -2254,9 +2186,6 @@ retry_locked: spin_unlock(&ci->i_ceph_lock); } out: - if (session) - mutex_unlock(&session->s_mutex); - *ptid = flush_tid; return flushing; } @@ -3213,8 +3142,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, if (complete_capsnap) wake_up_all(&ci->i_cap_wq); while (put-- > 0) { - /* avoid calling iput_final() in osd dispatch threads */ - ceph_async_iput(inode); + iput(inode); } } @@ -3288,7 +3216,7 @@ static void handle_cap_grant(struct inode *inode, u64 size = le64_to_cpu(grant->size); u64 max_size = le64_to_cpu(grant->max_size); unsigned char check_caps = 0; - bool was_stale = cap->cap_gen < session->s_cap_gen; + bool was_stale = cap->cap_gen < atomic_read(&session->s_cap_gen); bool wake = false; bool writeback = false; bool queue_trunc = false; @@ -3340,7 +3268,7 @@ static void handle_cap_grant(struct inode *inode, } /* side 
effects now are allowed */ - cap->cap_gen = session->s_cap_gen; + cap->cap_gen = atomic_read(&session->s_cap_gen); cap->seq = seq; __check_cap_issue(ci, cap, newcaps); @@ -3553,13 +3481,12 @@ static void handle_cap_grant(struct inode *inode, if (wake) wake_up_all(&ci->i_cap_wq); + mutex_unlock(&session->s_mutex); if (check_caps == 1) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL, session); else if (check_caps == 2) ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session); - else - mutex_unlock(&session->s_mutex); } /* @@ -4203,8 +4130,7 @@ done: mutex_unlock(&session->s_mutex); done_unlocked: ceph_put_string(extra_info.pool_ns); - /* avoid calling iput_final() in mds dispatch threads */ - ceph_async_iput(inode); + iput(inode); return; flush_cap_releases: @@ -4246,8 +4172,7 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) spin_unlock(&mdsc->cap_delay_lock); dout("check_delayed_caps on %p\n", inode); ceph_check_caps(ci, 0, NULL); - /* avoid calling iput_final() in tick thread */ - ceph_async_iput(inode); + iput(inode); spin_lock(&mdsc->cap_delay_lock); } } diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 425f3356332a..38b78b45811f 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -127,7 +127,7 @@ static int mdsc_show(struct seq_file *s, void *p) return 0; } -#define CEPH_METRIC_SHOW(name, total, avg, min, max, sq) { \ +#define CEPH_LAT_METRIC_SHOW(name, total, avg, min, max, sq) { \ s64 _total, _avg, _min, _max, _sq, _st; \ _avg = ktime_to_us(avg); \ _min = ktime_to_us(min == KTIME_MAX ? 0 : min); \ @@ -140,6 +140,12 @@ static int mdsc_show(struct seq_file *s, void *p) name, total, _avg, _min, _max, _st); \ } +#define CEPH_SZ_METRIC_SHOW(name, total, avg, min, max, sum) { \ + u64 _min = min == U64_MAX ? 
0 : min; \ + seq_printf(s, "%-14s%-12lld%-16llu%-16llu%-16llu%llu\n", \ + name, total, avg, _min, max, sum); \ +} + static int metric_show(struct seq_file *s, void *p) { struct ceph_fs_client *fsc = s->private; @@ -147,6 +153,7 @@ static int metric_show(struct seq_file *s, void *p) struct ceph_client_metric *m = &mdsc->metric; int nr_caps = 0; s64 total, sum, avg, min, max, sq; + u64 sum_sz, avg_sz, min_sz, max_sz; sum = percpu_counter_sum(&m->total_inodes); seq_printf(s, "item total\n"); @@ -170,7 +177,7 @@ static int metric_show(struct seq_file *s, void *p) max = m->read_latency_max; sq = m->read_latency_sq_sum; spin_unlock(&m->read_metric_lock); - CEPH_METRIC_SHOW("read", total, avg, min, max, sq); + CEPH_LAT_METRIC_SHOW("read", total, avg, min, max, sq); spin_lock(&m->write_metric_lock); total = m->total_writes; @@ -180,7 +187,7 @@ static int metric_show(struct seq_file *s, void *p) max = m->write_latency_max; sq = m->write_latency_sq_sum; spin_unlock(&m->write_metric_lock); - CEPH_METRIC_SHOW("write", total, avg, min, max, sq); + CEPH_LAT_METRIC_SHOW("write", total, avg, min, max, sq); spin_lock(&m->metadata_metric_lock); total = m->total_metadatas; @@ -190,7 +197,29 @@ static int metric_show(struct seq_file *s, void *p) max = m->metadata_latency_max; sq = m->metadata_latency_sq_sum; spin_unlock(&m->metadata_metric_lock); - CEPH_METRIC_SHOW("metadata", total, avg, min, max, sq); + CEPH_LAT_METRIC_SHOW("metadata", total, avg, min, max, sq); + + seq_printf(s, "\n"); + seq_printf(s, "item total avg_sz(bytes) min_sz(bytes) max_sz(bytes) total_sz(bytes)\n"); + seq_printf(s, "----------------------------------------------------------------------------------------\n"); + + spin_lock(&m->read_metric_lock); + total = m->total_reads; + sum_sz = m->read_size_sum; + avg_sz = total > 0 ? 
DIV64_U64_ROUND_CLOSEST(sum_sz, total) : 0; + min_sz = m->read_size_min; + max_sz = m->read_size_max; + spin_unlock(&m->read_metric_lock); + CEPH_SZ_METRIC_SHOW("read", total, avg_sz, min_sz, max_sz, sum_sz); + + spin_lock(&m->write_metric_lock); + total = m->total_writes; + sum_sz = m->write_size_sum; + avg_sz = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum_sz, total) : 0; + min_sz = m->write_size_min; + max_sz = m->write_size_max; + spin_unlock(&m->write_metric_lock); + CEPH_SZ_METRIC_SHOW("write", total, avg_sz, min_sz, max_sz, sum_sz); seq_printf(s, "\n"); seq_printf(s, "item total miss hit\n"); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 5624fae7a603..133dbd9338e7 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -668,14 +668,13 @@ out: * Handle lookups for the hidden .snap directory. */ struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req, - struct dentry *dentry, int err) + struct dentry *dentry) { struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */ /* .snap dir? 
*/ - if (err == -ENOENT && - ceph_snap(parent) == CEPH_NOSNAP && + if (ceph_snap(parent) == CEPH_NOSNAP && strcmp(dentry->d_name.name, fsc->mount_options->snapdir_name) == 0) { struct dentry *res; struct inode *inode = ceph_get_snapdir(parent); @@ -742,7 +741,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_mds_request *req; - struct dentry *res; int op; int mask; int err; @@ -790,15 +788,20 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, mask |= CEPH_CAP_XATTR_SHARED; req->r_args.getattr.mask = cpu_to_le32(mask); + ihold(dir); req->r_parent = dir; set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); err = ceph_mdsc_do_request(mdsc, NULL, req); - res = ceph_handle_snapdir(req, dentry, err); - if (IS_ERR(res)) { - err = PTR_ERR(res); - } else { - dentry = res; - err = 0; + if (err == -ENOENT) { + struct dentry *res; + + res = ceph_handle_snapdir(req, dentry); + if (IS_ERR(res)) { + err = PTR_ERR(res); + } else { + dentry = res; + err = 0; + } } dentry = ceph_finish_lookup(req, dentry, err); ceph_mdsc_put_request(req); /* will dput(dentry) */ @@ -866,6 +869,7 @@ static int ceph_mknod(struct user_namespace *mnt_userns, struct inode *dir, req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_parent = dir; + ihold(dir); set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_args.mknod.mode = cpu_to_le32(mode); req->r_args.mknod.rdev = cpu_to_le32(rdev); @@ -927,6 +931,8 @@ static int ceph_symlink(struct user_namespace *mnt_userns, struct inode *dir, goto out; } req->r_parent = dir; + ihold(dir); + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_dentry = dget(dentry); req->r_num_caps = 2; @@ -991,6 +997,7 @@ static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir, req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_parent = dir; + 
ihold(dir); set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_args.mkdir.mode = cpu_to_le32(mode); req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; @@ -1035,6 +1042,7 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, req->r_num_caps = 2; req->r_old_dentry = dget(old_dentry); req->r_parent = dir; + ihold(dir); set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; @@ -1156,6 +1164,7 @@ retry: req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_parent = dir; + ihold(dir); req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; req->r_inode_drop = ceph_drop_caps_for_unlink(inode); @@ -1230,6 +1239,7 @@ static int ceph_rename(struct user_namespace *mnt_userns, struct inode *old_dir, req->r_old_dentry = dget(old_dentry); req->r_old_dentry_dir = old_dir; req->r_parent = new_dir; + ihold(new_dir); set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; @@ -1546,10 +1556,8 @@ static bool __dentry_lease_is_valid(struct ceph_dentry_info *di) u32 gen; unsigned long ttl; - spin_lock(&session->s_gen_ttl_lock); - gen = session->s_cap_gen; + gen = atomic_read(&session->s_cap_gen); ttl = session->s_cap_ttl; - spin_unlock(&session->s_gen_ttl_lock); if (di->lease_gen == gen && time_before(jiffies, ttl) && @@ -1728,6 +1736,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_parent = dir; + ihold(dir); mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; if (ceph_security_xattr_wanted(dir)) @@ -1807,8 +1816,7 @@ static void ceph_d_release(struct dentry *dentry) dentry->d_fsdata = NULL; spin_unlock(&dentry->d_lock); - if (di->lease_session) - ceph_put_mds_session(di->lease_session); + ceph_put_mds_session(di->lease_session); 
kmem_cache_free(ceph_dentry_cachep, di); } diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 65540a4429b2..1d65934c1262 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -542,6 +542,7 @@ static int ceph_get_name(struct dentry *parent, char *name, ihold(inode); req->r_ino2 = ceph_vino(d_inode(parent)); req->r_parent = d_inode(parent); + ihold(req->r_parent); set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 77fc037d5beb..d1755ac1d964 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -578,6 +578,7 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, struct ceph_inode_info *ci = ceph_inode(dir); struct inode *inode; struct timespec64 now; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_vino vino = { .ino = req->r_deleg_ino, .snap = CEPH_NOSNAP }; @@ -615,8 +616,10 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, ceph_file_layout_to_legacy(lo, &in.layout); + down_read(&mdsc->snap_rwsem); ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session, req->r_fmode, NULL); + up_read(&mdsc->snap_rwsem); if (ret) { dout("%s failed to fill inode: %d\n", __func__, ret); ceph_dir_clear_complete(dir); @@ -703,6 +706,7 @@ retry: mask |= CEPH_CAP_XATTR_SHARED; req->r_args.open.mask = cpu_to_le32(mask); req->r_parent = dir; + ihold(dir); if (flags & O_CREAT) { struct ceph_file_layout lo; @@ -739,14 +743,16 @@ retry: err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? 
dir : NULL, req); - dentry = ceph_handle_snapdir(req, dentry, err); - if (IS_ERR(dentry)) { - err = PTR_ERR(dentry); - goto out_req; + if (err == -ENOENT) { + dentry = ceph_handle_snapdir(req, dentry); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out_req; + } + err = 0; } - err = 0; - if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry) + if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); if (d_in_lookup(dentry)) { @@ -898,7 +904,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, - ret); + len, ret); ceph_osdc_put_request(req); @@ -1030,12 +1036,12 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) struct ceph_aio_request *aio_req = req->r_priv; struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric; + unsigned int len = osd_data->bvec_pos.iter.bi_size; BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS); BUG_ON(!osd_data->num_bvecs); - dout("ceph_aio_complete_req %p rc %d bytes %u\n", - inode, rc, osd_data->bvec_pos.iter.bi_size); + dout("ceph_aio_complete_req %p rc %d bytes %u\n", inode, rc, len); if (rc == -EOLDSNAPC) { struct ceph_aio_work *aio_work; @@ -1053,9 +1059,9 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) } else if (!aio_req->write) { if (rc == -ENOENT) rc = 0; - if (rc >= 0 && osd_data->bvec_pos.iter.bi_size > rc) { + if (rc >= 0 && len > rc) { struct iov_iter i; - int zlen = osd_data->bvec_pos.iter.bi_size - rc; + int zlen = len - rc; /* * If read is satisfied by single OSD request, @@ -1072,8 +1078,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) } iov_iter_bvec(&i, READ, osd_data->bvec_pos.bvecs, - osd_data->num_bvecs, - osd_data->bvec_pos.iter.bi_size); + osd_data->num_bvecs, len); iov_iter_advance(&i, rc); 
iov_iter_zero(zlen, &i); } @@ -1083,10 +1088,10 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) if (req->r_start_latency) { if (aio_req->write) ceph_update_write_metrics(metric, req->r_start_latency, - req->r_end_latency, rc); + req->r_end_latency, len, rc); else ceph_update_read_metrics(metric, req->r_start_latency, - req->r_end_latency, rc); + req->r_end_latency, len, rc); } put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs, @@ -1294,10 +1299,10 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (write) ceph_update_write_metrics(metric, req->r_start_latency, - req->r_end_latency, ret); + req->r_end_latency, len, ret); else ceph_update_read_metrics(metric, req->r_start_latency, - req->r_end_latency, ret); + req->r_end_latency, len, ret); size = i_size_read(inode); if (!write) { @@ -1471,7 +1476,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, ret = ceph_osdc_wait_request(&fsc->client->osdc, req); ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, - req->r_end_latency, ret); + req->r_end_latency, len, ret); out: ceph_osdc_put_request(req); if (ret != 0) { diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index e1c63adb196d..1bd2cc015913 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -777,6 +777,8 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, umode_t mode = le32_to_cpu(info->mode); dev_t rdev = le32_to_cpu(info->rdev); + lockdep_assert_held(&mdsc->snap_rwsem); + dout("%s %p ino %llx.%llx v %llu had %llu\n", __func__, inode, ceph_vinop(inode), le64_to_cpu(info->version), ci->i_version); @@ -1122,7 +1124,7 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry, return; } - if (di->lease_gen == session->s_cap_gen && + if (di->lease_gen == atomic_read(&session->s_cap_gen) && time_before(ttl, di->time)) return; /* we already have a newer lease. 
*/ @@ -1133,7 +1135,7 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry, if (!di->lease_session) di->lease_session = ceph_get_mds_session(session); - di->lease_gen = session->s_cap_gen; + di->lease_gen = atomic_read(&session->s_cap_gen); di->lease_seq = le32_to_cpu(lease->seq); di->lease_renew_after = half_ttl; di->lease_renew_from = 0; @@ -1152,8 +1154,7 @@ static inline void update_dentry_lease(struct inode *dir, struct dentry *dentry, __update_dentry_lease(dir, dentry, lease, session, from_time, &old_lease_session); spin_unlock(&dentry->d_lock); - if (old_lease_session) - ceph_put_mds_session(old_lease_session); + ceph_put_mds_session(old_lease_session); } /* @@ -1198,8 +1199,7 @@ static void update_dentry_lease_careful(struct dentry *dentry, from_time, &old_lease_session); out_unlock: spin_unlock(&dentry->d_lock); - if (old_lease_session) - ceph_put_mds_session(old_lease_session); + ceph_put_mds_session(old_lease_session); } /* @@ -1566,8 +1566,7 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, unlock_new_inode(in); } - /* avoid calling iput_final() in mds dispatch threads */ - ceph_async_iput(in); + iput(in); } return err; @@ -1764,13 +1763,11 @@ retry_lookup: if (ret < 0) { pr_err("ceph_fill_inode badness on %p\n", in); if (d_really_is_negative(dn)) { - /* avoid calling iput_final() in mds - * dispatch threads */ if (in->i_state & I_NEW) { ihold(in); discard_new_inode(in); } - ceph_async_iput(in); + iput(in); } d_drop(dn); err = ret; @@ -1783,7 +1780,7 @@ retry_lookup: if (ceph_security_xattr_deadlock(in)) { dout(" skip splicing dn %p to inode %p" " (security xattr deadlock)\n", dn, in); - ceph_async_iput(in); + iput(in); skipped++; goto next_item; } @@ -1834,25 +1831,6 @@ bool ceph_inode_set_size(struct inode *inode, loff_t size) return ret; } -/* - * Put reference to inode, but avoid calling iput_final() in current thread. - * iput_final() may wait for reahahead pages. 
The wait can cause deadlock in - * some contexts. - */ -void ceph_async_iput(struct inode *inode) -{ - if (!inode) - return; - for (;;) { - if (atomic_add_unless(&inode->i_count, -1, 1)) - break; - if (queue_work(ceph_inode_to_client(inode)->inode_wq, - &ceph_inode(inode)->i_work)) - break; - /* queue work failed, i_count must be at least 2 */ - } -} - void ceph_queue_inode_work(struct inode *inode, int work_bit) { struct ceph_fs_client *fsc = ceph_inode_to_client(inode); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index e5af591d3bd4..a818213c972f 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -664,6 +664,9 @@ struct ceph_mds_session *ceph_get_mds_session(struct ceph_mds_session *s) void ceph_put_mds_session(struct ceph_mds_session *s) { + if (IS_ERR_OR_NULL(s)) + return; + dout("mdsc put_session %p %d -> %d\n", s, refcount_read(&s->s_ref), refcount_read(&s->s_ref)-1); if (refcount_dec_and_test(&s->s_ref)) { @@ -746,8 +749,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr); - spin_lock_init(&s->s_gen_ttl_lock); - s->s_cap_gen = 1; + atomic_set(&s->s_cap_gen, 1); s->s_cap_ttl = jiffies - 1; spin_lock_init(&s->s_cap_lock); @@ -822,14 +824,13 @@ void ceph_mdsc_release_request(struct kref *kref) ceph_msg_put(req->r_reply); if (req->r_inode) { ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); - /* avoid calling iput_final() in mds dispatch threads */ - ceph_async_iput(req->r_inode); + iput(req->r_inode); } if (req->r_parent) { ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); - ceph_async_iput(req->r_parent); + iput(req->r_parent); } - ceph_async_iput(req->r_target_inode); + iput(req->r_target_inode); if (req->r_dentry) dput(req->r_dentry); if (req->r_old_dentry) @@ -843,7 +844,7 @@ void ceph_mdsc_release_request(struct kref *kref) */ ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); - 
ceph_async_iput(req->r_old_dentry_dir); + iput(req->r_old_dentry_dir); } kfree(req->r_path1); kfree(req->r_path2); @@ -958,8 +959,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc, } if (req->r_unsafe_dir) { - /* avoid calling iput_final() in mds dispatch threads */ - ceph_async_iput(req->r_unsafe_dir); + iput(req->r_unsafe_dir); req->r_unsafe_dir = NULL; } @@ -1130,7 +1130,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); if (!cap) { spin_unlock(&ci->i_ceph_lock); - ceph_async_iput(inode); + iput(inode); goto random; } mds = cap->session->s_mds; @@ -1139,9 +1139,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, cap == ci->i_auth_cap ? "auth " : "", cap); spin_unlock(&ci->i_ceph_lock); out: - /* avoid calling iput_final() while holding mdsc->mutex or - * in mds dispatch threads */ - ceph_async_iput(inode); + iput(inode); return mds; random: @@ -1438,8 +1436,7 @@ static void __open_export_target_sessions(struct ceph_mds_client *mdsc, for (i = 0; i < mi->num_export_targets; i++) { ts = __open_export_target_session(mdsc, mi->export_targets[i]); - if (!IS_ERR(ts)) - ceph_put_mds_session(ts); + ceph_put_mds_session(ts); } } @@ -1545,9 +1542,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session, spin_unlock(&session->s_cap_lock); if (last_inode) { - /* avoid calling iput_final() while holding - * s_mutex or in mds dispatch threads */ - ceph_async_iput(last_inode); + iput(last_inode); last_inode = NULL; } if (old_cap) { @@ -1581,7 +1576,7 @@ out: session->s_cap_iterator = NULL; spin_unlock(&session->s_cap_lock); - ceph_async_iput(last_inode); + iput(last_inode); if (old_cap) ceph_put_cap(session->s_mdsc, old_cap); @@ -1721,8 +1716,7 @@ static void remove_session_caps(struct ceph_mds_session *session) spin_unlock(&session->s_cap_lock); inode = ceph_find_inode(sb, vino); - /* avoid calling iput_final() while holding s_mutex */ - ceph_async_iput(inode); + 
iput(inode); spin_lock(&session->s_cap_lock); } @@ -1761,7 +1755,7 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, ci->i_requested_max_size = 0; spin_unlock(&ci->i_ceph_lock); } else if (ev == RENEWCAPS) { - if (cap->cap_gen < cap->session->s_cap_gen) { + if (cap->cap_gen < atomic_read(&cap->session->s_cap_gen)) { /* mds did not re-issue stale cap */ spin_lock(&ci->i_ceph_lock); cap->issued = cap->implemented = CEPH_CAP_PIN; @@ -2988,7 +2982,6 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir, ceph_take_cap_refs(ci, CEPH_CAP_PIN, false); __ceph_touch_fmode(ci, mdsc, fmode); spin_unlock(&ci->i_ceph_lock); - ihold(req->r_parent); } if (req->r_old_dentry_dir) ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), @@ -3499,10 +3492,8 @@ static void handle_session(struct ceph_mds_session *session, case CEPH_SESSION_STALE: pr_info("mds%d caps went stale, renewing\n", session->s_mds); - spin_lock(&session->s_gen_ttl_lock); - session->s_cap_gen++; + atomic_inc(&session->s_cap_gen); session->s_cap_ttl = jiffies - 1; - spin_unlock(&session->s_gen_ttl_lock); send_renew_caps(mdsc, session); break; @@ -3771,7 +3762,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, cap->seq = 0; /* reset cap seq */ cap->issue_seq = 0; /* and issue_seq */ cap->mseq = 0; /* and migrate_seq */ - cap->cap_gen = cap->session->s_cap_gen; + cap->cap_gen = atomic_read(&cap->session->s_cap_gen); /* These are lost when the session goes away */ if (S_ISDIR(inode->i_mode)) { @@ -4011,9 +4002,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, dout("session %p state %s\n", session, ceph_session_state_name(session->s_state)); - spin_lock(&session->s_gen_ttl_lock); - session->s_cap_gen++; - spin_unlock(&session->s_gen_ttl_lock); + atomic_inc(&session->s_cap_gen); spin_lock(&session->s_cap_lock); /* don't know if session is readonly */ @@ -4344,7 +4333,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, case 
CEPH_MDS_LEASE_RENEW: if (di->lease_session == session && - di->lease_gen == session->s_cap_gen && + di->lease_gen == atomic_read(&session->s_cap_gen) && di->lease_renew_from && di->lease_renew_after == 0) { unsigned long duration = @@ -4372,8 +4361,7 @@ release: out: mutex_unlock(&session->s_mutex); - /* avoid calling iput_final() in mds dispatch threads */ - ceph_async_iput(inode); + iput(inode); return; bad: diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 15c11a0f2caf..20e42d8b66c6 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -186,10 +186,8 @@ struct ceph_mds_session { struct ceph_auth_handshake s_auth; - /* protected by s_gen_ttl_lock */ - spinlock_t s_gen_ttl_lock; - u32 s_cap_gen; /* inc each time we get mds stale msg */ - unsigned long s_cap_ttl; /* when session caps expire */ + atomic_t s_cap_gen; /* inc each time we get mds stale msg */ + unsigned long s_cap_ttl; /* when session caps expire. protected by s_mutex */ /* protected by s_cap_lock */ spinlock_t s_cap_lock; diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index 28b6b42ad677..5ac151eb0d49 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -20,8 +20,11 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, struct ceph_opened_files *files; struct ceph_pinned_icaps *icaps; struct ceph_opened_inodes *inodes; + struct ceph_read_io_size *rsize; + struct ceph_write_io_size *wsize; struct ceph_client_metric *m = &mdsc->metric; u64 nr_caps = atomic64_read(&m->total_caps); + u32 header_len = sizeof(struct ceph_metric_header); struct ceph_msg *msg; struct timespec64 ts; s64 sum; @@ -30,7 +33,8 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write) + sizeof(*meta) + sizeof(*dlease) + sizeof(*files) - + sizeof(*icaps) + sizeof(*inodes); + + sizeof(*icaps) + sizeof(*inodes) + sizeof(*rsize) + + sizeof(*wsize); msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true); if 
(!msg) { @@ -43,10 +47,10 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, /* encode the cap metric */ cap = (struct ceph_metric_cap *)(head + 1); - cap->type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO); - cap->ver = 1; - cap->compat = 1; - cap->data_len = cpu_to_le32(sizeof(*cap) - 10); + cap->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO); + cap->header.ver = 1; + cap->header.compat = 1; + cap->header.data_len = cpu_to_le32(sizeof(*cap) - header_len); cap->hit = cpu_to_le64(percpu_counter_sum(&m->i_caps_hit)); cap->mis = cpu_to_le64(percpu_counter_sum(&m->i_caps_mis)); cap->total = cpu_to_le64(nr_caps); @@ -54,10 +58,10 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, /* encode the read latency metric */ read = (struct ceph_metric_read_latency *)(cap + 1); - read->type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY); - read->ver = 1; - read->compat = 1; - read->data_len = cpu_to_le32(sizeof(*read) - 10); + read->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY); + read->header.ver = 1; + read->header.compat = 1; + read->header.data_len = cpu_to_le32(sizeof(*read) - header_len); sum = m->read_latency_sum; jiffies_to_timespec64(sum, &ts); read->sec = cpu_to_le32(ts.tv_sec); @@ -66,10 +70,10 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, /* encode the write latency metric */ write = (struct ceph_metric_write_latency *)(read + 1); - write->type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY); - write->ver = 1; - write->compat = 1; - write->data_len = cpu_to_le32(sizeof(*write) - 10); + write->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY); + write->header.ver = 1; + write->header.compat = 1; + write->header.data_len = cpu_to_le32(sizeof(*write) - header_len); sum = m->write_latency_sum; jiffies_to_timespec64(sum, &ts); write->sec = cpu_to_le32(ts.tv_sec); @@ -78,10 +82,10 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, /* encode the metadata latency metric */ 
meta = (struct ceph_metric_metadata_latency *)(write + 1); - meta->type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY); - meta->ver = 1; - meta->compat = 1; - meta->data_len = cpu_to_le32(sizeof(*meta) - 10); + meta->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY); + meta->header.ver = 1; + meta->header.compat = 1; + meta->header.data_len = cpu_to_le32(sizeof(*meta) - header_len); sum = m->metadata_latency_sum; jiffies_to_timespec64(sum, &ts); meta->sec = cpu_to_le32(ts.tv_sec); @@ -90,10 +94,10 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, /* encode the dentry lease metric */ dlease = (struct ceph_metric_dlease *)(meta + 1); - dlease->type = cpu_to_le32(CLIENT_METRIC_TYPE_DENTRY_LEASE); - dlease->ver = 1; - dlease->compat = 1; - dlease->data_len = cpu_to_le32(sizeof(*dlease) - 10); + dlease->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_DENTRY_LEASE); + dlease->header.ver = 1; + dlease->header.compat = 1; + dlease->header.data_len = cpu_to_le32(sizeof(*dlease) - header_len); dlease->hit = cpu_to_le64(percpu_counter_sum(&m->d_lease_hit)); dlease->mis = cpu_to_le64(percpu_counter_sum(&m->d_lease_mis)); dlease->total = cpu_to_le64(atomic64_read(&m->total_dentries)); @@ -103,34 +107,54 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, /* encode the opened files metric */ files = (struct ceph_opened_files *)(dlease + 1); - files->type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_FILES); - files->ver = 1; - files->compat = 1; - files->data_len = cpu_to_le32(sizeof(*files) - 10); + files->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_FILES); + files->header.ver = 1; + files->header.compat = 1; + files->header.data_len = cpu_to_le32(sizeof(*files) - header_len); files->opened_files = cpu_to_le64(atomic64_read(&m->opened_files)); files->total = cpu_to_le64(sum); items++; /* encode the pinned icaps metric */ icaps = (struct ceph_pinned_icaps *)(files + 1); - icaps->type = 
cpu_to_le32(CLIENT_METRIC_TYPE_PINNED_ICAPS); - icaps->ver = 1; - icaps->compat = 1; - icaps->data_len = cpu_to_le32(sizeof(*icaps) - 10); + icaps->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_PINNED_ICAPS); + icaps->header.ver = 1; + icaps->header.compat = 1; + icaps->header.data_len = cpu_to_le32(sizeof(*icaps) - header_len); icaps->pinned_icaps = cpu_to_le64(nr_caps); icaps->total = cpu_to_le64(sum); items++; /* encode the opened inodes metric */ inodes = (struct ceph_opened_inodes *)(icaps + 1); - inodes->type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_INODES); - inodes->ver = 1; - inodes->compat = 1; - inodes->data_len = cpu_to_le32(sizeof(*inodes) - 10); + inodes->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_INODES); + inodes->header.ver = 1; + inodes->header.compat = 1; + inodes->header.data_len = cpu_to_le32(sizeof(*inodes) - header_len); inodes->opened_inodes = cpu_to_le64(percpu_counter_sum(&m->opened_inodes)); inodes->total = cpu_to_le64(sum); items++; + /* encode the read io size metric */ + rsize = (struct ceph_read_io_size *)(inodes + 1); + rsize->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_IO_SIZES); + rsize->header.ver = 1; + rsize->header.compat = 1; + rsize->header.data_len = cpu_to_le32(sizeof(*rsize) - header_len); + rsize->total_ops = cpu_to_le64(m->total_reads); + rsize->total_size = cpu_to_le64(m->read_size_sum); + items++; + + /* encode the write io size metric */ + wsize = (struct ceph_write_io_size *)(rsize + 1); + wsize->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_IO_SIZES); + wsize->header.ver = 1; + wsize->header.compat = 1; + wsize->header.data_len = cpu_to_le32(sizeof(*wsize) - header_len); + wsize->total_ops = cpu_to_le64(m->total_writes); + wsize->total_size = cpu_to_le64(m->write_size_sum); + items++; + put_unaligned_le32(items, &head->num); msg->front.iov_len = len; msg->hdr.version = cpu_to_le16(1); @@ -225,6 +249,9 @@ int ceph_metric_init(struct ceph_client_metric *m) m->read_latency_max = 0; m->total_reads = 0; 
m->read_latency_sum = 0; + m->read_size_min = U64_MAX; + m->read_size_max = 0; + m->read_size_sum = 0; spin_lock_init(&m->write_metric_lock); m->write_latency_sq_sum = 0; @@ -232,6 +259,9 @@ int ceph_metric_init(struct ceph_client_metric *m) m->write_latency_max = 0; m->total_writes = 0; m->write_latency_sum = 0; + m->write_size_min = U64_MAX; + m->write_size_max = 0; + m->write_size_sum = 0; spin_lock_init(&m->metadata_metric_lock); m->metadata_latency_sq_sum = 0; @@ -281,23 +311,21 @@ void ceph_metric_destroy(struct ceph_client_metric *m) cancel_delayed_work_sync(&m->delayed_work); - if (m->session) - ceph_put_mds_session(m->session); + ceph_put_mds_session(m->session); } -static inline void __update_latency(ktime_t *totalp, ktime_t *lsump, - ktime_t *min, ktime_t *max, - ktime_t *sq_sump, ktime_t lat) -{ - ktime_t total, avg, sq, lsum; - - total = ++(*totalp); - lsum = (*lsump += lat); +#define METRIC_UPDATE_MIN_MAX(min, max, new) \ +{ \ + if (unlikely(new < min)) \ + min = new; \ + if (unlikely(new > max)) \ + max = new; \ +} - if (unlikely(lat < *min)) - *min = lat; - if (unlikely(lat > *max)) - *max = lat; +static inline void __update_stdev(ktime_t total, ktime_t lsum, + ktime_t *sq_sump, ktime_t lat) +{ + ktime_t avg, sq; if (unlikely(total == 1)) return; @@ -312,33 +340,51 @@ static inline void __update_latency(ktime_t *totalp, ktime_t *lsump, void ceph_update_read_metrics(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, - int rc) + unsigned int size, int rc) { ktime_t lat = ktime_sub(r_end, r_start); + ktime_t total; if (unlikely(rc < 0 && rc != -ENOENT && rc != -ETIMEDOUT)) return; spin_lock(&m->read_metric_lock); - __update_latency(&m->total_reads, &m->read_latency_sum, - &m->read_latency_min, &m->read_latency_max, - &m->read_latency_sq_sum, lat); + total = ++m->total_reads; + m->read_size_sum += size; + m->read_latency_sum += lat; + METRIC_UPDATE_MIN_MAX(m->read_size_min, + m->read_size_max, + size); + 
METRIC_UPDATE_MIN_MAX(m->read_latency_min, + m->read_latency_max, + lat); + __update_stdev(total, m->read_latency_sum, + &m->read_latency_sq_sum, lat); spin_unlock(&m->read_metric_lock); } void ceph_update_write_metrics(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, - int rc) + unsigned int size, int rc) { ktime_t lat = ktime_sub(r_end, r_start); + ktime_t total; if (unlikely(rc && rc != -ETIMEDOUT)) return; spin_lock(&m->write_metric_lock); - __update_latency(&m->total_writes, &m->write_latency_sum, - &m->write_latency_min, &m->write_latency_max, - &m->write_latency_sq_sum, lat); + total = ++m->total_writes; + m->write_size_sum += size; + m->write_latency_sum += lat; + METRIC_UPDATE_MIN_MAX(m->write_size_min, + m->write_size_max, + size); + METRIC_UPDATE_MIN_MAX(m->write_latency_min, + m->write_latency_max, + lat); + __update_stdev(total, m->write_latency_sum, + &m->write_latency_sq_sum, lat); spin_unlock(&m->write_metric_lock); } @@ -347,13 +393,18 @@ void ceph_update_metadata_metrics(struct ceph_client_metric *m, int rc) { ktime_t lat = ktime_sub(r_end, r_start); + ktime_t total; if (unlikely(rc && rc != -ENOENT)) return; spin_lock(&m->metadata_metric_lock); - __update_latency(&m->total_metadatas, &m->metadata_latency_sum, - &m->metadata_latency_min, &m->metadata_latency_max, - &m->metadata_latency_sq_sum, lat); + total = ++m->total_metadatas; + m->metadata_latency_sum += lat; + METRIC_UPDATE_MIN_MAX(m->metadata_latency_min, + m->metadata_latency_max, + lat); + __update_stdev(total, m->metadata_latency_sum, + &m->metadata_latency_sq_sum, lat); spin_unlock(&m->metadata_metric_lock); } diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index e984eb2bb14b..0133955a3c6a 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -17,8 +17,10 @@ enum ceph_metric_type { CLIENT_METRIC_TYPE_OPENED_FILES, CLIENT_METRIC_TYPE_PINNED_ICAPS, CLIENT_METRIC_TYPE_OPENED_INODES, + CLIENT_METRIC_TYPE_READ_IO_SIZES, + CLIENT_METRIC_TYPE_WRITE_IO_SIZES, - 
CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_OPENED_INODES, + CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_WRITE_IO_SIZES, }; /* @@ -34,18 +36,22 @@ enum ceph_metric_type { CLIENT_METRIC_TYPE_OPENED_FILES, \ CLIENT_METRIC_TYPE_PINNED_ICAPS, \ CLIENT_METRIC_TYPE_OPENED_INODES, \ + CLIENT_METRIC_TYPE_READ_IO_SIZES, \ + CLIENT_METRIC_TYPE_WRITE_IO_SIZES, \ \ CLIENT_METRIC_TYPE_MAX, \ } -/* metric caps header */ -struct ceph_metric_cap { +struct ceph_metric_header { __le32 type; /* ceph metric type */ - __u8 ver; __u8 compat; - __le32 data_len; /* length of sizeof(hit + mis + total) */ +} __packed; + +/* metric caps header */ +struct ceph_metric_cap { + struct ceph_metric_header header; __le64 hit; __le64 mis; __le64 total; @@ -53,48 +59,28 @@ struct ceph_metric_cap { /* metric read latency header */ struct ceph_metric_read_latency { - __le32 type; /* ceph metric type */ - - __u8 ver; - __u8 compat; - - __le32 data_len; /* length of sizeof(sec + nsec) */ + struct ceph_metric_header header; __le32 sec; __le32 nsec; } __packed; /* metric write latency header */ struct ceph_metric_write_latency { - __le32 type; /* ceph metric type */ - - __u8 ver; - __u8 compat; - - __le32 data_len; /* length of sizeof(sec + nsec) */ + struct ceph_metric_header header; __le32 sec; __le32 nsec; } __packed; /* metric metadata latency header */ struct ceph_metric_metadata_latency { - __le32 type; /* ceph metric type */ - - __u8 ver; - __u8 compat; - - __le32 data_len; /* length of sizeof(sec + nsec) */ + struct ceph_metric_header header; __le32 sec; __le32 nsec; } __packed; /* metric dentry lease header */ struct ceph_metric_dlease { - __le32 type; /* ceph metric type */ - - __u8 ver; - __u8 compat; - - __le32 data_len; /* length of sizeof(hit + mis + total) */ + struct ceph_metric_header header; __le64 hit; __le64 mis; __le64 total; @@ -102,40 +88,39 @@ struct ceph_metric_dlease { /* metric opened files header */ struct ceph_opened_files { - __le32 type; /* ceph metric type */ - - __u8 ver; - 
__u8 compat; - - __le32 data_len; /* length of sizeof(opened_files + total) */ + struct ceph_metric_header header; __le64 opened_files; __le64 total; } __packed; /* metric pinned i_caps header */ struct ceph_pinned_icaps { - __le32 type; /* ceph metric type */ - - __u8 ver; - __u8 compat; - - __le32 data_len; /* length of sizeof(pinned_icaps + total) */ + struct ceph_metric_header header; __le64 pinned_icaps; __le64 total; } __packed; /* metric opened inodes header */ struct ceph_opened_inodes { - __le32 type; /* ceph metric type */ - - __u8 ver; - __u8 compat; - - __le32 data_len; /* length of sizeof(opened_inodes + total) */ + struct ceph_metric_header header; __le64 opened_inodes; __le64 total; } __packed; +/* metric read io size header */ +struct ceph_read_io_size { + struct ceph_metric_header header; + __le64 total_ops; + __le64 total_size; +} __packed; + +/* metric write io size header */ +struct ceph_write_io_size { + struct ceph_metric_header header; + __le64 total_ops; + __le64 total_size; +} __packed; + struct ceph_metric_head { __le32 num; /* the number of metrics that will be sent */ } __packed; @@ -152,6 +137,9 @@ struct ceph_client_metric { spinlock_t read_metric_lock; u64 total_reads; + u64 read_size_sum; + u64 read_size_min; + u64 read_size_max; ktime_t read_latency_sum; ktime_t read_latency_sq_sum; ktime_t read_latency_min; @@ -159,6 +147,9 @@ struct ceph_client_metric { spinlock_t write_metric_lock; u64 total_writes; + u64 write_size_sum; + u64 write_size_min; + u64 write_size_max; ktime_t write_latency_sum; ktime_t write_latency_sq_sum; ktime_t write_latency_min; @@ -206,10 +197,10 @@ static inline void ceph_update_cap_mis(struct ceph_client_metric *m) extern void ceph_update_read_metrics(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, - int rc); + unsigned int size, int rc); extern void ceph_update_write_metrics(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, - int rc); + unsigned int size, int rc); extern void 
ceph_update_metadata_metrics(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, int rc); diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 4e32c9600ecc..620c691af40e 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -74,8 +74,7 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc, le64_to_cpu(h->max_files)); spin_unlock(&ci->i_ceph_lock); - /* avoid calling iput_final() in dispatch thread */ - ceph_async_iput(inode); + iput(inode); } static struct ceph_quotarealm_inode * @@ -247,8 +246,7 @@ restart: ci = ceph_inode(in); has_quota = __ceph_has_any_quota(ci); - /* avoid calling iput_final() while holding mdsc->snap_rwsem */ - ceph_async_iput(in); + iput(in); next = realm->parent; if (has_quota || !next) @@ -383,8 +381,7 @@ restart: pr_warn("Invalid quota check op (%d)\n", op); exceeded = true; /* Just break the loop */ } - /* avoid calling iput_final() while holding mdsc->snap_rwsem */ - ceph_async_iput(in); + iput(in); next = realm->parent; if (exceeded || !next) diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 4ce18055d931..4ac0606dcbd4 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -60,11 +60,13 @@ /* * increase ref count for the realm * - * caller must hold snap_rwsem for write. + * caller must hold snap_rwsem. */ void ceph_get_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm) { + lockdep_assert_held(&mdsc->snap_rwsem); + dout("get_realm %p %d -> %d\n", realm, atomic_read(&realm->nref), atomic_read(&realm->nref)+1); /* @@ -113,6 +115,8 @@ static struct ceph_snap_realm *ceph_create_snap_realm( { struct ceph_snap_realm *realm; + lockdep_assert_held_write(&mdsc->snap_rwsem); + realm = kzalloc(sizeof(*realm), GFP_NOFS); if (!realm) return ERR_PTR(-ENOMEM); @@ -135,7 +139,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm( /* * lookup the realm rooted at @ino. * - * caller must hold snap_rwsem for write. + * caller must hold snap_rwsem. 
*/ static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc, u64 ino) @@ -143,6 +147,8 @@ static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc, struct rb_node *n = mdsc->snap_realms.rb_node; struct ceph_snap_realm *r; + lockdep_assert_held(&mdsc->snap_rwsem); + while (n) { r = rb_entry(n, struct ceph_snap_realm, node); if (ino < r->ino) @@ -176,6 +182,8 @@ static void __put_snap_realm(struct ceph_mds_client *mdsc, static void __destroy_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm) { + lockdep_assert_held_write(&mdsc->snap_rwsem); + dout("__destroy_snap_realm %p %llx\n", realm, realm->ino); rb_erase(&realm->node, &mdsc->snap_realms); @@ -198,6 +206,8 @@ static void __destroy_snap_realm(struct ceph_mds_client *mdsc, static void __put_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm) { + lockdep_assert_held_write(&mdsc->snap_rwsem); + dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm, atomic_read(&realm->nref), atomic_read(&realm->nref)-1); if (atomic_dec_and_test(&realm->nref)) @@ -236,6 +246,8 @@ static void __cleanup_empty_realms(struct ceph_mds_client *mdsc) { struct ceph_snap_realm *realm; + lockdep_assert_held_write(&mdsc->snap_rwsem); + spin_lock(&mdsc->snap_empty_lock); while (!list_empty(&mdsc->snap_empty)) { realm = list_first_entry(&mdsc->snap_empty, @@ -269,6 +281,8 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc, { struct ceph_snap_realm *parent; + lockdep_assert_held_write(&mdsc->snap_rwsem); + if (realm->parent_ino == parentino) return 0; @@ -460,7 +474,7 @@ static bool has_new_snaps(struct ceph_snap_context *o, * Caller must hold snap_rwsem for read (i.e., the realm topology won't * change). 
*/ -void ceph_queue_cap_snap(struct ceph_inode_info *ci) +static void ceph_queue_cap_snap(struct ceph_inode_info *ci) { struct inode *inode = &ci->vfs_inode; struct ceph_cap_snap *capsnap; @@ -663,15 +677,13 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) if (!inode) continue; spin_unlock(&realm->inodes_with_caps_lock); - /* avoid calling iput_final() while holding - * mdsc->snap_rwsem or in mds dispatch threads */ - ceph_async_iput(lastinode); + iput(lastinode); lastinode = inode; ceph_queue_cap_snap(ci); spin_lock(&realm->inodes_with_caps_lock); } spin_unlock(&realm->inodes_with_caps_lock); - ceph_async_iput(lastinode); + iput(lastinode); dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); } @@ -696,6 +708,8 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, int err = -ENOMEM; LIST_HEAD(dirty_realms); + lockdep_assert_held_write(&mdsc->snap_rwsem); + dout("update_snap_trace deletion=%d\n", deletion); more: ceph_decode_need(&p, e, sizeof(*ri), bad); @@ -791,7 +805,7 @@ more: return 0; bad: - err = -EINVAL; + err = -EIO; fail: if (realm && !IS_ERR(realm)) ceph_put_snap_realm(mdsc, realm); @@ -823,17 +837,12 @@ static void flush_snaps(struct ceph_mds_client *mdsc) ihold(inode); spin_unlock(&mdsc->snap_flush_lock); ceph_flush_snaps(ci, &session); - /* avoid calling iput_final() while holding - * session->s_mutex or in mds dispatch threads */ - ceph_async_iput(inode); + iput(inode); spin_lock(&mdsc->snap_flush_lock); } spin_unlock(&mdsc->snap_flush_lock); - if (session) { - mutex_unlock(&session->s_mutex); - ceph_put_mds_session(session); - } + ceph_put_mds_session(session); dout("flush_snaps done\n"); } @@ -969,14 +978,12 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, ceph_get_snap_realm(mdsc, realm); ceph_put_snap_realm(mdsc, oldrealm); - /* avoid calling iput_final() while holding - * mdsc->snap_rwsem or mds in dispatch threads */ - ceph_async_iput(inode); + iput(inode); continue; skip_inode: 
spin_unlock(&ci->i_ceph_lock); - ceph_async_iput(inode); + iput(inode); } /* we may have taken some of the old realm's children. */ diff --git a/fs/ceph/super.h b/fs/ceph/super.h index db80d89556b1..6b6332a5c113 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -931,7 +931,6 @@ extern int ceph_update_snap_trace(struct ceph_mds_client *m, extern void ceph_handle_snap(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg); -extern void ceph_queue_cap_snap(struct ceph_inode_info *ci); extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap); extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); @@ -989,8 +988,6 @@ extern int ceph_inode_holds_cap(struct inode *inode, int mask); extern bool ceph_inode_set_size(struct inode *inode, loff_t size); extern void __ceph_do_pending_vmtruncate(struct inode *inode); -extern void ceph_async_iput(struct inode *inode); - void ceph_queue_inode_work(struct inode *inode, int work_bit); static inline void ceph_queue_vmtruncate(struct inode *inode) @@ -1218,7 +1215,7 @@ extern const struct dentry_operations ceph_dentry_ops; extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order); extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); extern struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req, - struct dentry *dentry, int err); + struct dentry *dentry); extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct dentry *dentry, int err); diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index bf52e9326ebe..7364950a9ef4 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -19,6 +19,8 @@ config CIFS select CRYPTO_LIB_DES select KEYS select DNS_RESOLVER + select ASN1 + select OID_REGISTRY help This is the client VFS module for the SMB3 family of NAS protocols, (including support for the most recent, most secure dialect SMB3.1.1) @@ -57,6 +59,7 @@ config CIFS config CIFS_STATS2 bool 
"Extended statistics" depends on CIFS + default y help Enabling this option will allow more detailed statistics on SMB request timing to be displayed in /proc/fs/cifs/DebugData and also @@ -65,8 +68,7 @@ config CIFS_STATS2 for more details. These additional statistics may have a minor effect on performance and memory utilization. - Unless you are a developer or are doing network performance analysis - or tuning, say N. + If unsure, say Y. config CIFS_ALLOW_INSECURE_LEGACY bool "Support legacy servers which use less secure dialects" diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 3ee3b7de4ded..87fcacdf3de7 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -6,12 +6,16 @@ ccflags-y += -I$(src) # needed for trace events obj-$(CONFIG_CIFS) += cifs.o cifs-y := trace.o cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o \ - inode.o link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \ + inode.o link.o misc.o netmisc.o smbencrypt.o transport.o \ cifs_unicode.o nterr.o cifsencrypt.o \ readdir.o ioctl.o sess.o export.o smb1ops.o unc.o winucase.o \ smb2ops.o smb2maperror.o smb2transport.o \ smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o \ - dns_resolve.o + dns_resolve.o cifs_spnego_negtokeninit.asn1.o asn1.o + +$(obj)/asn1.o: $(obj)/cifs_spnego_negtokeninit.asn1.h + +$(obj)/cifs_spnego_negtokeninit.asn1.o: $(obj)/cifs_spnego_negtokeninit.asn1.c $(obj)/cifs_spnego_negtokeninit.asn1.h cifs-$(CONFIG_CIFS_XATTR) += xattr.o diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index 3150c19cdc2f..b5724ef9f182 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -1,612 +1,63 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* - * The ASB.1/BER parsing code is derived from ip_nat_snmp_basic.c which was in - * turn derived from the gxsnmp package by Gregory McLean & Jochen Friedrich - * - * Copyright (c) 2000 RP Internet (www.rpi.net.au). 
- */ #include <linux/module.h> -#include <linux/types.h> #include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/slab.h> -#include "cifspdu.h" +#include <linux/oid_registry.h> #include "cifsglob.h" #include "cifs_debug.h" #include "cifsproto.h" +#include "cifs_spnego_negtokeninit.asn1.h" -/***************************************************************************** - * - * Basic ASN.1 decoding routines (gxsnmp author Dirk Wisse) - * - *****************************************************************************/ - -/* Class */ -#define ASN1_UNI 0 /* Universal */ -#define ASN1_APL 1 /* Application */ -#define ASN1_CTX 2 /* Context */ -#define ASN1_PRV 3 /* Private */ - -/* Tag */ -#define ASN1_EOC 0 /* End Of Contents or N/A */ -#define ASN1_BOL 1 /* Boolean */ -#define ASN1_INT 2 /* Integer */ -#define ASN1_BTS 3 /* Bit String */ -#define ASN1_OTS 4 /* Octet String */ -#define ASN1_NUL 5 /* Null */ -#define ASN1_OJI 6 /* Object Identifier */ -#define ASN1_OJD 7 /* Object Description */ -#define ASN1_EXT 8 /* External */ -#define ASN1_ENUM 10 /* Enumerated */ -#define ASN1_SEQ 16 /* Sequence */ -#define ASN1_SET 17 /* Set */ -#define ASN1_NUMSTR 18 /* Numerical String */ -#define ASN1_PRNSTR 19 /* Printable String */ -#define ASN1_TEXSTR 20 /* Teletext String */ -#define ASN1_VIDSTR 21 /* Video String */ -#define ASN1_IA5STR 22 /* IA5 String */ -#define ASN1_UNITIM 23 /* Universal Time */ -#define ASN1_GENTIM 24 /* General Time */ -#define ASN1_GRASTR 25 /* Graphical String */ -#define ASN1_VISSTR 26 /* Visible String */ -#define ASN1_GENSTR 27 /* General String */ - -/* Primitive / Constructed methods*/ -#define ASN1_PRI 0 /* Primitive */ -#define ASN1_CON 1 /* Constructed */ - -/* - * Error codes. 
- */ -#define ASN1_ERR_NOERROR 0 -#define ASN1_ERR_DEC_EMPTY 2 -#define ASN1_ERR_DEC_EOC_MISMATCH 3 -#define ASN1_ERR_DEC_LENGTH_MISMATCH 4 -#define ASN1_ERR_DEC_BADVALUE 5 - -#define SPNEGO_OID_LEN 7 -#define NTLMSSP_OID_LEN 10 -#define KRB5_OID_LEN 7 -#define KRB5U2U_OID_LEN 8 -#define MSKRB5_OID_LEN 7 -static unsigned long SPNEGO_OID[7] = { 1, 3, 6, 1, 5, 5, 2 }; -static unsigned long NTLMSSP_OID[10] = { 1, 3, 6, 1, 4, 1, 311, 2, 2, 10 }; -static unsigned long KRB5_OID[7] = { 1, 2, 840, 113554, 1, 2, 2 }; -static unsigned long KRB5U2U_OID[8] = { 1, 2, 840, 113554, 1, 2, 2, 3 }; -static unsigned long MSKRB5_OID[7] = { 1, 2, 840, 48018, 1, 2, 2 }; - -/* - * ASN.1 context. - */ -struct asn1_ctx { - int error; /* Error condition */ - unsigned char *pointer; /* Octet just to be decoded */ - unsigned char *begin; /* First octet */ - unsigned char *end; /* Octet after last octet */ -}; - -/* - * Octet string (not null terminated) - */ -struct asn1_octstr { - unsigned char *data; - unsigned int len; -}; - -static void -asn1_open(struct asn1_ctx *ctx, unsigned char *buf, unsigned int len) -{ - ctx->begin = buf; - ctx->end = buf + len; - ctx->pointer = buf; - ctx->error = ASN1_ERR_NOERROR; -} - -static unsigned char -asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch) -{ - if (ctx->pointer >= ctx->end) { - ctx->error = ASN1_ERR_DEC_EMPTY; - return 0; - } - *ch = *(ctx->pointer)++; - return 1; -} - -#if 0 /* will be needed later by spnego decoding/encoding of ntlmssp */ -static unsigned char -asn1_enum_decode(struct asn1_ctx *ctx, __le32 *val) -{ - unsigned char ch; - - if (ctx->pointer >= ctx->end) { - ctx->error = ASN1_ERR_DEC_EMPTY; - return 0; - } - - ch = *(ctx->pointer)++; /* ch has 0xa, ptr points to length octet */ - if ((ch) == ASN1_ENUM) /* if ch value is ENUM, 0xa */ - *val = *(++(ctx->pointer)); /* value has enum value */ - else - return 0; - - ctx->pointer++; - return 1; -} -#endif - -static unsigned char -asn1_tag_decode(struct asn1_ctx *ctx, unsigned 
int *tag) -{ - unsigned char ch; - - *tag = 0; - - do { - if (!asn1_octet_decode(ctx, &ch)) - return 0; - *tag <<= 7; - *tag |= ch & 0x7F; - } while ((ch & 0x80) == 0x80); - return 1; -} - -static unsigned char -asn1_id_decode(struct asn1_ctx *ctx, - unsigned int *cls, unsigned int *con, unsigned int *tag) -{ - unsigned char ch; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *cls = (ch & 0xC0) >> 6; - *con = (ch & 0x20) >> 5; - *tag = (ch & 0x1F); - - if (*tag == 0x1F) { - if (!asn1_tag_decode(ctx, tag)) - return 0; - } - return 1; -} - -static unsigned char -asn1_length_decode(struct asn1_ctx *ctx, unsigned int *def, unsigned int *len) -{ - unsigned char ch, cnt; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - if (ch == 0x80) - *def = 0; - else { - *def = 1; - - if (ch < 0x80) - *len = ch; - else { - cnt = (unsigned char) (ch & 0x7F); - *len = 0; - - while (cnt > 0) { - if (!asn1_octet_decode(ctx, &ch)) - return 0; - *len <<= 8; - *len |= ch; - cnt--; - } - } - } - - /* don't trust len bigger than ctx buffer */ - if (*len > ctx->end - ctx->pointer) - return 0; - - return 1; -} - -static unsigned char -asn1_header_decode(struct asn1_ctx *ctx, - unsigned char **eoc, - unsigned int *cls, unsigned int *con, unsigned int *tag) -{ - unsigned int def = 0; - unsigned int len = 0; - - if (!asn1_id_decode(ctx, cls, con, tag)) - return 0; - - if (!asn1_length_decode(ctx, &def, &len)) - return 0; - - /* primitive shall be definite, indefinite shall be constructed */ - if (*con == ASN1_PRI && !def) - return 0; - - if (def) - *eoc = ctx->pointer + len; - else - *eoc = NULL; - return 1; -} - -static unsigned char -asn1_eoc_decode(struct asn1_ctx *ctx, unsigned char *eoc) +int +decode_negTokenInit(unsigned char *security_blob, int length, + struct TCP_Server_Info *server) { - unsigned char ch; - - if (eoc == NULL) { - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - if (ch != 0x00) { - ctx->error = ASN1_ERR_DEC_EOC_MISMATCH; - return 0; - } - - if 
(!asn1_octet_decode(ctx, &ch)) - return 0; - - if (ch != 0x00) { - ctx->error = ASN1_ERR_DEC_EOC_MISMATCH; - return 0; - } - return 1; - } else { - if (ctx->pointer != eoc) { - ctx->error = ASN1_ERR_DEC_LENGTH_MISMATCH; - return 0; - } + if (asn1_ber_decoder(&cifs_spnego_negtokeninit_decoder, server, + security_blob, length) == 0) return 1; - } -} - -/* static unsigned char asn1_null_decode(struct asn1_ctx *ctx, - unsigned char *eoc) -{ - ctx->pointer = eoc; - return 1; -} - -static unsigned char asn1_long_decode(struct asn1_ctx *ctx, - unsigned char *eoc, long *integer) -{ - unsigned char ch; - unsigned int len; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer = (signed char) ch; - len = 1; - - while (ctx->pointer < eoc) { - if (++len > sizeof(long)) { - ctx->error = ASN1_ERR_DEC_BADVALUE; - return 0; - } - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer <<= 8; - *integer |= ch; - } - return 1; -} - -static unsigned char asn1_uint_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - unsigned int *integer) -{ - unsigned char ch; - unsigned int len; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer = ch; - if (ch == 0) - len = 0; else - len = 1; - - while (ctx->pointer < eoc) { - if (++len > sizeof(unsigned int)) { - ctx->error = ASN1_ERR_DEC_BADVALUE; - return 0; - } - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer <<= 8; - *integer |= ch; - } - return 1; -} - -static unsigned char asn1_ulong_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - unsigned long *integer) -{ - unsigned char ch; - unsigned int len; - - if (!asn1_octet_decode(ctx, &ch)) return 0; - - *integer = ch; - if (ch == 0) - len = 0; - else - len = 1; - - while (ctx->pointer < eoc) { - if (++len > sizeof(unsigned long)) { - ctx->error = ASN1_ERR_DEC_BADVALUE; - return 0; - } - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer <<= 8; - *integer |= ch; - } - return 1; } -static unsigned char -asn1_octets_decode(struct asn1_ctx *ctx, - 
unsigned char *eoc, - unsigned char **octets, unsigned int *len) +int cifs_gssapi_this_mech(void *context, size_t hdrlen, + unsigned char tag, const void *value, size_t vlen) { - unsigned char *ptr; - - *len = 0; - - *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC); - if (*octets == NULL) { - return 0; - } - - ptr = *octets; - while (ctx->pointer < eoc) { - if (!asn1_octet_decode(ctx, (unsigned char *) ptr++)) { - kfree(*octets); - *octets = NULL; - return 0; - } - (*len)++; - } - return 1; -} */ - -static unsigned char -asn1_subid_decode(struct asn1_ctx *ctx, unsigned long *subid) -{ - unsigned char ch; - - *subid = 0; - - do { - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *subid <<= 7; - *subid |= ch & 0x7F; - } while ((ch & 0x80) == 0x80); - return 1; -} - -static int -asn1_oid_decode(struct asn1_ctx *ctx, - unsigned char *eoc, unsigned long **oid, unsigned int *len) -{ - unsigned long subid; - unsigned int size; - unsigned long *optr; - - size = eoc - ctx->pointer + 1; - - /* first subid actually encodes first two subids */ - if (size < 2 || size > UINT_MAX/sizeof(unsigned long)) - return 0; - - *oid = kmalloc_array(size, sizeof(unsigned long), GFP_ATOMIC); - if (*oid == NULL) - return 0; - - optr = *oid; - - if (!asn1_subid_decode(ctx, &subid)) { - kfree(*oid); - *oid = NULL; - return 0; - } - - if (subid < 40) { - optr[0] = 0; - optr[1] = subid; - } else if (subid < 80) { - optr[0] = 1; - optr[1] = subid - 40; - } else { - optr[0] = 2; - optr[1] = subid - 80; - } - - *len = 2; - optr += 2; + enum OID oid; - while (ctx->pointer < eoc) { - if (++(*len) > size) { - ctx->error = ASN1_ERR_DEC_BADVALUE; - kfree(*oid); - *oid = NULL; - return 0; - } + oid = look_up_OID(value, vlen); + if (oid != OID_spnego) { + char buf[50]; - if (!asn1_subid_decode(ctx, optr++)) { - kfree(*oid); - *oid = NULL; - return 0; - } + sprint_oid(value, vlen, buf, sizeof(buf)); + cifs_dbg(FYI, "Error decoding negTokenInit header: unexpected OID %s\n", + buf); + return -EBADMSG; } - 
return 1; + return 0; } -static int -compare_oid(unsigned long *oid1, unsigned int oid1len, - unsigned long *oid2, unsigned int oid2len) +int cifs_neg_token_init_mech_type(void *context, size_t hdrlen, + unsigned char tag, + const void *value, size_t vlen) { - unsigned int i; + struct TCP_Server_Info *server = context; + enum OID oid; - if (oid1len != oid2len) - return 0; + oid = look_up_OID(value, vlen); + if (oid == OID_mskrb5) + server->sec_mskerberos = true; + else if (oid == OID_krb5u2u) + server->sec_kerberosu2u = true; + else if (oid == OID_krb5) + server->sec_kerberos = true; + else if (oid == OID_ntlmssp) + server->sec_ntlmssp = true; else { - for (i = 0; i < oid1len; i++) { - if (oid1[i] != oid2[i]) - return 0; - } - return 1; - } -} - - /* BB check for endian conversion issues here */ - -int -decode_negTokenInit(unsigned char *security_blob, int length, - struct TCP_Server_Info *server) -{ - struct asn1_ctx ctx; - unsigned char *end; - unsigned char *sequence_end; - unsigned long *oid = NULL; - unsigned int cls, con, tag, oidlen, rc; - - /* cifs_dump_mem(" Received SecBlob ", security_blob, length); */ - - asn1_open(&ctx, security_blob, length); + char buf[50]; - /* GSSAPI header */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cifs_dbg(FYI, "Error decoding negTokenInit header\n"); - return 0; - } else if ((cls != ASN1_APL) || (con != ASN1_CON) - || (tag != ASN1_EOC)) { - cifs_dbg(FYI, "cls = %d con = %d tag = %d\n", cls, con, tag); - return 0; + sprint_oid(value, vlen, buf, sizeof(buf)); + cifs_dbg(FYI, "Decoding negTokenInit: unsupported OID %s\n", + buf); } - - /* Check for SPNEGO OID -- remember to free obj->oid */ - rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag); - if (rc) { - if ((tag == ASN1_OJI) && (con == ASN1_PRI) && - (cls == ASN1_UNI)) { - rc = asn1_oid_decode(&ctx, end, &oid, &oidlen); - if (rc) { - rc = compare_oid(oid, oidlen, SPNEGO_OID, - SPNEGO_OID_LEN); - kfree(oid); - } - } else - rc = 0; - } - - /* SPNEGO 
OID not present or garbled -- bail out */ - if (!rc) { - cifs_dbg(FYI, "Error decoding negTokenInit header\n"); - return 0; - } - - /* SPNEGO */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cifs_dbg(FYI, "Error decoding negTokenInit\n"); - return 0; - } else if ((cls != ASN1_CTX) || (con != ASN1_CON) - || (tag != ASN1_EOC)) { - cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p exit 0\n", - cls, con, tag, end); - return 0; - } - - /* negTokenInit */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cifs_dbg(FYI, "Error decoding negTokenInit\n"); - return 0; - } else if ((cls != ASN1_UNI) || (con != ASN1_CON) - || (tag != ASN1_SEQ)) { - cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p exit 1\n", - cls, con, tag, end); - return 0; - } - - /* sequence */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cifs_dbg(FYI, "Error decoding 2nd part of negTokenInit\n"); - return 0; - } else if ((cls != ASN1_CTX) || (con != ASN1_CON) - || (tag != ASN1_EOC)) { - cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p exit 0\n", - cls, con, tag, end); - return 0; - } - - /* sequence of */ - if (asn1_header_decode - (&ctx, &sequence_end, &cls, &con, &tag) == 0) { - cifs_dbg(FYI, "Error decoding 2nd part of negTokenInit\n"); - return 0; - } else if ((cls != ASN1_UNI) || (con != ASN1_CON) - || (tag != ASN1_SEQ)) { - cifs_dbg(FYI, "cls = %d con = %d tag = %d sequence_end = %p exit 1\n", - cls, con, tag, sequence_end); - return 0; - } - - /* list of security mechanisms */ - while (!asn1_eoc_decode(&ctx, sequence_end)) { - rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag); - if (!rc) { - cifs_dbg(FYI, "Error decoding negTokenInit hdr exit2\n"); - return 0; - } - if ((tag == ASN1_OJI) && (con == ASN1_PRI)) { - if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) { - - cifs_dbg(FYI, "OID len = %d oid = 0x%lx 0x%lx 0x%lx 0x%lx\n", - oidlen, *oid, *(oid + 1), *(oid + 2), - *(oid + 3)); - - if (compare_oid(oid, oidlen, MSKRB5_OID, - 
MSKRB5_OID_LEN)) - server->sec_mskerberos = true; - else if (compare_oid(oid, oidlen, KRB5U2U_OID, - KRB5U2U_OID_LEN)) - server->sec_kerberosu2u = true; - else if (compare_oid(oid, oidlen, KRB5_OID, - KRB5_OID_LEN)) - server->sec_kerberos = true; - else if (compare_oid(oid, oidlen, NTLMSSP_OID, - NTLMSSP_OID_LEN)) - server->sec_ntlmssp = true; - - kfree(oid); - } - } else { - cifs_dbg(FYI, "Should be an oid what is going on?\n"); - } - } - - /* - * We currently ignore anything at the end of the SPNEGO blob after - * the mechTypes have been parsed, since none of that info is - * used at the moment. - */ - return 1; + return 0; } diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c index 488fe0ffc1ef..8a3b30ec860c 100644 --- a/fs/cifs/cache.c +++ b/fs/cifs/cache.c @@ -1,22 +1,10 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/cache.c - CIFS filesystem cache index structure definitions * * Copyright (c) 2010 Novell, Inc. * Authors(s): Suresh Jayaraman (sjayaraman@suse.de> * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include "fscache.h" #include "cifs_debug.h" diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 68e8e5b27841..8857ac7e7a14 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -50,7 +50,6 @@ void cifs_dump_detail(void *buf, struct TCP_Server_Info *server) void cifs_dump_mids(struct TCP_Server_Info *server) { #ifdef CONFIG_CIFS_DEBUG2 - struct list_head *tmp; struct mid_q_entry *mid_entry; if (server == NULL) @@ -58,8 +57,7 @@ void cifs_dump_mids(struct TCP_Server_Info *server) cifs_dbg(VFS, "Dump pending requests:\n"); spin_lock(&GlobalMid_Lock); - list_for_each(tmp, &server->pending_mid_q) { - mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) { cifs_dbg(VFS, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %llu\n", mid_entry->mid_state, le16_to_cpu(mid_entry->command), @@ -168,7 +166,7 @@ cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface) static int cifs_debug_files_proc_show(struct seq_file *m, void *v) { - struct list_head *stmp, *tmp, *tmp1, *tmp2; + struct list_head *tmp, *tmp1, *tmp2; struct TCP_Server_Info *server; struct cifs_ses *ses; struct cifs_tcon *tcon; @@ -183,9 +181,7 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v) seq_printf(m, " <filename>\n"); #endif /* CIFS_DEBUG2 */ spin_lock(&cifs_tcp_ses_lock); - list_for_each(stmp, &cifs_tcp_ses_list) { - server = list_entry(stmp, struct TCP_Server_Info, - tcp_ses_list); + list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { list_for_each(tmp, &server->smb_ses_list) { ses = list_entry(tmp, struct cifs_ses, smb_ses_list); list_for_each(tmp1, &ses->tcon_list) { @@ -220,7 +216,7 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v) static 
int cifs_debug_data_proc_show(struct seq_file *m, void *v) { - struct list_head *tmp1, *tmp2, *tmp3; + struct list_head *tmp2, *tmp3; struct mid_q_entry *mid_entry; struct TCP_Server_Info *server; struct cifs_ses *ses; @@ -278,11 +274,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) c = 0; spin_lock(&cifs_tcp_ses_lock); - list_for_each(tmp1, &cifs_tcp_ses_list) { - server = list_entry(tmp1, struct TCP_Server_Info, - tcp_ses_list); - - /* channel info will be printed as a part of sessions below */ + list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { if (server->is_channel) continue; @@ -563,7 +555,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_CIFS_STATS2 int j; #endif /* STATS2 */ - struct list_head *tmp1, *tmp2, *tmp3; + struct list_head *tmp2, *tmp3; struct TCP_Server_Info *server; struct cifs_ses *ses; struct cifs_tcon *tcon; @@ -594,9 +586,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v) i = 0; spin_lock(&cifs_tcp_ses_lock); - list_for_each(tmp1, &cifs_tcp_ses_list) { - server = list_entry(tmp1, struct TCP_Server_Info, - tcp_ses_list); + list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { seq_printf(m, "\nMax requests in flight: %d", server->max_in_flight); #ifdef CONFIG_CIFS_STATS2 seq_puts(m, "\nTotal time spent processing by command. 
Time "); diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index 5e66dab712d0..ee4ea2b60c0f 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -3,7 +3,7 @@ * * Copyright (c) International Business Machines Corp., 2000,2002 * Modified by Steve French (sfrench@us.ibm.com) -*/ + */ #ifndef _H_CIFS_DEBUG #define _H_CIFS_DEBUG diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index c87c37cf2914..ec57cdb1590f 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -125,7 +125,7 @@ cifs_build_devname(char *nodename, const char *prepath) * @sb_mountdata: parent/root DFS mount options (template) * @fullpath: full path in UNC format * @ref: optional server's referral - * + * @devname: return the built cifs device name if passed pointer not NULL * creates mount options for submount based on template options sb_mountdata * and replacing unc,ip,prefixpath options with ones we've got form ref_unc. * diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 9c45b3a82ad9..4fd788586399 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -1,19 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/cifs_fs_sb.h * * Copyright (c) International Business Machines Corp., 2002,2004 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * */ #include <linux/rbtree.h> @@ -72,11 +63,12 @@ struct cifs_sb_info { char *prepath; /* - * Path initially provided by the mount call. 
We might connect - * to something different via DFS but we want to keep it to do - * failover properly. + * Canonical DFS path initially provided by the mount call. We might connect to something + * different via DFS but we want to keep it to do failover properly. */ char *origin_fullpath; /* \\HOST\SHARE\[OPTIONAL PATH] */ + /* randomly generated 128-bit number for indexing dfs mount groups in referral cache */ + uuid_t dfs_mount_id; /* * Indicate whether serverino option was turned off later * (cifs_autodisable_serverino) in order to match new mounts. diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index 37fc7d6ac457..ef723be358af 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/cifs_ioctl.h * @@ -5,16 +6,6 @@ * * Copyright (c) 2015 Steve French <steve.french@primarydata.com> * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * */ struct smb_mnt_fs_info { diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 7b9b876b513b..8fa26a8530f8 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -1,22 +1,10 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/cifs_spnego.c -- SPNEGO upcall management for CIFS * * Copyright (c) 2007 Red Hat, Inc. 
* Author(s): Jeff Layton (jlayton@redhat.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/list.h> diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h index 31bef9ee078b..31387d0ea32e 100644 --- a/fs/cifs/cifs_spnego.h +++ b/fs/cifs/cifs_spnego.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/cifs_spnego.h -- SPNEGO upcall management for CIFS * @@ -5,19 +6,6 @@ * Author(s): Jeff Layton (jlayton@redhat.com) * Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _CIFS_SPNEGO_H diff --git a/fs/cifs/cifs_spnego_negtokeninit.asn1 b/fs/cifs/cifs_spnego_negtokeninit.asn1 new file mode 100644 index 000000000000..181c083887d5 --- /dev/null +++ b/fs/cifs/cifs_spnego_negtokeninit.asn1 @@ -0,0 +1,40 @@ +GSSAPI ::= + [APPLICATION 0] IMPLICIT SEQUENCE { + thisMech + OBJECT IDENTIFIER ({cifs_gssapi_this_mech}), + negotiationToken + NegotiationToken + } + +MechType ::= OBJECT IDENTIFIER ({cifs_neg_token_init_mech_type}) + +MechTypeList ::= SEQUENCE OF MechType + +NegHints ::= SEQUENCE { + hintName + [0] GeneralString OPTIONAL, + hintAddress + [1] OCTET STRING OPTIONAL + } + +NegTokenInit2 ::= + SEQUENCE { + mechTypes + [0] MechTypeList OPTIONAL, + reqFlags + [1] BIT STRING OPTIONAL, + mechToken + [2] OCTET STRING OPTIONAL, + negHints + [3] NegHints OPTIONAL, + mechListMIC + [3] OCTET STRING OPTIONAL + } + +NegotiationToken ::= + CHOICE { + negTokenInit + [0] NegTokenInit2, + negTokenTarg + [1] ANY + } diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c index d829b8bf833e..93b47818c6c2 100644 --- a/fs/cifs/cifs_swn.c +++ b/fs/cifs/cifs_swn.c @@ -447,15 +447,13 @@ static int cifs_swn_store_swn_addr(const struct sockaddr_storage *new, const struct sockaddr_storage *old, struct sockaddr_storage *dst) { - __be16 port; + __be16 port = cpu_to_be16(CIFS_PORT); if (old->ss_family == AF_INET) { struct sockaddr_in *ipv4 = (struct sockaddr_in *)old; port = ipv4->sin_port; - } - - if (old->ss_family == AF_INET6) { + } else if (old->ss_family == AF_INET6) { struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)old; port = ipv6->sin6_port; @@ -465,9 +463,7 @@ static int cifs_swn_store_swn_addr(const struct sockaddr_storage *new, struct sockaddr_in *ipv4 = (struct sockaddr_in *)new; ipv4->sin_port = port; - } - - if 
(new->ss_family == AF_INET6) { + } else if (new->ss_family == AF_INET6) { struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)new; ipv6->sin6_port = port; diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 784407f9280f..388eb536cff1 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/cifsacl.c * @@ -6,19 +7,6 @@ * * Contains the routines for mapping CIFS/NTFS ACLs * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> @@ -409,7 +397,6 @@ try_upcall_to_get_id: saved_cred = override_creds(root_cred); sidkey = request_key(&cifs_idmap_key_type, sidstr, ""); if (IS_ERR(sidkey)) { - rc = -EINVAL; cifs_dbg(FYI, "%s: Can't map SID %s to a %cid\n", __func__, sidstr, sidtype == SIDOWNER ? 'u' : 'g'); goto out_revert_creds; @@ -422,7 +409,6 @@ try_upcall_to_get_id: */ BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t)); if (sidkey->datalen != sizeof(uid_t)) { - rc = -EIO; cifs_dbg(FYI, "%s: Downcall contained malformed key (datalen=%hu)\n", __func__, sidkey->datalen); key_invalidate(sidkey); @@ -1308,7 +1294,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset); ndacl_ptr->revision = dacloffset ? 
dacl_ptr->revision : cpu_to_le16(ACL_REVISION); - ndacl_ptr->num_aces = dacl_ptr->num_aces; + ndacl_ptr->num_aces = dacl_ptr ? dacl_ptr->num_aces : 0; if (uid_valid(uid)) { /* chown */ uid_t id; diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h index d9e704979d99..f8292bcf8594 100644 --- a/fs/cifs/cifsacl.h +++ b/fs/cifs/cifsacl.h @@ -1,28 +1,15 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/cifsacl.h * * Copyright (c) International Business Machines Corp., 2007 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _CIFSACL_H #define _CIFSACL_H - #define NUM_AUTHS (6) /* number of authority fields */ #define SID_MAX_SUB_AUTHORITIES (15) /* max number of sub authority fields */ diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index b8f1ff9a83f3..ecf15d845dbd 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/cifsencrypt.c * @@ -7,19 +8,6 @@ * Copyright (C) International Business Machines Corp., 2005,2013 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 2ffcb29d5c8f..9fb874dd8d24 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/cifsfs.c * @@ -6,19 +7,6 @@ * * Common Internet FileSystem (CIFS) client * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* Note that BB means BUGBUG (ie something to fix eventually) */ diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 6beddb108ba0..177f3e7ab86d 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -1,22 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/cifsfs.h * * Copyright (c) International Business Machines Corp., 2002, 2007 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. 
- * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _CIFSFS_H diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 8488d7024462..3100f8b66e60 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/cifsglob.h * @@ -5,16 +6,6 @@ * Author(s): Steve French (sfrench@us.ibm.com) * Jeremy Allison (jra@samba.org) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * */ #ifndef _CIFS_GLOB_H #define _CIFS_GLOB_H @@ -630,7 +621,7 @@ struct TCP_Server_Info { /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. 
*/ unsigned int capabilities; /* selective disabling of caps by smb sess */ int timeAdj; /* Adjust for difference in server time zone in sec */ - __u64 CurrentMid; /* multiplex id - rotating counter */ + __u64 CurrentMid; /* multiplex id - rotating counter, protected by GlobalMid_Lock */ char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */ /* 16th byte of RFC1001 workstation name is always null */ char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; @@ -896,7 +887,7 @@ struct cifs_ses { struct mutex session_mutex; struct TCP_Server_Info *server; /* pointer to server info */ int ses_count; /* reference counter */ - enum statusEnum status; + enum statusEnum status; /* updates protected by GlobalMid_Lock */ unsigned overrideSecFlg; /* if non-zero override global sec flags */ char *serverOS; /* name of operating system underlying server */ char *serverNOS; /* name of network operating system of server */ @@ -1093,8 +1084,7 @@ struct cifs_tcon { struct cached_fid crfid; /* Cached root fid */ /* BB add field for back pointer to sb struct(s)? 
*/ #ifdef CONFIG_CIFS_DFS_UPCALL - char *dfs_path; - int remap:2; + char *dfs_path; /* canonical DFS path */ struct list_head ulist; /* cache update list */ #endif }; @@ -1795,6 +1785,8 @@ require use of the stronger protocol */ * list operations on pending_mid_q and oplockQ * updates to XID counters, multiplex id and SMB sequence numbers * list operations on global DnotifyReqList + * updates to ses->status + * updates to server->CurrentMid * tcp_ses_lock protects: * list operations on tcp and SMB session lists * tcon->open_file_lock protects the list of open files hanging off the tcon diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 554d64fe171e..0923f72d27e9 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -1,22 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/cifspdu.h * * Copyright (c) International Business Machines Corp., 2002,2009 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _CIFSPDU_H diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index d30cba44ba29..e0def0f0714b 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -1,22 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/cifsproto.h * * Copyright (c) International Business Machines Corp., 2002,2008 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _CIFSPROTO_H #define _CIFSPROTO_H diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 41f74163cc1c..58ebec4d4413 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/cifssmb.c * @@ -6,19 +7,6 @@ * * Contains the routines for constructing the SMB PDUs themselves * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. 
- * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* SMB/CIFS PDU handling routines here - except for leftovers in connect.c */ @@ -1220,7 +1208,7 @@ SMBLegacyOpen(const unsigned int xid, struct cifs_tcon *tcon, int *pOplock, FILE_ALL_INFO *pfile_info, const struct nls_table *nls_codepage, int remap) { - int rc = -EACCES; + int rc; OPENX_REQ *pSMB = NULL; OPENX_RSP *pSMBr = NULL; int bytes_returned; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 495c395f9def..5d269f583dac 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1,22 +1,10 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/connect.c * * Copyright (C) International Business Machines Corp., 2002,2011 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> #include <linux/net.h> @@ -368,13 +356,7 @@ cifs_reconnect(struct TCP_Server_Info *server) cifs_server_dbg(VFS, "%s: failed to update DFS target hint: rc = %d\n", __func__, rc); } - rc = dfs_cache_update_vol(cifs_sb->origin_fullpath, server); - if (rc) { - cifs_server_dbg(VFS, "%s: failed to update vol info in DFS cache: rc = %d\n", - __func__, rc); - } dfs_cache_free_tgts(&tgt_list); - } cifs_put_tcp_super(sb); @@ -1557,29 +1539,25 @@ out: /** * cifs_free_ipc - helper to release the session IPC tcon * - * Needs to be called everytime a session is destroyed + * Needs to be called everytime a session is destroyed. + * + * On session close, the IPC is closed and the server must release all tcons of the session. + * No need to send a tree disconnect here. + * + * Besides, it will make the server to not close durable and resilient files on session close, as + * specified in MS-SMB2 3.3.5.6 Receiving an SMB2 LOGOFF Request. 
*/ static int cifs_free_ipc(struct cifs_ses *ses) { - int rc = 0, xid; struct cifs_tcon *tcon = ses->tcon_ipc; if (tcon == NULL) return 0; - if (ses->server->ops->tree_disconnect) { - xid = get_xid(); - rc = ses->server->ops->tree_disconnect(xid, tcon); - free_xid(xid); - } - - if (rc) - cifs_dbg(FYI, "failed to disconnect IPC tcon (rc=%d)\n", rc); - tconInfoFree(tcon); ses->tcon_ipc = NULL; - return rc; + return 0; } static struct cifs_ses * @@ -1605,7 +1583,6 @@ void cifs_put_smb_ses(struct cifs_ses *ses) { unsigned int rc, xid; struct TCP_Server_Info *server = ses->server; - cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count); spin_lock(&cifs_tcp_ses_lock); @@ -1613,13 +1590,20 @@ void cifs_put_smb_ses(struct cifs_ses *ses) spin_unlock(&cifs_tcp_ses_lock); return; } + + cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count); + cifs_dbg(FYI, "%s: ses ipc: %s\n", __func__, ses->tcon_ipc ? ses->tcon_ipc->treeName : "NONE"); + if (--ses->ses_count > 0) { spin_unlock(&cifs_tcp_ses_lock); return; } + spin_unlock(&cifs_tcp_ses_lock); + + spin_lock(&GlobalMid_Lock); if (ses->status == CifsGood) ses->status = CifsExiting; - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&GlobalMid_Lock); cifs_free_ipc(ses); @@ -1951,10 +1935,7 @@ cifs_find_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) spin_lock(&cifs_tcp_ses_lock); list_for_each(tmp, &ses->tcon_list) { tcon = list_entry(tmp, struct cifs_tcon, tcon_list); -#ifdef CONFIG_CIFS_DFS_UPCALL - if (tcon->dfs_path) - continue; -#endif + if (!match_tcon(tcon, ctx)) continue; ++tcon->tc_count; @@ -3017,9 +2998,8 @@ expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses, return rc; } -static inline int get_next_dfs_tgt(const char *path, - struct dfs_cache_tgt_list *tgt_list, - struct dfs_cache_tgt_iterator **tgt_it) +static int get_next_dfs_tgt(struct dfs_cache_tgt_list *tgt_list, + struct dfs_cache_tgt_iterator **tgt_it) { if (!*tgt_it) *tgt_it = dfs_cache_get_tgt_iterator(tgt_list); @@ -3059,6 
+3039,7 @@ static int do_dfs_failover(const char *path, const char *full_path, struct cifs_ struct cifs_ses **ses, struct cifs_tcon **tcon) { int rc; + char *npath = NULL; struct dfs_cache_tgt_list tgt_list = {0}; struct dfs_cache_tgt_iterator *tgt_it = NULL; struct smb3_fs_context tmp_ctx = {NULL}; @@ -3066,11 +3047,15 @@ static int do_dfs_failover(const char *path, const char *full_path, struct cifs_ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) return -EOPNOTSUPP; - cifs_dbg(FYI, "%s: path=%s full_path=%s\n", __func__, path, full_path); + npath = dfs_cache_canonical_path(path, cifs_sb->local_nls, cifs_remap(cifs_sb)); + if (IS_ERR(npath)) + return PTR_ERR(npath); + + cifs_dbg(FYI, "%s: path=%s full_path=%s\n", __func__, npath, full_path); - rc = dfs_cache_noreq_find(path, NULL, &tgt_list); + rc = dfs_cache_noreq_find(npath, NULL, &tgt_list); if (rc) - return rc; + goto out; /* * We use a 'tmp_ctx' here because we need pass it down to the mount_{get,put} functions to * test connection against new DFS targets. 
@@ -3084,11 +3069,11 @@ static int do_dfs_failover(const char *path, const char *full_path, struct cifs_ char *fake_devname = NULL, *mdata = NULL; /* Get next DFS target server - if any */ - rc = get_next_dfs_tgt(path, &tgt_list, &tgt_it); + rc = get_next_dfs_tgt(&tgt_list, &tgt_it); if (rc) break; - rc = dfs_cache_get_tgt_referral(path, tgt_it, &ref); + rc = dfs_cache_get_tgt_referral(npath, tgt_it, &ref); if (rc) break; @@ -3137,6 +3122,7 @@ static int do_dfs_failover(const char *path, const char *full_path, struct cifs_ } out: + kfree(npath); smb3_cleanup_fs_context_contents(&tmp_ctx); dfs_cache_free_tgts(&tgt_list); return rc; @@ -3288,25 +3274,18 @@ static int is_path_remote(struct cifs_sb_info *cifs_sb, struct smb3_fs_context * } #ifdef CONFIG_CIFS_DFS_UPCALL -static void set_root_ses(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, +static void set_root_ses(struct cifs_sb_info *cifs_sb, const uuid_t *mount_id, struct cifs_ses *ses, struct cifs_ses **root_ses) { if (ses) { spin_lock(&cifs_tcp_ses_lock); ses->ses_count++; - if (ses->tcon_ipc) - ses->tcon_ipc->remap = cifs_remap(cifs_sb); spin_unlock(&cifs_tcp_ses_lock); + dfs_cache_add_refsrv_session(mount_id, ses); } *root_ses = ses; } -static void put_root_ses(struct cifs_ses *ses) -{ - if (ses) - cifs_put_smb_ses(ses); -} - /* Set up next dfs prefix path in @dfs_path */ static int next_dfs_prepath(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx, const unsigned int xid, struct TCP_Server_Info *server, @@ -3352,17 +3331,25 @@ out: } /* Check if resolved targets can handle any DFS referrals */ -static int is_referral_server(const char *ref_path, struct cifs_tcon *tcon, bool *ref_server) +static int is_referral_server(const char *ref_path, struct cifs_sb_info *cifs_sb, + struct cifs_tcon *tcon, bool *ref_server) { int rc; struct dfs_info3_param ref = {0}; + cifs_dbg(FYI, "%s: ref_path=%s\n", __func__, ref_path); + if (is_tcon_dfs(tcon)) { *ref_server = true; } else { - cifs_dbg(FYI, "%s: 
ref_path=%s\n", __func__, ref_path); + char *npath; - rc = dfs_cache_noreq_find(ref_path, &ref, NULL); + npath = dfs_cache_canonical_path(ref_path, cifs_sb->local_nls, cifs_remap(cifs_sb)); + if (IS_ERR(npath)) + return PTR_ERR(npath); + + rc = dfs_cache_noreq_find(npath, &ref, NULL); + kfree(npath); if (rc) { cifs_dbg(VFS, "%s: dfs_cache_noreq_find: failed (rc=%d)\n", __func__, rc); return rc; @@ -3386,9 +3373,9 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) struct cifs_ses *ses = NULL, *root_ses = NULL; struct cifs_tcon *tcon = NULL; int count = 0; + uuid_t mount_id = {0}; char *ref_path = NULL, *full_path = NULL; char *oldmnt = NULL; - char *mntdata = NULL; bool ref_server = false; rc = mount_get_conns(ctx, cifs_sb, &xid, &server, &ses, &tcon); @@ -3411,12 +3398,9 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) if (rc != -EREMOTE) goto error; } - /* Save mount options */ - mntdata = kstrdup(cifs_sb->ctx->mount_options, GFP_KERNEL); - if (!mntdata) { - rc = -ENOMEM; - goto error; - } + + ctx->nosharesock = true; + /* Get path of DFS root */ ref_path = build_unc_path_to_root(ctx, cifs_sb, false); if (IS_ERR(ref_path)) { @@ -3425,7 +3409,8 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) goto error; } - set_root_ses(cifs_sb, ses, &root_ses); + uuid_gen(&mount_id); + set_root_ses(cifs_sb, &mount_id, ses, &root_ses); do { /* Save full path of last DFS path we used to resolve final target server */ kfree(full_path); @@ -3456,13 +3441,11 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) continue; /* Make sure that requests go through new root servers */ - rc = is_referral_server(ref_path + 1, tcon, &ref_server); + rc = is_referral_server(ref_path + 1, cifs_sb, tcon, &ref_server); if (rc) break; - if (ref_server) { - put_root_ses(root_ses); - set_root_ses(cifs_sb, ses, &root_ses); - } + if (ref_server) + set_root_ses(cifs_sb, &mount_id, ses, &root_ses); /* 
Get next dfs path and then continue chasing them if -EREMOTE */ rc = next_dfs_prepath(cifs_sb, ctx, xid, server, tcon, &ref_path); @@ -3471,12 +3454,10 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) rc = -ELOOP; } while (rc == -EREMOTE); - if (rc) + if (rc || !tcon) goto error; - put_root_ses(root_ses); - root_ses = NULL; + kfree(ref_path); - ref_path = NULL; /* * Store DFS full path in both superblock and tree connect structures. * @@ -3485,21 +3466,27 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) * links, the prefix path is included in both and may be changed during reconnect. See * cifs_tree_connect(). */ - cifs_sb->origin_fullpath = kstrdup(full_path, GFP_KERNEL); - if (!cifs_sb->origin_fullpath) { + ref_path = dfs_cache_canonical_path(full_path, cifs_sb->local_nls, cifs_remap(cifs_sb)); + kfree(full_path); + full_path = NULL; + + if (IS_ERR(ref_path)) { + rc = PTR_ERR(ref_path); + ref_path = NULL; + goto error; + } + cifs_sb->origin_fullpath = ref_path; + + ref_path = kstrdup(cifs_sb->origin_fullpath, GFP_KERNEL); + if (!ref_path) { rc = -ENOMEM; goto error; } spin_lock(&cifs_tcp_ses_lock); - tcon->dfs_path = full_path; - full_path = NULL; - tcon->remap = cifs_remap(cifs_sb); + tcon->dfs_path = ref_path; + ref_path = NULL; spin_unlock(&cifs_tcp_ses_lock); - /* Add original context for DFS cache to be used when refreshing referrals */ - rc = dfs_cache_add_vol(mntdata, ctx, cifs_sb->origin_fullpath); - if (rc) - goto error; /* * After reconnecting to a different server, unique ids won't * match anymore, so we disable serverino. 
This prevents @@ -3514,6 +3501,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) kfree(cifs_sb->prepath); cifs_sb->prepath = ctx->prepath; ctx->prepath = NULL; + uuid_copy(&cifs_sb->dfs_mount_id, &mount_id); out: free_xid(xid); @@ -3523,9 +3511,8 @@ out: error: kfree(ref_path); kfree(full_path); - kfree(mntdata); kfree(cifs_sb->origin_fullpath); - put_root_ses(root_ses); + dfs_cache_put_refsrv_sessions(&mount_id); mount_put_conns(cifs_sb, xid, server, ses, tcon); return rc; } @@ -3755,7 +3742,7 @@ cifs_umount(struct cifs_sb_info *cifs_sb) kfree(cifs_sb->prepath); #ifdef CONFIG_CIFS_DFS_UPCALL - dfs_cache_del_vol(cifs_sb->origin_fullpath); + dfs_cache_put_refsrv_sessions(&cifs_sb->dfs_mount_id); kfree(cifs_sb->origin_fullpath); #endif call_rcu(&cifs_sb->rcu, delayed_free); diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index b1fa30fefe1f..7c1769714609 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -11,6 +11,7 @@ #include <linux/proc_fs.h> #include <linux/nls.h> #include <linux/workqueue.h> +#include <linux/uuid.h> #include "cifsglob.h" #include "smb2pdu.h" #include "smb2proto.h" @@ -18,15 +19,14 @@ #include "cifs_debug.h" #include "cifs_unicode.h" #include "smb2glob.h" -#include "fs_context.h" #include "dfs_cache.h" #define CACHE_HTABLE_SIZE 32 #define CACHE_MAX_ENTRIES 64 +#define CACHE_MIN_TTL 120 /* 2 minutes */ -#define IS_INTERLINK_SET(v) ((v) & (DFSREF_REFERRAL_SERVER | \ - DFSREF_STORAGE_SERVER)) +#define IS_DFS_INTERLINK(v) (((v) & DFSREF_REFERRAL_SERVER) && !((v) & DFSREF_STORAGE_SERVER)) struct cache_dfs_tgt { char *name; @@ -48,14 +48,15 @@ struct cache_entry { struct cache_dfs_tgt *tgthint; }; -struct vol_info { - char *fullpath; - spinlock_t ctx_lock; - struct smb3_fs_context ctx; - char *mntdata; +/* List of referral server sessions per dfs mount */ +struct mount_group { struct list_head list; - struct list_head rlist; - struct kref refcnt; + uuid_t id; + struct cifs_ses *sessions[CACHE_MAX_ENTRIES]; 
+ int num_sessions; + spinlock_t lock; + struct list_head refresh_list; + struct kref refcount; }; static struct kmem_cache *cache_slab __read_mostly; @@ -64,7 +65,7 @@ static struct workqueue_struct *dfscache_wq __read_mostly; static int cache_ttl; static DEFINE_SPINLOCK(cache_ttl_lock); -static struct nls_table *cache_nlsc; +static struct nls_table *cache_cp; /* * Number of entries in the cache @@ -74,34 +75,145 @@ static atomic_t cache_count; static struct hlist_head cache_htable[CACHE_HTABLE_SIZE]; static DECLARE_RWSEM(htable_rw_lock); -static LIST_HEAD(vol_list); -static DEFINE_SPINLOCK(vol_list_lock); +static LIST_HEAD(mount_group_list); +static DEFINE_MUTEX(mount_group_list_lock); static void refresh_cache_worker(struct work_struct *work); static DECLARE_DELAYED_WORK(refresh_task, refresh_cache_worker); -static int get_normalized_path(const char *path, const char **npath) +static void get_ipc_unc(const char *ref_path, char *ipc, size_t ipclen) { - if (!path || strlen(path) < 3 || (*path != '\\' && *path != '/')) - return -EINVAL; + const char *host; + size_t len; - if (*path == '\\') { - *npath = path; - } else { - char *s = kstrdup(path, GFP_KERNEL); - if (!s) - return -ENOMEM; - convert_delimiter(s, '\\'); - *npath = s; + extract_unc_hostname(ref_path, &host, &len); + scnprintf(ipc, ipclen, "\\\\%.*s\\IPC$", (int)len, host); +} + +static struct cifs_ses *find_ipc_from_server_path(struct cifs_ses **ses, const char *path) +{ + char unc[SERVER_NAME_LENGTH + sizeof("//x/IPC$")] = {0}; + + get_ipc_unc(path, unc, sizeof(unc)); + for (; *ses; ses++) { + if (!strcasecmp(unc, (*ses)->tcon_ipc->treeName)) + return *ses; } - return 0; + return ERR_PTR(-ENOENT); +} + +static void __mount_group_release(struct mount_group *mg) +{ + int i; + + for (i = 0; i < mg->num_sessions; i++) + cifs_put_smb_ses(mg->sessions[i]); + kfree(mg); +} + +static void mount_group_release(struct kref *kref) +{ + struct mount_group *mg = container_of(kref, struct mount_group, refcount); + + 
mutex_lock(&mount_group_list_lock); + list_del(&mg->list); + mutex_unlock(&mount_group_list_lock); + __mount_group_release(mg); +} + +static struct mount_group *find_mount_group_locked(const uuid_t *id) +{ + struct mount_group *mg; + + list_for_each_entry(mg, &mount_group_list, list) { + if (uuid_equal(&mg->id, id)) + return mg; + } + return ERR_PTR(-ENOENT); +} + +static struct mount_group *__get_mount_group_locked(const uuid_t *id) +{ + struct mount_group *mg; + + mg = find_mount_group_locked(id); + if (!IS_ERR(mg)) + return mg; + + mg = kmalloc(sizeof(*mg), GFP_KERNEL); + if (!mg) + return ERR_PTR(-ENOMEM); + kref_init(&mg->refcount); + uuid_copy(&mg->id, id); + mg->num_sessions = 0; + spin_lock_init(&mg->lock); + list_add(&mg->list, &mount_group_list); + return mg; +} + +static struct mount_group *get_mount_group(const uuid_t *id) +{ + struct mount_group *mg; + + mutex_lock(&mount_group_list_lock); + mg = __get_mount_group_locked(id); + if (!IS_ERR(mg)) + kref_get(&mg->refcount); + mutex_unlock(&mount_group_list_lock); + + return mg; } -static inline void free_normalized_path(const char *path, const char *npath) +static void free_mount_group_list(void) { - if (path != npath) - kfree(npath); + struct mount_group *mg, *tmp_mg; + + list_for_each_entry_safe(mg, tmp_mg, &mount_group_list, list) { + list_del_init(&mg->list); + __mount_group_release(mg); + } +} + +/** + * dfs_cache_canonical_path - get a canonical DFS path + * + * @path: DFS path + * @cp: codepage + * @remap: mapping type + * + * Return canonical path if success, otherwise error. 
+ */ +char *dfs_cache_canonical_path(const char *path, const struct nls_table *cp, int remap) +{ + char *tmp; + int plen = 0; + char *npath; + + if (!path || strlen(path) < 3 || (*path != '\\' && *path != '/')) + return ERR_PTR(-EINVAL); + + if (unlikely(strcmp(cp->charset, cache_cp->charset))) { + tmp = (char *)cifs_strndup_to_utf16(path, strlen(path), &plen, cp, remap); + if (!tmp) { + cifs_dbg(VFS, "%s: failed to convert path to utf16\n", __func__); + return ERR_PTR(-EINVAL); + } + + npath = cifs_strndup_from_utf16(tmp, plen, true, cache_cp); + kfree(tmp); + + if (!npath) { + cifs_dbg(VFS, "%s: failed to convert path from utf16\n", __func__); + return ERR_PTR(-EINVAL); + } + } else { + npath = kstrdup(path, GFP_KERNEL); + if (!npath) + return ERR_PTR(-ENOMEM); + } + convert_delimiter(npath, '\\'); + return npath; } static inline bool cache_entry_expired(const struct cache_entry *ce) @@ -171,7 +283,7 @@ static int dfscache_proc_show(struct seq_file *m, void *v) "cache entry: path=%s,type=%s,ttl=%d,etime=%ld,hdr_flags=0x%x,ref_flags=0x%x,interlink=%s,path_consumed=%d,expired=%s\n", ce->path, ce->srvtype == DFS_TYPE_ROOT ? "root" : "link", ce->ttl, ce->etime.tv_nsec, ce->ref_flags, ce->hdr_flags, - IS_INTERLINK_SET(ce->hdr_flags) ? "yes" : "no", + IS_DFS_INTERLINK(ce->hdr_flags) ? "yes" : "no", ce->path_consumed, cache_entry_expired(ce) ? "yes" : "no"); list_for_each_entry(t, &ce->tlist, list) { @@ -240,7 +352,7 @@ static inline void dump_ce(const struct cache_entry *ce) ce->srvtype == DFS_TYPE_ROOT ? "root" : "link", ce->ttl, ce->etime.tv_nsec, ce->hdr_flags, ce->ref_flags, - IS_INTERLINK_SET(ce->hdr_flags) ? "yes" : "no", + IS_DFS_INTERLINK(ce->hdr_flags) ? "yes" : "no", ce->path_consumed, cache_entry_expired(ce) ? 
"yes" : "no"); dump_tgts(ce); @@ -284,8 +396,7 @@ int dfs_cache_init(void) int rc; int i; - dfscache_wq = alloc_workqueue("cifs-dfscache", - WQ_FREEZABLE | WQ_MEM_RECLAIM, 1); + dfscache_wq = alloc_workqueue("cifs-dfscache", WQ_FREEZABLE | WQ_UNBOUND, 1); if (!dfscache_wq) return -ENOMEM; @@ -301,7 +412,9 @@ int dfs_cache_init(void) INIT_HLIST_HEAD(&cache_htable[i]); atomic_set(&cache_count, 0); - cache_nlsc = load_nls_default(); + cache_cp = load_nls("utf8"); + if (!cache_cp) + cache_cp = load_nls_default(); cifs_dbg(FYI, "%s: initialized DFS referral cache\n", __func__); return 0; @@ -311,23 +424,24 @@ out_destroy_wq: return rc; } -static inline unsigned int cache_entry_hash(const void *data, int size) +static int cache_entry_hash(const void *data, int size, unsigned int *hash) { - unsigned int h; - - h = jhash(data, size, 0); - return h & (CACHE_HTABLE_SIZE - 1); -} - -/* Check whether second path component of @path is SYSVOL or NETLOGON */ -static inline bool is_sysvol_or_netlogon(const char *path) -{ - const char *s; - char sep = path[0]; - - s = strchr(path + 1, sep) + 1; - return !strncasecmp(s, "sysvol", strlen("sysvol")) || - !strncasecmp(s, "netlogon", strlen("netlogon")); + int i, clen; + const unsigned char *s = data; + wchar_t c; + unsigned int h = 0; + + for (i = 0; i < size; i += clen) { + clen = cache_cp->char2uni(&s[i], size - i, &c); + if (unlikely(clen < 0)) { + cifs_dbg(VFS, "%s: can't convert char\n", __func__); + return clen; + } + c = cifs_toupper(c); + h = jhash(&c, sizeof(c), h); + } + *hash = h % CACHE_HTABLE_SIZE; + return 0; } /* Return target hint of a DFS cache entry */ @@ -378,7 +492,7 @@ static int copy_ref_data(const struct dfs_info3_param *refs, int numrefs, { int i; - ce->ttl = refs[0].ttl; + ce->ttl = max_t(int, refs[0].ttl, CACHE_MIN_TTL); ce->etime = get_expire_time(ce->ttl); ce->srvtype = refs[0].server_type; ce->hdr_flags = refs[0].flags; @@ -409,9 +523,7 @@ static int copy_ref_data(const struct dfs_info3_param *refs, int 
numrefs, } /* Allocate a new cache entry */ -static struct cache_entry *alloc_cache_entry(const char *path, - const struct dfs_info3_param *refs, - int numrefs) +static struct cache_entry *alloc_cache_entry(struct dfs_info3_param *refs, int numrefs) { struct cache_entry *ce; int rc; @@ -420,11 +532,9 @@ static struct cache_entry *alloc_cache_entry(const char *path, if (!ce) return ERR_PTR(-ENOMEM); - ce->path = kstrdup(path, GFP_KERNEL); - if (!ce->path) { - kmem_cache_free(cache_slab, ce); - return ERR_PTR(-ENOMEM); - } + ce->path = refs[0].path_name; + refs[0].path_name = NULL; + INIT_HLIST_NODE(&ce->hlist); INIT_LIST_HEAD(&ce->tlist); @@ -437,13 +547,14 @@ static struct cache_entry *alloc_cache_entry(const char *path, return ce; } -/* Must be called with htable_rw_lock held */ -static void remove_oldest_entry(void) +static void remove_oldest_entry_locked(void) { int i; struct cache_entry *ce; struct cache_entry *to_del = NULL; + WARN_ON(!rwsem_is_locked(&htable_rw_lock)); + for (i = 0; i < CACHE_HTABLE_SIZE; i++) { struct hlist_head *l = &cache_htable[i]; @@ -467,12 +578,24 @@ static void remove_oldest_entry(void) } /* Add a new DFS cache entry */ -static int add_cache_entry(const char *path, unsigned int hash, - struct dfs_info3_param *refs, int numrefs) +static int add_cache_entry_locked(struct dfs_info3_param *refs, int numrefs) { + int rc; struct cache_entry *ce; + unsigned int hash; + + WARN_ON(!rwsem_is_locked(&htable_rw_lock)); + + if (atomic_read(&cache_count) >= CACHE_MAX_ENTRIES) { + cifs_dbg(FYI, "%s: reached max cache size (%d)\n", __func__, CACHE_MAX_ENTRIES); + remove_oldest_entry_locked(); + } - ce = alloc_cache_entry(path, refs, numrefs); + rc = cache_entry_hash(refs[0].path_name, strlen(refs[0].path_name), &hash); + if (rc) + return rc; + + ce = alloc_cache_entry(refs, numrefs); if (IS_ERR(ce)) return PTR_ERR(ce); @@ -486,65 +609,77 @@ static int add_cache_entry(const char *path, unsigned int hash, } spin_unlock(&cache_ttl_lock); - 
down_write(&htable_rw_lock); hlist_add_head(&ce->hlist, &cache_htable[hash]); dump_ce(ce); - up_write(&htable_rw_lock); + + atomic_inc(&cache_count); return 0; } -static struct cache_entry *__lookup_cache_entry(const char *path) +/* Check if two DFS paths are equal. @s1 and @s2 are expected to be in @cache_cp's charset */ +static bool dfs_path_equal(const char *s1, int len1, const char *s2, int len2) { - struct cache_entry *ce; - unsigned int h; - bool found = false; + int i, l1, l2; + wchar_t c1, c2; - h = cache_entry_hash(path, strlen(path)); + if (len1 != len2) + return false; - hlist_for_each_entry(ce, &cache_htable[h], hlist) { - if (!strcasecmp(path, ce->path)) { - found = true; - dump_ce(ce); - break; + for (i = 0; i < len1; i += l1) { + l1 = cache_cp->char2uni(&s1[i], len1 - i, &c1); + l2 = cache_cp->char2uni(&s2[i], len2 - i, &c2); + if (unlikely(l1 < 0 && l2 < 0)) { + if (s1[i] != s2[i]) + return false; + l1 = 1; + continue; } + if (l1 != l2) + return false; + if (cifs_toupper(c1) != cifs_toupper(c2)) + return false; } + return true; +} - if (!found) - ce = ERR_PTR(-ENOENT); - return ce; +static struct cache_entry *__lookup_cache_entry(const char *path, unsigned int hash, int len) +{ + struct cache_entry *ce; + + hlist_for_each_entry(ce, &cache_htable[hash], hlist) { + if (dfs_path_equal(ce->path, strlen(ce->path), path, len)) { + dump_ce(ce); + return ce; + } + } + return ERR_PTR(-EEXIST); } /* - * Find a DFS cache entry in hash table and optionally check prefix path against - * @path. - * Use whole path components in the match. - * Must be called with htable_rw_lock held. + * Find a DFS cache entry in hash table and optionally check prefix path against normalized @path. + * + * Use whole path components in the match. Must be called with htable_rw_lock held. * - * Return ERR_PTR(-ENOENT) if the entry is not found. + * Return ERR_PTR(-EEXIST) if the entry is not found. 
*/ -static struct cache_entry *lookup_cache_entry(const char *path, unsigned int *hash) +static struct cache_entry *lookup_cache_entry(const char *path) { - struct cache_entry *ce = ERR_PTR(-ENOENT); - unsigned int h; + struct cache_entry *ce; int cnt = 0; - char *npath; - char *s, *e; - char sep; - - npath = kstrdup(path, GFP_KERNEL); - if (!npath) - return ERR_PTR(-ENOMEM); + const char *s = path, *e; + char sep = *s; + unsigned int hash; + int rc; - s = npath; - sep = *npath; while ((s = strchr(s, sep)) && ++cnt < 3) s++; if (cnt < 3) { - h = cache_entry_hash(path, strlen(path)); - ce = __lookup_cache_entry(path); - goto out; + rc = cache_entry_hash(path, strlen(path), &hash); + if (rc) + return ERR_PTR(rc); + return __lookup_cache_entry(path, hash, strlen(path)); } /* * Handle paths that have more than two path components and are a complete prefix of the DFS @@ -552,64 +687,29 @@ static struct cache_entry *lookup_cache_entry(const char *path, unsigned int *ha * * See MS-DFSC 3.2.5.5 "Receiving a Root Referral Request or Link Referral Request". 
*/ - h = cache_entry_hash(npath, strlen(npath)); - e = npath + strlen(npath) - 1; + e = path + strlen(path) - 1; while (e > s) { - char tmp; + int len; /* skip separators */ while (e > s && *e == sep) e--; if (e == s) - goto out; - - tmp = *(e+1); - *(e+1) = 0; - - ce = __lookup_cache_entry(npath); - if (!IS_ERR(ce)) { - h = cache_entry_hash(npath, strlen(npath)); break; - } - *(e+1) = tmp; + len = e + 1 - path; + rc = cache_entry_hash(path, len, &hash); + if (rc) + return ERR_PTR(rc); + ce = __lookup_cache_entry(path, hash, len); + if (!IS_ERR(ce)) + return ce; + /* backward until separator */ while (e > s && *e != sep) e--; } -out: - if (hash) - *hash = h; - kfree(npath); - return ce; -} - -static void __vol_release(struct vol_info *vi) -{ - kfree(vi->fullpath); - kfree(vi->mntdata); - smb3_cleanup_fs_context_contents(&vi->ctx); - kfree(vi); -} - -static void vol_release(struct kref *kref) -{ - struct vol_info *vi = container_of(kref, struct vol_info, refcnt); - - spin_lock(&vol_list_lock); - list_del(&vi->list); - spin_unlock(&vol_list_lock); - __vol_release(vi); -} - -static inline void free_vol_list(void) -{ - struct vol_info *vi, *nvi; - - list_for_each_entry_safe(vi, nvi, &vol_list, list) { - list_del_init(&vi->list); - __vol_release(vi); - } + return ERR_PTR(-EEXIST); } /** @@ -618,8 +718,8 @@ static inline void free_vol_list(void) void dfs_cache_destroy(void) { cancel_delayed_work_sync(&refresh_task); - unload_nls(cache_nlsc); - free_vol_list(); + unload_nls(cache_cp); + free_mount_group_list(); flush_cache_ents(); kmem_cache_destroy(cache_slab); destroy_workqueue(dfscache_wq); @@ -627,18 +727,14 @@ void dfs_cache_destroy(void) cifs_dbg(FYI, "%s: destroyed DFS referral cache\n", __func__); } -/* Must be called with htable_rw_lock held */ -static int __update_cache_entry(const char *path, - const struct dfs_info3_param *refs, - int numrefs) +/* Update a cache entry with the new referral in @refs */ +static int update_cache_entry_locked(struct cache_entry 
*ce, const struct dfs_info3_param *refs, + int numrefs) { int rc; - struct cache_entry *ce; char *s, *th = NULL; - ce = lookup_cache_entry(path, NULL); - if (IS_ERR(ce)) - return PTR_ERR(ce); + WARN_ON(!rwsem_is_locked(&htable_rw_lock)); if (ce->tgthint) { s = ce->tgthint->name; @@ -657,37 +753,30 @@ static int __update_cache_entry(const char *path, return rc; } -static int get_dfs_referral(const unsigned int xid, struct cifs_ses *ses, - const struct nls_table *nls_codepage, int remap, - const char *path, struct dfs_info3_param **refs, - int *numrefs) +static int get_dfs_referral(const unsigned int xid, struct cifs_ses *ses, const char *path, + struct dfs_info3_param **refs, int *numrefs) { - cifs_dbg(FYI, "%s: get an DFS referral for %s\n", __func__, path); + int rc; + int i; - if (!ses || !ses->server || !ses->server->ops->get_dfs_refer) - return -EOPNOTSUPP; - if (unlikely(!nls_codepage)) - return -EINVAL; + cifs_dbg(FYI, "%s: get an DFS referral for %s\n", __func__, path); *refs = NULL; *numrefs = 0; - return ses->server->ops->get_dfs_refer(xid, ses, path, refs, numrefs, - nls_codepage, remap); -} - -/* Update an expired cache entry by getting a new DFS referral from server */ -static int update_cache_entry(const char *path, - const struct dfs_info3_param *refs, - int numrefs) -{ - - int rc; + if (!ses || !ses->server || !ses->server->ops->get_dfs_refer) + return -EOPNOTSUPP; + if (unlikely(!cache_cp)) + return -EINVAL; - down_write(&htable_rw_lock); - rc = __update_cache_entry(path, refs, numrefs); - up_write(&htable_rw_lock); + rc = ses->server->ops->get_dfs_refer(xid, ses, path, refs, numrefs, cache_cp, + NO_MAP_UNI_RSVD); + if (!rc) { + struct dfs_info3_param *ref = *refs; + for (i = 0; i < *numrefs; i++) + convert_delimiter(ref[i].path_name, '\\'); + } return rc; } @@ -697,15 +786,12 @@ static int update_cache_entry(const char *path, * If the entry wasn't found, it will create a new one. 
Or if it was found but * expired, then it will update the entry accordingly. * - * For interlinks, __cifs_dfs_mount() and expand_dfs_referral() are supposed to + * For interlinks, cifs_mount() and expand_dfs_referral() are supposed to * handle them properly. */ -static int __dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, - const struct nls_table *nls_codepage, int remap, - const char *path, bool noreq) +static int cache_refresh_path(const unsigned int xid, struct cifs_ses *ses, const char *path) { int rc; - unsigned int hash; struct cache_entry *ce; struct dfs_info3_param *refs = NULL; int numrefs = 0; @@ -713,62 +799,38 @@ static int __dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, cifs_dbg(FYI, "%s: search path: %s\n", __func__, path); - down_read(&htable_rw_lock); - - ce = lookup_cache_entry(path, &hash); - - /* - * If @noreq is set, no requests will be sent to the server. Just return - * the cache entry. - */ - if (noreq) { - up_read(&htable_rw_lock); - return PTR_ERR_OR_ZERO(ce); - } + down_write(&htable_rw_lock); + ce = lookup_cache_entry(path); if (!IS_ERR(ce)) { if (!cache_entry_expired(ce)) { dump_ce(ce); - up_read(&htable_rw_lock); + up_write(&htable_rw_lock); return 0; } } else { newent = true; } - up_read(&htable_rw_lock); - /* - * No entry was found. - * - * Request a new DFS referral in order to create a new cache entry, or - * updating an existing one. + * Either the entry was not found, or it is expired. + * Request a new DFS referral in order to create or update a cache entry. 
*/ - rc = get_dfs_referral(xid, ses, nls_codepage, remap, path, - &refs, &numrefs); + rc = get_dfs_referral(xid, ses, path, &refs, &numrefs); if (rc) - return rc; + goto out_unlock; dump_refs(refs, numrefs); if (!newent) { - rc = update_cache_entry(path, refs, numrefs); - goto out_free_refs; - } - - if (atomic_read(&cache_count) >= CACHE_MAX_ENTRIES) { - cifs_dbg(FYI, "%s: reached max cache size (%d)\n", - __func__, CACHE_MAX_ENTRIES); - down_write(&htable_rw_lock); - remove_oldest_entry(); - up_write(&htable_rw_lock); + rc = update_cache_entry_locked(ce, refs, numrefs); + goto out_unlock; } - rc = add_cache_entry(path, hash, refs, numrefs); - if (!rc) - atomic_inc(&cache_count); + rc = add_cache_entry_locked(refs, numrefs); -out_free_refs: +out_unlock: + up_write(&htable_rw_lock); free_dfs_info_array(refs, numrefs); return rc; } @@ -868,7 +930,7 @@ err_free_it: * needs to be issued: * @xid: syscall xid * @ses: smb session to issue the request on - * @nls_codepage: charset conversion + * @cp: codepage * @remap: path character remapping type * @path: path to lookup in DFS referral cache. * @@ -877,26 +939,25 @@ err_free_it: * * Return zero if the target was found, otherwise non-zero. 
*/ -int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, - const struct nls_table *nls_codepage, int remap, - const char *path, struct dfs_info3_param *ref, +int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, const struct nls_table *cp, + int remap, const char *path, struct dfs_info3_param *ref, struct dfs_cache_tgt_list *tgt_list) { int rc; const char *npath; struct cache_entry *ce; - rc = get_normalized_path(path, &npath); - if (rc) - return rc; + npath = dfs_cache_canonical_path(path, cp, remap); + if (IS_ERR(npath)) + return PTR_ERR(npath); - rc = __dfs_cache_find(xid, ses, nls_codepage, remap, npath, false); + rc = cache_refresh_path(xid, ses, npath); if (rc) goto out_free_path; down_read(&htable_rw_lock); - ce = lookup_cache_entry(npath, NULL); + ce = lookup_cache_entry(npath); if (IS_ERR(ce)) { up_read(&htable_rw_lock); rc = PTR_ERR(ce); @@ -913,7 +974,7 @@ int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, up_read(&htable_rw_lock); out_free_path: - free_normalized_path(path, npath); + kfree(npath); return rc; } @@ -925,7 +986,7 @@ out_free_path: * expired, nor create a new cache entry if @path hasn't been found. It heavily * relies on an existing cache entry. * - * @path: path to lookup in the DFS referral cache. + * @path: canonical DFS path to lookup in the DFS referral cache. * @ref: when non-NULL, store single DFS referral result in it. * @tgt_list: when non-NULL, store complete DFS target list in it. 
* @@ -937,18 +998,13 @@ int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref, struct dfs_cache_tgt_list *tgt_list) { int rc; - const char *npath; struct cache_entry *ce; - rc = get_normalized_path(path, &npath); - if (rc) - return rc; - - cifs_dbg(FYI, "%s: path: %s\n", __func__, npath); + cifs_dbg(FYI, "%s: path: %s\n", __func__, path); down_read(&htable_rw_lock); - ce = lookup_cache_entry(npath, NULL); + ce = lookup_cache_entry(path); if (IS_ERR(ce)) { rc = PTR_ERR(ce); goto out_unlock; @@ -963,8 +1019,6 @@ int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref, out_unlock: up_read(&htable_rw_lock); - free_normalized_path(path, npath); - return rc; } @@ -979,16 +1033,15 @@ out_unlock: * * @xid: syscall id * @ses: smb session - * @nls_codepage: charset conversion + * @cp: codepage * @remap: type of character remapping for paths - * @path: path to lookup in DFS referral cache. + * @path: path to lookup in DFS referral cache * @it: DFS target iterator * * Return zero if the target hint was updated successfully, otherwise non-zero. 
*/ int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses, - const struct nls_table *nls_codepage, int remap, - const char *path, + const struct nls_table *cp, int remap, const char *path, const struct dfs_cache_tgt_iterator *it) { int rc; @@ -996,19 +1049,19 @@ int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses, struct cache_entry *ce; struct cache_dfs_tgt *t; - rc = get_normalized_path(path, &npath); - if (rc) - return rc; + npath = dfs_cache_canonical_path(path, cp, remap); + if (IS_ERR(npath)) + return PTR_ERR(npath); cifs_dbg(FYI, "%s: update target hint - path: %s\n", __func__, npath); - rc = __dfs_cache_find(xid, ses, nls_codepage, remap, npath, false); + rc = cache_refresh_path(xid, ses, npath); if (rc) goto out_free_path; down_write(&htable_rw_lock); - ce = lookup_cache_entry(npath, NULL); + ce = lookup_cache_entry(npath); if (IS_ERR(ce)) { rc = PTR_ERR(ce); goto out_unlock; @@ -1031,8 +1084,7 @@ int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses, out_unlock: up_write(&htable_rw_lock); out_free_path: - free_normalized_path(path, npath); - + kfree(npath); return rc; } @@ -1044,32 +1096,26 @@ out_free_path: * expired, nor create a new cache entry if @path hasn't been found. It heavily * relies on an existing cache entry. * - * @path: path to lookup in DFS referral cache. + * @path: canonical DFS path to lookup in DFS referral cache. * @it: target iterator which contains the target hint to update the cache * entry with. * * Return zero if the target hint was updated successfully, otherwise non-zero. 
*/ -int dfs_cache_noreq_update_tgthint(const char *path, - const struct dfs_cache_tgt_iterator *it) +int dfs_cache_noreq_update_tgthint(const char *path, const struct dfs_cache_tgt_iterator *it) { int rc; - const char *npath; struct cache_entry *ce; struct cache_dfs_tgt *t; if (!it) return -EINVAL; - rc = get_normalized_path(path, &npath); - if (rc) - return rc; - - cifs_dbg(FYI, "%s: path: %s\n", __func__, npath); + cifs_dbg(FYI, "%s: path: %s\n", __func__, path); down_write(&htable_rw_lock); - ce = lookup_cache_entry(npath, NULL); + ce = lookup_cache_entry(path); if (IS_ERR(ce)) { rc = PTR_ERR(ce); goto out_unlock; @@ -1092,8 +1138,6 @@ int dfs_cache_noreq_update_tgthint(const char *path, out_unlock: up_write(&htable_rw_lock); - free_normalized_path(path, npath); - return rc; } @@ -1101,32 +1145,26 @@ out_unlock: * dfs_cache_get_tgt_referral - returns a DFS referral (@ref) from a given * target iterator (@it). * - * @path: path to lookup in DFS referral cache. + * @path: canonical DFS path to lookup in DFS referral cache. * @it: DFS target iterator. * @ref: DFS referral pointer to set up the gathered information. * * Return zero if the DFS referral was set up correctly, otherwise non-zero. 
*/ -int dfs_cache_get_tgt_referral(const char *path, - const struct dfs_cache_tgt_iterator *it, +int dfs_cache_get_tgt_referral(const char *path, const struct dfs_cache_tgt_iterator *it, struct dfs_info3_param *ref) { int rc; - const char *npath; struct cache_entry *ce; if (!it || !ref) return -EINVAL; - rc = get_normalized_path(path, &npath); - if (rc) - return rc; - - cifs_dbg(FYI, "%s: path: %s\n", __func__, npath); + cifs_dbg(FYI, "%s: path: %s\n", __func__, path); down_read(&htable_rw_lock); - ce = lookup_cache_entry(npath, NULL); + ce = lookup_cache_entry(path); if (IS_ERR(ce)) { rc = PTR_ERR(ce); goto out_unlock; @@ -1138,132 +1176,55 @@ int dfs_cache_get_tgt_referral(const char *path, out_unlock: up_read(&htable_rw_lock); - free_normalized_path(path, npath); - return rc; } /** - * dfs_cache_add_vol - add a cifs context during mount() that will be handled by - * DFS cache refresh worker. - * - * @mntdata: mount data. - * @ctx: cifs context. - * @fullpath: origin full path. + * dfs_cache_add_refsrv_session - add SMB session of referral server * - * Return zero if context was set up correctly, otherwise non-zero. + * @mount_id: mount group uuid to lookup. + * @ses: reference counted SMB session of referral server. 
*/ -int dfs_cache_add_vol(char *mntdata, struct smb3_fs_context *ctx, const char *fullpath) +void dfs_cache_add_refsrv_session(const uuid_t *mount_id, struct cifs_ses *ses) { - int rc; - struct vol_info *vi; - - if (!ctx || !fullpath || !mntdata) - return -EINVAL; - - cifs_dbg(FYI, "%s: fullpath: %s\n", __func__, fullpath); - - vi = kzalloc(sizeof(*vi), GFP_KERNEL); - if (!vi) - return -ENOMEM; + struct mount_group *mg; - vi->fullpath = kstrdup(fullpath, GFP_KERNEL); - if (!vi->fullpath) { - rc = -ENOMEM; - goto err_free_vi; - } - - rc = smb3_fs_context_dup(&vi->ctx, ctx); - if (rc) - goto err_free_fullpath; - - vi->mntdata = mntdata; - spin_lock_init(&vi->ctx_lock); - kref_init(&vi->refcnt); - - spin_lock(&vol_list_lock); - list_add_tail(&vi->list, &vol_list); - spin_unlock(&vol_list_lock); - - return 0; - -err_free_fullpath: - kfree(vi->fullpath); -err_free_vi: - kfree(vi); - return rc; -} + if (WARN_ON_ONCE(!mount_id || uuid_is_null(mount_id) || !ses)) + return; -/* Must be called with vol_list_lock held */ -static struct vol_info *find_vol(const char *fullpath) -{ - struct vol_info *vi; + mg = get_mount_group(mount_id); + if (WARN_ON_ONCE(IS_ERR(mg))) + return; - list_for_each_entry(vi, &vol_list, list) { - cifs_dbg(FYI, "%s: vi->fullpath: %s\n", __func__, vi->fullpath); - if (!strcasecmp(vi->fullpath, fullpath)) - return vi; - } - return ERR_PTR(-ENOENT); + spin_lock(&mg->lock); + if (mg->num_sessions < ARRAY_SIZE(mg->sessions)) + mg->sessions[mg->num_sessions++] = ses; + spin_unlock(&mg->lock); + kref_put(&mg->refcount, mount_group_release); } /** - * dfs_cache_update_vol - update vol info in DFS cache after failover + * dfs_cache_put_refsrv_sessions - put all referral server sessions * - * @fullpath: fullpath to look up in volume list. - * @server: TCP ses pointer. + * Put all SMB sessions from the given mount group id. * - * Return zero if volume was updated, otherwise non-zero. + * @mount_id: mount group uuid to lookup. 
*/ -int dfs_cache_update_vol(const char *fullpath, struct TCP_Server_Info *server) +void dfs_cache_put_refsrv_sessions(const uuid_t *mount_id) { - struct vol_info *vi; - - if (!fullpath || !server) - return -EINVAL; - - cifs_dbg(FYI, "%s: fullpath: %s\n", __func__, fullpath); - - spin_lock(&vol_list_lock); - vi = find_vol(fullpath); - if (IS_ERR(vi)) { - spin_unlock(&vol_list_lock); - return PTR_ERR(vi); - } - kref_get(&vi->refcnt); - spin_unlock(&vol_list_lock); - - cifs_dbg(FYI, "%s: updating volume info\n", __func__); - spin_lock(&vi->ctx_lock); - memcpy(&vi->ctx.dstaddr, &server->dstaddr, - sizeof(vi->ctx.dstaddr)); - spin_unlock(&vi->ctx_lock); + struct mount_group *mg; - kref_put(&vi->refcnt, vol_release); - - return 0; -} - -/** - * dfs_cache_del_vol - remove volume info in DFS cache during umount() - * - * @fullpath: fullpath to look up in volume list. - */ -void dfs_cache_del_vol(const char *fullpath) -{ - struct vol_info *vi; - - if (!fullpath || !*fullpath) + if (!mount_id || uuid_is_null(mount_id)) return; - cifs_dbg(FYI, "%s: fullpath: %s\n", __func__, fullpath); - - spin_lock(&vol_list_lock); - vi = find_vol(fullpath); - spin_unlock(&vol_list_lock); - - if (!IS_ERR(vi)) - kref_put(&vi->refcnt, vol_release); + mutex_lock(&mount_group_list_lock); + mg = find_mount_group_locked(mount_id); + if (IS_ERR(mg)) { + mutex_unlock(&mount_group_list_lock); + return; + } + mutex_unlock(&mount_group_list_lock); + kref_put(&mg->refcount, mount_group_release); } /** @@ -1276,8 +1237,8 @@ void dfs_cache_del_vol(const char *fullpath) * * Return zero if target was parsed correctly, otherwise non-zero. 
*/ -int dfs_cache_get_tgt_share(char *path, const struct dfs_cache_tgt_iterator *it, - char **share, char **prefix) +int dfs_cache_get_tgt_share(char *path, const struct dfs_cache_tgt_iterator *it, char **share, + char **prefix) { char *s, sep, *p; size_t len; @@ -1332,278 +1293,190 @@ int dfs_cache_get_tgt_share(char *path, const struct dfs_cache_tgt_iterator *it, return 0; } -/* Get all tcons that are within a DFS namespace and can be refreshed */ -static void get_tcons(struct TCP_Server_Info *server, struct list_head *head) +/* + * Refresh all active dfs mounts regardless of whether they are in cache or not. + * (cache can be cleared) + */ +static void refresh_mounts(struct cifs_ses **sessions) { + struct TCP_Server_Info *server; struct cifs_ses *ses; - struct cifs_tcon *tcon; + struct cifs_tcon *tcon, *ntcon; + struct list_head tcons; + unsigned int xid; - INIT_LIST_HEAD(head); + INIT_LIST_HEAD(&tcons); spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { - list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { - if (!tcon->need_reconnect && !tcon->need_reopen_files && - tcon->dfs_path) { - tcon->tc_count++; - list_add_tail(&tcon->ulist, head); + list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { + if (tcon->dfs_path) { + tcon->tc_count++; + list_add_tail(&tcon->ulist, &tcons); + } } } - if (ses->tcon_ipc && !ses->tcon_ipc->need_reconnect && - ses->tcon_ipc->dfs_path) { - list_add_tail(&ses->tcon_ipc->ulist, head); - } } spin_unlock(&cifs_tcp_ses_lock); -} -static bool is_dfs_link(const char *path) -{ - char *s; - - s = strchr(path + 1, '\\'); - if (!s) - return false; - return !!strchr(s + 1, '\\'); -} - -static char *get_dfs_root(const char *path) -{ - char *s, *npath; - - s = strchr(path + 1, '\\'); - if (!s) - return ERR_PTR(-EINVAL); - - s = strchr(s + 1, '\\'); - if (!s) - 
return ERR_PTR(-EINVAL); - - npath = kstrndup(path, s - path, GFP_KERNEL); - if (!npath) - return ERR_PTR(-ENOMEM); + list_for_each_entry_safe(tcon, ntcon, &tcons, ulist) { + const char *path = tcon->dfs_path + 1; + struct cache_entry *ce; + struct dfs_info3_param *refs = NULL; + int numrefs = 0; + bool needs_refresh = false; + int rc = 0; - return npath; -} + list_del_init(&tcon->ulist); -static inline void put_tcp_server(struct TCP_Server_Info *server) -{ - cifs_put_tcp_session(server, 0); -} + ses = find_ipc_from_server_path(sessions, path); + if (IS_ERR(ses)) + goto next_tcon; -static struct TCP_Server_Info *get_tcp_server(struct smb3_fs_context *ctx) -{ - struct TCP_Server_Info *server; + down_read(&htable_rw_lock); + ce = lookup_cache_entry(path); + needs_refresh = IS_ERR(ce) || cache_entry_expired(ce); + up_read(&htable_rw_lock); - server = cifs_find_tcp_session(ctx); - if (IS_ERR_OR_NULL(server)) - return NULL; + if (!needs_refresh) + goto next_tcon; + + xid = get_xid(); + rc = get_dfs_referral(xid, ses, path, &refs, &numrefs); + free_xid(xid); + + /* Create or update a cache entry with the new referral */ + if (!rc) { + down_write(&htable_rw_lock); + ce = lookup_cache_entry(path); + if (IS_ERR(ce)) + add_cache_entry_locked(refs, numrefs); + else if (cache_entry_expired(ce)) + update_cache_entry_locked(ce, refs, numrefs); + up_write(&htable_rw_lock); + } - spin_lock(&GlobalMid_Lock); - if (server->tcpStatus != CifsGood) { - spin_unlock(&GlobalMid_Lock); - put_tcp_server(server); - return NULL; +next_tcon: + free_dfs_info_array(refs, numrefs); + cifs_put_tcon(tcon); } - spin_unlock(&GlobalMid_Lock); - - return server; } -/* Find root SMB session out of a DFS link path */ -static struct cifs_ses *find_root_ses(struct vol_info *vi, - struct cifs_tcon *tcon, - const char *path) +static void refresh_cache(struct cifs_ses **sessions) { - char *rpath; - int rc; - struct cache_entry *ce; - struct dfs_info3_param ref = {0}; - char *mdata = NULL, *devname = NULL; - 
struct TCP_Server_Info *server; + int i; struct cifs_ses *ses; - struct smb3_fs_context ctx = {NULL}; + unsigned int xid; + char *ref_paths[CACHE_MAX_ENTRIES]; + int count = 0; + struct cache_entry *ce; - rpath = get_dfs_root(path); - if (IS_ERR(rpath)) - return ERR_CAST(rpath); + /* + * Refresh all cached entries. Get all new referrals outside critical section to avoid + * starvation while performing SMB2 IOCTL on broken or slow connections. + * The cache entries may cover more paths than the active mounts + * (e.g. domain-based DFS referrals or multi tier DFS setups). + */ down_read(&htable_rw_lock); + for (i = 0; i < CACHE_HTABLE_SIZE; i++) { + struct hlist_head *l = &cache_htable[i]; - ce = lookup_cache_entry(rpath, NULL); - if (IS_ERR(ce)) { - up_read(&htable_rw_lock); - ses = ERR_CAST(ce); - goto out; - } - - rc = setup_referral(path, ce, &ref, get_tgt_name(ce)); - if (rc) { - up_read(&htable_rw_lock); - ses = ERR_PTR(rc); - goto out; + hlist_for_each_entry(ce, l, hlist) { + if (count == ARRAY_SIZE(ref_paths)) + goto out_unlock; + if (hlist_unhashed(&ce->hlist) || !cache_entry_expired(ce) || + IS_ERR(find_ipc_from_server_path(sessions, ce->path))) + continue; + ref_paths[count++] = kstrdup(ce->path, GFP_ATOMIC); + } } +out_unlock: up_read(&htable_rw_lock); - mdata = cifs_compose_mount_options(vi->mntdata, rpath, &ref, - &devname); - free_dfs_info_param(&ref); - - if (IS_ERR(mdata)) { - ses = ERR_CAST(mdata); - mdata = NULL; - goto out; - } - - rc = cifs_setup_volume_info(&ctx, NULL, devname); - - if (rc) { - ses = ERR_PTR(rc); - goto out; - } - - server = get_tcp_server(&ctx); - if (!server) { - ses = ERR_PTR(-EHOSTDOWN); - goto out; - } - - ses = cifs_get_smb_ses(server, &ctx); - -out: - smb3_cleanup_fs_context_contents(&ctx); - kfree(mdata); - kfree(rpath); - kfree(devname); - - return ses; -} - -/* Refresh DFS cache entry from a given tcon */ -static int refresh_tcon(struct vol_info *vi, struct cifs_tcon *tcon) -{ - int rc = 0; - unsigned int xid; - const 
char *path, *npath; - struct cache_entry *ce; - struct cifs_ses *root_ses = NULL, *ses; - struct dfs_info3_param *refs = NULL; - int numrefs = 0; - - xid = get_xid(); - - path = tcon->dfs_path + 1; + for (i = 0; i < count; i++) { + char *path = ref_paths[i]; + struct dfs_info3_param *refs = NULL; + int numrefs = 0; + int rc = 0; - rc = get_normalized_path(path, &npath); - if (rc) - goto out_free_xid; - - down_read(&htable_rw_lock); - - ce = lookup_cache_entry(npath, NULL); - if (IS_ERR(ce)) { - rc = PTR_ERR(ce); - up_read(&htable_rw_lock); - goto out_free_path; - } + if (!path) + continue; - if (!cache_entry_expired(ce)) { - up_read(&htable_rw_lock); - goto out_free_path; - } + ses = find_ipc_from_server_path(sessions, path); + if (IS_ERR(ses)) + goto next_referral; - up_read(&htable_rw_lock); + xid = get_xid(); + rc = get_dfs_referral(xid, ses, path, &refs, &numrefs); + free_xid(xid); - /* If it's a DFS Link, then use root SMB session for refreshing it */ - if (is_dfs_link(npath)) { - ses = root_ses = find_root_ses(vi, tcon, npath); - if (IS_ERR(ses)) { - rc = PTR_ERR(ses); - root_ses = NULL; - goto out_free_path; + if (!rc) { + down_write(&htable_rw_lock); + ce = lookup_cache_entry(path); + /* + * We need to re-check it because other tasks might have it deleted or + * updated. 
+ */ + if (!IS_ERR(ce) && cache_entry_expired(ce)) + update_cache_entry_locked(ce, refs, numrefs); + up_write(&htable_rw_lock); } - } else { - ses = tcon->ses; - } - rc = get_dfs_referral(xid, ses, cache_nlsc, tcon->remap, npath, &refs, - &numrefs); - if (!rc) { - dump_refs(refs, numrefs); - rc = update_cache_entry(npath, refs, numrefs); +next_referral: + kfree(path); free_dfs_info_array(refs, numrefs); } - - if (root_ses) - cifs_put_smb_ses(root_ses); - -out_free_path: - free_normalized_path(path, npath); - -out_free_xid: - free_xid(xid); - return rc; } /* - * Worker that will refresh DFS cache based on lowest TTL value from a DFS + * Worker that will refresh DFS cache and active mounts based on lowest TTL value from a DFS * referral. */ static void refresh_cache_worker(struct work_struct *work) { - struct vol_info *vi, *nvi; - struct TCP_Server_Info *server; - LIST_HEAD(vols); - LIST_HEAD(tcons); - struct cifs_tcon *tcon, *ntcon; - int rc; - - /* - * Find SMB volumes that are eligible (server->tcpStatus == CifsGood) - * for refreshing. 
- */ - spin_lock(&vol_list_lock); - list_for_each_entry(vi, &vol_list, list) { - server = get_tcp_server(&vi->ctx); - if (!server) - continue; - - kref_get(&vi->refcnt); - list_add_tail(&vi->rlist, &vols); - put_tcp_server(server); + struct list_head mglist; + struct mount_group *mg, *tmp_mg; + struct cifs_ses *sessions[CACHE_MAX_ENTRIES + 1] = {NULL}; + int max_sessions = ARRAY_SIZE(sessions) - 1; + int i = 0, count; + + INIT_LIST_HEAD(&mglist); + + /* Get refereces of mount groups */ + mutex_lock(&mount_group_list_lock); + list_for_each_entry(mg, &mount_group_list, list) { + kref_get(&mg->refcount); + list_add(&mg->refresh_list, &mglist); } - spin_unlock(&vol_list_lock); - - /* Walk through all TCONs and refresh any expired cache entry */ - list_for_each_entry_safe(vi, nvi, &vols, rlist) { - spin_lock(&vi->ctx_lock); - server = get_tcp_server(&vi->ctx); - spin_unlock(&vi->ctx_lock); + mutex_unlock(&mount_group_list_lock); - if (!server) - goto next_vol; - - get_tcons(server, &tcons); - rc = 0; - - list_for_each_entry_safe(tcon, ntcon, &tcons, ulist) { - /* - * Skip tcp server if any of its tcons failed to refresh - * (possibily due to reconnects). 
- */ - if (!rc) - rc = refresh_tcon(vi, tcon); + /* Fill in local array with an NULL-terminated list of all referral server sessions */ + list_for_each_entry(mg, &mglist, refresh_list) { + if (i >= max_sessions) + break; - list_del_init(&tcon->ulist); - cifs_put_tcon(tcon); - } + spin_lock(&mg->lock); + if (i + mg->num_sessions > max_sessions) + count = max_sessions - i; + else + count = mg->num_sessions; + memcpy(&sessions[i], mg->sessions, count * sizeof(mg->sessions[0])); + spin_unlock(&mg->lock); + i += count; + } - put_tcp_server(server); + if (sessions[0]) { + /* Refresh all active mounts and cached entries */ + refresh_mounts(sessions); + refresh_cache(sessions); + } -next_vol: - list_del_init(&vi->rlist); - kref_put(&vi->refcnt, vol_release); + list_for_each_entry_safe(mg, tmp_mg, &mglist, refresh_list) { + list_del_init(&mg->refresh_list); + kref_put(&mg->refcount, mount_group_release); } spin_lock(&cache_ttl_lock); diff --git a/fs/cifs/dfs_cache.h b/fs/cifs/dfs_cache.h index 1afc4f590c47..b29d3ae64829 100644 --- a/fs/cifs/dfs_cache.h +++ b/fs/cifs/dfs_cache.h @@ -10,6 +10,7 @@ #include <linux/nls.h> #include <linux/list.h> +#include <linux/uuid.h> #include "cifsglob.h" struct dfs_cache_tgt_list { @@ -23,34 +24,26 @@ struct dfs_cache_tgt_iterator { struct list_head it_list; }; -extern int dfs_cache_init(void); -extern void dfs_cache_destroy(void); +int dfs_cache_init(void); +void dfs_cache_destroy(void); extern const struct proc_ops dfscache_proc_ops; -extern int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, - const struct nls_table *nls_codepage, int remap, - const char *path, struct dfs_info3_param *ref, - struct dfs_cache_tgt_list *tgt_list); -extern int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref, - struct dfs_cache_tgt_list *tgt_list); -extern int dfs_cache_update_tgthint(const unsigned int xid, - struct cifs_ses *ses, - const struct nls_table *nls_codepage, - int remap, const char *path, - const struct 
dfs_cache_tgt_iterator *it); -extern int -dfs_cache_noreq_update_tgthint(const char *path, - const struct dfs_cache_tgt_iterator *it); -extern int dfs_cache_get_tgt_referral(const char *path, - const struct dfs_cache_tgt_iterator *it, - struct dfs_info3_param *ref); -extern int dfs_cache_add_vol(char *mntdata, struct smb3_fs_context *ctx, - const char *fullpath); -extern int dfs_cache_update_vol(const char *fullpath, - struct TCP_Server_Info *server); -extern void dfs_cache_del_vol(const char *fullpath); -extern int dfs_cache_get_tgt_share(char *path, const struct dfs_cache_tgt_iterator *it, - char **share, char **prefix); +int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, const struct nls_table *cp, + int remap, const char *path, struct dfs_info3_param *ref, + struct dfs_cache_tgt_list *tgt_list); +int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref, + struct dfs_cache_tgt_list *tgt_list); +int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses, + const struct nls_table *cp, int remap, const char *path, + const struct dfs_cache_tgt_iterator *it); +int dfs_cache_noreq_update_tgthint(const char *path, const struct dfs_cache_tgt_iterator *it); +int dfs_cache_get_tgt_referral(const char *path, const struct dfs_cache_tgt_iterator *it, + struct dfs_info3_param *ref); +int dfs_cache_get_tgt_share(char *path, const struct dfs_cache_tgt_iterator *it, char **share, + char **prefix); +void dfs_cache_put_refsrv_sessions(const uuid_t *mount_id); +void dfs_cache_add_refsrv_session(const uuid_t *mount_id, struct cifs_ses *ses); +char *dfs_cache_canonical_path(const char *path, const struct nls_table *cp, int remap); static inline struct dfs_cache_tgt_iterator * dfs_cache_get_next_tgt(struct dfs_cache_tgt_list *tl, diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 6bcd3e8f7cda..79402ca0ddfa 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/dir.c * @@ -6,19 +7,6 @@ * 
Copyright (C) International Business Machines Corp., 2002,2009 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> #include <linux/stat.h> @@ -396,10 +384,11 @@ cifs_create_set_dentry: goto out_err; } - if (S_ISDIR(newinode->i_mode)) { - rc = -EISDIR; - goto out_err; - } + if (newinode) + if (S_ISDIR(newinode->i_mode)) { + rc = -EISDIR; + goto out_err; + } d_drop(direntry); d_add(direntry, newinode); @@ -630,6 +619,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, struct inode *newInode = NULL; const char *full_path; void *page; + int retry_count = 0; xid = get_xid(); @@ -673,6 +663,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", full_path, d_inode(direntry)); +again: if (pTcon->posix_extensions) rc = smb311_posix_get_inode_info(&newInode, full_path, parent_dir_inode->i_sb, xid); else if (pTcon->unix_ext) { @@ -687,6 +678,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, /* since paths are not looked up by component - the parent directories are presumed to be good here */ renew_parental_timestamps(direntry); + } else if (rc == -EAGAIN && retry_count++ < 10) { + goto again; } else if (rc == 
-ENOENT) { cifs_set_time(direntry, jiffies); newInode = NULL; diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index 534cbba72789..d15b82d569ef 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/dns_resolve.c * @@ -10,19 +11,6 @@ * Contains the CIFS DFS upcall routines used for hostname to * IP address translation. * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/slab.h> diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h index d3f5d27f4d06..5be060b82b13 100644 --- a/fs/cifs/dns_resolve.h +++ b/fs/cifs/dns_resolve.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/dns_resolve.h -- DNS Resolver upcall management for CIFS DFS * Handles host name to IP address resolution @@ -5,19 +6,6 @@ * Copyright (c) International Business Machines Corp., 2008 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. 
- * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _DNS_RESOLVE_H diff --git a/fs/cifs/export.c b/fs/cifs/export.c index eb0bb8ca8e63..747a540db954 100644 --- a/fs/cifs/export.c +++ b/fs/cifs/export.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/export.c * @@ -8,19 +9,6 @@ * * Operations related to support for exporting files via NFSD * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 379a427f3c2f..cd108607a070 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/file.c * @@ -7,19 +8,6 @@ * Author(s): Steve French (sfrench@us.ibm.com) * Jeremy Allison (jra@samba.org) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> #include <linux/backing-dev.h> diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index 20d24af33ee2..dd625033cd6b 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -1,22 +1,10 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/fscache.c - CIFS filesystem cache interface * * Copyright (c) 2010 Novell, Inc. * Author(s): Suresh Jayaraman <sjayaraman@suse.de> * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. 
- * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include "fscache.h" #include "cifsglob.h" diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h index e811f2dd7619..3d55cb2ef055 100644 --- a/fs/cifs/fscache.h +++ b/fs/cifs/fscache.h @@ -1,22 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/fscache.h - CIFS filesystem cache interface definitions * * Copyright (c) 2010 Novell, Inc. * Authors(s): Suresh Jayaraman (sjayaraman@suse.de> * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _CIFS_FSCACHE_H #define _CIFS_FSCACHE_H diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 1dfa57982522..b96b253e7635 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1,22 +1,10 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/inode.c * * Copyright (C) International Business Machines Corp., 2002,2010 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> #include <linux/stat.h> @@ -367,9 +355,12 @@ cifs_get_file_info_unix(struct file *filp) } else if (rc == -EREMOTE) { cifs_create_dfs_fattr(&fattr, inode->i_sb); rc = 0; - } + } else + goto cifs_gfiunix_out; rc = cifs_fattr_to_inode(inode, &fattr); + +cifs_gfiunix_out: free_xid(xid); return rc; } diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index d67d281ab863..42c6a0bac6c8 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/ioctl.c * @@ -6,19 +7,6 @@ * Copyright (C) International Business Machines Corp., 2005,2013 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 970fcf2adb08..f0a6d63bc08c 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -1,22 +1,10 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/link.c * * Copyright (C) International Business Machines Corp., 2002,2008 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> #include <linux/stat.h> diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 7207a63819cb..184138b4eb8c 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -1,22 +1,10 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/misc.c * * Copyright (C) International Business Machines Corp., 2002,2008 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. 
- * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/slab.h> diff --git a/fs/cifs/netlink.c b/fs/cifs/netlink.c index 5aaabe4cc0a7..291cb606f149 100644 --- a/fs/cifs/netlink.c +++ b/fs/cifs/netlink.c @@ -30,7 +30,7 @@ static const struct nla_policy cifs_genl_policy[CIFS_GENL_ATTR_MAX + 1] = { [CIFS_GENL_ATTR_SWN_RESOURCE_NAME] = { .type = NLA_STRING}, }; -static struct genl_ops cifs_genl_ops[] = { +static const struct genl_ops cifs_genl_ops[] = { { .cmd = CIFS_GENL_CMD_SWN_NOTIFY, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h index 3079b38f0afb..378133ce8869 100644 --- a/fs/cifs/ntlmssp.h +++ b/fs/cifs/ntlmssp.h @@ -1,22 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/ntlmssp.h * * Copyright (c) International Business Machines Corp., 2002,2007 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #define NTLMSSP_SIGNATURE "NTLMSSP" diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 63bfc533c9fb..bfee176b901d 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/readdir.c * @@ -7,19 +8,6 @@ * Copyright (C) Red Hat, Inc., 2011 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> #include <linux/pagemap.h> @@ -321,7 +309,7 @@ static void cifs_fulldir_info_to_fattr(struct cifs_fattr *fattr, { __dir_info_to_fattr(fattr, info); - /* See MS-FSCC 2.4.18 FileIdFullDirectoryInformation */ + /* See MS-FSCC 2.4.19 FileIdFullDirectoryInformation */ if (fattr->cf_cifsattrs & ATTR_REPARSE) fattr->cf_cifstag = le32_to_cpu(info->EaSize); cifs_fill_common_info(fattr, cifs_sb); diff --git a/fs/cifs/rfc1002pdu.h b/fs/cifs/rfc1002pdu.h index 8b69fcceb597..137f7c95afd6 100644 --- a/fs/cifs/rfc1002pdu.h +++ b/fs/cifs/rfc1002pdu.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/rfc1002pdu.h * @@ -6,19 +7,6 @@ * Copyright (c) International Business Machines Corp., 2004 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* NB: unlike smb/cifs packets, the RFC1002 structures are big endian */ diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index a92a1fb7cb52..c5785fd3f52e 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/sess.c * @@ -6,19 +7,6 @@ * Copyright (c) International Business Machines Corp., 2006, 2009 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include "cifspdu.h" @@ -195,7 +183,7 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, ses, iface->speed, iface->rdma_capable ? "yes" : "no", &ipv4->sin_addr); else - cifs_dbg(FYI, "adding channel to ses %p (speed:%zu bps rdma:%s ip:%pI4)\n", + cifs_dbg(FYI, "adding channel to ses %p (speed:%zu bps rdma:%s ip:%pI6)\n", ses, iface->speed, iface->rdma_capable ? 
"yes" : "no", &ipv6->sin6_addr); diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 2fa3ba354cc9..c9d8a50062b8 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/smb2file.c * @@ -5,19 +6,6 @@ * Author(s): Steve French (sfrench@us.ibm.com), * Pavel Shilovsky ((pshilovsky@samba.org) 2012 * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> #include <linux/stat.h> diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h index d9a990c99121..d0e9f3782bd9 100644 --- a/fs/cifs/smb2glob.h +++ b/fs/cifs/smb2glob.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/smb2glob.h * @@ -9,16 +10,6 @@ * Jeremy Allison (jra@samba.org) * Pavel Shilovsky (pshilovsky@samba.org) 2012 * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See - * the GNU Lesser General Public License for more details. - * */ #ifndef _SMB2_GLOB_H #define _SMB2_GLOB_H diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 9a61209a283e..957b2594f02e 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/smb2inode.c * @@ -6,19 +7,6 @@ * Author(s): Pavel Shilovsky (pshilovsky@samba.org), * Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> #include <linux/stat.h> diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index c775682ee973..cea39bcecbab 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/smb2/smb2maperror.c * @@ -6,19 +7,6 @@ * Copyright (C) International Business Machines Corp., 2009 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. 
- * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/errno.h> #include "cifsglob.h" diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 06d555d4da9a..668f77108831 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/smb2misc.c * @@ -6,19 +7,6 @@ * Author(s): Steve French (sfrench@us.ibm.com) * Pavel Shilovsky (pshilovsky@samba.org) 2012 * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/ctype.h> #include "smb2pdu.h" @@ -164,19 +152,16 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr) struct smb2_transform_hdr *thdr = (struct smb2_transform_hdr *)buf; struct cifs_ses *ses = NULL; - struct list_head *tmp; /* decrypt frame now that it is completely read in */ spin_lock(&cifs_tcp_ses_lock); - list_for_each(tmp, &srvr->smb_ses_list) { - ses = list_entry(tmp, struct cifs_ses, smb_ses_list); + list_for_each_entry(ses, &srvr->smb_ses_list, smb_ses_list) { if (ses->Suid == thdr->SessionId) break; - - ses = NULL; } spin_unlock(&cifs_tcp_ses_lock); - if (ses == NULL) { + if (list_entry_is_head(ses, &srvr->smb_ses_list, + smb_ses_list)) { cifs_dbg(VFS, "no decryption - session id not found\n"); return 1; } @@ -548,7 +533,6 @@ static bool smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp) { __u8 lease_state; - struct list_head *tmp; struct cifsFileInfo *cfile; struct cifsInodeInfo *cinode; int ack_req = le32_to_cpu(rsp->Flags & @@ -556,8 +540,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp) lease_state = le32_to_cpu(rsp->NewLeaseState); - list_for_each(tmp, &tcon->openFileList) { - cfile = list_entry(tmp, struct cifsFileInfo, tlist); + list_for_each_entry(cfile, &tcon->openFileList, tlist) { cinode = CIFS_I(d_inode(cfile->dentry)); if (memcmp(cinode->lease_key, rsp->LeaseKey, @@ -618,7 +601,6 @@ static bool smb2_is_valid_lease_break(char *buffer) { struct smb2_lease_break *rsp = (struct smb2_lease_break *)buffer; - struct list_head *tmp, *tmp1, *tmp2; struct TCP_Server_Info *server; struct cifs_ses *ses; struct cifs_tcon *tcon; @@ -628,15 +610,9 @@ smb2_is_valid_lease_break(char *buffer) /* look up tcon based on tid & uid */ 
spin_lock(&cifs_tcp_ses_lock); - list_for_each(tmp, &cifs_tcp_ses_list) { - server = list_entry(tmp, struct TCP_Server_Info, tcp_ses_list); - - list_for_each(tmp1, &server->smb_ses_list) { - ses = list_entry(tmp1, struct cifs_ses, smb_ses_list); - - list_for_each(tmp2, &ses->tcon_list) { - tcon = list_entry(tmp2, struct cifs_tcon, - tcon_list); + list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { spin_lock(&tcon->open_file_lock); cifs_stats_inc( &tcon->stats.cifs_stats.num_oplock_brks); @@ -687,7 +663,6 @@ bool smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) { struct smb2_oplock_break *rsp = (struct smb2_oplock_break *)buffer; - struct list_head *tmp, *tmp1, *tmp2; struct cifs_ses *ses; struct cifs_tcon *tcon; struct cifsInodeInfo *cinode; @@ -710,16 +685,11 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); - list_for_each(tmp, &server->smb_ses_list) { - ses = list_entry(tmp, struct cifs_ses, smb_ses_list); - - list_for_each(tmp1, &ses->tcon_list) { - tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { spin_lock(&tcon->open_file_lock); - list_for_each(tmp2, &tcon->openFileList) { - cfile = list_entry(tmp2, struct cifsFileInfo, - tlist); + list_for_each_entry(cfile, &tcon->openFileList, tlist) { if (rsp->PersistentFid != cfile->fid.persistent_fid || rsp->VolatileFid != diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 21ef51d338e0..e4c8f603dd58 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -388,7 +388,9 @@ smb2_negotiate(const unsigned int xid, struct cifs_ses *ses) { int rc; + spin_lock(&GlobalMid_Lock); cifs_ses_server(ses)->CurrentMid = 0; + 
spin_unlock(&GlobalMid_Lock); rc = SMB2_negotiate(xid, ses); /* BB we probably don't need to retry with modern servers */ if (rc == -EAGAIN) @@ -2325,6 +2327,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, struct smb2_query_directory_rsp *qd_rsp = NULL; struct smb2_create_rsp *op_rsp = NULL; struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses); + int retry_count = 0; utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); if (!utf16_path) @@ -2372,10 +2375,14 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, smb2_set_related(&rqst[1]); +again: rc = compound_send_recv(xid, tcon->ses, server, flags, 2, rqst, resp_buftype, rsp_iov); + if (rc == -EAGAIN && retry_count++ < 10) + goto again; + /* If the open failed there is nothing to do */ op_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; if (op_rsp == NULL || op_rsp->sync_hdr.Status != STATUS_SUCCESS) { @@ -3601,6 +3608,119 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, return rc; } +static int smb3_simple_fallocate_write_range(unsigned int xid, + struct cifs_tcon *tcon, + struct cifsFileInfo *cfile, + loff_t off, loff_t len, + char *buf) +{ + struct cifs_io_parms io_parms = {0}; + int nbytes; + struct kvec iov[2]; + + io_parms.netfid = cfile->fid.netfid; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.persistent_fid = cfile->fid.persistent_fid; + io_parms.volatile_fid = cfile->fid.volatile_fid; + io_parms.offset = off; + io_parms.length = len; + + /* iov[0] is reserved for smb header */ + iov[1].iov_base = buf; + iov[1].iov_len = io_parms.length; + return SMB2_write(xid, &io_parms, &nbytes, iov, 1); +} + +static int smb3_simple_fallocate_range(unsigned int xid, + struct cifs_tcon *tcon, + struct cifsFileInfo *cfile, + loff_t off, loff_t len) +{ + struct file_allocated_range_buffer in_data, *out_data = NULL, *tmp_data; + u32 out_data_len; + char *buf = NULL; + loff_t l; + int rc; + + in_data.file_offset = 
cpu_to_le64(off); + in_data.length = cpu_to_le64(len); + rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + FSCTL_QUERY_ALLOCATED_RANGES, true, + (char *)&in_data, sizeof(in_data), + 1024 * sizeof(struct file_allocated_range_buffer), + (char **)&out_data, &out_data_len); + if (rc) + goto out; + /* + * It is already all allocated + */ + if (out_data_len == 0) + goto out; + + buf = kzalloc(1024 * 1024, GFP_KERNEL); + if (buf == NULL) { + rc = -ENOMEM; + goto out; + } + + tmp_data = out_data; + while (len) { + /* + * The rest of the region is unmapped so write it all. + */ + if (out_data_len == 0) { + rc = smb3_simple_fallocate_write_range(xid, tcon, + cfile, off, len, buf); + goto out; + } + + if (out_data_len < sizeof(struct file_allocated_range_buffer)) { + rc = -EINVAL; + goto out; + } + + if (off < le64_to_cpu(tmp_data->file_offset)) { + /* + * We are at a hole. Write until the end of the region + * or until the next allocated data, + * whichever comes next. + */ + l = le64_to_cpu(tmp_data->file_offset) - off; + if (len < l) + l = len; + rc = smb3_simple_fallocate_write_range(xid, tcon, + cfile, off, l, buf); + if (rc) + goto out; + off = off + l; + len = len - l; + if (len == 0) + goto out; + } + /* + * We are at a section of allocated data, just skip forward + * until the end of the data or the end of the region + * we are supposed to fallocate, whichever comes first. 
+ */ + l = le64_to_cpu(tmp_data->length); + if (len < l) + l = len; + off += l; + len -= l; + + tmp_data = &tmp_data[1]; + out_data_len -= sizeof(struct file_allocated_range_buffer); + } + + out: + kfree(out_data); + kfree(buf); + return rc; +} + + static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, loff_t off, loff_t len, bool keep_size) { @@ -3662,6 +3782,26 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, if ((keep_size == true) || (i_size_read(inode) >= off + len)) { /* + * At this point, we are trying to fallocate an internal + * regions of a sparse file. Since smb2 does not have a + * fallocate command we have two otions on how to emulate this. + * We can either turn the entire file to become non-sparse + * which we only do if the fallocate is for virtually + * the whole file, or we can overwrite the region with zeroes + * using SMB2_write, which could be prohibitevly expensive + * if len is large. + */ + /* + * We are only trying to fallocate a small region so + * just write it with zero. + */ + if (len <= 1024 * 1024) { + rc = smb3_simple_fallocate_range(xid, tcon, cfile, + off, len); + goto out; + } + + /* * Check if falloc starts within first few pages of file * and ends within a few pages of the end of file to * ensure that most of file is being forced to be diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index c205f93e0a10..962826dc3316 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/smb2pdu.c * @@ -8,19 +9,6 @@ * * Contains the routines for constructing the SMB2 PDUs themselves * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. 
- * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* SMB2 PDU handling routines here - except for leftovers (eg session setup) */ @@ -1791,10 +1779,8 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, rsp = (struct smb2_tree_connect_rsp *)rsp_iov.iov_base; trace_smb3_tcon(xid, tcon->tid, ses->Suid, tree, rc); if (rc != 0) { - if (tcon) { - cifs_stats_fail_inc(tcon, SMB2_TREE_CONNECT_HE); - tcon->need_reconnect = true; - } + cifs_stats_fail_inc(tcon, SMB2_TREE_CONNECT_HE); + tcon->need_reconnect = true; goto tcon_error_exit; } @@ -2906,7 +2892,10 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, #endif /* CIFS_DEBUG2 */ if (buf) { - memcpy(buf, &rsp->CreationTime, 32); + buf->CreationTime = rsp->CreationTime; + buf->LastAccessTime = rsp->LastAccessTime; + buf->LastWriteTime = rsp->LastWriteTime; + buf->ChangeTime = rsp->ChangeTime; buf->AllocationSize = rsp->AllocationSize; buf->EndOfFile = rsp->EndofFile; buf->Attributes = rsp->FileAttributes; @@ -3484,6 +3473,8 @@ int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, NULL); } +#if 0 +/* currently unused, as now we are doing compounding instead (see smb311_posix_query_path_info) */ int SMB311_posix_query_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, struct smb311_posix_qinfo *data, u32 *plen) @@ -3495,7 +3486,9 @@ SMB311_posix_query_info(const unsigned int xid, struct cifs_tcon *tcon, return query_info(xid, tcon, persistent_fid, volatile_fid, SMB_FIND_FILE_POSIX_INFO, 
SMB2_O_INFO_FILE, 0, output_len, sizeof(struct smb311_posix_qinfo), (void **)&data, plen); + /* Note caller must free "data" (passed in above). It may be allocated in query_info call */ } +#endif int SMB2_query_acl(const unsigned int xid, struct cifs_tcon *tcon, @@ -4498,7 +4491,7 @@ int posix_info_parse(const void *beg, const void *end, { int total_len = 0; - int sid_len; + int owner_len, group_len; int name_len; const void *owner_sid; const void *group_sid; @@ -4521,17 +4514,17 @@ int posix_info_parse(const void *beg, const void *end, /* check owner sid */ owner_sid = beg + total_len; - sid_len = posix_info_sid_size(owner_sid, end); - if (sid_len < 0) + owner_len = posix_info_sid_size(owner_sid, end); + if (owner_len < 0) return -1; - total_len += sid_len; + total_len += owner_len; /* check group sid */ group_sid = beg + total_len; - sid_len = posix_info_sid_size(group_sid, end); - if (sid_len < 0) + group_len = posix_info_sid_size(group_sid, end); + if (group_len < 0) return -1; - total_len += sid_len; + total_len += group_len; /* check name len */ if (beg + total_len + 4 > end) @@ -4552,10 +4545,8 @@ int posix_info_parse(const void *beg, const void *end, out->size = total_len; out->name_len = name_len; out->name = name; - memcpy(&out->owner, owner_sid, - posix_info_sid_size(owner_sid, end)); - memcpy(&out->group, group_sid, - posix_info_sid_size(group_sid, end)); + memcpy(&out->owner, owner_sid, owner_len); + memcpy(&out->group, group_sid, group_len); } return total_len; } diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 6442dc1c292b..a5c48b85549a 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/smb2pdu.h * @@ -6,19 +7,6 @@ * Author(s): Steve French (sfrench@us.ibm.com) * Pavel Shilovsky (pshilovsky@samba.org) 2012 * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the 
Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _SMB2PDU_H @@ -276,7 +264,7 @@ struct share_redirect_error_context_rsp { __le32 NotificationType; __le32 ResourceNameOffset; __le32 ResourceNameLength; - __le16 Flags; + __le16 Reserved; __le16 TargetType; __le32 IPAddrCount; struct move_dst_ipaddr IpAddrMoveList[]; @@ -1460,6 +1448,22 @@ struct smb2_echo_rsp { #define SMB2_QUERY_DIRECTORY_IOV_SIZE 2 +/* + * Valid FileInformation classes. + * + * Note that these are a subset of the (file) QUERY_INFO levels defined + * later in this file (but since QUERY_DIRECTORY uses equivalent numbers + * we do not redefine them here) + * + * FileDirectoryInfomation 0x01 + * FileFullDirectoryInformation 0x02 + * FileIdFullDirectoryInformation 0x26 + * FileBothDirectoryInformation 0x03 + * FileIdBothDirectoryInformation 0x25 + * FileNamesInformation 0x0C + * FileIdExtdDirectoryInformation 0x3C + */ + struct smb2_query_directory_req { struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 33 */ @@ -1696,6 +1700,7 @@ struct smb3_fs_vol_info { #define FILEID_GLOBAL_TX_DIRECTORY_INFORMATION 50 #define FILE_STANDARD_LINK_INFORMATION 54 #define FILE_ID_INFORMATION 59 +#define FILE_ID_EXTD_DIRECTORY_INFORMATION 60 struct smb2_file_internal_info { __le64 IndexNumber; @@ -1776,13 +1781,31 @@ struct smb2_file_network_open_info { __le32 Reserved; } __packed; /* level 34 Query also similar returned in close rsp and open rsp */ -/* See MS-FSCC 2.4.43 
*/ +/* See MS-FSCC 2.4.21 */ struct smb2_file_id_information { __le64 VolumeSerialNumber; __u64 PersistentFileId; /* opaque endianness */ __u64 VolatileFileId; /* opaque endianness */ } __packed; /* level 59 */ +/* See MS-FSCC 2.4.18 */ +struct smb2_file_id_extd_directory_info { + __le32 NextEntryOffset; + __u32 FileIndex; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 EndOfFile; + __le64 AllocationSize; + __le32 FileAttributes; + __le32 FileNameLength; + __le32 EaSize; /* EA size */ + __le32 ReparsePointTag; /* valid if FILE_ATTR_REPARSE_POINT set in FileAttributes */ + __le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit */ + char FileName[1]; +} __packed; /* level 60 */ + extern char smb2_padding[7]; /* equivalent of the contents of SMB3.1.1 POSIX open context response */ diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index a5f87b02cfaf..263767f644f8 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/smb2proto.h * @@ -6,19 +7,6 @@ * Author(s): Steve French (sfrench@us.ibm.com) * Pavel Shilovsky (pshilovsky@samba.org) 2012 * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _SMB2PROTO_H #define _SMB2PROTO_H @@ -64,8 +52,6 @@ extern void smb2_echo_request(struct work_struct *work); extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode); extern bool smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv); -extern struct cifs_ses *smb2_find_smb_ses(struct TCP_Server_Info *server, - __u64 ses_id); extern int smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid); diff --git a/fs/cifs/smb2status.h b/fs/cifs/smb2status.h index 7505056e9580..0215ef36e240 100644 --- a/fs/cifs/smb2status.h +++ b/fs/cifs/smb2status.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/smb2status.h * @@ -7,19 +8,6 @@ * Copyright (c) International Business Machines Corp., 2009,2011 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index e6fa76ab70be..6f7952ea4941 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/smb2transport.c * @@ -7,19 +8,6 @@ * Jeremy Allison (jra@samba.org) 2006 * Pavel Shilovsky (pshilovsky@samba.org) 2012 * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> @@ -154,6 +142,7 @@ smb2_find_smb_ses_unlocked(struct TCP_Server_Info *server, __u64 ses_id) list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { if (ses->Suid != ses_id) continue; + ++ses->ses_count; return ses; } @@ -205,7 +194,14 @@ smb2_find_smb_tcon(struct TCP_Server_Info *server, __u64 ses_id, __u32 tid) return NULL; } tcon = smb2_find_smb_sess_tcon_unlocked(ses, tid); + if (!tcon) { + cifs_put_smb_ses(ses); + spin_unlock(&cifs_tcp_ses_lock); + return NULL; + } spin_unlock(&cifs_tcp_ses_lock); + /* tcon already has a ref to ses, so we don't need ses anymore */ + cifs_put_smb_ses(ses); return tcon; } @@ -239,7 +235,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, if (rc) { cifs_server_dbg(VFS, "%s: sha256 alloc failed\n", __func__); - return rc; + goto out; } shash = &sdesc->shash; } else { @@ -290,6 +286,8 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, out: if (allocate_crypto) cifs_free_hash(&hash, &sdesc); + if (ses) + cifs_put_smb_ses(ses); return rc; } diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index 10dfe5006792..31ef64eb7fbb 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -572,8 +572,13 @@ static struct rdma_cm_id *smbd_create_id( log_rdma_event(ERR, "rdma_resolve_addr() failed %i\n", rc); goto out; } - wait_for_completion_interruptible_timeout( + rc = wait_for_completion_interruptible_timeout( &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT)); + /* e.g. 
if interrupted returns -ERESTARTSYS */ + if (rc < 0) { + log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc); + goto out; + } rc = info->ri_rc; if (rc) { log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc); @@ -586,8 +591,13 @@ static struct rdma_cm_id *smbd_create_id( log_rdma_event(ERR, "rdma_resolve_route() failed %i\n", rc); goto out; } - wait_for_completion_interruptible_timeout( + rc = wait_for_completion_interruptible_timeout( &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT)); + /* e.g. if interrupted returns -ERESTARTSYS */ + if (rc < 0) { + log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc); + goto out; + } rc = info->ri_rc; if (rc) { log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc); diff --git a/fs/cifs/smberr.h b/fs/cifs/smberr.h index 7f16cb825fe5..60189efb3236 100644 --- a/fs/cifs/smberr.h +++ b/fs/cifs/smberr.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/smberr.h * @@ -7,19 +8,6 @@ * See Error Codes section of the SNIA CIFS Specification * for more information * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #define SUCCESS 0x00 /* The request was successful. 
*/ diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h index a0e84747f567..d0fc42061f49 100644 --- a/fs/cifs/smbfsctl.h +++ b/fs/cifs/smbfsctl.h @@ -1,22 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions * * Copyright (c) International Business Machines Corp., 2002,2013 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* IOCTL information */ diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index c1725b55f364..f65f9a692ca2 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/transport.c * @@ -5,19 +6,6 @@ * Author(s): Steve French (sfrench@us.ibm.com) * Jeremy Allison (jra@samba.org) 2006. * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index aa3e8ca0457c..9ed481e79ce0 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -1,22 +1,10 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/xattr.c * * Copyright (c) International Business Machines Corp., 2003, 2007 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> diff --git a/fs/configfs/file.c b/fs/configfs/file.c index e26060dae70a..2f63bf3a7325 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -14,7 +14,7 @@ #include <linux/mutex.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> - +#include <linux/uio.h> #include <linux/configfs.h> #include "configfs_internal.h" @@ -77,28 +77,9 @@ static int fill_read_buffer(struct file *file, struct configfs_buffer *buffer) return 0; } -/** - * configfs_read_file - read an attribute. - * @file: file pointer. - * @buf: buffer to fill. - * @count: number of bytes to read. 
- * @ppos: starting offset in file. - * - * Userspace wants to read an attribute file. The attribute descriptor - * is in the file's ->d_fsdata. The target item is in the directory's - * ->d_fsdata. - * - * We call fill_read_buffer() to allocate and fill the buffer from the - * item's show() method exactly once (if the read is happening from - * the beginning of the file). That should fill the entire buffer with - * all the data the item has to offer for that attribute. - * We then call flush_read_buffer() to copy the buffer to userspace - * in the increments specified. - */ - -static ssize_t -configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) +static ssize_t configfs_read_iter(struct kiocb *iocb, struct iov_iter *to) { + struct file *file = iocb->ki_filp; struct configfs_buffer *buffer = file->private_data; ssize_t retval = 0; @@ -108,43 +89,24 @@ configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *pp if (retval) goto out; } - pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n", - __func__, count, *ppos, buffer->page); - retval = simple_read_from_buffer(buf, count, ppos, buffer->page, - buffer->count); + pr_debug("%s: count = %zd, pos = %lld, buf = %s\n", + __func__, iov_iter_count(to), iocb->ki_pos, buffer->page); + retval = copy_to_iter(buffer->page, buffer->count, to); + iocb->ki_pos += retval; + if (retval == 0) + retval = -EFAULT; out: mutex_unlock(&buffer->mutex); return retval; } -/** - * configfs_read_bin_file - read a binary attribute. - * @file: file pointer. - * @buf: buffer to fill. - * @count: number of bytes to read. - * @ppos: starting offset in file. - * - * Userspace wants to read a binary attribute file. The attribute - * descriptor is in the file's ->d_fsdata. The target item is in the - * directory's ->d_fsdata. - * - * We check whether we need to refill the buffer. If so we will - * call the attributes' attr->read() twice. 
The first time we - * will pass a NULL as a buffer pointer, which the attributes' method - * will use to return the size of the buffer required. If no error - * occurs we will allocate the buffer using vmalloc and call - * attr->read() again passing that buffer as an argument. - * Then we just copy to user-space using simple_read_from_buffer. - */ - -static ssize_t -configfs_read_bin_file(struct file *file, char __user *buf, - size_t count, loff_t *ppos) +static ssize_t configfs_bin_read_iter(struct kiocb *iocb, struct iov_iter *to) { + struct file *file = iocb->ki_filp; struct configfs_fragment *frag = to_frag(file); struct configfs_buffer *buffer = file->private_data; ssize_t retval = 0; - ssize_t len = min_t(size_t, count, PAGE_SIZE); + ssize_t len; mutex_lock(&buffer->mutex); @@ -200,42 +162,31 @@ configfs_read_bin_file(struct file *file, char __user *buf, buffer->needs_read_fill = 0; } - retval = simple_read_from_buffer(buf, count, ppos, buffer->bin_buffer, - buffer->bin_buffer_size); + retval = copy_to_iter(buffer->bin_buffer, buffer->bin_buffer_size, to); + iocb->ki_pos += retval; + if (retval == 0) + retval = -EFAULT; out: mutex_unlock(&buffer->mutex); return retval; } - -/** - * fill_write_buffer - copy buffer from userspace. - * @buffer: data buffer for file. - * @buf: data from user. - * @count: number of bytes in @userbuf. - * - * Allocate @buffer->page if it hasn't been already, then - * copy the user-supplied buffer into it. 
- */ - -static int -fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size_t count) +static int fill_write_buffer(struct configfs_buffer *buffer, + struct iov_iter *from) { - int error; + int copied; if (!buffer->page) buffer->page = (char *)__get_free_pages(GFP_KERNEL, 0); if (!buffer->page) return -ENOMEM; - if (count >= SIMPLE_ATTR_SIZE) - count = SIMPLE_ATTR_SIZE - 1; - error = copy_from_user(buffer->page,buf,count); + copied = copy_from_iter(buffer->page, SIMPLE_ATTR_SIZE - 1, from); buffer->needs_read_fill = 1; /* if buf is assumed to contain a string, terminate it by \0, * so e.g. sscanf() can scan the string easily */ - buffer->page[count] = 0; - return error ? -EFAULT : count; + buffer->page[copied] = 0; + return copied ? : -EFAULT; } static int @@ -252,58 +203,36 @@ flush_write_buffer(struct file *file, struct configfs_buffer *buffer, size_t cou } -/** - * configfs_write_file - write an attribute. - * @file: file pointer - * @buf: data to write - * @count: number of bytes - * @ppos: starting offset - * - * Similar to configfs_read_file(), though working in the opposite direction. - * We allocate and fill the data from the user in fill_write_buffer(), - * then push it to the config_item in flush_write_buffer(). - * There is no easy way for us to know if userspace is only doing a partial - * write, so we don't support them. We expect the entire buffer to come - * on the first write. - * Hint: if you're writing a value, first read the file, modify only - * the value you're changing, then write entire buffer back. +/* + * There is no easy way for us to know if userspace is only doing a partial + * write, so we don't support them. We expect the entire buffer to come on the + * first write. + * Hint: if you're writing a value, first read the file, modify only the value + * you're changing, then write entire buffer back. 
*/ - -static ssize_t -configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) +static ssize_t configfs_write_iter(struct kiocb *iocb, struct iov_iter *from) { + struct file *file = iocb->ki_filp; struct configfs_buffer *buffer = file->private_data; ssize_t len; mutex_lock(&buffer->mutex); - len = fill_write_buffer(buffer, buf, count); + len = fill_write_buffer(buffer, from); if (len > 0) len = flush_write_buffer(file, buffer, len); if (len > 0) - *ppos += len; + iocb->ki_pos += len; mutex_unlock(&buffer->mutex); return len; } -/** - * configfs_write_bin_file - write a binary attribute. - * @file: file pointer - * @buf: data to write - * @count: number of bytes - * @ppos: starting offset - * - * Writing to a binary attribute file is similar to a normal read. - * We buffer the consecutive writes (binary attribute files do not - * support lseek) in a continuously growing buffer, but we don't - * commit until the close of the file. - */ - -static ssize_t -configfs_write_bin_file(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t configfs_bin_write_iter(struct kiocb *iocb, + struct iov_iter *from) { + struct file *file = iocb->ki_filp; struct configfs_buffer *buffer = file->private_data; void *tbuf = NULL; + size_t end_offset; ssize_t len; mutex_lock(&buffer->mutex); @@ -316,15 +245,14 @@ configfs_write_bin_file(struct file *file, const char __user *buf, buffer->write_in_progress = true; /* buffer grows? 
*/ - if (*ppos + count > buffer->bin_buffer_size) { - - if (buffer->cb_max_size && - *ppos + count > buffer->cb_max_size) { + end_offset = iocb->ki_pos + iov_iter_count(from); + if (end_offset > buffer->bin_buffer_size) { + if (buffer->cb_max_size && end_offset > buffer->cb_max_size) { len = -EFBIG; goto out; } - tbuf = vmalloc(*ppos + count); + tbuf = vmalloc(end_offset); if (tbuf == NULL) { len = -ENOMEM; goto out; @@ -339,16 +267,15 @@ configfs_write_bin_file(struct file *file, const char __user *buf, /* clear the new area */ memset(tbuf + buffer->bin_buffer_size, 0, - *ppos + count - buffer->bin_buffer_size); + end_offset - buffer->bin_buffer_size); buffer->bin_buffer = tbuf; - buffer->bin_buffer_size = *ppos + count; + buffer->bin_buffer_size = end_offset; } - len = simple_write_to_buffer(buffer->bin_buffer, - buffer->bin_buffer_size, ppos, buf, count); + len = copy_from_iter(buffer->bin_buffer, buffer->bin_buffer_size, from); out: mutex_unlock(&buffer->mutex); - return len; + return len ? 
: -EFAULT; } static int __configfs_open_file(struct inode *inode, struct file *file, int type) @@ -466,11 +393,8 @@ static int configfs_release_bin_file(struct inode *inode, struct file *file) { struct configfs_buffer *buffer = file->private_data; - buffer->read_in_progress = false; - if (buffer->write_in_progress) { struct configfs_fragment *frag = to_frag(file); - buffer->write_in_progress = false; down_read(&frag->frag_sem); if (!frag->frag_dead) { @@ -480,29 +404,26 @@ static int configfs_release_bin_file(struct inode *inode, struct file *file) buffer->bin_buffer_size); } up_read(&frag->frag_sem); - /* vfree on NULL is safe */ - vfree(buffer->bin_buffer); - buffer->bin_buffer = NULL; - buffer->bin_buffer_size = 0; - buffer->needs_read_fill = 1; } + vfree(buffer->bin_buffer); + configfs_release(inode, file); return 0; } const struct file_operations configfs_file_operations = { - .read = configfs_read_file, - .write = configfs_write_file, + .read_iter = configfs_read_iter, + .write_iter = configfs_write_iter, .llseek = generic_file_llseek, .open = configfs_open_file, .release = configfs_release, }; const struct file_operations configfs_bin_file_operations = { - .read = configfs_read_bin_file, - .write = configfs_write_bin_file, + .read_iter = configfs_bin_read_iter, + .write_iter = configfs_bin_write_iter, .llseek = NULL, /* bin file is not seekable */ .open = configfs_open_bin_file, .release = configfs_release_bin_file, @@ -532,7 +453,7 @@ int configfs_create_file(struct config_item * item, const struct configfs_attrib /** * configfs_create_bin_file - create a binary attribute file for an item. * @item: item we're creating for. - * @attr: atrribute descriptor. + * @bin_attr: atrribute descriptor. 
*/ int configfs_create_bin_file(struct config_item *item, diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index eb5ec3e46283..b601610e9907 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -28,12 +28,6 @@ static struct lock_class_key default_group_class[MAX_LOCK_DEPTH]; #endif -static const struct address_space_operations configfs_aops = { - .readpage = simple_readpage, - .write_begin = simple_write_begin, - .write_end = simple_write_end, -}; - static const struct inode_operations configfs_inode_operations ={ .setattr = configfs_setattr, }; @@ -114,7 +108,7 @@ struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd, struct inode * inode = new_inode(s); if (inode) { inode->i_ino = get_next_ino(); - inode->i_mapping->a_ops = &configfs_aops; + inode->i_mapping->a_ops = &ram_aops; inode->i_op = &configfs_inode_operations; if (sd->s_iattr) { diff --git a/fs/coredump.c b/fs/coredump.c index 2868e3e171ae..07afb5ddb1c4 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -519,7 +519,7 @@ static bool dump_interrupted(void) * but then we need to teach dump_write() to restart and clear * TIF_SIGPENDING. 
*/ - return signal_pending(current); + return fatal_signal_pending(current) || freezing(current); } static void wait_for_dump_helpers(struct file *file) @@ -755,8 +755,8 @@ void do_coredump(const kernel_siginfo_t *siginfo) task_lock(&init_task); get_fs_root(init_task.fs, &root); task_unlock(&init_task); - cprm.file = file_open_root(root.dentry, root.mnt, - cn.corename, open_flags, 0600); + cprm.file = file_open_root(&root, cn.corename, + open_flags, 0600); path_put(&root); } else { cprm.file = filp_open(cn.corename, open_flags, 0600); diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 6ca7d16593ff..d00455440d08 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -344,13 +344,9 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode, offsetof(struct fscrypt_nokey_name, sha256)); BUILD_BUG_ON(BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX) > NAME_MAX); - if (hash) { - nokey_name.dirhash[0] = hash; - nokey_name.dirhash[1] = minor_hash; - } else { - nokey_name.dirhash[0] = 0; - nokey_name.dirhash[1] = 0; - } + nokey_name.dirhash[0] = hash; + nokey_name.dirhash[1] = minor_hash; + if (iname->len <= sizeof(nokey_name.bytes)) { memcpy(nokey_name.bytes, iname->name, iname->len); size = offsetof(struct fscrypt_nokey_name, bytes[iname->len]); diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 261293fb7097..bca9c6658a7c 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -210,15 +210,40 @@ out_unlock: return err; } +/* + * Derive a SipHash key from the given fscrypt master key and the given + * application-specific information string. + * + * Note that the KDF produces a byte array, but the SipHash APIs expect the key + * as a pair of 64-bit words. Therefore, on big endian CPUs we have to do an + * endianness swap in order to get the same results as on little endian CPUs. 
+ */ +static int fscrypt_derive_siphash_key(const struct fscrypt_master_key *mk, + u8 context, const u8 *info, + unsigned int infolen, siphash_key_t *key) +{ + int err; + + err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, context, info, infolen, + (u8 *)key, sizeof(*key)); + if (err) + return err; + + BUILD_BUG_ON(sizeof(*key) != 16); + BUILD_BUG_ON(ARRAY_SIZE(key->key) != 2); + le64_to_cpus(&key->key[0]); + le64_to_cpus(&key->key[1]); + return 0; +} + int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, const struct fscrypt_master_key *mk) { int err; - err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, HKDF_CONTEXT_DIRHASH_KEY, - ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, - (u8 *)&ci->ci_dirhash_key, - sizeof(ci->ci_dirhash_key)); + err = fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_DIRHASH_KEY, + ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, + &ci->ci_dirhash_key); if (err) return err; ci->ci_dirhash_key_initialized = true; @@ -253,10 +278,9 @@ static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_info *ci, if (mk->mk_ino_hash_key_initialized) goto unlock; - err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, - HKDF_CONTEXT_INODE_HASH_KEY, NULL, 0, - (u8 *)&mk->mk_ino_hash_key, - sizeof(mk->mk_ino_hash_key)); + err = fscrypt_derive_siphash_key(mk, + HKDF_CONTEXT_INODE_HASH_KEY, + NULL, 0, &mk->mk_ino_hash_key); if (err) goto unlock; /* pairs with smp_load_acquire() above */ diff --git a/fs/d_path.c b/fs/d_path.c index 270d62133996..23a53f7b5c71 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -8,14 +8,27 @@ #include <linux/prefetch.h> #include "mount.h" -static int prepend(char **buffer, int *buflen, const char *str, int namelen) +struct prepend_buffer { + char *buf; + int len; +}; +#define DECLARE_BUFFER(__name, __buf, __len) \ + struct prepend_buffer __name = {.buf = __buf + __len, .len = __len} + +static char *extract_string(struct prepend_buffer *p) { - *buflen -= namelen; - if (*buflen < 0) - return -ENAMETOOLONG; - *buffer -= namelen; - memcpy(*buffer, str, namelen); - 
return 0; + if (likely(p->len >= 0)) + return p->buf; + return ERR_PTR(-ENAMETOOLONG); +} + +static void prepend(struct prepend_buffer *p, const char *str, int namelen) +{ + p->len -= namelen; + if (likely(p->len >= 0)) { + p->buf -= namelen; + memcpy(p->buf, str, namelen); + } } /** @@ -35,22 +48,58 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen) * * Load acquire is needed to make sure that we see that terminating NUL. */ -static int prepend_name(char **buffer, int *buflen, const struct qstr *name) +static bool prepend_name(struct prepend_buffer *p, const struct qstr *name) { const char *dname = smp_load_acquire(&name->name); /* ^^^ */ u32 dlen = READ_ONCE(name->len); - char *p; + char *s; - *buflen -= dlen + 1; - if (*buflen < 0) - return -ENAMETOOLONG; - p = *buffer -= dlen + 1; - *p++ = '/'; + p->len -= dlen + 1; + if (unlikely(p->len < 0)) + return false; + s = p->buf -= dlen + 1; + *s++ = '/'; while (dlen--) { char c = *dname++; if (!c) break; - *p++ = c; + *s++ = c; + } + return true; +} + +static int __prepend_path(const struct dentry *dentry, const struct mount *mnt, + const struct path *root, struct prepend_buffer *p) +{ + while (dentry != root->dentry || &mnt->mnt != root->mnt) { + const struct dentry *parent = READ_ONCE(dentry->d_parent); + + if (dentry == mnt->mnt.mnt_root) { + struct mount *m = READ_ONCE(mnt->mnt_parent); + struct mnt_namespace *mnt_ns; + + if (likely(mnt != m)) { + dentry = READ_ONCE(mnt->mnt_mountpoint); + mnt = m; + continue; + } + /* Global root */ + mnt_ns = READ_ONCE(mnt->mnt_ns); + /* open-coded is_mounted() to use local mnt_ns */ + if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns)) + return 1; // absolute root + else + return 2; // detached or not attached yet + } + + if (unlikely(dentry == parent)) + /* Escaped? 
*/ + return 3; + + prefetch(parent); + if (!prepend_name(p, &dentry->d_name)) + break; + dentry = parent; } return 0; } @@ -74,15 +123,11 @@ static int prepend_name(char **buffer, int *buflen, const struct qstr *name) */ static int prepend_path(const struct path *path, const struct path *root, - char **buffer, int *buflen) + struct prepend_buffer *p) { - struct dentry *dentry; - struct vfsmount *vfsmnt; - struct mount *mnt; - int error = 0; unsigned seq, m_seq = 0; - char *bptr; - int blen; + struct prepend_buffer b; + int error; rcu_read_lock(); restart_mnt: @@ -90,50 +135,9 @@ restart_mnt: seq = 0; rcu_read_lock(); restart: - bptr = *buffer; - blen = *buflen; - error = 0; - dentry = path->dentry; - vfsmnt = path->mnt; - mnt = real_mount(vfsmnt); + b = *p; read_seqbegin_or_lock(&rename_lock, &seq); - while (dentry != root->dentry || vfsmnt != root->mnt) { - struct dentry * parent; - - if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { - struct mount *parent = READ_ONCE(mnt->mnt_parent); - struct mnt_namespace *mnt_ns; - - /* Escaped? */ - if (dentry != vfsmnt->mnt_root) { - bptr = *buffer; - blen = *buflen; - error = 3; - break; - } - /* Global root? 
*/ - if (mnt != parent) { - dentry = READ_ONCE(mnt->mnt_mountpoint); - mnt = parent; - vfsmnt = &mnt->mnt; - continue; - } - mnt_ns = READ_ONCE(mnt->mnt_ns); - /* open-coded is_mounted() to use local mnt_ns */ - if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns)) - error = 1; // absolute root - else - error = 2; // detached or not attached yet - break; - } - parent = dentry->d_parent; - prefetch(parent); - error = prepend_name(&bptr, &blen, &dentry->d_name); - if (error) - break; - - dentry = parent; - } + error = __prepend_path(path->dentry, real_mount(path->mnt), root, &b); if (!(seq & 1)) rcu_read_unlock(); if (need_seqretry(&rename_lock, seq)) { @@ -150,14 +154,13 @@ restart: } done_seqretry(&mount_lock, m_seq); - if (error >= 0 && bptr == *buffer) { - if (--blen < 0) - error = -ENAMETOOLONG; - else - *--bptr = '/'; - } - *buffer = bptr; - *buflen = blen; + if (unlikely(error == 3)) + b = *p; + + if (b.len == p->len) + prepend(&b, "/", 1); + + *p = b; return error; } @@ -181,56 +184,24 @@ char *__d_path(const struct path *path, const struct path *root, char *buf, int buflen) { - char *res = buf + buflen; - int error; - - prepend(&res, &buflen, "\0", 1); - error = prepend_path(path, root, &res, &buflen); + DECLARE_BUFFER(b, buf, buflen); - if (error < 0) - return ERR_PTR(error); - if (error > 0) + prepend(&b, "", 1); + if (unlikely(prepend_path(path, root, &b) > 0)) return NULL; - return res; + return extract_string(&b); } char *d_absolute_path(const struct path *path, char *buf, int buflen) { struct path root = {}; - char *res = buf + buflen; - int error; - - prepend(&res, &buflen, "\0", 1); - error = prepend_path(path, &root, &res, &buflen); - - if (error > 1) - error = -EINVAL; - if (error < 0) - return ERR_PTR(error); - return res; -} - -/* - * same as __d_path but appends "(deleted)" for unlinked files. 
- */ -static int path_with_deleted(const struct path *path, - const struct path *root, - char **buf, int *buflen) -{ - prepend(buf, buflen, "\0", 1); - if (d_unlinked(path->dentry)) { - int error = prepend(buf, buflen, " (deleted)", 10); - if (error) - return error; - } - - return prepend_path(path, root, buf, buflen); -} + DECLARE_BUFFER(b, buf, buflen); -static int prepend_unreachable(char **buffer, int *buflen) -{ - return prepend(buffer, buflen, "(unreachable)", 13); + prepend(&b, "", 1); + if (unlikely(prepend_path(path, &root, &b) > 1)) + return ERR_PTR(-EINVAL); + return extract_string(&b); } static void get_fs_root_rcu(struct fs_struct *fs, struct path *root) @@ -261,9 +232,8 @@ static void get_fs_root_rcu(struct fs_struct *fs, struct path *root) */ char *d_path(const struct path *path, char *buf, int buflen) { - char *res = buf + buflen; + DECLARE_BUFFER(b, buf, buflen); struct path root; - int error; /* * We have various synthetic filesystems that never get mounted. On @@ -282,12 +252,14 @@ char *d_path(const struct path *path, char *buf, int buflen) rcu_read_lock(); get_fs_root_rcu(current->fs, &root); - error = path_with_deleted(path, &root, &res, &buflen); + if (unlikely(d_unlinked(path->dentry))) + prepend(&b, " (deleted)", 11); + else + prepend(&b, "", 1); + prepend_path(path, &root, &b); rcu_read_unlock(); - if (error < 0) - res = ERR_PTR(error); - return res; + return extract_string(&b); } EXPORT_SYMBOL(d_path); @@ -314,47 +286,34 @@ char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen, char *simple_dname(struct dentry *dentry, char *buffer, int buflen) { - char *end = buffer + buflen; + DECLARE_BUFFER(b, buffer, buflen); /* these dentries are never renamed, so d_lock is not needed */ - if (prepend(&end, &buflen, " (deleted)", 11) || - prepend(&end, &buflen, dentry->d_name.name, dentry->d_name.len) || - prepend(&end, &buflen, "/", 1)) - end = ERR_PTR(-ENAMETOOLONG); - return end; + prepend(&b, " (deleted)", 11); + prepend(&b, 
dentry->d_name.name, dentry->d_name.len); + prepend(&b, "/", 1); + return extract_string(&b); } /* * Write full pathname from the root of the filesystem into the buffer. */ -static char *__dentry_path(const struct dentry *d, char *buf, int buflen) +static char *__dentry_path(const struct dentry *d, struct prepend_buffer *p) { const struct dentry *dentry; - char *end, *retval; - int len, seq = 0; - int error = 0; - - if (buflen < 2) - goto Elong; + struct prepend_buffer b; + int seq = 0; rcu_read_lock(); restart: dentry = d; - end = buf + buflen; - len = buflen; - prepend(&end, &len, "\0", 1); - /* Get '/' right */ - retval = end-1; - *retval = '/'; + b = *p; read_seqbegin_or_lock(&rename_lock, &seq); while (!IS_ROOT(dentry)) { const struct dentry *parent = dentry->d_parent; prefetch(parent); - error = prepend_name(&end, &len, &dentry->d_name); - if (error) + if (!prepend_name(&b, &dentry->d_name)) break; - - retval = end; dentry = parent; } if (!(seq & 1)) @@ -364,36 +323,29 @@ restart: goto restart; } done_seqretry(&rename_lock, seq); - if (error) - goto Elong; - return retval; -Elong: - return ERR_PTR(-ENAMETOOLONG); + if (b.len == p->len) + prepend(&b, "/", 1); + return extract_string(&b); } char *dentry_path_raw(const struct dentry *dentry, char *buf, int buflen) { - return __dentry_path(dentry, buf, buflen); + DECLARE_BUFFER(b, buf, buflen); + + prepend(&b, "", 1); + return __dentry_path(dentry, &b); } EXPORT_SYMBOL(dentry_path_raw); char *dentry_path(const struct dentry *dentry, char *buf, int buflen) { - char *p = NULL; - char *retval; - - if (d_unlinked(dentry)) { - p = buf + buflen; - if (prepend(&p, &buflen, "//deleted", 10) != 0) - goto Elong; - buflen++; - } - retval = __dentry_path(dentry, buf, buflen); - if (!IS_ERR(retval) && p) - *p = '/'; /* restore '/' overriden with '\0' */ - return retval; -Elong: - return ERR_PTR(-ENAMETOOLONG); + DECLARE_BUFFER(b, buf, buflen); + + if (unlikely(d_unlinked(dentry))) + prepend(&b, "//deleted", 10); + else + 
prepend(&b, "", 1); + return __dentry_path(dentry, &b); } static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root, @@ -438,38 +390,28 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) rcu_read_lock(); get_fs_root_and_pwd_rcu(current->fs, &root, &pwd); - error = -ENOENT; - if (!d_unlinked(pwd.dentry)) { - unsigned long len; - char *cwd = page + PATH_MAX; - int buflen = PATH_MAX; - - prepend(&cwd, &buflen, "\0", 1); - error = prepend_path(&pwd, &root, &cwd, &buflen); + if (unlikely(d_unlinked(pwd.dentry))) { rcu_read_unlock(); + error = -ENOENT; + } else { + unsigned len; + DECLARE_BUFFER(b, page, PATH_MAX); - if (error < 0) - goto out; - - /* Unreachable from current root */ - if (error > 0) { - error = prepend_unreachable(&cwd, &buflen); - if (error) - goto out; - } + prepend(&b, "", 1); + if (unlikely(prepend_path(&pwd, &root, &b) > 0)) + prepend(&b, "(unreachable)", 13); + rcu_read_unlock(); - error = -ERANGE; - len = PATH_MAX + page - cwd; - if (len <= size) { + len = PATH_MAX - b.len; + if (unlikely(len > PATH_MAX)) + error = -ENAMETOOLONG; + else if (unlikely(len > size)) + error = -ERANGE; + else if (copy_to_user(buf, b.buf, len)) + error = -EFAULT; + else error = len; - if (copy_to_user(buf, cwd, len)) - error = -EFAULT; - } - } else { - rcu_read_unlock(); } - -out: __putname(page); return error; } @@ -488,10 +488,11 @@ static void *grab_mapping_entry(struct xa_state *xas, struct address_space *mapping, unsigned int order) { unsigned long index = xas->xa_index; - bool pmd_downgrade = false; /* splitting PMD entry into PTE entries? */ + bool pmd_downgrade; /* splitting PMD entry into PTE entries? 
*/ void *entry; retry: + pmd_downgrade = false; xas_lock_irq(xas); entry = get_unlocked_entry(xas, order); diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index e813acfaa6e8..df00231d3ecc 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -582,22 +582,12 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n"); * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. - * - * This function will return a pointer to a dentry if it succeeds. This - * pointer must be passed to the debugfs_remove() function when the file is - * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be - * returned. - * - * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will - * be returned. */ -struct dentry *debugfs_create_ulong(const char *name, umode_t mode, - struct dentry *parent, unsigned long *value) +void debugfs_create_ulong(const char *name, umode_t mode, struct dentry *parent, + unsigned long *value) { - return debugfs_create_mode_unsafe(name, mode, parent, value, - &fops_ulong, &fops_ulong_ro, - &fops_ulong_wo); + debugfs_create_mode_unsafe(name, mode, parent, value, &fops_ulong, + &fops_ulong_ro, &fops_ulong_wo); } EXPORT_SYMBOL_GPL(debugfs_create_ulong); @@ -846,20 +836,11 @@ static const struct file_operations fops_bool_wo = { * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. - * - * This function will return a pointer to a dentry if it succeeds. This - * pointer must be passed to the debugfs_remove() function when the file is - * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) 
If an error occurs, ERR_PTR(-ERROR) will be - * returned. - * - * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will - * be returned. */ -struct dentry *debugfs_create_bool(const char *name, umode_t mode, - struct dentry *parent, bool *value) +void debugfs_create_bool(const char *name, umode_t mode, struct dentry *parent, + bool *value) { - return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_bool, + debugfs_create_mode_unsafe(name, mode, parent, value, &fops_bool, &fops_bool_ro, &fops_bool_wo); } EXPORT_SYMBOL_GPL(debugfs_create_bool); @@ -893,7 +874,7 @@ ssize_t debugfs_read_file_str(struct file *file, char __user *user_buf, copy[copy_len] = '\n'; - ret = simple_read_from_buffer(user_buf, count, ppos, copy, copy_len); + ret = simple_read_from_buffer(user_buf, count, ppos, copy, len); kfree(copy); return ret; @@ -980,7 +961,8 @@ static const struct file_operations fops_blob = { /** * debugfs_create_blob - create a debugfs file that is used to read a binary blob * @name: a pointer to a string containing the name of the file to create. - * @mode: the permission that the file should have + * @mode: the read permission that the file should have (other permissions are + * masked out) * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. 
@@ -1004,7 +986,7 @@ struct dentry *debugfs_create_blob(const char *name, umode_t mode, struct dentry *parent, struct debugfs_blob_wrapper *blob) { - return debugfs_create_file_unsafe(name, mode, parent, blob, &fops_blob); + return debugfs_create_file_unsafe(name, mode & 0444, parent, blob, &fops_blob); } EXPORT_SYMBOL_GPL(debugfs_create_blob); diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 88d95d96e36c..42eee2783756 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -20,6 +20,7 @@ #include <net/sock.h> #include "config.h" +#include "midcomms.h" #include "lowcomms.h" /* @@ -79,6 +80,9 @@ struct dlm_cluster { unsigned int cl_new_rsb_count; unsigned int cl_recover_callbacks; char cl_cluster_name[DLM_LOCKSPACE_LEN]; + + struct dlm_spaces *sps; + struct dlm_comms *cms; }; static struct dlm_cluster *config_item_to_cluster(struct config_item *i) @@ -204,7 +208,7 @@ static int dlm_check_zero(unsigned int x) static int dlm_check_buffer_size(unsigned int x) { - if (x < DEFAULT_BUFFER_SIZE) + if (x < DLM_MAX_SOCKET_BUFSIZE) return -EINVAL; return 0; @@ -409,6 +413,9 @@ static struct config_group *make_cluster(struct config_group *g, if (!cl || !sps || !cms) goto fail; + cl->sps = sps; + cl->cms = cms; + config_group_init_type_name(&cl->group, name, &cluster_type); config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type); config_group_init_type_name(&cms->cs_group, "comms", &comms_type); @@ -458,6 +465,9 @@ static void drop_cluster(struct config_group *g, struct config_item *i) static void release_cluster(struct config_item *i) { struct dlm_cluster *cl = config_item_to_cluster(i); + + kfree(cl->sps); + kfree(cl->cms); kfree(cl); } @@ -532,7 +542,7 @@ static void drop_comm(struct config_group *g, struct config_item *i) struct dlm_comm *cm = config_item_to_comm(i); if (local_comm == cm) local_comm = NULL; - dlm_lowcomms_close(cm->nodeid); + dlm_midcomms_close(cm->nodeid); while (cm->addr_count--) kfree(cm->addr[cm->addr_count]); config_item_put(i); @@ -942,7 
+952,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) #define DEFAULT_SCAN_SECS 5 #define DEFAULT_LOG_DEBUG 0 #define DEFAULT_LOG_INFO 1 -#define DEFAULT_PROTOCOL 0 +#define DEFAULT_PROTOCOL DLM_PROTO_TCP #define DEFAULT_MARK 0 #define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ #define DEFAULT_WAITWARN_US 0 @@ -952,7 +962,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) struct dlm_config_info dlm_config = { .ci_tcp_port = DEFAULT_TCP_PORT, - .ci_buffer_size = DEFAULT_BUFFER_SIZE, + .ci_buffer_size = DLM_MAX_SOCKET_BUFSIZE, .ci_rsbtbl_size = DEFAULT_RSBTBL_SIZE, .ci_recover_timer = DEFAULT_RECOVER_TIMER, .ci_toss_secs = DEFAULT_TOSS_SECS, diff --git a/fs/dlm/config.h b/fs/dlm/config.h index d2cd4bd20313..df92b0a07fc6 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -12,7 +12,7 @@ #ifndef __CONFIG_DOT_H__ #define __CONFIG_DOT_H__ -#define DEFAULT_BUFFER_SIZE 4096 +#define DLM_MAX_SOCKET_BUFSIZE 4096 struct dlm_config_node { int nodeid; @@ -23,6 +23,9 @@ struct dlm_config_node { #define DLM_MAX_ADDR_COUNT 3 +#define DLM_PROTO_TCP 0 +#define DLM_PROTO_SCTP 1 + struct dlm_config_info { int ci_tcp_port; int ci_buffer_size; diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index d5bd990bcab8..47e9d57e4cae 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -16,6 +16,7 @@ #include <linux/slab.h> #include "dlm_internal.h" +#include "midcomms.h" #include "lock.h" #define DLM_DEBUG_BUF_LEN 4096 @@ -23,6 +24,7 @@ static char debug_buf[DLM_DEBUG_BUF_LEN]; static struct mutex debug_buf_lock; static struct dentry *dlm_root; +static struct dentry *dlm_comms; static char *print_lockmode(int mode) { @@ -738,6 +740,57 @@ void dlm_delete_debug_file(struct dlm_ls *ls) debugfs_remove(ls->ls_debug_toss_dentry); } +static int dlm_state_show(struct seq_file *file, void *offset) +{ + seq_printf(file, "%s\n", dlm_midcomms_state(file->private)); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(dlm_state); + +static int dlm_flags_show(struct seq_file 
*file, void *offset) +{ + seq_printf(file, "%lu\n", dlm_midcomms_flags(file->private)); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(dlm_flags); + +static int dlm_send_queue_cnt_show(struct seq_file *file, void *offset) +{ + seq_printf(file, "%d\n", dlm_midcomms_send_queue_cnt(file->private)); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(dlm_send_queue_cnt); + +static int dlm_version_show(struct seq_file *file, void *offset) +{ + seq_printf(file, "0x%08x\n", dlm_midcomms_version(file->private)); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(dlm_version); + +void *dlm_create_debug_comms_file(int nodeid, void *data) +{ + struct dentry *d_node; + char name[256]; + + memset(name, 0, sizeof(name)); + snprintf(name, 256, "%d", nodeid); + + d_node = debugfs_create_dir(name, dlm_comms); + debugfs_create_file("state", 0444, d_node, data, &dlm_state_fops); + debugfs_create_file("flags", 0444, d_node, data, &dlm_flags_fops); + debugfs_create_file("send_queue_count", 0444, d_node, data, + &dlm_send_queue_cnt_fops); + debugfs_create_file("version", 0444, d_node, data, &dlm_version_fops); + + return d_node; +} + +void dlm_delete_debug_comms_file(void *ctx) +{ + debugfs_remove(ctx); +} + void dlm_create_debug_file(struct dlm_ls *ls) { char name[DLM_LOCKSPACE_LEN + 8]; @@ -797,6 +850,7 @@ void __init dlm_register_debugfs(void) { mutex_init(&debug_buf_lock); dlm_root = debugfs_create_dir("dlm", NULL); + dlm_comms = debugfs_create_dir("comms", dlm_root); } void dlm_unregister_debugfs(void) diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 04fe9f525ac7..91d1ca3a121a 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -57,9 +57,12 @@ struct dlm_header; struct dlm_message; struct dlm_rcom; struct dlm_mhandle; +struct dlm_msg; #define log_print(fmt, args...) \ printk(KERN_ERR "dlm: "fmt"\n" , ##args) +#define log_print_ratelimited(fmt, args...) \ + printk_ratelimited(KERN_ERR "dlm: "fmt"\n", ##args) #define log_error(ls, fmt, args...) 
\ printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args) @@ -368,23 +371,33 @@ static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag) /* dlm_header is first element of all structs sent between nodes */ #define DLM_HEADER_MAJOR 0x00030000 -#define DLM_HEADER_MINOR 0x00000001 +#define DLM_HEADER_MINOR 0x00000002 + +#define DLM_VERSION_3_1 0x00030001 +#define DLM_VERSION_3_2 0x00030002 #define DLM_HEADER_SLOTS 0x00000001 #define DLM_MSG 1 #define DLM_RCOM 2 +#define DLM_OPTS 3 +#define DLM_ACK 4 +#define DLM_FIN 5 struct dlm_header { uint32_t h_version; - uint32_t h_lockspace; + union { + /* for DLM_MSG and DLM_RCOM */ + uint32_t h_lockspace; + /* for DLM_ACK and DLM_OPTS */ + uint32_t h_seq; + } u; uint32_t h_nodeid; /* nodeid of sender */ uint16_t h_length; uint8_t h_cmd; /* DLM_MSG, DLM_RCOM */ uint8_t h_pad; }; - #define DLM_MSG_REQUEST 1 #define DLM_MSG_CONVERT 2 #define DLM_MSG_UNLOCK 3 @@ -452,10 +465,29 @@ struct dlm_rcom { char rc_buf[]; }; +struct dlm_opt_header { + uint16_t t_type; + uint16_t t_length; + uint32_t o_pad; + /* need to be 8 byte aligned */ + char t_value[]; +}; + +/* encapsulation header */ +struct dlm_opts { + struct dlm_header o_header; + uint8_t o_nextcmd; + uint8_t o_pad; + uint16_t o_optlen; + uint32_t o_pad2; + char o_opts[]; +}; + union dlm_packet { struct dlm_header header; /* common to other two */ struct dlm_message message; struct dlm_rcom rcom; + struct dlm_opts opts; }; #define DLM_RSF_NEED_SLOTS 0x00000001 @@ -722,11 +754,15 @@ void dlm_register_debugfs(void); void dlm_unregister_debugfs(void); void dlm_create_debug_file(struct dlm_ls *ls); void dlm_delete_debug_file(struct dlm_ls *ls); +void *dlm_create_debug_comms_file(int nodeid, void *data); +void dlm_delete_debug_comms_file(void *ctx); #else static inline void dlm_register_debugfs(void) { } static inline void dlm_unregister_debugfs(void) { } static inline void dlm_create_debug_file(struct dlm_ls *ls) { } static inline void dlm_delete_debug_file(struct 
dlm_ls *ls) { } +static inline void *dlm_create_debug_comms_file(int nodeid, void *data) { return NULL; } +static inline void dlm_delete_debug_comms_file(void *ctx) { } #endif #endif /* __DLM_INTERNAL_DOT_H__ */ diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index b93df39d0915..c502c065d007 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -59,7 +59,7 @@ #include "dlm_internal.h" #include <linux/dlm_device.h> #include "memory.h" -#include "lowcomms.h" +#include "midcomms.h" #include "requestqueue.h" #include "util.h" #include "dir.h" @@ -3534,17 +3534,17 @@ static int _create_message(struct dlm_ls *ls, int mb_len, char *mb; /* get_buffer gives us a message handle (mh) that we need to - pass into lowcomms_commit and a message buffer (mb) that we + pass into midcomms_commit and a message buffer (mb) that we write our data into */ - mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb); + mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, GFP_NOFS, &mb); if (!mh) return -ENOBUFS; ms = (struct dlm_message *) mb; ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); - ms->m_header.h_lockspace = ls->ls_global_id; + ms->m_header.u.h_lockspace = ls->ls_global_id; ms->m_header.h_nodeid = dlm_our_nodeid(); ms->m_header.h_length = mb_len; ms->m_header.h_cmd = DLM_MSG; @@ -3589,7 +3589,7 @@ static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb, static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms) { dlm_message_out(ms); - dlm_lowcomms_commit_buffer(mh); + dlm_midcomms_commit_mhandle(mh); return 0; } @@ -5038,16 +5038,16 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid) if (hd->h_nodeid != nodeid) { log_print("invalid h_nodeid %d from %d lockspace %x", - hd->h_nodeid, nodeid, hd->h_lockspace); + hd->h_nodeid, nodeid, hd->u.h_lockspace); return; } - ls = dlm_find_lockspace_global(hd->h_lockspace); + ls = dlm_find_lockspace_global(hd->u.h_lockspace); if (!ls) { if (dlm_config.ci_log_debug) { printk_ratelimited(KERN_DEBUG "dlm: 
invalid lockspace " "%u from %d cmd %d type %d\n", - hd->h_lockspace, nodeid, hd->h_cmd, type); + hd->u.h_lockspace, nodeid, hd->h_cmd, type); } if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS) diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index c14cf2b7faab..d71aba8c3e64 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -16,6 +16,7 @@ #include "member.h" #include "recoverd.h" #include "dir.h" +#include "midcomms.h" #include "lowcomms.h" #include "config.h" #include "memory.h" @@ -390,7 +391,7 @@ static int threads_start(void) } /* Thread for sending/receiving messages for all lockspace's */ - error = dlm_lowcomms_start(); + error = dlm_midcomms_start(); if (error) { log_print("cannot start dlm lowcomms %d", error); goto scand_fail; @@ -566,7 +567,12 @@ static int new_lockspace(const char *name, const char *cluster, mutex_init(&ls->ls_requestqueue_mutex); mutex_init(&ls->ls_clear_proc_locks); - ls->ls_recover_buf = kmalloc(LOWCOMMS_MAX_TX_BUFFER_LEN, GFP_NOFS); + /* Due backwards compatibility with 3.1 we need to use maximum + * possible dlm message size to be sure the message will fit and + * not having out of bounds issues. However on sending side 3.2 + * might send less. 
+ */ + ls->ls_recover_buf = kmalloc(DLM_MAX_SOCKET_BUFSIZE, GFP_NOFS); if (!ls->ls_recover_buf) goto out_lkbidr; @@ -698,7 +704,7 @@ int dlm_new_lockspace(const char *name, const char *cluster, error = 0; if (!ls_count) { dlm_scand_stop(); - dlm_lowcomms_shutdown(); + dlm_midcomms_shutdown(); dlm_lowcomms_stop(); } out: @@ -787,7 +793,7 @@ static int release_lockspace(struct dlm_ls *ls, int force) if (ls_count == 1) { dlm_scand_stop(); - dlm_lowcomms_shutdown(); + dlm_midcomms_shutdown(); } dlm_callback_stop(ls); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 166e36fcf3e4..0ea9ae35da0b 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -59,7 +59,6 @@ #include "config.h" #define NEEDED_RMEM (4*1024*1024) -#define CONN_HASH_SIZE 32 /* Number of messages to send before rescheduling */ #define MAX_SEND_MSG_COUNT 25 @@ -79,14 +78,20 @@ struct connection { #define CF_CLOSING 8 #define CF_SHUTDOWN 9 #define CF_CONNECTED 10 +#define CF_RECONNECT 11 +#define CF_DELAY_CONNECT 12 +#define CF_EOF 13 struct list_head writequeue; /* List of outgoing writequeue_entries */ spinlock_t writequeue_lock; + atomic_t writequeue_cnt; void (*connect_action) (struct connection *); /* What to do to connect */ void (*shutdown_action)(struct connection *con); /* What to do to shutdown */ + bool (*eof_condition)(struct connection *con); /* What to do to eof check */ int retries; #define MAX_CONNECT_RETRIES 3 struct hlist_node list; struct connection *othercon; + struct connection *sendcon; struct work_struct rwork; /* Receive workqueue */ struct work_struct swork; /* Send workqueue */ wait_queue_head_t shutdown_wait; /* wait for graceful shutdown */ @@ -113,7 +118,22 @@ struct writequeue_entry { int len; int end; int users; + bool dirty; struct connection *con; + struct list_head msgs; + struct kref ref; +}; + +struct dlm_msg { + struct writequeue_entry *entry; + struct dlm_msg *orig_msg; + bool retransmit; + void *ppc; + int len; + int idx; /* new()/commit() idx exchange */ + 
+ struct list_head list; + struct kref ref; }; struct dlm_node_addr { @@ -155,33 +175,23 @@ static void sctp_connect_to_sock(struct connection *con); static void tcp_connect_to_sock(struct connection *con); static void dlm_tcp_shutdown(struct connection *con); -/* This is deliberately very simple because most clusters have simple - sequential nodeids, so we should be able to go straight to a connection - struct in the array */ -static inline int nodeid_hash(int nodeid) +static struct connection *__find_con(int nodeid, int r) { - return nodeid & (CONN_HASH_SIZE-1); -} - -static struct connection *__find_con(int nodeid) -{ - int r, idx; struct connection *con; - r = nodeid_hash(nodeid); - - idx = srcu_read_lock(&connections_srcu); hlist_for_each_entry_rcu(con, &connection_hash[r], list) { - if (con->nodeid == nodeid) { - srcu_read_unlock(&connections_srcu, idx); + if (con->nodeid == nodeid) return con; - } } - srcu_read_unlock(&connections_srcu, idx); return NULL; } +static bool tcp_eof_condition(struct connection *con) +{ + return atomic_read(&con->writequeue_cnt); +} + static int dlm_con_init(struct connection *con, int nodeid) { con->rx_buflen = dlm_config.ci_buffer_size; @@ -193,15 +203,23 @@ static int dlm_con_init(struct connection *con, int nodeid) mutex_init(&con->sock_mutex); INIT_LIST_HEAD(&con->writequeue); spin_lock_init(&con->writequeue_lock); + atomic_set(&con->writequeue_cnt, 0); INIT_WORK(&con->swork, process_send_sockets); INIT_WORK(&con->rwork, process_recv_sockets); init_waitqueue_head(&con->shutdown_wait); - if (dlm_config.ci_protocol == 0) { + switch (dlm_config.ci_protocol) { + case DLM_PROTO_TCP: con->connect_action = tcp_connect_to_sock; con->shutdown_action = dlm_tcp_shutdown; - } else { + con->eof_condition = tcp_eof_condition; + break; + case DLM_PROTO_SCTP: con->connect_action = sctp_connect_to_sock; + break; + default: + kfree(con->rx_buf); + return -EINVAL; } return 0; @@ -216,7 +234,8 @@ static struct connection *nodeid2con(int nodeid, 
gfp_t alloc) struct connection *con, *tmp; int r, ret; - con = __find_con(nodeid); + r = nodeid_hash(nodeid); + con = __find_con(nodeid, r); if (con || !alloc) return con; @@ -230,8 +249,6 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) return NULL; } - r = nodeid_hash(nodeid); - spin_lock(&connections_lock); /* Because multiple workqueues/threads calls this function it can * race on multiple cpu's. Instead of locking hot path __find_con() @@ -239,7 +256,7 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) * under protection of connections_lock. If this is the case we * abort our connection creation and return the existing connection. */ - tmp = __find_con(nodeid); + tmp = __find_con(nodeid, r); if (tmp) { spin_unlock(&connections_lock); kfree(con->rx_buf); @@ -256,15 +273,13 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) /* Loop round all connections */ static void foreach_conn(void (*conn_func)(struct connection *c)) { - int i, idx; + int i; struct connection *con; - idx = srcu_read_lock(&connections_srcu); for (i = 0; i < CONN_HASH_SIZE; i++) { hlist_for_each_entry_rcu(con, &connection_hash[i], list) conn_func(con); } - srcu_read_unlock(&connections_srcu, idx); } static struct dlm_node_addr *find_node_addr(int nodeid) @@ -462,6 +477,9 @@ static void lowcomms_data_ready(struct sock *sk) static void lowcomms_listen_data_ready(struct sock *sk) { + if (!dlm_allow_conn) + return; + queue_work(recv_workqueue, &listen_con.rwork); } @@ -518,14 +536,21 @@ static void lowcomms_state_change(struct sock *sk) int dlm_lowcomms_connect_node(int nodeid) { struct connection *con; + int idx; if (nodeid == dlm_our_nodeid()) return 0; + idx = srcu_read_lock(&connections_srcu); con = nodeid2con(nodeid, GFP_NOFS); - if (!con) + if (!con) { + srcu_read_unlock(&connections_srcu, idx); return -ENOMEM; + } + lowcomms_connect_sock(con); + srcu_read_unlock(&connections_srcu, idx); + return 0; } @@ -587,6 +612,22 @@ static void 
lowcomms_error_report(struct sock *sk) dlm_config.ci_tcp_port, sk->sk_err, sk->sk_err_soft); } + + /* below sendcon only handling */ + if (test_bit(CF_IS_OTHERCON, &con->flags)) + con = con->sendcon; + + switch (sk->sk_err) { + case ECONNREFUSED: + set_bit(CF_DELAY_CONNECT, &con->flags); + break; + default: + break; + } + + if (!test_and_set_bit(CF_RECONNECT, &con->flags)) + queue_work(send_workqueue, &con->swork); + out: read_unlock_bh(&sk->sk_callback_lock); if (orig_report) @@ -669,6 +710,42 @@ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, memset((char *)saddr + *addr_len, 0, sizeof(struct sockaddr_storage) - *addr_len); } +static void dlm_page_release(struct kref *kref) +{ + struct writequeue_entry *e = container_of(kref, struct writequeue_entry, + ref); + + __free_page(e->page); + kfree(e); +} + +static void dlm_msg_release(struct kref *kref) +{ + struct dlm_msg *msg = container_of(kref, struct dlm_msg, ref); + + kref_put(&msg->entry->ref, dlm_page_release); + kfree(msg); +} + +static void free_entry(struct writequeue_entry *e) +{ + struct dlm_msg *msg, *tmp; + + list_for_each_entry_safe(msg, tmp, &e->msgs, list) { + if (msg->orig_msg) { + msg->orig_msg->retransmit = false; + kref_put(&msg->orig_msg->ref, dlm_msg_release); + } + + list_del(&msg->list); + kref_put(&msg->ref, dlm_msg_release); + } + + list_del(&e->list); + atomic_dec(&e->con->writequeue_cnt); + kref_put(&e->ref, dlm_page_release); +} + static void dlm_close_sock(struct socket **sock) { if (*sock) { @@ -683,6 +760,7 @@ static void close_connection(struct connection *con, bool and_other, bool tx, bool rx) { bool closing = test_and_set_bit(CF_CLOSING, &con->flags); + struct writequeue_entry *e; if (tx && !closing && cancel_work_sync(&con->swork)) { log_print("canceled swork for node %d", con->nodeid); @@ -698,12 +776,35 @@ static void close_connection(struct connection *con, bool and_other, if (con->othercon && and_other) { /* Will only re-enter once. 
*/ - close_connection(con->othercon, false, true, true); + close_connection(con->othercon, false, tx, rx); + } + + /* if we send a writequeue entry only a half way, we drop the + * whole entry because reconnection and that we not start of the + * middle of a msg which will confuse the other end. + * + * we can always drop messages because retransmits, but what we + * cannot allow is to transmit half messages which may be processed + * at the other side. + * + * our policy is to start on a clean state when disconnects, we don't + * know what's send/received on transport layer in this case. + */ + spin_lock(&con->writequeue_lock); + if (!list_empty(&con->writequeue)) { + e = list_first_entry(&con->writequeue, struct writequeue_entry, + list); + if (e->dirty) + free_entry(e); } + spin_unlock(&con->writequeue_lock); con->rx_leftover = 0; con->retries = 0; clear_bit(CF_CONNECTED, &con->flags); + clear_bit(CF_DELAY_CONNECT, &con->flags); + clear_bit(CF_RECONNECT, &con->flags); + clear_bit(CF_EOF, &con->flags); mutex_unlock(&con->sock_mutex); clear_bit(CF_CLOSING, &con->flags); } @@ -841,19 +942,26 @@ out_resched: return -EAGAIN; out_close: - mutex_unlock(&con->sock_mutex); - if (ret != -EAGAIN) { - /* Reconnect when there is something to send */ - close_connection(con, false, true, false); - if (ret == 0) { - log_print("connection %p got EOF from %d", - con, con->nodeid); + if (ret == 0) { + log_print("connection %p got EOF from %d", + con, con->nodeid); + + if (con->eof_condition && con->eof_condition(con)) { + set_bit(CF_EOF, &con->flags); + mutex_unlock(&con->sock_mutex); + } else { + mutex_unlock(&con->sock_mutex); + close_connection(con, false, true, false); + /* handling for tcp shutdown */ clear_bit(CF_SHUTDOWN, &con->flags); wake_up(&con->shutdown_wait); - /* signal to breaking receive worker */ - ret = -1; } + + /* signal to breaking receive worker */ + ret = -1; + } else { + mutex_unlock(&con->sock_mutex); } return ret; } @@ -864,16 +972,12 @@ static int 
accept_from_sock(struct listen_connection *con) int result; struct sockaddr_storage peeraddr; struct socket *newsock; - int len; + int len, idx; int nodeid; struct connection *newcon; struct connection *addcon; unsigned int mark; - if (!dlm_allow_conn) { - return -1; - } - if (!con->sock) return -ENOTCONN; @@ -907,8 +1011,10 @@ static int accept_from_sock(struct listen_connection *con) * the same time and the connections cross on the wire. * In this case we store the incoming one in "othercon" */ + idx = srcu_read_lock(&connections_srcu); newcon = nodeid2con(nodeid, GFP_NOFS); if (!newcon) { + srcu_read_unlock(&connections_srcu, idx); result = -ENOMEM; goto accept_err; } @@ -924,6 +1030,7 @@ static int accept_from_sock(struct listen_connection *con) if (!othercon) { log_print("failed to allocate incoming socket"); mutex_unlock(&newcon->sock_mutex); + srcu_read_unlock(&connections_srcu, idx); result = -ENOMEM; goto accept_err; } @@ -932,11 +1039,14 @@ static int accept_from_sock(struct listen_connection *con) if (result < 0) { kfree(othercon); mutex_unlock(&newcon->sock_mutex); + srcu_read_unlock(&connections_srcu, idx); goto accept_err; } lockdep_set_subclass(&othercon->sock_mutex, 1); + set_bit(CF_IS_OTHERCON, &othercon->flags); newcon->othercon = othercon; + othercon->sendcon = newcon; } else { /* close other sock con if we have something new */ close_connection(othercon, false, true, false); @@ -966,6 +1076,8 @@ static int accept_from_sock(struct listen_connection *con) if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags)) queue_work(recv_workqueue, &addcon->rwork); + srcu_read_unlock(&connections_srcu, idx); + return 0; accept_err: @@ -977,12 +1089,6 @@ accept_err: return result; } -static void free_entry(struct writequeue_entry *e) -{ - __free_page(e->page); - kfree(e); -} - /* * writequeue_entry_complete - try to delete and free write queue entry * @e: write queue entry to try to delete @@ -994,11 +1100,11 @@ static void writequeue_entry_complete(struct 
writequeue_entry *e, int completed) { e->offset += completed; e->len -= completed; + /* signal that page was half way transmitted */ + e->dirty = true; - if (e->len == 0 && e->users == 0) { - list_del(&e->list); + if (e->len == 0 && e->users == 0) free_entry(e); - } } /* @@ -1075,7 +1181,7 @@ static void sctp_connect_to_sock(struct connection *con) make_sockaddr(&daddr, dlm_config.ci_tcp_port, &addr_len); - log_print("connecting to %d", con->nodeid); + log_print_ratelimited("connecting to %d", con->nodeid); /* Turn off Nagle's algorithm */ sctp_sock_set_nodelay(sock->sk); @@ -1171,7 +1277,7 @@ static void tcp_connect_to_sock(struct connection *con) make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len); - log_print("connecting to %d", con->nodeid); + log_print_ratelimited("connecting to %d", con->nodeid); /* Turn off Nagle's algorithm */ tcp_sock_set_nodelay(sock->sk); @@ -1364,12 +1470,16 @@ static struct writequeue_entry *new_writequeue_entry(struct connection *con, entry->con = con; entry->users = 1; + kref_init(&entry->ref); + INIT_LIST_HEAD(&entry->msgs); return entry; } static struct writequeue_entry *new_wq_entry(struct connection *con, int len, - gfp_t allocation, char **ppc) + gfp_t allocation, char **ppc, + void (*cb)(struct dlm_mhandle *mh), + struct dlm_mhandle *mh) { struct writequeue_entry *e; @@ -1377,7 +1487,12 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len, if (!list_empty(&con->writequeue)) { e = list_last_entry(&con->writequeue, struct writequeue_entry, list); if (DLM_WQ_REMAIN_BYTES(e) >= len) { + kref_get(&e->ref); + *ppc = page_address(e->page) + e->end; + if (cb) + cb(mh); + e->end += len; e->users++; spin_unlock(&con->writequeue_lock); @@ -1391,42 +1506,92 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len, if (!e) return NULL; + kref_get(&e->ref); *ppc = page_address(e->page); e->end += len; + atomic_inc(&con->writequeue_cnt); spin_lock(&con->writequeue_lock); + if (cb) + 
cb(mh); + list_add_tail(&e->list, &con->writequeue); spin_unlock(&con->writequeue_lock); return e; }; -void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) +static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len, + gfp_t allocation, char **ppc, + void (*cb)(struct dlm_mhandle *mh), + struct dlm_mhandle *mh) +{ + struct writequeue_entry *e; + struct dlm_msg *msg; + + msg = kzalloc(sizeof(*msg), allocation); + if (!msg) + return NULL; + + kref_init(&msg->ref); + + e = new_wq_entry(con, len, allocation, ppc, cb, mh); + if (!e) { + kfree(msg); + return NULL; + } + + msg->ppc = *ppc; + msg->len = len; + msg->entry = e; + + return msg; +} + +struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, + char **ppc, void (*cb)(struct dlm_mhandle *mh), + struct dlm_mhandle *mh) { struct connection *con; + struct dlm_msg *msg; + int idx; - if (len > DEFAULT_BUFFER_SIZE || + if (len > DLM_MAX_SOCKET_BUFSIZE || len < sizeof(struct dlm_header)) { - BUILD_BUG_ON(PAGE_SIZE < DEFAULT_BUFFER_SIZE); + BUILD_BUG_ON(PAGE_SIZE < DLM_MAX_SOCKET_BUFSIZE); log_print("failed to allocate a buffer of size %d", len); WARN_ON(1); return NULL; } + idx = srcu_read_lock(&connections_srcu); con = nodeid2con(nodeid, allocation); - if (!con) + if (!con) { + srcu_read_unlock(&connections_srcu, idx); return NULL; + } - return new_wq_entry(con, len, allocation, ppc); + msg = dlm_lowcomms_new_msg_con(con, len, allocation, ppc, cb, mh); + if (!msg) { + srcu_read_unlock(&connections_srcu, idx); + return NULL; + } + + /* we assume if successful commit must called */ + msg->idx = idx; + return msg; } -void dlm_lowcomms_commit_buffer(void *mh) +static void _dlm_lowcomms_commit_msg(struct dlm_msg *msg) { - struct writequeue_entry *e = (struct writequeue_entry *)mh; + struct writequeue_entry *e = msg->entry; struct connection *con = e->con; int users; spin_lock(&con->writequeue_lock); + kref_get(&msg->ref); + list_add(&msg->list, &e->msgs); + 
users = --e->users; if (users) goto out; @@ -1442,6 +1607,42 @@ out: return; } +void dlm_lowcomms_commit_msg(struct dlm_msg *msg) +{ + _dlm_lowcomms_commit_msg(msg); + srcu_read_unlock(&connections_srcu, msg->idx); +} + +void dlm_lowcomms_put_msg(struct dlm_msg *msg) +{ + kref_put(&msg->ref, dlm_msg_release); +} + +/* does not held connections_srcu, usage workqueue only */ +int dlm_lowcomms_resend_msg(struct dlm_msg *msg) +{ + struct dlm_msg *msg_resend; + char *ppc; + + if (msg->retransmit) + return 1; + + msg_resend = dlm_lowcomms_new_msg_con(msg->entry->con, msg->len, + GFP_ATOMIC, &ppc, NULL, NULL); + if (!msg_resend) + return -ENOMEM; + + msg->retransmit = true; + kref_get(&msg->ref); + msg_resend->orig_msg = msg; + + memcpy(ppc, msg->ppc, msg->len); + _dlm_lowcomms_commit_msg(msg_resend); + dlm_lowcomms_put_msg(msg_resend); + + return 0; +} + /* Send a message */ static void send_to_sock(struct connection *con) { @@ -1483,7 +1684,7 @@ static void send_to_sock(struct connection *con) cond_resched(); goto out; } else if (ret < 0) - goto send_error; + goto out; } /* Don't starve people filling buffers */ @@ -1496,16 +1697,23 @@ static void send_to_sock(struct connection *con) writequeue_entry_complete(e, ret); } spin_unlock(&con->writequeue_lock); -out: - mutex_unlock(&con->sock_mutex); + + /* close if we got EOF */ + if (test_and_clear_bit(CF_EOF, &con->flags)) { + mutex_unlock(&con->sock_mutex); + close_connection(con, false, false, true); + + /* handling for tcp shutdown */ + clear_bit(CF_SHUTDOWN, &con->flags); + wake_up(&con->shutdown_wait); + } else { + mutex_unlock(&con->sock_mutex); + } + return; -send_error: +out: mutex_unlock(&con->sock_mutex); - close_connection(con, false, false, true); - /* Requeue the send work. When the work daemon runs again, it will try - a new connection, then call this function again. 
*/ - queue_work(send_workqueue, &con->swork); return; out_connect: @@ -1520,7 +1728,6 @@ static void clean_one_writequeue(struct connection *con) spin_lock(&con->writequeue_lock); list_for_each_entry_safe(e, safe, &con->writequeue, list) { - list_del(&e->list); free_entry(e); } spin_unlock(&con->writequeue_lock); @@ -1532,8 +1739,10 @@ int dlm_lowcomms_close(int nodeid) { struct connection *con; struct dlm_node_addr *na; + int idx; log_print("closing connection to node %d", nodeid); + idx = srcu_read_lock(&connections_srcu); con = nodeid2con(nodeid, 0); if (con) { set_bit(CF_CLOSE, &con->flags); @@ -1542,6 +1751,7 @@ int dlm_lowcomms_close(int nodeid) if (con->othercon) clean_one_writequeue(con->othercon); } + srcu_read_unlock(&connections_srcu, idx); spin_lock(&dlm_node_addrs_spin); na = find_node_addr(nodeid); @@ -1578,35 +1788,50 @@ static void process_send_sockets(struct work_struct *work) { struct connection *con = container_of(work, struct connection, swork); + WARN_ON(test_bit(CF_IS_OTHERCON, &con->flags)); + clear_bit(CF_WRITE_PENDING, &con->flags); - if (con->sock == NULL) /* not mutex protected so check it inside too */ + + if (test_and_clear_bit(CF_RECONNECT, &con->flags)) { + close_connection(con, false, false, true); + dlm_midcomms_unack_msg_resend(con->nodeid); + } + + if (con->sock == NULL) { /* not mutex protected so check it inside too */ + if (test_and_clear_bit(CF_DELAY_CONNECT, &con->flags)) + msleep(1000); con->connect_action(con); + } if (!list_empty(&con->writequeue)) send_to_sock(con); } static void work_stop(void) { - if (recv_workqueue) + if (recv_workqueue) { destroy_workqueue(recv_workqueue); - if (send_workqueue) + recv_workqueue = NULL; + } + + if (send_workqueue) { destroy_workqueue(send_workqueue); + send_workqueue = NULL; + } } static int work_start(void) { - recv_workqueue = alloc_workqueue("dlm_recv", - WQ_UNBOUND | WQ_MEM_RECLAIM, 1); + recv_workqueue = alloc_ordered_workqueue("dlm_recv", WQ_MEM_RECLAIM); if (!recv_workqueue) { 
log_print("can't start dlm_recv"); return -ENOMEM; } - send_workqueue = alloc_workqueue("dlm_send", - WQ_UNBOUND | WQ_MEM_RECLAIM, 1); + send_workqueue = alloc_ordered_workqueue("dlm_send", WQ_MEM_RECLAIM); if (!send_workqueue) { log_print("can't start dlm_send"); destroy_workqueue(recv_workqueue); + recv_workqueue = NULL; return -ENOMEM; } @@ -1621,6 +1846,8 @@ static void shutdown_conn(struct connection *con) void dlm_lowcomms_shutdown(void) { + int idx; + /* Set all the flags to prevent any * socket activity. */ @@ -1633,7 +1860,9 @@ void dlm_lowcomms_shutdown(void) dlm_close_sock(&listen_con.sock); + idx = srcu_read_lock(&connections_srcu); foreach_conn(shutdown_conn); + srcu_read_unlock(&connections_srcu, idx); } static void _stop_conn(struct connection *con, bool and_other) @@ -1682,7 +1911,7 @@ static void free_conn(struct connection *con) static void work_flush(void) { - int ok, idx; + int ok; int i; struct connection *con; @@ -1693,7 +1922,6 @@ static void work_flush(void) flush_workqueue(recv_workqueue); if (send_workqueue) flush_workqueue(send_workqueue); - idx = srcu_read_lock(&connections_srcu); for (i = 0; i < CONN_HASH_SIZE && ok; i++) { hlist_for_each_entry_rcu(con, &connection_hash[i], list) { @@ -1707,14 +1935,17 @@ static void work_flush(void) } } } - srcu_read_unlock(&connections_srcu, idx); } while (!ok); } void dlm_lowcomms_stop(void) { + int idx; + + idx = srcu_read_lock(&connections_srcu); work_flush(); foreach_conn(free_conn); + srcu_read_unlock(&connections_srcu, idx); work_stop(); deinit_local(); } @@ -1738,15 +1969,24 @@ int dlm_lowcomms_start(void) error = work_start(); if (error) - goto fail; + goto fail_local; dlm_allow_conn = 1; /* Start listening */ - if (dlm_config.ci_protocol == 0) + switch (dlm_config.ci_protocol) { + case DLM_PROTO_TCP: error = tcp_listen_for_all(); - else + break; + case DLM_PROTO_SCTP: error = sctp_listen_for_all(&listen_con); + break; + default: + log_print("Invalid protocol identifier %d set", + 
dlm_config.ci_protocol); + error = -EINVAL; + break; + } if (error) goto fail_unlisten; @@ -1755,6 +1995,9 @@ int dlm_lowcomms_start(void) fail_unlisten: dlm_allow_conn = 0; dlm_close_sock(&listen_con.sock); + work_stop(); +fail_local: + deinit_local(); fail: return error; } diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h index 48bbc4e18761..aaae7115c00d 100644 --- a/fs/dlm/lowcomms.h +++ b/fs/dlm/lowcomms.h @@ -12,7 +12,22 @@ #ifndef __LOWCOMMS_DOT_H__ #define __LOWCOMMS_DOT_H__ -#define LOWCOMMS_MAX_TX_BUFFER_LEN 4096 +#include "dlm_internal.h" + +#define DLM_MIDCOMMS_OPT_LEN sizeof(struct dlm_opts) +#define DLM_MAX_APP_BUFSIZE (DLM_MAX_SOCKET_BUFSIZE - \ + DLM_MIDCOMMS_OPT_LEN) + +#define CONN_HASH_SIZE 32 + +/* This is deliberately very simple because most clusters have simple + * sequential nodeids, so we should be able to go straight to a connection + * struct in the array + */ +static inline int nodeid_hash(int nodeid) +{ + return nodeid & (CONN_HASH_SIZE-1); +} /* switch to check if dlm is running */ extern int dlm_allow_conn; @@ -22,8 +37,12 @@ void dlm_lowcomms_shutdown(void); void dlm_lowcomms_stop(void); void dlm_lowcomms_exit(void); int dlm_lowcomms_close(int nodeid); -void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc); -void dlm_lowcomms_commit_buffer(void *mh); +struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, + char **ppc, void (*cb)(struct dlm_mhandle *mh), + struct dlm_mhandle *mh); +void dlm_lowcomms_commit_msg(struct dlm_msg *msg); +void dlm_lowcomms_put_msg(struct dlm_msg *msg); +int dlm_lowcomms_resend_msg(struct dlm_msg *msg); int dlm_lowcomms_connect_node(int nodeid); int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark); int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len); diff --git a/fs/dlm/member.c b/fs/dlm/member.c index ceef3f2074ff..d9e1e4170eb1 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -15,6 +15,7 @@ #include "recover.h" #include 
"rcom.h" #include "config.h" +#include "midcomms.h" #include "lowcomms.h" int dlm_slots_version(struct dlm_header *h) @@ -270,7 +271,7 @@ int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size, log_slots(ls, gen, num, NULL, array, array_size); - max_slots = (LOWCOMMS_MAX_TX_BUFFER_LEN - sizeof(struct dlm_rcom) - + max_slots = (DLM_MAX_APP_BUFSIZE - sizeof(struct dlm_rcom) - sizeof(struct rcom_config)) / sizeof(struct rcom_slot); if (num > max_slots) { @@ -329,6 +330,7 @@ static int dlm_add_member(struct dlm_ls *ls, struct dlm_config_node *node) memb->nodeid = node->nodeid; memb->weight = node->weight; memb->comm_seq = node->comm_seq; + dlm_midcomms_add_member(node->nodeid); add_ordered_member(ls, memb); ls->ls_num_nodes++; return 0; @@ -359,26 +361,34 @@ int dlm_is_removed(struct dlm_ls *ls, int nodeid) return 0; } -static void clear_memb_list(struct list_head *head) +static void clear_memb_list(struct list_head *head, + void (*after_del)(int nodeid)) { struct dlm_member *memb; while (!list_empty(head)) { memb = list_entry(head->next, struct dlm_member, list); list_del(&memb->list); + if (after_del) + after_del(memb->nodeid); kfree(memb); } } +static void clear_members_cb(int nodeid) +{ + dlm_midcomms_remove_member(nodeid); +} + void dlm_clear_members(struct dlm_ls *ls) { - clear_memb_list(&ls->ls_nodes); + clear_memb_list(&ls->ls_nodes, clear_members_cb); ls->ls_num_nodes = 0; } void dlm_clear_members_gone(struct dlm_ls *ls) { - clear_memb_list(&ls->ls_nodes_gone); + clear_memb_list(&ls->ls_nodes_gone, NULL); } static void make_member_array(struct dlm_ls *ls) @@ -552,6 +562,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) neg++; list_move(&memb->list, &ls->ls_nodes_gone); + dlm_midcomms_remove_member(memb->nodeid); ls->ls_num_nodes--; dlm_lsop_recover_slot(ls, memb); } @@ -576,12 +587,18 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) *neg_out = neg; error = ping_members(ls); 
- if (!error || error == -EPROTO) { - /* new_lockspace() may be waiting to know if the config - is good or bad */ - ls->ls_members_result = error; - complete(&ls->ls_members_done); - } + /* error -EINTR means that a new recovery action is triggered. + * We ignore this recovery action and let run the new one which might + * have new member configuration. + */ + if (error == -EINTR) + error = 0; + + /* new_lockspace() may be waiting to know if the config + * is good or bad + */ + ls->ls_members_result = error; + complete(&ls->ls_members_done); log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes); return error; diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 1c6654a21ec4..e3de268898ed 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -3,7 +3,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2021 Red Hat, Inc. All rights reserved. ** ** ******************************************************************************* @@ -12,22 +12,866 @@ /* * midcomms.c * - * This is the appallingly named "mid-level" comms layer. + * This is the appallingly named "mid-level" comms layer. It takes care about + * deliver an on application layer "reliable" communication above the used + * lowcomms transport layer. * - * Its purpose is to take packets from the "real" comms layer, - * split them up into packets and pass them to the interested - * part of the locking mechanism. + * How it works: * - * It also takes messages from the locking layer, formats them - * into packets and sends them to the comms layer. + * Each nodes keeps track of all send DLM messages in send_queue with a sequence + * number. The receive will send an DLM_ACK message back for every DLM message + * received at the other side. 
If a reconnect happens in lowcomms we will send + * all unacknowledged dlm messages again. The receiving side might drop any already + * received message by comparing sequence numbers. + * + * How version detection works: + * + * Due to the fact that dlm has pre-configured node addresses on every side + * it is in its nature that every side connects and starts to transmit + * dlm messages, which ends in a race. However DLM_RCOM_NAMES, DLM_RCOM_STATUS + * and their replies are the first messages which are exchanged. Due to backwards + * compatibility these messages are not covered by the midcomms re-transmission + * layer. These messages have their own re-transmission handling in the dlm + * application layer. The version field of every node will be set on these RCOM + * messages as soon as they arrive and the node isn't yet part of the nodes + * hash. There also exists logic to detect a version mismatch if something weird + * is going on or the first message isn't an expected one. + * + * Termination: + * + * The midcomms layer does a 4 way handshake for termination on DLM protocol + * like TCP supports it with half-closed socket support. SCTP doesn't support + * half-closed socket, so we do it on DLM layer. Also socket shutdown() can be + * interrupted by e.g. tcp reset itself. Additionally there exists the othercon + * paradigm in lowcomms which cannot be easily changed without breaking backwards + * compatibility. A node cannot send anything to another node when a DLM_FIN + * message was sent. There exists additional logic to print a warning if + * DLM wants to do it. There exists a state handling like RFC 793 but reduced + * to termination only. The event "member removal event" describes that the cluster + * manager removed the node from internal lists, at this point DLM does not + * send any message to the other node. There exist two cases: + * + * 1. The cluster member was removed and we received a FIN + * OR + * 2.
We received a FIN but the member was not removed yet + * + * One of these cases will do the CLOSE_WAIT to LAST_ACK change. + * + * + * +---------+ + * | CLOSED | + * +---------+ + * | add member/receive RCOM version + * | detection msg + * V + * +---------+ + * | ESTAB | + * +---------+ + * CLOSE | | rcv FIN + * ------- | | ------- + * +---------+ snd FIN / \ snd ACK +---------+ + * | FIN |<----------------- ------------------>| CLOSE | + * | WAIT-1 |------------------ | WAIT | + * +---------+ rcv FIN \ +---------+ + * | rcv ACK of FIN ------- | CLOSE | member + * | -------------- snd ACK | ------- | removal + * V x V snd FIN V event + * +---------+ +---------+ +---------+ + * |FINWAIT-2| | CLOSING | | LAST-ACK| + * +---------+ +---------+ +---------+ + * | rcv ACK of FIN | rcv ACK of FIN | + * | rcv FIN -------------- | -------------- | + * | ------- x V x V + * \ snd ACK +---------+ +---------+ + * ------------------------>| CLOSED | | CLOSED | + * +---------+ +---------+ + * + * NOTE: any state can interrupted by midcomms_close() and state will be + * switched to CLOSED in case of fencing. There exists also some timeout + * handling when we receive the version detection RCOM messages which is + * made by observation. + * + * Future improvements: + * + * There exists some known issues/improvements of the dlm handling. Some + * of them should be done in a next major dlm version bump which makes + * it incompatible with previous versions. + * + * Unaligned memory access: + * + * There exists cases when the dlm message buffer length is not aligned + * to 8 byte. However seems nobody detected any problem with it. This + * can be fixed in the next major version bump of dlm. + * + * Version detection: + * + * The version detection and how it's done is related to backwards + * compatibility. There exists better ways to make a better handling. + * However this should be changed in the next major version bump of dlm. 
+ * + * Ack handling: + * + * Currently we send an ack message for every dlm message. However we + * can ack multiple dlm messages with one ack by just delaying the ack + * message. Will reduce some traffic but makes the drop detection slower. + * + * Tail Size checking: + * + * There exists a message tail payload in e.g. DLM_MSG however we don't + * check it against the message length yet regarding to the receive buffer + * length. That need to be validated. + * + * Fencing bad nodes: + * + * At timeout places or weird sequence number behaviours we should send + * a fencing request to the cluster manager. + */ + +/* Debug switch to enable a 5 seconds sleep waiting of a termination. + * This can be useful to test fencing while termination is running. + * This requires a setup with only gfs2 as dlm user, so that the + * last umount will terminate the connection. + * + * However it became useful to test, while the 5 seconds block in umount + * just press the reset button. In a lot of dropping the termination + * process can could take several seconds. */ +#define DLM_DEBUG_FENCE_TERMINATION 0 + +#include <net/tcp.h> #include "dlm_internal.h" #include "lowcomms.h" #include "config.h" #include "lock.h" +#include "util.h" #include "midcomms.h" +/* init value for sequence numbers for testing purpose only e.g. overflows */ +#define DLM_SEQ_INIT 0 +/* 3 minutes wait to sync ending of dlm */ +#define DLM_SHUTDOWN_TIMEOUT msecs_to_jiffies(3 * 60 * 1000) +#define DLM_VERSION_NOT_SET 0 + +struct midcomms_node { + int nodeid; + uint32_t version; + uint32_t seq_send; + uint32_t seq_next; + /* These queues are unbound because we cannot drop any message in dlm. + * We could send a fence signal for a specific node to the cluster + * manager if queues hits some maximum value, however this handling + * not supported yet. 
+ */ + struct list_head send_queue; + spinlock_t send_queue_lock; + atomic_t send_queue_cnt; +#define DLM_NODE_FLAG_CLOSE 1 +#define DLM_NODE_FLAG_STOP_TX 2 +#define DLM_NODE_FLAG_STOP_RX 3 + unsigned long flags; + wait_queue_head_t shutdown_wait; + + /* dlm tcp termination state */ +#define DLM_CLOSED 1 +#define DLM_ESTABLISHED 2 +#define DLM_FIN_WAIT1 3 +#define DLM_FIN_WAIT2 4 +#define DLM_CLOSE_WAIT 5 +#define DLM_LAST_ACK 6 +#define DLM_CLOSING 7 + int state; + spinlock_t state_lock; + + /* counts how many lockspaces are using this node + * this refcount is necessary to determine if the + * node wants to disconnect. + */ + int users; + + /* not protected by srcu, node_hash lifetime */ + void *debugfs; + + struct hlist_node hlist; + struct rcu_head rcu; +}; + +struct dlm_mhandle { + const struct dlm_header *inner_hd; + struct midcomms_node *node; + struct dlm_opts *opts; + struct dlm_msg *msg; + bool committed; + uint32_t seq; + + void (*ack_rcv)(struct midcomms_node *node); + + /* get_mhandle/commit srcu idx exchange */ + int idx; + + struct list_head list; + struct rcu_head rcu; +}; + +static struct hlist_head node_hash[CONN_HASH_SIZE]; +static DEFINE_SPINLOCK(nodes_lock); +DEFINE_STATIC_SRCU(nodes_srcu); + +/* This mutex prevents that midcomms_close() is running while + * stop() or remove(). As I experienced invalid memory access + * behaviours when DLM_DEBUG_FENCE_TERMINATION is enabled and + * resetting machines. I will end in some double deletion in nodes + * datastructure. 
+ */ +static DEFINE_MUTEX(close_lock); + +static inline const char *dlm_state_str(int state) +{ + switch (state) { + case DLM_CLOSED: + return "CLOSED"; + case DLM_ESTABLISHED: + return "ESTABLISHED"; + case DLM_FIN_WAIT1: + return "FIN_WAIT1"; + case DLM_FIN_WAIT2: + return "FIN_WAIT2"; + case DLM_CLOSE_WAIT: + return "CLOSE_WAIT"; + case DLM_LAST_ACK: + return "LAST_ACK"; + case DLM_CLOSING: + return "CLOSING"; + default: + return "UNKNOWN"; + } +} + +const char *dlm_midcomms_state(struct midcomms_node *node) +{ + return dlm_state_str(node->state); +} + +unsigned long dlm_midcomms_flags(struct midcomms_node *node) +{ + return node->flags; +} + +int dlm_midcomms_send_queue_cnt(struct midcomms_node *node) +{ + return atomic_read(&node->send_queue_cnt); +} + +uint32_t dlm_midcomms_version(struct midcomms_node *node) +{ + return node->version; +} + +static struct midcomms_node *__find_node(int nodeid, int r) +{ + struct midcomms_node *node; + + hlist_for_each_entry_rcu(node, &node_hash[r], hlist) { + if (node->nodeid == nodeid) + return node; + } + + return NULL; +} + +static void dlm_mhandle_release(struct rcu_head *rcu) +{ + struct dlm_mhandle *mh = container_of(rcu, struct dlm_mhandle, rcu); + + dlm_lowcomms_put_msg(mh->msg); + kfree(mh); +} + +static void dlm_mhandle_delete(struct midcomms_node *node, + struct dlm_mhandle *mh) +{ + list_del_rcu(&mh->list); + atomic_dec(&node->send_queue_cnt); + call_rcu(&mh->rcu, dlm_mhandle_release); +} + +static void dlm_send_queue_flush(struct midcomms_node *node) +{ + struct dlm_mhandle *mh; + + pr_debug("flush midcomms send queue of node %d\n", node->nodeid); + + rcu_read_lock(); + spin_lock(&node->send_queue_lock); + list_for_each_entry_rcu(mh, &node->send_queue, list) { + dlm_mhandle_delete(node, mh); + } + spin_unlock(&node->send_queue_lock); + rcu_read_unlock(); +} + +static void midcomms_node_reset(struct midcomms_node *node) +{ + pr_debug("reset node %d\n", node->nodeid); + + node->seq_next = DLM_SEQ_INIT; + 
node->seq_send = DLM_SEQ_INIT; + node->version = DLM_VERSION_NOT_SET; + node->flags = 0; + + dlm_send_queue_flush(node); + node->state = DLM_CLOSED; + wake_up(&node->shutdown_wait); +} + +static struct midcomms_node *nodeid2node(int nodeid, gfp_t alloc) +{ + struct midcomms_node *node, *tmp; + int r = nodeid_hash(nodeid); + + node = __find_node(nodeid, r); + if (node || !alloc) + return node; + + node = kmalloc(sizeof(*node), alloc); + if (!node) + return NULL; + + node->nodeid = nodeid; + spin_lock_init(&node->state_lock); + spin_lock_init(&node->send_queue_lock); + atomic_set(&node->send_queue_cnt, 0); + INIT_LIST_HEAD(&node->send_queue); + init_waitqueue_head(&node->shutdown_wait); + node->users = 0; + midcomms_node_reset(node); + + spin_lock(&nodes_lock); + /* check again if there was somebody else + * earlier here to add the node + */ + tmp = __find_node(nodeid, r); + if (tmp) { + spin_unlock(&nodes_lock); + kfree(node); + return tmp; + } + + hlist_add_head_rcu(&node->hlist, &node_hash[r]); + spin_unlock(&nodes_lock); + + node->debugfs = dlm_create_debug_comms_file(nodeid, node); + return node; +} + +static int dlm_send_ack(int nodeid, uint32_t seq) +{ + int mb_len = sizeof(struct dlm_header); + struct dlm_header *m_header; + struct dlm_msg *msg; + char *ppc; + + msg = dlm_lowcomms_new_msg(nodeid, mb_len, GFP_NOFS, &ppc, + NULL, NULL); + if (!msg) + return -ENOMEM; + + m_header = (struct dlm_header *)ppc; + + m_header->h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + m_header->h_nodeid = dlm_our_nodeid(); + m_header->h_length = mb_len; + m_header->h_cmd = DLM_ACK; + m_header->u.h_seq = seq; + + header_out(m_header); + dlm_lowcomms_commit_msg(msg); + dlm_lowcomms_put_msg(msg); + + return 0; +} + +static int dlm_send_fin(struct midcomms_node *node, + void (*ack_rcv)(struct midcomms_node *node)) +{ + int mb_len = sizeof(struct dlm_header); + struct dlm_header *m_header; + struct dlm_mhandle *mh; + char *ppc; + + mh = dlm_midcomms_get_mhandle(node->nodeid, 
mb_len, GFP_NOFS, &ppc); + if (!mh) + return -ENOMEM; + + mh->ack_rcv = ack_rcv; + + m_header = (struct dlm_header *)ppc; + + m_header->h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + m_header->h_nodeid = dlm_our_nodeid(); + m_header->h_length = mb_len; + m_header->h_cmd = DLM_FIN; + + header_out(m_header); + + pr_debug("sending fin msg to node %d\n", node->nodeid); + dlm_midcomms_commit_mhandle(mh); + set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags); + + return 0; +} + +static void dlm_receive_ack(struct midcomms_node *node, uint32_t seq) +{ + struct dlm_mhandle *mh; + + rcu_read_lock(); + list_for_each_entry_rcu(mh, &node->send_queue, list) { + if (before(mh->seq, seq)) { + if (mh->ack_rcv) + mh->ack_rcv(node); + } else { + /* send queue should be ordered */ + break; + } + } + + spin_lock(&node->send_queue_lock); + list_for_each_entry_rcu(mh, &node->send_queue, list) { + if (before(mh->seq, seq)) { + dlm_mhandle_delete(node, mh); + } else { + /* send queue should be ordered */ + break; + } + } + spin_unlock(&node->send_queue_lock); + rcu_read_unlock(); +} + +static void dlm_pas_fin_ack_rcv(struct midcomms_node *node) +{ + spin_lock(&node->state_lock); + pr_debug("receive passive fin ack from node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + + switch (node->state) { + case DLM_LAST_ACK: + /* DLM_CLOSED */ + midcomms_node_reset(node); + break; + case DLM_CLOSED: + /* not valid but somehow we got what we want */ + wake_up(&node->shutdown_wait); + break; + default: + spin_unlock(&node->state_lock); + log_print("%s: unexpected state: %d\n", + __func__, node->state); + WARN_ON(1); + return; + } + spin_unlock(&node->state_lock); +} + +static void dlm_midcomms_receive_buffer(union dlm_packet *p, + struct midcomms_node *node, + uint32_t seq) +{ + if (seq == node->seq_next) { + node->seq_next++; + /* send ack before fin */ + dlm_send_ack(node->nodeid, node->seq_next); + + switch (p->header.h_cmd) { + case DLM_FIN: + spin_lock(&node->state_lock); + 
pr_debug("receive fin msg from node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + + switch (node->state) { + case DLM_ESTABLISHED: + node->state = DLM_CLOSE_WAIT; + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + /* passive shutdown DLM_LAST_ACK case 1 + * additional we check if the node is used by + * cluster manager events at all. + */ + if (node->users == 0) { + node->state = DLM_LAST_ACK; + pr_debug("switch node %d to state %s case 1\n", + node->nodeid, dlm_state_str(node->state)); + spin_unlock(&node->state_lock); + goto send_fin; + } + break; + case DLM_FIN_WAIT1: + node->state = DLM_CLOSING; + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + break; + case DLM_FIN_WAIT2: + midcomms_node_reset(node); + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + wake_up(&node->shutdown_wait); + break; + case DLM_LAST_ACK: + /* probably remove_member caught it, do nothing */ + break; + default: + spin_unlock(&node->state_lock); + log_print("%s: unexpected state: %d\n", + __func__, node->state); + WARN_ON(1); + return; + } + spin_unlock(&node->state_lock); + + set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); + break; + default: + WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); + dlm_receive_buffer(p, node->nodeid); + break; + } + } else { + /* retry to ack message which we already have by sending back + * current node->seq_next number as ack. 
+ */ + if (seq < node->seq_next) + dlm_send_ack(node->nodeid, node->seq_next); + + log_print_ratelimited("ignore dlm msg because seq mismatch, seq: %u, expected: %u, nodeid: %d", + seq, node->seq_next, node->nodeid); + } + + return; + +send_fin: + set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); + dlm_send_fin(node, dlm_pas_fin_ack_rcv); +} + +static struct midcomms_node * +dlm_midcomms_recv_node_lookup(int nodeid, const union dlm_packet *p, + uint16_t msglen, int (*cb)(struct midcomms_node *node)) +{ + struct midcomms_node *node = NULL; + gfp_t allocation = 0; + int ret; + + switch (p->header.h_cmd) { + case DLM_RCOM: + if (msglen < sizeof(struct dlm_rcom)) { + log_print("rcom msg too small: %u, will skip this message from node %d", + msglen, nodeid); + return NULL; + } + + switch (le32_to_cpu(p->rcom.rc_type)) { + case DLM_RCOM_NAMES: + fallthrough; + case DLM_RCOM_NAMES_REPLY: + fallthrough; + case DLM_RCOM_STATUS: + fallthrough; + case DLM_RCOM_STATUS_REPLY: + node = nodeid2node(nodeid, 0); + if (node) { + spin_lock(&node->state_lock); + if (node->state != DLM_ESTABLISHED) + pr_debug("receive begin RCOM msg from node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + + switch (node->state) { + case DLM_CLOSED: + node->state = DLM_ESTABLISHED; + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + break; + case DLM_ESTABLISHED: + break; + default: + /* some invalid state passive shutdown + * was failed, we try to reset and + * hope it will go on. 
+ */ + log_print("reset node %d because shutdown stuck", + node->nodeid); + + midcomms_node_reset(node); + node->state = DLM_ESTABLISHED; + break; + } + spin_unlock(&node->state_lock); + } + + allocation = GFP_NOFS; + break; + default: + break; + } + + break; + default: + break; + } + + node = nodeid2node(nodeid, allocation); + if (!node) { + switch (p->header.h_cmd) { + case DLM_OPTS: + if (msglen < sizeof(struct dlm_opts)) { + log_print("opts msg too small: %u, will skip this message from node %d", + msglen, nodeid); + return NULL; + } + + log_print_ratelimited("received dlm opts message nextcmd %d from node %d in an invalid sequence", + p->opts.o_nextcmd, nodeid); + break; + default: + log_print_ratelimited("received dlm message cmd %d from node %d in an invalid sequence", + p->header.h_cmd, nodeid); + break; + } + + return NULL; + } + + ret = cb(node); + if (ret < 0) + return NULL; + + return node; +} + +static int dlm_midcomms_version_check_3_2(struct midcomms_node *node) +{ + switch (node->version) { + case DLM_VERSION_NOT_SET: + node->version = DLM_VERSION_3_2; + log_print("version 0x%08x for node %d detected", DLM_VERSION_3_2, + node->nodeid); + break; + case DLM_VERSION_3_2: + break; + default: + log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x", + DLM_VERSION_3_2, node->nodeid, node->version); + return -1; + } + + return 0; +} + +static int dlm_opts_check_msglen(union dlm_packet *p, uint16_t msglen, int nodeid) +{ + int len = msglen; + + /* we only trust outer header msglen because + * it's checked against receive buffer length. 
+ */ + if (len < sizeof(struct dlm_opts)) + return -1; + len -= sizeof(struct dlm_opts); + + if (len < le16_to_cpu(p->opts.o_optlen)) + return -1; + len -= le16_to_cpu(p->opts.o_optlen); + + switch (p->opts.o_nextcmd) { + case DLM_FIN: + if (len < sizeof(struct dlm_header)) { + log_print("fin too small: %d, will skip this message from node %d", + len, nodeid); + return -1; + } + + break; + case DLM_MSG: /* log the remaining inner length that failed the check, as the FIN/RCOM cases do */ + if (len < sizeof(struct dlm_message)) { + log_print("msg too small: %d, will skip this message from node %d", + len, nodeid); + return -1; + } + + break; + case DLM_RCOM: + if (len < sizeof(struct dlm_rcom)) { + log_print("rcom msg too small: %d, will skip this message from node %d", + len, nodeid); + return -1; + } + + break; + default: + log_print("unsupported o_nextcmd received: %u, will skip this message from node %d", + p->opts.o_nextcmd, nodeid); + return -1; + } + + return 0; +} + +static void dlm_midcomms_receive_buffer_3_2(union dlm_packet *p, int nodeid) +{ + uint16_t msglen = le16_to_cpu(p->header.h_length); + struct midcomms_node *node; + uint32_t seq; + int ret, idx; + + idx = srcu_read_lock(&nodes_srcu); + node = dlm_midcomms_recv_node_lookup(nodeid, p, msglen, + dlm_midcomms_version_check_3_2); + if (!node) + goto out; + + switch (p->header.h_cmd) { + case DLM_RCOM: + /* these rcom message we use to determine version. + * they have their own retransmission handling and + * are the first messages of dlm. + * + * length already checked. 
+ */ + switch (le32_to_cpu(p->rcom.rc_type)) { + case DLM_RCOM_NAMES: + fallthrough; + case DLM_RCOM_NAMES_REPLY: + fallthrough; + case DLM_RCOM_STATUS: + fallthrough; + case DLM_RCOM_STATUS_REPLY: + break; + default: + log_print("unsupported rcom type received: %u, will skip this message from node %d", + le32_to_cpu(p->rcom.rc_type), nodeid); + goto out; + } + + WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); + dlm_receive_buffer(p, nodeid); + break; + case DLM_OPTS: + seq = le32_to_cpu(p->header.u.h_seq); + + ret = dlm_opts_check_msglen(p, msglen, nodeid); + if (ret < 0) { + log_print("opts msg too small: %u, will skip this message from node %d", + msglen, nodeid); + goto out; + } + + p = (union dlm_packet *)((unsigned char *)p->opts.o_opts + + le16_to_cpu(p->opts.o_optlen)); + + /* recheck inner msglen just if it's not garbage */ + msglen = le16_to_cpu(p->header.h_length); + switch (p->header.h_cmd) { + case DLM_RCOM: + if (msglen < sizeof(struct dlm_rcom)) { + log_print("inner rcom msg too small: %u, will skip this message from node %d", + msglen, nodeid); + goto out; + } + + break; + case DLM_MSG: + if (msglen < sizeof(struct dlm_message)) { + log_print("inner msg too small: %u, will skip this message from node %d", + msglen, nodeid); + goto out; + } + + break; + case DLM_FIN: + if (msglen < sizeof(struct dlm_header)) { + log_print("inner fin too small: %u, will skip this message from node %d", + msglen, nodeid); + goto out; + } + + break; + default: /* p now points at the inner header: log its rejected command, not its length */ + log_print("unsupported inner h_cmd received: %u, will skip this message from node %d", + p->header.h_cmd, nodeid); + goto out; + } + + dlm_midcomms_receive_buffer(p, node, seq); + break; + case DLM_ACK: + seq = le32_to_cpu(p->header.u.h_seq); + dlm_receive_ack(node, seq); + break; + default: + log_print("unsupported h_cmd received: %u, will skip this message from node %d", + p->header.h_cmd, nodeid); + break; + } + +out: + srcu_read_unlock(&nodes_srcu, idx); +} + +static int dlm_midcomms_version_check_3_1(struct 
midcomms_node *node) +{ + switch (node->version) { + case DLM_VERSION_NOT_SET: + node->version = DLM_VERSION_3_1; + log_print("version 0x%08x for node %d detected", DLM_VERSION_3_1, + node->nodeid); + break; + case DLM_VERSION_3_1: + break; + default: + log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x", + DLM_VERSION_3_1, node->nodeid, node->version); + return -1; + } + + return 0; +} + +static void dlm_midcomms_receive_buffer_3_1(union dlm_packet *p, int nodeid) +{ + uint16_t msglen = le16_to_cpu(p->header.h_length); + struct midcomms_node *node; + int idx; + + idx = srcu_read_lock(&nodes_srcu); + node = dlm_midcomms_recv_node_lookup(nodeid, p, msglen, + dlm_midcomms_version_check_3_1); + if (!node) { + srcu_read_unlock(&nodes_srcu, idx); + return; + } + srcu_read_unlock(&nodes_srcu, idx); + + switch (p->header.h_cmd) { + case DLM_RCOM: + /* length already checked */ + break; + case DLM_MSG: + if (msglen < sizeof(struct dlm_message)) { + log_print("msg too small: %u, will skip this message from node %d", + msglen, nodeid); + return; + } + + break; + default: + log_print("unsupported h_cmd received: %u, will skip this message from node %d", + p->header.h_cmd, nodeid); + return; + } + + dlm_receive_buffer(p, nodeid); +} + /* * Called from the low-level comms layer to process a buffer of * commands. @@ -43,7 +887,7 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) while (len >= sizeof(struct dlm_header)) { hd = (struct dlm_header *)ptr; - /* no message should be more than DEFAULT_BUFFER_SIZE or + /* no message should be more than DLM_MAX_SOCKET_BUFSIZE or * less than dlm_header size. * * Some messages does not have a 8 byte length boundary yet @@ -55,7 +899,7 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) * the next major version bump. 
*/ msglen = le16_to_cpu(hd->h_length); - if (msglen > DEFAULT_BUFFER_SIZE || + if (msglen > DLM_MAX_SOCKET_BUFSIZE || msglen < sizeof(struct dlm_header)) { log_print("received invalid length header: %u from node %d, will abort message parsing", msglen, nodeid); @@ -68,32 +912,19 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) if (msglen > len) break; - switch (hd->h_cmd) { - case DLM_MSG: - if (msglen < sizeof(struct dlm_message)) { - log_print("dlm msg too small: %u, will skip this message", - msglen); - goto skip; - } - + switch (le32_to_cpu(hd->h_version)) { + case DLM_VERSION_3_1: + dlm_midcomms_receive_buffer_3_1((union dlm_packet *)ptr, nodeid); break; - case DLM_RCOM: - if (msglen < sizeof(struct dlm_rcom)) { - log_print("dlm rcom msg too small: %u, will skip this message", - msglen); - goto skip; - } - + case DLM_VERSION_3_2: + dlm_midcomms_receive_buffer_3_2((union dlm_packet *)ptr, nodeid); break; default: - log_print("unsupported h_cmd received: %u, will skip this message", - hd->h_cmd); - goto skip; + log_print("received invalid version header: %u from node %d, will skip this message", + le32_to_cpu(hd->h_version), nodeid); + break; } - dlm_receive_buffer((union dlm_packet *)ptr, nodeid); - -skip: ret += msglen; len -= msglen; ptr += msglen; @@ -102,3 +933,455 @@ skip: return ret; } +void dlm_midcomms_unack_msg_resend(int nodeid) +{ + struct midcomms_node *node; + struct dlm_mhandle *mh; + int idx, ret; + + idx = srcu_read_lock(&nodes_srcu); + node = nodeid2node(nodeid, 0); + if (!node) { + srcu_read_unlock(&nodes_srcu, idx); + return; + } + + /* old protocol, we don't support to retransmit on failure */ + switch (node->version) { + case DLM_VERSION_3_2: + break; + default: + srcu_read_unlock(&nodes_srcu, idx); + return; + } + + rcu_read_lock(); + list_for_each_entry_rcu(mh, &node->send_queue, list) { + if (!mh->committed) + continue; + + ret = dlm_lowcomms_resend_msg(mh->msg); + if (!ret) + log_print_ratelimited("retransmit 
dlm msg, seq %u, nodeid %d", + mh->seq, node->nodeid); + } + rcu_read_unlock(); + srcu_read_unlock(&nodes_srcu, idx); +} + +static void dlm_fill_opts_header(struct dlm_opts *opts, uint16_t inner_len, + uint32_t seq) +{ + opts->o_header.h_cmd = DLM_OPTS; + opts->o_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + opts->o_header.h_nodeid = dlm_our_nodeid(); + opts->o_header.h_length = DLM_MIDCOMMS_OPT_LEN + inner_len; + opts->o_header.u.h_seq = seq; + header_out(&opts->o_header); +} + +static void midcomms_new_msg_cb(struct dlm_mhandle *mh) +{ + atomic_inc(&mh->node->send_queue_cnt); + + spin_lock(&mh->node->send_queue_lock); + list_add_tail_rcu(&mh->list, &mh->node->send_queue); + spin_unlock(&mh->node->send_queue_lock); + + mh->seq = mh->node->seq_send++; +} + +static struct dlm_msg *dlm_midcomms_get_msg_3_2(struct dlm_mhandle *mh, int nodeid, + int len, gfp_t allocation, char **ppc) +{ + struct dlm_opts *opts; + struct dlm_msg *msg; + + msg = dlm_lowcomms_new_msg(nodeid, len + DLM_MIDCOMMS_OPT_LEN, + allocation, ppc, midcomms_new_msg_cb, mh); + if (!msg) + return NULL; + + opts = (struct dlm_opts *)*ppc; + mh->opts = opts; + + /* add possible options here */ + dlm_fill_opts_header(opts, len, mh->seq); + + *ppc += sizeof(*opts); + mh->inner_hd = (const struct dlm_header *)*ppc; + return msg; +} + +struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, + gfp_t allocation, char **ppc) +{ + struct midcomms_node *node; + struct dlm_mhandle *mh; + struct dlm_msg *msg; + int idx; + + idx = srcu_read_lock(&nodes_srcu); + node = nodeid2node(nodeid, 0); + if (!node) { + WARN_ON_ONCE(1); + goto err; + } + + /* this is a bug, however we going on and hope it will be resolved */ + WARN_ON(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags)); + + mh = kzalloc(sizeof(*mh), GFP_NOFS); + if (!mh) + goto err; + + mh->idx = idx; + mh->node = node; + + switch (node->version) { + case DLM_VERSION_3_1: + msg = dlm_lowcomms_new_msg(nodeid, len, allocation, ppc, + NULL, 
NULL); + if (!msg) { + kfree(mh); + goto err; + } + + break; + case DLM_VERSION_3_2: + msg = dlm_midcomms_get_msg_3_2(mh, nodeid, len, allocation, + ppc); + if (!msg) { + kfree(mh); + goto err; + } + + break; + default: + kfree(mh); + WARN_ON(1); + goto err; + } + + mh->msg = msg; + + /* keep in mind that is a must to call + * dlm_midcomms_commit_msg() which releases + * nodes_srcu using mh->idx which is assumed + * here that the application will call it. + */ + return mh; + +err: + srcu_read_unlock(&nodes_srcu, idx); + return NULL; +} + +static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh) +{ + /* nexthdr chain for fast lookup */ + mh->opts->o_nextcmd = mh->inner_hd->h_cmd; + mh->committed = true; + dlm_lowcomms_commit_msg(mh->msg); +} + +void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh) +{ + switch (mh->node->version) { + case DLM_VERSION_3_1: + srcu_read_unlock(&nodes_srcu, mh->idx); + + dlm_lowcomms_commit_msg(mh->msg); + dlm_lowcomms_put_msg(mh->msg); + /* mh is not part of rcu list in this case */ + kfree(mh); + break; + case DLM_VERSION_3_2: + dlm_midcomms_commit_msg_3_2(mh); + srcu_read_unlock(&nodes_srcu, mh->idx); + break; + default: + srcu_read_unlock(&nodes_srcu, mh->idx); + WARN_ON(1); + break; + } +} + +int dlm_midcomms_start(void) +{ + int i; + + for (i = 0; i < CONN_HASH_SIZE; i++) + INIT_HLIST_HEAD(&node_hash[i]); + + return dlm_lowcomms_start(); +} + +static void dlm_act_fin_ack_rcv(struct midcomms_node *node) +{ + spin_lock(&node->state_lock); + pr_debug("receive active fin ack from node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + + switch (node->state) { + case DLM_FIN_WAIT1: + node->state = DLM_FIN_WAIT2; + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + break; + case DLM_CLOSING: + midcomms_node_reset(node); + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + wake_up(&node->shutdown_wait); + break; + case DLM_CLOSED: + /* not 
valid but somehow we got what we want */ + wake_up(&node->shutdown_wait); + break; + default: + spin_unlock(&node->state_lock); + log_print("%s: unexpected state: %d\n", + __func__, node->state); + WARN_ON(1); + return; + } + spin_unlock(&node->state_lock); +} + +void dlm_midcomms_add_member(int nodeid) +{ + struct midcomms_node *node; + int idx; + + if (nodeid == dlm_our_nodeid()) + return; + + idx = srcu_read_lock(&nodes_srcu); + node = nodeid2node(nodeid, GFP_NOFS); + if (!node) { + srcu_read_unlock(&nodes_srcu, idx); + return; + } + + spin_lock(&node->state_lock); + if (!node->users) { + pr_debug("receive add member from node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + switch (node->state) { + case DLM_ESTABLISHED: + break; + case DLM_CLOSED: + node->state = DLM_ESTABLISHED; + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + break; + default: + /* some invalid state passive shutdown + * was failed, we try to reset and + * hope it will go on. + */ + log_print("reset node %d because shutdown stuck", + node->nodeid); + + midcomms_node_reset(node); + node->state = DLM_ESTABLISHED; + break; + } + } + + node->users++; + pr_debug("users inc count %d\n", node->users); + spin_unlock(&node->state_lock); + + srcu_read_unlock(&nodes_srcu, idx); +} + +void dlm_midcomms_remove_member(int nodeid) +{ + struct midcomms_node *node; + int idx; + + if (nodeid == dlm_our_nodeid()) + return; + + idx = srcu_read_lock(&nodes_srcu); + node = nodeid2node(nodeid, 0); + if (!node) { + srcu_read_unlock(&nodes_srcu, idx); + return; + } + + spin_lock(&node->state_lock); + node->users--; + pr_debug("users dec count %d\n", node->users); + + /* hitting users count to zero means the + * other side is running dlm_midcomms_stop() + * we meet us to have a clean disconnect. 
+ */ + if (node->users == 0) { + pr_debug("receive remove member from node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + switch (node->state) { + case DLM_ESTABLISHED: + break; + case DLM_CLOSE_WAIT: + /* passive shutdown DLM_LAST_ACK case 2 */ + node->state = DLM_LAST_ACK; + spin_unlock(&node->state_lock); + + pr_debug("switch node %d to state %s case 2\n", + node->nodeid, dlm_state_str(node->state)); + goto send_fin; + case DLM_LAST_ACK: + /* probably receive fin caught it, do nothing */ + break; + case DLM_CLOSED: + /* already gone, do nothing */ + break; + default: + log_print("%s: unexpected state: %d\n", + __func__, node->state); + break; + } + } + spin_unlock(&node->state_lock); + + srcu_read_unlock(&nodes_srcu, idx); + return; + +send_fin: + set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); + dlm_send_fin(node, dlm_pas_fin_ack_rcv); + srcu_read_unlock(&nodes_srcu, idx); +} + +static void midcomms_node_release(struct rcu_head *rcu) +{ + struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu); + + WARN_ON(atomic_read(&node->send_queue_cnt)); + kfree(node); +} + +static void midcomms_shutdown(struct midcomms_node *node) +{ + int ret; + + /* old protocol, we don't wait for pending operations */ + switch (node->version) { + case DLM_VERSION_3_2: + break; + default: + return; + } + + spin_lock(&node->state_lock); + pr_debug("receive active shutdown for node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + switch (node->state) { + case DLM_ESTABLISHED: + node->state = DLM_FIN_WAIT1; + pr_debug("switch node %d to state %s case 2\n", + node->nodeid, dlm_state_str(node->state)); + break; + case DLM_CLOSED: + /* we have what we want */ + spin_unlock(&node->state_lock); + return; + default: + /* busy to enter DLM_FIN_WAIT1, wait until passive + * done in shutdown_wait to enter DLM_CLOSED. 
+ */ + break; + } + spin_unlock(&node->state_lock); + + if (node->state == DLM_FIN_WAIT1) { + dlm_send_fin(node, dlm_act_fin_ack_rcv); + + if (DLM_DEBUG_FENCE_TERMINATION) + msleep(5000); + } + + /* wait for other side dlm + fin */ + ret = wait_event_timeout(node->shutdown_wait, + node->state == DLM_CLOSED || + test_bit(DLM_NODE_FLAG_CLOSE, &node->flags), + DLM_SHUTDOWN_TIMEOUT); + if (!ret || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags)) { + pr_debug("active shutdown timed out for node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + midcomms_node_reset(node); + return; + } + + pr_debug("active shutdown done for node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); +} + +void dlm_midcomms_shutdown(void) +{ + struct midcomms_node *node; + int i, idx; + + mutex_lock(&close_lock); + idx = srcu_read_lock(&nodes_srcu); + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(node, &node_hash[i], hlist) { + midcomms_shutdown(node); + + dlm_delete_debug_comms_file(node->debugfs); + + spin_lock(&nodes_lock); + hlist_del_rcu(&node->hlist); + spin_unlock(&nodes_lock); + + call_srcu(&nodes_srcu, &node->rcu, midcomms_node_release); + } + } + srcu_read_unlock(&nodes_srcu, idx); + mutex_unlock(&close_lock); + + dlm_lowcomms_shutdown(); +} + +int dlm_midcomms_close(int nodeid) +{ + struct midcomms_node *node; + int idx, ret; + + if (nodeid == dlm_our_nodeid()) + return 0; + + idx = srcu_read_lock(&nodes_srcu); + /* Abort pending close/remove operation */ + node = nodeid2node(nodeid, 0); + if (node) { + /* let shutdown waiters leave */ + set_bit(DLM_NODE_FLAG_CLOSE, &node->flags); + wake_up(&node->shutdown_wait); + } + srcu_read_unlock(&nodes_srcu, idx); + + synchronize_srcu(&nodes_srcu); + + idx = srcu_read_lock(&nodes_srcu); + mutex_lock(&close_lock); + node = nodeid2node(nodeid, 0); + if (!node) { + mutex_unlock(&close_lock); + srcu_read_unlock(&nodes_srcu, idx); + return dlm_lowcomms_close(nodeid); + } + + ret = 
dlm_lowcomms_close(nodeid); + spin_lock(&node->state_lock); + midcomms_node_reset(node); + spin_unlock(&node->state_lock); + srcu_read_unlock(&nodes_srcu, idx); + mutex_unlock(&close_lock); + + return ret; +} diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h index 61e90a921849..579abc6929be 100644 --- a/fs/dlm/midcomms.h +++ b/fs/dlm/midcomms.h @@ -12,7 +12,22 @@ #ifndef __MIDCOMMS_DOT_H__ #define __MIDCOMMS_DOT_H__ +struct midcomms_node; + int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int buflen); +struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, + gfp_t allocation, char **ppc); +void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh); +int dlm_midcomms_close(int nodeid); +int dlm_midcomms_start(void); +void dlm_midcomms_shutdown(void); +void dlm_midcomms_add_member(int nodeid); +void dlm_midcomms_remove_member(int nodeid); +void dlm_midcomms_unack_msg_resend(int nodeid); +const char *dlm_midcomms_state(struct midcomms_node *node); +unsigned long dlm_midcomms_flags(struct midcomms_node *node); +int dlm_midcomms_send_queue_cnt(struct midcomms_node *node); +uint32_t dlm_midcomms_version(struct midcomms_node *node); #endif /* __MIDCOMMS_DOT_H__ */ diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index f5b1bd65728d..5651933f54a4 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -27,25 +27,15 @@ static int rcom_response(struct dlm_ls *ls) return test_bit(LSFL_RCOM_READY, &ls->ls_flags); } -static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, - struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret) +static void _create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, + struct dlm_rcom **rc_ret, char *mb, int mb_len) { struct dlm_rcom *rc; - struct dlm_mhandle *mh; - char *mb; - int mb_len = sizeof(struct dlm_rcom) + len; - - mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb); - if (!mh) { - log_print("create_rcom to %d type %d len %d ENOBUFS", - to_nodeid, type, len); - return -ENOBUFS; - } rc = 
(struct dlm_rcom *) mb; rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); - rc->rc_header.h_lockspace = ls->ls_global_id; + rc->rc_header.u.h_lockspace = ls->ls_global_id; rc->rc_header.h_nodeid = dlm_our_nodeid(); rc->rc_header.h_length = mb_len; rc->rc_header.h_cmd = DLM_RCOM; @@ -56,16 +46,67 @@ static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, rc->rc_seq = ls->ls_recover_seq; spin_unlock(&ls->ls_recover_lock); - *mh_ret = mh; *rc_ret = rc; +} + +static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, + struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret) +{ + int mb_len = sizeof(struct dlm_rcom) + len; + struct dlm_mhandle *mh; + char *mb; + + mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, GFP_NOFS, &mb); + if (!mh) { + log_print("%s to %d type %d len %d ENOBUFS", + __func__, to_nodeid, type, len); + return -ENOBUFS; + } + + _create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len); + *mh_ret = mh; + return 0; +} + +static int create_rcom_stateless(struct dlm_ls *ls, int to_nodeid, int type, + int len, struct dlm_rcom **rc_ret, + struct dlm_msg **msg_ret) +{ + int mb_len = sizeof(struct dlm_rcom) + len; + struct dlm_msg *msg; + char *mb; + + msg = dlm_lowcomms_new_msg(to_nodeid, mb_len, GFP_NOFS, &mb, + NULL, NULL); + if (!msg) { /* name this function in the log so it is not mistaken for create_rcom() */ + log_print("%s to %d type %d len %d ENOBUFS", + __func__, to_nodeid, type, len); + return -ENOBUFS; + } + + _create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len); + *msg_ret = msg; + return 0; } +static void _send_rcom(struct dlm_ls *ls, struct dlm_rcom *rc) +{ + dlm_rcom_out(rc); +} + static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh, struct dlm_rcom *rc) { - dlm_rcom_out(rc); - dlm_lowcomms_commit_buffer(mh); + _send_rcom(ls, rc); + dlm_midcomms_commit_mhandle(mh); +} + +static void send_rcom_stateless(struct dlm_ls *ls, struct dlm_msg *msg, + struct dlm_rcom *rc) +{ + _send_rcom(ls, rc); + dlm_lowcomms_commit_msg(msg); + dlm_lowcomms_put_msg(msg); } 
static void set_rcom_status(struct dlm_ls *ls, struct rcom_status *rs, @@ -141,7 +182,7 @@ static void disallow_sync_reply(struct dlm_ls *ls) int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags) { struct dlm_rcom *rc; - struct dlm_mhandle *mh; + struct dlm_msg *msg; int error = 0; ls->ls_recover_nodeid = nodeid; @@ -153,17 +194,17 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags) } retry: - error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, - sizeof(struct rcom_status), &rc, &mh); + error = create_rcom_stateless(ls, nodeid, DLM_RCOM_STATUS, + sizeof(struct rcom_status), &rc, &msg); if (error) goto out; set_rcom_status(ls, (struct rcom_status *)rc->rc_buf, status_flags); allow_sync_reply(ls, &rc->rc_id); - memset(ls->ls_recover_buf, 0, LOWCOMMS_MAX_TX_BUFFER_LEN); + memset(ls->ls_recover_buf, 0, DLM_MAX_SOCKET_BUFSIZE); - send_rcom(ls, mh, rc); + send_rcom_stateless(ls, msg, rc); error = dlm_wait_function(ls, &rcom_response); disallow_sync_reply(ls); @@ -191,11 +232,11 @@ retry: static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) { struct dlm_rcom *rc; - struct dlm_mhandle *mh; struct rcom_status *rs; uint32_t status; int nodeid = rc_in->rc_header.h_nodeid; int len = sizeof(struct rcom_config); + struct dlm_msg *msg; int num_slots = 0; int error; @@ -218,8 +259,8 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) len += num_slots * sizeof(struct rcom_slot); do_create: - error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY, - len, &rc, &mh); + error = create_rcom_stateless(ls, nodeid, DLM_RCOM_STATUS_REPLY, + len, &rc, &msg); if (error) return; @@ -246,7 +287,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) spin_unlock(&ls->ls_recover_lock); do_send: - send_rcom(ls, mh, rc); + send_rcom_stateless(ls, msg, rc); } static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) @@ -271,21 +312,22 @@ static void 
receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) { struct dlm_rcom *rc; - struct dlm_mhandle *mh; + struct dlm_msg *msg; int error = 0; ls->ls_recover_nodeid = nodeid; retry: - error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh); + error = create_rcom_stateless(ls, nodeid, DLM_RCOM_NAMES, last_len, + &rc, &msg); if (error) goto out; memcpy(rc->rc_buf, last_name, last_len); allow_sync_reply(ls, &rc->rc_id); - memset(ls->ls_recover_buf, 0, LOWCOMMS_MAX_TX_BUFFER_LEN); + memset(ls->ls_recover_buf, 0, DLM_MAX_SOCKET_BUFSIZE); - send_rcom(ls, mh, rc); + send_rcom_stateless(ls, msg, rc); error = dlm_wait_function(ls, &rcom_response); disallow_sync_reply(ls); @@ -298,14 +340,15 @@ retry: static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in) { struct dlm_rcom *rc; - struct dlm_mhandle *mh; int error, inlen, outlen, nodeid; + struct dlm_msg *msg; nodeid = rc_in->rc_header.h_nodeid; inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom); - outlen = LOWCOMMS_MAX_TX_BUFFER_LEN - sizeof(struct dlm_rcom); + outlen = DLM_MAX_APP_BUFSIZE - sizeof(struct dlm_rcom); - error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh); + error = create_rcom_stateless(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, + &rc, &msg); if (error) return; rc->rc_id = rc_in->rc_id; @@ -313,7 +356,7 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in) dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen, nodeid); - send_rcom(ls, mh, rc); + send_rcom_stateless(ls, msg, rc); } int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) @@ -342,10 +385,6 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) int error, ret_nodeid, nodeid = rc_in->rc_header.h_nodeid; int len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom); - error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh); - if 
(error) - return; - /* Old code would send this special id to trigger a debug dump. */ if (rc_in->rc_id == 0xFFFFFFFF) { log_error(ls, "receive_rcom_lookup dump from %d", nodeid); @@ -353,6 +392,10 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) return; } + error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh); + if (error) + return; + error = dlm_master_lookup(ls, nodeid, rc_in->rc_buf, len, DLM_LU_RECOVER_MASTER, &ret_nodeid, NULL); if (error) @@ -458,14 +501,14 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) char *mb; int mb_len = sizeof(struct dlm_rcom) + sizeof(struct rcom_config); - mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_NOFS, &mb); + mh = dlm_midcomms_get_mhandle(nodeid, mb_len, GFP_NOFS, &mb); if (!mh) return -ENOBUFS; rc = (struct dlm_rcom *) mb; rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); - rc->rc_header.h_lockspace = rc_in->rc_header.h_lockspace; + rc->rc_header.u.h_lockspace = rc_in->rc_header.u.h_lockspace; rc->rc_header.h_nodeid = dlm_our_nodeid(); rc->rc_header.h_length = mb_len; rc->rc_header.h_cmd = DLM_RCOM; @@ -479,7 +522,7 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) rf->rf_lvblen = cpu_to_le32(~0U); dlm_rcom_out(rc); - dlm_lowcomms_commit_buffer(mh); + dlm_midcomms_commit_mhandle(mh); return 0; } diff --git a/fs/dlm/util.c b/fs/dlm/util.c index cfd0d00b19ae..58acbcc2081a 100644 --- a/fs/dlm/util.c +++ b/fs/dlm/util.c @@ -20,18 +20,20 @@ #define DLM_ERRNO_ETIMEDOUT 110 #define DLM_ERRNO_EINPROGRESS 115 -static void header_out(struct dlm_header *hd) +void header_out(struct dlm_header *hd) { hd->h_version = cpu_to_le32(hd->h_version); - hd->h_lockspace = cpu_to_le32(hd->h_lockspace); + /* does it for others u32 in union as well */ + hd->u.h_lockspace = cpu_to_le32(hd->u.h_lockspace); hd->h_nodeid = cpu_to_le32(hd->h_nodeid); hd->h_length = cpu_to_le16(hd->h_length); } -static void header_in(struct dlm_header *hd) +void header_in(struct 
dlm_header *hd) { hd->h_version = le32_to_cpu(hd->h_version); - hd->h_lockspace = le32_to_cpu(hd->h_lockspace); + /* does it for others u32 in union as well */ + hd->u.h_lockspace = le32_to_cpu(hd->u.h_lockspace); hd->h_nodeid = le32_to_cpu(hd->h_nodeid); hd->h_length = le16_to_cpu(hd->h_length); } diff --git a/fs/dlm/util.h b/fs/dlm/util.h index cc719ca9397e..d46f23c7a6a0 100644 --- a/fs/dlm/util.h +++ b/fs/dlm/util.h @@ -15,6 +15,8 @@ void dlm_message_out(struct dlm_message *ms); void dlm_message_in(struct dlm_message *ms); void dlm_rcom_out(struct dlm_rcom *rc); void dlm_rcom_in(struct dlm_rcom *rc); +void header_out(struct dlm_header *hd); +void header_in(struct dlm_header *hd); #endif diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 392e721b50a3..7d85e64ea62f 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -533,7 +533,20 @@ static sector_t ecryptfs_bmap(struct address_space *mapping, sector_t block) return block; } +#include <linux/buffer_head.h> + const struct address_space_operations ecryptfs_aops = { + /* + * XXX: This is pretty broken for multiple reasons: ecryptfs does not + * actually use buffer_heads, and ecryptfs will crash without + * CONFIG_BLOCK. But it matches the behavior before the default for + * address_space_operations without the ->set_page_dirty method was + * cleaned up, so this is the best we can do without maintainer + * feedback. + */ +#ifdef CONFIG_BLOCK + .set_page_dirty = __set_page_dirty_buffers, +#endif .writepage = ecryptfs_writepage, .readpage = ecryptfs_readpage, .write_begin = ecryptfs_write_begin, diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 858b3339f381..906af0c1998c 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -75,4 +75,3 @@ config EROFS_FS_ZIP Enable fixed-sized output compression for EROFS. If you don't want to enable compression feature, say N. 
- diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index aea129ddda74..3701c72bacb2 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -2,7 +2,6 @@ /* * Copyright (C) 2019 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #ifndef __EROFS_FS_COMPRESS_H #define __EROFS_FS_COMPRESS_H @@ -85,4 +84,3 @@ int z_erofs_decompress(struct z_erofs_decompress_req *rq, struct list_head *pagepool); #endif - diff --git a/fs/erofs/data.c b/fs/erofs/data.c index ebac756cb2a3..3787a5fb0a42 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include "internal.h" #include <linux/prefetch.h> @@ -315,4 +314,3 @@ const struct address_space_operations erofs_raw_access_aops = { .readahead = erofs_raw_access_readahead, .bmap = erofs_bmap, }; - diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 88e33addf229..a5bc4b1b7813 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2019 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include "compress.h" #include <linux/module.h> @@ -407,4 +406,3 @@ int z_erofs_decompress(struct z_erofs_decompress_req *rq, return z_erofs_shifted_transform(rq, pagepool); return z_erofs_decompress_generic(rq, pagepool); } - diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index 2776bb832127..eee9b0b31b63 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. 
* https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include "internal.h" @@ -139,4 +138,3 @@ const struct file_operations erofs_dir_fops = { .read = generic_read_dir, .iterate_shared = erofs_readdir, }; - diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 8739d3adf51f..0f8da74570b4 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -4,7 +4,6 @@ * * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #ifndef __EROFS_FS_H #define __EROFS_FS_H @@ -348,4 +347,3 @@ static inline void erofs_check_ondisk_layout_definitions(void) } #endif - diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 7ed2d7391692..aa8a0d770ba3 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include "xattr.h" @@ -374,4 +373,3 @@ const struct inode_operations erofs_fast_symlink_iops = { .listxattr = erofs_listxattr, .get_acl = erofs_get_acl, }; - diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index f92e3e32b9f4..543c2ff97d30 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -2,7 +2,6 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #ifndef __EROFS_INTERNAL_H #define __EROFS_INTERNAL_H @@ -469,4 +468,3 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb, #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #endif /* __EROFS_INTERNAL_H */ - diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index 3a81e1f7fc06..a8271ce5e13f 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. 
* https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include "xattr.h" @@ -247,4 +246,3 @@ const struct inode_operations erofs_dir_iops = { .listxattr = erofs_listxattr, .get_acl = erofs_get_acl, }; - diff --git a/fs/erofs/super.c b/fs/erofs/super.c index bbf3bbd908e0..8fc6c04b54f4 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include <linux/module.h> #include <linux/buffer_head.h> @@ -285,6 +284,7 @@ static int erofs_read_superblock(struct super_block *sb) goto out; } + ret = -EINVAL; blkszbits = dsb->blkszbits; /* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */ if (blkszbits != LOG_BLOCK_SIZE) { @@ -751,4 +751,3 @@ module_exit(erofs_module_exit); MODULE_DESCRIPTION("Enhanced ROM File System"); MODULE_AUTHOR("Gao Xiang, Chao Yu, Miao Xie, CONSUMER BG, HUAWEI Inc."); MODULE_LICENSE("GPL"); - diff --git a/fs/erofs/tagptr.h b/fs/erofs/tagptr.h index a72897c86744..64ceb7270b5c 100644 --- a/fs/erofs/tagptr.h +++ b/fs/erofs/tagptr.h @@ -1,8 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* * A tagged pointer implementation - * - * Copyright (C) 2018 Gao Xiang <gaoxiang25@huawei.com> */ #ifndef __EROFS_FS_TAGPTR_H #define __EROFS_FS_TAGPTR_H @@ -107,4 +105,3 @@ tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); }) *ptptr; }) #endif /* __EROFS_FS_TAGPTR_H */ - diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index 6758c5b19f7c..bd86067a63f7 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2018 HUAWEI, Inc. 
* https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include "internal.h" #include <linux/pagevec.h> @@ -278,4 +277,3 @@ void erofs_exit_shrinker(void) unregister_shrinker(&erofs_shrinker_info); } #endif /* !CONFIG_EROFS_FS_ZIP */ - diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 47314a26767a..8dd54b420a1d 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include <linux/security.h> #include "xattr.h" @@ -709,4 +708,3 @@ struct posix_acl *erofs_get_acl(struct inode *inode, int type) return acl; } #endif - diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h index 815304bd335f..366dcb400525 100644 --- a/fs/erofs/xattr.h +++ b/fs/erofs/xattr.h @@ -2,7 +2,6 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #ifndef __EROFS_XATTR_H #define __EROFS_XATTR_H diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 78e4b598ecca..cb4d0889eca9 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include "zdata.h" #include "compress.h" @@ -380,7 +379,6 @@ static int z_erofs_attach_page(struct z_erofs_collector *clt, enum z_erofs_page_type type) { int ret; - bool occupied; /* give priority for inplaceio */ if (clt->mode >= COLLECT_PRIMARY && @@ -388,8 +386,7 @@ static int z_erofs_attach_page(struct z_erofs_collector *clt, z_erofs_try_inplace_io(clt, page)) return 0; - ret = z_erofs_pagevec_enqueue(&clt->vector, - page, type, &occupied); + ret = z_erofs_pagevec_enqueue(&clt->vector, page, type); clt->cl->vcnt += (unsigned int)ret; return ret ? 
0 : -EAGAIN; @@ -1471,4 +1468,3 @@ const struct address_space_operations z_erofs_aops = { .readpage = z_erofs_readpage, .readahead = z_erofs_readahead, }; - diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 942ee69dff6a..3a008f1b9f78 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -2,7 +2,6 @@ /* * Copyright (C) 2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #ifndef __EROFS_FS_ZDATA_H #define __EROFS_FS_ZDATA_H diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index efaf32596b97..f68aea4baed7 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2018-2019 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include "internal.h" #include <asm/unaligned.h> @@ -597,4 +596,3 @@ out: DBG_BUGON(err < 0 && err != -ENOMEM); return err; } - diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h index 1d67cbd38704..dfd7fe0503bb 100644 --- a/fs/erofs/zpvec.h +++ b/fs/erofs/zpvec.h @@ -2,7 +2,6 @@ /* * Copyright (C) 2018 HUAWEI, Inc. 
* https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #ifndef __EROFS_FS_ZPVEC_H #define __EROFS_FS_ZPVEC_H @@ -107,10 +106,8 @@ static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor, static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor, struct page *page, - enum z_erofs_page_type type, - bool *occupied) + enum z_erofs_page_type type) { - *occupied = false; if (!ctor->next && type) if (ctor->index + 1 == ctor->nr) return false; @@ -125,7 +122,6 @@ static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor, /* should remind that collector->next never equal to 1, 2 */ if (type == (uintptr_t)ctor->next) { ctor->next = page; - *occupied = true; } ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, page, type); return true; @@ -154,4 +150,3 @@ z_erofs_pagevec_dequeue(struct z_erofs_pagevec_ctor *ctor, return tagptr_unfold_ptr(t); } #endif - diff --git a/fs/exec.c b/fs/exec.c index 18594f11c31f..38f63451b928 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -84,9 +84,6 @@ static DEFINE_RWLOCK(binfmt_lock); void __register_binfmt(struct linux_binfmt * fmt, int insert) { - BUG_ON(!fmt); - if (WARN_ON(!fmt->load_binary)) - return; write_lock(&binfmt_lock); insert ? list_add(&fmt->lh, &formats) : list_add_tail(&fmt->lh, &formats); @@ -1360,6 +1357,10 @@ int begin_new_exec(struct linux_binprm * bprm) WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1); flush_signal_handlers(me, 0); + retval = set_cred_ucounts(bprm->cred); + if (retval < 0) + goto out_unlock; + /* * install the new credentials for this executable */ @@ -1874,7 +1875,7 @@ static int do_execveat_common(int fd, struct filename *filename, * whether NPROC limit is still exceeded. 
*/ if ((current->flags & PF_NPROC_EXCEEDED) && - atomic_read(¤t_user()->processes) > rlimit(RLIMIT_NPROC)) { + is_ucounts_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { retval = -EAGAIN; goto out_ret; } diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index c4523648472a..cb1c0d8c1714 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -63,7 +63,7 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb, static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_entry *dir_entry) { int i, dentries_per_clu, dentries_per_clu_bits = 0, num_ext; - unsigned int type, clu_offset; + unsigned int type, clu_offset, max_dentries; sector_t sector; struct exfat_chain dir, clu; struct exfat_uni_name uni_name; @@ -86,6 +86,8 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent dentries_per_clu = sbi->dentries_per_clu; dentries_per_clu_bits = ilog2(dentries_per_clu); + max_dentries = (unsigned int)min_t(u64, MAX_EXFAT_DENTRIES, + (u64)sbi->num_clusters << dentries_per_clu_bits); clu_offset = dentry >> dentries_per_clu_bits; exfat_chain_dup(&clu, &dir); @@ -109,7 +111,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent } } - while (clu.dir != EXFAT_EOF_CLUSTER) { + while (clu.dir != EXFAT_EOF_CLUSTER && dentry < max_dentries) { i = dentry & (dentries_per_clu - 1); for ( ; i < dentries_per_clu; i++, dentry++) { @@ -245,7 +247,7 @@ static int exfat_iterate(struct file *filp, struct dir_context *ctx) if (err) goto unlock; get_new: - if (cpos >= i_size_read(inode)) + if (ei->flags == ALLOC_NO_FAT_CHAIN && cpos >= i_size_read(inode)) goto end_of_dir; err = exfat_readdir(inode, &cpos, &de); diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 1803ef3220fd..ca37d4344361 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -491,6 +491,7 @@ int exfat_block_truncate_page(struct inode *inode, loff_t from) } static const struct address_space_operations exfat_aops = { + 
.set_page_dirty = __set_page_dirty_buffers, .readpage = exfat_readpage, .readahead = exfat_readahead, .writepage = exfat_writepage, diff --git a/fs/exfat/super.c b/fs/exfat/super.c index d38d17a77e76..5539ffc20d16 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -690,7 +690,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc) if (!sb->s_root) { exfat_err(sb, "failed to get the root dentry"); err = -ENOMEM; - goto put_inode; + goto free_table; } return 0; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 68178b2234bd..dadb121beb22 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -961,6 +961,7 @@ ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc } const struct address_space_operations ext2_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = ext2_readpage, .readahead = ext2_readahead, .writepage = ext2_writepage, @@ -975,6 +976,7 @@ const struct address_space_operations ext2_aops = { }; const struct address_space_operations ext2_nobh_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = ext2_readpage, .readahead = ext2_readahead, .writepage = ext2_nobh_writepage, @@ -990,7 +992,7 @@ const struct address_space_operations ext2_nobh_aops = { static const struct address_space_operations ext2_dax_aops = { .writepages = ext2_dax_writepages, .direct_IO = noop_direct_IO, - .set_page_dirty = noop_set_page_dirty, + .set_page_dirty = __set_page_dirty_no_writeback, .invalidatepage = noop_invalidatepage, }; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c5cf700e2c8f..d8de607849df 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3701,7 +3701,7 @@ static const struct address_space_operations ext4_da_aops = { static const struct address_space_operations ext4_dax_aops = { .writepages = ext4_dax_writepages, .direct_IO = noop_direct_IO, - .set_page_dirty = noop_set_page_dirty, + .set_page_dirty = __set_page_dirty_no_writeback, .bmap = ext4_bmap, .invalidatepage = 
noop_invalidatepage, .swap_activate = ext4_iomap_swap_activate, diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f795049e63d5..6c208108d69c 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -444,7 +444,7 @@ static int f2fs_set_meta_page_dirty(struct page *page) if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); - f2fs_set_page_private(page, 0); + set_page_private_reference(page); return 1; } return 0; @@ -1018,7 +1018,7 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page) inode_inc_dirty_pages(inode); spin_unlock(&sbi->inode_lock[type]); - f2fs_set_page_private(page, 0); + set_page_private_reference(page); } void f2fs_remove_dirty_inode(struct inode *inode) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 925a5ca3744a..455561826c7d 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -12,9 +12,11 @@ #include <linux/lzo.h> #include <linux/lz4.h> #include <linux/zstd.h> +#include <linux/pagevec.h> #include "f2fs.h" #include "node.h" +#include "segment.h" #include <trace/events/f2fs.h> static struct kmem_cache *cic_entry_slab; @@ -74,7 +76,7 @@ bool f2fs_is_compressed_page(struct page *page) return false; if (!page_private(page)) return false; - if (IS_ATOMIC_WRITTEN_PAGE(page) || IS_DUMMY_WRITTEN_PAGE(page)) + if (page_private_nonpointer(page)) return false; f2fs_bug_on(F2FS_M_SB(page->mapping), @@ -85,8 +87,7 @@ bool f2fs_is_compressed_page(struct page *page) static void f2fs_set_compressed_page(struct page *page, struct inode *inode, pgoff_t index, void *data) { - SetPagePrivate(page); - set_page_private(page, (unsigned long)data); + attach_page_private(page, (void *)data); /* i_crypto_info and iv index */ page->index = index; @@ -589,8 +590,7 @@ static void f2fs_compress_free_page(struct page *page) { if (!page) return; - set_page_private(page, (unsigned long)NULL); - ClearPagePrivate(page); + detach_page_private(page); page->mapping = NULL; 
unlock_page(page); mempool_free(page, compress_page_pool); @@ -738,7 +738,7 @@ out: return ret; } -static void f2fs_decompress_cluster(struct decompress_io_ctx *dic) +void f2fs_decompress_cluster(struct decompress_io_ctx *dic) { struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); struct f2fs_inode_info *fi = F2FS_I(dic->inode); @@ -837,7 +837,8 @@ out_end_io: * page being waited on in the cluster, and if so, it decompresses the cluster * (or in the case of a failure, cleans up without actually decompressing). */ -void f2fs_end_read_compressed_page(struct page *page, bool failed) +void f2fs_end_read_compressed_page(struct page *page, bool failed, + block_t blkaddr) { struct decompress_io_ctx *dic = (struct decompress_io_ctx *)page_private(page); @@ -847,6 +848,9 @@ void f2fs_end_read_compressed_page(struct page *page, bool failed) if (failed) WRITE_ONCE(dic->failed, true); + else if (blkaddr) + f2fs_cache_compressed_page(sbi, page, + dic->inode->i_ino, blkaddr); if (atomic_dec_and_test(&dic->remaining_pages)) f2fs_decompress_cluster(dic); @@ -876,7 +880,7 @@ bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index) return is_page_in_cluster(cc, index); } -static bool __cluster_may_compress(struct compress_ctx *cc) +static bool cluster_has_invalid_data(struct compress_ctx *cc) { loff_t i_size = i_size_read(cc->inode); unsigned nr_pages = DIV_ROUND_UP(i_size, PAGE_SIZE); @@ -889,19 +893,22 @@ static bool __cluster_may_compress(struct compress_ctx *cc) /* beyond EOF */ if (page->index >= nr_pages) - return false; + return true; } - return true; + return false; } -static int __f2fs_cluster_blocks(struct compress_ctx *cc, bool compr) +static int __f2fs_cluster_blocks(struct inode *inode, + unsigned int cluster_idx, bool compr) { struct dnode_of_data dn; + unsigned int cluster_size = F2FS_I(inode)->i_cluster_size; + unsigned int start_idx = cluster_idx << + F2FS_I(inode)->i_log_cluster_size; int ret; - set_new_dnode(&dn, cc->inode, NULL, NULL, 0); - ret = 
f2fs_get_dnode_of_data(&dn, start_idx_of_cluster(cc), - LOOKUP_NODE); + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); if (ret) { if (ret == -ENOENT) ret = 0; @@ -912,7 +919,7 @@ static int __f2fs_cluster_blocks(struct compress_ctx *cc, bool compr) int i; ret = 1; - for (i = 1; i < cc->cluster_size; i++) { + for (i = 1; i < cluster_size; i++) { block_t blkaddr; blkaddr = data_blkaddr(dn.inode, @@ -925,6 +932,10 @@ static int __f2fs_cluster_blocks(struct compress_ctx *cc, bool compr) ret++; } } + + f2fs_bug_on(F2FS_I_SB(inode), + !compr && ret != cluster_size && + !is_inode_flag_set(inode, FI_COMPRESS_RELEASED)); } fail: f2fs_put_dnode(&dn); @@ -934,25 +945,15 @@ fail: /* return # of compressed blocks in compressed cluster */ static int f2fs_compressed_blocks(struct compress_ctx *cc) { - return __f2fs_cluster_blocks(cc, true); + return __f2fs_cluster_blocks(cc->inode, cc->cluster_idx, true); } /* return # of valid blocks in compressed cluster */ -static int f2fs_cluster_blocks(struct compress_ctx *cc) -{ - return __f2fs_cluster_blocks(cc, false); -} - int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index) { - struct compress_ctx cc = { - .inode = inode, - .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, - .cluster_size = F2FS_I(inode)->i_cluster_size, - .cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size, - }; - - return f2fs_cluster_blocks(&cc); + return __f2fs_cluster_blocks(inode, + index >> F2FS_I(inode)->i_log_cluster_size, + false); } static bool cluster_may_compress(struct compress_ctx *cc) @@ -961,13 +962,11 @@ static bool cluster_may_compress(struct compress_ctx *cc) return false; if (f2fs_is_atomic_file(cc->inode)) return false; - if (f2fs_is_mmap_file(cc->inode)) - return false; if (!f2fs_cluster_is_full(cc)) return false; if (unlikely(f2fs_cp_error(F2FS_I_SB(cc->inode)))) return false; - return __cluster_may_compress(cc); + return !cluster_has_invalid_data(cc); } static void 
set_cluster_writeback(struct compress_ctx *cc) @@ -995,21 +994,16 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); struct address_space *mapping = cc->inode->i_mapping; struct page *page; - struct dnode_of_data dn; sector_t last_block_in_bio; unsigned fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT; pgoff_t start_idx = start_idx_of_cluster(cc); int i, ret; - bool prealloc; retry: - ret = f2fs_cluster_blocks(cc); + ret = f2fs_is_compressed_cluster(cc->inode, start_idx); if (ret <= 0) return ret; - /* compressed case */ - prealloc = (ret < cc->cluster_size); - ret = f2fs_init_compress_ctx(cc); if (ret) return ret; @@ -1067,25 +1061,6 @@ release_and_retry: } } - if (prealloc) { - f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); - - set_new_dnode(&dn, cc->inode, NULL, NULL, 0); - - for (i = cc->cluster_size - 1; i > 0; i--) { - ret = f2fs_get_block(&dn, start_idx + i); - if (ret) { - i = cc->cluster_size; - break; - } - - if (dn.data_blkaddr != NEW_ADDR) - break; - } - - f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); - } - if (likely(!ret)) { *fsdata = cc->rpages; *pagep = cc->rpages[offset_in_cluster(cc, index)]; @@ -1216,6 +1191,12 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, loff_t psize; int i, err; + /* we should bypass data pages to proceed the kworkder jobs */ + if (unlikely(f2fs_cp_error(sbi))) { + mapping_set_error(cc->rpages[0]->mapping, -EIO); + goto out_free; + } + if (IS_NOQUOTA(inode)) { /* * We need to wait for node_write to avoid block allocation during @@ -1399,7 +1380,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page) for (i = 0; i < cic->nr_rpages; i++) { WARN_ON(!cic->rpages[i]); - clear_cold_data(cic->rpages[i]); + clear_page_private_gcing(cic->rpages[i]); end_page_writeback(cic->rpages[i]); } @@ -1685,6 +1666,164 @@ void f2fs_put_page_dic(struct page *page) f2fs_put_dic(dic); } +const struct address_space_operations 
f2fs_compress_aops = { + .releasepage = f2fs_release_page, + .invalidatepage = f2fs_invalidate_page, +}; + +struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi) +{ + return sbi->compress_inode->i_mapping; +} + +void f2fs_invalidate_compress_page(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + if (!sbi->compress_inode) + return; + invalidate_mapping_pages(COMPRESS_MAPPING(sbi), blkaddr, blkaddr); +} + +void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, + nid_t ino, block_t blkaddr) +{ + struct page *cpage; + int ret; + + if (!test_opt(sbi, COMPRESS_CACHE)) + return; + + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) + return; + + if (!f2fs_available_free_memory(sbi, COMPRESS_PAGE)) + return; + + cpage = find_get_page(COMPRESS_MAPPING(sbi), blkaddr); + if (cpage) { + f2fs_put_page(cpage, 0); + return; + } + + cpage = alloc_page(__GFP_NOWARN | __GFP_IO); + if (!cpage) + return; + + ret = add_to_page_cache_lru(cpage, COMPRESS_MAPPING(sbi), + blkaddr, GFP_NOFS); + if (ret) { + f2fs_put_page(cpage, 0); + return; + } + + set_page_private_data(cpage, ino); + + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) + goto out; + + memcpy(page_address(cpage), page_address(page), PAGE_SIZE); + SetPageUptodate(cpage); +out: + f2fs_put_page(cpage, 1); +} + +bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page, + block_t blkaddr) +{ + struct page *cpage; + bool hitted = false; + + if (!test_opt(sbi, COMPRESS_CACHE)) + return false; + + cpage = f2fs_pagecache_get_page(COMPRESS_MAPPING(sbi), + blkaddr, FGP_LOCK | FGP_NOWAIT, GFP_NOFS); + if (cpage) { + if (PageUptodate(cpage)) { + atomic_inc(&sbi->compress_page_hit); + memcpy(page_address(page), + page_address(cpage), PAGE_SIZE); + hitted = true; + } + f2fs_put_page(cpage, 1); + } + + return hitted; +} + +void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct address_space *mapping = 
sbi->compress_inode->i_mapping; + struct pagevec pvec; + pgoff_t index = 0; + pgoff_t end = MAX_BLKADDR(sbi); + + if (!mapping->nrpages) + return; + + pagevec_init(&pvec); + + do { + unsigned int nr_pages; + int i; + + nr_pages = pagevec_lookup_range(&pvec, mapping, + &index, end - 1); + if (!nr_pages) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (page->index > end) + break; + + lock_page(page); + if (page->mapping != mapping) { + unlock_page(page); + continue; + } + + if (ino != get_page_private_data(page)) { + unlock_page(page); + continue; + } + + generic_error_remove_page(mapping, page); + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } while (index < end); +} + +int f2fs_init_compress_inode(struct f2fs_sb_info *sbi) +{ + struct inode *inode; + + if (!test_opt(sbi, COMPRESS_CACHE)) + return 0; + + inode = f2fs_iget(sbi->sb, F2FS_COMPRESS_INO(sbi)); + if (IS_ERR(inode)) + return PTR_ERR(inode); + sbi->compress_inode = inode; + + sbi->compress_percent = COMPRESS_PERCENT; + sbi->compress_watermark = COMPRESS_WATERMARK; + + atomic_set(&sbi->compress_page_hit, 0); + + return 0; +} + +void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) +{ + if (!sbi->compress_inode) + return; + iput(sbi->compress_inode); + sbi->compress_inode = NULL; +} + int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 009a09fb9d88..d2cf48c5a2e4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -58,18 +58,19 @@ static bool __is_cp_guaranteed(struct page *page) if (!mapping) return false; - if (f2fs_is_compressed_page(page)) - return false; - inode = mapping->host; sbi = F2FS_I_SB(inode); if (inode->i_ino == F2FS_META_INO(sbi) || inode->i_ino == F2FS_NODE_INO(sbi) || - S_ISDIR(inode->i_mode) || - (S_ISREG(inode->i_mode) && + S_ISDIR(inode->i_mode)) + return true; + + if (f2fs_is_compressed_page(page)) + return false; + if 
((S_ISREG(inode->i_mode) && (f2fs_is_atomic_file(inode) || IS_NOQUOTA(inode))) || - is_cold_data(page)) + page_private_gcing(page)) return true; return false; } @@ -131,7 +132,7 @@ static void f2fs_finish_read_bio(struct bio *bio) if (f2fs_is_compressed_page(page)) { if (bio->bi_status) - f2fs_end_read_compressed_page(page, true); + f2fs_end_read_compressed_page(page, true, 0); f2fs_put_page_dic(page); continue; } @@ -227,15 +228,19 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx) struct bio_vec *bv; struct bvec_iter_all iter_all; bool all_compressed = true; + block_t blkaddr = SECTOR_TO_BLOCK(ctx->bio->bi_iter.bi_sector); bio_for_each_segment_all(bv, ctx->bio, iter_all) { struct page *page = bv->bv_page; /* PG_error was set if decryption failed. */ if (f2fs_is_compressed_page(page)) - f2fs_end_read_compressed_page(page, PageError(page)); + f2fs_end_read_compressed_page(page, PageError(page), + blkaddr); else all_compressed = false; + + blkaddr++; } /* @@ -299,9 +304,8 @@ static void f2fs_write_end_io(struct bio *bio) struct page *page = bvec->bv_page; enum count_type type = WB_DATA_TYPE(page); - if (IS_DUMMY_WRITTEN_PAGE(page)) { - set_page_private(page, (unsigned long)NULL); - ClearPagePrivate(page); + if (page_private_dummy(page)) { + clear_page_private_dummy(page); unlock_page(page); mempool_free(page, sbi->write_io_dummy); @@ -331,7 +335,7 @@ static void f2fs_write_end_io(struct bio *bio) dec_page_count(sbi, type); if (f2fs_in_warm_node_list(sbi, page)) f2fs_del_fsync_node_entry(sbi, page); - clear_cold_data(page); + clear_page_private_gcing(page); end_page_writeback(page); } if (!get_pages(sbi, F2FS_WB_CP_DATA) && @@ -455,10 +459,11 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, GFP_NOIO | __GFP_NOFAIL); f2fs_bug_on(sbi, !page); - zero_user_segment(page, 0, PAGE_SIZE); - SetPagePrivate(page); - set_page_private(page, DUMMY_WRITTEN_PAGE); lock_page(page); + + zero_user_segment(page, 0, PAGE_SIZE); + 
set_page_private_dummy(page); + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) f2fs_bug_on(sbi, 1); } @@ -1351,9 +1356,11 @@ alloc: old_blkaddr = dn->data_blkaddr; f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr, &sum, seg_type, NULL); - if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) { invalidate_mapping_pages(META_MAPPING(sbi), old_blkaddr, old_blkaddr); + f2fs_invalidate_compress_page(sbi, old_blkaddr); + } f2fs_update_data_blkaddr(dn, dn->data_blkaddr); /* @@ -2173,7 +2180,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, goto out_put_dnode; } - for (i = 0; i < dic->nr_cpages; i++) { + for (i = 0; i < cc->nr_cpages; i++) { struct page *page = dic->cpages[i]; block_t blkaddr; struct bio_post_read_ctx *ctx; @@ -2181,6 +2188,14 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, blkaddr = data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i + 1); + f2fs_wait_on_block_writeback(inode, blkaddr); + + if (f2fs_load_compressed_page(sbi, page, blkaddr)) { + if (atomic_dec_and_test(&dic->remaining_pages)) + f2fs_decompress_cluster(dic); + continue; + } + if (bio && (!page_is_mergeable(sbi, bio, *last_block_in_bio, blkaddr) || !f2fs_crypt_mergeable_bio(bio, inode, page->index, NULL))) { @@ -2202,8 +2217,6 @@ submit_and_realloc: } } - f2fs_wait_on_block_writeback(inode, blkaddr); - if (bio_add_page(bio, page, blocksize, 0) < blocksize) goto submit_and_realloc; @@ -2459,6 +2472,10 @@ static inline bool check_inplace_update_policy(struct inode *inode, bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio) { + /* swap file is migrating in aligned write mode */ + if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) + return false; + if (f2fs_is_pinned_file(inode)) return true; @@ -2481,10 +2498,15 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) return true; if (f2fs_is_atomic_file(inode)) return 
true; + + /* swap file is migrating in aligned write mode */ + if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) + return true; + if (fio) { - if (is_cold_data(fio->page)) + if (page_private_gcing(fio->page)) return true; - if (IS_ATOMIC_WRITTEN_PAGE(fio->page)) + if (page_private_dummy(fio->page)) return true; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) && f2fs_is_checkpointed_data(sbi, fio->old_blkaddr))) @@ -2540,7 +2562,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) /* This page is already truncated */ if (fio->old_blkaddr == NULL_ADDR) { ClearPageUptodate(page); - clear_cold_data(page); + clear_page_private_gcing(page); goto out_writepage; } got_it: @@ -2750,7 +2772,7 @@ out: inode_dec_dirty_pages(inode); if (err) { ClearPageUptodate(page); - clear_cold_data(page); + clear_page_private_gcing(page); } if (wbc->for_reclaim) { @@ -3224,7 +3246,7 @@ restart: f2fs_do_read_inline_data(page, ipage); set_inode_flag(inode, FI_DATA_EXIST); if (inode->i_nlink) - set_inline_node(ipage); + set_page_private_inline(ipage); } else { err = f2fs_convert_inline_page(&dn, page); if (err) @@ -3615,12 +3637,20 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, } } - clear_cold_data(page); + clear_page_private_gcing(page); + + if (test_opt(sbi, COMPRESS_CACHE)) { + if (f2fs_compressed_file(inode)) + f2fs_invalidate_compress_pages(sbi, inode->i_ino); + if (inode->i_ino == F2FS_COMPRESS_INO(sbi)) + clear_page_private_data(page); + } - if (IS_ATOMIC_WRITTEN_PAGE(page)) + if (page_private_atomic(page)) return f2fs_drop_inmem_page(inode, page); - f2fs_clear_page_private(page); + detach_page_private(page); + set_page_private(page, 0); } int f2fs_release_page(struct page *page, gfp_t wait) @@ -3630,11 +3660,23 @@ int f2fs_release_page(struct page *page, gfp_t wait) return 0; /* This is atomic written page, keep Private */ - if (IS_ATOMIC_WRITTEN_PAGE(page)) + if (page_private_atomic(page)) return 0; - clear_cold_data(page); - 
f2fs_clear_page_private(page); + if (test_opt(F2FS_P_SB(page), COMPRESS_CACHE)) { + struct f2fs_sb_info *sbi = F2FS_P_SB(page); + struct inode *inode = page->mapping->host; + + if (f2fs_compressed_file(inode)) + f2fs_invalidate_compress_pages(sbi, inode->i_ino); + if (inode->i_ino == F2FS_COMPRESS_INO(sbi)) + clear_page_private_data(page); + } + + clear_page_private_gcing(page); + + detach_page_private(page); + set_page_private(page, 0); return 1; } @@ -3650,7 +3692,7 @@ static int f2fs_set_data_page_dirty(struct page *page) return __set_page_dirty_nobuffers(page); if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { - if (!IS_ATOMIC_WRITTEN_PAGE(page)) { + if (!page_private_atomic(page)) { f2fs_register_inmem_page(inode, page); return 1; } @@ -3742,7 +3784,7 @@ int f2fs_migrate_page(struct address_space *mapping, { int rc, extra_count; struct f2fs_inode_info *fi = F2FS_I(mapping->host); - bool atomic_written = IS_ATOMIC_WRITTEN_PAGE(page); + bool atomic_written = page_private_atomic(page); BUG_ON(PageWriteback(page)); @@ -3777,9 +3819,16 @@ int f2fs_migrate_page(struct address_space *mapping, get_page(newpage); } + /* guarantee to start from no stale private field */ + set_page_private(newpage, 0); if (PagePrivate(page)) { - f2fs_set_page_private(newpage, page_private(page)); - f2fs_clear_page_private(page); + set_page_private(newpage, page_private(page)); + SetPagePrivate(newpage); + get_page(newpage); + + set_page_private(page, 0); + ClearPagePrivate(page); + put_page(page); } if (mode != MIGRATE_SYNC_NO_COPY) @@ -3792,67 +3841,66 @@ int f2fs_migrate_page(struct address_space *mapping, #endif #ifdef CONFIG_SWAP -static int f2fs_is_file_aligned(struct inode *inode) +static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, + unsigned int blkcnt) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - block_t main_blkaddr = SM_I(sbi)->main_blkaddr; - block_t cur_lblock; - block_t last_lblock; - block_t pblock; - unsigned long nr_pblocks; 
- unsigned int blocks_per_sec = BLKS_PER_SEC(sbi); - unsigned int not_aligned = 0; + unsigned int blkofs; + unsigned int blk_per_sec = BLKS_PER_SEC(sbi); + unsigned int secidx = start_blk / blk_per_sec; + unsigned int end_sec = secidx + blkcnt / blk_per_sec; int ret = 0; - cur_lblock = 0; - last_lblock = bytes_to_blks(inode, i_size_read(inode)); + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); - while (cur_lblock < last_lblock) { - struct f2fs_map_blocks map; + set_inode_flag(inode, FI_ALIGNED_WRITE); - memset(&map, 0, sizeof(map)); - map.m_lblk = cur_lblock; - map.m_len = last_lblock - cur_lblock; - map.m_next_pgofs = NULL; - map.m_next_extent = NULL; - map.m_seg_type = NO_CHECK_TYPE; - map.m_may_create = false; + for (; secidx < end_sec; secidx++) { + down_write(&sbi->pin_sem); - ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP); - if (ret) - goto out; + f2fs_lock_op(sbi); + f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false); + f2fs_unlock_op(sbi); - /* hole */ - if (!(map.m_flags & F2FS_MAP_FLAGS)) { - f2fs_err(sbi, "Swapfile has holes\n"); - ret = -ENOENT; - goto out; - } + set_inode_flag(inode, FI_DO_DEFRAG); - pblock = map.m_pblk; - nr_pblocks = map.m_len; + for (blkofs = 0; blkofs < blk_per_sec; blkofs++) { + struct page *page; + unsigned int blkidx = secidx * blk_per_sec + blkofs; - if ((pblock - main_blkaddr) & (blocks_per_sec - 1) || - nr_pblocks & (blocks_per_sec - 1)) { - if (f2fs_is_pinned_file(inode)) { - f2fs_err(sbi, "Swapfile does not align to section"); - ret = -EINVAL; - goto out; + page = f2fs_get_lock_data_page(inode, blkidx, true); + if (IS_ERR(page)) { + up_write(&sbi->pin_sem); + ret = PTR_ERR(page); + goto done; } - not_aligned++; + + set_page_dirty(page); + f2fs_put_page(page, 1); } - cur_lblock += nr_pblocks; + clear_inode_flag(inode, FI_DO_DEFRAG); + + ret = filemap_fdatawrite(inode->i_mapping); + + up_write(&sbi->pin_sem); + + if (ret) + break; } - if (not_aligned) - 
f2fs_warn(sbi, "Swapfile (%u) is not align to section: \n" - "\t1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate()", - not_aligned); -out: + +done: + clear_inode_flag(inode, FI_DO_DEFRAG); + clear_inode_flag(inode, FI_ALIGNED_WRITE); + + up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + return ret; } -static int check_swap_activate_fast(struct swap_info_struct *sis, +static int check_swap_activate(struct swap_info_struct *sis, struct file *swap_file, sector_t *span) { struct address_space *mapping = swap_file->f_mapping; @@ -3865,7 +3913,8 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, sector_t highest_pblock = 0; int nr_extents = 0; unsigned long nr_pblocks; - unsigned int blocks_per_sec = BLKS_PER_SEC(sbi); + unsigned int blks_per_sec = BLKS_PER_SEC(sbi); + unsigned int sec_blks_mask = BLKS_PER_SEC(sbi) - 1; unsigned int not_aligned = 0; int ret = 0; @@ -3878,7 +3927,7 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, while (cur_lblock < last_lblock && cur_lblock < sis->max) { struct f2fs_map_blocks map; - +retry: cond_resched(); memset(&map, 0, sizeof(map)); @@ -3895,7 +3944,7 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, /* hole */ if (!(map.m_flags & F2FS_MAP_FLAGS)) { - f2fs_err(sbi, "Swapfile has holes\n"); + f2fs_err(sbi, "Swapfile has holes"); ret = -EINVAL; goto out; } @@ -3903,16 +3952,28 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, pblock = map.m_pblk; nr_pblocks = map.m_len; - if ((pblock - SM_I(sbi)->main_blkaddr) & (blocks_per_sec - 1) || - nr_pblocks & (blocks_per_sec - 1)) { - if (f2fs_is_pinned_file(inode)) { - f2fs_err(sbi, "Swapfile does not align to section"); - ret = -EINVAL; - goto out; - } + if ((pblock - SM_I(sbi)->main_blkaddr) & sec_blks_mask || + nr_pblocks & sec_blks_mask) { not_aligned++; - } + nr_pblocks = roundup(nr_pblocks, blks_per_sec); + if (cur_lblock + nr_pblocks > sis->max) + nr_pblocks -= 
blks_per_sec; + + if (!nr_pblocks) { + /* this extent is last one */ + nr_pblocks = map.m_len; + f2fs_warn(sbi, "Swapfile: last extent is not aligned to section"); + goto next; + } + + ret = f2fs_migrate_blocks(inode, cur_lblock, + nr_pblocks); + if (ret) + goto out; + goto retry; + } +next: if (cur_lblock + nr_pblocks >= sis->max) nr_pblocks = sis->max - cur_lblock; @@ -3939,120 +4000,11 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, sis->max = cur_lblock; sis->pages = cur_lblock - 1; sis->highest_bit = cur_lblock - 1; - - if (not_aligned) - f2fs_warn(sbi, "Swapfile (%u) is not align to section: \n" - "\t1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate()", - not_aligned); -out: - return ret; -} - -/* Copied from generic_swapfile_activate() to check any holes */ -static int check_swap_activate(struct swap_info_struct *sis, - struct file *swap_file, sector_t *span) -{ - struct address_space *mapping = swap_file->f_mapping; - struct inode *inode = mapping->host; - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - unsigned blocks_per_page; - unsigned long page_no; - sector_t probe_block; - sector_t last_block; - sector_t lowest_block = -1; - sector_t highest_block = 0; - int nr_extents = 0; - int ret = 0; - - if (PAGE_SIZE == F2FS_BLKSIZE) - return check_swap_activate_fast(sis, swap_file, span); - - ret = f2fs_is_file_aligned(inode); - if (ret) - goto out; - - blocks_per_page = bytes_to_blks(inode, PAGE_SIZE); - - /* - * Map all the blocks into the extent list. This code doesn't try - * to be very smart. 
- */ - probe_block = 0; - page_no = 0; - last_block = bytes_to_blks(inode, i_size_read(inode)); - while ((probe_block + blocks_per_page) <= last_block && - page_no < sis->max) { - unsigned block_in_page; - sector_t first_block; - sector_t block = 0; - - cond_resched(); - - block = probe_block; - ret = bmap(inode, &block); - if (ret) - goto out; - if (!block) - goto bad_bmap; - first_block = block; - - /* - * It must be PAGE_SIZE aligned on-disk - */ - if (first_block & (blocks_per_page - 1)) { - probe_block++; - goto reprobe; - } - - for (block_in_page = 1; block_in_page < blocks_per_page; - block_in_page++) { - - block = probe_block + block_in_page; - ret = bmap(inode, &block); - if (ret) - goto out; - if (!block) - goto bad_bmap; - - if (block != first_block + block_in_page) { - /* Discontiguity */ - probe_block++; - goto reprobe; - } - } - - first_block >>= (PAGE_SHIFT - inode->i_blkbits); - if (page_no) { /* exclude the header page */ - if (first_block < lowest_block) - lowest_block = first_block; - if (first_block > highest_block) - highest_block = first_block; - } - - /* - * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks - */ - ret = add_swap_extent(sis, page_no, 1, first_block); - if (ret < 0) - goto out; - nr_extents += ret; - page_no++; - probe_block += blocks_per_page; -reprobe: - continue; - } - ret = nr_extents; - *span = 1 + highest_block - lowest_block; - if (page_no == 0) - page_no = 1; /* force Empty message */ - sis->max = page_no; - sis->pages = page_no - 1; - sis->highest_bit = page_no - 1; out: + if (not_aligned) + f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(%u * N)", + not_aligned, blks_per_sec * F2FS_BLKSIZE); return ret; -bad_bmap: - f2fs_err(sbi, "Swapfile has holes\n"); - return -EINVAL; } static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, @@ -4067,6 +4019,12 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct 
file *file, if (f2fs_readonly(F2FS_I_SB(inode)->sb)) return -EROFS; + if (f2fs_lfs_mode(F2FS_I_SB(inode))) { + f2fs_err(F2FS_I_SB(inode), + "Swapfile not supported in LFS mode"); + return -EINVAL; + } + ret = f2fs_convert_inline_inode(inode); if (ret) return ret; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index c03949a7ccff..833325038ef3 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -152,6 +152,12 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->node_pages = NODE_MAPPING(sbi)->nrpages; if (sbi->meta_inode) si->meta_pages = META_MAPPING(sbi)->nrpages; +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (sbi->compress_inode) { + si->compress_pages = COMPRESS_MAPPING(sbi)->nrpages; + si->compress_page_hit = atomic_read(&sbi->compress_page_hit); + } +#endif si->nats = NM_I(sbi)->nat_cnt[TOTAL_NAT]; si->dirty_nats = NM_I(sbi)->nat_cnt[DIRTY_NAT]; si->sits = MAIN_SEGS(sbi); @@ -309,6 +315,12 @@ get_cache: si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (sbi->compress_inode) { + unsigned npages = COMPRESS_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; + } +#endif } static int stat_show(struct seq_file *s, void *v) @@ -476,6 +488,7 @@ static int stat_show(struct seq_file *s, void *v) "volatile IO: %4d (Max. 
%4d)\n", si->inmem_pages, si->aw_cnt, si->max_aw_cnt, si->vw_cnt, si->max_vw_cnt); + seq_printf(s, " - compress: %4d, hit:%8d\n", si->compress_pages, si->compress_page_hit); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index dc7ce79672b8..456651682daf 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -16,6 +16,10 @@ #include "xattr.h" #include <trace/events/f2fs.h> +#ifdef CONFIG_UNICODE +extern struct kmem_cache *f2fs_cf_name_slab; +#endif + static unsigned long dir_blocks(struct inode *inode) { return ((unsigned long long) (i_size_read(inode) + PAGE_SIZE - 1)) @@ -77,11 +81,10 @@ int f2fs_init_casefolded_name(const struct inode *dir, { #ifdef CONFIG_UNICODE struct super_block *sb = dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); if (IS_CASEFOLDED(dir)) { - fname->cf_name.name = f2fs_kmalloc(sbi, F2FS_NAME_LEN, - GFP_NOFS); + fname->cf_name.name = kmem_cache_alloc(f2fs_cf_name_slab, + GFP_NOFS); if (!fname->cf_name.name) return -ENOMEM; fname->cf_name.len = utf8_casefold(sb->s_encoding, @@ -89,7 +92,7 @@ int f2fs_init_casefolded_name(const struct inode *dir, fname->cf_name.name, F2FS_NAME_LEN); if ((int)fname->cf_name.len <= 0) { - kfree(fname->cf_name.name); + kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name); fname->cf_name.name = NULL; if (sb_has_strict_encoding(sb)) return -EINVAL; @@ -172,8 +175,10 @@ void f2fs_free_filename(struct f2fs_filename *fname) fname->crypto_buf.name = NULL; #endif #ifdef CONFIG_UNICODE - kfree(fname->cf_name.name); - fname->cf_name.name = NULL; + if (fname->cf_name.name) { + kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name); + fname->cf_name.name = NULL; + } #endif } @@ -929,11 +934,15 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, !f2fs_truncate_hole(dir, page->index, page->index + 1)) { f2fs_clear_page_cache_dirty_tag(page); clear_page_dirty_for_io(page); - 
f2fs_clear_page_private(page); ClearPageUptodate(page); - clear_cold_data(page); + + clear_page_private_gcing(page); + inode_dec_dirty_pages(dir); f2fs_remove_dirty_inode(dir); + + detach_page_private(page); + set_page_private(page, 0); } f2fs_put_page(page, 1); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c83d90125ebd..ee8eb33e2c25 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -98,6 +98,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_ATGC 0x08000000 #define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000 #define F2FS_MOUNT_GC_MERGE 0x20000000 +#define F2FS_MOUNT_COMPRESS_CACHE 0x40000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -150,8 +151,10 @@ struct f2fs_mount_info { unsigned char compress_level; /* compress level */ bool compress_chksum; /* compressed data chksum */ unsigned char compress_ext_cnt; /* extension count */ + unsigned char nocompress_ext_cnt; /* nocompress extension count */ int compress_mode; /* compression mode */ unsigned char extensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ + unsigned char noextensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ }; #define F2FS_FEATURE_ENCRYPT 0x0001 @@ -168,6 +171,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_SB_CHKSUM 0x0800 #define F2FS_FEATURE_CASEFOLD 0x1000 #define F2FS_FEATURE_COMPRESSION 0x2000 +#define F2FS_FEATURE_RO 0x4000 #define __F2FS_HAS_FEATURE(raw_super, mask) \ ((raw_super->feature & cpu_to_le32(mask)) != 0) @@ -706,6 +710,8 @@ enum { FI_COMPRESS_CORRUPT, /* indicate compressed cluster is corrupted */ FI_MMAP_FILE, /* indicate file was mmapped */ FI_ENABLE_COMPRESS, /* enable compression in "user" compression mode */ + FI_COMPRESS_RELEASED, /* compressed blocks were released */ + FI_ALIGNED_WRITE, /* enable aligned write */ FI_MAX, /* max flag, never be used */ }; @@ -939,6 +945,7 @@ static inline void set_new_dnode(struct dnode_of_data *dn, struct 
inode *inode, #define NR_CURSEG_DATA_TYPE (3) #define NR_CURSEG_NODE_TYPE (3) #define NR_CURSEG_INMEM_TYPE (2) +#define NR_CURSEG_RO_TYPE (2) #define NR_CURSEG_PERSIST_TYPE (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE) #define NR_CURSEG_TYPE (NR_CURSEG_INMEM_TYPE + NR_CURSEG_PERSIST_TYPE) @@ -1291,17 +1298,119 @@ enum { */ }; +static inline int f2fs_test_bit(unsigned int nr, char *addr); +static inline void f2fs_set_bit(unsigned int nr, char *addr); +static inline void f2fs_clear_bit(unsigned int nr, char *addr); + /* - * this value is set in page as a private data which indicate that - * the page is atomically written, and it is in inmem_pages list. + * Layout of f2fs page.private: + * + * Layout A: lowest bit should be 1 + * | bit0 = 1 | bit1 | bit2 | ... | bit MAX | private data .... | + * bit 0 PAGE_PRIVATE_NOT_POINTER + * bit 1 PAGE_PRIVATE_ATOMIC_WRITE + * bit 2 PAGE_PRIVATE_DUMMY_WRITE + * bit 3 PAGE_PRIVATE_ONGOING_MIGRATION + * bit 4 PAGE_PRIVATE_INLINE_INODE + * bit 5 PAGE_PRIVATE_REF_RESOURCE + * bit 6- f2fs private data + * + * Layout B: lowest bit should be 0 + * page.private is a wrapped pointer. 
*/ -#define ATOMIC_WRITTEN_PAGE ((unsigned long)-1) -#define DUMMY_WRITTEN_PAGE ((unsigned long)-2) +enum { + PAGE_PRIVATE_NOT_POINTER, /* private contains non-pointer data */ + PAGE_PRIVATE_ATOMIC_WRITE, /* data page from atomic write path */ + PAGE_PRIVATE_DUMMY_WRITE, /* data page for padding aligned IO */ + PAGE_PRIVATE_ONGOING_MIGRATION, /* data page which is on-going migrating */ + PAGE_PRIVATE_INLINE_INODE, /* inode page contains inline data */ + PAGE_PRIVATE_REF_RESOURCE, /* dirty page has referenced resources */ + PAGE_PRIVATE_MAX +}; -#define IS_ATOMIC_WRITTEN_PAGE(page) \ - (page_private(page) == ATOMIC_WRITTEN_PAGE) -#define IS_DUMMY_WRITTEN_PAGE(page) \ - (page_private(page) == DUMMY_WRITTEN_PAGE) +#define PAGE_PRIVATE_GET_FUNC(name, flagname) \ +static inline bool page_private_##name(struct page *page) \ +{ \ + return PagePrivate(page) && \ + test_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)) && \ + test_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ +} + +#define PAGE_PRIVATE_SET_FUNC(name, flagname) \ +static inline void set_page_private_##name(struct page *page) \ +{ \ + if (!PagePrivate(page)) { \ + get_page(page); \ + SetPagePrivate(page); \ + set_page_private(page, 0); \ + } \ + set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)); \ + set_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ +} + +#define PAGE_PRIVATE_CLEAR_FUNC(name, flagname) \ +static inline void clear_page_private_##name(struct page *page) \ +{ \ + clear_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ + if (page_private(page) == 1 << PAGE_PRIVATE_NOT_POINTER) { \ + set_page_private(page, 0); \ + if (PagePrivate(page)) { \ + ClearPagePrivate(page); \ + put_page(page); \ + }\ + } \ +} + +PAGE_PRIVATE_GET_FUNC(nonpointer, NOT_POINTER); +PAGE_PRIVATE_GET_FUNC(reference, REF_RESOURCE); +PAGE_PRIVATE_GET_FUNC(inline, INLINE_INODE); +PAGE_PRIVATE_GET_FUNC(gcing, ONGOING_MIGRATION); +PAGE_PRIVATE_GET_FUNC(atomic, ATOMIC_WRITE); +PAGE_PRIVATE_GET_FUNC(dummy, 
DUMMY_WRITE); + +PAGE_PRIVATE_SET_FUNC(reference, REF_RESOURCE); +PAGE_PRIVATE_SET_FUNC(inline, INLINE_INODE); +PAGE_PRIVATE_SET_FUNC(gcing, ONGOING_MIGRATION); +PAGE_PRIVATE_SET_FUNC(atomic, ATOMIC_WRITE); +PAGE_PRIVATE_SET_FUNC(dummy, DUMMY_WRITE); + +PAGE_PRIVATE_CLEAR_FUNC(reference, REF_RESOURCE); +PAGE_PRIVATE_CLEAR_FUNC(inline, INLINE_INODE); +PAGE_PRIVATE_CLEAR_FUNC(gcing, ONGOING_MIGRATION); +PAGE_PRIVATE_CLEAR_FUNC(atomic, ATOMIC_WRITE); +PAGE_PRIVATE_CLEAR_FUNC(dummy, DUMMY_WRITE); + +static inline unsigned long get_page_private_data(struct page *page) +{ + unsigned long data = page_private(page); + + if (!test_bit(PAGE_PRIVATE_NOT_POINTER, &data)) + return 0; + return data >> PAGE_PRIVATE_MAX; +} + +static inline void set_page_private_data(struct page *page, unsigned long data) +{ + if (!PagePrivate(page)) { + get_page(page); + SetPagePrivate(page); + set_page_private(page, 0); + } + set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page)); + page_private(page) |= data << PAGE_PRIVATE_MAX; +} + +static inline void clear_page_private_data(struct page *page) +{ + page_private(page) &= (1 << PAGE_PRIVATE_MAX) - 1; + if (page_private(page) == 1 << PAGE_PRIVATE_NOT_POINTER) { + set_page_private(page, 0); + if (PagePrivate(page)) { + ClearPagePrivate(page); + put_page(page); + } + } +} /* For compression */ enum compress_algorithm_type { @@ -1317,6 +1426,9 @@ enum compress_flag { COMPRESS_MAX_FLAG, }; +#define COMPRESS_WATERMARK 20 +#define COMPRESS_PERCENT 20 + #define COMPRESS_DATA_RESERVED_SIZE 4 struct compress_data { __le32 clen; /* compressed data size */ @@ -1594,6 +1706,9 @@ struct f2fs_sb_info { struct kobject s_stat_kobj; /* /sys/fs/f2fs/<devname>/stat */ struct completion s_stat_kobj_unregister; + struct kobject s_feature_list_kobj; /* /sys/fs/f2fs/<devname>/feature_list */ + struct completion s_feature_list_kobj_unregister; + /* For shrinker support */ struct list_head s_list; int s_ndevs; /* number of devices */ @@ -1626,6 +1741,12 @@ struct 
f2fs_sb_info { u64 compr_written_block; u64 compr_saved_block; u32 compr_new_inode; + + /* For compressed block cache */ + struct inode *compress_inode; /* cache compressed blocks */ + unsigned int compress_percent; /* cache page percentage */ + unsigned int compress_watermark; /* cache page watermark */ + atomic_t compress_page_hit; /* cache hit count */ #endif }; @@ -2678,6 +2799,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, case FI_DATA_EXIST: case FI_INLINE_DOTS: case FI_PIN_FILE: + case FI_COMPRESS_RELEASED: f2fs_mark_inode_dirty_sync(inode, true); } } @@ -2799,6 +2921,8 @@ static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) set_bit(FI_EXTRA_ATTR, fi->flags); if (ri->i_inline & F2FS_PIN_FILE) set_bit(FI_PIN_FILE, fi->flags); + if (ri->i_inline & F2FS_COMPRESS_RELEASED) + set_bit(FI_COMPRESS_RELEASED, fi->flags); } static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) @@ -2819,6 +2943,8 @@ static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) ri->i_inline |= F2FS_EXTRA_ATTR; if (is_inode_flag_set(inode, FI_PIN_FILE)) ri->i_inline |= F2FS_PIN_FILE; + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) + ri->i_inline |= F2FS_COMPRESS_RELEASED; } static inline int f2fs_has_extra_attr(struct inode *inode) @@ -3027,25 +3153,6 @@ static inline bool is_dot_dotdot(const u8 *name, size_t len) return false; } -static inline bool f2fs_may_extent_tree(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - - if (!test_opt(sbi, EXTENT_CACHE) || - is_inode_flag_set(inode, FI_NO_EXTENT) || - is_inode_flag_set(inode, FI_COMPRESSED_FILE)) - return false; - - /* - * for recovered files during mount do not create extents - * if shrinker is not registered. 
- */ - if (list_empty(&sbi->s_list)) - return false; - - return S_ISREG(inode->i_mode); -} - static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { @@ -3169,20 +3276,6 @@ static inline bool __is_valid_data_blkaddr(block_t blkaddr) return true; } -static inline void f2fs_set_page_private(struct page *page, - unsigned long data) -{ - if (PagePrivate(page)) - return; - - attach_page_private(page, (void *)data); -} - -static inline void f2fs_clear_page_private(struct page *page) -{ - detach_page_private(page); -} - /* * file.c */ @@ -3566,6 +3659,8 @@ void f2fs_destroy_garbage_collection_cache(void); */ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only); bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi); +int __init f2fs_create_recovery_cache(void); +void f2fs_destroy_recovery_cache(void); /* * debug.c @@ -3604,7 +3699,8 @@ struct f2fs_stat_info { unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; int rsvd_segs, overp_segs; - int dirty_count, node_pages, meta_pages; + int dirty_count, node_pages, meta_pages, compress_pages; + int compress_page_hit; int prefree_count, call_count, cp_count, bg_cp_count; int tot_segs, node_segs, data_segs, free_segs, free_secs; int bg_node_segs, bg_data_segs; @@ -3940,7 +4036,9 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page); bool f2fs_is_compress_backend_ready(struct inode *inode); int f2fs_init_compress_mempool(void); void f2fs_destroy_compress_mempool(void); -void f2fs_end_read_compressed_page(struct page *page, bool failed); +void f2fs_decompress_cluster(struct decompress_io_ctx *dic); +void f2fs_end_read_compressed_page(struct page *page, bool failed, + block_t blkaddr); bool f2fs_cluster_is_empty(struct compress_ctx *cc); bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page); @@ -3958,10 +4056,19 @@ void f2fs_put_page_dic(struct 
page *page); int f2fs_init_compress_ctx(struct compress_ctx *cc); void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse); void f2fs_init_compress_info(struct f2fs_sb_info *sbi); +int f2fs_init_compress_inode(struct f2fs_sb_info *sbi); +void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi); int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi); void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi); int __init f2fs_init_compress_cache(void); void f2fs_destroy_compress_cache(void); +struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi); +void f2fs_invalidate_compress_page(struct f2fs_sb_info *sbi, block_t blkaddr); +void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, + nid_t ino, block_t blkaddr); +bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page, + block_t blkaddr); +void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino); #define inc_compr_inode_stat(inode) \ do { \ struct f2fs_sb_info *sbi = F2FS_I_SB(inode); \ @@ -3990,7 +4097,9 @@ static inline struct page *f2fs_compress_control_page(struct page *page) } static inline int f2fs_init_compress_mempool(void) { return 0; } static inline void f2fs_destroy_compress_mempool(void) { } -static inline void f2fs_end_read_compressed_page(struct page *page, bool failed) +static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic) { } +static inline void f2fs_end_read_compressed_page(struct page *page, + bool failed, block_t blkaddr) { WARN_ON_ONCE(1); } @@ -3998,10 +4107,20 @@ static inline void f2fs_put_page_dic(struct page *page) { WARN_ON_ONCE(1); } +static inline int f2fs_init_compress_inode(struct f2fs_sb_info *sbi) { return 0; } +static inline void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) { } static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { } static inline int __init 
f2fs_init_compress_cache(void) { return 0; } static inline void f2fs_destroy_compress_cache(void) { } +static inline void f2fs_invalidate_compress_page(struct f2fs_sb_info *sbi, + block_t blkaddr) { } +static inline void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, + struct page *page, nid_t ino, block_t blkaddr) { } +static inline bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, + struct page *page, block_t blkaddr) { return false; } +static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, + nid_t ino) { } #define inc_compr_inode_stat(inode) do { } while (0) #endif @@ -4066,6 +4185,27 @@ F2FS_FEATURE_FUNCS(verity, VERITY); F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM); F2FS_FEATURE_FUNCS(casefold, CASEFOLD); F2FS_FEATURE_FUNCS(compression, COMPRESSION); +F2FS_FEATURE_FUNCS(readonly, RO); + +static inline bool f2fs_may_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!test_opt(sbi, EXTENT_CACHE) || + is_inode_flag_set(inode, FI_NO_EXTENT) || + (is_inode_flag_set(inode, FI_COMPRESSED_FILE) && + !f2fs_sb_has_readonly(sbi))) + return false; + + /* + * for recovered files during mount do not create extents + * if shrinker is not registered. 
+ */ + if (list_empty(&sbi->s_list)) + return false; + + return S_ISREG(inode->i_mode); +} #ifdef CONFIG_BLK_DEV_ZONED static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ceb575f99048..6afd4562335f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -63,6 +63,9 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) if (unlikely(IS_IMMUTABLE(inode))) return VM_FAULT_SIGBUS; + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) + return VM_FAULT_SIGBUS; + if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; goto err; @@ -85,10 +88,6 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) err = ret; goto err; } else if (ret) { - if (ret < F2FS_I(inode)->i_cluster_size) { - err = -EAGAIN; - goto err; - } need_alloc = false; } } @@ -117,7 +116,6 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_block(&dn, page->index); - f2fs_put_dnode(&dn); f2fs_do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); } @@ -3203,7 +3201,7 @@ int f2fs_precache_extents(struct inode *inode) map.m_lblk = m_next_extent; } - return err; + return 0; } static int f2fs_ioc_precache_extents(struct file *filp, unsigned long arg) @@ -3237,7 +3235,7 @@ static int f2fs_ioc_enable_verity(struct file *filp, unsigned long arg) if (!f2fs_sb_has_verity(F2FS_I_SB(inode))) { f2fs_warn(F2FS_I_SB(inode), - "Can't enable fs-verity on inode %lu: the verity feature is not enabled on this filesystem.\n", + "Can't enable fs-verity on inode %lu: the verity feature is not enabled on this filesystem", inode->i_ino); return -EOPNOTSUPP; } @@ -3425,7 +3423,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) goto out; } - if (IS_IMMUTABLE(inode)) { + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { ret = -EINVAL; goto out; } @@ -3434,8 +3432,7 @@ static int 
f2fs_release_compress_blocks(struct file *filp, unsigned long arg) if (ret) goto out; - F2FS_I(inode)->i_flags |= F2FS_IMMUTABLE_FL; - f2fs_set_inode_flags(inode); + set_inode_flag(inode, FI_COMPRESS_RELEASED); inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, true); @@ -3590,7 +3587,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) inode_lock(inode); - if (!IS_IMMUTABLE(inode)) { + if (!is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { ret = -EINVAL; goto unlock_inode; } @@ -3635,8 +3632,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) up_write(&F2FS_I(inode)->i_mmap_sem); if (ret >= 0) { - F2FS_I(inode)->i_flags &= ~F2FS_IMMUTABLE_FL; - f2fs_set_inode_flags(inode); + clear_inode_flag(inode, FI_COMPRESS_RELEASED); inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, true); } @@ -4023,9 +4019,8 @@ static int f2fs_ioc_decompress_file(struct file *filp, unsigned long arg) LLONG_MAX); if (ret) - f2fs_warn(sbi, "%s: The file might be partially decompressed " - "(errno=%d). Please delete the file.\n", - __func__, ret); + f2fs_warn(sbi, "%s: The file might be partially decompressed (errno=%d). Please delete the file.", + __func__, ret); out: inode_unlock(inode); file_end_write(filp); @@ -4097,9 +4092,8 @@ static int f2fs_ioc_compress_file(struct file *filp, unsigned long arg) clear_inode_flag(inode, FI_ENABLE_COMPRESS); if (ret) - f2fs_warn(sbi, "%s: The file might be partially compressed " - "(errno=%d). Please delete the file.\n", - __func__, ret); + f2fs_warn(sbi, "%s: The file might be partially compressed (errno=%d). 
Please delete the file.", + __func__, ret); out: inode_unlock(inode); file_end_write(filp); @@ -4254,6 +4248,11 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) goto unlock; } + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { + ret = -EPERM; + goto unlock; + } + ret = generic_write_checks(iocb, from); if (ret > 0) { bool preallocated = false; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 8d1f17ab94d8..0e42ee5f7770 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1031,8 +1031,8 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (unlikely(check_valid_map(sbi, segno, offset))) { if (!test_and_set_bit(segno, SIT_I(sbi)->invalid_segmap)) { - f2fs_err(sbi, "mismatched blkaddr %u (source_blkaddr %u) in seg %u\n", - blkaddr, source_blkaddr, segno); + f2fs_err(sbi, "mismatched blkaddr %u (source_blkaddr %u) in seg %u", + blkaddr, source_blkaddr, segno); f2fs_bug_on(sbi, 1); } } @@ -1261,6 +1261,7 @@ static int move_data_block(struct inode *inode, block_t bidx, f2fs_put_page(mpage, 1); invalidate_mapping_pages(META_MAPPING(fio.sbi), fio.old_blkaddr, fio.old_blkaddr); + f2fs_invalidate_compress_page(fio.sbi, fio.old_blkaddr); set_page_dirty(fio.encrypted_page); if (clear_page_dirty_for_io(fio.encrypted_page)) @@ -1336,7 +1337,7 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, goto out; } set_page_dirty(page); - set_cold_data(page); + set_page_private_gcing(page); } else { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), @@ -1362,11 +1363,11 @@ retry: f2fs_remove_dirty_inode(inode); } - set_cold_data(page); + set_page_private_gcing(page); err = f2fs_do_write_data_page(&fio); if (err) { - clear_cold_data(page); + clear_page_private_gcing(page); if (err == -ENOMEM) { congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); @@ -1450,10 +1451,8 @@ next_step: if (phase == 3) { inode = f2fs_iget(sb, dni.ino); - if (IS_ERR(inode) || is_bad_inode(inode)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); 
+ if (IS_ERR(inode) || is_bad_inode(inode)) continue; - } if (!down_write_trylock( &F2FS_I(inode)->i_gc_rwsem[WRITE])) { @@ -1822,6 +1821,7 @@ static void init_atgc_management(struct f2fs_sb_info *sbi) am->candidate_ratio = DEF_GC_THREAD_CANDIDATE_RATIO; am->max_candidate_count = DEF_GC_THREAD_MAX_CANDIDATE_COUNT; am->age_weight = DEF_GC_THREAD_AGE_WEIGHT; + am->age_threshold = DEF_GC_THREAD_AGE_THRESHOLD; } void f2fs_build_gc_manager(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 92652ca7a7c8..56a20d5c15da 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -173,7 +173,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) /* clear inline data and flag after data writeback */ f2fs_truncate_inline_inode(dn->inode, dn->inode_page, 0); - clear_inline_node(dn->inode_page); + clear_page_private_inline(dn->inode_page); clear_out: stat_dec_inline_inode(dn->inode); clear_inode_flag(dn->inode, FI_INLINE_DATA); @@ -255,7 +255,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); - clear_inline_node(dn.inode_page); + clear_page_private_inline(dn.inode_page); f2fs_put_dnode(&dn); return 0; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index b401f08569f7..9141147b5bb0 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -18,6 +18,10 @@ #include <trace/events/f2fs.h> +#ifdef CONFIG_F2FS_FS_COMPRESSION +extern const struct address_space_operations f2fs_compress_aops; +#endif + void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync) { if (is_inode_flag_set(inode, FI_NEW_INODE)) @@ -494,6 +498,11 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi)) goto make_now; +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (ino == F2FS_COMPRESS_INO(sbi)) + goto make_now; +#endif + ret = do_read_inode(inode); if (ret) goto bad_inode; @@ -504,6 +513,12 @@ 
make_now: } else if (ino == F2FS_META_INO(sbi)) { inode->i_mapping->a_ops = &f2fs_meta_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + } else if (ino == F2FS_COMPRESS_INO(sbi)) { +#ifdef CONFIG_F2FS_FS_COMPRESSION + inode->i_mapping->a_ops = &f2fs_compress_aops; +#endif + mapping_set_gfp_mask(inode->i_mapping, + GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE); } else if (S_ISREG(inode->i_mode)) { inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; @@ -646,7 +661,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) /* deleted inode */ if (inode->i_nlink == 0) - clear_inline_node(node_page); + clear_page_private_inline(node_page); F2FS_I(inode)->i_disk_time[0] = inode->i_atime; F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; @@ -723,8 +738,12 @@ void f2fs_evict_inode(struct inode *inode) trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); + if (test_opt(sbi, COMPRESS_CACHE) && f2fs_compressed_file(inode)) + f2fs_invalidate_compress_pages(sbi, inode->i_ino); + if (inode->i_ino == F2FS_NODE_INO(sbi) || - inode->i_ino == F2FS_META_INO(sbi)) + inode->i_ino == F2FS_META_INO(sbi) || + inode->i_ino == F2FS_COMPRESS_INO(sbi)) goto out_clear; f2fs_bug_on(sbi, get_dirty_pages(inode)); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a9cd9cf97229..e149c8c66a71 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -153,7 +153,8 @@ fail_drop: return ERR_PTR(err); } -static inline int is_extension_exist(const unsigned char *s, const char *sub) +static inline int is_extension_exist(const unsigned char *s, const char *sub, + bool tmp_ext) { size_t slen = strlen(s); size_t sublen = strlen(sub); @@ -169,6 +170,13 @@ static inline int is_extension_exist(const unsigned char *s, const char *sub) if (slen < sublen + 2) return 0; + if (!tmp_ext) { + /* file has no temp extension */ + if (s[slen - sublen - 1] != '.') + return 0; + return !strncasecmp(s + slen - sublen, sub, sublen); + } + for (i = 1; i < slen - 
sublen; i++) { if (s[i] != '.') continue; @@ -194,7 +202,7 @@ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode * hot_count = sbi->raw_super->hot_ext_count; for (i = 0; i < cold_count + hot_count; i++) { - if (is_extension_exist(name, extlist[i])) + if (is_extension_exist(name, extlist[i], true)) break; } @@ -279,14 +287,16 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, const unsigned char *name) { __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; - unsigned char (*ext)[F2FS_EXTENSION_LEN]; - unsigned int ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + unsigned char (*noext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).noextensions; + unsigned char (*ext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).extensions; + unsigned char ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + unsigned char noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; int i, cold_count, hot_count; if (!f2fs_sb_has_compression(sbi) || - is_inode_flag_set(inode, FI_COMPRESSED_FILE) || F2FS_I(inode)->i_flags & F2FS_NOCOMP_FL || - !f2fs_may_compress(inode)) + !f2fs_may_compress(inode) || + (!ext_cnt && !noext_cnt)) return; down_read(&sbi->sb_lock); @@ -295,7 +305,7 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, hot_count = sbi->raw_super->hot_ext_count; for (i = cold_count; i < cold_count + hot_count; i++) { - if (is_extension_exist(name, extlist[i])) { + if (is_extension_exist(name, extlist[i], false)) { up_read(&sbi->sb_lock); return; } @@ -303,10 +313,18 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, up_read(&sbi->sb_lock); - ext = F2FS_OPTION(sbi).extensions; + for (i = 0; i < noext_cnt; i++) { + if (is_extension_exist(name, noext[i], false)) { + f2fs_disable_compressed_file(inode); + return; + } + } + + if (is_inode_flag_set(inode, FI_COMPRESSED_FILE)) + return; for (i = 0; i < ext_cnt; i++) { - if (!is_extension_exist(name, ext[i])) + if 
(!is_extension_exist(name, ext[i], false)) continue; set_compress_context(inode); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e67ce5f13b98..0be9e2d7120e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -97,6 +97,20 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) mem_size = (atomic_read(&dcc->discard_cmd_cnt) * sizeof(struct discard_cmd)) >> PAGE_SHIFT; res = mem_size < (avail_ram * nm_i->ram_thresh / 100); + } else if (type == COMPRESS_PAGE) { +#ifdef CONFIG_F2FS_FS_COMPRESSION + unsigned long free_ram = val.freeram; + + /* + * free memory is lower than watermark or cached page count + * exceed threshold, deny caching compress page. + */ + res = (free_ram > avail_ram * sbi->compress_watermark / 100) && + (COMPRESS_MAPPING(sbi)->nrpages < + free_ram * sbi->compress_percent / 100); +#else + res = false; +#endif } else { if (!sbi->sb->s_bdi->wb.dirty_exceeded) return true; @@ -1535,13 +1549,10 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, trace_f2fs_writepage(page, NODE); if (unlikely(f2fs_cp_error(sbi))) { - if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) { - ClearPageUptodate(page); - dec_page_count(sbi, F2FS_DIRTY_NODES); - unlock_page(page); - return 0; - } - goto redirty_out; + ClearPageUptodate(page); + dec_page_count(sbi, F2FS_DIRTY_NODES); + unlock_page(page); + return 0; } if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) @@ -1860,8 +1871,8 @@ continue_unlock: } /* flush inline_data, if it's async context. 
*/ - if (is_inline_node(page)) { - clear_inline_node(page); + if (page_private_inline(page)) { + clear_page_private_inline(page); unlock_page(page); flush_inline_data(sbi, ino_of_node(page)); continue; @@ -1941,8 +1952,8 @@ continue_unlock: goto write_node; /* flush inline_data */ - if (is_inline_node(page)) { - clear_inline_node(page); + if (page_private_inline(page)) { + clear_page_private_inline(page); unlock_page(page); flush_inline_data(sbi, ino_of_node(page)); goto lock_node; @@ -2096,7 +2107,7 @@ static int f2fs_set_node_page_dirty(struct page *page) if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); - f2fs_set_page_private(page, 0); + set_page_private_reference(page); return 1; } return 0; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 7a45c0f10629..ff14a6e5ac1c 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -38,6 +38,9 @@ /* return value for read_node_page */ #define LOCKED_PAGE 1 +/* check pinned file's alignment status of physical blocks */ +#define FILE_NOT_ALIGNED 1 + /* For flag in struct node_info */ enum { IS_CHECKPOINTED, /* is it checkpointed before? 
*/ @@ -148,6 +151,7 @@ enum mem_type { EXTENT_CACHE, /* indicates extent cache */ INMEM_PAGES, /* indicates inmemory pages */ DISCARD_CACHE, /* indicates memory of cached discard cmds */ + COMPRESS_PAGE, /* indicates memory of cached compressed pages */ BASE_CHECK, /* check kernel status */ }; @@ -389,20 +393,6 @@ static inline nid_t get_nid(struct page *p, int off, bool i) * - Mark cold node blocks in their node footer * - Mark cold data pages in page cache */ -static inline int is_cold_data(struct page *page) -{ - return PageChecked(page); -} - -static inline void set_cold_data(struct page *page) -{ - SetPageChecked(page); -} - -static inline void clear_cold_data(struct page *page) -{ - ClearPageChecked(page); -} static inline int is_node(struct page *page, int type) { @@ -414,21 +404,6 @@ static inline int is_node(struct page *page, int type) #define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT) #define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT) -static inline int is_inline_node(struct page *page) -{ - return PageChecked(page); -} - -static inline void set_inline_node(struct page *page) -{ - SetPageChecked(page); -} - -static inline void clear_inline_node(struct page *page) -{ - ClearPageChecked(page); -} - static inline void set_cold_node(struct page *page, bool is_dir) { struct f2fs_node *rn = F2FS_NODE(page); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 422146c6d866..695eacfe776c 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -45,6 +45,10 @@ static struct kmem_cache *fsync_entry_slab; +#ifdef CONFIG_UNICODE +extern struct kmem_cache *f2fs_cf_name_slab; +#endif + bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi) { s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count); @@ -145,7 +149,7 @@ static int init_recovered_filename(const struct inode *dir, f2fs_hash_filename(dir, fname); #ifdef CONFIG_UNICODE /* Case-sensitive match is fine for recovery */ - kfree(fname->cf_name.name); + 
kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name); fname->cf_name.name = NULL; #endif } else { @@ -788,13 +792,6 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY); #endif - fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", - sizeof(struct fsync_inode_entry)); - if (!fsync_entry_slab) { - err = -ENOMEM; - goto out; - } - INIT_LIST_HEAD(&inode_list); INIT_LIST_HEAD(&tmp_inode_list); INIT_LIST_HEAD(&dir_list); @@ -867,8 +864,6 @@ skip: } } - kmem_cache_destroy(fsync_entry_slab); -out: #ifdef CONFIG_QUOTA /* Turn quotas off */ if (quota_enabled) @@ -878,3 +873,17 @@ out: return ret ? ret : err; } + +int __init f2fs_create_recovery_cache(void) +{ + fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", + sizeof(struct fsync_inode_entry)); + if (!fsync_entry_slab) + return -ENOMEM; + return 0; +} + +void f2fs_destroy_recovery_cache(void) +{ + kmem_cache_destroy(fsync_entry_slab); +} diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 51dc79fad4fe..15cc89eef28d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -186,10 +186,7 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) { struct inmem_pages *new; - if (PagePrivate(page)) - set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE); - else - f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE); + set_page_private_atomic(page); new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); @@ -272,9 +269,10 @@ next: /* we don't need to invalidate this in the sccessful status */ if (drop || recover) { ClearPageUptodate(page); - clear_cold_data(page); + clear_page_private_gcing(page); } - f2fs_clear_page_private(page); + detach_page_private(page); + set_page_private(page, 0); f2fs_put_page(page, 1); list_del(&cur->list); @@ -357,7 +355,7 @@ void f2fs_drop_inmem_page(struct inode *inode, struct page *page) struct list_head *head = &fi->inmem_pages; struct inmem_pages *cur 
= NULL; - f2fs_bug_on(sbi, !IS_ATOMIC_WRITTEN_PAGE(page)); + f2fs_bug_on(sbi, !page_private_atomic(page)); mutex_lock(&fi->inmem_lock); list_for_each_entry(cur, head, list) { @@ -373,9 +371,12 @@ void f2fs_drop_inmem_page(struct inode *inode, struct page *page) kmem_cache_free(inmem_entry_slab, cur); ClearPageUptodate(page); - f2fs_clear_page_private(page); + clear_page_private_atomic(page); f2fs_put_page(page, 0); + detach_page_private(page); + set_page_private(page, 0); + trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); } @@ -2321,6 +2322,7 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) return; invalidate_mapping_pages(META_MAPPING(sbi), addr, addr); + f2fs_invalidate_compress_page(sbi, addr); /* add it into sit main buffer */ down_write(&sit_i->sentry_lock); @@ -3289,7 +3291,10 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) if (fio->type == DATA) { struct inode *inode = fio->page->mapping->host; - if (is_cold_data(fio->page)) { + if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) + return CURSEG_COLD_DATA_PINNED; + + if (page_private_gcing(fio->page)) { if (fio->sbi->am.atgc_enabled && (fio->io_type == FS_DATA_IO) && (fio->sbi->gc_mode != GC_URGENT_HIGH)) @@ -3468,9 +3473,11 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) reallocate: f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type, fio); - if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) + if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) { invalidate_mapping_pages(META_MAPPING(fio->sbi), fio->old_blkaddr, fio->old_blkaddr); + f2fs_invalidate_compress_page(fio->sbi, fio->old_blkaddr); + } /* writeout dirty page into bdev */ f2fs_submit_page_write(fio); @@ -3660,6 +3667,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) { invalidate_mapping_pages(META_MAPPING(sbi), old_blkaddr, old_blkaddr); + 
f2fs_invalidate_compress_page(sbi, old_blkaddr); if (!from_gc) update_segment_mtime(sbi, old_blkaddr, 0); update_sit_entry(sbi, old_blkaddr, -1); @@ -3919,7 +3927,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) /* sanity check for summary blocks */ if (nats_in_cursum(nat_j) > NAT_JOURNAL_ENTRIES || sits_in_cursum(sit_j) > SIT_JOURNAL_ENTRIES) { - f2fs_err(sbi, "invalid journal entries nats %u sits %u\n", + f2fs_err(sbi, "invalid journal entries nats %u sits %u", nats_in_cursum(nat_j), sits_in_cursum(sit_j)); return -EINVAL; } @@ -4682,6 +4690,10 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi) struct seg_entry *se = get_seg_entry(sbi, curseg->segno); unsigned int blkofs = curseg->next_blkoff; + if (f2fs_sb_has_readonly(sbi) && + i != CURSEG_HOT_DATA && i != CURSEG_HOT_NODE) + continue; + sanity_check_seg_type(sbi, curseg->seg_type); if (f2fs_test_bit(blkofs, se->cur_valid_map)) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7d325bfaf65a..8fecd3050ccd 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -148,8 +148,10 @@ enum { Opt_compress_algorithm, Opt_compress_log_size, Opt_compress_extension, + Opt_nocompress_extension, Opt_compress_chksum, Opt_compress_mode, + Opt_compress_cache, Opt_atgc, Opt_gc_merge, Opt_nogc_merge, @@ -222,8 +224,10 @@ static match_table_t f2fs_tokens = { {Opt_compress_algorithm, "compress_algorithm=%s"}, {Opt_compress_log_size, "compress_log_size=%u"}, {Opt_compress_extension, "compress_extension=%s"}, + {Opt_nocompress_extension, "nocompress_extension=%s"}, {Opt_compress_chksum, "compress_chksum"}, {Opt_compress_mode, "compress_mode=%s"}, + {Opt_compress_cache, "compress_cache"}, {Opt_atgc, "atgc"}, {Opt_gc_merge, "gc_merge"}, {Opt_nogc_merge, "nogc_merge"}, @@ -275,6 +279,24 @@ static int f2fs_sb_read_encoding(const struct f2fs_super_block *sb, return 0; } + +struct kmem_cache *f2fs_cf_name_slab; +static int __init f2fs_create_casefold_cache(void) +{ + f2fs_cf_name_slab = 
f2fs_kmem_cache_create("f2fs_casefolded_name", + F2FS_NAME_LEN); + if (!f2fs_cf_name_slab) + return -ENOMEM; + return 0; +} + +static void f2fs_destroy_casefold_cache(void) +{ + kmem_cache_destroy(f2fs_cf_name_slab); +} +#else +static int __init f2fs_create_casefold_cache(void) { return 0; } +static void f2fs_destroy_casefold_cache(void) { } #endif static inline void limit_reserve_root(struct f2fs_sb_info *sbi) @@ -473,6 +495,43 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb, } #ifdef CONFIG_F2FS_FS_COMPRESSION +/* + * 1. The same extension name cannot not appear in both compress and non-compress extension + * at the same time. + * 2. If the compress extension specifies all files, the types specified by the non-compress + * extension will be treated as special cases and will not be compressed. + * 3. Don't allow the non-compress extension specifies all files. + */ +static int f2fs_test_compress_extension(struct f2fs_sb_info *sbi) +{ + unsigned char (*ext)[F2FS_EXTENSION_LEN]; + unsigned char (*noext)[F2FS_EXTENSION_LEN]; + int ext_cnt, noext_cnt, index = 0, no_index = 0; + + ext = F2FS_OPTION(sbi).extensions; + ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + noext = F2FS_OPTION(sbi).noextensions; + noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + + if (!noext_cnt) + return 0; + + for (no_index = 0; no_index < noext_cnt; no_index++) { + if (!strcasecmp("*", noext[no_index])) { + f2fs_info(sbi, "Don't allow the nocompress extension specifies all files"); + return -EINVAL; + } + for (index = 0; index < ext_cnt; index++) { + if (!strcasecmp(ext[index], noext[no_index])) { + f2fs_info(sbi, "Don't allow the same extension %s appear in both compress and nocompress extension", + ext[index]); + return -EINVAL; + } + } + } + return 0; +} + #ifdef CONFIG_F2FS_FS_LZ4 static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str) { @@ -546,7 +605,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) substring_t 
args[MAX_OPT_ARGS]; #ifdef CONFIG_F2FS_FS_COMPRESSION unsigned char (*ext)[F2FS_EXTENSION_LEN]; - int ext_cnt; + unsigned char (*noext)[F2FS_EXTENSION_LEN]; + int ext_cnt, noext_cnt; #endif char *p, *name; int arg = 0; @@ -555,7 +615,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) int ret; if (!options) - return 0; + goto default_check; while ((p = strsep(&options, ",")) != NULL) { int token; @@ -1049,6 +1109,30 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) F2FS_OPTION(sbi).compress_ext_cnt++; kfree(name); break; + case Opt_nocompress_extension: + if (!f2fs_sb_has_compression(sbi)) { + f2fs_info(sbi, "Image doesn't support compression"); + break; + } + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + + noext = F2FS_OPTION(sbi).noextensions; + noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt; + + if (strlen(name) >= F2FS_EXTENSION_LEN || + noext_cnt >= COMPRESS_EXT_NUM) { + f2fs_err(sbi, + "invalid extension length/number"); + kfree(name); + return -EINVAL; + } + + strcpy(noext[noext_cnt], name); + F2FS_OPTION(sbi).nocompress_ext_cnt++; + kfree(name); + break; case Opt_compress_chksum: F2FS_OPTION(sbi).compress_chksum = true; break; @@ -1066,12 +1150,17 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) } kfree(name); break; + case Opt_compress_cache: + set_opt(sbi, COMPRESS_CACHE); + break; #else case Opt_compress_algorithm: case Opt_compress_log_size: case Opt_compress_extension: + case Opt_nocompress_extension: case Opt_compress_chksum: case Opt_compress_mode: + case Opt_compress_cache: f2fs_info(sbi, "compression options not supported"); break; #endif @@ -1090,6 +1179,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) return -EINVAL; } } +default_check: #ifdef CONFIG_QUOTA if (f2fs_check_quota_options(sbi)) return -EINVAL; @@ -1122,6 +1212,13 @@ static int parse_options(struct super_block *sb, char 
*options, bool is_remount) } #endif +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_test_compress_extension(sbi)) { + f2fs_err(sbi, "invalid compress or nocompress extension"); + return -EINVAL; + } +#endif + if (F2FS_IO_SIZE_BITS(sbi) && !f2fs_lfs_mode(sbi)) { f2fs_err(sbi, "Should set mode=lfs with %uKB-sized IO", F2FS_IO_SIZE_KB(sbi)); @@ -1153,7 +1250,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) } if (test_opt(sbi, DISABLE_CHECKPOINT) && f2fs_lfs_mode(sbi)) { - f2fs_err(sbi, "LFS not compatible with checkpoint=disable\n"); + f2fs_err(sbi, "LFS not compatible with checkpoint=disable"); return -EINVAL; } @@ -1162,6 +1259,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) */ if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_TYPE) F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; + + if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) { + f2fs_err(sbi, "Allow to mount readonly mode only"); + return -EROFS; + } return 0; } @@ -1403,6 +1505,8 @@ static void f2fs_put_super(struct super_block *sb) f2fs_bug_on(sbi, sbi->fsync_node_num); + f2fs_destroy_compress_inode(sbi); + iput(sbi->node_inode); sbi->node_inode = NULL; @@ -1665,6 +1769,11 @@ static inline void f2fs_show_compress_options(struct seq_file *seq, F2FS_OPTION(sbi).extensions[i]); } + for (i = 0; i < F2FS_OPTION(sbi).nocompress_ext_cnt; i++) { + seq_printf(seq, ",nocompress_extension=%s", + F2FS_OPTION(sbi).noextensions[i]); + } + if (F2FS_OPTION(sbi).compress_chksum) seq_puts(seq, ",compress_chksum"); @@ -1672,6 +1781,9 @@ static inline void f2fs_show_compress_options(struct seq_file *seq, seq_printf(seq, ",compress_mode=%s", "fs"); else if (F2FS_OPTION(sbi).compress_mode == COMPR_MODE_USER) seq_printf(seq, ",compress_mode=%s", "user"); + + if (test_opt(sbi, COMPRESS_CACHE)) + seq_puts(seq, ",compress_cache"); } #endif @@ -1819,7 +1931,11 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) static void 
default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ - F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE; + if (f2fs_sb_has_readonly(sbi)) + F2FS_OPTION(sbi).active_logs = NR_CURSEG_RO_TYPE; + else + F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE; + F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; @@ -1949,6 +2065,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool disable_checkpoint = test_opt(sbi, DISABLE_CHECKPOINT); bool no_io_align = !F2FS_IO_ALIGNED(sbi); bool no_atgc = !test_opt(sbi, ATGC); + bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE); bool checkpoint_changed; #ifdef CONFIG_QUOTA int i, j; @@ -2004,6 +2121,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if (f2fs_readonly(sb) && (*flags & SB_RDONLY)) goto skip; + if (f2fs_sb_has_readonly(sbi) && !(*flags & SB_RDONLY)) { + err = -EROFS; + goto restore_opts; + } + #ifdef CONFIG_QUOTA if (!f2fs_readonly(sb) && (*flags & SB_RDONLY)) { err = dquot_suspend(sb, -1); @@ -2041,6 +2163,12 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } + if (no_compress_cache == !!test_opt(sbi, COMPRESS_CACHE)) { + err = -EINVAL; + f2fs_warn(sbi, "switch compress_cache option is not allowed"); + goto restore_opts; + } + if ((*flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) { err = -EINVAL; f2fs_warn(sbi, "disabling checkpoint not compatible with read-only"); @@ -3137,14 +3265,15 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count); - if (unlikely(fsmeta < F2FS_MIN_META_SEGMENTS || + if (!f2fs_sb_has_readonly(sbi) && + unlikely(fsmeta < F2FS_MIN_META_SEGMENTS || ovp_segments == 0 || reserved_segments == 0)) { f2fs_err(sbi, "Wrong layout: check 
mkfs.f2fs version"); return 1; } - user_block_count = le64_to_cpu(ckpt->user_block_count); - segment_count_main = le32_to_cpu(raw_super->segment_count_main); + segment_count_main = le32_to_cpu(raw_super->segment_count_main) + + (f2fs_sb_has_readonly(sbi) ? 1 : 0); log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); if (!user_block_count || user_block_count >= segment_count_main << log_blocks_per_seg) { @@ -3175,6 +3304,10 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) return 1; + + if (f2fs_sb_has_readonly(sbi)) + goto check_data; + for (j = i + 1; j < NR_CURSEG_NODE_TYPE; j++) { if (le32_to_cpu(ckpt->cur_node_segno[i]) == le32_to_cpu(ckpt->cur_node_segno[j])) { @@ -3185,10 +3318,15 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) } } } +check_data: for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs || le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) return 1; + + if (f2fs_sb_has_readonly(sbi)) + goto skip_cross; + for (j = i + 1; j < NR_CURSEG_DATA_TYPE; j++) { if (le32_to_cpu(ckpt->cur_data_segno[i]) == le32_to_cpu(ckpt->cur_data_segno[j])) { @@ -3210,7 +3348,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) } } } - +skip_cross: sit_bitmap_size = le32_to_cpu(ckpt->sit_ver_bitmap_bytesize); nat_bitmap_size = le32_to_cpu(ckpt->nat_ver_bitmap_bytesize); @@ -3555,7 +3693,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) #ifdef CONFIG_BLK_DEV_ZONED if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM && !f2fs_sb_has_blkzoned(sbi)) { - f2fs_err(sbi, "Zoned block device feature not enabled\n"); + f2fs_err(sbi, "Zoned block device feature not enabled"); return -EINVAL; } if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) { @@ -3940,10 +4078,14 @@ try_onemore: goto free_node_inode; } - err = f2fs_register_sysfs(sbi); + err = f2fs_init_compress_inode(sbi); if 
(err) goto free_root_inode; + err = f2fs_register_sysfs(sbi); + if (err) + goto free_compress_inode; + #ifdef CONFIG_QUOTA /* Enable quota usage during mount */ if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sb)) { @@ -4084,6 +4226,8 @@ free_meta: /* evict some inodes being cached by GC */ evict_inodes(sb); f2fs_unregister_sysfs(sbi); +free_compress_inode: + f2fs_destroy_compress_inode(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; @@ -4162,6 +4306,15 @@ static void kill_f2fs_super(struct super_block *sb) f2fs_stop_gc_thread(sbi); f2fs_stop_discard_thread(sbi); +#ifdef CONFIG_F2FS_FS_COMPRESSION + /* + * latter evict_inode() can bypass checking and invalidating + * compress inode cache. + */ + if (test_opt(sbi, COMPRESS_CACHE)) + truncate_inode_pages_final(COMPRESS_MAPPING(sbi)); +#endif + if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) || !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { struct cp_control cpc = { @@ -4227,9 +4380,12 @@ static int __init init_f2fs_fs(void) err = f2fs_create_checkpoint_caches(); if (err) goto free_segment_manager_caches; - err = f2fs_create_extent_cache(); + err = f2fs_create_recovery_cache(); if (err) goto free_checkpoint_caches; + err = f2fs_create_extent_cache(); + if (err) + goto free_recovery_cache; err = f2fs_create_garbage_collection_cache(); if (err) goto free_extent_cache; @@ -4258,7 +4414,12 @@ static int __init init_f2fs_fs(void) err = f2fs_init_compress_cache(); if (err) goto free_compress_mempool; + err = f2fs_create_casefold_cache(); + if (err) + goto free_compress_cache; return 0; +free_compress_cache: + f2fs_destroy_compress_cache(); free_compress_mempool: f2fs_destroy_compress_mempool(); free_bioset: @@ -4278,6 +4439,8 @@ free_garbage_collection_cache: f2fs_destroy_garbage_collection_cache(); free_extent_cache: f2fs_destroy_extent_cache(); +free_recovery_cache: + f2fs_destroy_recovery_cache(); free_checkpoint_caches: f2fs_destroy_checkpoint_caches(); free_segment_manager_caches: @@ -4292,6 +4455,7 @@ fail: static 
void __exit exit_f2fs_fs(void) { + f2fs_destroy_casefold_cache(); f2fs_destroy_compress_cache(); f2fs_destroy_compress_mempool(); f2fs_destroy_bioset(); @@ -4303,6 +4467,7 @@ static void __exit exit_f2fs_fs(void) f2fs_exit_sysfs(); f2fs_destroy_garbage_collection_cache(); f2fs_destroy_extent_cache(); + f2fs_destroy_recovery_cache(); f2fs_destroy_checkpoint_caches(); f2fs_destroy_segment_manager_caches(); f2fs_destroy_node_manager_caches(); @@ -4315,4 +4480,5 @@ module_exit(exit_f2fs_fs) MODULE_AUTHOR("Samsung Electronics's Praesto Team"); MODULE_DESCRIPTION("Flash Friendly File System"); MODULE_LICENSE("GPL"); +MODULE_SOFTDEP("pre: crc32"); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 39b522ec73e7..6642246206bd 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -37,6 +37,7 @@ enum { #endif RESERVED_BLOCKS, /* struct f2fs_sb_info */ CPRC_INFO, /* struct ckpt_req_control */ + ATGC_INFO, /* struct atgc_management */ }; struct f2fs_attr { @@ -75,6 +76,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) #endif else if (struct_type == CPRC_INFO) return (unsigned char *)&sbi->cprc_info; + else if (struct_type == ATGC_INFO) + return (unsigned char *)&sbi->am; return NULL; } @@ -155,6 +158,9 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_casefold(sbi)) len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "casefold"); + if (f2fs_sb_has_readonly(sbi)) + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "readonly"); if (f2fs_sb_has_compression(sbi)) len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? 
", " : "", "compression"); @@ -495,6 +501,20 @@ out: } #endif + if (!strcmp(a->attr.name, "atgc_candidate_ratio")) { + if (t > 100) + return -EINVAL; + sbi->am.candidate_ratio = t; + return count; + } + + if (!strcmp(a->attr.name, "atgc_age_weight")) { + if (t > 100) + return -EINVAL; + sbi->am.age_weight = t; + return count; + } + *ui = (unsigned int)t; return count; @@ -546,46 +566,49 @@ static void f2fs_sb_release(struct kobject *kobj) complete(&sbi->s_kobj_unregister); } -enum feat_id { - FEAT_CRYPTO = 0, - FEAT_BLKZONED, - FEAT_ATOMIC_WRITE, - FEAT_EXTRA_ATTR, - FEAT_PROJECT_QUOTA, - FEAT_INODE_CHECKSUM, - FEAT_FLEXIBLE_INLINE_XATTR, - FEAT_QUOTA_INO, - FEAT_INODE_CRTIME, - FEAT_LOST_FOUND, - FEAT_VERITY, - FEAT_SB_CHECKSUM, - FEAT_CASEFOLD, - FEAT_COMPRESSION, - FEAT_TEST_DUMMY_ENCRYPTION_V2, -}; - +/* + * Note that there are three feature list entries: + * 1) /sys/fs/f2fs/features + * : shows runtime features supported by in-kernel f2fs along with Kconfig. + * - ref. F2FS_FEATURE_RO_ATTR() + * + * 2) /sys/fs/f2fs/$s_id/features <deprecated> + * : shows on-disk features enabled by mkfs.f2fs, used for old kernels. This + * won't add new feature anymore, and thus, users should check entries in 3) + * instead of this 2). + * + * 3) /sys/fs/f2fs/$s_id/feature_list + * : shows on-disk features enabled by mkfs.f2fs per instance, which follows + * sysfs entry rule where each entry should expose single value. + * This list covers old feature list provided by 2) and beyond. Therefore, + * please add new on-disk feature in this list only. + * - ref. 
F2FS_SB_FEATURE_RO_ATTR() + */ static ssize_t f2fs_feature_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - switch (a->id) { - case FEAT_CRYPTO: - case FEAT_BLKZONED: - case FEAT_ATOMIC_WRITE: - case FEAT_EXTRA_ATTR: - case FEAT_PROJECT_QUOTA: - case FEAT_INODE_CHECKSUM: - case FEAT_FLEXIBLE_INLINE_XATTR: - case FEAT_QUOTA_INO: - case FEAT_INODE_CRTIME: - case FEAT_LOST_FOUND: - case FEAT_VERITY: - case FEAT_SB_CHECKSUM: - case FEAT_CASEFOLD: - case FEAT_COMPRESSION: - case FEAT_TEST_DUMMY_ENCRYPTION_V2: + return sprintf(buf, "supported\n"); +} + +#define F2FS_FEATURE_RO_ATTR(_name) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = f2fs_feature_show, \ +} + +static ssize_t f2fs_sb_feature_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + if (F2FS_HAS_FEATURE(sbi, a->id)) return sprintf(buf, "supported\n"); - } - return 0; + return sprintf(buf, "unsupported\n"); +} + +#define F2FS_SB_FEATURE_RO_ATTR(_name, _feat) \ +static struct f2fs_attr f2fs_attr_sb_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = f2fs_sb_feature_show, \ + .id = F2FS_FEATURE_##_feat, \ } #define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ @@ -605,13 +628,6 @@ static struct f2fs_attr f2fs_attr_##_name = { \ #define F2FS_GENERAL_RO_ATTR(name) \ static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) -#define F2FS_FEATURE_RO_ATTR(_name, _id) \ -static struct f2fs_attr f2fs_attr_##_name = { \ - .attr = {.name = __stringify(_name), .mode = 0444 }, \ - .show = f2fs_feature_show, \ - .id = _id, \ -} - #define F2FS_STAT_ATTR(_struct_type, _struct_name, _name, _elname) \ static struct f2fs_attr f2fs_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = 0444 }, \ @@ -685,31 +701,44 @@ F2FS_GENERAL_RO_ATTR(avg_vblocks); #endif #ifdef CONFIG_FS_ENCRYPTION -F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); 
-F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2, FEAT_TEST_DUMMY_ENCRYPTION_V2); +F2FS_FEATURE_RO_ATTR(encryption); +F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2); +#ifdef CONFIG_UNICODE +F2FS_FEATURE_RO_ATTR(encrypted_casefold); #endif +#endif /* CONFIG_FS_ENCRYPTION */ #ifdef CONFIG_BLK_DEV_ZONED -F2FS_FEATURE_RO_ATTR(block_zoned, FEAT_BLKZONED); +F2FS_FEATURE_RO_ATTR(block_zoned); #endif -F2FS_FEATURE_RO_ATTR(atomic_write, FEAT_ATOMIC_WRITE); -F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR); -F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA); -F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); -F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR); -F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO); -F2FS_FEATURE_RO_ATTR(inode_crtime, FEAT_INODE_CRTIME); -F2FS_FEATURE_RO_ATTR(lost_found, FEAT_LOST_FOUND); +F2FS_FEATURE_RO_ATTR(atomic_write); +F2FS_FEATURE_RO_ATTR(extra_attr); +F2FS_FEATURE_RO_ATTR(project_quota); +F2FS_FEATURE_RO_ATTR(inode_checksum); +F2FS_FEATURE_RO_ATTR(flexible_inline_xattr); +F2FS_FEATURE_RO_ATTR(quota_ino); +F2FS_FEATURE_RO_ATTR(inode_crtime); +F2FS_FEATURE_RO_ATTR(lost_found); #ifdef CONFIG_FS_VERITY -F2FS_FEATURE_RO_ATTR(verity, FEAT_VERITY); +F2FS_FEATURE_RO_ATTR(verity); #endif -F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM); -F2FS_FEATURE_RO_ATTR(casefold, FEAT_CASEFOLD); +F2FS_FEATURE_RO_ATTR(sb_checksum); +#ifdef CONFIG_UNICODE +F2FS_FEATURE_RO_ATTR(casefold); +#endif +F2FS_FEATURE_RO_ATTR(readonly); #ifdef CONFIG_F2FS_FS_COMPRESSION -F2FS_FEATURE_RO_ATTR(compression, FEAT_COMPRESSION); +F2FS_FEATURE_RO_ATTR(compression); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_written_block, compr_written_block); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_saved_block, compr_saved_block); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_new_inode, compr_new_inode); #endif +F2FS_FEATURE_RO_ATTR(pin_file); + +/* For ATGC */ +F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_candidate_ratio, candidate_ratio); 
+F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_candidate_count, max_candidate_count); +F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_weight, age_weight); +F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_threshold, age_threshold); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -778,6 +807,11 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(compr_saved_block), ATTR_LIST(compr_new_inode), #endif + /* For ATGC */ + ATTR_LIST(atgc_candidate_ratio), + ATTR_LIST(atgc_candidate_count), + ATTR_LIST(atgc_age_weight), + ATTR_LIST(atgc_age_threshold), NULL, }; ATTRIBUTE_GROUPS(f2fs); @@ -786,7 +820,10 @@ static struct attribute *f2fs_feat_attrs[] = { #ifdef CONFIG_FS_ENCRYPTION ATTR_LIST(encryption), ATTR_LIST(test_dummy_encryption_v2), +#ifdef CONFIG_UNICODE + ATTR_LIST(encrypted_casefold), #endif +#endif /* CONFIG_FS_ENCRYPTION */ #ifdef CONFIG_BLK_DEV_ZONED ATTR_LIST(block_zoned), #endif @@ -802,10 +839,14 @@ static struct attribute *f2fs_feat_attrs[] = { ATTR_LIST(verity), #endif ATTR_LIST(sb_checksum), +#ifdef CONFIG_UNICODE ATTR_LIST(casefold), +#endif + ATTR_LIST(readonly), #ifdef CONFIG_F2FS_FS_COMPRESSION ATTR_LIST(compression), #endif + ATTR_LIST(pin_file), NULL, }; ATTRIBUTE_GROUPS(f2fs_feat); @@ -817,6 +858,40 @@ static struct attribute *f2fs_stat_attrs[] = { }; ATTRIBUTE_GROUPS(f2fs_stat); +F2FS_SB_FEATURE_RO_ATTR(encryption, ENCRYPT); +F2FS_SB_FEATURE_RO_ATTR(block_zoned, BLKZONED); +F2FS_SB_FEATURE_RO_ATTR(extra_attr, EXTRA_ATTR); +F2FS_SB_FEATURE_RO_ATTR(project_quota, PRJQUOTA); +F2FS_SB_FEATURE_RO_ATTR(inode_checksum, INODE_CHKSUM); +F2FS_SB_FEATURE_RO_ATTR(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR); +F2FS_SB_FEATURE_RO_ATTR(quota_ino, QUOTA_INO); +F2FS_SB_FEATURE_RO_ATTR(inode_crtime, INODE_CRTIME); +F2FS_SB_FEATURE_RO_ATTR(lost_found, LOST_FOUND); +F2FS_SB_FEATURE_RO_ATTR(verity, VERITY); +F2FS_SB_FEATURE_RO_ATTR(sb_checksum, SB_CHKSUM); +F2FS_SB_FEATURE_RO_ATTR(casefold, CASEFOLD); 
+F2FS_SB_FEATURE_RO_ATTR(compression, COMPRESSION); +F2FS_SB_FEATURE_RO_ATTR(readonly, RO); + +static struct attribute *f2fs_sb_feat_attrs[] = { + ATTR_LIST(sb_encryption), + ATTR_LIST(sb_block_zoned), + ATTR_LIST(sb_extra_attr), + ATTR_LIST(sb_project_quota), + ATTR_LIST(sb_inode_checksum), + ATTR_LIST(sb_flexible_inline_xattr), + ATTR_LIST(sb_quota_ino), + ATTR_LIST(sb_inode_crtime), + ATTR_LIST(sb_lost_found), + ATTR_LIST(sb_verity), + ATTR_LIST(sb_sb_checksum), + ATTR_LIST(sb_casefold), + ATTR_LIST(sb_compression), + ATTR_LIST(sb_readonly), + NULL, +}; +ATTRIBUTE_GROUPS(f2fs_sb_feat); + static const struct sysfs_ops f2fs_attr_ops = { .show = f2fs_attr_show, .store = f2fs_attr_store, @@ -883,6 +958,33 @@ static struct kobj_type f2fs_stat_ktype = { .release = f2fs_stat_kobj_release, }; +static ssize_t f2fs_sb_feat_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_feature_list_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? 
a->show(a, sbi, buf) : 0; +} + +static void f2fs_feature_list_kobj_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_feature_list_kobj); + complete(&sbi->s_feature_list_kobj_unregister); +} + +static const struct sysfs_ops f2fs_feature_list_attr_ops = { + .show = f2fs_sb_feat_attr_show, +}; + +static struct kobj_type f2fs_feature_list_ktype = { + .default_groups = f2fs_sb_feat_groups, + .sysfs_ops = &f2fs_feature_list_attr_ops, + .release = f2fs_feature_list_kobj_release, +}; + static int __maybe_unused segment_info_seq_show(struct seq_file *seq, void *offset) { @@ -1099,6 +1201,14 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) if (err) goto put_stat_kobj; + sbi->s_feature_list_kobj.kset = &f2fs_kset; + init_completion(&sbi->s_feature_list_kobj_unregister); + err = kobject_init_and_add(&sbi->s_feature_list_kobj, + &f2fs_feature_list_ktype, + &sbi->s_kobj, "feature_list"); + if (err) + goto put_feature_list_kobj; + if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); @@ -1113,6 +1223,9 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) victim_bits_seq_show, sb); } return 0; +put_feature_list_kobj: + kobject_put(&sbi->s_feature_list_kobj); + wait_for_completion(&sbi->s_feature_list_kobj_unregister); put_stat_kobj: kobject_put(&sbi->s_stat_kobj); wait_for_completion(&sbi->s_stat_kobj_unregister); @@ -1135,6 +1248,9 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) kobject_del(&sbi->s_stat_kobj); kobject_put(&sbi->s_stat_kobj); wait_for_completion(&sbi->s_stat_kobj_unregister); + kobject_del(&sbi->s_feature_list_kobj); + kobject_put(&sbi->s_feature_list_kobj); + wait_for_completion(&sbi->s_feature_list_kobj_unregister); kobject_del(&sbi->s_kobj); kobject_put(&sbi->s_kobj); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index bab9b202b496..de0c9b013a85 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -342,6 +342,7 @@ int fat_block_truncate_page(struct inode *inode, loff_t from) } 
static const struct address_space_operations fat_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = fat_readpage, .readahead = fat_readahead, .writepage = fat_writepage, diff --git a/fs/fhandle.c b/fs/fhandle.c index ec6feeccc276..6630c69c23a2 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -229,7 +229,7 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, path_put(&path); return fd; } - file = file_open_root(path.dentry, path.mnt, "", open_flag, 0); + file = file_open_root(&path, "", open_flag, 0); if (IS_ERR(file)) { put_unused_fd(fd); retval = PTR_ERR(file); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e91980f49388..06d04a74ab6c 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -131,25 +131,6 @@ static bool inode_io_list_move_locked(struct inode *inode, return false; } -/** - * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list - * @inode: inode to be removed - * @wb: bdi_writeback @inode is being removed from - * - * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and - * clear %WB_has_dirty_io if all are empty afterwards. - */ -static void inode_io_list_del_locked(struct inode *inode, - struct bdi_writeback *wb) -{ - assert_spin_locked(&wb->list_lock); - assert_spin_locked(&inode->i_lock); - - inode->i_state &= ~I_SYNC_QUEUED; - list_del_init(&inode->i_io_list); - wb_io_lists_depopulated(wb); -} - static void wb_wakeup(struct bdi_writeback *wb) { spin_lock_bh(&wb->work_lock); @@ -244,6 +225,13 @@ void wb_wait_for_completion(struct wb_completion *done) /* one round can affect upto 5 slots */ #define WB_FRN_MAX_IN_FLIGHT 1024 /* don't queue too many concurrently */ +/* + * Maximum inodes per isw. A specific value has been chosen to make + * struct inode_switch_wbs_context fit into 1024 bytes kmalloc. 
+ */ +#define WB_MAX_INODES_PER_ISW ((1024UL - sizeof(struct inode_switch_wbs_context)) \ + / sizeof(struct inode *)) + static atomic_t isw_nr_in_flight = ATOMIC_INIT(0); static struct workqueue_struct *isw_wq; @@ -279,6 +267,28 @@ void __inode_attach_wb(struct inode *inode, struct page *page) EXPORT_SYMBOL_GPL(__inode_attach_wb); /** + * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list + * @inode: inode of interest with i_lock held + * @wb: target bdi_writeback + * + * Remove the inode from wb's io lists and if necessarily put onto b_attached + * list. Only inodes attached to cgwb's are kept on this list. + */ +static void inode_cgwb_move_to_attached(struct inode *inode, + struct bdi_writeback *wb) +{ + assert_spin_locked(&wb->list_lock); + assert_spin_locked(&inode->i_lock); + + inode->i_state &= ~I_SYNC_QUEUED; + if (wb != &wb->bdi->wb) + list_move(&inode->i_io_list, &wb->b_attached); + else + list_del_init(&inode->i_io_list); + wb_io_lists_depopulated(wb); +} + +/** * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it * @inode: inode of interest with i_lock held * @@ -332,11 +342,18 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) } struct inode_switch_wbs_context { - struct inode *inode; - struct bdi_writeback *new_wb; + struct rcu_work work; - struct rcu_head rcu_head; - struct work_struct work; + /* + * Multiple inodes can be switched at once. The switching procedure + * consists of two parts, separated by a RCU grace period. To make + * sure that the second part is executed for each inode gone through + * the first part, all inode pointers are placed into a NULL-terminated + * array embedded into struct inode_switch_wbs_context. Otherwise + * an inode could be left in a non-consistent state. 
+ */ + struct bdi_writeback *new_wb; + struct inode *inodes[]; }; static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) @@ -349,50 +366,23 @@ static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) up_write(&bdi->wb_switch_rwsem); } -static void inode_switch_wbs_work_fn(struct work_struct *work) +static bool inode_do_switch_wbs(struct inode *inode, + struct bdi_writeback *old_wb, + struct bdi_writeback *new_wb) { - struct inode_switch_wbs_context *isw = - container_of(work, struct inode_switch_wbs_context, work); - struct inode *inode = isw->inode; - struct backing_dev_info *bdi = inode_to_bdi(inode); struct address_space *mapping = inode->i_mapping; - struct bdi_writeback *old_wb = inode->i_wb; - struct bdi_writeback *new_wb = isw->new_wb; XA_STATE(xas, &mapping->i_pages, 0); struct page *page; bool switched = false; - /* - * If @inode switches cgwb membership while sync_inodes_sb() is - * being issued, sync_inodes_sb() might miss it. Synchronize. - */ - down_read(&bdi->wb_switch_rwsem); - - /* - * By the time control reaches here, RCU grace period has passed - * since I_WB_SWITCH assertion and all wb stat update transactions - * between unlocked_inode_to_wb_begin/end() are guaranteed to be - * synchronizing against the i_pages lock. - * - * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock - * gives us exclusion against all wb related operations on @inode - * including IO list manipulations and stat updates. - */ - if (old_wb < new_wb) { - spin_lock(&old_wb->list_lock); - spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); - } else { - spin_lock(&new_wb->list_lock); - spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); - } spin_lock(&inode->i_lock); xa_lock_irq(&mapping->i_pages); /* - * Once I_FREEING is visible under i_lock, the eviction path owns - * the inode and we shouldn't modify ->i_io_list. 
+ * Once I_FREEING or I_WILL_FREE are visible under i_lock, the eviction + * path owns the inode and we shouldn't modify ->i_io_list. */ - if (unlikely(inode->i_state & I_FREEING)) + if (unlikely(inode->i_state & (I_FREEING | I_WILL_FREE))) goto skip_switch; trace_inode_switch_wbs(inode, old_wb, new_wb); @@ -419,21 +409,28 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) wb_get(new_wb); /* - * Transfer to @new_wb's IO list if necessary. The specific list - * @inode was on is ignored and the inode is put on ->b_dirty which - * is always correct including from ->b_dirty_time. The transfer - * preserves @inode->dirtied_when ordering. + * Transfer to @new_wb's IO list if necessary. If the @inode is dirty, + * the specific list @inode was on is ignored and the @inode is put on + * ->b_dirty which is always correct including from ->b_dirty_time. + * The transfer preserves @inode->dirtied_when ordering. If the @inode + * was clean, it means it was on the b_attached list, so move it onto + * the b_attached list of @new_wb. 
*/ if (!list_empty(&inode->i_io_list)) { - struct inode *pos; - - inode_io_list_del_locked(inode, old_wb); inode->i_wb = new_wb; - list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) - if (time_after_eq(inode->dirtied_when, - pos->dirtied_when)) - break; - inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev); + + if (inode->i_state & I_DIRTY_ALL) { + struct inode *pos; + + list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) + if (time_after_eq(inode->dirtied_when, + pos->dirtied_when)) + break; + inode_io_list_move_locked(inode, new_wb, + pos->i_io_list.prev); + } else { + inode_cgwb_move_to_attached(inode, new_wb); + } } else { inode->i_wb = new_wb; } @@ -452,31 +449,91 @@ skip_switch: xa_unlock_irq(&mapping->i_pages); spin_unlock(&inode->i_lock); + + return switched; +} + +static void inode_switch_wbs_work_fn(struct work_struct *work) +{ + struct inode_switch_wbs_context *isw = + container_of(to_rcu_work(work), struct inode_switch_wbs_context, work); + struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]); + struct bdi_writeback *old_wb = isw->inodes[0]->i_wb; + struct bdi_writeback *new_wb = isw->new_wb; + unsigned long nr_switched = 0; + struct inode **inodep; + + /* + * If @inode switches cgwb membership while sync_inodes_sb() is + * being issued, sync_inodes_sb() might miss it. Synchronize. + */ + down_read(&bdi->wb_switch_rwsem); + + /* + * By the time control reaches here, RCU grace period has passed + * since I_WB_SWITCH assertion and all wb stat update transactions + * between unlocked_inode_to_wb_begin/end() are guaranteed to be + * synchronizing against the i_pages lock. + * + * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock + * gives us exclusion against all wb related operations on @inode + * including IO list manipulations and stat updates. 
+ */ + if (old_wb < new_wb) { + spin_lock(&old_wb->list_lock); + spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock(&new_wb->list_lock); + spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); + } + + for (inodep = isw->inodes; *inodep; inodep++) { + WARN_ON_ONCE((*inodep)->i_wb != old_wb); + if (inode_do_switch_wbs(*inodep, old_wb, new_wb)) + nr_switched++; + } + spin_unlock(&new_wb->list_lock); spin_unlock(&old_wb->list_lock); up_read(&bdi->wb_switch_rwsem); - if (switched) { + if (nr_switched) { wb_wakeup(new_wb); - wb_put(old_wb); + wb_put_many(old_wb, nr_switched); } - wb_put(new_wb); - iput(inode); + for (inodep = isw->inodes; *inodep; inodep++) + iput(*inodep); + wb_put(new_wb); kfree(isw); - atomic_dec(&isw_nr_in_flight); } -static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head) +static bool inode_prepare_wbs_switch(struct inode *inode, + struct bdi_writeback *new_wb) { - struct inode_switch_wbs_context *isw = container_of(rcu_head, - struct inode_switch_wbs_context, rcu_head); + /* + * Paired with smp_mb() in cgroup_writeback_umount(). + * isw_nr_in_flight must be increased before checking SB_ACTIVE and + * grabbing an inode, otherwise isw_nr_in_flight can be observed as 0 + * in cgroup_writeback_umount() and the isw_wq will be not flushed. 
+ */ + smp_mb(); - /* needs to grab bh-unsafe locks, bounce to work item */ - INIT_WORK(&isw->work, inode_switch_wbs_work_fn); - queue_work(isw_wq, &isw->work); + /* while holding I_WB_SWITCH, no one else can update the association */ + spin_lock(&inode->i_lock); + if (!(inode->i_sb->s_flags & SB_ACTIVE) || + inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) || + inode_to_wb(inode) == new_wb) { + spin_unlock(&inode->i_lock); + return false; + } + inode->i_state |= I_WB_SWITCH; + __iget(inode); + spin_unlock(&inode->i_lock); + + return true; } /** @@ -501,32 +558,30 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT) return; - isw = kzalloc(sizeof(*isw), GFP_ATOMIC); + isw = kzalloc(sizeof(*isw) + 2 * sizeof(struct inode *), GFP_ATOMIC); if (!isw) return; + atomic_inc(&isw_nr_in_flight); + /* find and pin the new wb */ rcu_read_lock(); memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys); - if (memcg_css) - isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + if (memcg_css && !css_tryget(memcg_css)) + memcg_css = NULL; rcu_read_unlock(); + if (!memcg_css) + goto out_free; + + isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + css_put(memcg_css); if (!isw->new_wb) goto out_free; - /* while holding I_WB_SWITCH, no one else can update the association */ - spin_lock(&inode->i_lock); - if (!(inode->i_sb->s_flags & SB_ACTIVE) || - inode->i_state & (I_WB_SWITCH | I_FREEING) || - inode_to_wb(inode) == isw->new_wb) { - spin_unlock(&inode->i_lock); + if (!inode_prepare_wbs_switch(inode, isw->new_wb)) goto out_free; - } - inode->i_state |= I_WB_SWITCH; - __iget(inode); - spin_unlock(&inode->i_lock); - isw->inode = inode; + isw->inodes[0] = inode; /* * In addition to synchronizing among switchers, I_WB_SWITCH tells @@ -534,18 +589,85 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) * lock so that stat transfer can synchronize against them. 
* Let's continue after I_WB_SWITCH is guaranteed to be visible. */ - call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); - - atomic_inc(&isw_nr_in_flight); + INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); + queue_rcu_work(isw_wq, &isw->work); return; out_free: + atomic_dec(&isw_nr_in_flight); if (isw->new_wb) wb_put(isw->new_wb); kfree(isw); } /** + * cleanup_offline_cgwb - detach associated inodes + * @wb: target wb + * + * Switch all inodes attached to @wb to a nearest living ancestor's wb in order + * to eventually release the dying @wb. Returns %true if not all inodes were + * switched and the function has to be restarted. + */ +bool cleanup_offline_cgwb(struct bdi_writeback *wb) +{ + struct cgroup_subsys_state *memcg_css; + struct inode_switch_wbs_context *isw; + struct inode *inode; + int nr; + bool restart = false; + + isw = kzalloc(sizeof(*isw) + WB_MAX_INODES_PER_ISW * + sizeof(struct inode *), GFP_KERNEL); + if (!isw) + return restart; + + atomic_inc(&isw_nr_in_flight); + + for (memcg_css = wb->memcg_css->parent; memcg_css; + memcg_css = memcg_css->parent) { + isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL); + if (isw->new_wb) + break; + } + if (unlikely(!isw->new_wb)) + isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */ + + nr = 0; + spin_lock(&wb->list_lock); + list_for_each_entry(inode, &wb->b_attached, i_io_list) { + if (!inode_prepare_wbs_switch(inode, isw->new_wb)) + continue; + + isw->inodes[nr++] = inode; + + if (nr >= WB_MAX_INODES_PER_ISW - 1) { + restart = true; + break; + } + } + spin_unlock(&wb->list_lock); + + /* no attached inodes? bail out */ + if (nr == 0) { + atomic_dec(&isw_nr_in_flight); + wb_put(isw->new_wb); + kfree(isw); + return restart; + } + + /* + * In addition to synchronizing among switchers, I_WB_SWITCH tells + * the RCU protected stat update paths to grab the i_page + * lock so that stat transfer can synchronize against them. + * Let's continue after I_WB_SWITCH is guaranteed to be visible. 
+ */ + INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); + queue_rcu_work(isw_wq, &isw->work); + + return restart; +} + +/** * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it * @wbc: writeback_control of interest * @inode: target inode @@ -1000,6 +1122,12 @@ out_bdi_put: */ void cgroup_writeback_umount(void) { + /* + * SB_ACTIVE should be reliably cleared before checking + * isw_nr_in_flight, see generic_shutdown_super(). + */ + smp_mb(); + if (atomic_read(&isw_nr_in_flight)) { /* * Use rcu_barrier() to wait for all pending callbacks to @@ -1024,6 +1152,17 @@ fs_initcall(cgroup_writeback_init); static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } +static void inode_cgwb_move_to_attached(struct inode *inode, + struct bdi_writeback *wb) +{ + assert_spin_locked(&wb->list_lock); + assert_spin_locked(&inode->i_lock); + + inode->i_state &= ~I_SYNC_QUEUED; + list_del_init(&inode->i_io_list); + wb_io_lists_depopulated(wb); +} + static struct bdi_writeback * locked_inode_to_wb_and_lock_list(struct inode *inode) __releases(&inode->i_lock) @@ -1124,7 +1263,11 @@ void inode_io_list_del(struct inode *inode) wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); - inode_io_list_del_locked(inode, wb); + + inode->i_state &= ~I_SYNC_QUEUED; + list_del_init(&inode->i_io_list); + wb_io_lists_depopulated(wb); + spin_unlock(&inode->i_lock); spin_unlock(&wb->list_lock); } @@ -1437,7 +1580,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, inode->i_state &= ~I_SYNC_QUEUED; } else { /* The inode is clean. Remove from writeback lists. */ - inode_io_list_del_locked(inode, wb); + inode_cgwb_move_to_attached(inode, wb); } } @@ -1589,7 +1732,7 @@ static int writeback_single_inode(struct inode *inode, * responsible for the writeback lists. 
*/ if (!(inode->i_state & I_DIRTY_ALL)) - inode_io_list_del_locked(inode, wb); + inode_cgwb_move_to_attached(inode, wb); spin_unlock(&wb->list_lock); inode_sync_complete(inode); out: @@ -2205,28 +2348,6 @@ int dirtytime_interval_handler(struct ctl_table *table, int write, return ret; } -static noinline void block_dump___mark_inode_dirty(struct inode *inode) -{ - if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { - struct dentry *dentry; - const char *name = "?"; - - dentry = d_find_alias(inode); - if (dentry) { - spin_lock(&dentry->d_lock); - name = (const char *) dentry->d_name.name; - } - printk(KERN_DEBUG - "%s(%d): dirtied inode %lu (%s) on %s\n", - current->comm, task_pid_nr(current), inode->i_ino, - name, inode->i_sb->s_id); - if (dentry) { - spin_unlock(&dentry->d_lock); - dput(dentry); - } - } -} - /** * __mark_inode_dirty - internal function to mark an inode dirty * @@ -2296,9 +2417,6 @@ void __mark_inode_dirty(struct inode *inode, int flags) (dirtytime && (inode->i_state & I_DIRTY_INODE))) return; - if (unlikely(block_dump)) - block_dump___mark_inode_dirty(inode); - spin_lock(&inode->i_lock); if (dirtytime && (inode->i_state & I_DIRTY_INODE)) goto out_unlock_inode; diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index ff99ab2a3c43..e55723744f58 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -9,6 +9,7 @@ #include <linux/delay.h> #include <linux/dax.h> #include <linux/uio.h> +#include <linux/pagemap.h> #include <linux/pfn_t.h> #include <linux/iomap.h> #include <linux/interval_tree.h> @@ -212,7 +213,7 @@ static int fuse_setup_one_mapping(struct inode *inode, unsigned long start_idx, dmap->writable = writable; if (!upgrade) { /* - * We don't take a refernce on inode. inode is valid right now + * We don't take a reference on inode. inode is valid right now * and when inode is going away, cleanup logic should first * cleanup dmap entries. 
*/ @@ -621,7 +622,7 @@ static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length, } /* - * If read beyond end of file happnes, fs code seems to return + * If read beyond end of file happens, fs code seems to return * it as hole */ iomap_hole: @@ -1206,7 +1207,7 @@ static void fuse_dax_free_mem_worker(struct work_struct *work) ret); } - /* If number of free ranges are still below threhold, requeue */ + /* If number of free ranges are still below threshold, requeue */ kick_dmap_free_worker(fcd, 1); } @@ -1329,7 +1330,7 @@ bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi) static const struct address_space_operations fuse_dax_file_aops = { .writepages = fuse_dax_writepages, .direct_IO = noop_direct_IO, - .set_page_dirty = noop_set_page_dirty, + .set_page_dirty = __set_page_dirty_no_writeback, .invalidatepage = noop_invalidatepage, }; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a5ceccc5ef00..1c8f79b3dd06 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -91,7 +91,7 @@ static void fuse_drop_waiting(struct fuse_conn *fc) { /* * lockess check of fc->connected is okay, because atomic_dec_and_test() - * provides a memory barrier mached with the one in fuse_wait_aborted() + * provides a memory barrier matched with the one in fuse_wait_aborted() * to ensure no wake-up is missed. 
*/ if (atomic_dec_and_test(&fc->num_waiting) && @@ -783,6 +783,7 @@ static int fuse_check_page(struct page *page) 1 << PG_uptodate | 1 << PG_lru | 1 << PG_active | + 1 << PG_workingset | 1 << PG_reclaim | 1 << PG_waiters))) { dump_page(page, "fuse: trying to steal weird page"); @@ -1271,6 +1272,15 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, goto restart; } spin_lock(&fpq->lock); + /* + * Must not put request on fpq->io queue after having been shut down by + * fuse_abort_conn() + */ + if (!fpq->connected) { + req->out.h.error = err = -ECONNABORTED; + goto out_end; + + } list_add(&req->list, &fpq->io); spin_unlock(&fpq->lock); cs->req = req; @@ -1857,7 +1867,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, } err = -EINVAL; - if (oh.error <= -1000 || oh.error > 0) + if (oh.error <= -512 || oh.error > 0) goto copy_finish; spin_lock(&fpq->lock); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 1b6c001a7dd1..eade6f965b2e 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -252,7 +252,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) if (ret == -ENOMEM) goto out; if (ret || fuse_invalid_attr(&outarg.attr) || - inode_wrong_type(inode, outarg.attr.mode)) + fuse_stale_inode(inode, outarg.generation, &outarg.attr)) goto invalid; forget_all_cached_acls(inode); @@ -309,68 +309,23 @@ static int fuse_dentry_delete(const struct dentry *dentry) static struct vfsmount *fuse_dentry_automount(struct path *path) { struct fs_context *fsc; - struct fuse_mount *parent_fm = get_fuse_mount_super(path->mnt->mnt_sb); - struct fuse_conn *fc = parent_fm->fc; - struct fuse_mount *fm; struct vfsmount *mnt; struct fuse_inode *mp_fi = get_fuse_inode(d_inode(path->dentry)); - struct super_block *sb; - int err; fsc = fs_context_for_submount(path->mnt->mnt_sb->s_type, path->dentry); - if (IS_ERR(fsc)) { - err = PTR_ERR(fsc); - goto out; - } - - err = -ENOMEM; - fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL); - if (!fm) - goto 
out_put_fsc; - - fsc->s_fs_info = fm; - sb = sget_fc(fsc, NULL, set_anon_super_fc); - if (IS_ERR(sb)) { - err = PTR_ERR(sb); - kfree(fm); - goto out_put_fsc; - } - fm->fc = fuse_conn_get(fc); - - /* Initialize superblock, making @mp_fi its root */ - err = fuse_fill_super_submount(sb, mp_fi); - if (err) - goto out_put_sb; + if (IS_ERR(fsc)) + return ERR_CAST(fsc); - sb->s_flags |= SB_ACTIVE; - fsc->root = dget(sb->s_root); - /* We are done configuring the superblock, so unlock it */ - up_write(&sb->s_umount); - - down_write(&fc->killsb); - list_add_tail(&fm->fc_entry, &fc->mounts); - up_write(&fc->killsb); + /* Pass the FUSE inode of the mount for fuse_get_tree_submount() */ + fsc->fs_private = mp_fi; /* Create the submount */ - mnt = vfs_create_mount(fsc); - if (IS_ERR(mnt)) { - err = PTR_ERR(mnt); - goto out_put_fsc; - } - mntget(mnt); - put_fs_context(fsc); - return mnt; + mnt = fc_mount(fsc); + if (!IS_ERR(mnt)) + mntget(mnt); -out_put_sb: - /* - * Only jump here when fsc->root is NULL and sb is still locked - * (otherwise put_fs_context() will put the superblock) - */ - deactivate_locked_super(sb); -out_put_fsc: put_fs_context(fsc); -out: - return ERR_PTR(err); + return mnt; } const struct dentry_operations fuse_dentry_operations = { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 09ef2a4d25ed..97f860cfc195 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -645,7 +645,7 @@ static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io) * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1. * * An example: - * User requested DIO read of 64K. It was splitted into two 32K fuse requests, + * User requested DIO read of 64K. It was split into two 32K fuse requests, * both submitted asynchronously. The first of them was ACKed by userspace as * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The * second request was ACKed as short, e.g. 
only 1K was read, resulting in @@ -1171,14 +1171,12 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, if (mapping_writably_mapped(mapping)) flush_dcache_page(page); - tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); + tmp = copy_page_from_iter_atomic(page, offset, bytes, ii); flush_dcache_page(page); - iov_iter_advance(ii, tmp); if (!tmp) { unlock_page(page); put_page(page); - bytes = min(bytes, iov_iter_single_seg_count(ii)); goto again; } @@ -1405,7 +1403,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, nbytes += ret; ret += start; - npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE; + npages = DIV_ROUND_UP(ret, PAGE_SIZE); ap->descs[ap->num_pages].offset = start; fuse_page_descs_length_init(ap->descs, ap->num_pages, npages); @@ -2907,11 +2905,13 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, }; int err; bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || - (mode & FALLOC_FL_PUNCH_HOLE); + (mode & (FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_ZERO_RANGE)); bool block_faults = FUSE_IS_DAX(inode) && lock_inode; - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_ZERO_RANGE)) return -EOPNOTSUPP; if (fm->fc->no_fallocate) @@ -2926,7 +2926,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, goto out; } - if (mode & FALLOC_FL_PUNCH_HOLE) { + if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) { loff_t endbyte = offset + length - 1; err = fuse_writeback_range(inode, offset, endbyte); @@ -2966,7 +2966,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, file_update_time(file); } - if (mode & FALLOC_FL_PUNCH_HOLE) + if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) truncate_pagecache_range(inode, offset, offset + length - 1); fuse_invalidate_attr(inode); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 7e463e220053..07829ce78695 100644 --- 
a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -761,6 +761,9 @@ struct fuse_conn { /* Auto-mount submounts announced by the server */ unsigned int auto_submounts:1; + /* Propagate syncfs() to server */ + unsigned int sync_fs:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -867,6 +870,13 @@ static inline u64 fuse_get_attr_version(struct fuse_conn *fc) return atomic64_read(&fc->attr_version); } +static inline bool fuse_stale_inode(const struct inode *inode, int generation, + struct fuse_attr *attr) +{ + return inode->i_generation != generation || + inode_wrong_type(inode, attr->mode); +} + static inline void fuse_make_bad(struct inode *inode) { remove_inode_hash(inode); @@ -1082,15 +1092,6 @@ void fuse_send_init(struct fuse_mount *fm); int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx); /* - * Fill in superblock for submounts - * @sb: partially-initialized superblock to fill in - * @parent_fi: The fuse_inode of the parent filesystem where this submount is - * mounted - */ -int fuse_fill_super_submount(struct super_block *sb, - struct fuse_inode *parent_fi); - -/* * Remove the mount from the connection * * Returns whether this was the last mount @@ -1098,6 +1099,11 @@ int fuse_fill_super_submount(struct super_block *sb, bool fuse_mount_remove(struct fuse_mount *fm); /* + * Setup context ops for submounts + */ +int fuse_init_fs_context_submount(struct fs_context *fsc); + +/* * Shut down the connection (possibly sending DESTROY request). 
*/ void fuse_conn_destroy(struct fuse_mount *fm); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 393e36b74dc4..b9beb39a4a18 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -350,8 +350,8 @@ retry: inode->i_generation = generation; fuse_init_inode(inode, attr); unlock_new_inode(inode); - } else if (inode_wrong_type(inode, attr->mode)) { - /* Inode has changed type, any I/O on the old should fail */ + } else if (fuse_stale_inode(inode, generation, attr)) { + /* nodeid was reused, any I/O on the old inode should fail */ fuse_make_bad(inode); iput(inode); goto retry; @@ -506,6 +506,45 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) return err; } +static int fuse_sync_fs(struct super_block *sb, int wait) +{ + struct fuse_mount *fm = get_fuse_mount_super(sb); + struct fuse_conn *fc = fm->fc; + struct fuse_syncfs_in inarg; + FUSE_ARGS(args); + int err; + + /* + * Userspace cannot handle the wait == 0 case. Avoid a + * gratuitous roundtrip. + */ + if (!wait) + return 0; + + /* The filesystem is being unmounted. Nothing to do. 
*/ + if (!sb->s_root) + return 0; + + if (!fc->sync_fs) + return 0; + + memset(&inarg, 0, sizeof(inarg)); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.opcode = FUSE_SYNCFS; + args.nodeid = get_node_id(sb->s_root->d_inode); + args.out_numargs = 0; + + err = fuse_simple_request(fm, &args); + if (err == -ENOSYS) { + fc->sync_fs = 0; + err = 0; + } + + return err; +} + enum { OPT_SOURCE, OPT_SUBTYPE, @@ -909,6 +948,7 @@ static const struct super_operations fuse_super_operations = { .put_super = fuse_put_super, .umount_begin = fuse_umount_begin, .statfs = fuse_statfs, + .sync_fs = fuse_sync_fs, .show_options = fuse_show_options, }; @@ -1275,8 +1315,8 @@ static void fuse_sb_defaults(struct super_block *sb) sb->s_xattr = fuse_no_acl_xattr_handlers; } -int fuse_fill_super_submount(struct super_block *sb, - struct fuse_inode *parent_fi) +static int fuse_fill_super_submount(struct super_block *sb, + struct fuse_inode *parent_fi) { struct fuse_mount *fm = get_fuse_mount_super(sb); struct super_block *parent_sb = parent_fi->inode.i_sb; @@ -1313,6 +1353,58 @@ int fuse_fill_super_submount(struct super_block *sb, return 0; } +/* Filesystem context private data holds the FUSE inode of the mount point */ +static int fuse_get_tree_submount(struct fs_context *fsc) +{ + struct fuse_mount *fm; + struct fuse_inode *mp_fi = fsc->fs_private; + struct fuse_conn *fc = get_fuse_conn(&mp_fi->inode); + struct super_block *sb; + int err; + + fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL); + if (!fm) + return -ENOMEM; + + fsc->s_fs_info = fm; + sb = sget_fc(fsc, NULL, set_anon_super_fc); + if (IS_ERR(sb)) { + kfree(fm); + return PTR_ERR(sb); + } + fm->fc = fuse_conn_get(fc); + + /* Initialize superblock, making @mp_fi its root */ + err = fuse_fill_super_submount(sb, mp_fi); + if (err) { + fuse_conn_put(fc); + kfree(fm); + sb->s_fs_info = NULL; + deactivate_locked_super(sb); + return err; + } + + down_write(&fc->killsb); + 
list_add_tail(&fm->fc_entry, &fc->mounts); + up_write(&fc->killsb); + + sb->s_flags |= SB_ACTIVE; + fsc->root = dget(sb->s_root); + + return 0; +} + +static const struct fs_context_operations fuse_context_submount_ops = { + .get_tree = fuse_get_tree_submount, +}; + +int fuse_init_fs_context_submount(struct fs_context *fsc) +{ + fsc->ops = &fuse_context_submount_ops; + return 0; +} +EXPORT_SYMBOL_GPL(fuse_init_fs_context_submount); + int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) { struct fuse_dev *fud = NULL; diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 277f7041d55a..bc267832310c 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -200,9 +200,12 @@ retry: if (!d_in_lookup(dentry)) { struct fuse_inode *fi; inode = d_inode(dentry); + if (inode && get_node_id(inode) != o->nodeid) + inode = NULL; if (!inode || - get_node_id(inode) != o->nodeid || - inode_wrong_type(inode, o->attr.mode)) { + fuse_stale_inode(inode, o->generation, &o->attr)) { + if (inode) + fuse_make_bad(inode); d_invalidate(dentry); dput(dentry); goto retry; diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index bcb8a02e2d8b..8f52cdaa8445 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1447,6 +1447,7 @@ static int virtio_fs_get_tree(struct fs_context *fsc) fc->release = fuse_free_conn; fc->delete_stale = true; fc->auto_submounts = true; + fc->sync_fs = true; /* Tell FUSE to split requests that exceed the virtqueue's size */ fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit, @@ -1496,6 +1497,9 @@ static int virtio_fs_init_fs_context(struct fs_context *fsc) { struct fuse_fs_context *ctx; + if (fsc->purpose == FS_CONTEXT_FOR_SUBMOUNT) + return fuse_init_fs_context_submount(fsc); + ctx = kzalloc(sizeof(struct fuse_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 23b5be3db044..81d8f064126e 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -784,7 +784,7 @@ static const 
struct address_space_operations gfs2_aops = { .writepages = gfs2_writepages, .readpage = gfs2_readpage, .readahead = gfs2_readahead, - .set_page_dirty = iomap_set_page_dirty, + .set_page_dirty = __set_page_dirty_nobuffers, .releasepage = iomap_releasepage, .invalidatepage = iomap_invalidatepage, .bmap = gfs2_bmap, diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 0bcf11a9987b..ed8b67b21718 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -56,14 +56,6 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, u64 block, struct page *page) { struct inode *inode = &ip->i_inode; - int release = 0; - - if (!page || page->index) { - page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS); - if (!page) - return -ENOMEM; - release = 1; - } if (!PageUptodate(page)) { void *kaddr = kmap(page); @@ -97,26 +89,10 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, gfs2_ordered_add_inode(ip); } - if (release) { - unlock_page(page); - put_page(page); - } - return 0; } -/** - * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big - * @ip: The GFS2 inode to unstuff - * @page: The (optional) page. This is looked up if the @page is NULL - * - * This routine unstuffs a dinode and returns it to a "normal" state such - * that the height can be grown in the traditional way. 
- * - * Returns: errno - */ - -int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) +static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct page *page) { struct buffer_head *bh, *dibh; struct gfs2_dinode *di; @@ -124,11 +100,9 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) int isdir = gfs2_is_dir(ip); int error; - down_write(&ip->i_rw_mutex); - error = gfs2_meta_inode_buffer(ip, &dibh); if (error) - goto out; + return error; if (i_size_read(&ip->i_inode)) { /* Get a free block, fill it with the stuffed data, @@ -170,12 +144,38 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) out_brelse: brelse(dibh); + return error; +} + +/** + * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big + * @ip: The GFS2 inode to unstuff + * + * This routine unstuffs a dinode and returns it to a "normal" state such + * that the height can be grown in the traditional way. + * + * Returns: errno + */ + +int gfs2_unstuff_dinode(struct gfs2_inode *ip) +{ + struct inode *inode = &ip->i_inode; + struct page *page; + int error; + + down_write(&ip->i_rw_mutex); + page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS); + error = -ENOMEM; + if (!page) + goto out; + error = __gfs2_unstuff_inode(ip, page); + unlock_page(page); + put_page(page); out: up_write(&ip->i_rw_mutex); return error; } - /** * find_metapath - Find path through the metadata tree * @sdp: The superblock @@ -1079,7 +1079,7 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, goto out_trans_fail; if (unstuff) { - ret = gfs2_unstuff_dinode(ip, NULL); + ret = gfs2_unstuff_dinode(ip); if (ret) goto out_trans_end; release_metapath(mp); @@ -2143,7 +2143,7 @@ static int do_grow(struct inode *inode, u64 size) goto do_grow_release; if (unstuff) { - error = gfs2_unstuff_dinode(ip, NULL); + error = gfs2_unstuff_dinode(ip); if (error) goto do_end_trans; } diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index 6676d863faef..53cce6c08e81 100644 --- 
a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h @@ -46,7 +46,7 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip, extern const struct iomap_ops gfs2_iomap_ops; extern const struct iomap_writeback_ops gfs2_writeback_ops; -extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); +extern int gfs2_unstuff_dinode(struct gfs2_inode *ip); extern int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create); extern int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 18f67b37d6f8..42b7dfffb5e7 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -172,7 +172,7 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf, return -EINVAL; if (gfs2_is_stuffed(ip)) { - error = gfs2_unstuff_dinode(ip, NULL); + error = gfs2_unstuff_dinode(ip); if (error) return error; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 493a83e3f590..84ec053d43b4 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -210,7 +210,7 @@ void gfs2_set_inode_flags(struct inode *inode) /** * do_gfs2_set_flags - set flags on an inode - * @filp: file pointer + * @inode: The inode * @reqflags: The flags to set * @mask: Indicates which flags are valid * @fsflags: The FS_* inode flags passed in @@ -427,22 +427,25 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) struct gfs2_alloc_parms ap = { .aflags = 0, }; u64 offset = page_offset(page); unsigned int data_blocks, ind_blocks, rblocks; + vm_fault_t ret = VM_FAULT_LOCKED; struct gfs2_holder gh; unsigned int length; loff_t size; - int ret; + int err; sb_start_pagefault(inode->i_sb); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - ret = gfs2_glock_nq(&gh); - if (ret) + err = gfs2_glock_nq(&gh); + if (err) { + ret = block_page_mkwrite_return(err); goto out_uninit; + } /* Check page index against inode size */ size = i_size_read(inode); if (offset >= size) { - ret = -EINVAL; + ret = VM_FAULT_SIGBUS; goto out_unlock; 
} @@ -450,8 +453,8 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) file_update_time(vmf->vma->vm_file); /* page is wholly or partially inside EOF */ - if (offset > size - PAGE_SIZE) - length = offset_in_page(size); + if (size - offset < PAGE_SIZE) + length = size - offset; else length = PAGE_SIZE; @@ -469,24 +472,30 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) !gfs2_write_alloc_required(ip, offset, length)) { lock_page(page); if (!PageUptodate(page) || page->mapping != inode->i_mapping) { - ret = -EAGAIN; + ret = VM_FAULT_NOPAGE; unlock_page(page); } goto out_unlock; } - ret = gfs2_rindex_update(sdp); - if (ret) + err = gfs2_rindex_update(sdp); + if (err) { + ret = block_page_mkwrite_return(err); goto out_unlock; + } gfs2_write_calc_reserv(ip, length, &data_blocks, &ind_blocks); ap.target = data_blocks + ind_blocks; - ret = gfs2_quota_lock_check(ip, &ap); - if (ret) + err = gfs2_quota_lock_check(ip, &ap); + if (err) { + ret = block_page_mkwrite_return(err); goto out_unlock; - ret = gfs2_inplace_reserve(ip, &ap); - if (ret) + } + err = gfs2_inplace_reserve(ip, &ap); + if (err) { + ret = block_page_mkwrite_return(err); goto out_quota_unlock; + } rblocks = RES_DINODE + ind_blocks; if (gfs2_is_jdata(ip)) @@ -495,28 +504,38 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) rblocks += RES_STATFS + RES_QUOTA; rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks); } - ret = gfs2_trans_begin(sdp, rblocks, 0); - if (ret) + err = gfs2_trans_begin(sdp, rblocks, 0); + if (err) { + ret = block_page_mkwrite_return(err); goto out_trans_fail; + } + + /* Unstuff, if required, and allocate backing blocks for page */ + if (gfs2_is_stuffed(ip)) { + err = gfs2_unstuff_dinode(ip); + if (err) { + ret = block_page_mkwrite_return(err); + goto out_trans_end; + } + } lock_page(page); - ret = -EAGAIN; /* If truncated, we must retry the operation, we may have raced * with the glock demotion code. 
*/ - if (!PageUptodate(page) || page->mapping != inode->i_mapping) - goto out_trans_end; + if (!PageUptodate(page) || page->mapping != inode->i_mapping) { + ret = VM_FAULT_NOPAGE; + goto out_page_locked; + } - /* Unstuff, if required, and allocate backing blocks for page */ - ret = 0; - if (gfs2_is_stuffed(ip)) - ret = gfs2_unstuff_dinode(ip, page); - if (ret == 0) - ret = gfs2_allocate_page_backing(page, length); + err = gfs2_allocate_page_backing(page, length); + if (err) + ret = block_page_mkwrite_return(err); -out_trans_end: - if (ret) +out_page_locked: + if (ret != VM_FAULT_LOCKED) unlock_page(page); +out_trans_end: gfs2_trans_end(sdp); out_trans_fail: gfs2_inplace_release(ip); @@ -526,12 +545,12 @@ out_unlock: gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); - if (ret == 0) { + if (ret == VM_FAULT_LOCKED) { set_page_dirty(page); wait_for_stable_page(page); } sb_end_pagefault(inode->i_sb); - return block_page_mkwrite_return(ret); + return ret; } static vm_fault_t gfs2_fault(struct vm_fault *vmf) @@ -962,7 +981,7 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, gfs2_trans_add_meta(ip->i_gl, dibh); if (gfs2_is_stuffed(ip)) { - error = gfs2_unstuff_dinode(ip, NULL); + error = gfs2_unstuff_dinode(ip); if (unlikely(error)) goto out; } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index d9cb261f55b0..1f3902ecdded 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -212,8 +212,7 @@ void gfs2_glock_add_to_lru(struct gfs2_glock *gl) spin_lock(&lru_lock); - list_del(&gl->gl_lru); - list_add_tail(&gl->gl_lru, &lru_list); + list_move_tail(&gl->gl_lru, &lru_list); if (!test_bit(GLF_LRU, &gl->gl_flags)) { set_bit(GLF_LRU, &gl->gl_flags); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index d68184ebbfdd..7c9619997355 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -89,11 +89,13 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb } const struct address_space_operations gfs2_meta_aops = { + 
.set_page_dirty = __set_page_dirty_buffers, .writepage = gfs2_aspace_writepage, .releasepage = gfs2_releasepage, }; const struct address_space_operations gfs2_rgrp_aops = { + .set_page_dirty = __set_page_dirty_buffers, .writepage = gfs2_aspace_writepage, .releasepage = gfs2_releasepage, }; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 826f77d9cff5..5f4504dd0875 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -687,6 +687,7 @@ static int init_statfs(struct gfs2_sbd *sdp) } iput(pn); + pn = NULL; ip = GFS2_I(sdp->sd_sc_inode); error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &sdp->sd_sc_gh); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 94637c307cc8..be0997e24d60 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -825,7 +825,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, u64 size; if (gfs2_is_stuffed(ip)) { - err = gfs2_unstuff_dinode(ip, NULL); + err = gfs2_unstuff_dinode(ip); if (err) return err; } diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 3fc5cb346586..4a95a92546a0 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -159,6 +159,7 @@ static int hfs_writepages(struct address_space *mapping, } const struct address_space_operations hfs_btree_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = hfs_readpage, .writepage = hfs_writepage, .write_begin = hfs_write_begin, @@ -168,6 +169,7 @@ const struct address_space_operations hfs_btree_aops = { }; const struct address_space_operations hfs_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = hfs_readpage, .writepage = hfs_writepage, .write_begin = hfs_write_begin, diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 8ea447e5c470..6fef67c2a9f0 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -156,6 +156,7 @@ static int hfsplus_writepages(struct address_space *mapping, } const struct address_space_operations hfsplus_btree_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = 
hfsplus_readpage, .writepage = hfsplus_writepage, .write_begin = hfsplus_write_begin, @@ -165,6 +166,7 @@ const struct address_space_operations hfsplus_btree_aops = { }; const struct address_space_operations hfsplus_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = hfsplus_readpage, .writepage = hfsplus_writepage, .write_begin = hfsplus_write_begin, @@ -279,6 +281,11 @@ int hfsplus_getattr(struct user_namespace *mnt_userns, const struct path *path, struct inode *inode = d_inode(path->dentry); struct hfsplus_inode_info *hip = HFSPLUS_I(inode); + if (request_mask & STATX_BTIME) { + stat->result_mask |= STATX_BTIME; + stat->btime = hfsp_mt2ut(hip->create_date); + } + if (inode->i_flags & S_APPEND) stat->attributes |= STATX_ATTR_APPEND; if (inode->i_flags & S_IMMUTABLE) diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 4d169c5a2673..e2855ceefd39 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -204,7 +204,6 @@ check_attr_tree_state_again: buf = kzalloc(node_size, GFP_NOFS); if (!buf) { - pr_err("failed to allocate memory for header node\n"); err = -ENOMEM; goto end_attr_file_creation; } diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 077c25128eb7..c3a49aacf20a 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -196,6 +196,7 @@ static int hpfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } const struct address_space_operations hpfs_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = hpfs_readpage, .writepage = hpfs_writepage, .readahead = hpfs_readahead, diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 55efd3dd04f6..926eeb9bf4eb 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -735,6 +735,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, __SetPageUptodate(page); error = huge_add_to_page_cache(page, mapping, index); if (unlikely(error)) { + restore_reserve_on_error(h, &pseudo_vma, addr, page); put_page(page); 
mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out; @@ -1445,7 +1446,7 @@ static int get_hstate_idx(int page_size_log) * otherwise hugetlb_reserve_pages reserves one less hugepages than intended. */ struct file *hugetlb_file_setup(const char *name, size_t size, - vm_flags_t acctflag, struct user_struct **user, + vm_flags_t acctflag, struct ucounts **ucounts, int creat_flags, int page_size_log) { struct inode *inode; @@ -1457,20 +1458,20 @@ struct file *hugetlb_file_setup(const char *name, size_t size, if (hstate_idx < 0) return ERR_PTR(-ENODEV); - *user = NULL; + *ucounts = NULL; mnt = hugetlbfs_vfsmount[hstate_idx]; if (!mnt) return ERR_PTR(-ENOENT); if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { - *user = current_user(); - if (user_shm_lock(size, *user)) { + *ucounts = current_ucounts(); + if (user_shm_lock(size, *ucounts)) { task_lock(current); pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n", current->comm, current->pid); task_unlock(current); } else { - *user = NULL; + *ucounts = NULL; return ERR_PTR(-EPERM); } } @@ -1497,9 +1498,9 @@ struct file *hugetlb_file_setup(const char *name, size_t size, iput(inode); out: - if (*user) { - user_shm_unlock(size, *user); - *user = NULL; + if (*ucounts) { + user_shm_unlock(size, *ucounts); + *ucounts = NULL; } return file; } diff --git a/fs/internal.h b/fs/internal.h index 6aeae7ef3380..3ce8edbaa3ca 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -129,7 +129,7 @@ struct open_flags { }; extern struct file *do_filp_open(int dfd, struct filename *pathname, const struct open_flags *op); -extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, +extern struct file *do_file_open_root(const struct path *, const char *, const struct open_flags *); extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); diff --git a/fs/io-wq.c b/fs/io-wq.c index b3e8624a37d0..843d4a7bcd6e 
100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -9,8 +9,6 @@ #include <linux/init.h> #include <linux/errno.h> #include <linux/sched/signal.h> -#include <linux/mm.h> -#include <linux/sched/mm.h> #include <linux/percpu.h> #include <linux/slab.h> #include <linux/rculist_nulls.h> @@ -96,13 +94,14 @@ struct io_wqe { struct io_wq *wq; struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS]; + + cpumask_var_t cpu_mask; }; /* * Per io_wq state */ struct io_wq { - struct io_wqe **wqes; unsigned long state; free_work_fn *free_work; @@ -110,14 +109,14 @@ struct io_wq { struct io_wq_hash *hash; - refcount_t refs; - atomic_t worker_refs; struct completion worker_done; struct hlist_node cpuhp_node; struct task_struct *task; + + struct io_wqe *wqes[]; }; static enum cpuhp_state io_wq_online; @@ -241,7 +240,8 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) * Most likely an attempt to queue unbounded work on an io_wq that * wasn't setup with any unbounded workers. */ - WARN_ON_ONCE(!acct->max_workers); + if (unlikely(!acct->max_workers)) + pr_warn_once("io-wq is not configured for unbound workers"); rcu_read_lock(); ret = io_wqe_activate_free_worker(wqe); @@ -560,17 +560,13 @@ loop: if (ret) continue; /* timed out, exit unless we're the fixed worker */ - if (test_bit(IO_WQ_BIT_EXIT, &wq->state) || - !(worker->flags & IO_WORKER_F_FIXED)) + if (!(worker->flags & IO_WORKER_F_FIXED)) break; } if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) { raw_spin_lock_irq(&wqe->lock); - if (!wq_list_empty(&wqe->work_list)) - io_worker_handle_work(worker); - else - raw_spin_unlock_irq(&wqe->lock); + io_worker_handle_work(worker); } io_worker_exit(worker); @@ -645,7 +641,7 @@ fail: tsk->pf_io_worker = worker; worker->task = tsk; - set_cpus_allowed_ptr(tsk, cpumask_of_node(wqe->node)); + set_cpus_allowed_ptr(tsk, wqe->cpu_mask); tsk->flags |= PF_NO_SETAFFINITY; raw_spin_lock_irq(&wqe->lock); @@ -901,23 +897,20 @@ static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned 
mode, struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) { - int ret = -ENOMEM, node; + int ret, node; struct io_wq *wq; if (WARN_ON_ONCE(!data->free_work || !data->do_work)) return ERR_PTR(-EINVAL); + if (WARN_ON_ONCE(!bounded)) + return ERR_PTR(-EINVAL); - wq = kzalloc(sizeof(*wq), GFP_KERNEL); + wq = kzalloc(struct_size(wq, wqes, nr_node_ids), GFP_KERNEL); if (!wq) return ERR_PTR(-ENOMEM); - - wq->wqes = kcalloc(nr_node_ids, sizeof(struct io_wqe *), GFP_KERNEL); - if (!wq->wqes) - goto err_wq; - ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node); if (ret) - goto err_wqes; + goto err_wq; refcount_inc(&data->hash->refs); wq->hash = data->hash; @@ -934,6 +927,9 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, alloc_node); if (!wqe) goto err; + if (!alloc_cpumask_var(&wqe->cpu_mask, GFP_KERNEL)) + goto err; + cpumask_copy(wqe->cpu_mask, cpumask_of_node(node)); wq->wqes[node] = wqe; wqe->node = alloc_node; wqe->acct[IO_WQ_ACCT_BOUND].index = IO_WQ_ACCT_BOUND; @@ -953,17 +949,18 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) } wq->task = get_task_struct(data->task); - refcount_set(&wq->refs, 1); atomic_set(&wq->worker_refs, 1); init_completion(&wq->worker_done); return wq; err: io_wq_put_hash(data->hash); cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); - for_each_node(node) + for_each_node(node) { + if (!wq->wqes[node]) + continue; + free_cpumask_var(wq->wqes[node]->cpu_mask); kfree(wq->wqes[node]); -err_wqes: - kfree(wq->wqes); + } err_wq: kfree(wq); return ERR_PTR(ret); @@ -1033,10 +1030,10 @@ static void io_wq_destroy(struct io_wq *wq) .cancel_all = true, }; io_wqe_cancel_pending_work(wqe, &match); + free_cpumask_var(wqe->cpu_mask); kfree(wqe); } io_wq_put_hash(wq->hash); - kfree(wq->wqes); kfree(wq); } @@ -1045,25 +1042,67 @@ void io_wq_put_and_exit(struct io_wq *wq) WARN_ON_ONCE(!test_bit(IO_WQ_BIT_EXIT, 
&wq->state)); io_wq_exit_workers(wq); - if (refcount_dec_and_test(&wq->refs)) - io_wq_destroy(wq); + io_wq_destroy(wq); } +struct online_data { + unsigned int cpu; + bool online; +}; + static bool io_wq_worker_affinity(struct io_worker *worker, void *data) { - set_cpus_allowed_ptr(worker->task, cpumask_of_node(worker->wqe->node)); + struct online_data *od = data; + if (od->online) + cpumask_set_cpu(od->cpu, worker->wqe->cpu_mask); + else + cpumask_clear_cpu(od->cpu, worker->wqe->cpu_mask); return false; } +static int __io_wq_cpu_online(struct io_wq *wq, unsigned int cpu, bool online) +{ + struct online_data od = { + .cpu = cpu, + .online = online + }; + int i; + + rcu_read_lock(); + for_each_node(i) + io_wq_for_each_worker(wq->wqes[i], io_wq_worker_affinity, &od); + rcu_read_unlock(); + return 0; +} + static int io_wq_cpu_online(unsigned int cpu, struct hlist_node *node) { struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node); + + return __io_wq_cpu_online(wq, cpu, true); +} + +static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node) +{ + struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node); + + return __io_wq_cpu_online(wq, cpu, false); +} + +int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) +{ int i; rcu_read_lock(); - for_each_node(i) - io_wq_for_each_worker(wq->wqes[i], io_wq_worker_affinity, NULL); + for_each_node(i) { + struct io_wqe *wqe = wq->wqes[i]; + + if (mask) + cpumask_copy(wqe->cpu_mask, mask); + else + cpumask_copy(wqe->cpu_mask, cpumask_of_node(i)); + } rcu_read_unlock(); return 0; } @@ -1073,7 +1112,7 @@ static __init int io_wq_init(void) int ret; ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "io-wq/online", - io_wq_cpu_online, NULL); + io_wq_cpu_online, io_wq_cpu_offline); if (ret < 0) return ret; io_wq_online = ret; diff --git a/fs/io-wq.h b/fs/io-wq.h index af2df0680ee2..3999ee58ff26 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -87,7 +87,6 @@ static inline void wq_list_del(struct 
io_wq_work_list *list, struct io_wq_work { struct io_wq_work_node list; - const struct cred *creds; unsigned flags; }; @@ -128,6 +127,8 @@ void io_wq_put_and_exit(struct io_wq *wq); void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); void io_wq_hash_work(struct io_wq_work *work, void *val); +int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask); + static inline bool io_wq_is_hashed(struct io_wq_work *work) { return work->flags & IO_WQ_WORK_HASHED; diff --git a/fs/io_uring.c b/fs/io_uring.c index 42380ed563c4..e55b21fc0ab2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -11,7 +11,7 @@ * before writing the tail (using smp_load_acquire to read the tail will * do). It also needs a smp_mb() before updating CQ head (ordering the * entry load(s) with the head store), pairing with an implicit barrier - * through a control-dependency in io_get_cqring (smp_store_release to + * through a control-dependency in io_get_cqe (smp_store_release to * store head will do). Failure to do so could lead to reading invalid * CQ entries. 
* @@ -89,6 +89,7 @@ #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) +#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 /* * Shift of 9 is 512 entries, or exactly one page on 64-bit archs @@ -100,11 +101,19 @@ #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) +#define IO_RSRC_TAG_TABLE_SHIFT 9 +#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) +#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) + #define IORING_MAX_REG_BUFFERS (1U << 14) #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ IOSQE_IO_HARDLINK | IOSQE_ASYNC | \ IOSQE_BUFFER_SELECT) +#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ + REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS) + +#define IO_TCTX_REFS_CACHE_NR (1U << 10) struct io_uring { u32 head ____cacheline_aligned_in_smp; @@ -164,7 +173,7 @@ struct io_rings { * Written by the application, shouldn't be modified by the * kernel. 
*/ - u32 cq_flags; + u32 cq_flags; /* * Number of completion events lost because the queue was full; * this should be avoided by the application by making sure @@ -243,7 +252,8 @@ typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); struct io_rsrc_data { struct io_ring_ctx *ctx; - u64 *tags; + u64 **tags; + unsigned int nr; rsrc_put_fn *do_put; atomic_t refs; struct completion done; @@ -288,7 +298,6 @@ struct io_sq_data { unsigned long state; struct completion exited; - struct callback_head *park_task_work; }; #define IO_IOPOLL_BATCH 8 @@ -299,11 +308,8 @@ struct io_sq_data { struct io_comp_state { struct io_kiocb *reqs[IO_COMPL_BATCH]; unsigned int nr; - unsigned int locked_free_nr; /* inline/task_work completion list, under ->uring_lock */ struct list_head free_list; - /* IRQ completion list, under ->completion_lock */ - struct list_head locked_free_list; }; struct io_submit_link { @@ -338,16 +344,23 @@ struct io_submit_state { }; struct io_ring_ctx { + /* const or read-mostly hot data */ struct { struct percpu_ref refs; - } ____cacheline_aligned_in_smp; - struct { + struct io_rings *rings; unsigned int flags; unsigned int compat: 1; unsigned int drain_next: 1; unsigned int eventfd_async: 1; unsigned int restricted: 1; + unsigned int off_timeout_used: 1; + unsigned int drain_active: 1; + } ____cacheline_aligned_in_smp; + + /* submission data */ + struct { + struct mutex uring_lock; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -361,35 +374,33 @@ struct io_ring_ctx { * array. 
*/ u32 *sq_array; + struct io_uring_sqe *sq_sqes; unsigned cached_sq_head; unsigned sq_entries; - unsigned sq_mask; - unsigned sq_thread_idle; - unsigned cached_sq_dropped; - unsigned cached_cq_overflow; - unsigned long sq_check_overflow; + struct list_head defer_list; - /* hashed buffered write serialization */ - struct io_wq_hash *hash_map; + /* + * Fixed resources fast path, should be accessed only under + * uring_lock, and updated through io_uring_register(2) + */ + struct io_rsrc_node *rsrc_node; + struct io_file_table file_table; + unsigned nr_user_files; + unsigned nr_user_bufs; + struct io_mapped_ubuf **user_bufs; - struct list_head defer_list; + struct io_submit_state submit_state; struct list_head timeout_list; struct list_head cq_overflow_list; - - struct io_uring_sqe *sq_sqes; - } ____cacheline_aligned_in_smp; - - struct { - struct mutex uring_lock; - wait_queue_head_t wait; + struct xarray io_buffers; + struct xarray personalities; + u32 pers_next; + unsigned sq_thread_idle; } ____cacheline_aligned_in_smp; - struct io_submit_state submit_state; - - struct io_rings *rings; - - /* Only used for accounting purposes */ - struct mm_struct *mm_account; + /* IRQ completion list, under ->completion_lock */ + struct list_head locked_free_list; + unsigned int locked_free_nr; const struct cred *sq_creds; /* cred used for __io_sq_thread() */ struct io_sq_data *sq_data; /* if using sq thread polling */ @@ -397,44 +408,18 @@ struct io_ring_ctx { struct wait_queue_head sqo_sq_wait; struct list_head sqd_list; - /* - * If used, fixed file set. Writers must ensure that ->refs is dead, - * readers must ensure that ->refs is alive as long as the file* is - * used. Only updated through io_uring_register(2). 
- */ - struct io_rsrc_data *file_data; - struct io_file_table file_table; - unsigned nr_user_files; - - /* if used, fixed mapped user buffers */ - struct io_rsrc_data *buf_data; - unsigned nr_user_bufs; - struct io_mapped_ubuf **user_bufs; - - struct user_struct *user; - - struct completion ref_comp; - -#if defined(CONFIG_UNIX) - struct socket *ring_sock; -#endif - - struct xarray io_buffers; - - struct xarray personalities; - u32 pers_next; + unsigned long check_cq_overflow; struct { unsigned cached_cq_tail; unsigned cq_entries; - unsigned cq_mask; - atomic_t cq_timeouts; - unsigned cq_last_tm_flush; - unsigned cq_extra; - unsigned long cq_check_overflow; + struct eventfd_ctx *cq_ev_fd; + struct wait_queue_head poll_wait; struct wait_queue_head cq_wait; + unsigned cq_extra; + atomic_t cq_timeouts; struct fasync_struct *cq_fasync; - struct eventfd_ctx *cq_ev_fd; + unsigned cq_last_tm_flush; } ____cacheline_aligned_in_smp; struct { @@ -449,29 +434,47 @@ struct io_ring_ctx { struct list_head iopoll_list; struct hlist_head *cancel_hash; unsigned cancel_hash_bits; - bool poll_multi_file; + bool poll_multi_queue; } ____cacheline_aligned_in_smp; - struct delayed_work rsrc_put_work; - struct llist_head rsrc_put_llist; - struct list_head rsrc_ref_list; - spinlock_t rsrc_ref_lock; - struct io_rsrc_node *rsrc_node; - struct io_rsrc_node *rsrc_backup_node; - struct io_mapped_ubuf *dummy_ubuf; - struct io_restriction restrictions; - /* exit task_work */ - struct callback_head *exit_task_work; + /* slow path rsrc auxilary data, used by update/register */ + struct { + struct io_rsrc_node *rsrc_backup_node; + struct io_mapped_ubuf *dummy_ubuf; + struct io_rsrc_data *file_data; + struct io_rsrc_data *buf_data; + + struct delayed_work rsrc_put_work; + struct llist_head rsrc_put_llist; + struct list_head rsrc_ref_list; + spinlock_t rsrc_ref_lock; + }; /* Keep this last, we don't need it for the fast path */ - struct work_struct exit_work; - struct list_head tctx_list; + struct { + 
#if defined(CONFIG_UNIX) + struct socket *ring_sock; + #endif + /* hashed buffered write serialization */ + struct io_wq_hash *hash_map; + + /* Only used for accounting purposes */ + struct user_struct *user; + struct mm_struct *mm_account; + + /* ctx exit and cancelation */ + struct callback_head *exit_task_work; + struct work_struct exit_work; + struct list_head tctx_list; + struct completion ref_comp; + }; }; struct io_uring_task { /* submission side */ + int cached_refs; struct xarray xa; struct wait_queue_head wait; const struct io_ring_ctx *last; @@ -706,7 +709,7 @@ enum { REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, /* first byte is taken by user flags, shift it to not overlap */ - REQ_F_FAIL_LINK_BIT = 8, + REQ_F_FAIL_BIT = 8, REQ_F_INFLIGHT_BIT, REQ_F_CUR_POS_BIT, REQ_F_NOWAIT_BIT, @@ -718,6 +721,7 @@ enum { REQ_F_COMPLETE_INLINE_BIT, REQ_F_REISSUE_BIT, REQ_F_DONT_REISSUE_BIT, + REQ_F_CREDS_BIT, /* keep async read/write and isreg together and in order */ REQ_F_ASYNC_READ_BIT, REQ_F_ASYNC_WRITE_BIT, @@ -742,7 +746,7 @@ enum { REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), /* fail rest of links */ - REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT), + REQ_F_FAIL = BIT(REQ_F_FAIL_BIT), /* on inflight list, should be cancelled and waited on exit reliably */ REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), /* read/write uses file position */ @@ -771,6 +775,8 @@ enum { REQ_F_ASYNC_WRITE = BIT(REQ_F_ASYNC_WRITE_BIT), /* regular file */ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), + /* has creds assigned */ + REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), }; struct async_poll { @@ -783,6 +789,11 @@ struct io_task_work { task_work_func_t func; }; +enum { + IORING_RSRC_FILE = 0, + IORING_RSRC_BUFFER = 1, +}; + /* * NOTE! Each of the iocb union members has the file pointer * as the first entry in their struct definition. 
So you can @@ -846,6 +857,8 @@ struct io_kiocb { struct hlist_node hash_node; struct async_poll *apoll; struct io_wq_work work; + const struct cred *creds; + /* store used ubuf, so we can prevent reloading */ struct io_mapped_ubuf *imu; }; @@ -1029,11 +1042,11 @@ static const struct io_op_def io_op_defs[] = { }; static bool io_disarm_next(struct io_kiocb *req); -static void io_uring_del_task_file(unsigned long index); +static void io_uring_del_tctx_node(unsigned long index); static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, - struct files_struct *files); -static void io_uring_cancel_sqpoll(struct io_sq_data *sqd); + bool cancel_all); +static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, @@ -1054,8 +1067,7 @@ static void __io_queue_sqe(struct io_kiocb *req); static void io_rsrc_put_work(struct work_struct *work); static void io_req_task_queue(struct io_kiocb *req); -static void io_submit_flush_completions(struct io_comp_state *cs, - struct io_ring_ctx *ctx); +static void io_submit_flush_completions(struct io_ring_ctx *ctx); static bool io_poll_remove_waitqs(struct io_kiocb *req); static int io_req_prep_async(struct io_kiocb *req); @@ -1101,15 +1113,14 @@ static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl) percpu_ref_put(ref); } -static bool io_match_task(struct io_kiocb *head, - struct task_struct *task, - struct files_struct *files) +static bool io_match_task(struct io_kiocb *head, struct task_struct *task, + bool cancel_all) { struct io_kiocb *req; if (task && head->task != task) return false; - if (!files) + if (cancel_all) return true; io_for_each_link(req, head) { @@ -1119,10 +1130,9 @@ static bool io_match_task(struct io_kiocb *head, return false; } -static inline void req_set_fail_links(struct io_kiocb *req) +static inline 
void req_set_fail(struct io_kiocb *req) { - if (req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + req->flags |= REQ_F_FAIL; } static void io_ring_ctx_ref_free(struct percpu_ref *ref) @@ -1174,13 +1184,13 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ctx->flags = p->flags; init_waitqueue_head(&ctx->sqo_sq_wait); INIT_LIST_HEAD(&ctx->sqd_list); - init_waitqueue_head(&ctx->cq_wait); + init_waitqueue_head(&ctx->poll_wait); INIT_LIST_HEAD(&ctx->cq_overflow_list); init_completion(&ctx->ref_comp); xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); - init_waitqueue_head(&ctx->wait); + init_waitqueue_head(&ctx->cq_wait); spin_lock_init(&ctx->completion_lock); INIT_LIST_HEAD(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->defer_list); @@ -1191,7 +1201,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_llist_head(&ctx->rsrc_put_llist); INIT_LIST_HEAD(&ctx->tctx_list); INIT_LIST_HEAD(&ctx->submit_state.comp.free_list); - INIT_LIST_HEAD(&ctx->submit_state.comp.locked_free_list); + INIT_LIST_HEAD(&ctx->locked_free_list); return ctx; err: kfree(ctx->dummy_ubuf); @@ -1200,13 +1210,20 @@ err: return NULL; } +static void io_account_cq_overflow(struct io_ring_ctx *ctx) +{ + struct io_rings *r = ctx->rings; + + WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); + ctx->cq_extra--; +} + static bool req_need_defer(struct io_kiocb *req, u32 seq) { if (unlikely(req->flags & REQ_F_IO_DRAIN)) { struct io_ring_ctx *ctx = req->ctx; - return seq + ctx->cq_extra != ctx->cached_cq_tail - + READ_ONCE(ctx->cached_cq_overflow); + return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; } return false; @@ -1225,8 +1242,10 @@ static void io_prep_async_work(struct io_kiocb *req) const struct io_op_def *def = &io_op_defs[req->opcode]; struct io_ring_ctx *ctx = req->ctx; - if (!req->work.creds) - req->work.creds = get_current_cred(); + if 
(!(req->flags & REQ_F_CREDS)) { + req->flags |= REQ_F_CREDS; + req->creds = get_current_cred(); + } req->work.list.next = NULL; req->work.flags = 0; @@ -1290,9 +1309,9 @@ static void io_kill_timeout(struct io_kiocb *req, int status) } } -static void __io_queue_deferred(struct io_ring_ctx *ctx) +static void io_queue_deferred(struct io_ring_ctx *ctx) { - do { + while (!list_empty(&ctx->defer_list)) { struct io_defer_entry *de = list_first_entry(&ctx->defer_list, struct io_defer_entry, list); @@ -1301,19 +1320,14 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx) list_del_init(&de->list); io_req_task_queue(de->req); kfree(de); - } while (!list_empty(&ctx->defer_list)); + } } static void io_flush_timeouts(struct io_ring_ctx *ctx) { - u32 seq; + u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); - if (list_empty(&ctx->timeout_list)) - return; - - seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); - - do { + while (!list_empty(&ctx->timeout_list)) { u32 events_needed, events_got; struct io_kiocb *req = list_first_entry(&ctx->timeout_list, struct io_kiocb, timeout.list); @@ -1335,27 +1349,31 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) list_del_init(&req->timeout.list); io_kill_timeout(req, 0); - } while (!list_empty(&ctx->timeout_list)); - + } ctx->cq_last_tm_flush = seq; } -static void io_commit_cqring(struct io_ring_ctx *ctx) +static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { - io_flush_timeouts(ctx); + if (ctx->off_timeout_used) + io_flush_timeouts(ctx); + if (ctx->drain_active) + io_queue_deferred(ctx); +} +static inline void io_commit_cqring(struct io_ring_ctx *ctx) +{ + if (unlikely(ctx->off_timeout_used || ctx->drain_active)) + __io_commit_cqring_flush(ctx); /* order cqe stores with ring update */ smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); - - if (unlikely(!list_empty(&ctx->defer_list))) - __io_queue_deferred(ctx); } static inline bool io_sqring_full(struct io_ring_ctx *ctx) { struct 
io_rings *r = ctx->rings; - return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == r->sq_ring_entries; + return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries; } static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) @@ -1363,21 +1381,21 @@ static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); } -static inline struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) +static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; - unsigned tail; + unsigned tail, mask = ctx->cq_entries - 1; /* * writes to the cq entry need to come after reading head; the * control dependency is enough as we're using WRITE_ONCE to * fill the cq entry */ - if (__io_cqring_events(ctx) == rings->cq_ring_entries) + if (__io_cqring_events(ctx) == ctx->cq_entries) return NULL; tail = ctx->cached_cq_tail++; - return &rings->cqes[tail & ctx->cq_mask]; + return &rings->cqes[tail & mask]; } static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) @@ -1394,14 +1412,14 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) /* see waitqueue_active() comment */ smp_mb(); - if (waitqueue_active(&ctx->wait)) - wake_up(&ctx->wait); + if (waitqueue_active(&ctx->cq_wait)) + wake_up(&ctx->cq_wait); if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait)) wake_up(&ctx->sq_data->wait); if (io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); - if (waitqueue_active(&ctx->cq_wait)) { - wake_up_interruptible(&ctx->cq_wait); + if (waitqueue_active(&ctx->poll_wait)) { + wake_up_interruptible(&ctx->poll_wait); kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); } } @@ -1412,13 +1430,13 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) smp_mb(); if (ctx->flags & IORING_SETUP_SQPOLL) { - if (waitqueue_active(&ctx->wait)) - wake_up(&ctx->wait); + if (waitqueue_active(&ctx->cq_wait)) + wake_up(&ctx->cq_wait); } if 
(io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); - if (waitqueue_active(&ctx->cq_wait)) { - wake_up_interruptible(&ctx->cq_wait); + if (waitqueue_active(&ctx->poll_wait)) { + wake_up_interruptible(&ctx->poll_wait); kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); } } @@ -1426,17 +1444,16 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) /* Returns true if there are no backlogged entries after the flush */ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { - struct io_rings *rings = ctx->rings; unsigned long flags; bool all_flushed, posted; - if (!force && __io_cqring_events(ctx) == rings->cq_ring_entries) + if (!force && __io_cqring_events(ctx) == ctx->cq_entries) return false; posted = false; spin_lock_irqsave(&ctx->completion_lock, flags); while (!list_empty(&ctx->cq_overflow_list)) { - struct io_uring_cqe *cqe = io_get_cqring(ctx); + struct io_uring_cqe *cqe = io_get_cqe(ctx); struct io_overflow_cqe *ocqe; if (!cqe && !force) @@ -1446,8 +1463,8 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) if (cqe) memcpy(cqe, &ocqe->cqe, sizeof(*cqe)); else - WRITE_ONCE(ctx->rings->cq_overflow, - ++ctx->cached_cq_overflow); + io_account_cq_overflow(ctx); + posted = true; list_del(&ocqe->list); kfree(ocqe); @@ -1455,8 +1472,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) all_flushed = list_empty(&ctx->cq_overflow_list); if (all_flushed) { - clear_bit(0, &ctx->sq_check_overflow); - clear_bit(0, &ctx->cq_check_overflow); + clear_bit(0, &ctx->check_cq_overflow); ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; } @@ -1472,7 +1488,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { bool ret = true; - if (test_bit(0, &ctx->cq_check_overflow)) { + if (test_bit(0, &ctx->check_cq_overflow)) { /* iopoll syncs against uring_lock, not completion_lock */ if (ctx->flags & IORING_SETUP_IOPOLL) mutex_lock(&ctx->uring_lock); @@ -1531,12 +1547,11 @@ 
static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, * or cannot allocate an overflow entry, then we need to drop it * on the floor. */ - WRITE_ONCE(ctx->rings->cq_overflow, ++ctx->cached_cq_overflow); + io_account_cq_overflow(ctx); return false; } if (list_empty(&ctx->cq_overflow_list)) { - set_bit(0, &ctx->sq_check_overflow); - set_bit(0, &ctx->cq_check_overflow); + set_bit(0, &ctx->check_cq_overflow); ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW; } ocqe->cqe.user_data = user_data; @@ -1558,7 +1573,7 @@ static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data * submission (by quite a lot). Increment the overflow count in * the ring. */ - cqe = io_get_cqring(ctx); + cqe = io_get_cqe(ctx); if (likely(cqe)) { WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); @@ -1588,10 +1603,8 @@ static void io_req_complete_post(struct io_kiocb *req, long res, * free_list cache. */ if (req_ref_put_and_test(req)) { - struct io_comp_state *cs = &ctx->submit_state.comp; - if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { - if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL_LINK)) + if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL)) io_disarm_next(req); if (req->link) { io_req_task_queue(req->link); @@ -1600,8 +1613,8 @@ static void io_req_complete_post(struct io_kiocb *req, long res, } io_dismantle_req(req); io_put_task(req->task, 1); - list_add(&req->compl.list, &cs->locked_free_list); - cs->locked_free_nr++; + list_add(&req->compl.list, &ctx->locked_free_list); + ctx->locked_free_nr++; } else { if (!percpu_ref_tryget(&ctx->refs)) req = NULL; @@ -1617,8 +1630,7 @@ static void io_req_complete_post(struct io_kiocb *req, long res, static inline bool io_req_needs_clean(struct io_kiocb *req) { - return req->flags & (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | - REQ_F_POLLED | REQ_F_INFLIGHT); + return req->flags & IO_REQ_CLEAN_FLAGS; } static void io_req_complete_state(struct io_kiocb *req, long res, @@ -1647,7 +1659,7 @@ 
static inline void io_req_complete(struct io_kiocb *req, long res) static void io_req_complete_failed(struct io_kiocb *req, long res) { - req_set_fail_links(req); + req_set_fail(req); io_put_req(req); io_req_complete_post(req, res, 0); } @@ -1656,8 +1668,8 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, struct io_comp_state *cs) { spin_lock_irq(&ctx->completion_lock); - list_splice_init(&cs->locked_free_list, &cs->free_list); - cs->locked_free_nr = 0; + list_splice_init(&ctx->locked_free_list, &cs->free_list); + ctx->locked_free_nr = 0; spin_unlock_irq(&ctx->completion_lock); } @@ -1673,7 +1685,7 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) * locked cache, grab the lock and move them over to our submission * side cache. */ - if (READ_ONCE(cs->locked_free_nr) > IO_COMPL_BATCH) + if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH) io_flush_cached_locked_reqs(ctx, cs); nr = state->free_reqs; @@ -1695,11 +1707,11 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; - BUILD_BUG_ON(IO_REQ_ALLOC_BATCH > ARRAY_SIZE(state->reqs)); + BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH); if (!state->free_reqs) { gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; - int ret; + int ret, i; if (io_flush_cached_reqs(ctx)) goto got_req; @@ -1717,6 +1729,20 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) return NULL; ret = 1; } + + /* + * Don't initialise the fields below on every allocation, but + * do that in advance and keep valid on free. 
+ */ + for (i = 0; i < ret; i++) { + struct io_kiocb *req = state->reqs[i]; + + req->ctx = ctx; + req->link = NULL; + req->async_data = NULL; + /* not necessary, but safer to zero */ + req->result = 0; + } state->free_reqs = ret; } got_req: @@ -1740,11 +1766,9 @@ static void io_dismantle_req(struct io_kiocb *req) io_put_file(req->file); if (req->fixed_rsrc_refs) percpu_ref_put(req->fixed_rsrc_refs); - if (req->async_data) + if (req->async_data) { kfree(req->async_data); - if (req->work.creds) { - put_cred(req->work.creds); - req->work.creds = NULL; + req->async_data = NULL; } } @@ -1826,7 +1850,7 @@ static bool io_disarm_next(struct io_kiocb *req) if (likely(req->flags & REQ_F_LINK_TIMEOUT)) posted = io_kill_linked_timeout(req); - if (unlikely((req->flags & REQ_F_FAIL_LINK) && + if (unlikely((req->flags & REQ_F_FAIL) && !(req->flags & REQ_F_HARDLINK))) { posted |= (req->link != NULL); io_fail_links(req); @@ -1844,7 +1868,7 @@ static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) * dependencies to the next request. In case of failure, fail the rest * of the chain. 
*/ - if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL_LINK)) { + if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL)) { struct io_ring_ctx *ctx = req->ctx; unsigned long flags; bool posted; @@ -1875,54 +1899,51 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx) return; if (ctx->submit_state.comp.nr) { mutex_lock(&ctx->uring_lock); - io_submit_flush_completions(&ctx->submit_state.comp, ctx); + io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); } percpu_ref_put(&ctx->refs); } -static bool __tctx_task_work(struct io_uring_task *tctx) +static void tctx_task_work(struct callback_head *cb) { struct io_ring_ctx *ctx = NULL; - struct io_wq_work_list list; - struct io_wq_work_node *node; - - if (wq_list_empty(&tctx->task_list)) - return false; - - spin_lock_irq(&tctx->task_lock); - list = tctx->task_list; - INIT_WQ_LIST(&tctx->task_list); - spin_unlock_irq(&tctx->task_lock); + struct io_uring_task *tctx = container_of(cb, struct io_uring_task, + task_work); - node = list.first; - while (node) { - struct io_wq_work_node *next = node->next; - struct io_kiocb *req; - - req = container_of(node, struct io_kiocb, io_task_work.node); - if (req->ctx != ctx) { - ctx_flush_and_put(ctx); - ctx = req->ctx; - percpu_ref_get(&ctx->refs); + while (1) { + struct io_wq_work_node *node; + + spin_lock_irq(&tctx->task_lock); + node = tctx->task_list.first; + INIT_WQ_LIST(&tctx->task_list); + spin_unlock_irq(&tctx->task_lock); + + while (node) { + struct io_wq_work_node *next = node->next; + struct io_kiocb *req = container_of(node, struct io_kiocb, + io_task_work.node); + + if (req->ctx != ctx) { + ctx_flush_and_put(ctx); + ctx = req->ctx; + percpu_ref_get(&ctx->refs); + } + req->task_work.func(&req->task_work); + node = next; } - - req->task_work.func(&req->task_work); - node = next; + if (wq_list_empty(&tctx->task_list)) { + clear_bit(0, &tctx->task_state); + if (wq_list_empty(&tctx->task_list)) + break; + /* another tctx_task_work() is enqueued, yield */ + if 
(test_and_set_bit(0, &tctx->task_state)) + break; + } + cond_resched(); } ctx_flush_and_put(ctx); - return list.first != NULL; -} - -static void tctx_task_work(struct callback_head *cb) -{ - struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work); - - clear_bit(0, &tctx->task_state); - - while (__tctx_task_work(tctx)) - cond_resched(); } static int io_req_task_work_add(struct io_kiocb *req) @@ -2123,26 +2144,26 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req, list_add(&req->compl.list, &state->comp.free_list); } -static void io_submit_flush_completions(struct io_comp_state *cs, - struct io_ring_ctx *ctx) +static void io_submit_flush_completions(struct io_ring_ctx *ctx) { + struct io_comp_state *cs = &ctx->submit_state.comp; int i, nr = cs->nr; - struct io_kiocb *req; struct req_batch rb; - io_init_req_batch(&rb); spin_lock_irq(&ctx->completion_lock); for (i = 0; i < nr; i++) { - req = cs->reqs[i]; + struct io_kiocb *req = cs->reqs[i]; + __io_cqring_fill_event(ctx, req->user_data, req->result, req->compl.cflags); } io_commit_cqring(ctx); spin_unlock_irq(&ctx->completion_lock); - io_cqring_ev_posted(ctx); + + io_init_req_batch(&rb); for (i = 0; i < nr; i++) { - req = cs->reqs[i]; + struct io_kiocb *req = cs->reqs[i]; /* submission and completion refs */ if (req_ref_sub_and_test(req, 2)) @@ -2230,12 +2251,6 @@ static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req) static inline bool io_run_task_work(void) { - /* - * Not safe to run on exiting task, and the task_work handling will - * not add work to such a task. - */ - if (unlikely(current->flags & PF_EXITING)) - return false; if (current->task_works) { __set_current_state(TASK_RUNNING); task_work_run(); @@ -2299,7 +2314,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, * Only spin for completions if we don't have multiple devices hanging * off our complete list, and we're under the requested amount. 
*/ - spin = !ctx->poll_multi_file && *nr_events < min; + spin = !ctx->poll_multi_queue && *nr_events < min; ret = 0; list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) { @@ -2384,7 +2399,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) * If we do, we can potentially be spinning for commands that * already triggered a CQE (eg in error). */ - if (test_bit(0, &ctx->cq_check_overflow)) + if (test_bit(0, &ctx->check_cq_overflow)) __io_cqring_overflow_flush(ctx, false); if (io_cqring_events(ctx)) goto out; @@ -2483,7 +2498,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2, req->flags |= REQ_F_REISSUE; return; } - req_set_fail_links(req); + req_set_fail(req); } if (req->flags & REQ_F_BUFFER_SELECTED) cflags = io_put_rw_kbuf(req); @@ -2506,7 +2521,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) if (unlikely(res != req->result)) { if (!(res == -EAGAIN && io_rw_should_reissue(req) && io_resubmit_prep(req))) { - req_set_fail_links(req); + req_set_fail(req); req->flags |= REQ_F_DONT_REISSUE; } } @@ -2523,9 +2538,14 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) * find it from a io_do_iopoll() thread before the issuer is done * accessing the kiocb cookie. */ -static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async) +static void io_iopoll_req_issued(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + const bool in_async = io_wq_current_is_worker(); + + /* workqueue context doesn't hold uring_lock, grab it now */ + if (unlikely(in_async)) + mutex_lock(&ctx->uring_lock); /* * Track whether we have multiple files in our lists. This will impact @@ -2533,14 +2553,22 @@ static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async) * different devices. 
*/ if (list_empty(&ctx->iopoll_list)) { - ctx->poll_multi_file = false; - } else if (!ctx->poll_multi_file) { + ctx->poll_multi_queue = false; + } else if (!ctx->poll_multi_queue) { struct io_kiocb *list_req; + unsigned int queue_num0, queue_num1; list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb, inflight_entry); - if (list_req->file != req->file) - ctx->poll_multi_file = true; + + if (list_req->file != req->file) { + ctx->poll_multi_queue = true; + } else { + queue_num0 = blk_qc_t_to_queue_num(list_req->rw.kiocb.ki_cookie); + queue_num1 = blk_qc_t_to_queue_num(req->rw.kiocb.ki_cookie); + if (queue_num0 != queue_num1) + ctx->poll_multi_queue = true; + } } /* @@ -2552,14 +2580,19 @@ static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async) else list_add_tail(&req->inflight_entry, &ctx->iopoll_list); - /* - * If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq thread - * task context or in io worker task context. If current task context is - * sq thread, we don't need to check whether should wake up sq thread. - */ - if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) && - wq_has_sleeper(&ctx->sq_data->wait)) - wake_up(&ctx->sq_data->wait); + if (unlikely(in_async)) { + /* + * If IORING_SETUP_SQPOLL is enabled, sqes are either handle + * in sq thread task context or in io worker task context. If + * current task context is sq thread, we don't need to check + * whether should wake up sq thread. 
+ */ + if ((ctx->flags & IORING_SETUP_SQPOLL) && + wq_has_sleeper(&ctx->sq_data->wait)) + wake_up(&ctx->sq_data->wait); + + mutex_unlock(&ctx->uring_lock); + } } static inline void io_state_file_put(struct io_submit_state *state) @@ -2616,7 +2649,7 @@ static bool __io_file_supports_async(struct file *file, int rw) return true; return false; } - if (S_ISCHR(mode) || S_ISSOCK(mode)) + if (S_ISSOCK(mode)) return true; if (S_ISREG(mode)) { if (IS_ENABLED(CONFIG_BLOCK) && @@ -2749,12 +2782,12 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, if (req->flags & REQ_F_CUR_POS) req->file->f_pos = kiocb->ki_pos; - if (ret >= 0 && kiocb->ki_complete == io_complete_rw) + if (ret >= 0 && check_reissue) __io_complete_rw(req, ret, 0, issue_flags); else io_rw_done(kiocb, ret); - if (check_reissue && req->flags & REQ_F_REISSUE) { + if (check_reissue && (req->flags & REQ_F_REISSUE)) { req->flags &= ~REQ_F_REISSUE; if (io_resubmit_prep(req)) { req_ref_get(req); @@ -2762,7 +2795,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, } else { int cflags = 0; - req_set_fail_links(req); + req_set_fail(req); if (req->flags & REQ_F_BUFFER_SELECTED) cflags = io_put_rw_kbuf(req); __io_req_complete(req, issue_flags, ret, cflags); @@ -3233,7 +3266,7 @@ static bool io_rw_should_retry(struct io_kiocb *req) return true; } -static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) +static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) { if (req->file->f_op->read_iter) return call_read_iter(req->file, &req->rw.kiocb, iter); @@ -3448,6 +3481,10 @@ static int io_renameat_prep(struct io_kiocb *req, struct io_rename *ren = &req->rename; const char __user *oldf, *newf; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -3484,7 +3521,7 @@ static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) 
req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); io_req_complete(req, ret); return 0; } @@ -3495,6 +3532,10 @@ static int io_unlinkat_prep(struct io_kiocb *req, struct io_unlink *un = &req->unlink; const char __user *fname; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index) + return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -3528,7 +3569,7 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); io_req_complete(req, ret); return 0; } @@ -3565,7 +3606,7 @@ static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) ret = __sys_shutdown_sock(sock, req->shutdown.how); if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); io_req_complete(req, ret); return 0; #else @@ -3576,7 +3617,7 @@ static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) static int __io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_splice* sp = &req->splice; + struct io_splice *sp = &req->splice; unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) @@ -3623,14 +3664,14 @@ static int io_tee(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; if (ret != sp->len) - req_set_fail_links(req); + req_set_fail(req); io_req_complete(req, ret); return 0; } static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_splice* sp = &req->splice; + struct io_splice *sp = &req->splice; sp->off_in = READ_ONCE(sqe->splice_off_in); sp->off_out = READ_ONCE(sqe->off); @@ -3660,7 +3701,7 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; if (ret != sp->len) - req_set_fail_links(req); + req_set_fail(req); 
io_req_complete(req, ret); return 0; } @@ -3713,7 +3754,7 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) end > 0 ? end : LLONG_MAX, req->sync.flags & IORING_FSYNC_DATASYNC); if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); io_req_complete(req, ret); return 0; } @@ -3742,7 +3783,7 @@ static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, req->sync.len); if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); io_req_complete(req, ret); return 0; } @@ -3836,32 +3877,31 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) goto err; file = do_filp_open(req->open.dfd, req->open.filename, &op); - /* only retry if RESOLVE_CACHED wasn't already set by application */ - if ((!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)) && - file == ERR_PTR(-EAGAIN)) { + if (IS_ERR(file)) { /* - * We could hang on to this 'fd', but seems like marginal - * gain for something that is now known to be a slower path. - * So just put it, and we'll get a new one when we retry. + * We could hang on to this 'fd' on retrying, but seems like + * marginal gain for something that is now known to be a slower + * path. So just put it, and we'll get a new one when we retry. 
*/ put_unused_fd(ret); - return -EAGAIN; - } - if (IS_ERR(file)) { - put_unused_fd(ret); ret = PTR_ERR(file); - } else { - if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) - file->f_flags &= ~O_NONBLOCK; - fsnotify_open(file); - fd_install(ret, file); + /* only retry if RESOLVE_CACHED wasn't already set by application */ + if (ret == -EAGAIN && + (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK))) + return -EAGAIN; + goto err; } + + if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) + file->f_flags &= ~O_NONBLOCK; + fsnotify_open(file); + fd_install(ret, file); err: putname(req->open.filename); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -3933,7 +3973,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) if (head) ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs); if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); /* complete before unlock, IOPOLL may need the lock */ __io_req_complete(req, issue_flags, ret, 0); @@ -4024,7 +4064,7 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) __io_remove_buffers(ctx, head, p->bgid, -1U); } if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); /* complete before unlock, IOPOLL may need the lock */ __io_req_complete(req, issue_flags, ret, 0); io_ring_submit_unlock(ctx, !force_nonblock); @@ -4070,7 +4110,7 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); __io_req_complete(req, issue_flags, ret, 0); return 0; #else @@ -4106,7 +4146,7 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); io_req_complete(req, ret); return 0; #else @@ -4145,7 +4185,7 @@ static int 
io_fadvise(struct io_kiocb *req, unsigned int issue_flags) ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -4180,7 +4220,7 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags) ctx->buffer); if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); io_req_complete(req, ret); return 0; } @@ -4238,7 +4278,7 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) ret = filp_close(file, current->files); err: if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); if (file) fput(file); __io_req_complete(req, issue_flags, ret, 0); @@ -4271,7 +4311,7 @@ static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) ret = sync_file_range(req->file, req->sync.off, req->sync.len, req->sync.flags); if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); io_req_complete(req, ret); return 0; } @@ -4375,7 +4415,7 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) kfree(kmsg->free_iov); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < min_ret) - req_set_fail_links(req); + req_set_fail(req); __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -4417,7 +4457,7 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) ret = -EINTR; if (ret < min_ret) - req_set_fail_links(req); + req_set_fail(req); __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -4612,7 +4652,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) kfree(kmsg->free_iov); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)))) - req_set_fail_links(req); + req_set_fail(req); __io_req_complete(req, issue_flags, ret, cflags); return 0; } @@ -4667,7 +4707,7 @@ out_free: if (req->flags & REQ_F_BUFFER_SELECTED) cflags = io_put_recv_kbuf(req); if (ret < min_ret || ((flags & MSG_WAITALL) && 
(msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)))) - req_set_fail_links(req); + req_set_fail(req); __io_req_complete(req, issue_flags, ret, cflags); return 0; } @@ -4706,7 +4746,7 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) { if (ret == -ERESTARTSYS) ret = -EINTR; - req_set_fail_links(req); + req_set_fail(req); } __io_req_complete(req, issue_flags, ret, 0); return 0; @@ -4770,7 +4810,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags) ret = -EINTR; out: if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -5059,7 +5099,7 @@ static void io_async_task_func(struct callback_head *cb) struct async_poll *apoll = req->apoll; struct io_ring_ctx *ctx = req->ctx; - trace_io_uring_task_run(req->ctx, req->opcode, req->user_data); + trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data); if (io_poll_rewait(req, &apoll->poll)) { spin_unlock_irq(&ctx->completion_lock); @@ -5138,50 +5178,51 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req, return mask; } -static bool io_arm_poll_handler(struct io_kiocb *req) +enum { + IO_APOLL_OK, + IO_APOLL_ABORTED, + IO_APOLL_READY +}; + +static int io_arm_poll_handler(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; struct io_ring_ctx *ctx = req->ctx; struct async_poll *apoll; struct io_poll_table ipt; - __poll_t mask, ret; + __poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI; int rw; if (!req->file || !file_can_poll(req->file)) - return false; + return IO_APOLL_ABORTED; if (req->flags & REQ_F_POLLED) - return false; - if (def->pollin) + return IO_APOLL_ABORTED; + if (!def->pollin && !def->pollout) + return IO_APOLL_ABORTED; + + if (def->pollin) { rw = READ; - else if (def->pollout) + mask |= POLLIN | POLLRDNORM; + + /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ + if ((req->opcode == IORING_OP_RECVMSG) && + (req->sr_msg.msg_flags & 
MSG_ERRQUEUE)) + mask &= ~POLLIN; + } else { rw = WRITE; - else - return false; + mask |= POLLOUT | POLLWRNORM; + } + /* if we can't nonblock try, then no point in arming a poll handler */ if (!io_file_supports_async(req, rw)) - return false; + return IO_APOLL_ABORTED; apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); if (unlikely(!apoll)) - return false; + return IO_APOLL_ABORTED; apoll->double_poll = NULL; - - req->flags |= REQ_F_POLLED; req->apoll = apoll; - - mask = EPOLLONESHOT; - if (def->pollin) - mask |= POLLIN | POLLRDNORM; - if (def->pollout) - mask |= POLLOUT | POLLWRNORM; - - /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ - if ((req->opcode == IORING_OP_RECVMSG) && - (req->sr_msg.msg_flags & MSG_ERRQUEUE)) - mask &= ~POLLIN; - - mask |= POLLERR | POLLPRI; - + req->flags |= REQ_F_POLLED; ipt.pt._qproc = io_async_queue_proc; ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, @@ -5189,12 +5230,14 @@ static bool io_arm_poll_handler(struct io_kiocb *req) if (ret || ipt.error) { io_poll_remove_double(req); spin_unlock_irq(&ctx->completion_lock); - return false; + if (ret) + return IO_APOLL_READY; + return IO_APOLL_ABORTED; } spin_unlock_irq(&ctx->completion_lock); - trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask, - apoll->poll.events); - return true; + trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data, + mask, apoll->poll.events); + return IO_APOLL_OK; } static bool __io_poll_remove_one(struct io_kiocb *req, @@ -5241,7 +5284,7 @@ static bool io_poll_remove_one(struct io_kiocb *req) if (do_complete) { io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0); io_commit_cqring(req->ctx); - req_set_fail_links(req); + req_set_fail(req); io_put_req_deferred(req, 1); } @@ -5252,7 +5295,7 @@ static bool io_poll_remove_one(struct io_kiocb *req) * Returns true if we found and killed one or more poll requests */ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, - struct files_struct 
*files) + bool cancel_all) { struct hlist_node *tmp; struct io_kiocb *req; @@ -5264,7 +5307,7 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, list = &ctx->cancel_hash[i]; hlist_for_each_entry_safe(req, tmp, list, hash_node) { - if (io_match_task(req, tsk, files)) + if (io_match_task(req, tsk, cancel_all)) posted += io_poll_remove_one(req); } } @@ -5451,7 +5494,7 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) err: if (ret < 0) { spin_unlock_irq(&ctx->completion_lock); - req_set_fail_links(req); + req_set_fail(req); io_req_complete(req, ret); return 0; } @@ -5471,7 +5514,7 @@ err: if (!completing) { ret = io_poll_add(preq, issue_flags); if (ret < 0) { - req_set_fail_links(preq); + req_set_fail(preq); io_req_complete(preq, ret); } } @@ -5496,7 +5539,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); - req_set_fail_links(req); + req_set_fail(req); io_put_req(req); return HRTIMER_NORESTART; } @@ -5532,7 +5575,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) if (IS_ERR(req)) return PTR_ERR(req); - req_set_fail_links(req); + req_set_fail(req); io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0); io_put_req_deferred(req, 1); return 0; @@ -5611,7 +5654,7 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); io_put_req(req); return 0; } @@ -5634,6 +5677,8 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return -EINVAL; req->timeout.off = off; + if (unlikely(off && !req->ctx->off_timeout_used)) + req->ctx->off_timeout_used = true; if (!req->async_data && io_alloc_async_data(req)) return -ENOMEM; @@ -5764,7 +5809,7 @@ done: io_cqring_ev_posted(ctx); if (ret < 0) - req_set_fail_links(req); + 
req_set_fail(req); } static int io_async_cancel_prep(struct io_kiocb *req, @@ -5821,7 +5866,7 @@ done: io_cqring_ev_posted(ctx); if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); io_put_req(req); return 0; } @@ -5863,7 +5908,7 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) mutex_unlock(&ctx->uring_lock); if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -5974,48 +6019,69 @@ static int io_req_prep_async(struct io_kiocb *req) static u32 io_get_sequence(struct io_kiocb *req) { - struct io_kiocb *pos; - struct io_ring_ctx *ctx = req->ctx; - u32 total_submitted, nr_reqs = 0; + u32 seq = req->ctx->cached_sq_head; - io_for_each_link(pos, req) - nr_reqs++; - - total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped; - return total_submitted - nr_reqs; + /* need original cached_sq_head, but it was increased for each req */ + io_for_each_link(req, req) + seq--; + return seq; } -static int io_req_defer(struct io_kiocb *req) +static bool io_drain_req(struct io_kiocb *req) { + struct io_kiocb *pos; struct io_ring_ctx *ctx = req->ctx; struct io_defer_entry *de; int ret; u32 seq; + /* + * If we need to drain a request in the middle of a link, drain the + * head request and the next request/link after the current link. + * Considering sequential execution of links, IOSQE_IO_DRAIN will be + * maintained for every request of our link. + */ + if (ctx->drain_next) { + req->flags |= REQ_F_IO_DRAIN; + ctx->drain_next = false; + } + /* not interested in head, start from the first linked */ + io_for_each_link(pos, req->link) { + if (pos->flags & REQ_F_IO_DRAIN) { + ctx->drain_next = true; + req->flags |= REQ_F_IO_DRAIN; + break; + } + } + /* Still need defer if there is pending req in defer list. 
*/ if (likely(list_empty_careful(&ctx->defer_list) && - !(req->flags & REQ_F_IO_DRAIN))) - return 0; + !(req->flags & REQ_F_IO_DRAIN))) { + ctx->drain_active = false; + return false; + } seq = io_get_sequence(req); /* Still a chance to pass the sequence check */ if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) - return 0; + return false; ret = io_req_prep_async(req); if (ret) return ret; io_prep_async_link(req); de = kmalloc(sizeof(*de), GFP_KERNEL); - if (!de) - return -ENOMEM; + if (!de) { + io_req_complete_failed(req, ret); + return true; + } spin_lock_irq(&ctx->completion_lock); if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { spin_unlock_irq(&ctx->completion_lock); kfree(de); io_queue_async_work(req); - return -EIOCBQUEUED; + return true; } trace_io_uring_defer(ctx, req, req->user_data); @@ -6023,7 +6089,7 @@ static int io_req_defer(struct io_kiocb *req) de->seq = seq; list_add_tail(&de->list, &ctx->defer_list); spin_unlock_irq(&ctx->completion_lock); - return -EIOCBQUEUED; + return true; } static void io_clean_op(struct io_kiocb *req) @@ -6040,7 +6106,6 @@ static void io_clean_op(struct io_kiocb *req) kfree(req->sr_msg.kbuf); break; } - req->flags &= ~REQ_F_BUFFER_SELECTED; } if (req->flags & REQ_F_NEED_CLEANUP) { @@ -6052,8 +6117,8 @@ static void io_clean_op(struct io_kiocb *req) case IORING_OP_WRITE_FIXED: case IORING_OP_WRITE: { struct io_async_rw *io = req->async_data; - if (io->free_iovec) - kfree(io->free_iovec); + + kfree(io->free_iovec); break; } case IORING_OP_RECVMSG: @@ -6081,7 +6146,6 @@ static void io_clean_op(struct io_kiocb *req) putname(req->unlink.filename); break; } - req->flags &= ~REQ_F_NEED_CLEANUP; } if ((req->flags & REQ_F_POLLED) && req->apoll) { kfree(req->apoll->double_poll); @@ -6092,8 +6156,11 @@ static void io_clean_op(struct io_kiocb *req) struct io_uring_task *tctx = req->task->io_uring; atomic_dec(&tctx->inflight_tracked); - req->flags &= ~REQ_F_INFLIGHT; } + if (req->flags & REQ_F_CREDS) + 
put_cred(req->creds); + + req->flags &= ~IO_REQ_CLEAN_FLAGS; } static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) @@ -6102,8 +6169,8 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) const struct cred *creds = NULL; int ret; - if (req->work.creds && req->work.creds != current_cred()) - creds = override_creds(req->work.creds); + if ((req->flags & REQ_F_CREDS) && req->creds != current_cred()) + creds = override_creds(req->creds); switch (req->opcode) { case IORING_OP_NOP: @@ -6213,23 +6280,11 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (creds) revert_creds(creds); - if (ret) return ret; - /* If the op doesn't have a file, we're not polling for it */ - if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) { - const bool in_async = io_wq_current_is_worker(); - - /* workqueue context doesn't hold uring_lock, grab it now */ - if (in_async) - mutex_lock(&ctx->uring_lock); - - io_iopoll_req_issued(req, in_async); - - if (in_async) - mutex_unlock(&ctx->uring_lock); - } + if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) + io_iopoll_req_issued(req); return 0; } @@ -6411,6 +6466,7 @@ static void __io_queue_sqe(struct io_kiocb *req) struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); int ret; +issue_sqe: ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); /* @@ -6425,17 +6481,21 @@ static void __io_queue_sqe(struct io_kiocb *req) cs->reqs[cs->nr++] = req; if (cs->nr == ARRAY_SIZE(cs->reqs)) - io_submit_flush_completions(cs, ctx); + io_submit_flush_completions(ctx); } else { io_put_req(req); } } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { - if (!io_arm_poll_handler(req)) { + switch (io_arm_poll_handler(req)) { + case IO_APOLL_READY: + goto issue_sqe; + case IO_APOLL_ABORTED: /* * Queued up for async execution, worker will release * submit reference when the iocb is actually submitted. 
*/ io_queue_async_work(req); + break; } } else { io_req_complete_failed(req, ret); @@ -6444,23 +6504,20 @@ static void __io_queue_sqe(struct io_kiocb *req) io_queue_linked_timeout(linked_timeout); } -static void io_queue_sqe(struct io_kiocb *req) +static inline void io_queue_sqe(struct io_kiocb *req) { - int ret; + if (unlikely(req->ctx->drain_active) && io_drain_req(req)) + return; - ret = io_req_defer(req); - if (ret) { - if (ret != -EIOCBQUEUED) { -fail_req: - io_req_complete_failed(req, ret); - } - } else if (req->flags & REQ_F_FORCE_ASYNC) { - ret = io_req_prep_async(req); - if (unlikely(ret)) - goto fail_req; - io_queue_async_work(req); - } else { + if (likely(!(req->flags & REQ_F_FORCE_ASYNC))) { __io_queue_sqe(req); + } else { + int ret = io_req_prep_async(req); + + if (unlikely(ret)) + io_req_complete_failed(req, ret); + else + io_queue_async_work(req); } } @@ -6473,7 +6530,7 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx, struct io_kiocb *req, unsigned int sqe_flags) { - if (!ctx->restricted) + if (likely(!ctx->restricted)) return true; if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) @@ -6501,35 +6558,33 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, /* same numerical values with corresponding REQ_F_*, safe to copy */ req->flags = sqe_flags = READ_ONCE(sqe->flags); req->user_data = READ_ONCE(sqe->user_data); - req->async_data = NULL; req->file = NULL; - req->ctx = ctx; - req->link = NULL; req->fixed_rsrc_refs = NULL; /* one is dropped after submission, the other at completion */ atomic_set(&req->refs, 2); req->task = current; - req->result = 0; - req->work.creds = NULL; /* enforce forwards compatibility on users */ if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) return -EINVAL; if (unlikely(req->opcode >= IORING_OP_LAST)) return -EINVAL; - if (unlikely(!io_check_restriction(ctx, req, sqe_flags))) + if (!io_check_restriction(ctx, req, sqe_flags)) return -EACCES; if ((sqe_flags & IOSQE_BUFFER_SELECT) && 
!io_op_defs[req->opcode].buffer_select) return -EOPNOTSUPP; + if (unlikely(sqe_flags & IOSQE_IO_DRAIN)) + ctx->drain_active = true; personality = READ_ONCE(sqe->personality); if (personality) { - req->work.creds = xa_load(&ctx->personalities, personality); - if (!req->work.creds) + req->creds = xa_load(&ctx->personalities, personality); + if (!req->creds) return -EINVAL; - get_cred(req->work.creds); + get_cred(req->creds); + req->flags |= REQ_F_CREDS; } state = &ctx->submit_state; @@ -6566,20 +6621,22 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, fail_req: if (link->head) { /* fail even hard links since we don't submit */ - link->head->flags |= REQ_F_FAIL_LINK; + req_set_fail(link->head); io_req_complete_failed(link->head, -ECANCELED); link->head = NULL; } io_req_complete_failed(req, ret); return ret; } + ret = io_req_prep(req, sqe); if (unlikely(ret)) goto fail_req; /* don't need @sqe from now on */ - trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, - true, ctx->flags & IORING_SETUP_SQPOLL); + trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data, + req->flags, true, + ctx->flags & IORING_SETUP_SQPOLL); /* * If we already have a head request, queue this one for async @@ -6591,17 +6648,6 @@ fail_req: if (link->head) { struct io_kiocb *head = link->head; - /* - * Taking sequential execution of a link, draining both sides - * of the link also fullfils IOSQE_IO_DRAIN semantics for all - * requests in the link. So, it drains the head and the - * next after the link request. The last one is done via - * drain_next flag to persist the effect across calls. 
- */ - if (req->flags & REQ_F_IO_DRAIN) { - head->flags |= REQ_F_IO_DRAIN; - ctx->drain_next = 1; - } ret = io_req_prep_async(req); if (unlikely(ret)) goto fail_req; @@ -6611,14 +6657,10 @@ fail_req: /* last request of a link, enqueue the link */ if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { - io_queue_sqe(head); link->head = NULL; + io_queue_sqe(head); } } else { - if (unlikely(ctx->drain_next)) { - req->flags |= REQ_F_IO_DRAIN; - ctx->drain_next = 0; - } if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { link->head = req; link->last = req; @@ -6639,7 +6681,7 @@ static void io_submit_state_end(struct io_submit_state *state, if (state->link.head) io_queue_sqe(state->link.head); if (state->comp.nr) - io_submit_flush_completions(&state->comp, ctx); + io_submit_flush_completions(ctx); if (state->plug_started) blk_finish_plug(&state->plug); io_state_file_put(state); @@ -6670,7 +6712,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) } /* - * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory + * Fetch an sqe, if one is available. Note this returns a pointer to memory * that is mapped by userspace. This means that care needs to be taken to * ensure that reads are stable, as we cannot rely on userspace always * being a good citizen. If members of the sqe are validated and then later @@ -6679,8 +6721,8 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) */ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) { - u32 *sq_array = ctx->sq_array; - unsigned head; + unsigned head, mask = ctx->sq_entries - 1; + unsigned sq_idx = ctx->cached_sq_head++ & mask; /* * The cached sq head (or cq tail) serves two purposes: @@ -6690,28 +6732,36 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) * 2) allows the kernel side to track the head on its own, even * though the application is the one updating it. 
*/ - head = READ_ONCE(sq_array[ctx->cached_sq_head++ & ctx->sq_mask]); + head = READ_ONCE(ctx->sq_array[sq_idx]); if (likely(head < ctx->sq_entries)) return &ctx->sq_sqes[head]; /* drop invalid entries */ - ctx->cached_sq_dropped++; - WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped); + ctx->cq_extra--; + WRITE_ONCE(ctx->rings->sq_dropped, + READ_ONCE(ctx->rings->sq_dropped) + 1); return NULL; } static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) { + struct io_uring_task *tctx; int submitted = 0; /* make sure SQ entry isn't read before tail */ nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx)); - if (!percpu_ref_tryget_many(&ctx->refs, nr)) return -EAGAIN; - percpu_counter_add(&current->io_uring->inflight, nr); - refcount_add(nr, &current->usage); + tctx = current->io_uring; + tctx->cached_refs -= nr; + if (unlikely(tctx->cached_refs < 0)) { + unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; + + percpu_counter_add(&tctx->inflight, refill); + refcount_add(refill, &current->usage); + tctx->cached_refs += refill; + } io_submit_state_start(&ctx->submit_state, nr); while (submitted < nr) { @@ -6737,12 +6787,10 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) if (unlikely(submitted != nr)) { int ref_used = (submitted == -EAGAIN) ? 
0 : submitted; - struct io_uring_task *tctx = current->io_uring; int unused = nr - ref_used; + current->io_uring->cached_refs += unused; percpu_ref_put_many(&ctx->refs, unused); - percpu_counter_sub(&tctx->inflight, unused); - put_task_struct_many(current, unused); } io_submit_state_end(&ctx->submit_state, ctx); @@ -6752,6 +6800,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) return submitted; } +static inline bool io_sqd_events_pending(struct io_sq_data *sqd) +{ + return READ_ONCE(sqd->state); +} + static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx) { /* Tell userspace we may need a wakeup call */ @@ -6774,11 +6827,15 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) to_submit = io_sqring_entries(ctx); /* if we're handling multiple rings, cap submit size for fairness */ - if (cap_entries && to_submit > 8) - to_submit = 8; + if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE) + to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE; if (!list_empty(&ctx->iopoll_list) || to_submit) { unsigned nr_events = 0; + const struct cred *creds = NULL; + + if (ctx->sq_creds != current_cred()) + creds = override_creds(ctx->sq_creds); mutex_lock(&ctx->uring_lock); if (!list_empty(&ctx->iopoll_list)) @@ -6792,10 +6849,12 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) !(ctx->flags & IORING_SETUP_R_DISABLED)) ret = io_submit_sqes(ctx, to_submit); mutex_unlock(&ctx->uring_lock); - } - if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait)) - wake_up(&ctx->sqo_sq_wait); + if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait)) + wake_up(&ctx->sqo_sq_wait); + if (creds) + revert_creds(creds); + } return ret; } @@ -6810,6 +6869,22 @@ static void io_sqd_update_thread_idle(struct io_sq_data *sqd) sqd->sq_thread_idle = sq_thread_idle; } +static bool io_sqd_handle_event(struct io_sq_data *sqd) +{ + bool did_sig = false; + struct ksignal ksig; + + if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) 
|| + signal_pending(current)) { + mutex_unlock(&sqd->lock); + if (signal_pending(current)) + did_sig = get_signal(&ksig); + cond_resched(); + mutex_lock(&sqd->lock); + } + return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); +} + static int io_sq_thread(void *data) { struct io_sq_data *sqd = data; @@ -6828,48 +6903,26 @@ static int io_sq_thread(void *data) current->flags |= PF_NO_SETAFFINITY; mutex_lock(&sqd->lock); - /* a user may had exited before the thread started */ - io_run_task_work_head(&sqd->park_task_work); - - while (!test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) { - int ret; - bool cap_entries, sqt_spin, needs_sched; - - if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) || - signal_pending(current)) { - bool did_sig = false; - - mutex_unlock(&sqd->lock); - if (signal_pending(current)) { - struct ksignal ksig; + while (1) { + bool cap_entries, sqt_spin = false; - did_sig = get_signal(&ksig); - } - cond_resched(); - mutex_lock(&sqd->lock); - io_run_task_work(); - io_run_task_work_head(&sqd->park_task_work); - if (did_sig) + if (io_sqd_events_pending(sqd) || signal_pending(current)) { + if (io_sqd_handle_event(sqd)) break; timeout = jiffies + sqd->sq_thread_idle; - continue; } - sqt_spin = false; + cap_entries = !list_is_singular(&sqd->ctx_list); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { - const struct cred *creds = NULL; + int ret = __io_sq_thread(ctx, cap_entries); - if (ctx->sq_creds != current_cred()) - creds = override_creds(ctx->sq_creds); - ret = __io_sq_thread(ctx, cap_entries); - if (creds) - revert_creds(creds); if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list))) sqt_spin = true; } + if (io_run_task_work()) + sqt_spin = true; if (sqt_spin || !time_after(jiffies, timeout)) { - io_run_task_work(); cond_resched(); if (sqt_spin) timeout = jiffies + sqd->sq_thread_idle; @@ -6877,12 +6930,12 @@ static int io_sq_thread(void *data) } prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); - if 
(!test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) { - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - io_ring_set_wakeup_flag(ctx); + if (!io_sqd_events_pending(sqd) && !current->task_works) { + bool needs_sched = true; - needs_sched = true; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + io_ring_set_wakeup_flag(ctx); + if ((ctx->flags & IORING_SETUP_IOPOLL) && !list_empty_careful(&ctx->iopoll_list)) { needs_sched = false; @@ -6904,16 +6957,14 @@ static int io_sq_thread(void *data) } finish_wait(&sqd->wait, &wait); - io_run_task_work_head(&sqd->park_task_work); timeout = jiffies + sqd->sq_thread_idle; } - io_uring_cancel_sqpoll(sqd); + io_uring_cancel_generic(true, sqd); sqd->thread = NULL; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_set_wakeup_flag(ctx); io_run_task_work(); - io_run_task_work_head(&sqd->park_task_work); mutex_unlock(&sqd->lock); complete(&sqd->exited); @@ -6950,7 +7001,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, * Cannot safely flush overflowed CQEs from here, ensure we wake up * the task, and the next invocation will do it. 
*/ - if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->cq_check_overflow)) + if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow)) return autoremove_wake_function(curr, mode, wake_flags, key); return -1; } @@ -6978,7 +7029,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, if (ret || io_should_wake(iowq)) return ret; /* let the caller flush overflows, retry */ - if (test_bit(0, &ctx->cq_check_overflow)) + if (test_bit(0, &ctx->check_cq_overflow)) return 1; *timeout = schedule_timeout(*timeout); @@ -7043,10 +7094,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, ret = -EBUSY; break; } - prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, + prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, TASK_INTERRUPTIBLE); ret = io_cqring_wait_schedule(ctx, &iowq, &timeout); - finish_wait(&ctx->wait, &iowq.wq); + finish_wait(&ctx->cq_wait, &iowq.wq); cond_resched(); } while (ret > 0); @@ -7055,14 +7106,36 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? 
ret : 0; } -static void io_free_file_tables(struct io_file_table *table, unsigned nr_files) +static void io_free_page_table(void **table, size_t size) { - unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE); + unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); for (i = 0; i < nr_tables; i++) - kfree(table->files[i]); - kfree(table->files); - table->files = NULL; + kfree(table[i]); + kfree(table); +} + +static void **io_alloc_page_table(size_t size) +{ + unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); + size_t init_size = size; + void **table; + + table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL); + if (!table) + return NULL; + + for (i = 0; i < nr_tables; i++) { + unsigned int this_size = min_t(size_t, size, PAGE_SIZE); + + table[i] = kzalloc(this_size, GFP_KERNEL); + if (!table[i]) { + io_free_page_table(table, init_size); + return NULL; + } + size -= this_size; + } + return table; } static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx) @@ -7151,33 +7224,77 @@ static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ct return ret; } +static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx) +{ + unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK; + unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT; + + return &data->tags[table_idx][off]; +} + static void io_rsrc_data_free(struct io_rsrc_data *data) { - kvfree(data->tags); + size_t size = data->nr * sizeof(data->tags[0][0]); + + if (data->tags) + io_free_page_table((void **)data->tags, size); kfree(data); } -static struct io_rsrc_data *io_rsrc_data_alloc(struct io_ring_ctx *ctx, - rsrc_put_fn *do_put, - unsigned nr) +static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put, + u64 __user *utags, unsigned nr, + struct io_rsrc_data **pdata) { struct io_rsrc_data *data; + int ret = -ENOMEM; + unsigned i; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) - return NULL; - - data->tags = kvcalloc(nr, sizeof(*data->tags), 
GFP_KERNEL); + return -ENOMEM; + data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0])); if (!data->tags) { kfree(data); - return NULL; + return -ENOMEM; } - atomic_set(&data->refs, 1); + data->nr = nr; data->ctx = ctx; data->do_put = do_put; + if (utags) { + ret = -EFAULT; + for (i = 0; i < nr; i++) { + u64 *tag_slot = io_get_tag_slot(data, i); + + if (copy_from_user(tag_slot, &utags[i], + sizeof(*tag_slot))) + goto fail; + } + } + + atomic_set(&data->refs, 1); init_completion(&data->done); - return data; + *pdata = data; + return 0; +fail: + io_rsrc_data_free(data); + return ret; +} + +static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) +{ + size_t size = nr_files * sizeof(struct io_fixed_file); + + table->files = (struct io_fixed_file **)io_alloc_page_table(size); + return !!table->files; +} + +static void io_free_file_tables(struct io_file_table *table, unsigned nr_files) +{ + size_t size = nr_files * sizeof(struct io_fixed_file); + + io_free_page_table((void **)table->files, size); + table->files = NULL; } static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) @@ -7441,31 +7558,6 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx) } #endif -static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) -{ - unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE); - - table->files = kcalloc(nr_tables, sizeof(*table->files), GFP_KERNEL); - if (!table->files) - return false; - - for (i = 0; i < nr_tables; i++) { - unsigned int this_files = min(nr_files, IORING_MAX_FILES_TABLE); - - table->files[i] = kcalloc(this_files, sizeof(*table->files[i]), - GFP_KERNEL); - if (!table->files[i]) - break; - nr_files -= this_files; - } - - if (i == nr_tables) - return true; - - io_free_file_tables(table, nr_tables * IORING_MAX_FILES_TABLE); - return false; -} - static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) { struct file *file = prsrc->file; @@ -7540,14 
+7632,13 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) if (prsrc->tag) { bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL; - unsigned long flags; io_ring_submit_lock(ctx, lock_ring); - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock_irq(&ctx->completion_lock); io_cqring_fill_event(ctx, prsrc->tag, 0, 0); ctx->cq_extra++; io_commit_cqring(ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); io_ring_submit_unlock(ctx, lock_ring); } @@ -7629,7 +7720,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, struct file *file; int fd, ret; unsigned i; - struct io_rsrc_data *file_data; if (ctx->file_data) return -EBUSY; @@ -7640,27 +7730,24 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, ret = io_rsrc_node_switch_start(ctx); if (ret) return ret; + ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args, + &ctx->file_data); + if (ret) + return ret; - file_data = io_rsrc_data_alloc(ctx, io_rsrc_file_put, nr_args); - if (!file_data) - return -ENOMEM; - ctx->file_data = file_data; ret = -ENOMEM; if (!io_alloc_file_tables(&ctx->file_table, nr_args)) goto out_free; for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { - u64 tag = 0; - - if ((tags && copy_from_user(&tag, &tags[i], sizeof(tag))) || - copy_from_user(&fd, &fds[i], sizeof(fd))) { + if (copy_from_user(&fd, &fds[i], sizeof(fd))) { ret = -EFAULT; goto out_fput; } /* allow sparse sets */ if (fd == -1) { ret = -EINVAL; - if (unlikely(tag)) + if (unlikely(*io_get_tag_slot(ctx->file_data, i))) goto out_fput; continue; } @@ -7681,7 +7768,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, fput(file); goto out_fput; } - ctx->file_data->tags[i] = tag; io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file); } @@ -7759,7 +7845,7 @@ static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, if (!prsrc) 
return -ENOMEM; - prsrc->tag = data->tags[idx]; + prsrc->tag = *io_get_tag_slot(data, idx); prsrc->rsrc = rsrc; list_add(&prsrc->list, &node->rsrc_list); return 0; @@ -7829,7 +7915,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, err = -EBADF; break; } - data->tags[up->offset + done] = tag; + *io_get_tag_slot(data, up->offset + done) = tag; io_fixed_file_set(file_slot, file); err = io_sqe_file_register(ctx, file, i); if (err) { @@ -7887,7 +7973,7 @@ static int io_uring_alloc_task_context(struct task_struct *task, struct io_uring_task *tctx; int ret; - tctx = kmalloc(sizeof(*tctx), GFP_KERNEL); + tctx = kzalloc(sizeof(*tctx), GFP_KERNEL); if (unlikely(!tctx)) return -ENOMEM; @@ -7907,13 +7993,11 @@ static int io_uring_alloc_task_context(struct task_struct *task, xa_init(&tctx->xa); init_waitqueue_head(&tctx->wait); - tctx->last = NULL; atomic_set(&tctx->in_idle, 0); atomic_set(&tctx->inflight_tracked, 0); task->io_uring = tctx; spin_lock_init(&tctx->task_lock); INIT_WQ_LIST(&tctx->task_list); - tctx->task_state = 0; init_task_work(&tctx->task_work, tctx_task_work); return 0; } @@ -7924,6 +8008,7 @@ void __io_uring_free(struct task_struct *tsk) WARN_ON_ONCE(!xa_empty(&tctx->xa)); WARN_ON_ONCE(tctx->io_wq); + WARN_ON_ONCE(tctx->cached_refs); percpu_counter_destroy(&tctx->inflight); kfree(tctx); @@ -8300,6 +8385,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, for (i = 0; i < nr_pages; i++) { struct vm_area_struct *vma = vmas[i]; + if (vma_is_shmem(vma)) + continue; if (vma->vm_file && !is_file_hugepages(vma->vm_file)) { ret = -EOPNOTSUPP; @@ -8397,9 +8484,9 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, ret = io_rsrc_node_switch_start(ctx); if (ret) return ret; - data = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, nr_args); - if (!data) - return -ENOMEM; + ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data); + if (ret) + return ret; ret = io_buffers_map_alloc(ctx, nr_args); 
if (ret) { io_rsrc_data_free(data); @@ -8407,19 +8494,13 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, } for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { - u64 tag = 0; - - if (tags && copy_from_user(&tag, &tags[i], sizeof(tag))) { - ret = -EFAULT; - break; - } ret = io_copy_iov(ctx, &iov, arg, i); if (ret) break; ret = io_buffer_validate(&iov); if (ret) break; - if (!iov.iov_base && tag) { + if (!iov.iov_base && *io_get_tag_slot(data, i)) { ret = -EINVAL; break; } @@ -8428,7 +8509,6 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, &last_hpage); if (ret) break; - data->tags[i] = tag; } WARN_ON_ONCE(ctx->buf_data); @@ -8493,7 +8573,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, } ctx->user_bufs[i] = imu; - ctx->buf_data->tags[offset] = tag; + *io_get_tag_slot(ctx->buf_data, offset) = tag; } if (needs_switch) @@ -8515,6 +8595,7 @@ static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg) ctx->cq_ev_fd = eventfd_ctx_fdget(fd); if (IS_ERR(ctx->cq_ev_fd)) { int ret = PTR_ERR(ctx->cq_ev_fd); + ctx->cq_ev_fd = NULL; return ret; } @@ -8638,7 +8719,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) struct io_ring_ctx *ctx = file->private_data; __poll_t mask = 0; - poll_wait(file, &ctx->cq_wait, wait); + poll_wait(file, &ctx->poll_wait, wait); /* * synchronizes with barrier from wq_has_sleeper call in * io_commit_cqring @@ -8660,7 +8741,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) * Users may get EPOLLIN meanwhile seeing nothing in cqring, this * pushs them to do the flush. */ - if (io_cqring_events(ctx) || test_bit(0, &ctx->cq_check_overflow)) + if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow)) mask |= EPOLLIN | EPOLLRDNORM; return mask; @@ -8708,7 +8789,7 @@ static void io_tctx_exit_cb(struct callback_head *cb) * node. It'll be removed by the end of cancellation, just ignore it. 
*/ if (!atomic_read(&tctx->in_idle)) - io_uring_del_task_file((unsigned long)work->ctx); + io_uring_del_tctx_node((unsigned long)work->ctx); complete(&work->completion); } @@ -8734,7 +8815,7 @@ static void io_ring_exit_work(struct work_struct *work) * as nobody else will be looking for them. */ do { - io_uring_try_cancel_requests(ctx, NULL, NULL); + io_uring_try_cancel_requests(ctx, NULL, true); if (ctx->sq_data) { struct io_sq_data *sqd = ctx->sq_data; struct task_struct *tsk; @@ -8785,14 +8866,14 @@ static void io_ring_exit_work(struct work_struct *work) /* Returns true if we found and killed one or more timeouts */ static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, - struct files_struct *files) + bool cancel_all) { struct io_kiocb *req, *tmp; int canceled = 0; spin_lock_irq(&ctx->completion_lock); list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { - if (io_match_task(req, tsk, files)) { + if (io_match_task(req, tsk, cancel_all)) { io_kill_timeout(req, -ECANCELED); canceled++; } @@ -8818,8 +8899,8 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) io_unregister_personality(ctx, index); mutex_unlock(&ctx->uring_lock); - io_kill_timeouts(ctx, NULL, NULL); - io_poll_remove_all(ctx, NULL, NULL); + io_kill_timeouts(ctx, NULL, true); + io_poll_remove_all(ctx, NULL, true); /* if we failed setting up the ctx, we might not have any rings */ io_iopoll_try_reap_events(ctx); @@ -8845,7 +8926,7 @@ static int io_uring_release(struct inode *inode, struct file *file) struct io_task_cancel { struct task_struct *task; - struct files_struct *files; + bool all; }; static bool io_cancel_task_cb(struct io_wq_work *work, void *data) @@ -8854,30 +8935,29 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data) struct io_task_cancel *cancel = data; bool ret; - if (cancel->files && (req->flags & REQ_F_LINK_TIMEOUT)) { + if (!cancel->all && (req->flags & REQ_F_LINK_TIMEOUT)) { unsigned long flags; struct 
io_ring_ctx *ctx = req->ctx; /* protect against races with linked timeouts */ spin_lock_irqsave(&ctx->completion_lock, flags); - ret = io_match_task(req, cancel->task, cancel->files); + ret = io_match_task(req, cancel->task, cancel->all); spin_unlock_irqrestore(&ctx->completion_lock, flags); } else { - ret = io_match_task(req, cancel->task, cancel->files); + ret = io_match_task(req, cancel->task, cancel->all); } return ret; } static bool io_cancel_defer_files(struct io_ring_ctx *ctx, - struct task_struct *task, - struct files_struct *files) + struct task_struct *task, bool cancel_all) { struct io_defer_entry *de; LIST_HEAD(list); spin_lock_irq(&ctx->completion_lock); list_for_each_entry_reverse(de, &ctx->defer_list, list) { - if (io_match_task(de->req, task, files)) { + if (io_match_task(de->req, task, cancel_all)) { list_cut_position(&list, &ctx->defer_list, &de->list); break; } @@ -8921,9 +9001,9 @@ static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, - struct files_struct *files) + bool cancel_all) { - struct io_task_cancel cancel = { .task = task, .files = files, }; + struct io_task_cancel cancel = { .task = task, .all = cancel_all, }; struct io_uring_task *tctx = task ? 
task->io_uring : NULL; while (1) { @@ -8943,7 +9023,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, } /* SQPOLL thread does its own polling */ - if ((!(ctx->flags & IORING_SETUP_SQPOLL) && !files) || + if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || (ctx->sq_data && ctx->sq_data->thread == current)) { while (!list_empty_careful(&ctx->iopoll_list)) { io_iopoll_try_reap_events(ctx); @@ -8951,10 +9031,11 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, } } - ret |= io_cancel_defer_files(ctx, task, files); - ret |= io_poll_remove_all(ctx, task, files); - ret |= io_kill_timeouts(ctx, task, files); - ret |= io_run_task_work(); + ret |= io_cancel_defer_files(ctx, task, cancel_all); + ret |= io_poll_remove_all(ctx, task, cancel_all); + ret |= io_kill_timeouts(ctx, task, cancel_all); + if (task) + ret |= io_run_task_work(); ret |= io_run_ctx_fallback(ctx); if (!ret) break; @@ -8962,7 +9043,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, } } -static int __io_uring_add_task_file(struct io_ring_ctx *ctx) +static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) { struct io_uring_task *tctx = current->io_uring; struct io_tctx_node *node; @@ -8999,19 +9080,19 @@ static int __io_uring_add_task_file(struct io_ring_ctx *ctx) /* * Note that this task has used io_uring. We use it for cancelation purposes. */ -static inline int io_uring_add_task_file(struct io_ring_ctx *ctx) +static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx) { struct io_uring_task *tctx = current->io_uring; if (likely(tctx && tctx->last == ctx)) return 0; - return __io_uring_add_task_file(ctx); + return __io_uring_add_tctx_node(ctx); } /* * Remove this io_uring_file -> task mapping. 
*/ -static void io_uring_del_task_file(unsigned long index) +static void io_uring_del_tctx_node(unsigned long index) { struct io_uring_task *tctx = current->io_uring; struct io_tctx_node *node; @@ -9041,7 +9122,7 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx) unsigned long index; xa_for_each(&tctx->xa, index, node) - io_uring_del_task_file(index); + io_uring_del_tctx_node(index); if (wq) { /* * Must be after io_uring_del_task_file() (removes nodes under @@ -9059,99 +9140,83 @@ static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) return percpu_counter_sum(&tctx->inflight); } -static void io_uring_try_cancel(struct files_struct *files) +static void io_uring_drop_tctx_refs(struct task_struct *task) { - struct io_uring_task *tctx = current->io_uring; - struct io_tctx_node *node; - unsigned long index; - - xa_for_each(&tctx->xa, index, node) { - struct io_ring_ctx *ctx = node->ctx; + struct io_uring_task *tctx = task->io_uring; + unsigned int refs = tctx->cached_refs; - /* sqpoll task will cancel all its requests */ - if (!ctx->sq_data) - io_uring_try_cancel_requests(ctx, current, files); - } + tctx->cached_refs = 0; + percpu_counter_sub(&tctx->inflight, refs); + put_task_struct_many(task, refs); } -/* should only be called by SQPOLL task */ -static void io_uring_cancel_sqpoll(struct io_sq_data *sqd) +/* + * Find any io_uring ctx that this task has registered or done IO on, and cancel + * requests. @sqd should be not-null IIF it's an SQPOLL thread cancellation. 
+ */ +static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) { struct io_uring_task *tctx = current->io_uring; struct io_ring_ctx *ctx; s64 inflight; DEFINE_WAIT(wait); + WARN_ON_ONCE(sqd && sqd->thread != current); + if (!current->io_uring) return; if (tctx->io_wq) io_wq_exit_start(tctx->io_wq); - WARN_ON_ONCE(!sqd || sqd->thread != current); - + io_uring_drop_tctx_refs(current); atomic_inc(&tctx->in_idle); do { /* read completions before cancelations */ - inflight = tctx_inflight(tctx, false); + inflight = tctx_inflight(tctx, !cancel_all); if (!inflight) break; - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - io_uring_try_cancel_requests(ctx, current, NULL); - prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); - /* - * If we've seen completions, retry without waiting. This - * avoids a race where a completion comes in before we did - * prepare_to_wait(). - */ - if (inflight == tctx_inflight(tctx, false)) - schedule(); - finish_wait(&tctx->wait, &wait); - } while (1); - atomic_dec(&tctx->in_idle); -} + if (!sqd) { + struct io_tctx_node *node; + unsigned long index; -/* - * Find any io_uring fd that this task has registered or done IO on, and cancel - * requests. 
- */ -void __io_uring_cancel(struct files_struct *files) -{ - struct io_uring_task *tctx = current->io_uring; - DEFINE_WAIT(wait); - s64 inflight; - - if (tctx->io_wq) - io_wq_exit_start(tctx->io_wq); + xa_for_each(&tctx->xa, index, node) { + /* sqpoll task will cancel all its requests */ + if (node->ctx->sq_data) + continue; + io_uring_try_cancel_requests(node->ctx, current, + cancel_all); + } + } else { + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + io_uring_try_cancel_requests(ctx, current, + cancel_all); + } - /* make sure overflow events are dropped */ - atomic_inc(&tctx->in_idle); - do { - /* read completions before cancelations */ - inflight = tctx_inflight(tctx, !!files); - if (!inflight) - break; - io_uring_try_cancel(files); prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); - /* * If we've seen completions, retry without waiting. This * avoids a race where a completion comes in before we did * prepare_to_wait(). */ - if (inflight == tctx_inflight(tctx, !!files)) + if (inflight == tctx_inflight(tctx, !cancel_all)) schedule(); finish_wait(&tctx->wait, &wait); } while (1); atomic_dec(&tctx->in_idle); io_uring_clean_tctx(tctx); - if (!files) { + if (cancel_all) { /* for exec all current's requests should be gone, kill tctx */ __io_uring_free(current); } } +void __io_uring_cancel(struct files_struct *files) +{ + io_uring_cancel_generic(!files, NULL); +} + static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, size_t sz) { @@ -9312,9 +9377,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, io_cqring_overflow_flush(ctx, false); ret = -EOWNERDEAD; - if (unlikely(ctx->sq_data->thread == NULL)) { + if (unlikely(ctx->sq_data->thread == NULL)) goto out; - } if (flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sq_data->wait); if (flags & IORING_ENTER_SQ_WAIT) { @@ -9324,7 +9388,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, } submitted = to_submit; } else if (to_submit) { - ret = 
io_uring_add_task_file(ctx); + ret = io_uring_add_tctx_node(ctx); if (unlikely(ret)) goto out; mutex_lock(&ctx->uring_lock); @@ -9508,8 +9572,6 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx, rings->cq_ring_mask = p->cq_entries - 1; rings->sq_ring_entries = p->sq_entries; rings->cq_ring_entries = p->cq_entries; - ctx->sq_mask = rings->sq_ring_mask; - ctx->cq_mask = rings->cq_ring_mask; size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); if (size == SIZE_MAX) { @@ -9536,7 +9598,7 @@ static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file) if (fd < 0) return fd; - ret = io_uring_add_task_file(ctx); + ret = io_uring_add_tctx_node(ctx); if (ret) { put_unused_fd(fd); return ret; @@ -9671,7 +9733,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | - IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS; + IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | + IORING_FEAT_RSRC_TAGS; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; @@ -9911,7 +9974,7 @@ static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, } static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, - unsigned size) + unsigned size, unsigned type) { struct io_uring_rsrc_update2 up; @@ -9919,13 +9982,13 @@ static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, return -EINVAL; if (copy_from_user(&up, arg, sizeof(up))) return -EFAULT; - if (!up.nr) + if (!up.nr || up.resv) return -EINVAL; - return __io_register_rsrc_update(ctx, up.type, &up, up.nr); + return __io_register_rsrc_update(ctx, type, &up, up.nr); } static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, - unsigned int size) + unsigned int size, unsigned int type) { struct io_uring_rsrc_register rr; @@ -9936,10 +9999,10 @@ static int 
io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, memset(&rr, 0, sizeof(rr)); if (copy_from_user(&rr, arg, size)) return -EFAULT; - if (!rr.nr) + if (!rr.nr || rr.resv || rr.resv2) return -EINVAL; - switch (rr.type) { + switch (type) { case IORING_RSRC_FILE: return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), rr.nr, u64_to_user_ptr(rr.tags)); @@ -9950,6 +10013,43 @@ static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, return -EINVAL; } +static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg, + unsigned len) +{ + struct io_uring_task *tctx = current->io_uring; + cpumask_var_t new_mask; + int ret; + + if (!tctx || !tctx->io_wq) + return -EINVAL; + + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_clear(new_mask); + if (len > cpumask_size()) + len = cpumask_size(); + + if (copy_from_user(new_mask, arg, len)) { + free_cpumask_var(new_mask); + return -EFAULT; + } + + ret = io_wq_cpu_affinity(tctx->io_wq, new_mask); + free_cpumask_var(new_mask); + return ret; +} + +static int io_unregister_iowq_aff(struct io_ring_ctx *ctx) +{ + struct io_uring_task *tctx = current->io_uring; + + if (!tctx || !tctx->io_wq) + return -EINVAL; + + return io_wq_cpu_affinity(tctx->io_wq, NULL); +} + static bool io_register_op_must_quiesce(int op) { switch (op) { @@ -9961,8 +10061,12 @@ static bool io_register_op_must_quiesce(int op) case IORING_REGISTER_PROBE: case IORING_REGISTER_PERSONALITY: case IORING_UNREGISTER_PERSONALITY: - case IORING_REGISTER_RSRC: - case IORING_REGISTER_RSRC_UPDATE: + case IORING_REGISTER_FILES2: + case IORING_REGISTER_FILES_UPDATE2: + case IORING_REGISTER_BUFFERS2: + case IORING_REGISTER_BUFFERS_UPDATE: + case IORING_REGISTER_IOWQ_AFF: + case IORING_UNREGISTER_IOWQ_AFF: return false; default: return true; @@ -10088,11 +10192,31 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, case IORING_REGISTER_RESTRICTIONS: ret = io_register_restrictions(ctx, arg, 
nr_args); break; - case IORING_REGISTER_RSRC: - ret = io_register_rsrc(ctx, arg, nr_args); + case IORING_REGISTER_FILES2: + ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE); + break; + case IORING_REGISTER_FILES_UPDATE2: + ret = io_register_rsrc_update(ctx, arg, nr_args, + IORING_RSRC_FILE); break; - case IORING_REGISTER_RSRC_UPDATE: - ret = io_register_rsrc_update(ctx, arg, nr_args); + case IORING_REGISTER_BUFFERS2: + ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER); + break; + case IORING_REGISTER_BUFFERS_UPDATE: + ret = io_register_rsrc_update(ctx, arg, nr_args, + IORING_RSRC_BUFFER); + break; + case IORING_REGISTER_IOWQ_AFF: + ret = -EINVAL; + if (!arg || !nr_args) + break; + ret = io_register_iowq_aff(ctx, arg, nr_args); + break; + case IORING_UNREGISTER_IOWQ_AFF: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_unregister_iowq_aff(ctx); break; default: ret = -EINVAL; @@ -10172,6 +10296,7 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(28, __u32, splice_flags); BUILD_BUG_SQE_ELEM(32, __u64, user_data); BUILD_BUG_SQE_ELEM(40, __u16, buf_index); + BUILD_BUG_SQE_ELEM(40, __u16, buf_group); BUILD_BUG_SQE_ELEM(42, __u16, personality); BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); @@ -10184,6 +10309,7 @@ static int __init io_uring_init(void) BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int)); + req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); return 0; diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 9023717c5188..41da4f14c00b 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -640,31 +640,6 @@ out_no_page: return status; } -int -iomap_set_page_dirty(struct page *page) -{ - struct address_space *mapping = page_mapping(page); - int newly_dirty; - - if (unlikely(!mapping)) - return !TestSetPageDirty(page); - - /* - * Lock out page's memcg migration to keep PageDirty - * synchronized with per-memcg 
dirty page counters. - */ - lock_page_memcg(page); - newly_dirty = !TestSetPageDirty(page); - if (newly_dirty) - __set_page_dirty(page, mapping, 0); - unlock_page_memcg(page); - - if (newly_dirty) - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - return newly_dirty; -} -EXPORT_SYMBOL_GPL(iomap_set_page_dirty); - static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, size_t copied, struct page *page) { @@ -684,7 +659,7 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, if (unlikely(copied < len && !PageUptodate(page))) return 0; iomap_set_range_uptodate(page, offset_in_page(pos), len); - iomap_set_page_dirty(page); + __set_page_dirty_nobuffers(page); return copied; } @@ -771,10 +746,6 @@ again: * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. - * - * Not only is this an optimisation, but it is also required - * to check that the address is actually valid, when atomic - * usercopies are used, below. */ if (unlikely(iov_iter_fault_in_readable(i, bytes))) { status = -EFAULT; @@ -789,30 +760,29 @@ again: if (mapping_writably_mapped(inode->i_mapping)) flush_dcache_page(page); - copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); + copied = copy_page_from_iter_atomic(page, offset, bytes, i); - copied = iomap_write_end(inode, pos, bytes, copied, page, iomap, + status = iomap_write_end(inode, pos, bytes, copied, page, iomap, srcmap); - cond_resched(); + if (unlikely(copied != status)) + iov_iter_revert(i, copied - status); - iov_iter_advance(i, copied); - if (unlikely(copied == 0)) { + cond_resched(); + if (unlikely(status == 0)) { /* - * If we were unable to copy any data at all, we must - * fall back to a single segment length write. - * - * If we didn't fallback here, we could livelock - * because not all segments in the iov can be copied at - * once without a pagefault. 
+ * A short copy made iomap_write_end() reject the + * thing entirely. Might be memory poisoning + * halfway through, might be a race with munmap, + * might be severe memory pressure. */ - bytes = min_t(unsigned long, PAGE_SIZE - offset, - iov_iter_single_seg_count(i)); + if (copied) + bytes = copied; goto again; } - pos += copied; - written += copied; - length -= copied; + pos += status; + written += status; + length -= status; balance_dirty_pages_ratelimited(inode->i_mapping); } while (iov_iter_count(i) && length); diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index b9e6a7ec78be..eb2f8273e6f1 100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c @@ -235,8 +235,6 @@ static int do_isofs_readdir(struct inode *inode, struct file *file, break; } ctx->pos += de_len; - - continue; } if (bh) brelse(bh); diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 6f65bfa9f18d..57ab424c05ff 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -151,7 +151,8 @@ void jfs_evict_inode(struct inode *inode) if (test_cflag(COMMIT_Freewmap, inode)) jfs_free_zero_link(inode); - diFree(inode); + if (JFS_SBI(inode->i_sb)->ipimap) + diFree(inode); /* * Free the inode from the quota allocation. @@ -356,6 +357,7 @@ static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } const struct address_space_operations jfs_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = jfs_readpage, .readahead = jfs_readahead, .writepage = jfs_writepage, diff --git a/fs/jfs/jfs_dinode.h b/fs/jfs/jfs_dinode.h index d6af79e94263..6b231d0d0071 100644 --- a/fs/jfs/jfs_dinode.h +++ b/fs/jfs/jfs_dinode.h @@ -101,7 +101,6 @@ struct dinode { u8 unused[16]; /* 16: */ dxd_t _dxd; /* 16: */ union { - __le32 _rdev; /* 4: */ /* * The fast symlink area * is expected to overflow @@ -109,9 +108,15 @@ struct dinode { * needed (which will clear * INLINEEA). 
*/ - u8 _fastsymlink[128]; - } _u; - u8 _inlineea[128]; + struct { + union { + __le32 _rdev; /* 4: */ + u8 _fastsymlink[128]; + } _u; + u8 _inlineea[128]; + }; + u8 _inline_all[256]; + }; } _special; } _u2; } _file; @@ -122,6 +127,7 @@ struct dinode { #define di_rdev u._file._u2._special._u._rdev #define di_fastsymlink u._file._u2._special._u._fastsymlink #define di_inlineea u._file._u2._special._inlineea +#define di_inline_all u._file._u2._special._inline_all } u; }; diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 7aee15608619..91f4ec93dab1 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -3660,7 +3660,7 @@ void dbFinalizeBmap(struct inode *ipbmap) * (the leftmost ag with average free space in it); */ //agpref: - /* get the number of active ags and inacitve ags */ + /* get the number of active ags and inactive ags */ actags = bmp->db_maxag + 1; inactags = bmp->db_numag - actags; ag_rem = bmp->db_mapsize & (bmp->db_agsize - 1); /* ??? */ diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 937ca07b58b1..799d3837e7c2 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -103,10 +103,8 @@ int diMount(struct inode *ipimap) */ /* allocate the in-memory inode map control structure. */ imap = kmalloc(sizeof(struct inomap), GFP_KERNEL); - if (imap == NULL) { - jfs_err("diMount: kmalloc returned NULL!"); + if (imap == NULL) return -ENOMEM; - } /* read the on-disk inode map control structure. 
*/ @@ -763,7 +761,7 @@ int diWrite(tid_t tid, struct inode *ip) lv = & dilinelock->lv[dilinelock->index]; lv->offset = (dioffset + 2 * 128) >> L2INODESLOTSIZE; lv->length = 2; - memcpy(&dp->di_fastsymlink, jfs_ip->i_inline, IDATASIZE); + memcpy(&dp->di_inline_all, jfs_ip->i_inline_all, IDATASIZE); dilinelock->index++; } /* @@ -3084,7 +3082,7 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip) } if (S_ISDIR(ip->i_mode)) { - memcpy(&jfs_ip->i_dirtable, &dip->di_dirtable, 384); + memcpy(&jfs_ip->u.dir, &dip->u._dir, 384); } else if (S_ISREG(ip->i_mode) || S_ISLNK(ip->i_mode)) { memcpy(&jfs_ip->i_xtroot, &dip->di_xtroot, 288); } else diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h index a466ec41cfbb..721def69e732 100644 --- a/fs/jfs/jfs_incore.h +++ b/fs/jfs/jfs_incore.h @@ -77,11 +77,18 @@ struct jfs_inode_info { unchar _unused[16]; /* 16: */ dxd_t _dxd; /* 16: */ /* _inline may overflow into _inline_ea when needed */ - unchar _inline[128]; /* 128: inline symlink */ /* _inline_ea may overlay the last part of * file._xtroot if maxentry = XTROOTINITSLOT */ - unchar _inline_ea[128]; /* 128: inline extended attr */ + union { + struct { + /* 128: inline symlink */ + unchar _inline[128]; + /* 128: inline extended attr */ + unchar _inline_ea[128]; + }; + unchar _inline_all[256]; + }; } link; } u; #ifdef CONFIG_QUOTA @@ -96,6 +103,7 @@ struct jfs_inode_info { #define i_dtroot u.dir._dtroot #define i_inline u.link._inline #define i_inline_ea u.link._inline_ea +#define i_inline_all u.link._inline_all #define IREAD_LOCK(ip, subclass) \ down_read_nested(&JFS_IP(ip)->rdwrlock, subclass) diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 9330eff210e0..78fd136ac13b 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1324,6 +1324,7 @@ int lmLogInit(struct jfs_log * log) } else { if (!uuid_equal(&logsuper->uuid, &log->uuid)) { jfs_warn("wrong uuid on JFS log device"); + rc = -EINVAL; goto errout20; } log->size = 
le32_to_cpu(logsuper->size); diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index 053295cd7bc6..042bbe6d8ac2 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -105,7 +105,7 @@ static DEFINE_SPINLOCK(jfsTxnLock); #define TXN_LOCK() spin_lock(&jfsTxnLock) #define TXN_UNLOCK() spin_unlock(&jfsTxnLock) -#define LAZY_LOCK_INIT() spin_lock_init(&TxAnchor.LazyLock); +#define LAZY_LOCK_INIT() spin_lock_init(&TxAnchor.LazyLock) #define LAZY_LOCK(flags) spin_lock_irqsave(&TxAnchor.LazyLock, flags) #define LAZY_UNLOCK(flags) spin_unlock_irqrestore(&TxAnchor.LazyLock, flags) diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 1f0ffabbde56..9030aeaf0f88 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -939,7 +939,8 @@ static int __init init_jfs_fs(void) jfs_inode_cachep = kmem_cache_create_usercopy("jfs_ip", sizeof(struct jfs_inode_info), 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT, - offsetof(struct jfs_inode_info, i_inline), IDATASIZE, + offsetof(struct jfs_inode_info, i_inline_all), + sizeof_field(struct jfs_inode_info, i_inline_all), init_once); if (jfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c index 90d255fbdd9b..87aac4c72c37 100644 --- a/fs/kernel_read_file.c +++ b/fs/kernel_read_file.c @@ -160,7 +160,7 @@ int kernel_read_file_from_path_initns(const char *path, loff_t offset, get_fs_root(init_task.fs, &root); task_unlock(&init_task); - file = file_open_root(root.dentry, root.mnt, path, O_RDONLY, 0); + file = file_open_root(&root, path, O_RDONLY, 0); path_put(&root); if (IS_ERR(file)) return PTR_ERR(file); diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 7e0e62deab53..33166ec90a11 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -548,49 +548,6 @@ void kernfs_put(struct kernfs_node *kn) } EXPORT_SYMBOL_GPL(kernfs_put); -static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) -{ - struct kernfs_node *kn; - - if (flags & LOOKUP_RCU) - return -ECHILD; - - /* 
Always perform fresh lookup for negatives */ - if (d_really_is_negative(dentry)) - goto out_bad_unlocked; - - kn = kernfs_dentry_node(dentry); - mutex_lock(&kernfs_mutex); - - /* The kernfs node has been deactivated */ - if (!kernfs_active(kn)) - goto out_bad; - - /* The kernfs node has been moved? */ - if (kernfs_dentry_node(dentry->d_parent) != kn->parent) - goto out_bad; - - /* The kernfs node has been renamed */ - if (strcmp(dentry->d_name.name, kn->name) != 0) - goto out_bad; - - /* The kernfs node has been moved to a different namespace */ - if (kn->parent && kernfs_ns_enabled(kn->parent) && - kernfs_info(dentry->d_sb)->ns != kn->ns) - goto out_bad; - - mutex_unlock(&kernfs_mutex); - return 1; -out_bad: - mutex_unlock(&kernfs_mutex); -out_bad_unlocked: - return 0; -} - -const struct dentry_operations kernfs_dops = { - .d_revalidate = kernfs_dop_revalidate, -}; - /** * kernfs_node_from_dentry - determine kernfs_node associated with a dentry * @dentry: the dentry in question @@ -1073,6 +1030,49 @@ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, return ERR_PTR(rc); } +static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct kernfs_node *kn; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + /* Always perform fresh lookup for negatives */ + if (d_really_is_negative(dentry)) + goto out_bad_unlocked; + + kn = kernfs_dentry_node(dentry); + mutex_lock(&kernfs_mutex); + + /* The kernfs node has been deactivated */ + if (!kernfs_active(kn)) + goto out_bad; + + /* The kernfs node has been moved? 
*/ + if (kernfs_dentry_node(dentry->d_parent) != kn->parent) + goto out_bad; + + /* The kernfs node has been renamed */ + if (strcmp(dentry->d_name.name, kn->name) != 0) + goto out_bad; + + /* The kernfs node has been moved to a different namespace */ + if (kn->parent && kernfs_ns_enabled(kn->parent) && + kernfs_info(dentry->d_sb)->ns != kn->ns) + goto out_bad; + + mutex_unlock(&kernfs_mutex); + return 1; +out_bad: + mutex_unlock(&kernfs_mutex); +out_bad_unlocked: + return 0; +} + +const struct dentry_operations kernfs_dops = { + .d_revalidate = kernfs_dop_revalidate, +}; + static struct dentry *kernfs_iop_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index d73950fc3d57..26f2aa3586f9 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -17,12 +17,6 @@ #include "kernfs-internal.h" -static const struct address_space_operations kernfs_aops = { - .readpage = simple_readpage, - .write_begin = simple_write_begin, - .write_end = simple_write_end, -}; - static const struct inode_operations kernfs_iops = { .permission = kernfs_iop_permission, .setattr = kernfs_iop_setattr, @@ -203,7 +197,7 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) { kernfs_get(kn); inode->i_private = kn; - inode->i_mapping->a_ops = &kernfs_aops; + inode->i_mapping->a_ops = &ram_aops; inode->i_op = &kernfs_iops; inode->i_generation = kernfs_gen(kn); diff --git a/fs/libfs.c b/fs/libfs.c index e9b29c6ffccb..51b4de3b3447 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -512,7 +512,7 @@ int simple_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } EXPORT_SYMBOL(simple_setattr); -int simple_readpage(struct file *file, struct page *page) +static int simple_readpage(struct file *file, struct page *page) { clear_highpage(page); flush_dcache_page(page); @@ -520,7 +520,6 @@ int simple_readpage(struct file *file, struct page *page) unlock_page(page); return 0; } -EXPORT_SYMBOL(simple_readpage); 
int simple_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, @@ -568,7 +567,7 @@ EXPORT_SYMBOL(simple_write_begin); * * Use *ONLY* with simple_readpage() */ -int simple_write_end(struct file *file, struct address_space *mapping, +static int simple_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { @@ -597,7 +596,17 @@ int simple_write_end(struct file *file, struct address_space *mapping, return copied; } -EXPORT_SYMBOL(simple_write_end); + +/* + * Provides ramfs-style behavior: data in the pagecache, but no writeback. + */ +const struct address_space_operations ram_aops = { + .readpage = simple_readpage, + .write_begin = simple_write_begin, + .write_end = simple_write_end, + .set_page_dirty = __set_page_dirty_no_writeback, +}; +EXPORT_SYMBOL(ram_aops); /* * the inodes created here are not hashed. If you use iunique to generate @@ -1162,22 +1171,6 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync) } EXPORT_SYMBOL(noop_fsync); -int noop_set_page_dirty(struct page *page) -{ - /* - * Unlike __set_page_dirty_no_writeback that handles dirty page - * tracking in the page object, dax does all dirty tracking in - * the inode address_space in response to mkwrite faults. In the - * dax case we only need to worry about potentially dirty CPU - * caches, not dirty page cache pages to write back. - * - * This callback is defined to prevent fallback to - * __set_page_dirty_buffers() in set_page_dirty(). - */ - return 0; -} -EXPORT_SYMBOL_GPL(noop_set_page_dirty); - void noop_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { @@ -1208,19 +1201,10 @@ void kfree_link(void *p) } EXPORT_SYMBOL(kfree_link); -/* - * nop .set_page_dirty method so that people can use .page_mkwrite on - * anon inodes. 
- */ -static int anon_set_page_dirty(struct page *page) -{ - return 0; -}; - struct inode *alloc_anon_inode(struct super_block *s) { static const struct address_space_operations anon_aops = { - .set_page_dirty = anon_set_page_dirty, + .set_page_dirty = __set_page_dirty_no_writeback, }; struct inode *inode = new_inode_pseudo(s); diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 1a639e34847d..2de048f80eb8 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -766,6 +766,46 @@ static void __exit exit_nlm(void) module_init(init_nlm); module_exit(exit_nlm); +/** + * nlmsvc_dispatch - Process an NLM Request + * @rqstp: incoming request + * @statp: pointer to location of accept_stat field in RPC Reply buffer + * + * Return values: + * %0: Processing complete; do not send a Reply + * %1: Processing complete; send Reply in rqstp->rq_res + */ +static int nlmsvc_dispatch(struct svc_rqst *rqstp, __be32 *statp) +{ + const struct svc_procedure *procp = rqstp->rq_procinfo; + struct kvec *argv = rqstp->rq_arg.head; + struct kvec *resv = rqstp->rq_res.head; + + svcxdr_init_decode(rqstp); + if (!procp->pc_decode(rqstp, argv->iov_base)) + goto out_decode_err; + + *statp = procp->pc_func(rqstp); + if (*statp == rpc_drop_reply) + return 0; + if (*statp != rpc_success) + return 1; + + svcxdr_init_encode(rqstp); + if (!procp->pc_encode(rqstp, resv->iov_base + resv->iov_len)) + goto out_encode_err; + + return 1; + +out_decode_err: + *statp = rpc_garbage_args; + return 1; + +out_encode_err: + *statp = rpc_system_err; + return 1; +} + /* * Define NLM program and procedures */ @@ -775,6 +815,7 @@ static const struct svc_version nlmsvc_version1 = { .vs_nproc = 17, .vs_proc = nlmsvc_procedures, .vs_count = nlmsvc_version1_count, + .vs_dispatch = nlmsvc_dispatch, .vs_xdrsize = NLMSVC_XDRSIZE, }; static unsigned int nlmsvc_version3_count[24]; @@ -783,6 +824,7 @@ static const struct svc_version nlmsvc_version3 = { .vs_nproc = 24, .vs_proc = nlmsvc_procedures, .vs_count = nlmsvc_version3_count, + 
.vs_dispatch = nlmsvc_dispatch, .vs_xdrsize = NLMSVC_XDRSIZE, }; #ifdef CONFIG_LOCKD_V4 @@ -792,6 +834,7 @@ static const struct svc_version nlmsvc_version4 = { .vs_nproc = 24, .vs_proc = nlmsvc_procedures4, .vs_count = nlmsvc_version4_count, + .vs_dispatch = nlmsvc_dispatch, .vs_xdrsize = NLMSVC_XDRSIZE, }; #endif diff --git a/fs/lockd/svcxdr.h b/fs/lockd/svcxdr.h new file mode 100644 index 000000000000..c69a0bb76c94 --- /dev/null +++ b/fs/lockd/svcxdr.h @@ -0,0 +1,151 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Encode/decode NLM basic data types + * + * Basic NLMv3 XDR data types are not defined in an IETF standards + * document. X/Open has a description of these data types that + * is useful. See Chapter 10 of "Protocols for Interworking: + * XNFS, Version 3W". + * + * Basic NLMv4 XDR data types are defined in Appendix II.1.4 of + * RFC 1813: "NFS Version 3 Protocol Specification". + * + * Author: Chuck Lever <chuck.lever@oracle.com> + * + * Copyright (c) 2020, Oracle and/or its affiliates. + */ + +#ifndef _LOCKD_SVCXDR_H_ +#define _LOCKD_SVCXDR_H_ + +static inline bool +svcxdr_decode_stats(struct xdr_stream *xdr, __be32 *status) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, XDR_UNIT); + if (!p) + return false; + *status = *p; + + return true; +} + +static inline bool +svcxdr_encode_stats(struct xdr_stream *xdr, __be32 status) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, XDR_UNIT); + if (!p) + return false; + *p = status; + + return true; +} + +static inline bool +svcxdr_decode_string(struct xdr_stream *xdr, char **data, unsigned int *data_len) +{ + __be32 *p; + u32 len; + + if (xdr_stream_decode_u32(xdr, &len) < 0) + return false; + if (len > NLM_MAXSTRLEN) + return false; + p = xdr_inline_decode(xdr, len); + if (!p) + return false; + *data_len = len; + *data = (char *)p; + + return true; +} + +/* + * NLM cookies are defined by specification to be a variable-length + * XDR opaque no longer than 1024 bytes. 
However, this implementation + * limits their length to 32 bytes, and treats zero-length cookies + * specially. + */ +static inline bool +svcxdr_decode_cookie(struct xdr_stream *xdr, struct nlm_cookie *cookie) +{ + __be32 *p; + u32 len; + + if (xdr_stream_decode_u32(xdr, &len) < 0) + return false; + if (len > NLM_MAXCOOKIELEN) + return false; + if (!len) + goto out_hpux; + + p = xdr_inline_decode(xdr, len); + if (!p) + return false; + cookie->len = len; + memcpy(cookie->data, p, len); + + return true; + + /* apparently HPUX can return empty cookies */ +out_hpux: + cookie->len = 4; + memset(cookie->data, 0, 4); + return true; +} + +static inline bool +svcxdr_encode_cookie(struct xdr_stream *xdr, const struct nlm_cookie *cookie) +{ + __be32 *p; + + if (xdr_stream_encode_u32(xdr, cookie->len) < 0) + return false; + p = xdr_reserve_space(xdr, cookie->len); + if (!p) + return false; + memcpy(p, cookie->data, cookie->len); + + return true; +} + +static inline bool +svcxdr_decode_owner(struct xdr_stream *xdr, struct xdr_netobj *obj) +{ + __be32 *p; + u32 len; + + if (xdr_stream_decode_u32(xdr, &len) < 0) + return false; + if (len > XDR_MAX_NETOBJ) + return false; + p = xdr_inline_decode(xdr, len); + if (!p) + return false; + obj->len = len; + obj->data = (u8 *)p; + + return true; +} + +static inline bool +svcxdr_encode_owner(struct xdr_stream *xdr, const struct xdr_netobj *obj) +{ + unsigned int quadlen = XDR_QUADLEN(obj->len); + __be32 *p; + + if (xdr_stream_encode_u32(xdr, obj->len) < 0) + return false; + p = xdr_reserve_space(xdr, obj->len); + if (!p) + return false; + p[quadlen - 1] = 0; /* XDR pad */ + memcpy(p, obj->data, obj->len); + + return true; +} + +#endif /* _LOCKD_SVCXDR_H_ */ diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 982629f7b120..9235e60b1769 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -19,7 +19,7 @@ #include <uapi/linux/nfs2.h> -#define NLMDBG_FACILITY NLMDBG_XDR +#include "svcxdr.h" static inline loff_t @@ -42,311 +42,323 @@ 
loff_t_to_s32(loff_t offset) } /* - * XDR functions for basic NLM types + * NLM file handles are defined by specification to be a variable-length + * XDR opaque no longer than 1024 bytes. However, this implementation + * constrains their length to exactly the length of an NFSv2 file + * handle. */ -static __be32 *nlm_decode_cookie(__be32 *p, struct nlm_cookie *c) +static bool +svcxdr_decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh) { - unsigned int len; - - len = ntohl(*p++); - - if(len==0) - { - c->len=4; - memset(c->data, 0, 4); /* hockeypux brain damage */ - } - else if(len<=NLM_MAXCOOKIELEN) - { - c->len=len; - memcpy(c->data, p, len); - p+=XDR_QUADLEN(len); - } - else - { - dprintk("lockd: bad cookie size %d (only cookies under " - "%d bytes are supported.)\n", - len, NLM_MAXCOOKIELEN); - return NULL; - } - return p; -} - -static inline __be32 * -nlm_encode_cookie(__be32 *p, struct nlm_cookie *c) -{ - *p++ = htonl(c->len); - memcpy(p, c->data, c->len); - p+=XDR_QUADLEN(c->len); - return p; -} - -static __be32 * -nlm_decode_fh(__be32 *p, struct nfs_fh *f) -{ - unsigned int len; - - if ((len = ntohl(*p++)) != NFS2_FHSIZE) { - dprintk("lockd: bad fhandle size %d (should be %d)\n", - len, NFS2_FHSIZE); - return NULL; - } - f->size = NFS2_FHSIZE; - memset(f->data, 0, sizeof(f->data)); - memcpy(f->data, p, NFS2_FHSIZE); - return p + XDR_QUADLEN(NFS2_FHSIZE); -} - -/* - * Encode and decode owner handle - */ -static inline __be32 * -nlm_decode_oh(__be32 *p, struct xdr_netobj *oh) -{ - return xdr_decode_netobj(p, oh); -} - -static inline __be32 * -nlm_encode_oh(__be32 *p, struct xdr_netobj *oh) -{ - return xdr_encode_netobj(p, oh); + __be32 *p; + u32 len; + + if (xdr_stream_decode_u32(xdr, &len) < 0) + return false; + if (len != NFS2_FHSIZE) + return false; + + p = xdr_inline_decode(xdr, len); + if (!p) + return false; + fh->size = NFS2_FHSIZE; + memcpy(fh->data, p, len); + memset(fh->data + NFS2_FHSIZE, 0, sizeof(fh->data) - NFS2_FHSIZE); + + return true; } 
-static __be32 * -nlm_decode_lock(__be32 *p, struct nlm_lock *lock) +static bool +svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock) { - struct file_lock *fl = &lock->fl; - s32 start, len, end; - - if (!(p = xdr_decode_string_inplace(p, &lock->caller, - &lock->len, - NLM_MAXSTRLEN)) - || !(p = nlm_decode_fh(p, &lock->fh)) - || !(p = nlm_decode_oh(p, &lock->oh))) - return NULL; - lock->svid = ntohl(*p++); + struct file_lock *fl = &lock->fl; + s32 start, len, end; + + if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len)) + return false; + if (!svcxdr_decode_fhandle(xdr, &lock->fh)) + return false; + if (!svcxdr_decode_owner(xdr, &lock->oh)) + return false; + if (xdr_stream_decode_u32(xdr, &lock->svid) < 0) + return false; + if (xdr_stream_decode_u32(xdr, &start) < 0) + return false; + if (xdr_stream_decode_u32(xdr, &len) < 0) + return false; locks_init_lock(fl); fl->fl_flags = FL_POSIX; - fl->fl_type = F_RDLCK; /* as good as anything else */ - start = ntohl(*p++); - len = ntohl(*p++); + fl->fl_type = F_RDLCK; end = start + len - 1; - fl->fl_start = s32_to_loff_t(start); - if (len == 0 || end < 0) fl->fl_end = OFFSET_MAX; else fl->fl_end = s32_to_loff_t(end); - return p; + + return true; } -/* - * Encode result of a TEST/TEST_MSG call - */ -static __be32 * -nlm_encode_testres(__be32 *p, struct nlm_res *resp) +static bool +svcxdr_encode_holder(struct xdr_stream *xdr, const struct nlm_lock *lock) { - s32 start, len; - - if (!(p = nlm_encode_cookie(p, &resp->cookie))) - return NULL; - *p++ = resp->status; - - if (resp->status == nlm_lck_denied) { - struct file_lock *fl = &resp->lock.fl; - - *p++ = (fl->fl_type == F_RDLCK)? xdr_zero : xdr_one; - *p++ = htonl(resp->lock.svid); - - /* Encode owner handle. 
*/ - if (!(p = xdr_encode_netobj(p, &resp->lock.oh))) - return NULL; + const struct file_lock *fl = &lock->fl; + s32 start, len; + + /* exclusive */ + if (xdr_stream_encode_bool(xdr, fl->fl_type != F_RDLCK) < 0) + return false; + if (xdr_stream_encode_u32(xdr, lock->svid) < 0) + return false; + if (!svcxdr_encode_owner(xdr, &lock->oh)) + return false; + start = loff_t_to_s32(fl->fl_start); + if (fl->fl_end == OFFSET_MAX) + len = 0; + else + len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1); + if (xdr_stream_encode_u32(xdr, start) < 0) + return false; + if (xdr_stream_encode_u32(xdr, len) < 0) + return false; - start = loff_t_to_s32(fl->fl_start); - if (fl->fl_end == OFFSET_MAX) - len = 0; - else - len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1); + return true; +} - *p++ = htonl(start); - *p++ = htonl(len); +static bool +svcxdr_encode_testrply(struct xdr_stream *xdr, const struct nlm_res *resp) +{ + if (!svcxdr_encode_stats(xdr, resp->status)) + return false; + switch (resp->status) { + case nlm_lck_denied: + if (!svcxdr_encode_holder(xdr, &resp->lock)) + return false; } - return p; + return true; } /* - * First, the server side XDR functions + * Decode Call arguments */ + +int +nlmsvc_decode_void(struct svc_rqst *rqstp, __be32 *p) +{ + return 1; +} + int nlmsvc_decode_testargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nlm_args *argp = rqstp->rq_argp; - u32 exclusive; + u32 exclusive; - if (!(p = nlm_decode_cookie(p, &argp->cookie))) + if (!svcxdr_decode_cookie(xdr, &argp->cookie)) return 0; - - exclusive = ntohl(*p++); - if (!(p = nlm_decode_lock(p, &argp->lock))) + if (xdr_stream_decode_bool(xdr, &exclusive) < 0) + return 0; + if (!svcxdr_decode_lock(xdr, &argp->lock)) return 0; if (exclusive) argp->lock.fl.fl_type = F_WRLCK; - return xdr_argsize_check(rqstp, p); -} - -int -nlmsvc_encode_testres(struct svc_rqst *rqstp, __be32 *p) -{ - struct nlm_res *resp = rqstp->rq_resp; - - if (!(p = nlm_encode_testres(p, 
resp))) - return 0; - return xdr_ressize_check(rqstp, p); + return 1; } int nlmsvc_decode_lockargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nlm_args *argp = rqstp->rq_argp; - u32 exclusive; + u32 exclusive; - if (!(p = nlm_decode_cookie(p, &argp->cookie))) + if (!svcxdr_decode_cookie(xdr, &argp->cookie)) + return 0; + if (xdr_stream_decode_bool(xdr, &argp->block) < 0) return 0; - argp->block = ntohl(*p++); - exclusive = ntohl(*p++); - if (!(p = nlm_decode_lock(p, &argp->lock))) + if (xdr_stream_decode_bool(xdr, &exclusive) < 0) + return 0; + if (!svcxdr_decode_lock(xdr, &argp->lock)) return 0; if (exclusive) argp->lock.fl.fl_type = F_WRLCK; - argp->reclaim = ntohl(*p++); - argp->state = ntohl(*p++); + if (xdr_stream_decode_bool(xdr, &argp->reclaim) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &argp->state) < 0) + return 0; argp->monitor = 1; /* monitor client by default */ - return xdr_argsize_check(rqstp, p); + return 1; } int nlmsvc_decode_cancargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nlm_args *argp = rqstp->rq_argp; - u32 exclusive; + u32 exclusive; - if (!(p = nlm_decode_cookie(p, &argp->cookie))) + if (!svcxdr_decode_cookie(xdr, &argp->cookie)) + return 0; + if (xdr_stream_decode_bool(xdr, &argp->block) < 0) return 0; - argp->block = ntohl(*p++); - exclusive = ntohl(*p++); - if (!(p = nlm_decode_lock(p, &argp->lock))) + if (xdr_stream_decode_bool(xdr, &exclusive) < 0) + return 0; + if (!svcxdr_decode_lock(xdr, &argp->lock)) return 0; if (exclusive) argp->lock.fl.fl_type = F_WRLCK; - return xdr_argsize_check(rqstp, p); + + return 1; } int nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nlm_args *argp = rqstp->rq_argp; - if (!(p = nlm_decode_cookie(p, &argp->cookie)) - || !(p = nlm_decode_lock(p, &argp->lock))) + if (!svcxdr_decode_cookie(xdr, &argp->cookie)) + return 0; + if 
(!svcxdr_decode_lock(xdr, &argp->lock)) return 0; argp->lock.fl.fl_type = F_UNLCK; - return xdr_argsize_check(rqstp, p); + + return 1; } int -nlmsvc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p) +nlmsvc_decode_res(struct svc_rqst *rqstp, __be32 *p) { - struct nlm_args *argp = rqstp->rq_argp; - struct nlm_lock *lock = &argp->lock; - - memset(lock, 0, sizeof(*lock)); - locks_init_lock(&lock->fl); - lock->svid = ~(u32) 0; + struct xdr_stream *xdr = &rqstp->rq_arg_stream; + struct nlm_res *resp = rqstp->rq_argp; - if (!(p = nlm_decode_cookie(p, &argp->cookie)) - || !(p = xdr_decode_string_inplace(p, &lock->caller, - &lock->len, NLM_MAXSTRLEN)) - || !(p = nlm_decode_fh(p, &lock->fh)) - || !(p = nlm_decode_oh(p, &lock->oh))) + if (!svcxdr_decode_cookie(xdr, &resp->cookie)) + return 0; + if (!svcxdr_decode_stats(xdr, &resp->status)) return 0; - argp->fsm_mode = ntohl(*p++); - argp->fsm_access = ntohl(*p++); - return xdr_argsize_check(rqstp, p); + + return 1; } int -nlmsvc_encode_shareres(struct svc_rqst *rqstp, __be32 *p) +nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p) { - struct nlm_res *resp = rqstp->rq_resp; + struct xdr_stream *xdr = &rqstp->rq_arg_stream; + struct nlm_reboot *argp = rqstp->rq_argp; + u32 len; - if (!(p = nlm_encode_cookie(p, &resp->cookie))) + if (xdr_stream_decode_u32(xdr, &len) < 0) + return 0; + if (len > SM_MAXSTRLEN) + return 0; + p = xdr_inline_decode(xdr, len); + if (!p) + return 0; + argp->len = len; + argp->mon = (char *)p; + if (xdr_stream_decode_u32(xdr, &argp->state) < 0) + return 0; + p = xdr_inline_decode(xdr, SM_PRIV_SIZE); + if (!p) return 0; - *p++ = resp->status; - *p++ = xdr_zero; /* sequence argument */ - return xdr_ressize_check(rqstp, p); + memcpy(&argp->priv.data, p, sizeof(argp->priv.data)); + + return 1; } int -nlmsvc_encode_res(struct svc_rqst *rqstp, __be32 *p) +nlmsvc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p) { - struct nlm_res *resp = rqstp->rq_resp; + struct xdr_stream *xdr = 
&rqstp->rq_arg_stream; + struct nlm_args *argp = rqstp->rq_argp; + struct nlm_lock *lock = &argp->lock; - if (!(p = nlm_encode_cookie(p, &resp->cookie))) + memset(lock, 0, sizeof(*lock)); + locks_init_lock(&lock->fl); + lock->svid = ~(u32)0; + + if (!svcxdr_decode_cookie(xdr, &argp->cookie)) + return 0; + if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len)) + return 0; + if (!svcxdr_decode_fhandle(xdr, &lock->fh)) + return 0; + if (!svcxdr_decode_owner(xdr, &lock->oh)) + return 0; + /* XXX: Range checks are missing in the original code */ + if (xdr_stream_decode_u32(xdr, &argp->fsm_mode) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &argp->fsm_access) < 0) return 0; - *p++ = resp->status; - return xdr_ressize_check(rqstp, p); + + return 1; } int nlmsvc_decode_notify(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nlm_args *argp = rqstp->rq_argp; struct nlm_lock *lock = &argp->lock; - if (!(p = xdr_decode_string_inplace(p, &lock->caller, - &lock->len, NLM_MAXSTRLEN))) + if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len)) + return 0; + if (xdr_stream_decode_u32(xdr, &argp->state) < 0) return 0; - argp->state = ntohl(*p++); - return xdr_argsize_check(rqstp, p); + + return 1; } + +/* + * Encode Reply results + */ + int -nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p) +nlmsvc_encode_void(struct svc_rqst *rqstp, __be32 *p) { - struct nlm_reboot *argp = rqstp->rq_argp; - - if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) - return 0; - argp->state = ntohl(*p++); - memcpy(&argp->priv.data, p, sizeof(argp->priv.data)); - p += XDR_QUADLEN(SM_PRIV_SIZE); - return xdr_argsize_check(rqstp, p); + return 1; } int -nlmsvc_decode_res(struct svc_rqst *rqstp, __be32 *p) +nlmsvc_encode_testres(struct svc_rqst *rqstp, __be32 *p) { - struct nlm_res *resp = rqstp->rq_argp; + struct xdr_stream *xdr = &rqstp->rq_res_stream; + struct nlm_res *resp = rqstp->rq_resp; - if (!(p = 
nlm_decode_cookie(p, &resp->cookie))) - return 0; - resp->status = *p++; - return xdr_argsize_check(rqstp, p); + return svcxdr_encode_cookie(xdr, &resp->cookie) && + svcxdr_encode_testrply(xdr, resp); } int -nlmsvc_decode_void(struct svc_rqst *rqstp, __be32 *p) +nlmsvc_encode_res(struct svc_rqst *rqstp, __be32 *p) { - return xdr_argsize_check(rqstp, p); + struct xdr_stream *xdr = &rqstp->rq_res_stream; + struct nlm_res *resp = rqstp->rq_resp; + + return svcxdr_encode_cookie(xdr, &resp->cookie) && + svcxdr_encode_stats(xdr, resp->status); } int -nlmsvc_encode_void(struct svc_rqst *rqstp, __be32 *p) +nlmsvc_encode_shareres(struct svc_rqst *rqstp, __be32 *p) { - return xdr_ressize_check(rqstp, p); + struct xdr_stream *xdr = &rqstp->rq_res_stream; + struct nlm_res *resp = rqstp->rq_resp; + + if (!svcxdr_encode_cookie(xdr, &resp->cookie)) + return 0; + if (!svcxdr_encode_stats(xdr, resp->status)) + return 0; + /* sequence */ + if (xdr_stream_encode_u32(xdr, 0) < 0) + return 0; + + return 1; } diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index 5fa9f48a9dba..98e957e4566c 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -18,7 +18,7 @@ #include <linux/sunrpc/stats.h> #include <linux/lockd/lockd.h> -#define NLMDBG_FACILITY NLMDBG_XDR +#include "svcxdr.h" static inline loff_t s64_to_loff_t(__s64 offset) @@ -41,309 +41,322 @@ loff_t_to_s64(loff_t offset) } /* - * XDR functions for basic NLM types + * NLM file handles are defined by specification to be a variable-length + * XDR opaque no longer than 1024 bytes. However, this implementation + * limits their length to the size of an NFSv3 file handle. 
*/ -static __be32 * -nlm4_decode_cookie(__be32 *p, struct nlm_cookie *c) +static bool +svcxdr_decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh) { - unsigned int len; - - len = ntohl(*p++); - - if(len==0) - { - c->len=4; - memset(c->data, 0, 4); /* hockeypux brain damage */ - } - else if(len<=NLM_MAXCOOKIELEN) - { - c->len=len; - memcpy(c->data, p, len); - p+=XDR_QUADLEN(len); - } - else - { - dprintk("lockd: bad cookie size %d (only cookies under " - "%d bytes are supported.)\n", - len, NLM_MAXCOOKIELEN); - return NULL; - } - return p; -} - -static __be32 * -nlm4_encode_cookie(__be32 *p, struct nlm_cookie *c) -{ - *p++ = htonl(c->len); - memcpy(p, c->data, c->len); - p+=XDR_QUADLEN(c->len); - return p; -} - -static __be32 * -nlm4_decode_fh(__be32 *p, struct nfs_fh *f) -{ - memset(f->data, 0, sizeof(f->data)); - f->size = ntohl(*p++); - if (f->size > NFS_MAXFHSIZE) { - dprintk("lockd: bad fhandle size %d (should be <=%d)\n", - f->size, NFS_MAXFHSIZE); - return NULL; - } - memcpy(f->data, p, f->size); - return p + XDR_QUADLEN(f->size); -} - -/* - * Encode and decode owner handle - */ -static __be32 * -nlm4_decode_oh(__be32 *p, struct xdr_netobj *oh) -{ - return xdr_decode_netobj(p, oh); + __be32 *p; + u32 len; + + if (xdr_stream_decode_u32(xdr, &len) < 0) + return false; + if (len > NFS_MAXFHSIZE) + return false; + + p = xdr_inline_decode(xdr, len); + if (!p) + return false; + fh->size = len; + memcpy(fh->data, p, len); + memset(fh->data + len, 0, sizeof(fh->data) - len); + + return true; } -static __be32 * -nlm4_decode_lock(__be32 *p, struct nlm_lock *lock) +static bool +svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock) { - struct file_lock *fl = &lock->fl; - __u64 len, start; - __s64 end; - - if (!(p = xdr_decode_string_inplace(p, &lock->caller, - &lock->len, NLM_MAXSTRLEN)) - || !(p = nlm4_decode_fh(p, &lock->fh)) - || !(p = nlm4_decode_oh(p, &lock->oh))) - return NULL; - lock->svid = ntohl(*p++); + struct file_lock *fl = &lock->fl; + u64 
len, start; + s64 end; + + if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len)) + return false; + if (!svcxdr_decode_fhandle(xdr, &lock->fh)) + return false; + if (!svcxdr_decode_owner(xdr, &lock->oh)) + return false; + if (xdr_stream_decode_u32(xdr, &lock->svid) < 0) + return false; + if (xdr_stream_decode_u64(xdr, &start) < 0) + return false; + if (xdr_stream_decode_u64(xdr, &len) < 0) + return false; locks_init_lock(fl); fl->fl_flags = FL_POSIX; - fl->fl_type = F_RDLCK; /* as good as anything else */ - p = xdr_decode_hyper(p, &start); - p = xdr_decode_hyper(p, &len); + fl->fl_type = F_RDLCK; end = start + len - 1; - fl->fl_start = s64_to_loff_t(start); - if (len == 0 || end < 0) fl->fl_end = OFFSET_MAX; else fl->fl_end = s64_to_loff_t(end); - return p; + + return true; } -/* - * Encode result of a TEST/TEST_MSG call - */ -static __be32 * -nlm4_encode_testres(__be32 *p, struct nlm_res *resp) +static bool +svcxdr_encode_holder(struct xdr_stream *xdr, const struct nlm_lock *lock) +{ + const struct file_lock *fl = &lock->fl; + s64 start, len; + + /* exclusive */ + if (xdr_stream_encode_bool(xdr, fl->fl_type != F_RDLCK) < 0) + return false; + if (xdr_stream_encode_u32(xdr, lock->svid) < 0) + return false; + if (!svcxdr_encode_owner(xdr, &lock->oh)) + return false; + start = loff_t_to_s64(fl->fl_start); + if (fl->fl_end == OFFSET_MAX) + len = 0; + else + len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1); + if (xdr_stream_encode_u64(xdr, start) < 0) + return false; + if (xdr_stream_encode_u64(xdr, len) < 0) + return false; + + return true; +} + +static bool +svcxdr_encode_testrply(struct xdr_stream *xdr, const struct nlm_res *resp) { - s64 start, len; - - dprintk("xdr: before encode_testres (p %p resp %p)\n", p, resp); - if (!(p = nlm4_encode_cookie(p, &resp->cookie))) - return NULL; - *p++ = resp->status; - - if (resp->status == nlm_lck_denied) { - struct file_lock *fl = &resp->lock.fl; - - *p++ = (fl->fl_type == F_RDLCK)? 
xdr_zero : xdr_one; - *p++ = htonl(resp->lock.svid); - - /* Encode owner handle. */ - if (!(p = xdr_encode_netobj(p, &resp->lock.oh))) - return NULL; - - start = loff_t_to_s64(fl->fl_start); - if (fl->fl_end == OFFSET_MAX) - len = 0; - else - len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1); - - p = xdr_encode_hyper(p, start); - p = xdr_encode_hyper(p, len); - dprintk("xdr: encode_testres (status %u pid %d type %d start %Ld end %Ld)\n", - resp->status, (int)resp->lock.svid, fl->fl_type, - (long long)fl->fl_start, (long long)fl->fl_end); + if (!svcxdr_encode_stats(xdr, resp->status)) + return false; + switch (resp->status) { + case nlm_lck_denied: + if (!svcxdr_encode_holder(xdr, &resp->lock)) + return false; } - dprintk("xdr: after encode_testres (p %p resp %p)\n", p, resp); - return p; + return true; } /* - * First, the server side XDR functions + * Decode Call arguments */ + +int +nlm4svc_decode_void(struct svc_rqst *rqstp, __be32 *p) +{ + return 1; +} + int nlm4svc_decode_testargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nlm_args *argp = rqstp->rq_argp; - u32 exclusive; + u32 exclusive; - if (!(p = nlm4_decode_cookie(p, &argp->cookie))) + if (!svcxdr_decode_cookie(xdr, &argp->cookie)) return 0; - - exclusive = ntohl(*p++); - if (!(p = nlm4_decode_lock(p, &argp->lock))) + if (xdr_stream_decode_bool(xdr, &exclusive) < 0) + return 0; + if (!svcxdr_decode_lock(xdr, &argp->lock)) return 0; if (exclusive) argp->lock.fl.fl_type = F_WRLCK; - return xdr_argsize_check(rqstp, p); -} - -int -nlm4svc_encode_testres(struct svc_rqst *rqstp, __be32 *p) -{ - struct nlm_res *resp = rqstp->rq_resp; - - if (!(p = nlm4_encode_testres(p, resp))) - return 0; - return xdr_ressize_check(rqstp, p); + return 1; } int nlm4svc_decode_lockargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nlm_args *argp = rqstp->rq_argp; - u32 exclusive; + u32 exclusive; - if (!(p = nlm4_decode_cookie(p, 
&argp->cookie))) + if (!svcxdr_decode_cookie(xdr, &argp->cookie)) + return 0; + if (xdr_stream_decode_bool(xdr, &argp->block) < 0) return 0; - argp->block = ntohl(*p++); - exclusive = ntohl(*p++); - if (!(p = nlm4_decode_lock(p, &argp->lock))) + if (xdr_stream_decode_bool(xdr, &exclusive) < 0) + return 0; + if (!svcxdr_decode_lock(xdr, &argp->lock)) return 0; if (exclusive) argp->lock.fl.fl_type = F_WRLCK; - argp->reclaim = ntohl(*p++); - argp->state = ntohl(*p++); + if (xdr_stream_decode_bool(xdr, &argp->reclaim) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &argp->state) < 0) + return 0; argp->monitor = 1; /* monitor client by default */ - return xdr_argsize_check(rqstp, p); + return 1; } int nlm4svc_decode_cancargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nlm_args *argp = rqstp->rq_argp; - u32 exclusive; + u32 exclusive; - if (!(p = nlm4_decode_cookie(p, &argp->cookie))) + if (!svcxdr_decode_cookie(xdr, &argp->cookie)) + return 0; + if (xdr_stream_decode_bool(xdr, &argp->block) < 0) return 0; - argp->block = ntohl(*p++); - exclusive = ntohl(*p++); - if (!(p = nlm4_decode_lock(p, &argp->lock))) + if (xdr_stream_decode_bool(xdr, &exclusive) < 0) + return 0; + if (!svcxdr_decode_lock(xdr, &argp->lock)) return 0; if (exclusive) argp->lock.fl.fl_type = F_WRLCK; - return xdr_argsize_check(rqstp, p); + return 1; } int nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nlm_args *argp = rqstp->rq_argp; - if (!(p = nlm4_decode_cookie(p, &argp->cookie)) - || !(p = nlm4_decode_lock(p, &argp->lock))) + if (!svcxdr_decode_cookie(xdr, &argp->cookie)) + return 0; + if (!svcxdr_decode_lock(xdr, &argp->lock)) return 0; argp->lock.fl.fl_type = F_UNLCK; - return xdr_argsize_check(rqstp, p); + + return 1; } int -nlm4svc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p) +nlm4svc_decode_res(struct svc_rqst *rqstp, __be32 *p) { - struct nlm_args *argp = 
rqstp->rq_argp; - struct nlm_lock *lock = &argp->lock; - - memset(lock, 0, sizeof(*lock)); - locks_init_lock(&lock->fl); - lock->svid = ~(u32) 0; + struct xdr_stream *xdr = &rqstp->rq_arg_stream; + struct nlm_res *resp = rqstp->rq_argp; - if (!(p = nlm4_decode_cookie(p, &argp->cookie)) - || !(p = xdr_decode_string_inplace(p, &lock->caller, - &lock->len, NLM_MAXSTRLEN)) - || !(p = nlm4_decode_fh(p, &lock->fh)) - || !(p = nlm4_decode_oh(p, &lock->oh))) + if (!svcxdr_decode_cookie(xdr, &resp->cookie)) + return 0; + if (!svcxdr_decode_stats(xdr, &resp->status)) return 0; - argp->fsm_mode = ntohl(*p++); - argp->fsm_access = ntohl(*p++); - return xdr_argsize_check(rqstp, p); + + return 1; } int -nlm4svc_encode_shareres(struct svc_rqst *rqstp, __be32 *p) +nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p) { - struct nlm_res *resp = rqstp->rq_resp; + struct xdr_stream *xdr = &rqstp->rq_arg_stream; + struct nlm_reboot *argp = rqstp->rq_argp; + u32 len; - if (!(p = nlm4_encode_cookie(p, &resp->cookie))) + if (xdr_stream_decode_u32(xdr, &len) < 0) return 0; - *p++ = resp->status; - *p++ = xdr_zero; /* sequence argument */ - return xdr_ressize_check(rqstp, p); + if (len > SM_MAXSTRLEN) + return 0; + p = xdr_inline_decode(xdr, len); + if (!p) + return 0; + argp->len = len; + argp->mon = (char *)p; + if (xdr_stream_decode_u32(xdr, &argp->state) < 0) + return 0; + p = xdr_inline_decode(xdr, SM_PRIV_SIZE); + if (!p) + return 0; + memcpy(&argp->priv.data, p, sizeof(argp->priv.data)); + + return 1; } int -nlm4svc_encode_res(struct svc_rqst *rqstp, __be32 *p) +nlm4svc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p) { - struct nlm_res *resp = rqstp->rq_resp; + struct xdr_stream *xdr = &rqstp->rq_arg_stream; + struct nlm_args *argp = rqstp->rq_argp; + struct nlm_lock *lock = &argp->lock; + + memset(lock, 0, sizeof(*lock)); + locks_init_lock(&lock->fl); + lock->svid = ~(u32)0; - if (!(p = nlm4_encode_cookie(p, &resp->cookie))) + if (!svcxdr_decode_cookie(xdr, &argp->cookie)) 
return 0; - *p++ = resp->status; - return xdr_ressize_check(rqstp, p); + if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len)) + return 0; + if (!svcxdr_decode_fhandle(xdr, &lock->fh)) + return 0; + if (!svcxdr_decode_owner(xdr, &lock->oh)) + return 0; + /* XXX: Range checks are missing in the original code */ + if (xdr_stream_decode_u32(xdr, &argp->fsm_mode) < 0) + return 0; + if (xdr_stream_decode_u32(xdr, &argp->fsm_access) < 0) + return 0; + + return 1; } int nlm4svc_decode_notify(struct svc_rqst *rqstp, __be32 *p) { + struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nlm_args *argp = rqstp->rq_argp; struct nlm_lock *lock = &argp->lock; - if (!(p = xdr_decode_string_inplace(p, &lock->caller, - &lock->len, NLM_MAXSTRLEN))) + if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len)) + return 0; + if (xdr_stream_decode_u32(xdr, &argp->state) < 0) return 0; - argp->state = ntohl(*p++); - return xdr_argsize_check(rqstp, p); + + return 1; } + +/* + * Encode Reply results + */ + int -nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p) +nlm4svc_encode_void(struct svc_rqst *rqstp, __be32 *p) { - struct nlm_reboot *argp = rqstp->rq_argp; - - if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) - return 0; - argp->state = ntohl(*p++); - memcpy(&argp->priv.data, p, sizeof(argp->priv.data)); - p += XDR_QUADLEN(SM_PRIV_SIZE); - return xdr_argsize_check(rqstp, p); + return 1; } int -nlm4svc_decode_res(struct svc_rqst *rqstp, __be32 *p) +nlm4svc_encode_testres(struct svc_rqst *rqstp, __be32 *p) { - struct nlm_res *resp = rqstp->rq_argp; + struct xdr_stream *xdr = &rqstp->rq_res_stream; + struct nlm_res *resp = rqstp->rq_resp; - if (!(p = nlm4_decode_cookie(p, &resp->cookie))) - return 0; - resp->status = *p++; - return xdr_argsize_check(rqstp, p); + return svcxdr_encode_cookie(xdr, &resp->cookie) && + svcxdr_encode_testrply(xdr, resp); } int -nlm4svc_decode_void(struct svc_rqst *rqstp, __be32 *p) +nlm4svc_encode_res(struct svc_rqst 
*rqstp, __be32 *p) { - return xdr_argsize_check(rqstp, p); + struct xdr_stream *xdr = &rqstp->rq_res_stream; + struct nlm_res *resp = rqstp->rq_resp; + + return svcxdr_encode_cookie(xdr, &resp->cookie) && + svcxdr_encode_stats(xdr, resp->status); } int -nlm4svc_encode_void(struct svc_rqst *rqstp, __be32 *p) +nlm4svc_encode_shareres(struct svc_rqst *rqstp, __be32 *p) { - return xdr_ressize_check(rqstp, p); + struct xdr_stream *xdr = &rqstp->rq_res_stream; + struct nlm_res *resp = rqstp->rq_resp; + + if (!svcxdr_encode_cookie(xdr, &resp->cookie)) + return 0; + if (!svcxdr_encode_stats(xdr, resp->status)) + return 0; + /* sequence */ + if (xdr_stream_encode_u32(xdr, 0) < 0) + return 0; + + return 1; } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index a532a99bbe81..a71f1cf894b9 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -442,6 +442,7 @@ static sector_t minix_bmap(struct address_space *mapping, sector_t block) } static const struct address_space_operations minix_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = minix_readpage, .writepage = minix_writepage, .write_begin = minix_write_begin, diff --git a/fs/namei.c b/fs/namei.c index 79b0ff9b151e..bf6d8a738c59 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -554,7 +554,7 @@ struct nameidata { struct qstr last; struct path root; struct inode *inode; /* path.dentry.d_inode */ - unsigned int flags; + unsigned int flags, state; unsigned seq, m_seq, r_seq; int last_type; unsigned depth; @@ -573,10 +573,15 @@ struct nameidata { umode_t dir_mode; } __randomize_layout; -static void set_nameidata(struct nameidata *p, int dfd, struct filename *name) +#define ND_ROOT_PRESET 1 +#define ND_ROOT_GRABBED 2 +#define ND_JUMPED 4 + +static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name) { struct nameidata *old = current->nameidata; p->stack = p->internal; + p->depth = 0; p->dfd = dfd; p->name = name; p->path.mnt = NULL; @@ -586,6 +591,17 @@ static void set_nameidata(struct nameidata 
*p, int dfd, struct filename *name) current->nameidata = p; } +static inline void set_nameidata(struct nameidata *p, int dfd, struct filename *name, + const struct path *root) +{ + __set_nameidata(p, dfd, name); + p->state = 0; + if (unlikely(root)) { + p->state = ND_ROOT_PRESET; + p->root = *root; + } +} + static void restore_nameidata(void) { struct nameidata *now = current->nameidata, *old = now->saved; @@ -645,9 +661,9 @@ static void terminate_walk(struct nameidata *nd) path_put(&nd->path); for (i = 0; i < nd->depth; i++) path_put(&nd->stack[i].link); - if (nd->flags & LOOKUP_ROOT_GRABBED) { + if (nd->state & ND_ROOT_GRABBED) { path_put(&nd->root); - nd->flags &= ~LOOKUP_ROOT_GRABBED; + nd->state &= ~ND_ROOT_GRABBED; } } else { nd->flags &= ~LOOKUP_RCU; @@ -710,9 +726,9 @@ static bool legitimize_root(struct nameidata *nd) if (!nd->root.mnt && (nd->flags & LOOKUP_IS_SCOPED)) return false; /* Nothing to do if nd->root is zero or is managed by the VFS user. */ - if (!nd->root.mnt || (nd->flags & LOOKUP_ROOT)) + if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET)) return true; - nd->flags |= LOOKUP_ROOT_GRABBED; + nd->state |= ND_ROOT_GRABBED; return legitimize_path(nd, &nd->root, nd->root_seq); } @@ -849,8 +865,9 @@ static int complete_walk(struct nameidata *nd) * We don't want to zero nd->root for scoped-lookups or * externally-managed nd->root. 
*/ - if (!(nd->flags & (LOOKUP_ROOT | LOOKUP_IS_SCOPED))) - nd->root.mnt = NULL; + if (!(nd->state & ND_ROOT_PRESET)) + if (!(nd->flags & LOOKUP_IS_SCOPED)) + nd->root.mnt = NULL; nd->flags &= ~LOOKUP_CACHED; if (!try_to_unlazy(nd)) return -ECHILD; @@ -877,7 +894,7 @@ static int complete_walk(struct nameidata *nd) return -EXDEV; } - if (likely(!(nd->flags & LOOKUP_JUMPED))) + if (likely(!(nd->state & ND_JUMPED))) return 0; if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE))) @@ -915,7 +932,7 @@ static int set_root(struct nameidata *nd) } while (read_seqcount_retry(&fs->seq, seq)); } else { get_fs_root(fs, &nd->root); - nd->flags |= LOOKUP_ROOT_GRABBED; + nd->state |= ND_ROOT_GRABBED; } return 0; } @@ -948,7 +965,7 @@ static int nd_jump_root(struct nameidata *nd) path_get(&nd->path); nd->inode = nd->path.dentry->d_inode; } - nd->flags |= LOOKUP_JUMPED; + nd->state |= ND_JUMPED; return 0; } @@ -976,7 +993,7 @@ int nd_jump_link(struct path *path) path_put(&nd->path); nd->path = *path; nd->inode = nd->path.dentry->d_inode; - nd->flags |= LOOKUP_JUMPED; + nd->state |= ND_JUMPED; return 0; err: @@ -1423,7 +1440,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, if (mounted) { path->mnt = &mounted->mnt; dentry = path->dentry = mounted->mnt.mnt_root; - nd->flags |= LOOKUP_JUMPED; + nd->state |= ND_JUMPED; *seqp = read_seqcount_begin(&dentry->d_seq); *inode = dentry->d_inode; /* @@ -1468,7 +1485,7 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, if (unlikely(nd->flags & LOOKUP_NO_XDEV)) ret = -EXDEV; else - nd->flags |= LOOKUP_JUMPED; + nd->state |= ND_JUMPED; } if (unlikely(ret)) { dput(path->dentry); @@ -2219,7 +2236,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) case 2: if (name[1] == '.') { type = LAST_DOTDOT; - nd->flags |= LOOKUP_JUMPED; + nd->state |= ND_JUMPED; } break; case 1: @@ -2227,7 +2244,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) } if 
(likely(type == LAST_NORM)) { struct dentry *parent = nd->path.dentry; - nd->flags &= ~LOOKUP_JUMPED; + nd->state &= ~ND_JUMPED; if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { struct qstr this = { { .hash_len = hash_len }, .name = name }; err = parent->d_op->d_hash(parent, &this); @@ -2301,14 +2318,14 @@ static const char *path_init(struct nameidata *nd, unsigned flags) if (flags & LOOKUP_RCU) rcu_read_lock(); - nd->flags = flags | LOOKUP_JUMPED; - nd->depth = 0; + nd->flags = flags; + nd->state |= ND_JUMPED; nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount); nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount); smp_rmb(); - if (flags & LOOKUP_ROOT) { + if (nd->state & ND_ROOT_PRESET) { struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; if (*s && unlikely(!d_can_lookup(root))) @@ -2383,7 +2400,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags) nd->root_seq = nd->seq; } else { path_get(&nd->root); - nd->flags |= LOOKUP_ROOT_GRABBED; + nd->state |= ND_ROOT_GRABBED; } } return s; @@ -2422,7 +2439,7 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path ; if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) { err = handle_lookup_down(nd); - nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), please... + nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please... 
} if (!err) err = complete_walk(nd); @@ -2446,11 +2463,7 @@ int filename_lookup(int dfd, struct filename *name, unsigned flags, struct nameidata nd; if (IS_ERR(name)) return PTR_ERR(name); - if (unlikely(root)) { - nd.root = *root; - flags |= LOOKUP_ROOT; - } - set_nameidata(&nd, dfd, name); + set_nameidata(&nd, dfd, name, root); retval = path_lookupat(&nd, flags | LOOKUP_RCU, path); if (unlikely(retval == -ECHILD)) retval = path_lookupat(&nd, flags, path); @@ -2491,7 +2504,7 @@ static struct filename *filename_parentat(int dfd, struct filename *name, if (IS_ERR(name)) return name; - set_nameidata(&nd, dfd, name); + set_nameidata(&nd, dfd, name, NULL); retval = path_parentat(&nd, flags | LOOKUP_RCU, parent); if (unlikely(retval == -ECHILD)) retval = path_parentat(&nd, flags, parent); @@ -3517,7 +3530,7 @@ struct file *do_filp_open(int dfd, struct filename *pathname, int flags = op->lookup_flags; struct file *filp; - set_nameidata(&nd, dfd, pathname); + set_nameidata(&nd, dfd, pathname, NULL); filp = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(filp == ERR_PTR(-ECHILD))) filp = path_openat(&nd, op, flags); @@ -3527,25 +3540,22 @@ struct file *do_filp_open(int dfd, struct filename *pathname, return filp; } -struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, +struct file *do_file_open_root(const struct path *root, const char *name, const struct open_flags *op) { struct nameidata nd; struct file *file; struct filename *filename; - int flags = op->lookup_flags | LOOKUP_ROOT; - - nd.root.mnt = mnt; - nd.root.dentry = dentry; + int flags = op->lookup_flags; - if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN) + if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN) return ERR_PTR(-ELOOP); filename = getname_kernel(name); if (IS_ERR(filename)) return ERR_CAST(filename); - set_nameidata(&nd, -1, filename); + set_nameidata(&nd, -1, filename, root); file = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(file == 
ERR_PTR(-ECHILD))) file = path_openat(&nd, op, flags); diff --git a/fs/namespace.c b/fs/namespace.c index c3f1a78ba369..ab4174a3c802 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3464,9 +3464,10 @@ out_type: return ret; } -#define FSMOUNT_VALID_FLAGS \ - (MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | \ - MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME) +#define FSMOUNT_VALID_FLAGS \ + (MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | \ + MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME | \ + MOUNT_ATTR_NOSYMFOLLOW) #define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP) @@ -3487,6 +3488,8 @@ static unsigned int attr_flags_to_mnt_flags(u64 attr_flags) mnt_flags |= MNT_NOEXEC; if (attr_flags & MOUNT_ATTR_NODIRATIME) mnt_flags |= MNT_NODIRATIME; + if (attr_flags & MOUNT_ATTR_NOSYMFOLLOW) + mnt_flags |= MNT_NOSYMFOLLOW; return mnt_flags; } diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c index 725614625ed4..0b6cd3b8734c 100644 --- a/fs/netfs/read_helper.c +++ b/fs/netfs/read_helper.c @@ -1011,12 +1011,42 @@ out: } EXPORT_SYMBOL(netfs_readpage); -static void netfs_clear_thp(struct page *page) +/** + * netfs_skip_page_read - prep a page for writing without reading first + * @page: page being prepared + * @pos: starting position for the write + * @len: length of write + * + * In some cases, write_begin doesn't need to read at all: + * - full page write + * - write that lies in a page that is completely beyond EOF + * - write that covers the the page from start to EOF or beyond it + * + * If any of these criteria are met, then zero out the unwritten parts + * of the page and return true. Otherwise, return false. 
+ */ +static bool netfs_skip_page_read(struct page *page, loff_t pos, size_t len) { - unsigned int i; + struct inode *inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + size_t offset = offset_in_thp(page, pos); + + /* Full page write */ + if (offset == 0 && len >= thp_size(page)) + return true; + + /* pos beyond last page in the file */ + if (pos - offset >= i_size) + goto zero_out; + + /* Write that covers from the start of the page to EOF or beyond */ + if (offset == 0 && (pos + len) >= i_size) + goto zero_out; - for (i = 0; i < thp_nr_pages(page); i++) - clear_highpage(page + i); + return false; +zero_out: + zero_user_segments(page, 0, offset, offset + len, thp_size(page)); + return true; } /** @@ -1024,7 +1054,7 @@ static void netfs_clear_thp(struct page *page) * @file: The file to read from * @mapping: The mapping to read from * @pos: File position at which the write will begin - * @len: The length of the write in this page + * @len: The length of the write (may extend beyond the end of the page chosen) * @flags: AOP_* flags * @_page: Where to put the resultant page * @_fsdata: Place for the netfs to store a cookie @@ -1061,8 +1091,6 @@ int netfs_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = file_inode(file); unsigned int debug_index = 0; pgoff_t index = pos >> PAGE_SHIFT; - int pos_in_page = pos & ~PAGE_MASK; - loff_t size; int ret; DEFINE_READAHEAD(ractl, file, NULL, mapping, index); @@ -1090,13 +1118,8 @@ retry: * within the cache granule containing the EOF, in which case we need * to preload the granule. 
*/ - size = i_size_read(inode); if (!ops->is_cache_enabled(inode) && - ((pos_in_page == 0 && len == thp_size(page)) || - (pos >= size) || - (pos_in_page == 0 && (pos + len) >= size))) { - netfs_clear_thp(page); - SetPageUptodate(page); + netfs_skip_page_read(page, pos, len)) { netfs_stat(&netfs_n_rh_write_zskip); goto have_page_no_wait; } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index cfeaadf56bf0..330f65727c45 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -406,7 +406,7 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) if (cl_init->hostname == NULL) { WARN_ON(1); - return NULL; + return ERR_PTR(-EINVAL); } /* see if the client already exists */ diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index e6ec6f09ac6e..11118398f495 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -75,6 +75,13 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags); } +static void nfs_mark_return_delegation(struct nfs_server *server, + struct nfs_delegation *delegation) +{ + set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); +} + static bool nfs4_is_valid_delegation(const struct nfs_delegation *delegation, fmode_t flags) @@ -293,6 +300,7 @@ nfs_start_delegation_return_locked(struct nfs_inode *nfsi) goto out; spin_lock(&delegation->lock); if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { + clear_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags); /* Refcount matched in nfs_end_delegation_return() */ ret = nfs_get_delegation(delegation); } @@ -314,16 +322,17 @@ nfs_start_delegation_return(struct nfs_inode *nfsi) return delegation; } -static void -nfs_abort_delegation_return(struct nfs_delegation *delegation, - struct nfs_client *clp) +static void nfs_abort_delegation_return(struct nfs_delegation *delegation, + struct nfs_client *clp, int err) { 
spin_lock(&delegation->lock); clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags); - set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + if (err == -EAGAIN) { + set_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags); + set_bit(NFS4CLNT_DELEGRETURN_DELAYED, &clp->cl_state); + } spin_unlock(&delegation->lock); - set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); } static struct nfs_delegation * @@ -521,11 +530,18 @@ out: static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation *delegation, int issync) { struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + unsigned int mode = O_WRONLY | O_RDWR; int err = 0; if (delegation == NULL) return 0; - do { + + if (!issync) + mode |= O_NONBLOCK; + /* Recall of any remaining application leases */ + err = break_lease(inode, mode); + + while (err == 0) { if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) break; err = nfs_delegation_claim_opens(inode, &delegation->stateid, @@ -536,10 +552,10 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation * Guard against state recovery */ err = nfs4_wait_clnt_recover(clp); - } while (err == 0); + } if (err) { - nfs_abort_delegation_return(delegation, clp); + nfs_abort_delegation_return(delegation, clp, err); goto out; } @@ -568,6 +584,7 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation) if (ret) clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags); if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags) || + test_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags) || test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) ret = false; @@ -647,6 +664,38 @@ out: return err; } +static bool nfs_server_clear_delayed_delegations(struct nfs_server *server) +{ + struct nfs_delegation *d; + bool ret = false; + + list_for_each_entry_rcu (d, &server->delegations, super_list) { + if (!test_bit(NFS_DELEGATION_RETURN_DELAYED, &d->flags)) + continue; + nfs_mark_return_delegation(server, d); + 
clear_bit(NFS_DELEGATION_RETURN_DELAYED, &d->flags); + ret = true; + } + return ret; +} + +static bool nfs_client_clear_delayed_delegations(struct nfs_client *clp) +{ + struct nfs_server *server; + bool ret = false; + + if (!test_and_clear_bit(NFS4CLNT_DELEGRETURN_DELAYED, &clp->cl_state)) + goto out; + rcu_read_lock(); + list_for_each_entry_rcu (server, &clp->cl_superblocks, client_link) { + if (nfs_server_clear_delayed_delegations(server)) + ret = true; + } + rcu_read_unlock(); +out: + return ret; +} + /** * nfs_client_return_marked_delegations - return previously marked delegations * @clp: nfs_client to process @@ -659,8 +708,14 @@ out: */ int nfs_client_return_marked_delegations(struct nfs_client *clp) { - return nfs_client_for_each_server(clp, - nfs_server_return_marked_delegations, NULL); + int err = nfs_client_for_each_server( + clp, nfs_server_return_marked_delegations, NULL); + if (err) + return err; + /* If a return was delayed, sleep to prevent hard looping */ + if (nfs_client_clear_delayed_delegations(clp)) + ssleep(1); + return 0; } /** @@ -698,13 +753,14 @@ int nfs4_inode_return_delegation(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; - int err = 0; - nfs_wb_all(inode); delegation = nfs_start_delegation_return(nfsi); + /* Synchronous recall of any application leases */ + break_lease(inode, O_WRONLY | O_RDWR); + nfs_wb_all(inode); if (delegation != NULL) - err = nfs_end_delegation_return(inode, delegation, 1); - return err; + return nfs_end_delegation_return(inode, delegation, 1); + return 0; } /** @@ -775,13 +831,6 @@ static void nfs_mark_return_if_closed_delegation(struct nfs_server *server, set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); } -static void nfs_mark_return_delegation(struct nfs_server *server, - struct nfs_delegation *delegation) -{ - set_bit(NFS_DELEGATION_RETURN, &delegation->flags); - set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); -} - static bool 
nfs_server_mark_return_all_delegations(struct nfs_server *server) { struct nfs_delegation *delegation; @@ -1010,6 +1059,9 @@ int nfs_async_inode_return_delegation(struct inode *inode, nfs_mark_return_delegation(server, delegation); rcu_read_unlock(); + /* If there are any application leases or delegations, recall them */ + break_lease(inode, O_WRONLY | O_RDWR | O_NONBLOCK); + nfs_delegation_run_state_manager(clp); return 0; out_enoent: diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index c19b4fd20781..1c378992b7c0 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -36,6 +36,7 @@ enum { NFS_DELEGATION_REVOKED, NFS_DELEGATION_TEST_EXPIRED, NFS_DELEGATION_INODE_FREEING, + NFS_DELEGATION_RETURN_DELAYED, }; int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred, diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 2d30a4da49fa..2e894fec036b 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -700,8 +700,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) { struct nfs_direct_req *dreq = hdr->dreq; struct nfs_commit_info cinfo; - bool request_commit = false; struct nfs_page *req = nfs_list_entry(hdr->pages.next); + int flags = NFS_ODIRECT_DONE; nfs_init_cinfo_from_dreq(&cinfo, dreq); @@ -713,15 +713,9 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) nfs_direct_count_bytes(dreq, hdr); if (hdr->good_bytes != 0 && nfs_write_need_commit(hdr)) { - switch (dreq->flags) { - case 0: + if (!dreq->flags) dreq->flags = NFS_ODIRECT_DO_COMMIT; - request_commit = true; - break; - case NFS_ODIRECT_RESCHED_WRITES: - case NFS_ODIRECT_DO_COMMIT: - request_commit = true; - } + flags = dreq->flags; } spin_unlock(&dreq->lock); @@ -729,12 +723,15 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) req = nfs_list_entry(hdr->pages.next); nfs_list_remove_request(req); - if (request_commit) { + if (flags == NFS_ODIRECT_DO_COMMIT) { kref_get(&req->wb_kref); memcpy(&req->wb_verf, 
&hdr->verf.verifier, sizeof(req->wb_verf)); nfs_mark_request_commit(req, hdr->lseg, &cinfo, hdr->ds_commit_idx); + } else if (flags == NFS_ODIRECT_RESCHED_WRITES) { + kref_get(&req->wb_kref); + nfs_mark_request_commit(req, NULL, &cinfo, 0); } nfs_unlock_and_release_request(req); } diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index c4c021c6ebbd..d743629e05e1 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -385,12 +385,15 @@ static void nfs_readpage_from_fscache_complete(struct page *page, "NFS: readpage_from_fscache_complete (0x%p/0x%p/%d)\n", page, context, error); - /* if the read completes with an error, we just unlock the page and let - * the VM reissue the readpage */ - if (!error) { + /* + * If the read completes with an error, mark the page with PG_checked, + * unlock the page, and let the VM reissue the readpage. + */ + if (!error) SetPageUptodate(page); - unlock_page(page); - } + else + SetPageChecked(page); + unlock_page(page); } /* @@ -405,6 +408,11 @@ int __nfs_readpage_from_fscache(struct nfs_open_context *ctx, "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n", nfs_i_fscache(inode), page, page->index, page->flags, inode); + if (PageChecked(page)) { + ClearPageChecked(page); + return 1; + } + ret = fscache_read_or_alloc_page(nfs_i_fscache(inode), page, nfs_readpage_from_fscache_complete, diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index aaeeb4659bff..59355c106ece 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -67,7 +67,7 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i int nfs_get_root(struct super_block *s, struct fs_context *fc) { struct nfs_fs_context *ctx = nfs_fc2context(fc); - struct nfs_server *server = NFS_SB(s); + struct nfs_server *server = NFS_SB(s), *clone_server; struct nfs_fsinfo fsinfo; struct dentry *root; struct inode *inode; @@ -127,7 +127,7 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc) } spin_unlock(&root->d_lock); fc->root = root; - if 
(NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL) + if (server->caps & NFS_CAP_SECURITY_LABEL) kflags |= SECURITY_LSM_NATIVE_LABELS; if (ctx->clone_data.sb) { if (d_inode(fc->root)->i_fop != &nfs_dir_operations) { @@ -137,15 +137,19 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc) /* clone lsm security options from the parent to the new sb */ error = security_sb_clone_mnt_opts(ctx->clone_data.sb, s, kflags, &kflags_out); + if (error) + goto error_splat_root; + clone_server = NFS_SB(ctx->clone_data.sb); + server->has_sec_mnt_opts = clone_server->has_sec_mnt_opts; } else { error = security_sb_set_mnt_opts(s, fc->security, kflags, &kflags_out); } if (error) goto error_splat_root; - if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL && + if (server->caps & NFS_CAP_SECURITY_LABEL && !(kflags_out & SECURITY_LSM_NATIVE_LABELS)) - NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL; + server->caps &= ~NFS_CAP_SECURITY_LABEL; nfs_setsecurity(inode, fsinfo.fattr, fsinfo.fattr->label); error = 0; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 529c4099f482..853213b3a209 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1101,6 +1101,7 @@ EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context); void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) { filp->private_data = get_nfs_open_context(ctx); + set_bit(NFS_CONTEXT_FILE_OPEN, &ctx->flags); if (list_empty(&ctx->list)) nfs_inode_attach_open_context(ctx); } @@ -1120,6 +1121,8 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, const struct continue; if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode) continue; + if (!test_bit(NFS_CONTEXT_FILE_OPEN, &pos->flags)) + continue; ctx = get_nfs_open_context(pos); if (ctx) break; @@ -1135,6 +1138,7 @@ void nfs_file_clear_open_context(struct file *filp) if (ctx) { struct inode *inode = d_inode(ctx->dentry); + clear_bit(NFS_CONTEXT_FILE_OPEN, &ctx->flags); /* * We fatal error on write before. Try to writeback * every page again. 
@@ -2055,35 +2059,33 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | NFS_INO_INVALID_OTHER; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); + attr_changed = true; dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); } else if (!have_delegation) nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER; inode_set_iversion_raw(inode, fattr->change_attr); - attr_changed = true; } } else { nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_CHANGE; - cache_revalidated = false; + if (!have_delegation || + (nfsi->cache_validity & NFS_INO_INVALID_CHANGE) != 0) + cache_revalidated = false; } - if (fattr->valid & NFS_ATTR_FATTR_MTIME) { + if (fattr->valid & NFS_ATTR_FATTR_MTIME) inode->i_mtime = fattr->mtime; - } else if (fattr_supported & NFS_ATTR_FATTR_MTIME) { + else if (fattr_supported & NFS_ATTR_FATTR_MTIME) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_MTIME; - cache_revalidated = false; - } - if (fattr->valid & NFS_ATTR_FATTR_CTIME) { + if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode->i_ctime = fattr->ctime; - } else if (fattr_supported & NFS_ATTR_FATTR_CTIME) { + else if (fattr_supported & NFS_ATTR_FATTR_CTIME) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_CTIME; - cache_revalidated = false; - } /* Check if our cached file size is stale */ if (fattr->valid & NFS_ATTR_FATTR_SIZE) { @@ -2096,7 +2098,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) i_size_write(inode, new_isize); if (!have_writers) invalid |= NFS_INO_INVALID_DATA; - attr_changed = true; } dprintk("NFS: isize change on server for file %s/%ld " "(%Ld to %Ld)\n", @@ -2111,19 +2112,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) fattr->du.nfs3.used = 0; fattr->valid |= NFS_ATTR_FATTR_SPACE_USED; } - } else { + } else nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_SIZE; - cache_revalidated = false; - } if 
(fattr->valid & NFS_ATTR_FATTR_ATIME) inode->i_atime = fattr->atime; - else if (fattr_supported & NFS_ATTR_FATTR_ATIME) { + else if (fattr_supported & NFS_ATTR_FATTR_ATIME) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATIME; - cache_revalidated = false; - } if (fattr->valid & NFS_ATTR_FATTR_MODE) { if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { @@ -2132,71 +2129,55 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_mode = newmode; invalid |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; - attr_changed = true; } - } else if (fattr_supported & NFS_ATTR_FATTR_MODE) { + } else if (fattr_supported & NFS_ATTR_FATTR_MODE) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_MODE; - cache_revalidated = false; - } if (fattr->valid & NFS_ATTR_FATTR_OWNER) { if (!uid_eq(inode->i_uid, fattr->uid)) { invalid |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; inode->i_uid = fattr->uid; - attr_changed = true; } - } else if (fattr_supported & NFS_ATTR_FATTR_OWNER) { + } else if (fattr_supported & NFS_ATTR_FATTR_OWNER) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_OTHER; - cache_revalidated = false; - } if (fattr->valid & NFS_ATTR_FATTR_GROUP) { if (!gid_eq(inode->i_gid, fattr->gid)) { invalid |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; inode->i_gid = fattr->gid; - attr_changed = true; } - } else if (fattr_supported & NFS_ATTR_FATTR_GROUP) { + } else if (fattr_supported & NFS_ATTR_FATTR_GROUP) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_OTHER; - cache_revalidated = false; - } if (fattr->valid & NFS_ATTR_FATTR_NLINK) { if (inode->i_nlink != fattr->nlink) { if (S_ISDIR(inode->i_mode)) invalid |= NFS_INO_INVALID_DATA; set_nlink(inode, fattr->nlink); - attr_changed = true; } - } else if (fattr_supported & NFS_ATTR_FATTR_NLINK) { + } else if (fattr_supported & NFS_ATTR_FATTR_NLINK) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_NLINK; - cache_revalidated = 
false; - } if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); - } else if (fattr_supported & NFS_ATTR_FATTR_SPACE_USED) { + } else if (fattr_supported & NFS_ATTR_FATTR_SPACE_USED) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_BLOCKS; - cache_revalidated = false; - } - if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) { + if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; - } else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED) { + else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_BLOCKS; - cache_revalidated = false; - } /* Update attrtimeo value if we're out of the unstable period */ if (attr_changed) { diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 5c4e23abc345..2299446b3b89 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -385,7 +385,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, break; case NFS3_CREATE_UNCHECKED: - goto out; + goto out_release_acls; } nfs_fattr_init(data->res.dir_attr); nfs_fattr_init(data->res.fattr); @@ -751,7 +751,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, break; default: status = -EINVAL; - goto out; + goto out_release_acls; } d_alias = nfs3_do_create(dir, dentry, data); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 065cb04222a1..ba78df4b13d9 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -45,6 +45,7 @@ enum nfs4_client_state { NFS4CLNT_RECALL_RUNNING, NFS4CLNT_RECALL_ANY_LAYOUT_READ, NFS4CLNT_RECALL_ANY_LAYOUT_RW, + NFS4CLNT_DELEGRETURN_DELAYED, }; #define NFS4_RENEW_TIMEOUT 0x01 @@ -205,6 +206,7 @@ struct nfs4_exception { struct inode *inode; nfs4_stateid *stateid; long timeout; + unsigned char task_is_privileged : 1; unsigned char delay : 1, recovering : 1, retry : 1; @@ -321,7 +323,8 @@ extern int 
update_open_stateid(struct nfs4_state *state, const nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode); - +extern int nfs4_proc_setlease(struct file *file, long arg, + struct file_lock **lease, void **priv); extern int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo); extern void nfs4_update_changeattr(struct inode *dir, diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 889a9f4c0310..28431acd1230 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -197,8 +197,11 @@ void nfs40_shutdown_client(struct nfs_client *clp) struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) { - int err; + char buf[INET6_ADDRSTRLEN + 1]; + const char *ip_addr = cl_init->ip_addr; struct nfs_client *clp = nfs_alloc_client(cl_init); + int err; + if (IS_ERR(clp)) return clp; @@ -222,6 +225,44 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) init_waitqueue_head(&clp->cl_lock_waitq); #endif INIT_LIST_HEAD(&clp->pending_cb_stateids); + + if (cl_init->minorversion != 0) + __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags); + __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); + __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); + + /* + * Set up the connection to the server before we add add to the + * global list. 
+ */ + err = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_GSS_KRB5I); + if (err == -EINVAL) + err = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); + if (err < 0) + goto error; + + /* If no clientaddr= option was specified, find a usable cb address */ + if (ip_addr == NULL) { + struct sockaddr_storage cb_addr; + struct sockaddr *sap = (struct sockaddr *)&cb_addr; + + err = rpc_localaddr(clp->cl_rpcclient, sap, sizeof(cb_addr)); + if (err < 0) + goto error; + err = rpc_ntop(sap, buf, sizeof(buf)); + if (err < 0) + goto error; + ip_addr = (const char *)buf; + } + strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); + + err = nfs_idmap_new(clp); + if (err < 0) { + dprintk("%s: failed to create idmapper. Error = %d\n", + __func__, err); + goto error; + } + __set_bit(NFS_CS_IDMAP, &clp->cl_res_state); return clp; error: @@ -372,8 +413,6 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp) struct nfs_client *nfs4_init_client(struct nfs_client *clp, const struct nfs_client_initdata *cl_init) { - char buf[INET6_ADDRSTRLEN + 1]; - const char *ip_addr = cl_init->ip_addr; struct nfs_client *old; int error; @@ -381,43 +420,6 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, /* the client is initialised already */ return clp; - /* Check NFS protocol revision and initialize RPC op vector */ - clp->rpc_ops = &nfs_v4_clientops; - - if (clp->cl_minorversion != 0) - __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags); - __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); - __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); - - error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_GSS_KRB5I); - if (error == -EINVAL) - error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); - if (error < 0) - goto error; - - /* If no clientaddr= option was specified, find a usable cb address */ - if (ip_addr == NULL) { - struct sockaddr_storage cb_addr; - struct sockaddr *sap = (struct sockaddr *)&cb_addr; - - error = rpc_localaddr(clp->cl_rpcclient, sap, 
sizeof(cb_addr)); - if (error < 0) - goto error; - error = rpc_ntop(sap, buf, sizeof(buf)); - if (error < 0) - goto error; - ip_addr = (const char *)buf; - } - strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); - - error = nfs_idmap_new(clp); - if (error < 0) { - dprintk("%s: failed to create idmapper. Error = %d\n", - __func__, error); - goto error; - } - __set_bit(NFS_CS_IDMAP, &clp->cl_res_state); - error = nfs4_init_client_minor_version(clp); if (error < 0) goto error; @@ -435,8 +437,8 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, */ nfs_mark_client_ready(clp, -EPERM); } - nfs_put_client(clp); clear_bit(NFS_CS_TSM_POSSIBLE, &clp->cl_flags); + nfs_put_client(clp); return old; error: diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index a1e5c6b85ded..c820de58a661 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -435,6 +435,12 @@ void nfs42_ssc_unregister_ops(void) } #endif /* CONFIG_NFS_V4_2 */ +static int nfs4_setlease(struct file *file, long arg, struct file_lock **lease, + void **priv) +{ + return nfs4_proc_setlease(file, arg, lease, priv); +} + const struct file_operations nfs4_file_operations = { .read_iter = nfs_file_read, .write_iter = nfs_file_write, @@ -448,7 +454,7 @@ const struct file_operations nfs4_file_operations = { .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, - .setlease = simple_nosetlease, + .setlease = nfs4_setlease, #ifdef CONFIG_NFS_V4_2 .copy_file_range = nfs4_copy_file_range, .llseek = nfs4_file_llseek, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0cd965882232..e1214bb6b7ee 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -589,6 +589,8 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_ goto out_retry; } if (exception->recovering) { + if (exception->task_is_privileged) + return -EDEADLOCK; ret = nfs4_wait_clnt_recover(clp); if (test_bit(NFS_MIG_FAILED, &server->mig_status)) return -EIO; 
@@ -614,6 +616,8 @@ nfs4_async_handle_exception(struct rpc_task *task, struct nfs_server *server, goto out_retry; } if (exception->recovering) { + if (exception->task_is_privileged) + return -EDEADLOCK; rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); @@ -1151,7 +1155,11 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res) { - return nfs4_do_call_sync(clnt, server, msg, args, res, 0); + unsigned short task_flags = 0; + + if (server->nfs_client->cl_minorversion) + task_flags = RPC_TASK_MOVEABLE; + return nfs4_do_call_sync(clnt, server, msg, args, res, task_flags); } @@ -1201,12 +1209,12 @@ nfs4_update_changeattr_locked(struct inode *inode, u64 change_attr = inode_peek_iversion_raw(inode); cache_validity |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME; + if (S_ISDIR(inode->i_mode)) + cache_validity |= NFS_INO_INVALID_DATA; switch (NFS_SERVER(inode)->change_attr_type) { case NFS4_CHANGE_TYPE_IS_UNDEFINED: - break; - case NFS4_CHANGE_TYPE_IS_TIME_METADATA: - if ((s64)(change_attr - cinfo->after) > 0) + if (cinfo->after == change_attr) goto out; break; default: @@ -1214,24 +1222,21 @@ nfs4_update_changeattr_locked(struct inode *inode, goto out; } - if (cinfo->atomic && cinfo->before == change_attr) { - nfsi->attrtimeo_timestamp = jiffies; - } else { - if (S_ISDIR(inode->i_mode)) { - cache_validity |= NFS_INO_INVALID_DATA; + inode_set_iversion_raw(inode, cinfo->after); + if (!cinfo->atomic || cinfo->before != change_attr) { + if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); - } else { - if (!NFS_PROTO(inode)->have_delegation(inode, - FMODE_READ)) - cache_validity |= NFS_INO_REVAL_PAGECACHE; - } - if (cinfo->before != change_attr) - cache_validity |= NFS_INO_INVALID_ACCESS | - NFS_INO_INVALID_ACL | - NFS_INO_INVALID_XATTR; + if (!NFS_PROTO(inode)->have_delegation(inode, 
FMODE_READ)) + cache_validity |= + NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | + NFS_INO_INVALID_SIZE | NFS_INO_INVALID_OTHER | + NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_NLINK | + NFS_INO_INVALID_MODE | NFS_INO_INVALID_XATTR | + NFS_INO_REVAL_PAGECACHE; + nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); } - inode_set_iversion_raw(inode, cinfo->after); + nfsi->attrtimeo_timestamp = jiffies; nfsi->read_cache_jiffies = timestamp; nfsi->attr_gencount = nfs_inc_attr_generation_counter(); nfsi->cache_validity &= ~NFS_INO_INVALID_CHANGE; @@ -2565,6 +2570,9 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, }; int status; + if (server->nfs_client->cl_minorversion) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + kref_get(&data->kref); data->rpc_done = false; data->rpc_status = 0; @@ -3745,6 +3753,9 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) }; int status = -ENOMEM; + if (server->nfs_client->cl_minorversion) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_CLEANUP, &task_setup_data.rpc_client, &msg); @@ -3878,6 +3889,10 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f server->caps |= NFS_CAP_HARDLINKS; if (res.has_symlinks != 0) server->caps |= NFS_CAP_SYMLINKS; +#ifdef CONFIG_NFS_V4_SECURITY_LABEL + if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL) + server->caps |= NFS_CAP_SECURITY_LABEL; +#endif if (!(res.attr_bitmask[0] & FATTR4_WORD0_FILEID)) server->fattr_valid &= ~NFS_ATTR_FATTR_FILEID; if (!(res.attr_bitmask[1] & FATTR4_WORD1_MODE)) @@ -3898,10 +3913,6 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f server->fattr_valid &= ~NFS_ATTR_FATTR_CTIME; if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)) server->fattr_valid &= ~NFS_ATTR_FATTR_MTIME; -#ifdef CONFIG_NFS_V4_SECURITY_LABEL - if (!(res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL)) - server->fattr_valid &= ~NFS_ATTR_FATTR_V4_SECURITY_LABEL; 
-#endif memcpy(server->attr_bitmask_nl, res.attr_bitmask, sizeof(server->attr_bitmask)); server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL; @@ -4184,6 +4195,9 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, }; unsigned short task_flags = 0; + if (nfs4_has_session(server->nfs_client)) + task_flags = RPC_TASK_MOVEABLE; + /* Is this is an attribute revalidation, subject to softreval? */ if (inode && (server->flags & NFS_MOUNT_SOFTREVAL)) task_flags |= RPC_TASK_TIMEOUT; @@ -4303,6 +4317,9 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, }; unsigned short task_flags = 0; + if (server->nfs_client->cl_minorversion) + task_flags = RPC_TASK_MOVEABLE; + /* Is this is an attribute revalidation, subject to softreval? */ if (nfs_lookup_is_soft_revalidate(dentry)) task_flags |= RPC_TASK_TIMEOUT; @@ -5968,6 +5985,14 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen do { err = __nfs4_proc_set_acl(inode, buf, buflen); trace_nfs4_set_acl(inode, err); + if (err == -NFS4ERR_BADOWNER || err == -NFS4ERR_BADNAME) { + /* + * no need to retry since the kernel + * isn't involved in encoding the ACEs. 
+ */ + err = -EINVAL; + break; + } err = nfs4_handle_exception(NFS_SERVER(inode), err, &exception); } while (exception.retry); @@ -6409,6 +6434,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) struct nfs4_exception exception = { .inode = data->inode, .stateid = &data->stateid, + .task_is_privileged = data->args.seq_args.sa_privileged, }; if (!nfs4_sequence_done(task, &data->res.seq_res)) @@ -6525,14 +6551,13 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, .rpc_client = server->client, .rpc_message = &msg, .callback_ops = &nfs4_delegreturn_ops, - .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT, + .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT | RPC_TASK_MOVEABLE, }; int status = 0; data = kzalloc(sizeof(*data), GFP_NOFS); if (data == NULL) return -ENOMEM; - nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, 0); nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_CLEANUP, @@ -6563,6 +6588,12 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, } } + if (!data->inode) + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, + 1); + else + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, + 0); task_setup_data.callback_data = data; msg.rpc_argp = &data->args; msg.rpc_resp = &data->res; @@ -6838,6 +6869,11 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; + struct nfs_client *client = + NFS_SERVER(lsp->ls_state->inode)->nfs_client; + + if (client->cl_minorversion) + task_setup_data.flags |= RPC_TASK_MOVEABLE; nfs4_state_protect(NFS_SERVER(lsp->ls_state->inode)->nfs_client, NFS_SP4_MACH_CRED_CLEANUP, &task_setup_data.rpc_client, &msg); @@ -7112,6 +7148,10 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; int ret; + struct nfs_client *client = NFS_SERVER(state->inode)->nfs_client; + + if 
(client->cl_minorversion) + task_setup_data.flags |= RPC_TASK_MOVEABLE; dprintk("%s: begin!\n", __func__); data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file), @@ -7420,6 +7460,43 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) return nfs4_retry_setlk(state, cmd, request); } +static int nfs4_delete_lease(struct file *file, void **priv) +{ + return generic_setlease(file, F_UNLCK, NULL, priv); +} + +static int nfs4_add_lease(struct file *file, long arg, struct file_lock **lease, + void **priv) +{ + struct inode *inode = file_inode(file); + fmode_t type = arg == F_RDLCK ? FMODE_READ : FMODE_WRITE; + int ret; + + /* No delegation, no lease */ + if (!nfs4_have_delegation(inode, type)) + return -EAGAIN; + ret = generic_setlease(file, arg, lease, priv); + if (ret || nfs4_have_delegation(inode, type)) + return ret; + /* We raced with a delegation return */ + nfs4_delete_lease(file, priv); + return -EAGAIN; +} + +int nfs4_proc_setlease(struct file *file, long arg, struct file_lock **lease, + void **priv) +{ + switch (arg) { + case F_RDLCK: + case F_WRLCK: + return nfs4_add_lease(file, arg, lease, priv); + case F_UNLCK: + return nfs4_delete_lease(file, priv); + default: + return -EINVAL; + } +} + int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid) { struct nfs_server *server = NFS_SERVER(state->inode); @@ -9168,7 +9245,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, .rpc_client = clp->cl_rpcclient, .rpc_message = &msg, .callback_ops = &nfs41_sequence_ops, - .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT, + .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT | RPC_TASK_MOVEABLE, }; struct rpc_task *ret; @@ -9367,7 +9444,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, { struct inode *inode = lgp->args.inode; struct nfs_server *server = NFS_SERVER(inode); - struct pnfs_layout_hdr *lo; + struct pnfs_layout_hdr *lo = lgp->lo; int nfs4err = task->tk_status; int 
err, status = 0; LIST_HEAD(head); @@ -9419,7 +9496,6 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, case -NFS4ERR_BAD_STATEID: exception->timeout = 0; spin_lock(&inode->i_lock); - lo = NFS_I(inode)->layout; /* If the open stateid was bad, then recover it. */ if (!lo || test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) || !nfs4_stateid_match_other(&lgp->args.stateid, &lo->plh_stateid)) { @@ -9491,7 +9567,8 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout) .rpc_message = &msg, .callback_ops = &nfs4_layoutget_call_ops, .callback_data = lgp, - .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | + RPC_TASK_MOVEABLE, }; struct pnfs_layout_segment *lseg = NULL; struct nfs4_exception exception = { @@ -9502,9 +9579,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout) dprintk("--> %s\n", __func__); - /* nfs4_layoutget_release calls pnfs_put_layout_hdr */ - pnfs_get_layout_hdr(NFS_I(inode)->layout); - nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0, 0); task = rpc_run_task(&task_setup_data); @@ -9632,6 +9706,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync) .rpc_message = &msg, .callback_ops = &nfs4_layoutreturn_call_ops, .callback_data = lrp, + .flags = RPC_TASK_MOVEABLE, }; int status = 0; @@ -9640,15 +9715,20 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync) &task_setup_data.rpc_client, &msg); dprintk("--> %s\n", __func__); + lrp->inode = nfs_igrab_and_active(lrp->args.inode); if (!sync) { - lrp->inode = nfs_igrab_and_active(lrp->args.inode); if (!lrp->inode) { nfs4_layoutreturn_release(lrp); return -EAGAIN; } task_setup_data.flags |= RPC_TASK_ASYNC; } - nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1, 0); + if (!lrp->inode) + nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1, + 1); + else + nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1, + 0); task = rpc_run_task(&task_setup_data); if 
(IS_ERR(task)) return PTR_ERR(task); @@ -9781,6 +9861,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) .rpc_message = &msg, .callback_ops = &nfs4_layoutcommit_ops, .callback_data = data, + .flags = RPC_TASK_MOVEABLE, }; struct rpc_task *task; int status = 0; @@ -10108,7 +10189,7 @@ static int nfs41_free_stateid(struct nfs_server *server, .rpc_client = server->client, .rpc_message = &msg, .callback_ops = &nfs41_free_stateid_ops, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_MOVEABLE, }; struct nfs_free_stateid_data *data; struct rpc_task *task; diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 2ef75caad6da..7a2567aa2b86 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -625,7 +625,7 @@ TRACE_EVENT(nfs4_state_mgr, TP_fast_assign( __entry->state = clp->cl_state; - __assign_str(hostname, clp->cl_hostname) + __assign_str(hostname, clp->cl_hostname); ), TP_printk( @@ -1637,7 +1637,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event, __entry->fileid = 0; __entry->dev = 0; } - __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown") + __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown"); ), TP_printk( @@ -1694,7 +1694,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event, __entry->fileid = 0; __entry->dev = 0; } - __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown") + __assign_str(dstaddr, clp ? 
clp->cl_hostname : "unknown"); __entry->stateid_seq = be32_to_cpu(stateid->seqid); __entry->stateid_hash = diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index eb1ef3462e84..8a224871be74 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -280,8 +280,6 @@ TRACE_DEFINE_ENUM(LOOKUP_OPEN); TRACE_DEFINE_ENUM(LOOKUP_CREATE); TRACE_DEFINE_ENUM(LOOKUP_EXCL); TRACE_DEFINE_ENUM(LOOKUP_RENAME_TARGET); -TRACE_DEFINE_ENUM(LOOKUP_JUMPED); -TRACE_DEFINE_ENUM(LOOKUP_ROOT); TRACE_DEFINE_ENUM(LOOKUP_EMPTY); TRACE_DEFINE_ENUM(LOOKUP_DOWN); @@ -297,8 +295,6 @@ TRACE_DEFINE_ENUM(LOOKUP_DOWN); { LOOKUP_CREATE, "CREATE" }, \ { LOOKUP_EXCL, "EXCL" }, \ { LOOKUP_RENAME_TARGET, "RENAME_TARGET" }, \ - { LOOKUP_JUMPED, "JUMPED" }, \ - { LOOKUP_ROOT, "ROOT" }, \ { LOOKUP_EMPTY, "EMPTY" }, \ { LOOKUP_DOWN, "DOWN" }) @@ -430,10 +426,6 @@ TRACE_DEFINE_ENUM(O_CLOEXEC); { O_NOATIME, "O_NOATIME" }, \ { O_CLOEXEC, "O_CLOEXEC" }) -TRACE_DEFINE_ENUM(FMODE_READ); -TRACE_DEFINE_ENUM(FMODE_WRITE); -TRACE_DEFINE_ENUM(FMODE_EXEC); - #define show_fmode_flags(mode) \ __print_flags(mode, "|", \ { ((__force unsigned long)FMODE_READ), "READ" }, \ @@ -1431,8 +1423,8 @@ DECLARE_EVENT_CLASS(nfs_xdr_event, __entry->version = task->tk_client->cl_vers; __entry->error = error; __assign_str(program, - task->tk_client->cl_program->name) - __assign_str(procedure, task->tk_msg.rpc_proc->p_name) + task->tk_client->cl_program->name); + __assign_str(procedure, task->tk_msg.rpc_proc->p_name); ), TP_printk( diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index cf9cc62ec48e..cc232d1f16f2 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -954,6 +954,7 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) { struct nfs_pgio_header *hdr; int ret; + unsigned short task_flags = 0; hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); if (!hdr) { @@ -962,14 +963,17 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) } nfs_pgheader_init(desc, hdr, nfs_pgio_header_free); ret = 
nfs_generic_pgio(desc, hdr); - if (ret == 0) + if (ret == 0) { + if (NFS_SERVER(hdr->inode)->nfs_client->cl_minorversion) + task_flags = RPC_TASK_MOVEABLE; ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode), hdr, hdr->cred, NFS_PROTO(hdr->inode), desc->pg_rpc_callops, desc->pg_ioflags, - RPC_TASK_CRED_NOREF); + RPC_TASK_CRED_NOREF | task_flags); + } return ret; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 2c01ee805306..ef14ea0b6ab8 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -966,10 +966,8 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, const struct cred *cred, bool update_barrier) { - u32 oldseq, newseq, new_barrier = 0; - - oldseq = be32_to_cpu(lo->plh_stateid.seqid); - newseq = be32_to_cpu(new->seqid); + u32 oldseq = be32_to_cpu(lo->plh_stateid.seqid); + u32 newseq = be32_to_cpu(new->seqid); if (!pnfs_layout_is_valid(lo)) { pnfs_set_layout_cred(lo, cred); @@ -979,19 +977,21 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); return; } - if (pnfs_seqid_is_newer(newseq, oldseq)) { + + if (pnfs_seqid_is_newer(newseq, oldseq)) nfs4_stateid_copy(&lo->plh_stateid, new); - /* - * Because of wraparound, we want to keep the barrier - * "close" to the current seqids. - */ - new_barrier = newseq - atomic_read(&lo->plh_outstanding); - } - if (update_barrier) - new_barrier = be32_to_cpu(new->seqid); - else if (new_barrier == 0) + + if (update_barrier) { + pnfs_barrier_update(lo, newseq); return; - pnfs_barrier_update(lo, new_barrier); + } + /* + * Because of wraparound, we want to keep the barrier + * "close" to the current seqids. We really only want to + * get here from a layoutget call. 
+ */ + if (atomic_read(&lo->plh_outstanding) == 1) + pnfs_barrier_update(lo, be32_to_cpu(lo->plh_stateid.seqid)); } static bool @@ -1128,8 +1128,7 @@ void pnfs_layoutget_free(struct nfs4_layoutget *lgp) size_t max_pages = lgp->args.layout.pglen / PAGE_SIZE; nfs4_free_pages(lgp->args.layout.pages, max_pages); - if (lgp->args.inode) - pnfs_put_layout_hdr(NFS_I(lgp->args.inode)->layout); + pnfs_put_layout_hdr(lgp->lo); put_nfs_open_context(lgp->args.ctx); kfree(lgp); } @@ -2014,7 +2013,7 @@ lookup_again: * If the layout segment list is empty, but there are outstanding * layoutget calls, then they might be subject to a layoutrecall. */ - if (list_empty(&lo->plh_segs) && + if ((list_empty(&lo->plh_segs) || !pnfs_layout_is_valid(lo)) && atomic_read(&lo->plh_outstanding) != 0) { spin_unlock(&ino->i_lock); lseg = ERR_PTR(wait_var_event_killable(&lo->plh_outstanding, @@ -2124,6 +2123,9 @@ lookup_again: goto out_put_layout_hdr; } + lgp->lo = lo; + pnfs_get_layout_hdr(lo); + lseg = nfs4_proc_layoutget(lgp, &timeout); trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); @@ -2255,6 +2257,7 @@ static void _lgopen_prepare_attached(struct nfs4_opendata *data, pnfs_put_layout_hdr(lo); return; } + lgp->lo = lo; data->lgp = lgp; data->o_arg.lg_args = &lgp->args; data->o_res.lg_res = &lgp->res; @@ -2263,6 +2266,7 @@ static void _lgopen_prepare_attached(struct nfs4_opendata *data, static void _lgopen_prepare_floating(struct nfs4_opendata *data, struct nfs_open_context *ctx) { + struct inode *ino = data->dentry->d_inode; struct pnfs_layout_range rng = { .iomode = (data->o_arg.fmode & FMODE_WRITE) ? 
IOMODE_RW: IOMODE_READ, @@ -2271,7 +2275,7 @@ static void _lgopen_prepare_floating(struct nfs4_opendata *data, }; struct nfs4_layoutget *lgp; - lgp = pnfs_alloc_init_layoutget_args(NULL, ctx, &current_stateid, + lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid, &rng, GFP_KERNEL); if (!lgp) return; @@ -2291,6 +2295,8 @@ void pnfs_lgopen_prepare(struct nfs4_opendata *data, /* Could check on max_ops, but currently hardcoded high enough */ if (!nfs_server_capable(data->dir->d_inode, NFS_CAP_LGOPEN)) return; + if (data->lgp) + return; if (data->state) _lgopen_prepare_attached(data, ctx); else @@ -2330,13 +2336,13 @@ void pnfs_parse_lgopen(struct inode *ino, struct nfs4_layoutget *lgp, } return; } - if (!lgp->args.inode) { + if (!lgp->lo) { lo = _pnfs_grab_empty_layout(ino, ctx); if (!lo) return; - lgp->args.inode = ino; + lgp->lo = lo; } else - lo = NFS_I(lgp->args.inode)->layout; + lo = lgp->lo; lseg = pnfs_layout_process(lgp); if (!IS_ERR(lseg)) { @@ -2349,11 +2355,9 @@ void pnfs_parse_lgopen(struct inode *ino, struct nfs4_layoutget *lgp, void nfs4_lgopen_release(struct nfs4_layoutget *lgp) { if (lgp != NULL) { - struct inode *inode = lgp->args.inode; - if (inode) { - struct pnfs_layout_hdr *lo = NFS_I(inode)->layout; - pnfs_clear_first_layoutget(lo); - nfs_layoutget_end(lo); + if (lgp->lo) { + pnfs_clear_first_layoutget(lgp->lo); + nfs_layoutget_end(lgp->lo); } pnfs_layoutget_free(lgp); } @@ -2362,7 +2366,7 @@ void nfs4_lgopen_release(struct nfs4_layoutget *lgp) struct pnfs_layout_segment * pnfs_layout_process(struct nfs4_layoutget *lgp) { - struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; + struct pnfs_layout_hdr *lo = lgp->lo; struct nfs4_layoutget_res *res = &lgp->res; struct pnfs_layout_segment *lseg; struct inode *ino = lo->plh_inode; @@ -2390,11 +2394,13 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) goto out_forget; } + if (!pnfs_layout_is_valid(lo) && !pnfs_is_first_layoutget(lo)) + goto out_forget; + if 
(nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { /* existing state ID, make sure the sequence number matches. */ if (pnfs_layout_stateid_blocked(lo, &res->stateid)) { - if (!pnfs_layout_is_valid(lo) && - pnfs_is_first_layoutget(lo)) + if (!pnfs_layout_is_valid(lo)) lo->plh_barrier = 0; dprintk("%s forget reply due to sequence\n", __func__); goto out_forget; @@ -2413,8 +2419,6 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) goto out_forget; } else { /* We have a completely new layout */ - if (!pnfs_is_first_layoutget(lo)) - goto out_forget; pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, true); } diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 49d3389bd813..cf19914fec81 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -805,19 +805,16 @@ out: } EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_add); -static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) +static int nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) { might_sleep(); - wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING, - TASK_KILLABLE); + return wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING, TASK_KILLABLE); } static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) { smp_mb__before_atomic(); - clear_bit(NFS4DS_CONNECTING, &ds->ds_state); - smp_mb__after_atomic(); - wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING); + clear_and_wake_up_bit(NFS4DS_CONNECTING, &ds->ds_state); } static struct nfs_client *(*get_v3_ds_connect)( @@ -858,7 +855,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr); if (!load_v3_ds_connect()) - goto out; + return -EPROTONOSUPPORT; list_for_each_entry(da, &ds->ds_addrs, da_node) { dprintk("%s: DS %s: trying address %s\n", @@ -993,30 +990,33 @@ int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, { int err; -again: - err = 0; - if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) { - if (version == 3) { - err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo, - retrans); 
- } else if (version == 4) { - err = _nfs4_pnfs_v4_ds_connect(mds_srv, ds, timeo, - retrans, minor_version); - } else { - dprintk("%s: unsupported DS version %d\n", __func__, - version); - err = -EPROTONOSUPPORT; - } + do { + err = nfs4_wait_ds_connect(ds); + if (err || ds->ds_clp) + goto out; + if (nfs4_test_deviceid_unavailable(devid)) + return -ENODEV; + } while (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) != 0); - nfs4_clear_ds_conn_bit(ds); - } else { - nfs4_wait_ds_connect(ds); + if (ds->ds_clp) + goto connect_done; - /* what was waited on didn't connect AND didn't mark unavail */ - if (!ds->ds_clp && !nfs4_test_deviceid_unavailable(devid)) - goto again; + switch (version) { + case 3: + err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo, retrans); + break; + case 4: + err = _nfs4_pnfs_v4_ds_connect(mds_srv, ds, timeo, retrans, + minor_version); + break; + default: + dprintk("%s: unsupported DS version %d\n", __func__, version); + err = -EPROTONOSUPPORT; } +connect_done: + nfs4_clear_ds_conn_bit(ds); +out: /* * At this point the ds->ds_clp should be ready, but it might have * hit an error. 
diff --git a/fs/nfs/read.c b/fs/nfs/read.c index d2b6dce1f99f..9f39e0a1a38b 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -74,8 +74,7 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, } EXPORT_SYMBOL_GPL(nfs_pageio_init_read); -static void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio, - struct inode *inode) +static void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio) { struct nfs_pgio_mirror *pgm; unsigned long npages; @@ -86,9 +85,9 @@ static void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio, WARN_ON_ONCE(pgio->pg_mirror_count != 1); pgm = &pgio->pg_mirrors[0]; - NFS_I(inode)->read_io += pgm->pg_bytes_written; + NFS_I(pgio->pg_inode)->read_io += pgm->pg_bytes_written; npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >> PAGE_SHIFT; - nfs_add_stats(inode, NFSIOS_READPAGES, npages); + nfs_add_stats(pgio->pg_inode, NFSIOS_READPAGES, npages); } @@ -363,22 +362,23 @@ int nfs_readpage(struct file *file, struct page *page) } else desc.ctx = get_nfs_open_context(nfs_file_open_context(file)); + xchg(&desc.ctx->error, 0); if (!IS_SYNC(inode)) { ret = nfs_readpage_from_fscache(desc.ctx, inode, page); if (ret == 0) - goto out; + goto out_wait; } - xchg(&desc.ctx->error, 0); nfs_pageio_init_read(&desc.pgio, inode, false, &nfs_async_read_completion_ops); ret = readpage_async_filler(&desc, page); + if (ret) + goto out; - if (!ret) - nfs_pageio_complete_read(&desc.pgio, inode); - + nfs_pageio_complete_read(&desc.pgio); ret = desc.pgio.pg_error < 0 ? 
desc.pgio.pg_error : 0; +out_wait: if (!ret) { ret = wait_on_page_locked_killable(page); if (!PageUptodate(page) && !ret) @@ -430,7 +430,7 @@ int nfs_readpages(struct file *file, struct address_space *mapping, ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); - nfs_pageio_complete_read(&desc.pgio, inode); + nfs_pageio_complete_read(&desc.pgio); read_complete: put_nfs_open_context(desc.ctx); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 3bf82178166a..eae9bf114041 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1810,6 +1810,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, struct nfs_commit_info *cinfo) { struct nfs_commit_data *data; + unsigned short task_flags = 0; /* another commit raced with us */ if (list_empty(head)) @@ -1820,8 +1821,11 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, /* Set up the argument struct */ nfs_init_commit(data, head, NULL, cinfo); atomic_inc(&cinfo->mds->rpcs_out); + if (NFS_SERVER(inode)->nfs_client->cl_minorversion) + task_flags = RPC_TASK_MOVEABLE; return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode), - data->mds_ops, how, RPC_TASK_CRED_NOREF); + data->mds_ops, how, + RPC_TASK_CRED_NOREF | task_flags); } /* diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c index 26f2a50eceac..edec45831585 100644 --- a/fs/nfs_common/grace.c +++ b/fs/nfs_common/grace.c @@ -82,6 +82,7 @@ __state_in_grace(struct net *net, bool open) /** * locks_in_grace + * @net: network namespace * * Lock managers call this function to determine when it is OK for them * to answer ordinary lock requests, and when they should accept only diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index a75abeb1e698..935c1028c217 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -176,6 +176,12 @@ struct nfsd_net { unsigned int longest_chain_cachesize; struct shrinker nfsd_reply_cache_shrinker; + + /* tracking server-to-server copy mounts */ + spinlock_t nfsd_ssc_lock; + struct 
list_head nfsd_ssc_mount_list; + wait_queue_head_t nfsd_ssc_waitq; + /* utsname taken from the process that starts the server */ char nfsd_name[UNX_MAXNODENAME+1]; }; diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index a1591feeea22..5dfe7644a517 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -172,7 +172,7 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p) struct nfsd3_getaclres *resp = rqstp->rq_resp; struct dentry *dentry = resp->fh.fh_dentry; struct kvec *head = rqstp->rq_res.head; - struct inode *inode = d_inode(dentry); + struct inode *inode; unsigned int base; int n; int w; @@ -181,6 +181,7 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p) return 0; switch (resp->status) { case nfs_ok: + inode = d_inode(dentry); if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh)) return 0; if (xdr_stream_encode_u32(xdr, resp->mask) < 0) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 7325592b456e..0f8b10f363e7 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -915,10 +915,8 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c args.authflavor = clp->cl_cred.cr_flavor; clp->cl_cb_ident = conn->cb_ident; } else { - if (!conn->cb_xprt) { - trace_nfsd_cb_setup_err(clp, -EINVAL); + if (!conn->cb_xprt) return -EINVAL; - } clp->cl_cb_conn.cb_xprt = conn->cb_xprt; clp->cl_cb_session = ses; args.bc_xprt = conn->cb_xprt; @@ -941,37 +939,43 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c } clp->cl_cb_client = client; clp->cl_cb_cred = cred; - trace_nfsd_cb_setup(clp); + rcu_read_lock(); + trace_nfsd_cb_setup(clp, rpc_peeraddr2str(client, RPC_DISPLAY_NETID), + args.authflavor); + rcu_read_unlock(); return 0; } +static void nfsd4_mark_cb_state(struct nfs4_client *clp, int newstate) +{ + if (clp->cl_cb_state != newstate) { + clp->cl_cb_state = newstate; + trace_nfsd_cb_state(clp); + } +} + static void 
nfsd4_mark_cb_down(struct nfs4_client *clp, int reason) { if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags)) return; - clp->cl_cb_state = NFSD4_CB_DOWN; - trace_nfsd_cb_state(clp); + nfsd4_mark_cb_state(clp, NFSD4_CB_DOWN); } static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason) { if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags)) return; - clp->cl_cb_state = NFSD4_CB_FAULT; - trace_nfsd_cb_state(clp); + nfsd4_mark_cb_state(clp, NFSD4_CB_FAULT); } static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) { struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); - trace_nfsd_cb_done(clp, task->tk_status); if (task->tk_status) nfsd4_mark_cb_down(clp, task->tk_status); - else { - clp->cl_cb_state = NFSD4_CB_UP; - trace_nfsd_cb_state(clp); - } + else + nfsd4_mark_cb_state(clp, NFSD4_CB_UP); } static void nfsd4_cb_probe_release(void *calldata) @@ -995,8 +999,8 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = { */ void nfsd4_probe_callback(struct nfs4_client *clp) { - clp->cl_cb_state = NFSD4_CB_UNKNOWN; - trace_nfsd_cb_state(clp); + trace_nfsd_cb_probe(clp); + nfsd4_mark_cb_state(clp, NFSD4_CB_UNKNOWN); set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags); nfsd4_run_cb(&clp->cl_cb_null); } @@ -1009,11 +1013,10 @@ void nfsd4_probe_callback_sync(struct nfs4_client *clp) void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn) { - clp->cl_cb_state = NFSD4_CB_UNKNOWN; + nfsd4_mark_cb_state(clp, NFSD4_CB_UNKNOWN); spin_lock(&clp->cl_lock); memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn)); spin_unlock(&clp->cl_lock); - trace_nfsd_cb_state(clp); } /* @@ -1170,8 +1173,6 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) struct nfsd4_callback *cb = calldata; struct nfs4_client *clp = cb->cb_clp; - trace_nfsd_cb_done(clp, task->tk_status); - if (!nfsd4_cb_sequence_done(task, cb)) return; @@ -1231,6 +1232,9 @@ void nfsd4_destroy_callback_queue(void) /* must be called 
under the state lock */ void nfsd4_shutdown_callback(struct nfs4_client *clp) { + if (clp->cl_cb_state != NFSD4_CB_UNKNOWN) + trace_nfsd_cb_shutdown(clp); + set_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags); /* * Note this won't actually result in a null callback; @@ -1276,7 +1280,6 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) * kill the old client: */ if (clp->cl_cb_client) { - trace_nfsd_cb_shutdown(clp); rpc_shutdown_client(clp->cl_cb_client); clp->cl_cb_client = NULL; put_cred(clp->cl_cb_cred); @@ -1322,8 +1325,6 @@ nfsd4_run_cb_work(struct work_struct *work) struct rpc_clnt *clnt; int flags; - trace_nfsd_cb_work(clp, cb->cb_msg.rpc_proc->p_name); - if (cb->cb_need_restart) { cb->cb_need_restart = false; } else { @@ -1345,7 +1346,7 @@ nfsd4_run_cb_work(struct work_struct *work) * Don't send probe messages for 4.1 or later. */ if (!cb->cb_ops && clp->cl_minorversion) { - clp->cl_cb_state = NFSD4_CB_UP; + nfsd4_mark_cb_state(clp, NFSD4_CB_UP); nfsd41_destroy_cb(cb); return; } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index f4ce93d7f26e..486c5dba4b65 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -55,6 +55,13 @@ module_param(inter_copy_offload_enable, bool, 0644); MODULE_PARM_DESC(inter_copy_offload_enable, "Enable inter server to server copy offload. Default: false"); +#ifdef CONFIG_NFSD_V4_2_INTER_SSC +static int nfsd4_ssc_umount_timeout = 900000; /* default to 15 mins */ +module_param(nfsd4_ssc_umount_timeout, int, 0644); +MODULE_PARM_DESC(nfsd4_ssc_umount_timeout, + "idle msecs before unmount export from source server"); +#endif + #ifdef CONFIG_NFSD_V4_SECURITY_LABEL #include <linux/security.h> @@ -1166,6 +1173,81 @@ extern void nfs_sb_deactive(struct super_block *sb); #define NFSD42_INTERSSC_MOUNTOPS "vers=4.2,addr=%s,sec=sys" /* + * setup a work entry in the ssc delayed unmount list. 
+ */ +static __be32 nfsd4_ssc_setup_dul(struct nfsd_net *nn, char *ipaddr, + struct nfsd4_ssc_umount_item **retwork, struct vfsmount **ss_mnt) +{ + struct nfsd4_ssc_umount_item *ni = 0; + struct nfsd4_ssc_umount_item *work = NULL; + struct nfsd4_ssc_umount_item *tmp; + DEFINE_WAIT(wait); + + *ss_mnt = NULL; + *retwork = NULL; + work = kzalloc(sizeof(*work), GFP_KERNEL); +try_again: + spin_lock(&nn->nfsd_ssc_lock); + list_for_each_entry_safe(ni, tmp, &nn->nfsd_ssc_mount_list, nsui_list) { + if (strncmp(ni->nsui_ipaddr, ipaddr, sizeof(ni->nsui_ipaddr))) + continue; + /* found a match */ + if (ni->nsui_busy) { + /* wait - and try again */ + prepare_to_wait(&nn->nfsd_ssc_waitq, &wait, + TASK_INTERRUPTIBLE); + spin_unlock(&nn->nfsd_ssc_lock); + + /* allow 20secs for mount/unmount for now - revisit */ + if (signal_pending(current) || + (schedule_timeout(20*HZ) == 0)) { + kfree(work); + return nfserr_eagain; + } + finish_wait(&nn->nfsd_ssc_waitq, &wait); + goto try_again; + } + *ss_mnt = ni->nsui_vfsmount; + refcount_inc(&ni->nsui_refcnt); + spin_unlock(&nn->nfsd_ssc_lock); + kfree(work); + + /* return vfsmount in ss_mnt */ + return 0; + } + if (work) { + strncpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr)); + refcount_set(&work->nsui_refcnt, 2); + work->nsui_busy = true; + list_add_tail(&work->nsui_list, &nn->nfsd_ssc_mount_list); + *retwork = work; + } + spin_unlock(&nn->nfsd_ssc_lock); + return 0; +} + +static void nfsd4_ssc_update_dul_work(struct nfsd_net *nn, + struct nfsd4_ssc_umount_item *work, struct vfsmount *ss_mnt) +{ + /* set nsui_vfsmount, clear busy flag and wakeup waiters */ + spin_lock(&nn->nfsd_ssc_lock); + work->nsui_vfsmount = ss_mnt; + work->nsui_busy = false; + wake_up_all(&nn->nfsd_ssc_waitq); + spin_unlock(&nn->nfsd_ssc_lock); +} + +static void nfsd4_ssc_cancel_dul_work(struct nfsd_net *nn, + struct nfsd4_ssc_umount_item *work) +{ + spin_lock(&nn->nfsd_ssc_lock); + list_del(&work->nsui_list); + wake_up_all(&nn->nfsd_ssc_waitq); + 
spin_unlock(&nn->nfsd_ssc_lock); + kfree(work); +} + +/* * Support one copy source server for now. */ static __be32 @@ -1181,6 +1263,8 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp, char *ipaddr, *dev_name, *raw_data; int len, raw_len; __be32 status = nfserr_inval; + struct nfsd4_ssc_umount_item *work = NULL; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); naddr = &nss->u.nl4_addr; tmp_addrlen = rpc_uaddr2sockaddr(SVC_NET(rqstp), naddr->addr, @@ -1229,12 +1313,24 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp, goto out_free_rawdata; snprintf(dev_name, len + 5, "%s%s%s:/", startsep, ipaddr, endsep); + status = nfsd4_ssc_setup_dul(nn, ipaddr, &work, &ss_mnt); + if (status) + goto out_free_devname; + if (ss_mnt) + goto out_done; + /* Use an 'internal' mount: SB_KERNMOUNT -> MNT_INTERNAL */ ss_mnt = vfs_kern_mount(type, SB_KERNMOUNT, dev_name, raw_data); module_put(type->owner); - if (IS_ERR(ss_mnt)) + if (IS_ERR(ss_mnt)) { + status = nfserr_nodev; + if (work) + nfsd4_ssc_cancel_dul_work(nn, work); goto out_free_devname; - + } + if (work) + nfsd4_ssc_update_dul_work(nn, work, ss_mnt); +out_done: status = 0; *mount = ss_mnt; @@ -1301,10 +1397,42 @@ static void nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src, struct nfsd_file *dst) { + bool found = false; + long timeout; + struct nfsd4_ssc_umount_item *tmp; + struct nfsd4_ssc_umount_item *ni = NULL; + struct nfsd_net *nn = net_generic(dst->nf_net, nfsd_net_id); + nfs42_ssc_close(src->nf_file); - fput(src->nf_file); nfsd_file_put(dst); - mntput(ss_mnt); + fput(src->nf_file); + + if (!nn) { + mntput(ss_mnt); + return; + } + spin_lock(&nn->nfsd_ssc_lock); + timeout = msecs_to_jiffies(nfsd4_ssc_umount_timeout); + list_for_each_entry_safe(ni, tmp, &nn->nfsd_ssc_mount_list, nsui_list) { + if (ni->nsui_vfsmount->mnt_sb == ss_mnt->mnt_sb) { + list_del(&ni->nsui_list); + /* + * vfsmount can be shared by multiple exports, + * decrement 
refcnt. If the count drops to 1 it + * will be unmounted when nsui_expire expires. + */ + refcount_dec(&ni->nsui_refcnt); + ni->nsui_expire = jiffies + timeout; + list_add_tail(&ni->nsui_list, &nn->nfsd_ssc_mount_list); + found = true; + break; + } + } + spin_unlock(&nn->nfsd_ssc_lock); + if (!found) { + mntput(ss_mnt); + return; + } } #else /* CONFIG_NFSD_V4_2_INTER_SSC */ @@ -1375,7 +1503,8 @@ static const struct nfsd4_callback_ops nfsd4_cb_offload_ops = { static void nfsd4_init_copy_res(struct nfsd4_copy *copy, bool sync) { - copy->cp_res.wr_stable_how = NFS_UNSTABLE; + copy->cp_res.wr_stable_how = + copy->committed ? NFS_FILE_SYNC : NFS_UNSTABLE; copy->cp_synchronous = sync; gen_boot_verifier(©->cp_res.wr_verifier, copy->cp_clp->net); } @@ -1386,6 +1515,7 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy) u64 bytes_total = copy->cp_count; u64 src_pos = copy->cp_src_pos; u64 dst_pos = copy->cp_dst_pos; + __be32 status; /* See RFC 7862 p.67: */ if (bytes_total == 0) @@ -1403,6 +1533,16 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy) src_pos += bytes_copied; dst_pos += bytes_copied; } while (bytes_total > 0 && !copy->cp_synchronous); + /* for a non-zero asynchronous copy do a commit of data */ + if (!copy->cp_synchronous && copy->cp_res.wr_bytes_written > 0) { + down_write(©->nf_dst->nf_rwsem); + status = vfs_fsync_range(copy->nf_dst->nf_file, + copy->cp_dst_pos, + copy->cp_res.wr_bytes_written, 0); + up_write(©->nf_dst->nf_rwsem); + if (!status) + copy->committed = true; + } return bytes_copied; } @@ -1497,6 +1637,8 @@ do_callback: memcpy(&cb_copy->fh, ©->fh, sizeof(copy->fh)); nfsd4_init_cb(&cb_copy->cp_cb, cb_copy->cp_clp, &nfsd4_cb_offload_ops, NFSPROC4_CLNT_CB_OFFLOAD); + trace_nfsd_cb_offload(copy->cp_clp, ©->cp_res.cb_stateid, + ©->fh, copy->cp_count, copy->nfserr); nfsd4_run_cb(&cb_copy->cp_cb); out: if (!copy->cp_intra) @@ -3232,7 +3374,7 @@ bool nfsd4_spo_must_allow(struct svc_rqst *rqstp) { struct nfsd4_compoundres *resp = 
rqstp->rq_resp; struct nfsd4_compoundargs *argp = rqstp->rq_argp; - struct nfsd4_op *this = &argp->ops[resp->opcnt - 1]; + struct nfsd4_op *this; struct nfsd4_compound_state *cstate = &resp->cstate; struct nfs4_op_map *allow = &cstate->clp->cl_spo_must_allow; u32 opiter; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b517a8794400..fa67ecd5fe63 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -44,6 +44,7 @@ #include <linux/jhash.h> #include <linux/string_helpers.h> #include <linux/fsnotify.h> +#include <linux/nfs_ssc.h> #include "xdr4.h" #include "xdr4cb.h" #include "vfs.h" @@ -1745,6 +1746,8 @@ static void nfsd4_conn_lost(struct svc_xpt_user *u) struct nfsd4_conn *c = container_of(u, struct nfsd4_conn, cn_xpt_user); struct nfs4_client *clp = c->cn_session->se_client; + trace_nfsd_cb_lost(clp); + spin_lock(&clp->cl_lock); if (!list_empty(&c->cn_persession)) { list_del(&c->cn_persession); @@ -2351,10 +2354,25 @@ static struct nfs4_client *get_nfsdfs_clp(struct inode *inode) static void seq_quote_mem(struct seq_file *m, char *data, int len) { seq_printf(m, "\""); - seq_escape_mem_ascii(m, data, len); + seq_escape_mem(m, data, len, ESCAPE_HEX | ESCAPE_NAP | ESCAPE_APPEND, "\"\\"); seq_printf(m, "\""); } +static const char *cb_state2str(int state) +{ + switch (state) { + case NFSD4_CB_UP: + return "UP"; + case NFSD4_CB_UNKNOWN: + return "UNKNOWN"; + case NFSD4_CB_DOWN: + return "DOWN"; + case NFSD4_CB_FAULT: + return "FAULT"; + } + return "UNDEFINED"; +} + static int client_info_show(struct seq_file *m, void *v) { struct inode *inode = m->private; @@ -2383,6 +2401,8 @@ static int client_info_show(struct seq_file *m, void *v) seq_printf(m, "\nImplementation time: [%lld, %ld]\n", clp->cl_nii_time.tv_sec, clp->cl_nii_time.tv_nsec); } + seq_printf(m, "callback state: %s\n", cb_state2str(clp->cl_cb_state)); + seq_printf(m, "callback address: %pISpc\n", &clp->cl_cb_conn.cb_addr); drop_client(clp); return 0; @@ -2665,6 +2685,8 @@ static void 
force_expire_client(struct nfs4_client *clp) struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); bool already_expired; + trace_nfsd_clid_admin_expired(&clp->cl_clientid); + spin_lock(&clp->cl_lock); clp->cl_time = 0; spin_unlock(&clp->cl_lock); @@ -2816,14 +2838,11 @@ move_to_confirmed(struct nfs4_client *clp) lockdep_assert_held(&nn->client_lock); - dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp); list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[idhashval]); rb_erase(&clp->cl_namenode, &nn->unconf_name_tree); add_clp_to_name_tree(clp, &nn->conf_name_tree); - if (!test_and_set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags) && - clp->cl_nfsd_dentry && - clp->cl_nfsd_info_dentry) - fsnotify_dentry(clp->cl_nfsd_info_dentry, FS_MODIFY); + set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags); + trace_nfsd_clid_confirmed(&clp->cl_clientid); renew_client_locked(clp); } @@ -3176,20 +3195,24 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } /* case 6 */ exid->flags |= EXCHGID4_FLAG_CONFIRMED_R; + trace_nfsd_clid_confirmed_r(conf); goto out_copy; } if (!creds_match) { /* case 3 */ if (client_has_state(conf)) { status = nfserr_clid_inuse; + trace_nfsd_clid_cred_mismatch(conf, rqstp); goto out; } goto out_new; } if (verfs_match) { /* case 2 */ conf->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R; + trace_nfsd_clid_confirmed_r(conf); goto out_copy; } /* case 5, client reboot */ + trace_nfsd_clid_verf_mismatch(conf, rqstp, &verf); conf = NULL; goto out_new; } @@ -3199,16 +3222,19 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - unconf = find_unconfirmed_client_by_name(&exid->clname, nn); + unconf = find_unconfirmed_client_by_name(&exid->clname, nn); if (unconf) /* case 4, possible retry or client restart */ unhash_client_locked(unconf); - /* case 1 (normal case) */ + /* case 1, new owner ID */ + trace_nfsd_clid_fresh(new); + out_new: if (conf) { status = mark_client_expired_locked(conf); if 
(status) goto out; + trace_nfsd_clid_replaced(&conf->cl_clientid); } new->cl_minorversion = cstate->minorversion; new->cl_spo_must_allow.u.words[0] = exid->spo_must_allow[0]; @@ -3232,8 +3258,10 @@ out: out_nolock: if (new) expire_client(new); - if (unconf) + if (unconf) { + trace_nfsd_clid_expire_unconf(&unconf->cl_clientid); expire_client(unconf); + } return status; } @@ -3425,9 +3453,10 @@ nfsd4_create_session(struct svc_rqst *rqstp, goto out_free_conn; } } else if (unconf) { + status = nfserr_clid_inuse; if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { - status = nfserr_clid_inuse; + trace_nfsd_clid_cred_mismatch(unconf, rqstp); goto out_free_conn; } status = nfserr_wrong_cred; @@ -3447,6 +3476,7 @@ nfsd4_create_session(struct svc_rqst *rqstp, old = NULL; goto out_free_conn; } + trace_nfsd_clid_replaced(&old->cl_clientid); } move_to_confirmed(unconf); conf = unconf; @@ -3471,6 +3501,8 @@ nfsd4_create_session(struct svc_rqst *rqstp, /* cache solo and embedded create sessions under the client_lock */ nfsd4_cache_create_session(cr_ses, cs_slot, status); spin_unlock(&nn->client_lock); + if (conf == unconf) + fsnotify_dentry(conf->cl_nfsd_info_dentry, FS_MODIFY); /* init connection and backchannel */ nfsd4_init_conn(rqstp, conn, new); nfsd4_put_session(new); @@ -3904,6 +3936,7 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, status = nfserr_wrong_cred; goto out; } + trace_nfsd_clid_destroyed(&clp->cl_clientid); unhash_client_locked(clp); out: spin_unlock(&nn->client_lock); @@ -3946,6 +3979,7 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, goto out; status = nfs_ok; + trace_nfsd_clid_reclaim_complete(&clp->cl_clientid); nfsd4_client_record_create(clp); inc_reclaim_complete(clp); out: @@ -3967,27 +4001,29 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, new = create_client(clname, rqstp, &clverifier); if (new == NULL) return nfserr_jukebox; - /* Cases below refer to rfc 
3530 section 14.2.33: */ spin_lock(&nn->client_lock); conf = find_confirmed_client_by_name(&clname, nn); if (conf && client_has_state(conf)) { - /* case 0: */ status = nfserr_clid_inuse; if (clp_used_exchangeid(conf)) goto out; if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { - trace_nfsd_clid_inuse_err(conf); + trace_nfsd_clid_cred_mismatch(conf, rqstp); goto out; } } unconf = find_unconfirmed_client_by_name(&clname, nn); if (unconf) unhash_client_locked(unconf); - /* We need to handle only case 1: probable callback update */ - if (conf && same_verf(&conf->cl_verifier, &clverifier)) { - copy_clid(new, conf); - gen_confirm(new, nn); - } + if (conf) { + if (same_verf(&conf->cl_verifier, &clverifier)) { + copy_clid(new, conf); + gen_confirm(new, nn); + } else + trace_nfsd_clid_verf_mismatch(conf, rqstp, + &clverifier); + } else + trace_nfsd_clid_fresh(new); new->cl_minorversion = 0; gen_callback(new, setclid, rqstp); add_to_unconfirmed(new); @@ -4000,12 +4036,13 @@ out: spin_unlock(&nn->client_lock); if (new) free_client(new); - if (unconf) + if (unconf) { + trace_nfsd_clid_expire_unconf(&unconf->cl_clientid); expire_client(unconf); + } return status; } - __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, @@ -4034,25 +4071,27 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, * Nevertheless, RFC 7530 recommends INUSE for this case: */ status = nfserr_clid_inuse; - if (unconf && !same_creds(&unconf->cl_cred, &rqstp->rq_cred)) + if (unconf && !same_creds(&unconf->cl_cred, &rqstp->rq_cred)) { + trace_nfsd_clid_cred_mismatch(unconf, rqstp); goto out; - if (conf && !same_creds(&conf->cl_cred, &rqstp->rq_cred)) + } + if (conf && !same_creds(&conf->cl_cred, &rqstp->rq_cred)) { + trace_nfsd_clid_cred_mismatch(conf, rqstp); goto out; - /* cases below refer to rfc 3530 section 14.2.34: */ + } if (!unconf || !same_verf(&confirm, &unconf->cl_confirm)) { if (conf && same_verf(&confirm, &conf->cl_confirm)) { - /* case 2: probable 
retransmit */ status = nfs_ok; - } else /* case 4: client hasn't noticed we rebooted yet? */ + } else status = nfserr_stale_clientid; goto out; } status = nfs_ok; - if (conf) { /* case 1: callback update */ + if (conf) { old = unconf; unhash_client_locked(old); nfsd4_change_callback(conf, &unconf->cl_cb_conn); - } else { /* case 3: normal case; new or rebooted client */ + } else { old = find_confirmed_client_by_name(&unconf->cl_name, nn); if (old) { status = nfserr_clid_inuse; @@ -4065,12 +4104,15 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, old = NULL; goto out; } + trace_nfsd_clid_replaced(&old->cl_clientid); } move_to_confirmed(unconf); conf = unconf; } get_client_locked(conf); spin_unlock(&nn->client_lock); + if (conf == unconf) + fsnotify_dentry(conf->cl_nfsd_info_dentry, FS_MODIFY); nfsd4_probe_callback(conf); spin_lock(&nn->client_lock); put_client_renew_locked(conf); @@ -4618,7 +4660,7 @@ nfsd_break_deleg_cb(struct file_lock *fl) struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner; struct nfs4_file *fp = dp->dl_stid.sc_file; - trace_nfsd_deleg_break(&dp->dl_stid.sc_stateid); + trace_nfsd_cb_recall(&dp->dl_stid); /* * We don't want the locks code to timeout the lease for us; @@ -5457,6 +5499,69 @@ static bool state_expired(struct laundry_time *lt, time64_t last_refresh) return false; } +#ifdef CONFIG_NFSD_V4_2_INTER_SSC +void nfsd4_ssc_init_umount_work(struct nfsd_net *nn) +{ + spin_lock_init(&nn->nfsd_ssc_lock); + INIT_LIST_HEAD(&nn->nfsd_ssc_mount_list); + init_waitqueue_head(&nn->nfsd_ssc_waitq); +} +EXPORT_SYMBOL_GPL(nfsd4_ssc_init_umount_work); + +/* + * This is called when nfsd is being shutdown, after all inter_ssc + * cleanup were done, to destroy the ssc delayed unmount list. 
+ */ +static void nfsd4_ssc_shutdown_umount(struct nfsd_net *nn) +{ + struct nfsd4_ssc_umount_item *ni = NULL; + struct nfsd4_ssc_umount_item *tmp; + + spin_lock(&nn->nfsd_ssc_lock); + list_for_each_entry_safe(ni, tmp, &nn->nfsd_ssc_mount_list, nsui_list) { + list_del(&ni->nsui_list); + spin_unlock(&nn->nfsd_ssc_lock); + mntput(ni->nsui_vfsmount); + kfree(ni); + spin_lock(&nn->nfsd_ssc_lock); + } + spin_unlock(&nn->nfsd_ssc_lock); +} + +static void nfsd4_ssc_expire_umount(struct nfsd_net *nn) +{ + bool do_wakeup = false; + struct nfsd4_ssc_umount_item *ni = 0; + struct nfsd4_ssc_umount_item *tmp; + + spin_lock(&nn->nfsd_ssc_lock); + list_for_each_entry_safe(ni, tmp, &nn->nfsd_ssc_mount_list, nsui_list) { + if (time_after(jiffies, ni->nsui_expire)) { + if (refcount_read(&ni->nsui_refcnt) > 1) + continue; + + /* mark being unmount */ + ni->nsui_busy = true; + spin_unlock(&nn->nfsd_ssc_lock); + mntput(ni->nsui_vfsmount); + spin_lock(&nn->nfsd_ssc_lock); + + /* waiters need to start from begin of list */ + list_del(&ni->nsui_list); + kfree(ni); + + /* wakeup ssc_connect waiters */ + do_wakeup = true; + continue; + } + break; + } + if (do_wakeup) + wake_up_all(&nn->nfsd_ssc_waitq); + spin_unlock(&nn->nfsd_ssc_lock); +} +#endif + static time64_t nfs4_laundromat(struct nfsd_net *nn) { @@ -5495,10 +5600,8 @@ nfs4_laundromat(struct nfsd_net *nn) clp = list_entry(pos, struct nfs4_client, cl_lru); if (!state_expired(<, clp->cl_time)) break; - if (mark_client_expired_locked(clp)) { - trace_nfsd_clid_expired(&clp->cl_clientid); + if (mark_client_expired_locked(clp)) continue; - } list_add(&clp->cl_lru, &reaplist); } spin_unlock(&nn->client_lock); @@ -5568,6 +5671,10 @@ nfs4_laundromat(struct nfsd_net *nn) list_del_init(&nbl->nbl_lru); free_blocked_lock(nbl); } +#ifdef CONFIG_NFSD_V4_2_INTER_SSC + /* service the server-to-server copy delayed unmount list */ + nfsd4_ssc_expire_umount(nn); +#endif out: return max_t(time64_t, lt.new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); } @@ -6430,8 
+6537,10 @@ nfsd4_lm_notify(struct file_lock *fl) } spin_unlock(&nn->blocked_locks_lock); - if (queue) + if (queue) { + trace_nfsd_cb_notify_lock(lo, nbl); nfsd4_run_cb(&nbl->nbl_cb); + } } static const struct lock_manager_operations nfsd_posix_mng_ops = { @@ -7229,7 +7338,6 @@ nfs4_client_to_reclaim(struct xdr_netobj name, struct xdr_netobj princhash, unsigned int strhashval; struct nfs4_client_reclaim *crp; - trace_nfsd_clid_reclaim(nn, name.len, name.data); crp = alloc_reclaim(); if (crp) { strhashval = clientstr_hashval(name); @@ -7279,8 +7387,6 @@ nfsd4_find_reclaim_client(struct xdr_netobj name, struct nfsd_net *nn) unsigned int strhashval; struct nfs4_client_reclaim *crp = NULL; - trace_nfsd_clid_find(nn, name.len, name.data); - strhashval = clientstr_hashval(name); list_for_each_entry(crp, &nn->reclaim_str_hashtbl[strhashval], cr_strhash) { if (compare_blob(&crp->cr_name, &name) == 0) { @@ -7486,6 +7592,9 @@ nfs4_state_shutdown_net(struct net *net) nfsd4_client_tracking_exit(net); nfs4_state_destroy_net(net); +#ifdef CONFIG_NFSD_V4_2_INTER_SSC + nfsd4_ssc_shutdown_umount(nn); +#endif } void diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 14dbfa75059d..9664303afdaf 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -484,6 +484,10 @@ static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval) extern int nfsd4_is_junction(struct dentry *dentry); extern int register_cld_notifier(void); extern void unregister_cld_notifier(void); +#ifdef CONFIG_NFSD_V4_2_INTER_SSC +extern void nfsd4_ssc_init_umount_work(struct nfsd_net *nn); +#endif + #else /* CONFIG_NFSD_V4 */ static inline int nfsd4_is_junction(struct dentry *dentry) { diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index aff2cda5c6c3..6106697adc04 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -225,15 +225,12 @@ static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) * returns a crc32 hash for the filehandle that is compatible with * the one displayed by 
"wireshark". */ - -static inline u32 -knfsd_fh_hash(struct knfsd_fh *fh) +static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh) { return ~crc32_le(0xFFFFFFFF, (unsigned char *)&fh->fh_base, fh->fh_size); } #else -static inline u32 -knfsd_fh_hash(struct knfsd_fh *fh) +static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh) { return 0; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index dd5d69921676..ccb59e91011b 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -403,6 +403,9 @@ static int nfsd_startup_net(struct net *net, const struct cred *cred) if (ret) goto out_filecache; +#ifdef CONFIG_NFSD_V4_2_INTER_SSC + nfsd4_ssc_init_umount_work(nn); +#endif nn->nfsd_net_up = true; return 0; diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 27a93ebd1d80..adaec43548d1 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -408,7 +408,6 @@ TRACE_EVENT(nfsd_dirent, __entry->ino = ino; __entry->len = namlen; memcpy(__get_str(name), name, namlen); - __assign_str(name, name); ), TP_printk("fh_hash=0x%08x ino=%llu name=%.*s", __entry->fh_hash, __entry->ino, @@ -459,7 +458,6 @@ DEFINE_STATEID_EVENT(layout_recall_release); DEFINE_STATEID_EVENT(open); DEFINE_STATEID_EVENT(deleg_read); -DEFINE_STATEID_EVENT(deleg_break); DEFINE_STATEID_EVENT(deleg_recall); DECLARE_EVENT_CLASS(nfsd_stateseqid_class, @@ -511,7 +509,12 @@ DEFINE_EVENT(nfsd_clientid_class, nfsd_clid_##name, \ TP_PROTO(const clientid_t *clid), \ TP_ARGS(clid)) -DEFINE_CLIENTID_EVENT(expired); +DEFINE_CLIENTID_EVENT(expire_unconf); +DEFINE_CLIENTID_EVENT(reclaim_complete); +DEFINE_CLIENTID_EVENT(confirmed); +DEFINE_CLIENTID_EVENT(destroyed); +DEFINE_CLIENTID_EVENT(admin_expired); +DEFINE_CLIENTID_EVENT(replaced); DEFINE_CLIENTID_EVENT(purged); DEFINE_CLIENTID_EVENT(renew); DEFINE_CLIENTID_EVENT(stale); @@ -536,58 +539,102 @@ DEFINE_EVENT(nfsd_net_class, nfsd_##name, \ DEFINE_NET_EVENT(grace_start); DEFINE_NET_EVENT(grace_complete); -DECLARE_EVENT_CLASS(nfsd_clid_class, - TP_PROTO(const struct nfsd_net 
*nn, - unsigned int namelen, - const unsigned char *namedata), - TP_ARGS(nn, namelen, namedata), +TRACE_EVENT(nfsd_clid_cred_mismatch, + TP_PROTO( + const struct nfs4_client *clp, + const struct svc_rqst *rqstp + ), + TP_ARGS(clp, rqstp), TP_STRUCT__entry( - __field(unsigned long long, boot_time) - __field(unsigned int, namelen) - __dynamic_array(unsigned char, name, namelen) + __field(u32, cl_boot) + __field(u32, cl_id) + __field(unsigned long, cl_flavor) + __field(unsigned long, new_flavor) + __array(unsigned char, addr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( - __entry->boot_time = nn->boot_time; - __entry->namelen = namelen; - memcpy(__get_dynamic_array(name), namedata, namelen); + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + __entry->cl_flavor = clp->cl_cred.cr_flavor; + __entry->new_flavor = rqstp->rq_cred.cr_flavor; + memcpy(__entry->addr, &rqstp->rq_xprt->xpt_remote, + sizeof(struct sockaddr_in6)); ), - TP_printk("boot_time=%16llx nfs4_clientid=%.*s", - __entry->boot_time, __entry->namelen, __get_str(name)) + TP_printk("client %08x:%08x flavor=%s, conflict=%s from addr=%pISpc", + __entry->cl_boot, __entry->cl_id, + show_nfsd_authflavor(__entry->cl_flavor), + show_nfsd_authflavor(__entry->new_flavor), __entry->addr + ) ) -#define DEFINE_CLID_EVENT(name) \ -DEFINE_EVENT(nfsd_clid_class, nfsd_clid_##name, \ - TP_PROTO(const struct nfsd_net *nn, \ - unsigned int namelen, \ - const unsigned char *namedata), \ - TP_ARGS(nn, namelen, namedata)) - -DEFINE_CLID_EVENT(find); -DEFINE_CLID_EVENT(reclaim); +TRACE_EVENT(nfsd_clid_verf_mismatch, + TP_PROTO( + const struct nfs4_client *clp, + const struct svc_rqst *rqstp, + const nfs4_verifier *verf + ), + TP_ARGS(clp, rqstp, verf), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __array(unsigned char, cl_verifier, NFS4_VERIFIER_SIZE) + __array(unsigned char, new_verifier, NFS4_VERIFIER_SIZE) + __array(unsigned char, addr, sizeof(struct sockaddr_in6)) 
+ ), + TP_fast_assign( + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + memcpy(__entry->cl_verifier, (void *)&clp->cl_verifier, + NFS4_VERIFIER_SIZE); + memcpy(__entry->new_verifier, (void *)verf, + NFS4_VERIFIER_SIZE); + memcpy(__entry->addr, &rqstp->rq_xprt->xpt_remote, + sizeof(struct sockaddr_in6)); + ), + TP_printk("client %08x:%08x verf=0x%s, updated=0x%s from addr=%pISpc", + __entry->cl_boot, __entry->cl_id, + __print_hex_str(__entry->cl_verifier, NFS4_VERIFIER_SIZE), + __print_hex_str(__entry->new_verifier, NFS4_VERIFIER_SIZE), + __entry->addr + ) +); -TRACE_EVENT(nfsd_clid_inuse_err, +DECLARE_EVENT_CLASS(nfsd_clid_class, TP_PROTO(const struct nfs4_client *clp), TP_ARGS(clp), TP_STRUCT__entry( __field(u32, cl_boot) __field(u32, cl_id) __array(unsigned char, addr, sizeof(struct sockaddr_in6)) - __field(unsigned int, namelen) - __dynamic_array(unsigned char, name, clp->cl_name.len) + __field(unsigned long, flavor) + __array(unsigned char, verifier, NFS4_VERIFIER_SIZE) + __dynamic_array(char, name, clp->cl_name.len + 1) ), TP_fast_assign( __entry->cl_boot = clp->cl_clientid.cl_boot; __entry->cl_id = clp->cl_clientid.cl_id; memcpy(__entry->addr, &clp->cl_addr, sizeof(struct sockaddr_in6)); - __entry->namelen = clp->cl_name.len; - memcpy(__get_dynamic_array(name), clp->cl_name.data, - clp->cl_name.len); - ), - TP_printk("nfs4_clientid %.*s already in use by %pISpc, client %08x:%08x", - __entry->namelen, __get_str(name), __entry->addr, + __entry->flavor = clp->cl_cred.cr_flavor; + memcpy(__entry->verifier, (void *)&clp->cl_verifier, + NFS4_VERIFIER_SIZE); + memcpy(__get_str(name), clp->cl_name.data, clp->cl_name.len); + __get_str(name)[clp->cl_name.len] = '\0'; + ), + TP_printk("addr=%pISpc name='%s' verifier=0x%s flavor=%s client=%08x:%08x", + __entry->addr, __get_str(name), + __print_hex_str(__entry->verifier, NFS4_VERIFIER_SIZE), + show_nfsd_authflavor(__entry->flavor), __entry->cl_boot, __entry->cl_id) -) +); + 
+#define DEFINE_CLID_EVENT(name) \ +DEFINE_EVENT(nfsd_clid_class, nfsd_clid_##name, \ + TP_PROTO(const struct nfs4_client *clp), \ + TP_ARGS(clp)) + +DEFINE_CLID_EVENT(fresh); +DEFINE_CLID_EVENT(confirmed_r); /* * from fs/nfsd/filecache.h @@ -809,9 +856,9 @@ TRACE_EVENT(nfsd_cb_args, memcpy(__entry->addr, &conn->cb_addr, sizeof(struct sockaddr_in6)); ), - TP_printk("client %08x:%08x callback addr=%pISpc prog=%u ident=%u", - __entry->cl_boot, __entry->cl_id, - __entry->addr, __entry->prog, __entry->ident) + TP_printk("addr=%pISpc client %08x:%08x prog=%u ident=%u", + __entry->addr, __entry->cl_boot, __entry->cl_id, + __entry->prog, __entry->ident) ); TRACE_EVENT(nfsd_cb_nodelegs, @@ -828,11 +875,6 @@ TRACE_EVENT(nfsd_cb_nodelegs, TP_printk("client %08x:%08x", __entry->cl_boot, __entry->cl_id) ) -TRACE_DEFINE_ENUM(NFSD4_CB_UP); -TRACE_DEFINE_ENUM(NFSD4_CB_UNKNOWN); -TRACE_DEFINE_ENUM(NFSD4_CB_DOWN); -TRACE_DEFINE_ENUM(NFSD4_CB_FAULT); - #define show_cb_state(val) \ __print_symbolic(val, \ { NFSD4_CB_UP, "UP" }, \ @@ -866,10 +908,53 @@ DEFINE_EVENT(nfsd_cb_class, nfsd_cb_##name, \ TP_PROTO(const struct nfs4_client *clp), \ TP_ARGS(clp)) -DEFINE_NFSD_CB_EVENT(setup); DEFINE_NFSD_CB_EVENT(state); +DEFINE_NFSD_CB_EVENT(probe); +DEFINE_NFSD_CB_EVENT(lost); DEFINE_NFSD_CB_EVENT(shutdown); +TRACE_DEFINE_ENUM(RPC_AUTH_NULL); +TRACE_DEFINE_ENUM(RPC_AUTH_UNIX); +TRACE_DEFINE_ENUM(RPC_AUTH_GSS); +TRACE_DEFINE_ENUM(RPC_AUTH_GSS_KRB5); +TRACE_DEFINE_ENUM(RPC_AUTH_GSS_KRB5I); +TRACE_DEFINE_ENUM(RPC_AUTH_GSS_KRB5P); + +#define show_nfsd_authflavor(val) \ + __print_symbolic(val, \ + { RPC_AUTH_NULL, "none" }, \ + { RPC_AUTH_UNIX, "sys" }, \ + { RPC_AUTH_GSS, "gss" }, \ + { RPC_AUTH_GSS_KRB5, "krb5" }, \ + { RPC_AUTH_GSS_KRB5I, "krb5i" }, \ + { RPC_AUTH_GSS_KRB5P, "krb5p" }) + +TRACE_EVENT(nfsd_cb_setup, + TP_PROTO(const struct nfs4_client *clp, + const char *netid, + rpc_authflavor_t authflavor + ), + TP_ARGS(clp, netid, authflavor), + TP_STRUCT__entry( + __field(u32, cl_boot) + 
__field(u32, cl_id) + __field(unsigned long, authflavor) + __array(unsigned char, addr, sizeof(struct sockaddr_in6)) + __array(unsigned char, netid, 8) + ), + TP_fast_assign( + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + strlcpy(__entry->netid, netid, sizeof(__entry->netid)); + __entry->authflavor = authflavor; + memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr, + sizeof(struct sockaddr_in6)); + ), + TP_printk("addr=%pISpc client %08x:%08x proto=%s flavor=%s", + __entry->addr, __entry->cl_boot, __entry->cl_id, + __entry->netid, show_nfsd_authflavor(__entry->authflavor)) +); + TRACE_EVENT(nfsd_cb_setup_err, TP_PROTO( const struct nfs4_client *clp, @@ -893,52 +978,97 @@ TRACE_EVENT(nfsd_cb_setup_err, __entry->addr, __entry->cl_boot, __entry->cl_id, __entry->error) ); -TRACE_EVENT(nfsd_cb_work, +TRACE_EVENT(nfsd_cb_recall, TP_PROTO( - const struct nfs4_client *clp, - const char *procedure + const struct nfs4_stid *stid ), - TP_ARGS(clp, procedure), + TP_ARGS(stid), TP_STRUCT__entry( __field(u32, cl_boot) __field(u32, cl_id) - __string(procedure, procedure) + __field(u32, si_id) + __field(u32, si_generation) __array(unsigned char, addr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( + const stateid_t *stp = &stid->sc_stateid; + const struct nfs4_client *clp = stid->sc_client; + + __entry->cl_boot = stp->si_opaque.so_clid.cl_boot; + __entry->cl_id = stp->si_opaque.so_clid.cl_id; + __entry->si_id = stp->si_opaque.so_id; + __entry->si_generation = stp->si_generation; + if (clp) + memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr, + sizeof(struct sockaddr_in6)); + else + memset(__entry->addr, 0, sizeof(struct sockaddr_in6)); + ), + TP_printk("addr=%pISpc client %08x:%08x stateid %08x:%08x", + __entry->addr, __entry->cl_boot, __entry->cl_id, + __entry->si_id, __entry->si_generation) +); + +TRACE_EVENT(nfsd_cb_notify_lock, + TP_PROTO( + const struct nfs4_lockowner *lo, + const struct nfsd4_blocked_lock *nbl + ), + TP_ARGS(lo, nbl), + 
TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, fh_hash) + __array(unsigned char, addr, sizeof(struct sockaddr_in6)) + ), + TP_fast_assign( + const struct nfs4_client *clp = lo->lo_owner.so_client; + __entry->cl_boot = clp->cl_clientid.cl_boot; __entry->cl_id = clp->cl_clientid.cl_id; - __assign_str(procedure, procedure) + __entry->fh_hash = knfsd_fh_hash(&nbl->nbl_fh); memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr, sizeof(struct sockaddr_in6)); ), - TP_printk("addr=%pISpc client %08x:%08x procedure=%s", + TP_printk("addr=%pISpc client %08x:%08x fh_hash=0x%08x", __entry->addr, __entry->cl_boot, __entry->cl_id, - __get_str(procedure)) + __entry->fh_hash) ); -TRACE_EVENT(nfsd_cb_done, +TRACE_EVENT(nfsd_cb_offload, TP_PROTO( const struct nfs4_client *clp, - int status + const stateid_t *stp, + const struct knfsd_fh *fh, + u64 count, + __be32 status ), - TP_ARGS(clp, status), + TP_ARGS(clp, stp, fh, count, status), TP_STRUCT__entry( __field(u32, cl_boot) __field(u32, cl_id) + __field(u32, si_id) + __field(u32, si_generation) + __field(u32, fh_hash) __field(int, status) + __field(u64, count) __array(unsigned char, addr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( - __entry->cl_boot = clp->cl_clientid.cl_boot; - __entry->cl_id = clp->cl_clientid.cl_id; - __entry->status = status; + __entry->cl_boot = stp->si_opaque.so_clid.cl_boot; + __entry->cl_id = stp->si_opaque.so_clid.cl_id; + __entry->si_id = stp->si_opaque.so_id; + __entry->si_generation = stp->si_generation; + __entry->fh_hash = knfsd_fh_hash(fh); + __entry->status = be32_to_cpu(status); + __entry->count = count; memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr, sizeof(struct sockaddr_in6)); ), - TP_printk("addr=%pISpc client %08x:%08x status=%d", + TP_printk("addr=%pISpc client %08x:%08x stateid %08x:%08x fh_hash=0x%08x count=%llu status=%d", __entry->addr, __entry->cl_boot, __entry->cl_id, - __entry->status) + __entry->si_id, __entry->si_generation, + __entry->fh_hash, 
__entry->count, __entry->status) ); #endif /* _NFSD_TRACE_H */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 15adf1f6ab21..a224a5e23cc1 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1123,6 +1123,19 @@ out: } #ifdef CONFIG_NFSD_V3 +static int +nfsd_filemap_write_and_wait_range(struct nfsd_file *nf, loff_t offset, + loff_t end) +{ + struct address_space *mapping = nf->nf_file->f_mapping; + int ret = filemap_fdatawrite_range(mapping, offset, end); + + if (ret) + return ret; + filemap_fdatawait_range_keep_errors(mapping, offset, end); + return 0; +} + /* * Commit all pending writes to stable storage. * @@ -1153,10 +1166,11 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, if (err) goto out; if (EX_ISSYNC(fhp->fh_export)) { - int err2; + int err2 = nfsd_filemap_write_and_wait_range(nf, offset, end); down_write(&nf->nf_rwsem); - err2 = vfs_fsync_range(nf->nf_file, offset, end, 0); + if (!err2) + err2 = vfs_fsync_range(nf->nf_file, offset, end, 0); switch (err2) { case 0: nfsd_copy_boot_verifier(verf, net_generic(nf->nf_net, @@ -1613,9 +1627,9 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, host_err = vfs_symlink(&init_user_ns, d_inode(dentry), dnew, path); err = nfserrno(host_err); + fh_unlock(fhp); if (!err) err = nfserrno(commit_metadata(fhp)); - fh_unlock(fhp); fh_drop_write(fhp); @@ -1680,6 +1694,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, if (d_really_is_negative(dold)) goto out_dput; host_err = vfs_link(dold, &init_user_ns, dirp, dnew, NULL); + fh_unlock(ffhp); if (!host_err) { err = nfserrno(commit_metadata(ffhp)); if (!err) @@ -1859,6 +1874,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, { struct dentry *dentry, *rdentry; struct inode *dirp; + struct inode *rinode; __be32 err; int host_err; @@ -1887,6 +1903,8 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, host_err = -ENOENT; goto out_drop_write; } + rinode = d_inode(rdentry); + ihold(rinode); if (!type) type = 
d_inode(rdentry)->i_mode & S_IFMT; @@ -1899,9 +1917,11 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, host_err = vfs_rmdir(&init_user_ns, dirp, rdentry); } + fh_unlock(fhp); if (!host_err) host_err = commit_metadata(fhp); dput(rdentry); + iput(rinode); /* truncate the inode here */ out_drop_write: fh_drop_write(fhp); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index a7c425254fee..3e4052e3bd50 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -567,6 +567,7 @@ struct nfsd4_copy { struct vfsmount *ss_mnt; struct nfs_fh c_fh; nfs4_stateid stateid; + bool committed; }; struct nfsd4_seek { diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index f42ab57201e7..ab9ec073330f 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -738,7 +738,6 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree, if (ptr2 != ptr + cnt || ++cnt == maxblocks) goto end; index++; - continue; } if (level == maxlevel) break; diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index c0361ce45f62..97769fe4d588 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -434,6 +434,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) static const struct address_space_operations def_mdt_aops = { + .set_page_dirty = __set_page_dirty_buffers, .writepage = nilfs_mdt_write_page, }; diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index 303d71430bdd..68e8d61e28dd 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -19,19 +19,6 @@ /* /sys/fs/<nilfs>/ */ static struct kset *nilfs_kset; -#define NILFS_SHOW_TIME(time_t_val, buf) ({ \ - struct tm res; \ - int count = 0; \ - time64_to_tm(time_t_val, 0, &res); \ - res.tm_year += 1900; \ - res.tm_mon += 1; \ - count = scnprintf(buf, PAGE_SIZE, \ - "%ld-%.2d-%.2d %.2d:%.2d:%.2d\n", \ - res.tm_year, res.tm_mon, res.tm_mday, \ - res.tm_hour, res.tm_min, res.tm_sec);\ - count; \ -}) - #define NILFS_DEV_INT_GROUP_OPS(name, parent_name) \ static ssize_t nilfs_##name##_attr_show(struct kobject *kobj, \ 
struct attribute *attr, char *buf) \ @@ -576,7 +563,7 @@ nilfs_segctor_last_seg_write_time_show(struct nilfs_segctor_attr *attr, ctime = nilfs->ns_ctime; up_read(&nilfs->ns_segctor_sem); - return NILFS_SHOW_TIME(ctime, buf); + return sysfs_emit(buf, "%ptTs\n", &ctime); } static ssize_t @@ -604,7 +591,7 @@ nilfs_segctor_last_nongc_write_time_show(struct nilfs_segctor_attr *attr, nongc_ctime = nilfs->ns_nongc_ctime; up_read(&nilfs->ns_segctor_sem); - return NILFS_SHOW_TIME(nongc_ctime, buf); + return sysfs_emit(buf, "%ptTs\n", &nongc_ctime); } static ssize_t @@ -724,7 +711,7 @@ nilfs_superblock_sb_write_time_show(struct nilfs_superblock_attr *attr, sbwtime = nilfs->ns_sbwtime; up_read(&nilfs->ns_sem); - return NILFS_SHOW_TIME(sbwtime, buf); + return sysfs_emit(buf, "%ptTs\n", &sbwtime); } static ssize_t @@ -1053,6 +1040,7 @@ void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs) nilfs_sysfs_delete_superblock_group(nilfs); nilfs_sysfs_delete_segctor_group(nilfs); kobject_del(&nilfs->ns_dev_kobj); + kobject_put(&nilfs->ns_dev_kobj); kfree(nilfs->ns_dev_subgroups); } diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index be5b6d2c01e7..64864fb40b40 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -471,7 +471,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, info_type, fanotify_info_name(info), info->name_len, buf, count); if (ret < 0) - return ret; + goto out_close_fd; buf += ret; count -= ret; @@ -519,7 +519,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, fanotify_event_object_fh(event), info_type, dot, dot_len, buf, count); if (ret < 0) - return ret; + goto out_close_fd; buf += ret; count -= ret; diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index e5aab265dff1..ab4f3362466d 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1684,20 +1684,17 @@ static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages, { struct page 
**last_page = pages + nr_pages; size_t total = 0; - struct iov_iter data = *i; unsigned len, copied; do { len = PAGE_SIZE - ofs; if (len > bytes) len = bytes; - copied = iov_iter_copy_from_user_atomic(*pages, &data, ofs, - len); + copied = copy_page_from_iter_atomic(*pages, ofs, len, i); total += copied; bytes -= copied; if (!bytes) break; - iov_iter_advance(&data, copied); if (copied < len) goto err; ofs = 0; @@ -1866,34 +1863,24 @@ again: if (likely(copied == bytes)) { status = ntfs_commit_pages_after_write(pages, do_pages, pos, bytes); - if (!status) - status = bytes; } do { unlock_page(pages[--do_pages]); put_page(pages[do_pages]); } while (do_pages); - if (unlikely(status < 0)) + if (unlikely(status < 0)) { + iov_iter_revert(i, copied); break; - copied = status; + } cond_resched(); - if (unlikely(!copied)) { - size_t sc; - - /* - * We failed to copy anything. Fall back to single - * segment length write. - * - * This is needed to avoid possible livelock in the - * case that all segments in the iov cannot be copied - * at once without a pagefault. - */ - sc = iov_iter_single_seg_count(i); - if (bytes > sc) - bytes = sc; + if (unlikely(copied < bytes)) { + iov_iter_revert(i, copied); + if (copied) + bytes = copied; + else if (bytes > PAGE_SIZE - ofs) + bytes = PAGE_SIZE - ofs; goto again; } - iov_iter_advance(i, copied); pos += copied; written += copied; balance_dirty_pages_ratelimited(mapping); diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index f5c058b3192c..4474adb393ca 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -477,7 +477,7 @@ err_corrupt_attr: } file_name_attr = (FILE_NAME_ATTR*)((u8*)attr + le16_to_cpu(attr->data.resident.value_offset)); - p2 = (u8*)attr + le32_to_cpu(attr->data.resident.value_length); + p2 = (u8 *)file_name_attr + le32_to_cpu(attr->data.resident.value_length); if (p2 < (u8*)attr || p2 > p) goto err_corrupt_attr; /* This attribute is ok, but is it in the $Extend directory? 
*/ diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 1294925ac94a..68d11c295dd3 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -632,8 +632,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, } if (PageUptodate(page)) { - if (!buffer_uptodate(bh)) - set_buffer_uptodate(bh); + set_buffer_uptodate(bh); } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_new(bh) && ocfs2_should_read_blk(inode, page, block_start) && @@ -2454,6 +2453,7 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } const struct address_space_operations ocfs2_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = ocfs2_readpage, .readahead = ocfs2_readahead, .writepage = ocfs2_writepage, diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index e829c2595543..f89ffcbd585f 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1442,8 +1442,6 @@ void o2hb_init(void) for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++) INIT_LIST_HEAD(&o2hb_live_slots[i]); - INIT_LIST_HEAD(&o2hb_node_events); - memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap)); memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap)); @@ -1598,12 +1596,13 @@ static ssize_t o2hb_region_start_block_store(struct config_item *item, struct o2hb_region *reg = to_o2hb_region(item); unsigned long long tmp; char *p = (char *)page; + ssize_t ret; if (reg->hr_bdev) return -EINVAL; - tmp = simple_strtoull(p, &p, 0); - if (!p || (*p && (*p != '\n'))) + ret = kstrtoull(p, 0, &tmp); + if (ret) return -EINVAL; reg->hr_start_block = tmp; diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index bb82e6b1ff4e..625c92521416 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -824,7 +824,7 @@ static void __exit exit_o2nm(void) static int __init init_o2nm(void) { - int ret = -1; + int ret; o2hb_init(); diff 
--git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 4960a6de768d..9b88219febb5 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2977,7 +2977,7 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { enum dlm_lockres_list idx; - struct list_head *queue = &res->granted; + struct list_head *queue; struct dlm_lock *lock; int noderef; u8 nodenum = O2NM_MAX_NODES; diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c index 90b8d300c1ee..de56e6231af8 100644 --- a/fs/ocfs2/filecheck.c +++ b/fs/ocfs2/filecheck.c @@ -326,11 +326,7 @@ static ssize_t ocfs2_filecheck_attr_show(struct kobject *kobj, ret = snprintf(buf + total, remain, "%lu\t\t%u\t%s\n", p->fe_ino, p->fe_done, ocfs2_filecheck_error(p->fe_status)); - if (ret < 0) { - total = ret; - break; - } - if (ret == remain) { + if (ret >= remain) { /* snprintf() didn't fit */ total = -E2BIG; break; diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index d50e8b8dfea4..16f1bfc407f2 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -500,11 +500,7 @@ static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj, list_for_each_entry(p, &ocfs2_stack_list, sp_list) { ret = snprintf(buf, remain, "%s\n", p->sp_name); - if (ret < 0) { - total = ret; - break; - } - if (ret == remain) { + if (ret >= remain) { /* snprintf() didn't fit */ total = -E2BIG; break; @@ -531,7 +527,7 @@ static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj, if (active_stack) { ret = snprintf(buf, PAGE_SIZE, "%s\n", active_stack->sp_name); - if (ret == PAGE_SIZE) + if (ret >= PAGE_SIZE) ret = -E2BIG; } spin_unlock(&ocfs2_stack_lock); diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 11e733aab25d..89725b15a64b 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -372,6 +372,7 @@ const struct inode_operations omfs_file_inops = { }; const struct address_space_operations omfs_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = 
omfs_readpage, .readahead = omfs_readahead, .writepage = omfs_writepage, diff --git a/fs/open.c b/fs/open.c index e53af13b5835..94bef26ff1b6 100644 --- a/fs/open.c +++ b/fs/open.c @@ -852,8 +852,17 @@ static int do_dentry_open(struct file *f, * XXX: Huge page cache doesn't support writing yet. Drop all page * cache for this file before processing writes. */ - if ((f->f_mode & FMODE_WRITE) && filemap_nr_thps(inode->i_mapping)) - truncate_pagecache(inode, 0); + if (f->f_mode & FMODE_WRITE) { + /* + * Paired with smp_mb() in collapse_file() to ensure nr_thps + * is up to date and the update to i_writecount by + * get_write_access() is visible. Ensures subsequent insertion + * of THPs into the page cache will fail. + */ + smp_mb(); + if (filemap_nr_thps(inode->i_mapping)) + truncate_pagecache(inode, 0); + } return 0; @@ -1002,12 +1011,20 @@ inline struct open_how build_open_how(int flags, umode_t mode) inline int build_open_flags(const struct open_how *how, struct open_flags *op) { - int flags = how->flags; + u64 flags = how->flags; + u64 strip = FMODE_NONOTIFY | O_CLOEXEC; int lookup_flags = 0; int acc_mode = ACC_MODE(flags); - /* Must never be set by userspace */ - flags &= ~(FMODE_NONOTIFY | O_CLOEXEC); + BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS), + "struct open_flags doesn't yet handle flags > 32 bits"); + + /* + * Strip flags that either shouldn't be set by userspace like + * FMODE_NONOTIFY or that aren't relevant in determining struct + * open_flags like O_CLOEXEC. 
+ */ + flags &= ~strip; /* * Older syscalls implicitly clear all of the invalid flags or argument @@ -1156,7 +1173,7 @@ struct file *filp_open(const char *filename, int flags, umode_t mode) } EXPORT_SYMBOL(filp_open); -struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, +struct file *file_open_root(const struct path *root, const char *filename, int flags, umode_t mode) { struct open_flags op; @@ -1164,7 +1181,7 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, int err = build_open_flags(&how, &op); if (err) return ERR_PTR(err); - return do_file_open_root(dentry, mnt, filename, &op); + return do_file_open_root(root, filename, &op); } EXPORT_SYMBOL(file_open_root); diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 6bf35a0d61f3..16ac617df7d7 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -249,8 +249,7 @@ static void orangefs_readahead(struct readahead_control *rac) { loff_t offset; struct iov_iter iter; - struct file *file = rac->file; - struct inode *inode = file->f_mapping->host; + struct inode *inode = rac->mapping->host; struct xarray *i_pages; struct page *page; loff_t new_start = readahead_pos(rac); @@ -269,14 +268,14 @@ static void orangefs_readahead(struct readahead_control *rac) readahead_expand(rac, new_start, new_len); offset = readahead_pos(rac); - i_pages = &file->f_mapping->i_pages; + i_pages = &rac->mapping->i_pages; iov_iter_xarray(&iter, READ, i_pages, offset, readahead_length(rac)); /* read in the pages. */ if ((ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &offset, &iter, readahead_length(rac), - inode->i_size, NULL, NULL, file)) < 0) + inode->i_size, NULL, NULL, rac->file)) < 0) gossip_debug(GOSSIP_FILE_DEBUG, "%s: wait_for_direct_io failed. 
\n", __func__); else diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index ee5efdc35cc1..2f2e430461b2 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -209,7 +209,7 @@ static int orangefs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = (sector_t) new_op->downcall.resp.statfs.blocks_avail; buf->f_files = (sector_t) new_op->downcall.resp.statfs.files_total; buf->f_ffree = (sector_t) new_op->downcall.resp.statfs.files_avail; - buf->f_frsize = sb->s_blocksize; + buf->f_frsize = 0; out_op_release: op_release(new_op); diff --git a/fs/proc/array.c b/fs/proc/array.c index 7ec59171f197..ee0ce8cecc4a 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -284,7 +284,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) collect_sigign_sigcatch(p, &ignored, &caught); num_threads = get_nr_threads(p); rcu_read_lock(); /* FIXME: is this correct? */ - qsize = atomic_read(&__task_cred(p)->user->sigpending); + qsize = get_ucounts_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING); rcu_read_unlock(); qlim = task_rlimit(p, RLIMIT_SIGPENDING); unlock_task_sighand(p, &flags); diff --git a/fs/proc/base.c b/fs/proc/base.c index 58bbf334265b..e5b5f7709d48 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -854,7 +854,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf, flags = FOLL_FORCE | (write ? 
FOLL_WRITE : 0); while (count > 0) { - int this_len = min_t(int, count, PAGE_SIZE); + size_t this_len = min_t(size_t, count, PAGE_SIZE); if (write && copy_from_user(page, buf, this_len)) { copied = -EFAULT; @@ -2674,6 +2674,13 @@ out: } #ifdef CONFIG_SECURITY +static int proc_pid_attr_open(struct inode *inode, struct file *file) +{ + file->private_data = NULL; + __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS); + return 0; +} + static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { @@ -2704,7 +2711,7 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, int rv; /* A task may only write when it was the opener. */ - if (file->f_cred != current_real_cred()) + if (file->private_data != current->mm) return -EPERM; rcu_read_lock(); @@ -2754,9 +2761,11 @@ out: } static const struct file_operations proc_pid_attr_operations = { + .open = proc_pid_attr_open, .read = proc_pid_attr_read, .write = proc_pid_attr_write, .llseek = generic_file_llseek, + .release = mem_release, }; #define LSM_DIR_OPS(LSM) \ @@ -3163,7 +3172,7 @@ static const struct pid_entry tgid_base_stuff[] = { DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations), - DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), + DIR("fdinfo", S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations), DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), @@ -3508,7 +3517,7 @@ static const struct inode_operations proc_tid_comm_inode_operations = { */ static const struct pid_entry tid_base_stuff[] = { DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), - DIR("fdinfo", 
S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), + DIR("fdinfo", S_IRUGO|S_IXUGO, proc_fdinfo_inode_operations, proc_fdinfo_operations), DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c index ad31ec4ad627..6d8d4bf20837 100644 --- a/fs/proc/bootconfig.c +++ b/fs/proc/bootconfig.c @@ -49,7 +49,7 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size) else q = '"'; ret = snprintf(dst, rest(dst, end), "%c%s%c%s", - q, val, q, vnode->next ? ", " : "\n"); + q, val, q, xbc_node_is_array(vnode) ? ", " : "\n"); if (ret < 0) goto out; dst += ret; diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 07fc4fad2602..172c86270b31 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -6,6 +6,7 @@ #include <linux/fdtable.h> #include <linux/namei.h> #include <linux/pid.h> +#include <linux/ptrace.h> #include <linux/security.h> #include <linux/file.h> #include <linux/seq_file.h> @@ -53,9 +54,10 @@ static int seq_show(struct seq_file *m, void *v) if (ret) return ret; - seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n", + seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\nino:\t%lu\n", (long long)file->f_pos, f_flags, - real_mount(file->f_path.mnt)->mnt_id); + real_mount(file->f_path.mnt)->mnt_id, + file_inode(file)->i_ino); /* show_fd_locks() never deferences files so a stale value is safe */ show_fd_locks(m, file, files); @@ -72,6 +74,18 @@ out: static int seq_fdinfo_open(struct inode *inode, struct file *file) { + bool allowed = false; + struct task_struct *task = get_proc_task(inode); + + if (!task) + return -ESRCH; + + allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); + put_task_struct(task); + + if (!allowed) + return -EACCES; + return single_open(file, seq_show, inode); } @@ -308,7 +322,7 @@ static struct dentry *proc_fdinfo_instantiate(struct dentry 
*dentry, struct proc_inode *ei; struct inode *inode; - inode = proc_pid_make_inode(dentry->d_sb, task, S_IFREG | S_IRUSR); + inode = proc_pid_make_inode(dentry->d_sb, task, S_IFREG | S_IRUGO); if (!inode) return ERR_PTR(-ENOENT); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 4d2e64e9016c..982e694aae77 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -313,6 +313,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) { char *buf = file->private_data; size_t phdrs_offset, notes_offset, data_offset; + size_t page_offline_frozen = 1; size_t phdrs_len, notes_len; struct kcore_list *m; size_t tsz; @@ -322,6 +323,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) int ret = 0; down_read(&kclist_lock); + /* + * Don't race against drivers that set PageOffline() and expect no + * further page access. + */ + page_offline_freeze(); get_kcore_size(&nphdr, &phdrs_len, ¬es_len, &data_offset); phdrs_offset = sizeof(struct elfhdr); @@ -380,11 +386,8 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) phdr->p_type = PT_LOAD; phdr->p_flags = PF_R | PF_W | PF_X; phdr->p_offset = kc_vaddr_to_offset(m->addr) + data_offset; - if (m->type == KCORE_REMAP) - phdr->p_vaddr = (size_t)m->vaddr; - else - phdr->p_vaddr = (size_t)m->addr; - if (m->type == KCORE_RAM || m->type == KCORE_REMAP) + phdr->p_vaddr = (size_t)m->addr; + if (m->type == KCORE_RAM) phdr->p_paddr = __pa(m->addr); else if (m->type == KCORE_TEXT) phdr->p_paddr = __pa_symbol(m->addr); @@ -468,6 +471,9 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) m = NULL; while (buflen) { + struct page *page; + unsigned long pfn; + /* * If this is the first iteration or the address is not within * the previous entry, search for a matching entry. 
@@ -480,31 +486,57 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) } } + if (page_offline_frozen++ % MAX_ORDER_NR_PAGES == 0) { + page_offline_thaw(); + cond_resched(); + page_offline_freeze(); + } + if (&m->list == &kclist_head) { if (clear_user(buffer, tsz)) { ret = -EFAULT; goto out; } m = NULL; /* skip the list anchor */ - } else if (!pfn_is_ram(__pa(start) >> PAGE_SHIFT)) { - if (clear_user(buffer, tsz)) { - ret = -EFAULT; - goto out; - } - } else if (m->type == KCORE_VMALLOC) { + goto skip; + } + + switch (m->type) { + case KCORE_VMALLOC: vread(buf, (char *)start, tsz); /* we have to zero-fill user buffer even if no read */ if (copy_to_user(buffer, buf, tsz)) { ret = -EFAULT; goto out; } - } else if (m->type == KCORE_USER) { + break; + case KCORE_USER: /* User page is handled prior to normal kernel page: */ if (copy_to_user(buffer, (char *)start, tsz)) { ret = -EFAULT; goto out; } - } else { + break; + case KCORE_RAM: + pfn = __pa(start) >> PAGE_SHIFT; + page = pfn_to_online_page(pfn); + + /* + * Don't read offline sections, logically offline pages + * (e.g., inflated in a balloon), hwpoisoned pages, + * and explicitly excluded physical ranges. 
+ */ + if (!page || PageOffline(page) || + is_page_hwpoison(page) || !pfn_is_ram(pfn)) { + if (clear_user(buffer, tsz)) { + ret = -EFAULT; + goto out; + } + break; + } + fallthrough; + case KCORE_VMEMMAP: + case KCORE_TEXT: if (kern_addr_valid(start)) { /* * Using bounce buffer to bypass the @@ -528,7 +560,15 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) goto out; } } + break; + default: + pr_warn_once("Unhandled KCORE type: %d\n", m->type); + if (clear_user(buffer, tsz)) { + ret = -EFAULT; + goto out; + } } +skip: buflen -= tsz; *fpos += tsz; buffer += tsz; @@ -537,6 +577,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) } out: + page_offline_thaw(); up_read(&kclist_lock); if (ret) return ret; diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index 8468baee951d..f32878d9a39f 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -16,7 +16,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v) get_avenrun(avnrun, FIXED_1/200, 0); - seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n", + seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %u/%d %d\n", LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]), LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index dea0f5ee540c..5d66faecd4ef 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1807,7 +1807,7 @@ static int process_sysctl_arg(char *param, char *val, panic("%s: Failed to allocate path for %s\n", __func__, param); strreplace(path, '.', '/'); - file = file_open_root((*proc_mnt)->mnt_root, *proc_mnt, path, O_WRONLY, 0); + file = file_open_root_mnt(*proc_mnt, path, O_WRONLY, 0); if (IS_ERR(file)) { err = PTR_ERR(file); if (err == -ENOENT) diff --git a/fs/proc/stat.c b/fs/proc/stat.c index f25e8531fd27..6561a06ef905 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -200,8 +200,8 @@ static int show_stat(struct seq_file *p, void *v) 
"\nctxt %llu\n" "btime %llu\n" "processes %lu\n" - "procs_running %lu\n" - "procs_blocked %lu\n", + "procs_running %u\n" + "procs_blocked %u\n", nr_context_switches(), (unsigned long long)boottime.tv_sec, total_forks, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index fc9784544b24..eb97468dfe4c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -514,10 +514,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } else { mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT; } - } else if (is_migration_entry(swpent)) - page = migration_entry_to_page(swpent); - else if (is_device_private_entry(swpent)) - page = device_private_entry_to_page(swpent); + } else if (is_pfn_swap_entry(swpent)) + page = pfn_swap_entry_to_page(swpent); } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap && pte_none(*pte))) { page = xa_load(&vma->vm_file->f_mapping->i_pages, @@ -549,7 +547,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, swp_entry_t entry = pmd_to_swp_entry(*pmd); if (is_migration_entry(entry)) - page = migration_entry_to_page(entry); + page = pfn_swap_entry_to_page(entry); } if (IS_ERR_OR_NULL(page)) return; @@ -694,10 +692,8 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, } else if (is_swap_pte(*pte)) { swp_entry_t swpent = pte_to_swp_entry(*pte); - if (is_migration_entry(swpent)) - page = migration_entry_to_page(swpent); - else if (is_device_private_entry(swpent)) - page = device_private_entry_to_page(swpent); + if (is_pfn_swap_entry(swpent)) + page = pfn_swap_entry_to_page(swpent); } if (page) { int mapcount = page_mapcount(page); @@ -832,7 +828,7 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); seq_printf(m, "THPeligible: %d\n", - transparent_hugepage_enabled(vma)); + transparent_hugepage_active(vma)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); @@ -1047,7 +1043,7 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, 
unsigned long addr, return false; if (!is_cow_mapping(vma->vm_flags)) return false; - if (likely(!atomic_read(&vma->vm_mm->has_pinned))) + if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))) return false; page = vm_normal_page(vma, addr, pte); if (!page) @@ -1302,6 +1298,7 @@ struct pagemapread { #define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0) #define PM_SOFT_DIRTY BIT_ULL(55) #define PM_MMAP_EXCLUSIVE BIT_ULL(56) +#define PM_UFFD_WP BIT_ULL(57) #define PM_FILE BIT_ULL(61) #define PM_SWAP BIT_ULL(62) #define PM_PRESENT BIT_ULL(63) @@ -1375,20 +1372,21 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, page = vm_normal_page(vma, addr, pte); if (pte_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; + if (pte_uffd_wp(pte)) + flags |= PM_UFFD_WP; } else if (is_swap_pte(pte)) { swp_entry_t entry; if (pte_swp_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; + if (pte_swp_uffd_wp(pte)) + flags |= PM_UFFD_WP; entry = pte_to_swp_entry(pte); if (pm->show_pfn) frame = swp_type(entry) | (swp_offset(entry) << MAX_SWAPFILES_SHIFT); flags |= PM_SWAP; - if (is_migration_entry(entry)) - page = migration_entry_to_page(entry); - - if (is_device_private_entry(entry)) - page = device_private_entry_to_page(entry); + if (is_pfn_swap_entry(entry)) + page = pfn_swap_entry_to_page(entry); } if (page && !PageAnon(page)) @@ -1426,6 +1424,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, flags |= PM_PRESENT; if (pmd_soft_dirty(pmd)) flags |= PM_SOFT_DIRTY; + if (pmd_uffd_wp(pmd)) + flags |= PM_UFFD_WP; if (pm->show_pfn) frame = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); @@ -1444,8 +1444,10 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, flags |= PM_SWAP; if (pmd_swp_soft_dirty(pmd)) flags |= PM_SOFT_DIRTY; + if (pmd_swp_uffd_wp(pmd)) + flags |= PM_UFFD_WP; VM_BUG_ON(!is_pmd_migration_entry(pmd)); - page = migration_entry_to_page(entry); + page = pfn_swap_entry_to_page(entry); } #endif diff --git 
a/fs/pstore/Kconfig b/fs/pstore/Kconfig index 8adabde685f1..328da35da390 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -173,6 +173,7 @@ config PSTORE_BLK tristate "Log panic/oops to a block device" depends on PSTORE depends on BLOCK + depends on BROKEN select PSTORE_ZONE default n help diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index 4bb8a344957a..04ce58c939a0 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -8,15 +8,16 @@ #include <linux/kernel.h> #include <linux/module.h> -#include "../../block/blk.h" #include <linux/blkdev.h> #include <linux/string.h> #include <linux/of.h> #include <linux/of_address.h> #include <linux/platform_device.h> #include <linux/pstore_blk.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/init_syscalls.h> #include <linux/mount.h> -#include <linux/uio.h> static long kmsg_size = CONFIG_PSTORE_BLK_KMSG_SIZE; module_param(kmsg_size, long, 0400); @@ -57,27 +58,7 @@ MODULE_PARM_DESC(best_effort, "use best effort to write (i.e. do not require sto /* * blkdev - the block device to use for pstore storage - * - * Usually, this will be a partition of a block device. - * - * blkdev accepts the following variants: - * 1) <hex_major><hex_minor> device number in hexadecimal representation, - * with no leading 0x, for example b302. - * 2) /dev/<disk_name> represents the device number of disk - * 3) /dev/<disk_name><decimal> represents the device number - * of partition - device number of disk plus the partition number - * 4) /dev/<disk_name>p<decimal> - same as the above, that form is - * used when disk name of partitioned disk ends on a digit. - * 5) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the - * unique id of a partition if the partition table provides it. 
- * The UUID may be either an EFI/GPT UUID, or refer to an MSDOS - * partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero- - * filled hex representation of the 32-bit "NT disk signature", and PP - * is a zero-filled hex representation of the 1-based partition number. - * 6) PARTUUID=<UUID>/PARTNROFF=<int> to select a partition in relation to - * a partition with a known unique id. - * 7) <major>:<minor> major and minor number of the device separated by - * a colon. + * See Documentation/admin-guide/pstore-blk.rst for details. */ static char blkdev[80] = CONFIG_PSTORE_BLK_BLKDEV; module_param_string(blkdev, blkdev, 80, 0400); @@ -88,14 +69,8 @@ MODULE_PARM_DESC(blkdev, "block device for pstore storage"); * during the register/unregister functions. */ static DEFINE_MUTEX(pstore_blk_lock); -static struct block_device *psblk_bdev; -static struct pstore_zone_info *pstore_zone_info; - -struct bdev_info { - dev_t devt; - sector_t nr_sects; - sector_t start_sect; -}; +static struct file *psblk_file; +static struct pstore_device_info *pstore_device_info; #define check_size(name, alignsize) ({ \ long _##name_ = (name); \ @@ -108,57 +83,63 @@ struct bdev_info { _##name_; \ }) +#define verify_size(name, alignsize, enabled) { \ + long _##name_; \ + if (enabled) \ + _##name_ = check_size(name, alignsize); \ + else \ + _##name_ = 0; \ + /* Synchronize module parameters with resuls. 
*/ \ + name = _##name_ / 1024; \ + dev->zone.name = _##name_; \ +} + static int __register_pstore_device(struct pstore_device_info *dev) { int ret; lockdep_assert_held(&pstore_blk_lock); - if (!dev || !dev->total_size || !dev->read || !dev->write) + if (!dev) { + pr_err("NULL device info\n"); + return -EINVAL; + } + if (!dev->zone.total_size) { + pr_err("zero sized device\n"); return -EINVAL; + } + if (!dev->zone.read) { + pr_err("no read handler for device\n"); + return -EINVAL; + } + if (!dev->zone.write) { + pr_err("no write handler for device\n"); + return -EINVAL; + } /* someone already registered before */ - if (pstore_zone_info) + if (pstore_device_info) return -EBUSY; - pstore_zone_info = kzalloc(sizeof(struct pstore_zone_info), GFP_KERNEL); - if (!pstore_zone_info) - return -ENOMEM; - /* zero means not limit on which backends to attempt to store. */ if (!dev->flags) dev->flags = UINT_MAX; -#define verify_size(name, alignsize, enabled) { \ - long _##name_; \ - if (enabled) \ - _##name_ = check_size(name, alignsize); \ - else \ - _##name_ = 0; \ - name = _##name_ / 1024; \ - pstore_zone_info->name = _##name_; \ - } - + /* Copy in module parameters. 
*/ verify_size(kmsg_size, 4096, dev->flags & PSTORE_FLAGS_DMESG); verify_size(pmsg_size, 4096, dev->flags & PSTORE_FLAGS_PMSG); verify_size(console_size, 4096, dev->flags & PSTORE_FLAGS_CONSOLE); verify_size(ftrace_size, 4096, dev->flags & PSTORE_FLAGS_FTRACE); -#undef verify_size - - pstore_zone_info->total_size = dev->total_size; - pstore_zone_info->max_reason = max_reason; - pstore_zone_info->read = dev->read; - pstore_zone_info->write = dev->write; - pstore_zone_info->erase = dev->erase; - pstore_zone_info->panic_write = dev->panic_write; - pstore_zone_info->name = KBUILD_MODNAME; - pstore_zone_info->owner = THIS_MODULE; - - ret = register_pstore_zone(pstore_zone_info); - if (ret) { - kfree(pstore_zone_info); - pstore_zone_info = NULL; - } + dev->zone.max_reason = max_reason; + + /* Initialize required zone ownership details. */ + dev->zone.name = KBUILD_MODNAME; + dev->zone.owner = THIS_MODULE; + + ret = register_pstore_zone(&dev->zone); + if (ret == 0) + pstore_device_info = dev; + return ret; } /** @@ -185,10 +166,9 @@ EXPORT_SYMBOL_GPL(register_pstore_device); static void __unregister_pstore_device(struct pstore_device_info *dev) { lockdep_assert_held(&pstore_blk_lock); - if (pstore_zone_info && pstore_zone_info->read == dev->read) { - unregister_pstore_zone(pstore_zone_info); - kfree(pstore_zone_info); - pstore_zone_info = NULL; + if (pstore_device_info && pstore_device_info == dev) { + unregister_pstore_zone(&dev->zone); + pstore_device_info = NULL; } } @@ -205,204 +185,59 @@ void unregister_pstore_device(struct pstore_device_info *dev) } EXPORT_SYMBOL_GPL(unregister_pstore_device); -/** - * psblk_get_bdev() - open block device - * - * @holder: Exclusive holder identifier - * @info: Information about bdev to fill in - * - * Return: pointer to block device on success and others on error. - * - * On success, the returned block_device has reference count of one. 
- */ -static struct block_device *psblk_get_bdev(void *holder, - struct bdev_info *info) -{ - struct block_device *bdev = ERR_PTR(-ENODEV); - fmode_t mode = FMODE_READ | FMODE_WRITE; - sector_t nr_sects; - - lockdep_assert_held(&pstore_blk_lock); - - if (pstore_zone_info) - return ERR_PTR(-EBUSY); - - if (!blkdev[0]) - return ERR_PTR(-ENODEV); - - if (holder) - mode |= FMODE_EXCL; - bdev = blkdev_get_by_path(blkdev, mode, holder); - if (IS_ERR(bdev)) { - dev_t devt; - - devt = name_to_dev_t(blkdev); - if (devt == 0) - return ERR_PTR(-ENODEV); - bdev = blkdev_get_by_dev(devt, mode, holder); - if (IS_ERR(bdev)) - return bdev; - } - - nr_sects = bdev_nr_sectors(bdev); - if (!nr_sects) { - pr_err("not enough space for '%s'\n", blkdev); - blkdev_put(bdev, mode); - return ERR_PTR(-ENOSPC); - } - - if (info) { - info->devt = bdev->bd_dev; - info->nr_sects = nr_sects; - info->start_sect = get_start_sect(bdev); - } - - return bdev; -} - -static void psblk_put_bdev(struct block_device *bdev, void *holder) -{ - fmode_t mode = FMODE_READ | FMODE_WRITE; - - lockdep_assert_held(&pstore_blk_lock); - - if (!bdev) - return; - - if (holder) - mode |= FMODE_EXCL; - blkdev_put(bdev, mode); -} - static ssize_t psblk_generic_blk_read(char *buf, size_t bytes, loff_t pos) { - struct block_device *bdev = psblk_bdev; - struct file file; - struct kiocb kiocb; - struct iov_iter iter; - struct kvec iov = {.iov_base = buf, .iov_len = bytes}; - - if (!bdev) - return -ENODEV; - - memset(&file, 0, sizeof(struct file)); - file.f_mapping = bdev->bd_inode->i_mapping; - file.f_flags = O_DSYNC | __O_SYNC | O_NOATIME; - file.f_inode = bdev->bd_inode; - file_ra_state_init(&file.f_ra, file.f_mapping); - - init_sync_kiocb(&kiocb, &file); - kiocb.ki_pos = pos; - iov_iter_kvec(&iter, READ, &iov, 1, bytes); - - return generic_file_read_iter(&kiocb, &iter); + return kernel_read(psblk_file, buf, bytes, &pos); } static ssize_t psblk_generic_blk_write(const char *buf, size_t bytes, loff_t pos) { - struct 
block_device *bdev = psblk_bdev; - struct iov_iter iter; - struct kiocb kiocb; - struct file file; - ssize_t ret; - struct kvec iov = {.iov_base = (void *)buf, .iov_len = bytes}; - - if (!bdev) - return -ENODEV; - /* Console/Ftrace backend may handle buffer until flush dirty zones */ if (in_interrupt() || irqs_disabled()) return -EBUSY; - - memset(&file, 0, sizeof(struct file)); - file.f_mapping = bdev->bd_inode->i_mapping; - file.f_flags = O_DSYNC | __O_SYNC | O_NOATIME; - file.f_inode = bdev->bd_inode; - - init_sync_kiocb(&kiocb, &file); - kiocb.ki_pos = pos; - iov_iter_kvec(&iter, WRITE, &iov, 1, bytes); - - inode_lock(bdev->bd_inode); - ret = generic_write_checks(&kiocb, &iter); - if (ret > 0) - ret = generic_perform_write(&file, &iter, pos); - inode_unlock(bdev->bd_inode); - - if (likely(ret > 0)) { - const struct file_operations f_op = {.fsync = blkdev_fsync}; - - file.f_op = &f_op; - kiocb.ki_pos += ret; - ret = generic_write_sync(&kiocb, ret); - } - return ret; + return kernel_write(psblk_file, buf, bytes, &pos); } /* * This takes its configuration only from the module parameters now. - * See psblk_get_bdev() and blkdev. 
*/ -static int __register_pstore_blk(void) +static int __register_pstore_blk(struct pstore_device_info *dev, + const char *devpath) { - char bdev_name[BDEVNAME_SIZE]; - struct block_device *bdev; - struct pstore_device_info dev; - struct bdev_info binfo; - void *holder = blkdev; + struct inode *inode; int ret = -ENODEV; lockdep_assert_held(&pstore_blk_lock); - /* hold bdev exclusively */ - memset(&binfo, 0, sizeof(binfo)); - bdev = psblk_get_bdev(holder, &binfo); - if (IS_ERR(bdev)) { - pr_err("failed to open '%s'!\n", blkdev); - return PTR_ERR(bdev); + psblk_file = filp_open(devpath, O_RDWR | O_DSYNC | O_NOATIME | O_EXCL, 0); + if (IS_ERR(psblk_file)) { + ret = PTR_ERR(psblk_file); + pr_err("failed to open '%s': %d!\n", devpath, ret); + goto err; } - /* only allow driver matching the @blkdev */ - if (!binfo.devt) { - pr_debug("no major\n"); - ret = -ENODEV; - goto err_put_bdev; + inode = file_inode(psblk_file); + if (!S_ISBLK(inode->i_mode)) { + pr_err("'%s' is not block device!\n", devpath); + goto err_fput; } - /* psblk_bdev must be assigned before register to pstore/blk */ - psblk_bdev = bdev; - - memset(&dev, 0, sizeof(dev)); - dev.total_size = binfo.nr_sects << SECTOR_SHIFT; - dev.read = psblk_generic_blk_read; - dev.write = psblk_generic_blk_write; + inode = I_BDEV(psblk_file->f_mapping->host)->bd_inode; + dev->zone.total_size = i_size_read(inode); - ret = __register_pstore_device(&dev); + ret = __register_pstore_device(dev); if (ret) - goto err_put_bdev; + goto err_fput; - bdevname(bdev, bdev_name); - pr_info("attached %s (no dedicated panic_write!)\n", bdev_name); return 0; -err_put_bdev: - psblk_bdev = NULL; - psblk_put_bdev(bdev, holder); - return ret; -} - -static void __unregister_pstore_blk(unsigned int major) -{ - struct pstore_device_info dev = { .read = psblk_generic_blk_read }; - void *holder = blkdev; +err_fput: + fput(psblk_file); +err: + psblk_file = NULL; - lockdep_assert_held(&pstore_blk_lock); - if (psblk_bdev && MAJOR(psblk_bdev->bd_dev) == 
major) { - __unregister_pstore_device(&dev); - psblk_put_bdev(psblk_bdev, holder); - psblk_bdev = NULL; - } + return ret; } /* get information of pstore/blk */ @@ -419,13 +254,93 @@ int pstore_blk_get_config(struct pstore_blk_config *info) } EXPORT_SYMBOL_GPL(pstore_blk_get_config); + +#ifndef MODULE +static const char devname[] = "/dev/pstore-blk"; +static __init const char *early_boot_devpath(const char *initial_devname) +{ + /* + * During early boot the real root file system hasn't been + * mounted yet, and no device nodes are present yet. Use the + * same scheme to find the device that we use for mounting + * the root file system. + */ + dev_t dev = name_to_dev_t(initial_devname); + + if (!dev) { + pr_err("failed to resolve '%s'!\n", initial_devname); + return initial_devname; + } + + init_unlink(devname); + init_mknod(devname, S_IFBLK | 0600, new_encode_dev(dev)); + + return devname; +} +#else +static inline const char *early_boot_devpath(const char *initial_devname) +{ + return initial_devname; +} +#endif + +static int __init __best_effort_init(void) +{ + struct pstore_device_info *best_effort_dev; + int ret; + + /* No best-effort mode requested. */ + if (!best_effort) + return 0; + + /* Reject an empty blkdev. */ + if (!blkdev[0]) { + pr_err("blkdev empty with best_effort=Y\n"); + return -EINVAL; + } + + best_effort_dev = kzalloc(sizeof(*best_effort_dev), GFP_KERNEL); + if (!best_effort_dev) + return -ENOMEM; + + best_effort_dev->zone.read = psblk_generic_blk_read; + best_effort_dev->zone.write = psblk_generic_blk_write; + + ret = __register_pstore_blk(best_effort_dev, + early_boot_devpath(blkdev)); + if (ret) + kfree(best_effort_dev); + else + pr_info("attached %s (%zu) (no dedicated panic_write!)\n", + blkdev, best_effort_dev->zone.total_size); + + return ret; +} + +static void __exit __best_effort_exit(void) +{ + /* + * Currently, the only user of psblk_file is best_effort, so + * we can assume that pstore_device_info is associated with it. 
+ * Once there are "real" blk devices, there will need to be a + * dedicated pstore_blk_info, etc. + */ + if (psblk_file) { + struct pstore_device_info *dev = pstore_device_info; + + __unregister_pstore_device(dev); + kfree(dev); + fput(psblk_file); + psblk_file = NULL; + } +} + static int __init pstore_blk_init(void) { - int ret = 0; + int ret; mutex_lock(&pstore_blk_lock); - if (!pstore_zone_info && best_effort && blkdev[0]) - ret = __register_pstore_blk(); + ret = __best_effort_init(); mutex_unlock(&pstore_blk_lock); return ret; @@ -435,15 +350,9 @@ late_initcall(pstore_blk_init); static void __exit pstore_blk_exit(void) { mutex_lock(&pstore_blk_lock); - if (psblk_bdev) - __unregister_pstore_blk(MAJOR(psblk_bdev->bd_dev)); - else { - struct pstore_device_info dev = { }; - - if (pstore_zone_info) - dev.read = pstore_zone_info->read; - __unregister_pstore_device(&dev); - } + __best_effort_exit(); + /* If we've been asked to unload, unregister any remaining device. */ + __unregister_pstore_device(pstore_device_info); mutex_unlock(&pstore_blk_lock); } module_exit(pstore_blk_exit); diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 05e4bd9ab6d6..2bcc9a6f1bfc 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -968,31 +968,30 @@ out: return ret; } -SYSCALL_DEFINE4(quotactl_path, unsigned int, cmd, const char __user *, - mountpoint, qid_t, id, void __user *, addr) +SYSCALL_DEFINE4(quotactl_fd, unsigned int, fd, unsigned int, cmd, + qid_t, id, void __user *, addr) { struct super_block *sb; - struct path mountpath; unsigned int cmds = cmd >> SUBCMDSHIFT; unsigned int type = cmd & SUBCMDMASK; + struct fd f; int ret; - if (type >= MAXQUOTAS) - return -EINVAL; + f = fdget_raw(fd); + if (!f.file) + return -EBADF; - ret = user_path_at(AT_FDCWD, mountpoint, - LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT, &mountpath); - if (ret) - return ret; - - sb = mountpath.mnt->mnt_sb; + ret = -EINVAL; + if (type >= MAXQUOTAS) + goto out; if (quotactl_cmd_write(cmds)) { - ret = 
mnt_want_write(mountpath.mnt); + ret = mnt_want_write(f.file->f_path.mnt); if (ret) goto out; } + sb = f.file->f_path.mnt->mnt_sb; if (quotactl_cmd_onoff(cmds)) down_write(&sb->s_umount); else @@ -1006,9 +1005,8 @@ SYSCALL_DEFINE4(quotactl_path, unsigned int, cmd, const char __user *, up_read(&sb->s_umount); if (quotactl_cmd_write(cmds)) - mnt_drop_write(mountpath.mnt); + mnt_drop_write(f.file->f_path.mnt); out: - path_put(&mountpath); - + fdput(f); return ret; } diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index c5562c871c8b..d3e995e1046f 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -47,15 +47,6 @@ static int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info) / info->dqi_entry_size; } -static char *getdqbuf(size_t size) -{ - char *buf = kmalloc(size, GFP_NOFS); - if (!buf) - printk(KERN_WARNING - "VFS: Not enough memory for quota buffers.\n"); - return buf; -} - static ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) { struct super_block *sb = info->dqi_sb; @@ -83,7 +74,7 @@ static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) /* Remove empty block from list and return it */ static int get_free_dqblk(struct qtree_mem_dqinfo *info) { - char *buf = getdqbuf(info->dqi_usable_bs); + char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS); struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; int ret, blk; @@ -132,7 +123,7 @@ static int put_free_dqblk(struct qtree_mem_dqinfo *info, char *buf, uint blk) static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf, uint blk) { - char *tmpbuf = getdqbuf(info->dqi_usable_bs); + char *tmpbuf = kmalloc(info->dqi_usable_bs, GFP_NOFS); struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; uint nextblk = le32_to_cpu(dh->dqdh_next_free); uint prevblk = le32_to_cpu(dh->dqdh_prev_free); @@ -179,7 +170,7 @@ out_buf: static int insert_free_dqentry(struct qtree_mem_dqinfo *info, char *buf, uint blk) { - char *tmpbuf = 
getdqbuf(info->dqi_usable_bs); + char *tmpbuf = kmalloc(info->dqi_usable_bs, GFP_NOFS); struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; int err; @@ -227,7 +218,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info, { uint blk, i; struct qt_disk_dqdbheader *dh; - char *buf = getdqbuf(info->dqi_usable_bs); + char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS); char *ddquot; *err = 0; @@ -298,7 +289,7 @@ out_buf: static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, uint *treeblk, int depth) { - char *buf = getdqbuf(info->dqi_usable_bs); + char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS); int ret = 0, newson = 0, newact = 0; __le32 *ref; uint newblk; @@ -375,7 +366,7 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) int type = dquot->dq_id.type; struct super_block *sb = dquot->dq_sb; ssize_t ret; - char *ddquot = getdqbuf(info->dqi_entry_size); + char *ddquot = kmalloc(info->dqi_entry_size, GFP_NOFS); if (!ddquot) return -ENOMEM; @@ -414,7 +405,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, uint blk) { struct qt_disk_dqdbheader *dh; - char *buf = getdqbuf(info->dqi_usable_bs); + char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS); int ret = 0; if (!buf) @@ -474,7 +465,7 @@ out_buf: static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, uint *blk, int depth) { - char *buf = getdqbuf(info->dqi_usable_bs); + char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS); int ret = 0; uint newblk; __le32 *ref = (__le32 *)buf; @@ -533,7 +524,7 @@ EXPORT_SYMBOL(qtree_delete_dquot); static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, uint blk) { - char *buf = getdqbuf(info->dqi_usable_bs); + char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS); loff_t ret = 0; int i; char *ddquot; @@ -571,7 +562,7 @@ out_buf: static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, uint blk, int depth) { - char 
*buf = getdqbuf(info->dqi_usable_bs); + char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS); loff_t ret = 0; __le32 *ref = (__le32 *)buf; @@ -635,7 +626,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) } dquot->dq_off = offset; } - ddquot = getdqbuf(info->dqi_entry_size); + ddquot = kmalloc(info->dqi_entry_size, GFP_NOFS); if (!ddquot) return -ENOMEM; ret = sb->s_op->quota_read(sb, type, ddquot, info->dqi_entry_size, @@ -679,7 +670,7 @@ EXPORT_SYMBOL(qtree_release_dquot); static int find_next_id(struct qtree_mem_dqinfo *info, qid_t *id, unsigned int blk, int depth) { - char *buf = getdqbuf(info->dqi_usable_bs); + char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS); __le32 *ref = (__le32 *)buf; ssize_t ret; unsigned int epb = info->dqi_usable_bs >> 2; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 9ebd17d7befb..65e7e56005b8 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -53,13 +53,6 @@ struct ramfs_fs_info { static const struct super_operations ramfs_ops; static const struct inode_operations ramfs_dir_inode_operations; -static const struct address_space_operations ramfs_aops = { - .readpage = simple_readpage, - .write_begin = simple_write_begin, - .write_end = simple_write_end, - .set_page_dirty = __set_page_dirty_no_writeback, -}; - struct inode *ramfs_get_inode(struct super_block *sb, const struct inode *dir, umode_t mode, dev_t dev) { @@ -68,7 +61,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, if (inode) { inode->i_ino = get_next_ino(); inode_init_owner(&init_user_ns, inode, dir, mode); - inode->i_mapping->a_ops = &ramfs_aops; + inode->i_mapping->a_ops = &ram_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 780bb90c1804..f49b72ccac4c 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2584,9 +2584,7 @@ static int 
reiserfs_write_full_page(struct page *page, clear_buffer_dirty(bh); set_buffer_uptodate(bh); } else if ((checked || buffer_dirty(bh)) && - (!buffer_mapped(bh) || (buffer_mapped(bh) - && bh->b_blocknr == - 0))) { + (!buffer_mapped(bh) || bh->b_blocknr == 0)) { /* * not mapped yet, or it points to a direct item, search * the btree for the mapping info, and log any direct diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 9edc8e2b154e..0834b101c316 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2758,6 +2758,20 @@ int journal_init(struct super_block *sb, const char *j_dev_name, goto free_and_return; } + /* + * Sanity check to see if journal first block is correct. + * If journal first block is invalid it can cause + * zeroing important superblock members. + */ + if (!SB_ONDISK_JOURNAL_DEVICE(sb) && + SB_ONDISK_JOURNAL_1st_BLOCK(sb) < SB_JOURNAL_1st_RESERVED_BLOCK(sb)) { + reiserfs_warning(sb, "journal-1393", + "journal 1st super block is invalid: 1st reserved block %d, but actual 1st block is %d", + SB_JOURNAL_1st_RESERVED_BLOCK(sb), + SB_ONDISK_JOURNAL_1st_BLOCK(sb)); + goto free_and_return; + } + if (journal_init_dev(sb, journal, j_dev_name) != 0) { reiserfs_warning(sb, "sh-462", "unable to initialize journal device"); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 017db70d0f48..3d7a35d6a18b 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -132,6 +132,7 @@ int search_by_entry_key(struct super_block *sb, const struct cpu_key *key, return IO_ERROR; } PATH_LAST_POSITION(path)--; + break; case ITEM_FOUND: break; diff --git a/fs/seq_file.c b/fs/seq_file.c index 5059248f2d64..b117b212ef28 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -356,6 +356,31 @@ int seq_release(struct inode *inode, struct file *file) EXPORT_SYMBOL(seq_release); /** + * seq_escape_mem - print data into buffer, escaping some characters + * @m: target buffer + * @src: source buffer + * @len: size of source buffer + * @flags: flags to pass 
to string_escape_mem() + * @esc: set of characters that need escaping + * + * Puts data into buffer, replacing each occurrence of character from + * given class (defined by @flags and @esc) with printable escaped sequence. + * + * Use seq_has_overflowed() to check for errors. + */ +void seq_escape_mem(struct seq_file *m, const char *src, size_t len, + unsigned int flags, const char *esc) +{ + char *buf; + size_t size = seq_get_buf(m, &buf); + int ret; + + ret = string_escape_mem(src, len, buf, size, flags, esc); + seq_commit(m, ret < size ? ret : -1); +} +EXPORT_SYMBOL(seq_escape_mem); + +/** * seq_escape - print string into buffer, escaping some characters * @m: target buffer * @s: string @@ -367,26 +392,10 @@ EXPORT_SYMBOL(seq_release); */ void seq_escape(struct seq_file *m, const char *s, const char *esc) { - char *buf; - size_t size = seq_get_buf(m, &buf); - int ret; - - ret = string_escape_str(s, buf, size, ESCAPE_OCTAL, esc); - seq_commit(m, ret < size ? ret : -1); + seq_escape_str(m, s, ESCAPE_OCTAL, esc); } EXPORT_SYMBOL(seq_escape); -void seq_escape_mem_ascii(struct seq_file *m, const char *src, size_t isz) -{ - char *buf; - size_t size = seq_get_buf(m, &buf); - int ret; - - ret = string_escape_mem_ascii(src, isz, buf, size); - seq_commit(m, ret < size ? 
ret : -1); -} -EXPORT_SYMBOL(seq_escape_mem_ascii); - void seq_vprintf(struct seq_file *m, const char *f, va_list args) { int len; diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index b9e87ebb1060..855f0e87066d 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -226,8 +226,11 @@ out_free_bio: bio_free_pages(bio); bio_put(bio); out: - if (res < 0) + if (res < 0) { ERROR("Failed to read block 0x%llx: %d\n", index, res); + if (msblk->panic_on_errors) + panic("squashfs read failed"); + } return res; } diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 166e98806265..1e90c2575f9b 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -65,5 +65,6 @@ struct squashfs_sb_info { unsigned int fragments; int xattr_ids; unsigned int ids; + bool panic_on_errors; }; #endif diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 88cc94be1076..60d6951915f4 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -18,9 +18,11 @@ #include <linux/fs.h> #include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/vfs.h> #include <linux/slab.h> #include <linux/mutex.h> +#include <linux/seq_file.h> #include <linux/pagemap.h> #include <linux/init.h> #include <linux/module.h> @@ -37,6 +39,51 @@ static struct file_system_type squashfs_fs_type; static const struct super_operations squashfs_super_ops; +enum Opt_errors { + Opt_errors_continue, + Opt_errors_panic, +}; + +enum squashfs_param { + Opt_errors, +}; + +struct squashfs_mount_opts { + enum Opt_errors errors; +}; + +static const struct constant_table squashfs_param_errors[] = { + {"continue", Opt_errors_continue }, + {"panic", Opt_errors_panic }, + {} +}; + +static const struct fs_parameter_spec squashfs_fs_parameters[] = { + fsparam_enum("errors", Opt_errors, squashfs_param_errors), + {} +}; + +static int squashfs_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct squashfs_mount_opts *opts = fc->fs_private; + struct 
fs_parse_result result; + int opt; + + opt = fs_parse(fc, squashfs_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_errors: + opts->errors = result.uint_32; + break; + default: + return -EINVAL; + } + + return 0; +} + static const struct squashfs_decompressor *supported_squashfs_filesystem( struct fs_context *fc, short major, short minor, short id) @@ -67,6 +114,7 @@ static const struct squashfs_decompressor *supported_squashfs_filesystem( static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) { + struct squashfs_mount_opts *opts = fc->fs_private; struct squashfs_sb_info *msblk; struct squashfs_super_block *sblk = NULL; struct inode *root; @@ -85,6 +133,8 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) } msblk = sb->s_fs_info; + msblk->panic_on_errors = (opts->errors == Opt_errors_panic); + msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE); msblk->devblksize_log2 = ffz(~msblk->devblksize); @@ -350,18 +400,52 @@ static int squashfs_get_tree(struct fs_context *fc) static int squashfs_reconfigure(struct fs_context *fc) { + struct super_block *sb = fc->root->d_sb; + struct squashfs_sb_info *msblk = sb->s_fs_info; + struct squashfs_mount_opts *opts = fc->fs_private; + sync_filesystem(fc->root->d_sb); fc->sb_flags |= SB_RDONLY; + + msblk->panic_on_errors = (opts->errors == Opt_errors_panic); + return 0; } +static void squashfs_free_fs_context(struct fs_context *fc) +{ + kfree(fc->fs_private); +} + static const struct fs_context_operations squashfs_context_ops = { .get_tree = squashfs_get_tree, + .free = squashfs_free_fs_context, + .parse_param = squashfs_parse_param, .reconfigure = squashfs_reconfigure, }; +static int squashfs_show_options(struct seq_file *s, struct dentry *root) +{ + struct super_block *sb = root->d_sb; + struct squashfs_sb_info *msblk = sb->s_fs_info; + + if (msblk->panic_on_errors) + seq_puts(s, ",errors=panic"); + else + seq_puts(s, 
",errors=continue"); + + return 0; +} + static int squashfs_init_fs_context(struct fs_context *fc) { + struct squashfs_mount_opts *opts; + + opts = kzalloc(sizeof(*opts), GFP_KERNEL); + if (!opts) + return -ENOMEM; + + fc->fs_private = opts; fc->ops = &squashfs_context_ops; return 0; } @@ -481,6 +565,7 @@ static struct file_system_type squashfs_fs_type = { .owner = THIS_MODULE, .name = "squashfs", .init_fs_context = squashfs_init_fs_context, + .parameters = squashfs_fs_parameters, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV }; @@ -491,6 +576,7 @@ static const struct super_operations squashfs_super_ops = { .free_inode = squashfs_free_inode, .statfs = squashfs_statfs, .put_super = squashfs_put_super, + .show_options = squashfs_show_options, }; module_init(init_squashfs_fs); diff --git a/fs/super.c b/fs/super.c index 11b7e7213fd1..91b7f156735b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1277,9 +1277,9 @@ int get_tree_bdev(struct fs_context *fc, } /* - * s_umount nests inside bd_mutex during + * s_umount nests inside open_mutex during * __invalidate_device(). blkdev_put() acquires - * bd_mutex and can't be called under s_umount. Drop + * open_mutex and can't be called under s_umount. Drop * s_umount temporarily. This is safe as we're * holding an active reference. */ @@ -1352,9 +1352,9 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, } /* - * s_umount nests inside bd_mutex during + * s_umount nests inside open_mutex during * __invalidate_device(). blkdev_put() acquires - * bd_mutex and can't be called under s_umount. Drop + * open_mutex and can't be called under s_umount. Drop * s_umount temporarily. This is safe as we're * holding an active reference. 
*/ diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index 8b2e99b7bc9f..749385015a8d 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -495,6 +495,7 @@ static sector_t sysv_bmap(struct address_space *mapping, sector_t block) } const struct address_space_operations sysv_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = sysv_readpage, .writepage = sysv_writepage, .write_begin = sysv_write_begin, diff --git a/fs/udf/file.c b/fs/udf/file.c index 2846dcd92197..1baff8ddb754 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -125,6 +125,7 @@ static int udf_adinicb_write_end(struct file *file, struct address_space *mappin } const struct address_space_operations udf_adinicb_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = udf_adinicb_readpage, .writepage = udf_adinicb_writepage, .write_begin = udf_adinicb_write_begin, diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 0dd2f93ac048..4917670860a0 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -235,6 +235,7 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block) } const struct address_space_operations udf_aops = { + .set_page_dirty = __set_page_dirty_buffers, .readpage = udf_readpage, .readahead = udf_readahead, .writepage = udf_writepage, diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 3ae9f1e91984..7c7c9bbbfa57 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -934,6 +934,10 @@ static int udf_symlink(struct user_namespace *mnt_userns, struct inode *dir, iinfo->i_location.partitionReferenceNum, 0); epos.bh = udf_tgetblk(sb, block); + if (unlikely(!epos.bh)) { + err = -ENOMEM; + goto out_no_entry; + } lock_buffer(epos.bh); memset(epos.bh->b_data, 0x00, bsize); set_buffer_uptodate(epos.bh); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index debc282c1bb4..ac628de69601 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -526,6 +526,7 @@ static sector_t ufs_bmap(struct address_space *mapping, sector_t block) } const struct address_space_operations ufs_aops = { + 
.set_page_dirty = __set_page_dirty_buffers, .readpage = ufs_readpage, .writepage = ufs_writepage, .write_begin = ufs_write_begin, diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 14f92285d04f..f6e0f0c0d0e5 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -337,7 +337,7 @@ out: return ret; } -static inline long userfaultfd_get_blocking_state(unsigned int flags) +static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags) { if (flags & FAULT_FLAG_INTERRUPTIBLE) return TASK_INTERRUPTIBLE; @@ -370,7 +370,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) struct userfaultfd_wait_queue uwq; vm_fault_t ret = VM_FAULT_SIGBUS; bool must_wait; - long blocking_state; + unsigned int blocking_state; /* * We don't do userfault handling for the final child pid update. @@ -1267,8 +1267,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, } if (vm_flags & VM_UFFD_MINOR) { - /* FIXME: Add minor fault interception for shmem. */ - if (!is_vm_hugetlb_page(vma)) + if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma))) return false; } @@ -1304,8 +1303,12 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vm_flags = 0; if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING) vm_flags |= VM_UFFD_MISSING; - if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) + if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) { +#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP + goto out; +#endif vm_flags |= VM_UFFD_WP; + } if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) { #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR goto out; @@ -1941,7 +1944,11 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, /* report all available features and ioctls to userland */ uffdio_api.features = UFFD_API_FEATURES; #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR - uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS; + uffdio_api.features &= + ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM); +#endif +#ifndef 
CONFIG_HAVE_ARCH_USERFAULTFD_WP + uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP; #endif uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index c68a36688474..778ec52cce70 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -27,6 +27,276 @@ #include "xfs_defer.h" #include "xfs_log_format.h" #include "xfs_trans.h" +#include "xfs_trace.h" +#include "xfs_inode.h" +#include "xfs_icache.h" + + +/* + * Passive reference counting access wrappers to the perag structures. If the + * per-ag structure is to be freed, the freeing code is responsible for cleaning + * up objects with passive references before freeing the structure. This is + * things like cached buffers. + */ +struct xfs_perag * +xfs_perag_get( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + int ref = 0; + + rcu_read_lock(); + pag = radix_tree_lookup(&mp->m_perag_tree, agno); + if (pag) { + ASSERT(atomic_read(&pag->pag_ref) >= 0); + ref = atomic_inc_return(&pag->pag_ref); + } + rcu_read_unlock(); + trace_xfs_perag_get(mp, agno, ref, _RET_IP_); + return pag; +} + +/* + * search from @first to find the next perag with the given tag set. 
+ */ +struct xfs_perag * +xfs_perag_get_tag( + struct xfs_mount *mp, + xfs_agnumber_t first, + unsigned int tag) +{ + struct xfs_perag *pag; + int found; + int ref; + + rcu_read_lock(); + found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, + (void **)&pag, first, 1, tag); + if (found <= 0) { + rcu_read_unlock(); + return NULL; + } + ref = atomic_inc_return(&pag->pag_ref); + rcu_read_unlock(); + trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_); + return pag; +} + +void +xfs_perag_put( + struct xfs_perag *pag) +{ + int ref; + + ASSERT(atomic_read(&pag->pag_ref) > 0); + ref = atomic_dec_return(&pag->pag_ref); + trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_); +} + +/* + * xfs_initialize_perag_data + * + * Read in each per-ag structure so we can count up the number of + * allocated inodes, free inodes and used filesystem blocks as this + * information is no longer persistent in the superblock. Once we have + * this information, write it into the in-core superblock structure. + */ +int +xfs_initialize_perag_data( + struct xfs_mount *mp, + xfs_agnumber_t agcount) +{ + xfs_agnumber_t index; + struct xfs_perag *pag; + struct xfs_sb *sbp = &mp->m_sb; + uint64_t ifree = 0; + uint64_t ialloc = 0; + uint64_t bfree = 0; + uint64_t bfreelst = 0; + uint64_t btree = 0; + uint64_t fdblocks; + int error = 0; + + for (index = 0; index < agcount; index++) { + /* + * read the agf, then the agi. This gets us + * all the information we need and populates the + * per-ag structures for us. 
+ */ + error = xfs_alloc_pagf_init(mp, NULL, index, 0); + if (error) + return error; + + error = xfs_ialloc_pagi_init(mp, NULL, index); + if (error) + return error; + pag = xfs_perag_get(mp, index); + ifree += pag->pagi_freecount; + ialloc += pag->pagi_count; + bfree += pag->pagf_freeblks; + bfreelst += pag->pagf_flcount; + btree += pag->pagf_btreeblks; + xfs_perag_put(pag); + } + fdblocks = bfree + bfreelst + btree; + + /* + * If the new summary counts are obviously incorrect, fail the + * mount operation because that implies the AGFs are also corrupt. + * Clear FS_COUNTERS so that we don't unmount with a dirty log, which + * will prevent xfs_repair from fixing anything. + */ + if (fdblocks > sbp->sb_dblocks || ifree > ialloc) { + xfs_alert(mp, "AGF corruption. Please run xfs_repair."); + error = -EFSCORRUPTED; + goto out; + } + + /* Overwrite incore superblock counters with just-read data */ + spin_lock(&mp->m_sb_lock); + sbp->sb_ifree = ifree; + sbp->sb_icount = ialloc; + sbp->sb_fdblocks = fdblocks; + spin_unlock(&mp->m_sb_lock); + + xfs_reinit_percpu_counters(mp); +out: + xfs_fs_mark_healthy(mp, XFS_SICK_FS_COUNTERS); + return error; +} + +STATIC void +__xfs_free_perag( + struct rcu_head *head) +{ + struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); + + ASSERT(!delayed_work_pending(&pag->pag_blockgc_work)); + ASSERT(atomic_read(&pag->pag_ref) == 0); + kmem_free(pag); +} + +/* + * Free up the per-ag resources associated with the mount structure. 
+ */ +void +xfs_free_perag( + struct xfs_mount *mp) +{ + struct xfs_perag *pag; + xfs_agnumber_t agno; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + spin_lock(&mp->m_perag_lock); + pag = radix_tree_delete(&mp->m_perag_tree, agno); + spin_unlock(&mp->m_perag_lock); + ASSERT(pag); + ASSERT(atomic_read(&pag->pag_ref) == 0); + + cancel_delayed_work_sync(&pag->pag_blockgc_work); + xfs_iunlink_destroy(pag); + xfs_buf_hash_destroy(pag); + + call_rcu(&pag->rcu_head, __xfs_free_perag); + } +} + +int +xfs_initialize_perag( + struct xfs_mount *mp, + xfs_agnumber_t agcount, + xfs_agnumber_t *maxagi) +{ + struct xfs_perag *pag; + xfs_agnumber_t index; + xfs_agnumber_t first_initialised = NULLAGNUMBER; + int error; + + /* + * Walk the current per-ag tree so we don't try to initialise AGs + * that already exist (growfs case). Allocate and insert all the + * AGs we don't find ready for initialisation. + */ + for (index = 0; index < agcount; index++) { + pag = xfs_perag_get(mp, index); + if (pag) { + xfs_perag_put(pag); + continue; + } + + pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); + if (!pag) { + error = -ENOMEM; + goto out_unwind_new_pags; + } + pag->pag_agno = index; + pag->pag_mount = mp; + + error = radix_tree_preload(GFP_NOFS); + if (error) + goto out_free_pag; + + spin_lock(&mp->m_perag_lock); + if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { + WARN_ON_ONCE(1); + spin_unlock(&mp->m_perag_lock); + radix_tree_preload_end(); + error = -EEXIST; + goto out_free_pag; + } + spin_unlock(&mp->m_perag_lock); + radix_tree_preload_end(); + + /* Place kernel structure only init below this point. 
*/ + spin_lock_init(&pag->pag_ici_lock); + spin_lock_init(&pag->pagb_lock); + spin_lock_init(&pag->pag_state_lock); + INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker); + INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); + init_waitqueue_head(&pag->pagb_wait); + pag->pagb_count = 0; + pag->pagb_tree = RB_ROOT; + + error = xfs_buf_hash_init(pag); + if (error) + goto out_remove_pag; + + error = xfs_iunlink_init(pag); + if (error) + goto out_hash_destroy; + + /* first new pag is fully initialized */ + if (first_initialised == NULLAGNUMBER) + first_initialised = index; + } + + index = xfs_set_inode_alloc(mp, agcount); + + if (maxagi) + *maxagi = index; + + mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp); + return 0; + +out_hash_destroy: + xfs_buf_hash_destroy(pag); +out_remove_pag: + radix_tree_delete(&mp->m_perag_tree, index); +out_free_pag: + kmem_free(pag); +out_unwind_new_pags: + /* unwind any prior newly initialized pags */ + for (index = first_initialised; index < agcount; index++) { + pag = radix_tree_delete(&mp->m_perag_tree, index); + if (!pag) + break; + xfs_buf_hash_destroy(pag); + xfs_iunlink_destroy(pag); + kmem_free(pag); + } + return error; +} static int xfs_get_aghdr_buf( @@ -43,7 +313,6 @@ xfs_get_aghdr_buf( if (error) return error; - xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); bp->b_bn = blkno; bp->b_maps[0].bm_bn = blkno; bp->b_ops = ops; @@ -510,6 +779,7 @@ xfs_ag_shrink_space( struct xfs_buf *agibp, *agfbp; struct xfs_agi *agi; struct xfs_agf *agf; + xfs_agblock_t aglen; int error, err2; ASSERT(agno == mp->m_sb.sb_agcount - 1); @@ -524,14 +794,14 @@ xfs_ag_shrink_space( return error; agf = agfbp->b_addr; + aglen = be32_to_cpu(agi->agi_length); /* some extra paranoid checks before we shrink the ag */ if (XFS_IS_CORRUPT(mp, agf->agf_length != agi->agi_length)) return -EFSCORRUPTED; - if (delta >= agi->agi_length) + if (delta >= aglen) return -EINVAL; - args.fsbno = XFS_AGB_TO_FSB(mp, agno, - be32_to_cpu(agi->agi_length) - delta); + 
args.fsbno = XFS_AGB_TO_FSB(mp, agno, aglen - delta); /* * Disable perag reservations so it doesn't cause the allocation request @@ -646,7 +916,7 @@ xfs_ag_extend_space( * XFS_RMAP_OINFO_SKIP_UPDATE is used here to tell the rmap btree that * this doesn't actually exist in the rmap btree. */ - error = xfs_rmap_free(tp, bp, id->agno, + error = xfs_rmap_free(tp, bp, bp->b_pag, be32_to_cpu(agf->agf_length) - len, len, &XFS_RMAP_OINFO_SKIP_UPDATE); if (error) diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 4535de1d88ea..4c6f9045baca 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -9,6 +9,142 @@ struct xfs_mount; struct xfs_trans; +struct xfs_perag; + +/* + * Per-ag infrastructure + */ + +/* per-AG block reservation data structures*/ +struct xfs_ag_resv { + /* number of blocks originally reserved here */ + xfs_extlen_t ar_orig_reserved; + /* number of blocks reserved here */ + xfs_extlen_t ar_reserved; + /* number of blocks originally asked for */ + xfs_extlen_t ar_asked; +}; + +/* + * Per-ag incore structure, copies of information in agf and agi, to improve the + * performance of allocation group selection. 
+ */ +struct xfs_perag { + struct xfs_mount *pag_mount; /* owner filesystem */ + xfs_agnumber_t pag_agno; /* AG this structure belongs to */ + atomic_t pag_ref; /* perag reference count */ + char pagf_init; /* this agf's entry is initialized */ + char pagi_init; /* this agi's entry is initialized */ + char pagf_metadata; /* the agf is preferred to be metadata */ + char pagi_inodeok; /* The agi is ok for inodes */ + uint8_t pagf_levels[XFS_BTNUM_AGF]; + /* # of levels in bno & cnt btree */ + bool pagf_agflreset; /* agfl requires reset before use */ + uint32_t pagf_flcount; /* count of blocks in freelist */ + xfs_extlen_t pagf_freeblks; /* total free blocks */ + xfs_extlen_t pagf_longest; /* longest free space */ + uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */ + xfs_agino_t pagi_freecount; /* number of free inodes */ + xfs_agino_t pagi_count; /* number of allocated inodes */ + + /* + * Inode allocation search lookup optimisation. + * If the pagino matches, the search for new inodes + * doesn't need to search the near ones again straight away + */ + xfs_agino_t pagl_pagino; + xfs_agino_t pagl_leftrec; + xfs_agino_t pagl_rightrec; + + int pagb_count; /* pagb slots in use */ + uint8_t pagf_refcount_level; /* recount btree height */ + + /* Blocks reserved for all kinds of metadata. */ + struct xfs_ag_resv pag_meta_resv; + /* Blocks reserved for the reverse mapping btree. */ + struct xfs_ag_resv pag_rmapbt_resv; + + /* -- kernel only structures below this line -- */ + + /* + * Bitsets of per-ag metadata that have been checked and/or are sick. + * Callers should hold pag_state_lock before accessing this field. 
+ */ + uint16_t pag_checked; + uint16_t pag_sick; + spinlock_t pag_state_lock; + + spinlock_t pagb_lock; /* lock for pagb_tree */ + struct rb_root pagb_tree; /* ordered tree of busy extents */ + unsigned int pagb_gen; /* generation count for pagb_tree */ + wait_queue_head_t pagb_wait; /* woken when pagb_gen changes */ + + atomic_t pagf_fstrms; /* # of filestreams active in this AG */ + + spinlock_t pag_ici_lock; /* incore inode cache lock */ + struct radix_tree_root pag_ici_root; /* incore inode cache root */ + int pag_ici_reclaimable; /* reclaimable inodes */ + unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ + + /* buffer cache index */ + spinlock_t pag_buf_lock; /* lock for pag_buf_hash */ + struct rhashtable pag_buf_hash; + + /* for rcu-safe freeing */ + struct rcu_head rcu_head; + + /* background prealloc block trimming */ + struct delayed_work pag_blockgc_work; + + /* + * Unlinked inode information. This incore information reflects + * data stored in the AGI, so callers must hold the AGI buffer lock + * or have some other means to control concurrency. + */ + struct rhashtable pagi_unlinked_hash; +}; + +int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount, + xfs_agnumber_t *maxagi); +int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno); +void xfs_free_perag(struct xfs_mount *mp); + +struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno); +struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno, + unsigned int tag); +void xfs_perag_put(struct xfs_perag *pag); + +/* + * Perag iteration APIs + * + * XXX: for_each_perag_range() usage really needs an iterator to clean up when + * we terminate at end_agno because we may have taken a reference to the perag + * beyond end_agno. Right now callers have to be careful to catch and clean that + * up themselves. 
This is not necessary for the callers of for_each_perag() and + * for_each_perag_from() because they terminate at sb_agcount where there are + * no perag structures in tree beyond end_agno. + */ +#define for_each_perag_range(mp, next_agno, end_agno, pag) \ + for ((pag) = xfs_perag_get((mp), (next_agno)); \ + (pag) != NULL && (next_agno) <= (end_agno); \ + (next_agno) = (pag)->pag_agno + 1, \ + xfs_perag_put(pag), \ + (pag) = xfs_perag_get((mp), (next_agno))) + +#define for_each_perag_from(mp, next_agno, pag) \ + for_each_perag_range((mp), (next_agno), (mp)->m_sb.sb_agcount, (pag)) + + +#define for_each_perag(mp, agno, pag) \ + (agno) = 0; \ + for_each_perag_from((mp), (agno), (pag)) + +#define for_each_perag_tag(mp, agno, pag, tag) \ + for ((agno) = 0, (pag) = xfs_perag_get_tag((mp), 0, (tag)); \ + (pag) != NULL; \ + (agno) = (pag)->pag_agno + 1, \ + xfs_perag_put(pag), \ + (pag) = xfs_perag_get_tag((mp), (agno), (tag))) struct aghdr_init_data { /* per ag data */ diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index bbfea8022a3b..2aa2b3484c28 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -19,7 +19,7 @@ #include "xfs_btree.h" #include "xfs_refcount_btree.h" #include "xfs_ialloc_btree.h" -#include "xfs_sb.h" +#include "xfs_ag.h" #include "xfs_ag_resv.h" /* @@ -250,7 +250,6 @@ xfs_ag_resv_init( struct xfs_trans *tp) { struct xfs_mount *mp = pag->pag_mount; - xfs_agnumber_t agno = pag->pag_agno; xfs_extlen_t ask; xfs_extlen_t used; int error = 0, error2; @@ -260,11 +259,11 @@ xfs_ag_resv_init( if (pag->pag_meta_resv.ar_asked == 0) { ask = used = 0; - error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask, &used); + error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask, &used); if (error) goto out; - error = xfs_finobt_calc_reserves(mp, tp, agno, &ask, &used); + error = xfs_finobt_calc_reserves(mp, tp, pag, &ask, &used); if (error) goto out; @@ -282,7 +281,7 @@ xfs_ag_resv_init( mp->m_finobt_nores = true; - error 
= xfs_refcountbt_calc_reserves(mp, tp, agno, &ask, + error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask, &used); if (error) goto out; @@ -300,7 +299,7 @@ xfs_ag_resv_init( if (pag->pag_rmapbt_resv.ar_asked == 0) { ask = used = 0; - error = xfs_rmapbt_calc_reserves(mp, tp, agno, &ask, &used); + error = xfs_rmapbt_calc_reserves(mp, tp, pag, &ask, &used); if (error) goto out; @@ -366,7 +365,7 @@ xfs_ag_resv_alloc_extent( break; default: ASSERT(0); - /* fall through */ + fallthrough; case XFS_AG_RESV_NONE: field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS : XFS_TRANS_SB_FDBLOCKS; @@ -408,7 +407,7 @@ xfs_ag_resv_free_extent( break; default: ASSERT(0); - /* fall through */ + fallthrough; case XFS_AG_RESV_NONE: xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len); return; diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h index 8a8eb4bc48bb..b74b210008ea 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.h +++ b/fs/xfs/libxfs/xfs_ag_resv.h @@ -18,6 +18,21 @@ void xfs_ag_resv_alloc_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type, void xfs_ag_resv_free_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type, struct xfs_trans *tp, xfs_extlen_t len); +static inline struct xfs_ag_resv * +xfs_perag_resv( + struct xfs_perag *pag, + enum xfs_ag_resv_type type) +{ + switch (type) { + case XFS_AG_RESV_METADATA: + return &pag->pag_meta_resv; + case XFS_AG_RESV_RMAPBT: + return &pag->pag_rmapbt_resv; + default: + return NULL; + } +} + /* * RMAPBT reservation accounting wrappers. 
Since rmapbt blocks are sourced from * the AGFL, they are allocated one at a time and the reservation updates don't diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 82b7cbb1f24f..6929157d8d6e 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -10,7 +10,6 @@ #include "xfs_shared.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_btree.h" @@ -24,6 +23,7 @@ #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_log.h" +#include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_bmap.h" @@ -230,7 +230,7 @@ xfs_alloc_get_rec( int *stat) /* output: success/failure */ { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_ag.agno; + xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno; union xfs_btree_rec *rec; int error; @@ -776,7 +776,7 @@ xfs_alloc_cur_setup( */ if (!acur->cnt) acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp, - args->agbp, args->agno, XFS_BTNUM_CNT); + args->agbp, args->pag, XFS_BTNUM_CNT); error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i); if (error) return error; @@ -786,10 +786,10 @@ xfs_alloc_cur_setup( */ if (!acur->bnolt) acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp, - args->agbp, args->agno, XFS_BTNUM_BNO); + args->agbp, args->pag, XFS_BTNUM_BNO); if (!acur->bnogt) acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp, - args->agbp, args->agno, XFS_BTNUM_BNO); + args->agbp, args->pag, XFS_BTNUM_BNO); return i == 1 ? 0 : -ENOSPC; } @@ -1063,7 +1063,7 @@ xfs_alloc_ag_vextent_small( if (fbno == NULLAGBLOCK) goto out; - xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1, + xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1, (args->datatype & XFS_ALLOC_NOBUSY)); if (args->datatype & XFS_ALLOC_USERDATA) { @@ -1089,7 +1089,7 @@ xfs_alloc_ag_vextent_small( * If we're feeding an AGFL block to something that doesn't live in the * free space, we need to clear out the OWN_AG rmap. 
*/ - error = xfs_rmap_free(args->tp, args->agbp, args->agno, fbno, 1, + error = xfs_rmap_free(args->tp, args->agbp, args->pag, fbno, 1, &XFS_RMAP_OINFO_AG); if (error) goto error; @@ -1166,7 +1166,7 @@ xfs_alloc_ag_vextent( /* if not file data, insert new block into the reverse map btree */ if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) { - error = xfs_rmap_alloc(args->tp, args->agbp, args->agno, + error = xfs_rmap_alloc(args->tp, args->agbp, args->pag, args->agbno, args->len, &args->oinfo); if (error) return error; @@ -1178,7 +1178,7 @@ xfs_alloc_ag_vextent( if (error) return error; - ASSERT(!xfs_extent_busy_search(args->mp, args->agno, + ASSERT(!xfs_extent_busy_search(args->mp, args->pag, args->agbno, args->len)); } @@ -1217,7 +1217,7 @@ xfs_alloc_ag_vextent_exact( * Allocate/initialize a cursor for the by-number freespace btree. */ bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_BNO); + args->pag, XFS_BTNUM_BNO); /* * Lookup bno and minlen in the btree (minlen is irrelevant, really). @@ -1277,7 +1277,7 @@ xfs_alloc_ag_vextent_exact( * Allocate/initialize a cursor for the by-size btree. */ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_CNT); + args->pag, XFS_BTNUM_CNT); ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length)); error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno, args->len, XFSA_FIXUP_BNO_OK); @@ -1674,9 +1674,8 @@ restart: * Allocate and initialize a cursor for the by-size btree. */ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_CNT); + args->pag, XFS_BTNUM_CNT); bno_cur = NULL; - busy = false; /* * Look for an entry >= maxlen+alignment-1 blocks. @@ -1837,7 +1836,7 @@ restart: * Allocate and initialize a cursor for the by-block tree. 
*/ bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_BNO); + args->pag, XFS_BTNUM_BNO); if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, rbno, rlen, XFSA_FIXUP_CNT_OK))) goto error0; @@ -1896,12 +1895,13 @@ xfs_free_ag_extent( int haveright; /* have a right neighbor */ int i; int error; + struct xfs_perag *pag = agbp->b_pag; bno_cur = cnt_cur = NULL; mp = tp->t_mountp; if (!xfs_rmap_should_skip_owner_update(oinfo)) { - error = xfs_rmap_free(tp, agbp, agno, bno, len, oinfo); + error = xfs_rmap_free(tp, agbp, pag, bno, len, oinfo); if (error) goto error0; } @@ -1909,7 +1909,7 @@ xfs_free_ag_extent( /* * Allocate and initialize a cursor for the by-block btree. */ - bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO); + bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_BNO); /* * Look for a neighboring block on the left (lower block numbers) * that is contiguous with this space. @@ -1979,7 +1979,7 @@ xfs_free_ag_extent( /* * Now allocate and initialize a cursor for the by-size tree. */ - cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT); + cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT); /* * Have both left and right contiguous neighbors. * Merge all three into a single free block. @@ -2490,7 +2490,7 @@ xfs_exact_minlen_extent_available( int error = 0; cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp, - args->agno, XFS_BTNUM_CNT); + args->pag, XFS_BTNUM_CNT); error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat); if (error) goto out; @@ -2693,21 +2693,21 @@ out_no_agbp: * Get a block from the freelist. * Returns with the buffer for the block gotten. 
*/ -int /* error */ +int xfs_alloc_get_freelist( - xfs_trans_t *tp, /* transaction pointer */ - struct xfs_buf *agbp, /* buffer containing the agf structure */ - xfs_agblock_t *bnop, /* block address retrieved from freelist */ - int btreeblk) /* destination is a AGF btree */ + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agblock_t *bnop, + int btreeblk) { - struct xfs_agf *agf = agbp->b_addr; - struct xfs_buf *agflbp;/* buffer for a.g. freelist structure */ - xfs_agblock_t bno; /* block number returned */ - __be32 *agfl_bno; - int error; - int logflags; - xfs_mount_t *mp = tp->t_mountp; - xfs_perag_t *pag; /* per allocation group data */ + struct xfs_agf *agf = agbp->b_addr; + struct xfs_buf *agflbp; + xfs_agblock_t bno; + __be32 *agfl_bno; + int error; + int logflags; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_perag *pag; /* * Freelist is empty, give up. @@ -2817,20 +2817,20 @@ xfs_alloc_pagf_init( /* * Put the block on the freelist for the allocation group. */ -int /* error */ +int xfs_alloc_put_freelist( - xfs_trans_t *tp, /* transaction pointer */ - struct xfs_buf *agbp, /* buffer for a.g. freelist header */ - struct xfs_buf *agflbp,/* buffer for a.g. free block array */ - xfs_agblock_t bno, /* block being freed */ - int btreeblk) /* block came from a AGF btree */ + struct xfs_trans *tp, + struct xfs_buf *agbp, + struct xfs_buf *agflbp, + xfs_agblock_t bno, + int btreeblk) { struct xfs_mount *mp = tp->t_mountp; struct xfs_agf *agf = agbp->b_addr; - __be32 *blockp;/* pointer to array entry */ + struct xfs_perag *pag; + __be32 *blockp; int error; int logflags; - xfs_perag_t *pag; /* per allocation group data */ __be32 *agfl_bno; int startoff; @@ -3174,7 +3174,7 @@ xfs_alloc_vextent( } args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); args->type = XFS_ALLOCTYPE_NEAR_BNO; - /* FALLTHROUGH */ + fallthrough; case XFS_ALLOCTYPE_FIRST_AG: /* * Rotate through the allocation groups looking for a winner. 
@@ -3292,7 +3292,7 @@ error0: int xfs_free_extent_fix_freelist( struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, struct xfs_buf **agbp) { struct xfs_alloc_arg args; @@ -3301,7 +3301,8 @@ xfs_free_extent_fix_freelist( memset(&args, 0, sizeof(struct xfs_alloc_arg)); args.tp = tp; args.mp = tp->t_mountp; - args.agno = agno; + args.agno = pag->pag_agno; + args.pag = pag; /* * validate that the block number is legal - the enables us to detect @@ -3310,17 +3311,12 @@ xfs_free_extent_fix_freelist( if (args.agno >= args.mp->m_sb.sb_agcount) return -EFSCORRUPTED; - args.pag = xfs_perag_get(args.mp, args.agno); - ASSERT(args.pag); - error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); if (error) - goto out; + return error; *agbp = args.agbp; -out: - xfs_perag_put(args.pag); - return error; + return 0; } /* @@ -3344,6 +3340,7 @@ __xfs_free_extent( struct xfs_agf *agf; int error; unsigned int busy_flags = 0; + struct xfs_perag *pag; ASSERT(len != 0); ASSERT(type != XFS_AG_RESV_AGFL); @@ -3352,33 +3349,37 @@ __xfs_free_extent( XFS_ERRTAG_FREE_EXTENT)) return -EIO; - error = xfs_free_extent_fix_freelist(tp, agno, &agbp); + pag = xfs_perag_get(mp, agno); + error = xfs_free_extent_fix_freelist(tp, pag, &agbp); if (error) - return error; + goto err; agf = agbp->b_addr; if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) { error = -EFSCORRUPTED; - goto err; + goto err_release; } /* validate the extent size is legal now we have the agf locked */ if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) { error = -EFSCORRUPTED; - goto err; + goto err_release; } error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type); if (error) - goto err; + goto err_release; if (skip_discard) busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD; - xfs_extent_busy_insert(tp, agno, agbno, len, busy_flags); + xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags); + xfs_perag_put(pag); return 0; -err: +err_release: xfs_trans_brelse(tp, agbp); +err: + 
xfs_perag_put(pag); return error; } diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index a4427c5775c2..e30900b6f8ba 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -214,7 +214,7 @@ int xfs_alloc_read_agfl(struct xfs_mount *mp, struct xfs_trans *tp, int xfs_free_agfl_block(struct xfs_trans *, xfs_agnumber_t, xfs_agblock_t, struct xfs_buf *, struct xfs_owner_info *); int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags); -int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno, +int xfs_free_extent_fix_freelist(struct xfs_trans *tp, struct xfs_perag *pag, struct xfs_buf **agbp); xfs_extlen_t xfs_prealloc_blocks(struct xfs_mount *mp); diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index a43e4c50e69b..6b363f78cfa2 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -9,7 +9,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_btree.h" #include "xfs_btree_staging.h" @@ -19,6 +18,7 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_trans.h" +#include "xfs_ag.h" STATIC struct xfs_btree_cur * @@ -26,8 +26,7 @@ xfs_allocbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_ag.agbp, cur->bc_ag.agno, - cur->bc_btnum); + cur->bc_ag.agbp, cur->bc_ag.pag, cur->bc_btnum); } STATIC void @@ -39,13 +38,12 @@ xfs_allocbt_set_root( struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; int btnum = cur->bc_btnum; - struct xfs_perag *pag = agbp->b_pag; ASSERT(ptr->s != 0); agf->agf_roots[btnum] = ptr->s; be32_add_cpu(&agf->agf_levels[btnum], inc); - pag->pagf_levels[btnum] += inc; + cur->bc_ag.pag->pagf_levels[btnum] += inc; xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); } @@ -72,7 +70,7 @@ xfs_allocbt_alloc_block( } 
atomic64_inc(&cur->bc_mp->m_allocbt_blks); - xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1, false); + xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agbp->b_pag, bno, 1, false); new->s = cpu_to_be32(bno); @@ -86,7 +84,6 @@ xfs_allocbt_free_block( struct xfs_buf *bp) { struct xfs_buf *agbp = cur->bc_ag.agbp; - struct xfs_agf *agf = agbp->b_addr; xfs_agblock_t bno; int error; @@ -96,7 +93,7 @@ xfs_allocbt_free_block( return error; atomic64_dec(&cur->bc_mp->m_allocbt_blks); - xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, + xfs_extent_busy_insert(cur->bc_tp, agbp->b_pag, bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); return 0; } @@ -225,7 +222,7 @@ xfs_allocbt_init_ptr_from_cur( { struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno)); ptr->s = agf->agf_roots[cur->bc_btnum]; } @@ -473,7 +470,7 @@ STATIC struct xfs_btree_cur * xfs_allocbt_init_common( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_btnum_t btnum) { struct xfs_btree_cur *cur; @@ -486,6 +483,7 @@ xfs_allocbt_init_common( cur->bc_mp = mp; cur->bc_btnum = btnum; cur->bc_blocklog = mp->m_sb.sb_blocklog; + cur->bc_ag.abt.active = false; if (btnum == XFS_BTNUM_CNT) { cur->bc_ops = &xfs_cntbt_ops; @@ -496,8 +494,9 @@ xfs_allocbt_init_common( cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); } - cur->bc_ag.agno = agno; - cur->bc_ag.abt.active = false; + /* take a reference for the cursor */ + atomic_inc(&pag->pag_ref); + cur->bc_ag.pag = pag; if (xfs_sb_version_hascrc(&mp->m_sb)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; @@ -513,13 +512,13 @@ xfs_allocbt_init_cursor( struct xfs_mount *mp, /* file system mount point */ struct xfs_trans *tp, /* transaction pointer */ struct xfs_buf *agbp, /* buffer for agf structure */ - xfs_agnumber_t agno, /* allocation group number */ + struct xfs_perag *pag, xfs_btnum_t btnum) /* btree 
identifier */ { struct xfs_agf *agf = agbp->b_addr; struct xfs_btree_cur *cur; - cur = xfs_allocbt_init_common(mp, tp, agno, btnum); + cur = xfs_allocbt_init_common(mp, tp, pag, btnum); if (btnum == XFS_BTNUM_CNT) cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); else @@ -535,12 +534,12 @@ struct xfs_btree_cur * xfs_allocbt_stage_cursor( struct xfs_mount *mp, struct xbtree_afakeroot *afake, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_btnum_t btnum) { struct xfs_btree_cur *cur; - cur = xfs_allocbt_init_common(mp, NULL, agno, btnum); + cur = xfs_allocbt_init_common(mp, NULL, pag, btnum); xfs_btree_stage_afakeroot(cur, afake); return cur; } diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h index a5b998e950fe..9eb4c667a6b8 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.h +++ b/fs/xfs/libxfs/xfs_alloc_btree.h @@ -13,6 +13,7 @@ struct xfs_buf; struct xfs_btree_cur; struct xfs_mount; +struct xfs_perag; struct xbtree_afakeroot; /* @@ -46,11 +47,11 @@ struct xbtree_afakeroot; (maxrecs) * sizeof(xfs_alloc_key_t) + \ ((index) - 1) * sizeof(xfs_alloc_ptr_t))) -extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *, - struct xfs_trans *, struct xfs_buf *, - xfs_agnumber_t, xfs_btnum_t); +extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *mp, + struct xfs_trans *tp, struct xfs_buf *bp, + struct xfs_perag *pag, xfs_btnum_t btnum); struct xfs_btree_cur *xfs_allocbt_stage_cursor(struct xfs_mount *mp, - struct xbtree_afakeroot *afake, xfs_agnumber_t agno, + struct xbtree_afakeroot *afake, struct xfs_perag *pag, xfs_btnum_t btnum); extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 96146f425e50..d9d7d5137b73 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -44,20 +44,27 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args); * 
Internal routines when attribute list is one block. */ STATIC int xfs_attr_leaf_get(xfs_da_args_t *args); -STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args); STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args); STATIC int xfs_attr_leaf_hasname(struct xfs_da_args *args, struct xfs_buf **bp); +STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args, struct xfs_buf *bp); /* * Internal routines when attribute list is more than one block. */ STATIC int xfs_attr_node_get(xfs_da_args_t *args); -STATIC int xfs_attr_node_addname(xfs_da_args_t *args); -STATIC int xfs_attr_node_removename(xfs_da_args_t *args); +STATIC void xfs_attr_restore_rmt_blk(struct xfs_da_args *args); +STATIC int xfs_attr_node_addname(struct xfs_delattr_context *dac); +STATIC int xfs_attr_node_addname_find_attr(struct xfs_delattr_context *dac); +STATIC int xfs_attr_node_addname_clear_incomplete( + struct xfs_delattr_context *dac); STATIC int xfs_attr_node_hasname(xfs_da_args_t *args, struct xfs_da_state **state); STATIC int xfs_attr_fillstate(xfs_da_state_t *state); STATIC int xfs_attr_refillstate(xfs_da_state_t *state); +STATIC int xfs_attr_set_iter(struct xfs_delattr_context *dac, + struct xfs_buf **leaf_bp); +STATIC int xfs_attr_node_removename(struct xfs_da_args *args, + struct xfs_da_state *state); int xfs_inode_hasattr( @@ -237,27 +244,77 @@ xfs_attr_is_shortform( } /* - * Attempts to set an attr in shortform, or converts short form to leaf form if - * there is not enough room. If the attr is set, the transaction is committed - * and set to NULL. + * Checks to see if a delayed attribute transaction should be rolled. If so, + * transaction is finished or rolled as needed. 
*/ STATIC int -xfs_attr_set_shortform( - struct xfs_da_args *args, - struct xfs_buf **leaf_bp) +xfs_attr_trans_roll( + struct xfs_delattr_context *dac) { - struct xfs_inode *dp = args->dp; - int error, error2 = 0; + struct xfs_da_args *args = dac->da_args; + int error; + + if (dac->flags & XFS_DAC_DEFER_FINISH) { + /* + * The caller wants us to finish all the deferred ops so that we + * avoid pinning the log tail with a large number of deferred + * ops. + */ + dac->flags &= ~XFS_DAC_DEFER_FINISH; + error = xfs_defer_finish(&args->trans); + } else + error = xfs_trans_roll_inode(&args->trans, args->dp); + + return error; +} + +/* + * Set the attribute specified in @args. + */ +int +xfs_attr_set_args( + struct xfs_da_args *args) +{ + struct xfs_buf *leaf_bp = NULL; + int error = 0; + struct xfs_delattr_context dac = { + .da_args = args, + }; + + do { + error = xfs_attr_set_iter(&dac, &leaf_bp); + if (error != -EAGAIN) + break; + + error = xfs_attr_trans_roll(&dac); + if (error) { + if (leaf_bp) + xfs_trans_brelse(args->trans, leaf_bp); + return error; + } + } while (true); + + return error; +} + +STATIC int +xfs_attr_sf_addname( + struct xfs_delattr_context *dac, + struct xfs_buf **leaf_bp) +{ + struct xfs_da_args *args = dac->da_args; + struct xfs_inode *dp = args->dp; + int error = 0; /* * Try to add the attr to the attribute list in the inode. */ error = xfs_attr_try_sf_addname(dp, args); - if (error != -ENOSPC) { - error2 = xfs_trans_commit(args->trans); - args->trans = NULL; - return error ? error : error2; - } + + /* Should only be 0, -EEXIST or -ENOSPC */ + if (error != -ENOSPC) + return error; + /* * It won't fit in the shortform, transform to a leaf block. GROT: * another possible req'mt for a double-split btree op. @@ -269,85 +326,300 @@ xfs_attr_set_shortform( /* * Prevent the leaf buffer from being unlocked so that a concurrent AIL * push cannot grab the half-baked leaf buffer and run into problems - * with the write verifier. 
Once we're done rolling the transaction we - * can release the hold and add the attr to the leaf. + * with the write verifier. */ xfs_trans_bhold(args->trans, *leaf_bp); - error = xfs_defer_finish(&args->trans); - xfs_trans_bhold_release(args->trans, *leaf_bp); - if (error) { - xfs_trans_brelse(args->trans, *leaf_bp); - return error; - } - return 0; + /* + * We're still in XFS_DAS_UNINIT state here. We've converted + * the attr fork to leaf format and will restart with the leaf + * add. + */ + dac->flags |= XFS_DAC_DEFER_FINISH; + return -EAGAIN; } /* * Set the attribute specified in @args. + * This routine is meant to function as a delayed operation, and may return + * -EAGAIN when the transaction needs to be rolled. Calling functions will need + * to handle this, and recall the function until a successful error code is + * returned. */ int -xfs_attr_set_args( - struct xfs_da_args *args) +xfs_attr_set_iter( + struct xfs_delattr_context *dac, + struct xfs_buf **leaf_bp) { - struct xfs_inode *dp = args->dp; - struct xfs_buf *leaf_bp = NULL; - int error = 0; + struct xfs_da_args *args = dac->da_args; + struct xfs_inode *dp = args->dp; + struct xfs_buf *bp = NULL; + int forkoff, error = 0; + + /* State machine switch */ + switch (dac->dela_state) { + case XFS_DAS_UNINIT: + /* + * If the fork is shortform, attempt to add the attr. If there + * is no space, this converts to leaf format and returns + * -EAGAIN with the leaf buffer held across the roll. The caller + * will deal with a transaction roll error, but otherwise + * release the hold once we return with a clean transaction. + */ + if (xfs_attr_is_shortform(dp)) + return xfs_attr_sf_addname(dac, leaf_bp); + if (*leaf_bp != NULL) { + xfs_trans_bhold_release(args->trans, *leaf_bp); + *leaf_bp = NULL; + } - /* - * If the attribute list is already in leaf format, jump straight to - * leaf handling. 
Otherwise, try to add the attribute to the shortform - * list; if there's no room then convert the list to leaf format and try - * again. - */ - if (xfs_attr_is_shortform(dp)) { + if (xfs_attr_is_leaf(dp)) { + error = xfs_attr_leaf_try_add(args, *leaf_bp); + if (error == -ENOSPC) { + error = xfs_attr3_leaf_to_node(args); + if (error) + return error; + + /* + * Finish any deferred work items and roll the + * transaction once more. The goal here is to + * call node_addname with the inode and + * transaction in the same state (inode locked + * and joined, transaction clean) no matter how + * we got to this step. + * + * At this point, we are still in + * XFS_DAS_UNINIT, but when we come back, we'll + * be a node, so we'll fall down into the node + * handling code below + */ + dac->flags |= XFS_DAC_DEFER_FINISH; + return -EAGAIN; + } else if (error) { + return error; + } + + dac->dela_state = XFS_DAS_FOUND_LBLK; + } else { + error = xfs_attr_node_addname_find_attr(dac); + if (error) + return error; + + error = xfs_attr_node_addname(dac); + if (error) + return error; + + dac->dela_state = XFS_DAS_FOUND_NBLK; + } + return -EAGAIN; + case XFS_DAS_FOUND_LBLK: + /* + * If there was an out-of-line value, allocate the blocks we + * identified for its storage and copy the value. This is done + * after we create the attribute so that we don't overflow the + * maximum size of a transaction and/or hit a deadlock. + */ + + /* Open coded xfs_attr_rmtval_set without trans handling */ + if ((dac->flags & XFS_DAC_LEAF_ADDNAME_INIT) == 0) { + dac->flags |= XFS_DAC_LEAF_ADDNAME_INIT; + if (args->rmtblkno > 0) { + error = xfs_attr_rmtval_find_space(dac); + if (error) + return error; + } + } /* - * If the attr was successfully set in shortform, the - * transaction is committed and set to NULL. Otherwise, is it - * converted from shortform to leaf, and the transaction is - * retained. + * Repeat allocating remote blocks for the attr value until + * blkcnt drops to zero. 
*/ - error = xfs_attr_set_shortform(args, &leaf_bp); - if (error || !args->trans) + if (dac->blkcnt > 0) { + error = xfs_attr_rmtval_set_blk(dac); + if (error) + return error; + return -EAGAIN; + } + + error = xfs_attr_rmtval_set_value(args); + if (error) return error; - } - if (xfs_attr_is_leaf(dp)) { - error = xfs_attr_leaf_addname(args); - if (error != -ENOSPC) + /* + * If this is not a rename, clear the incomplete flag and we're + * done. + */ + if (!(args->op_flags & XFS_DA_OP_RENAME)) { + if (args->rmtblkno > 0) + error = xfs_attr3_leaf_clearflag(args); return error; + } /* - * Promote the attribute list to the Btree format. + * If this is an atomic rename operation, we must "flip" the + * incomplete flags on the "new" and "old" attribute/value pairs + * so that one disappears and one appears atomically. Then we + * must remove the "old" attribute/value pair. + * + * In a separate transaction, set the incomplete flag on the + * "old" attr and clear the incomplete flag on the "new" attr. */ - error = xfs_attr3_leaf_to_node(args); + error = xfs_attr3_leaf_flipflags(args); if (error) return error; + /* + * Commit the flag value change and start the next trans in + * series. + */ + dac->dela_state = XFS_DAS_FLIP_LFLAG; + return -EAGAIN; + case XFS_DAS_FLIP_LFLAG: + /* + * Dismantle the "old" attribute/value pair by removing a + * "remote" value (if it exists). + */ + xfs_attr_restore_rmt_blk(args); + error = xfs_attr_rmtval_invalidate(args); + if (error) + return error; + + /* fallthrough */ + case XFS_DAS_RM_LBLK: + /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */ + dac->dela_state = XFS_DAS_RM_LBLK; + if (args->rmtblkno) { + error = __xfs_attr_rmtval_remove(dac); + if (error) + return error; + + dac->dela_state = XFS_DAS_RD_LEAF; + return -EAGAIN; + } + /* fallthrough */ + case XFS_DAS_RD_LEAF: /* - * Finish any deferred work items and roll the transaction once - * more. 
The goal here is to call node_addname with the inode - * and transaction in the same state (inode locked and joined, - * transaction clean) no matter how we got to this step. + * This is the last step for leaf format. Read the block with + * the old attr, remove the old attr, check for shortform + * conversion and return. */ - error = xfs_defer_finish(&args->trans); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, + &bp); if (error) return error; + xfs_attr3_leaf_remove(bp, args); + + forkoff = xfs_attr_shortform_allfit(bp, dp); + if (forkoff) + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); + /* bp is gone due to xfs_da_shrink_inode */ + + return error; + + case XFS_DAS_FOUND_NBLK: + /* + * Find space for remote blocks and fall into the allocation + * state. + */ + if (args->rmtblkno > 0) { + error = xfs_attr_rmtval_find_space(dac); + if (error) + return error; + } + + /* fallthrough */ + case XFS_DAS_ALLOC_NODE: + /* + * If there was an out-of-line value, allocate the blocks we + * identified for its storage and copy the value. This is done + * after we create the attribute so that we don't overflow the + * maximum size of a transaction and/or hit a deadlock. + */ + dac->dela_state = XFS_DAS_ALLOC_NODE; + if (args->rmtblkno > 0) { + if (dac->blkcnt > 0) { + error = xfs_attr_rmtval_set_blk(dac); + if (error) + return error; + return -EAGAIN; + } + + error = xfs_attr_rmtval_set_value(args); + if (error) + return error; + } + /* - * Commit the current trans (including the inode) and - * start a new one. + * If this was not a rename, clear the incomplete flag and we're + * done. 
*/ - error = xfs_trans_roll_inode(&args->trans, dp); + if (!(args->op_flags & XFS_DA_OP_RENAME)) { + if (args->rmtblkno > 0) + error = xfs_attr3_leaf_clearflag(args); + goto out; + } + + /* + * If this is an atomic rename operation, we must "flip" the + * incomplete flags on the "new" and "old" attribute/value pairs + * so that one disappears and one appears atomically. Then we + * must remove the "old" attribute/value pair. + * + * In a separate transaction, set the incomplete flag on the + * "old" attr and clear the incomplete flag on the "new" attr. + */ + error = xfs_attr3_leaf_flipflags(args); + if (error) + goto out; + /* + * Commit the flag value change and start the next trans in + * series + */ + dac->dela_state = XFS_DAS_FLIP_NFLAG; + return -EAGAIN; + + case XFS_DAS_FLIP_NFLAG: + /* + * Dismantle the "old" attribute/value pair by removing a + * "remote" value (if it exists). + */ + xfs_attr_restore_rmt_blk(args); + + error = xfs_attr_rmtval_invalidate(args); if (error) return error; - } - error = xfs_attr_node_addname(args); + /* fallthrough */ + case XFS_DAS_RM_NBLK: + /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */ + dac->dela_state = XFS_DAS_RM_NBLK; + if (args->rmtblkno) { + error = __xfs_attr_rmtval_remove(dac); + if (error) + return error; + + dac->dela_state = XFS_DAS_CLR_FLAG; + return -EAGAIN; + } + + /* fallthrough */ + case XFS_DAS_CLR_FLAG: + /* + * The last state for node format. Look up the old attr and + * remove it. 
+ */ + error = xfs_attr_node_addname_clear_incomplete(dac); + break; + default: + ASSERT(0); + break; + } +out: return error; } + /* * Return EEXIST if attr is found, or ENOATTR if not */ @@ -382,16 +654,25 @@ xfs_has_attr( */ int xfs_attr_remove_args( - struct xfs_da_args *args) + struct xfs_da_args *args) { - if (!xfs_inode_hasattr(args->dp)) - return -ENOATTR; + int error; + struct xfs_delattr_context dac = { + .da_args = args, + }; - if (args->dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) - return xfs_attr_shortform_remove(args); - if (xfs_attr_is_leaf(args->dp)) - return xfs_attr_leaf_removename(args); - return xfs_attr_node_removename(args); + do { + error = xfs_attr_remove_iter(&dac); + if (error != -EAGAIN) + break; + + error = xfs_attr_trans_roll(&dac); + if (error) + return error; + + } while (true); + + return error; } /* @@ -559,7 +840,7 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) if (retval == -EEXIST) { if (args->attr_flags & XATTR_CREATE) return retval; - retval = xfs_attr_shortform_remove(args); + retval = xfs_attr_sf_removename(args); if (retval) return retval; /* @@ -670,115 +951,6 @@ out_brelse: return retval; } - -/* - * Add a name to the leaf attribute list structure - * - * This leaf block cannot have a "remote" value, we only call this routine - * if bmap_one_block() says there is only one block (ie: no remote blks). - */ -STATIC int -xfs_attr_leaf_addname( - struct xfs_da_args *args) -{ - int error, forkoff; - struct xfs_buf *bp = NULL; - struct xfs_inode *dp = args->dp; - - trace_xfs_attr_leaf_addname(args); - - error = xfs_attr_leaf_try_add(args, bp); - if (error) - return error; - - /* - * Commit the transaction that added the attr name so that - * later routines can manage their own transactions. - */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) - return error; - - /* - * If there was an out-of-line value, allocate the blocks we - * identified for its storage and copy the value. 
This is done - * after we create the attribute so that we don't overflow the - * maximum size of a transaction and/or hit a deadlock. - */ - if (args->rmtblkno > 0) { - error = xfs_attr_rmtval_set(args); - if (error) - return error; - } - - if (!(args->op_flags & XFS_DA_OP_RENAME)) { - /* - * Added a "remote" value, just clear the incomplete flag. - */ - if (args->rmtblkno > 0) - error = xfs_attr3_leaf_clearflag(args); - - return error; - } - - /* - * If this is an atomic rename operation, we must "flip" the incomplete - * flags on the "new" and "old" attribute/value pairs so that one - * disappears and one appears atomically. Then we must remove the "old" - * attribute/value pair. - * - * In a separate transaction, set the incomplete flag on the "old" attr - * and clear the incomplete flag on the "new" attr. - */ - - error = xfs_attr3_leaf_flipflags(args); - if (error) - return error; - /* - * Commit the flag value change and start the next trans in series. - */ - error = xfs_trans_roll_inode(&args->trans, args->dp); - if (error) - return error; - - /* - * Dismantle the "old" attribute/value pair by removing a "remote" value - * (if it exists). - */ - xfs_attr_restore_rmt_blk(args); - - if (args->rmtblkno) { - error = xfs_attr_rmtval_invalidate(args); - if (error) - return error; - - error = xfs_attr_rmtval_remove(args); - if (error) - return error; - } - - /* - * Read in the block containing the "old" attr, then remove the "old" - * attr from that block (neat, huh!) - */ - error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, - &bp); - if (error) - return error; - - xfs_attr3_leaf_remove(bp, args); - - /* - * If the result is small enough, shrink it all into the inode. 
- */ - forkoff = xfs_attr_shortform_allfit(bp, dp); - if (forkoff) - error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); - /* bp is gone due to xfs_da_shrink_inode */ - - return error; -} - /* * Return EEXIST if attr is found, or ENOATTR if not */ @@ -909,48 +1081,26 @@ xfs_attr_node_hasname( * External routines when attribute list size > geo->blksize *========================================================================*/ -/* - * Add a name to a Btree-format attribute list. - * - * This will involve walking down the Btree, and may involve splitting - * leaf nodes and even splitting intermediate nodes up to and including - * the root node (a special case of an intermediate node). - * - * "Remote" attribute values confuse the issue and atomic rename operations - * add a whole extra layer of confusion on top of that. - */ STATIC int -xfs_attr_node_addname( - struct xfs_da_args *args) +xfs_attr_node_addname_find_attr( + struct xfs_delattr_context *dac) { - struct xfs_da_state *state; - struct xfs_da_state_blk *blk; - struct xfs_inode *dp; - int retval, error; - - trace_xfs_attr_node_addname(args); + struct xfs_da_args *args = dac->da_args; + int retval; /* - * Fill in bucket of arguments/results/context to carry around. - */ - dp = args->dp; -restart: - /* * Search to see if name already exists, and get back a pointer * to where it should go. 
*/ - error = 0; - retval = xfs_attr_node_hasname(args, &state); + retval = xfs_attr_node_hasname(args, &dac->da_state); if (retval != -ENOATTR && retval != -EEXIST) - goto out; + return retval; - blk = &state->path.blk[ state->path.active-1 ]; - ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) - goto out; + goto error; if (retval == -EEXIST) { if (args->attr_flags & XATTR_CREATE) - goto out; + goto error; trace_xfs_attr_node_replace(args); @@ -968,8 +1118,44 @@ restart: args->rmtvaluelen = 0; } - retval = xfs_attr3_leaf_add(blk->bp, state->args); - if (retval == -ENOSPC) { + return 0; +error: + if (dac->da_state) + xfs_da_state_free(dac->da_state); + return retval; +} + +/* + * Add a name to a Btree-format attribute list. + * + * This will involve walking down the Btree, and may involve splitting + * leaf nodes and even splitting intermediate nodes up to and including + * the root node (a special case of an intermediate node). + * + * "Remote" attribute values confuse the issue and atomic rename operations + * add a whole extra layer of confusion on top of that. + * + * This routine is meant to function as a delayed operation, and may return + * -EAGAIN when the transaction needs to be rolled. Calling functions will need + * to handle this, and recall the function until a successful error code is + * returned.
+ */ +STATIC int +xfs_attr_node_addname( + struct xfs_delattr_context *dac) +{ + struct xfs_da_args *args = dac->da_args; + struct xfs_da_state *state = dac->da_state; + struct xfs_da_state_blk *blk; + int error; + + trace_xfs_attr_node_addname(args); + + blk = &state->path.blk[state->path.active-1]; + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + + error = xfs_attr3_leaf_add(blk->bp, state->args); + if (error == -ENOSPC) { if (state->path.active == 1) { /* * Its really a single leaf node, but it had @@ -981,19 +1167,16 @@ restart: error = xfs_attr3_leaf_to_node(args); if (error) goto out; - error = xfs_defer_finish(&args->trans); - if (error) - goto out; /* - * Commit the node conversion and start the next - * trans in the chain. + * Now that we have converted the leaf to a node, we can + * roll the transaction, and try xfs_attr3_leaf_add + * again on re-entry. No need to set dela_state to do + * this. dela_state is still unset by this function at + * this point. */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) - goto out; - - goto restart; + dac->flags |= XFS_DAC_DEFER_FINISH; + return -EAGAIN; } /* @@ -1005,9 +1188,7 @@ restart: error = xfs_da3_split(state); if (error) goto out; - error = xfs_defer_finish(&args->trans); - if (error) - goto out; + dac->flags |= XFS_DAC_DEFER_FINISH; } else { /* * Addition succeeded, update Btree hashvals. @@ -1015,77 +1196,21 @@ restart: xfs_da3_fixhashpath(state, &state->path); } - /* - * Kill the state structure, we're done with it and need to - * allow the buffers to come back later. - */ - xfs_da_state_free(state); - state = NULL; - - /* - * Commit the leaf addition or btree split and start the next - * trans in the chain. - */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) - goto out; - - /* - * If there was an out-of-line value, allocate the blocks we - * identified for its storage and copy the value. 
This is done - * after we create the attribute so that we don't overflow the - * maximum size of a transaction and/or hit a deadlock. - */ - if (args->rmtblkno > 0) { - error = xfs_attr_rmtval_set(args); - if (error) - return error; - } - - if (!(args->op_flags & XFS_DA_OP_RENAME)) { - /* - * Added a "remote" value, just clear the incomplete flag. - */ - if (args->rmtblkno > 0) - error = xfs_attr3_leaf_clearflag(args); - retval = error; - goto out; - } - - /* - * If this is an atomic rename operation, we must "flip" the incomplete - * flags on the "new" and "old" attribute/value pairs so that one - * disappears and one appears atomically. Then we must remove the "old" - * attribute/value pair. - * - * In a separate transaction, set the incomplete flag on the "old" attr - * and clear the incomplete flag on the "new" attr. - */ - error = xfs_attr3_leaf_flipflags(args); - if (error) - goto out; - /* - * Commit the flag value change and start the next trans in series - */ - error = xfs_trans_roll_inode(&args->trans, args->dp); - if (error) - goto out; - - /* - * Dismantle the "old" attribute/value pair by removing a "remote" value - * (if it exists). - */ - xfs_attr_restore_rmt_blk(args); +out: + if (state) + xfs_da_state_free(state); + return error; +} - if (args->rmtblkno) { - error = xfs_attr_rmtval_invalidate(args); - if (error) - return error; - error = xfs_attr_rmtval_remove(args); - if (error) - return error; - } +STATIC int +xfs_attr_node_addname_clear_incomplete( + struct xfs_delattr_context *dac) +{ + struct xfs_da_args *args = dac->da_args; + struct xfs_da_state *state = NULL; + int retval = 0; + int error = 0; /* * Re-find the "old" attribute entry after any split ops. The INCOMPLETE @@ -1098,13 +1223,7 @@ restart: if (error) goto out; - /* - * Remove the name and update the hashvals in the tree. 
- */ - blk = &state->path.blk[state->path.active-1]; - ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - error = xfs_attr3_leaf_remove(blk->bp, args); - xfs_da3_fixhashpath(state, &state->path); + error = xfs_attr_node_removename(args, state); /* * Check to see if the tree needs to be collapsed. @@ -1190,14 +1309,16 @@ xfs_attr_leaf_mark_incomplete( */ STATIC int xfs_attr_node_removename_setup( - struct xfs_da_args *args, - struct xfs_da_state **state) + struct xfs_delattr_context *dac) { - int error; + struct xfs_da_args *args = dac->da_args; + struct xfs_da_state **state = &dac->da_state; + int error; error = xfs_attr_node_hasname(args, state); if (error != -EEXIST) return error; + error = 0; ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL); ASSERT((*state)->path.blk[(*state)->path.active - 1].magic == @@ -1206,97 +1327,164 @@ int xfs_attr_node_removename_setup( if (args->rmtblkno > 0) { error = xfs_attr_leaf_mark_incomplete(args, *state); if (error) - return error; + goto out; - return xfs_attr_rmtval_invalidate(args); + error = xfs_attr_rmtval_invalidate(args); } +out: + if (error) + xfs_da_state_free(*state); - return 0; + return error; } STATIC int -xfs_attr_node_remove_rmt( +xfs_attr_node_removename( struct xfs_da_args *args, struct xfs_da_state *state) { - int error = 0; - - error = xfs_attr_rmtval_remove(args); - if (error) - return error; + struct xfs_da_state_blk *blk; + int retval; /* - * Refill the state structure with buffers, the prior calls released our - * buffers. + * Remove the name and update the hashvals in the tree. */ - return xfs_attr_refillstate(state); + blk = &state->path.blk[state->path.active-1]; + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + retval = xfs_attr3_leaf_remove(blk->bp, args); + xfs_da3_fixhashpath(state, &state->path); + + return retval; } /* - * Remove a name from a B-tree attribute list. + * Remove the attribute specified in @args. 
* * This will involve walking down the Btree, and may involve joining * leaf nodes and even joining intermediate nodes up to and including * the root node (a special case of an intermediate node). + * + * This routine is meant to function as either an in-line or delayed operation, + * and may return -EAGAIN when the transaction needs to be rolled. Calling + * functions will need to handle this, and call the function until a + * successful error code is returned. */ -STATIC int -xfs_attr_node_removename( - struct xfs_da_args *args) +int +xfs_attr_remove_iter( + struct xfs_delattr_context *dac) { - struct xfs_da_state *state; - struct xfs_da_state_blk *blk; - int retval, error; - struct xfs_inode *dp = args->dp; + struct xfs_da_args *args = dac->da_args; + struct xfs_da_state *state = dac->da_state; + int retval, error = 0; + struct xfs_inode *dp = args->dp; trace_xfs_attr_node_removename(args); - error = xfs_attr_node_removename_setup(args, &state); - if (error) - goto out; + switch (dac->dela_state) { + case XFS_DAS_UNINIT: + if (!xfs_inode_hasattr(dp)) + return -ENOATTR; - /* - * If there is an out-of-line value, de-allocate the blocks. - * This is done before we remove the attribute so that we don't - * overflow the maximum size of a transaction and/or hit a deadlock. - */ - if (args->rmtblkno > 0) { - error = xfs_attr_node_remove_rmt(args, state); - if (error) - goto out; - } + /* + * Shortform or leaf formats don't require transaction rolls and + * thus state transitions. Call the right helper and return. + */ + if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) + return xfs_attr_sf_removename(args); - /* - * Remove the name and update the hashvals in the tree. 
- */ - blk = &state->path.blk[ state->path.active-1 ]; - ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - retval = xfs_attr3_leaf_remove(blk->bp, args); - xfs_da3_fixhashpath(state, &state->path); + if (xfs_attr_is_leaf(dp)) + return xfs_attr_leaf_removename(args); - /* - * Check to see if the tree needs to be collapsed. - */ - if (retval && (state->path.active > 1)) { - error = xfs_da3_join(state); - if (error) - goto out; - error = xfs_defer_finish(&args->trans); - if (error) - goto out; /* - * Commit the Btree join operation and start a new trans. + * Node format may require transaction rolls. Set up the + * state context and fall into the state machine. */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) - goto out; - } + if (!dac->da_state) { + error = xfs_attr_node_removename_setup(dac); + if (error) + return error; + state = dac->da_state; + } - /* - * If the result is small enough, push it all into the inode. - */ - if (xfs_attr_is_leaf(dp)) - error = xfs_attr_node_shrink(args, state); + /* fallthrough */ + case XFS_DAS_RMTBLK: + dac->dela_state = XFS_DAS_RMTBLK; + /* + * If there is an out-of-line value, de-allocate the blocks. + * This is done before we remove the attribute so that we don't + * overflow the maximum size of a transaction and/or hit a + * deadlock. + */ + if (args->rmtblkno > 0) { + /* + * May return -EAGAIN. Roll and repeat until all remote + * blocks are removed. + */ + error = __xfs_attr_rmtval_remove(dac); + if (error == -EAGAIN) + return error; + else if (error) + goto out; + + /* + * Refill the state structure with buffers (the prior + * calls released our buffers) and close out this + * transaction before proceeding. 
+ */ + ASSERT(args->rmtblkno == 0); + error = xfs_attr_refillstate(state); + if (error) + goto out; + dac->dela_state = XFS_DAS_RM_NAME; + dac->flags |= XFS_DAC_DEFER_FINISH; + return -EAGAIN; + } + + /* fallthrough */ + case XFS_DAS_RM_NAME: + /* + * If we came here fresh from a transaction roll, reattach all + * the buffers to the current transaction. + */ + if (dac->dela_state == XFS_DAS_RM_NAME) { + error = xfs_attr_refillstate(state); + if (error) + goto out; + } + + retval = xfs_attr_node_removename(args, state); + + /* + * Check to see if the tree needs to be collapsed. If so, roll + * the transaction and fall into the shrink state. + */ + if (retval && (state->path.active > 1)) { + error = xfs_da3_join(state); + if (error) + goto out; + + dac->flags |= XFS_DAC_DEFER_FINISH; + dac->dela_state = XFS_DAS_RM_SHRINK; + return -EAGAIN; + } + + /* fallthrough */ + case XFS_DAS_RM_SHRINK: + /* + * If the result is small enough, push it all into the inode. + * This is our final state so it's safe to return a dirty + * transaction. + */ + if (xfs_attr_is_leaf(dp)) + error = xfs_attr_node_shrink(args, state); + ASSERT(error != -EAGAIN); + break; + default: + ASSERT(0); + error = -EINVAL; + goto out; + } out: if (state) xfs_da_state_free(state); diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 2b1f61987a9d..8de5d1d2733e 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -74,6 +74,406 @@ struct xfs_attr_list_context { }; +/* + * ======================================================================== + * Structure used to pass context around among the delayed routines. + * ======================================================================== + */ + +/* + * Below is a state machine diagram for attr remove operations. The XFS_DAS_* + * states indicate places where the function would return -EAGAIN, and then + * immediately resume from after being called by the calling function.
States + * marked as a "subroutine state" indicate that they belong to a subroutine, and + * so the calling function needs to pass them back to that subroutine to allow + * it to finish where it left off. But they otherwise do not have a role in the + * calling function other than just passing through. + * + * xfs_attr_remove_iter() + * │ + * v + * have attr to remove? ──n──> done + * │ + * y + * │ + * v + * are we short form? ──y──> xfs_attr_shortform_remove ──> done + * │ + * n + * │ + * V + * are we leaf form? ──y──> xfs_attr_leaf_removename ──> done + * │ + * n + * │ + * V + * ┌── need to setup state? + * │ │ + * n y + * │ │ + * │ v + * │ find attr and get state + * │ attr has remote blks? ──n─┐ + * │ │ v + * │ │ find and invalidate + * │ y the remote blocks. + * │ │ mark attr incomplete + * │ ├────────────────┘ + * └──────────┤ + * │ + * v + * Have remote blks to remove? ───y─────┐ + * │ ^ remove the blks + * │ │ │ + * │ │ v + * │ XFS_DAS_RMTBLK <─n── done? + * │ re-enter with │ + * │ one less blk to y + * │ remove │ + * │ V + * │ refill the state + * n │ + * │ v + * │ XFS_DAS_RM_NAME + * │ │ + * ├─────────────────────────┘ + * │ + * v + * remove leaf and + * update hash with + * xfs_attr_node_remove_cleanup + * │ + * v + * need to + * shrink tree? ─n─┐ + * │ │ + * y │ + * │ │ + * v │ + * join leaf │ + * │ │ + * v │ + * XFS_DAS_RM_SHRINK │ + * │ │ + * v │ + * do the shrink │ + * │ │ + * v │ + * free state <──┘ + * │ + * v + * done + * + * + * Below is a state machine diagram for attr set operations. + * + * It seems the challenge with understanding this system comes from trying to + * absorb the state machine all at once, when really one should only be looking + * at it with in the context of a single function. Once a state sensitive + * function is called, the idea is that it "takes ownership" of the + * state machine. It isn't concerned with the states that may have belonged to + * it's calling parent. 
Only the states relevant to itself or any other + * subroutines there in. Once a calling function hands off the state machine to + * a subroutine, it needs to respect the simple rule that it doesn't "own" the + * state machine anymore, and it's the responsibility of that calling function + * to propagate the -EAGAIN back up the call stack. Upon reentry, it is + * committed to re-calling that subroutine until it returns something other than + * -EAGAIN. Once that subroutine signals completion (by returning anything other + * than -EAGAIN), the calling function can resume using the state machine. + * + * xfs_attr_set_iter() + * │ + * v + * ┌─y─ has an attr fork? + * │ | + * │ n + * │ | + * │ V + * │ add a fork + * │ │ + * └──────────┤ + * │ + * V + * ┌─── is shortform? + * │ │ + * │ y + * │ │ + * │ V + * │ xfs_attr_set_fmt + * │ | + * │ V + * │ xfs_attr_try_sf_addname + * │ │ + * │ V + * │ had enough ──y──> done + * │ space? + * n │ + * │ n + * │ │ + * │ V + * │ transform to leaf + * │ │ + * │ V + * │ hold the leaf buffer + * │ │ + * │ V + * │ return -EAGAIN + * │ Re-enter in + * │ leaf form + * │ + * └─> release leaf buffer + * if needed + * │ + * V + * ┌───n── fork has + * │ only 1 blk? + * │ │ + * │ y + * │ │ + * │ v + * │ xfs_attr_leaf_try_add() + * │ │ + * │ v + * │ had enough ──────────────y─────────────┐ + * │ space? │ + * │ │ │ + * │ n │ + * │ │ │ + * │ v │ + * │ return -EAGAIN │ + * │ re-enter in │ + * │ node form │ + * │ │ │ + * └──────────┤ │ + * │ │ + * V │ + * xfs_attr_node_addname_find_attr │ + * determines if this │ + * is create or rename │ + * find space to store attr │ + * │ │ + * v │ + * xfs_attr_node_addname │ + * │ │ + * v │ + * fits in a node leaf? ────n─────┐ │ + * │ ^ v │ + * │ │ single leaf node? 
│ + * │ │ │ │ │ + * y │ y n │ + * │ │ │ │ │ + * v │ v v │ + * update │ grow the leaf split if │ + * hashvals └── return -EAGAIN needed │ + * │ retry leaf add │ │ + * │ on reentry │ │ + * ├────────────────────────────┘ │ + * │ │ + * v │ + * need to alloc │ + * ┌─y── or flip flag? │ + * │ │ │ + * │ n │ + * │ │ │ + * │ v │ + * │ done │ + * │ │ + * │ │ + * │ XFS_DAS_FOUND_LBLK <────────────────┘ + * │ │ + * │ V + * │ xfs_attr_leaf_addname() + * │ │ + * │ v + * │ ┌──first time through? + * │ │ │ + * │ │ y + * │ │ │ + * │ n v + * │ │ if we have rmt blks + * │ │ find space for them + * │ │ │ + * │ └──────────┤ + * │ │ + * │ v + * │ still have + * │ ┌─n─ blks to alloc? <──┐ + * │ │ │ │ + * │ │ y │ + * │ │ │ │ + * │ │ v │ + * │ │ alloc one blk │ + * │ │ return -EAGAIN ──┘ + * │ │ re-enter with one + * │ │ less blk to alloc + * │ │ + * │ │ + * │ └───> set the rmt + * │ value + * │ │ + * │ v + * │ was this + * │ a rename? ──n─┐ + * │ │ │ + * │ y │ + * │ │ │ + * │ v │ + * │ flip incomplete │ + * │ flag │ + * │ │ │ + * │ v │ + * │ XFS_DAS_FLIP_LFLAG │ + * │ │ │ + * │ v │ + * │ need to remove │ + * │ old bks? ──n──┤ + * │ │ │ + * │ y │ + * │ │ │ + * │ V │ + * │ remove │ + * │ ┌───> old blks │ + * │ │ │ │ + * │ XFS_DAS_RM_LBLK │ │ + * │ ^ │ │ + * │ │ v │ + * │ └──y── more to │ + * │ remove? │ + * │ │ │ + * │ n │ + * │ │ │ + * │ v │ + * │ XFS_DAS_RD_LEAF │ + * │ │ │ + * │ v │ + * │ remove leaf │ + * │ │ │ + * │ v │ + * │ shrink to sf │ + * │ if needed │ + * │ │ │ + * │ v │ + * │ done <──────┘ + * │ + * └──────> XFS_DAS_FOUND_NBLK + * │ + * v + * ┌─────n── need to + * │ alloc blks? + * │ │ + * │ y + * │ │ + * │ v + * │ find space + * │ │ + * │ v + * │ ┌─>XFS_DAS_ALLOC_NODE + * │ │ │ + * │ │ v + * │ │ alloc blk + * │ │ │ + * │ │ v + * │ └──y── need to alloc + * │ more blocks? + * │ │ + * │ n + * │ │ + * │ v + * │ set the rmt value + * │ │ + * │ v + * │ was this + * └────────> a rename? 
──n─┐ + * │ │ + * y │ + * │ │ + * v │ + * flip incomplete │ + * flag │ + * │ │ + * v │ + * XFS_DAS_FLIP_NFLAG │ + * │ │ + * v │ + * need to │ + * remove blks? ─n──┤ + * │ │ + * y │ + * │ │ + * v │ + * remove │ + * ┌────────> old blks │ + * │ │ │ + * XFS_DAS_RM_NBLK │ │ + * ^ │ │ + * │ v │ + * └──────y── more to │ + * remove │ + * │ │ + * n │ + * │ │ + * v │ + * XFS_DAS_CLR_FLAG │ + * │ │ + * v │ + * clear flags │ + * │ │ + * ├──────────┘ + * │ + * v + * done + */ + +/* + * Enum values for xfs_delattr_context.da_state + * + * These values are used by delayed attribute operations to keep track of where + * they were before they returned -EAGAIN. A return code of -EAGAIN signals the + * calling function to roll the transaction, and then call the subroutine to + * finish the operation. The enum is then used by the subroutine to jump back + * to where it was and resume executing where it left off. + */ +enum xfs_delattr_state { + XFS_DAS_UNINIT = 0, /* No state has been set yet */ + XFS_DAS_RMTBLK, /* Removing remote blks */ + XFS_DAS_RM_NAME, /* Remove attr name */ + XFS_DAS_RM_SHRINK, /* We are shrinking the tree */ + XFS_DAS_FOUND_LBLK, /* We found leaf blk for attr */ + XFS_DAS_FOUND_NBLK, /* We found node blk for attr */ + XFS_DAS_FLIP_LFLAG, /* Flipped leaf INCOMPLETE attr flag */ + XFS_DAS_RM_LBLK, /* A rename is removing leaf blocks */ + XFS_DAS_RD_LEAF, /* Read in the new leaf */ + XFS_DAS_ALLOC_NODE, /* We are allocating node blocks */ + XFS_DAS_FLIP_NFLAG, /* Flipped node INCOMPLETE attr flag */ + XFS_DAS_RM_NBLK, /* A rename is removing node blocks */ + XFS_DAS_CLR_FLAG, /* Clear incomplete flag */ +}; + +/* + * Defines for xfs_delattr_context.flags + */ +#define XFS_DAC_DEFER_FINISH 0x01 /* finish the transaction */ +#define XFS_DAC_LEAF_ADDNAME_INIT 0x02 /* xfs_attr_leaf_addname init*/ + +/* + * Context used for keeping track of delayed attribute operations + */ +struct xfs_delattr_context { + struct xfs_da_args *da_args; + + /* Used in 
xfs_attr_rmtval_set_blk to roll through allocating blocks */ + struct xfs_bmbt_irec map; + xfs_dablk_t lblkno; + int blkcnt; + + /* Used in xfs_attr_node_removename to roll through removing blocks */ + struct xfs_da_state *da_state; + + /* Used to keep track of current state of delayed operation */ + unsigned int flags; + enum xfs_delattr_state dela_state; +}; + /*======================================================================== * Function prototypes for the kernel. *========================================================================*/ @@ -92,6 +492,9 @@ int xfs_attr_set(struct xfs_da_args *args); int xfs_attr_set_args(struct xfs_da_args *args); int xfs_has_attr(struct xfs_da_args *args); int xfs_attr_remove_args(struct xfs_da_args *args); +int xfs_attr_remove_iter(struct xfs_delattr_context *dac); bool xfs_attr_namecheck(const void *name, size_t length); +void xfs_delattr_context_init(struct xfs_delattr_context *dac, + struct xfs_da_args *args); #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 556184b63061..b910bd209949 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -19,14 +19,15 @@ #include "xfs_bmap_btree.h" #include "xfs_bmap.h" #include "xfs_attr_sf.h" -#include "xfs_attr_remote.h" #include "xfs_attr.h" +#include "xfs_attr_remote.h" #include "xfs_attr_leaf.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_buf_item.h" #include "xfs_dir2.h" #include "xfs_log.h" +#include "xfs_ag.h" /* @@ -773,7 +774,7 @@ xfs_attr_fork_remove( * Remove an attribute from the shortform attribute list structure. 
*/ int -xfs_attr_shortform_remove( +xfs_attr_sf_removename( struct xfs_da_args *args) { struct xfs_attr_shortform *sf; diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 9b1c59f40a26..efa757f1e912 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -51,7 +51,7 @@ int xfs_attr_shortform_lookup(struct xfs_da_args *args); int xfs_attr_shortform_getvalue(struct xfs_da_args *args); int xfs_attr_shortform_to_leaf(struct xfs_da_args *args, struct xfs_buf **leaf_bp); -int xfs_attr_shortform_remove(struct xfs_da_args *args); +int xfs_attr_sf_removename(struct xfs_da_args *args); int xfs_attr_sf_findname(struct xfs_da_args *args, struct xfs_attr_sf_entry **sfep, unsigned int *basep); diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 48d8e9caf86f..0c8bee3abc3b 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -439,9 +439,9 @@ xfs_attr_rmtval_get( /* * Find a "hole" in the attribute address space large enough for us to drop the - * new attribute's value into + * new attributes value into */ -STATIC int +int xfs_attr_rmt_find_hole( struct xfs_da_args *args) { @@ -468,7 +468,7 @@ xfs_attr_rmt_find_hole( return 0; } -STATIC int +int xfs_attr_rmtval_set_value( struct xfs_da_args *args) { @@ -562,69 +562,66 @@ xfs_attr_rmtval_stale( } /* - * Write the value associated with an attribute into the out-of-line buffer - * that we have defined for it. + * Find a hole for the attr and store it in the delayed attr context. 
This + * initializes the context to roll through allocating an attr extent for a + * delayed attr operation */ int -xfs_attr_rmtval_set( - struct xfs_da_args *args) +xfs_attr_rmtval_find_space( + struct xfs_delattr_context *dac) { - struct xfs_inode *dp = args->dp; - struct xfs_bmbt_irec map; - xfs_dablk_t lblkno; - int blkcnt; - int nmap; - int error; + struct xfs_da_args *args = dac->da_args; + struct xfs_bmbt_irec *map = &dac->map; + int error; - trace_xfs_attr_rmtval_set(args); + dac->lblkno = 0; + dac->blkcnt = 0; + args->rmtblkcnt = 0; + args->rmtblkno = 0; + memset(map, 0, sizeof(struct xfs_bmbt_irec)); error = xfs_attr_rmt_find_hole(args); if (error) return error; - blkcnt = args->rmtblkcnt; - lblkno = (xfs_dablk_t)args->rmtblkno; - /* - * Roll through the "value", allocating blocks on disk as required. - */ - while (blkcnt > 0) { - /* - * Allocate a single extent, up to the size of the value. - * - * Note that we have to consider this a data allocation as we - * write the remote attribute without logging the contents. - * Hence we must ensure that we aren't using blocks that are on - * the busy list so that we don't overwrite blocks which have - * recently been freed but their transactions are not yet - * committed to disk. If we overwrite the contents of a busy - * extent and then crash then the block may not contain the - * correct metadata after log recovery occurs. - */ - nmap = 1; - error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, - blkcnt, XFS_BMAPI_ATTRFORK, args->total, &map, - &nmap); - if (error) - return error; - error = xfs_defer_finish(&args->trans); - if (error) - return error; + dac->blkcnt = args->rmtblkcnt; + dac->lblkno = args->rmtblkno; - ASSERT(nmap == 1); - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && - (map.br_startblock != HOLESTARTBLOCK)); - lblkno += map.br_blockcount; - blkcnt -= map.br_blockcount; + return 0; +} - /* - * Start the next trans in the chain. 
- */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) - return error; - } +/* + * Write one block of the value associated with an attribute into the + * out-of-line buffer that we have defined for it. This is similar to a subset + * of xfs_attr_rmtval_set, but records the current block to the delayed attr + * context, and leaves transaction handling to the caller. + */ +int +xfs_attr_rmtval_set_blk( + struct xfs_delattr_context *dac) +{ + struct xfs_da_args *args = dac->da_args; + struct xfs_inode *dp = args->dp; + struct xfs_bmbt_irec *map = &dac->map; + int nmap; + int error; + + nmap = 1; + error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)dac->lblkno, + dac->blkcnt, XFS_BMAPI_ATTRFORK, args->total, + map, &nmap); + if (error) + return error; + + ASSERT(nmap == 1); + ASSERT((map->br_startblock != DELAYSTARTBLOCK) && + (map->br_startblock != HOLESTARTBLOCK)); - return xfs_attr_rmtval_set_value(args); + /* roll attribute extent map forwards */ + dac->lblkno += map->br_blockcount; + dac->blkcnt -= map->br_blockcount; + + return 0; } /* @@ -669,47 +666,17 @@ xfs_attr_rmtval_invalidate( } /* - * Remove the value associated with an attribute by deleting the - * out-of-line buffer that it is stored on. - */ -int -xfs_attr_rmtval_remove( - struct xfs_da_args *args) -{ - int error; - int retval; - - trace_xfs_attr_rmtval_remove(args); - - /* - * Keep de-allocating extents until the remote-value region is gone. - */ - do { - retval = __xfs_attr_rmtval_remove(args); - if (retval && retval != -EAGAIN) - return retval; - - /* - * Close out trans and start the next one in the chain. - */ - error = xfs_trans_roll_inode(&args->trans, args->dp); - if (error) - return error; - } while (retval == -EAGAIN); - - return 0; -} - -/* * Remove the value associated with an attribute by deleting the out-of-line - * buffer that it is stored on. Returns EAGAIN for the caller to refresh the - * transaction and re-call the function + * buffer that it is stored on. 
Returns -EAGAIN for the caller to refresh the + * transaction and re-call the function. Callers should keep calling this + * routine until it returns something other than -EAGAIN. */ int __xfs_attr_rmtval_remove( - struct xfs_da_args *args) + struct xfs_delattr_context *dac) { - int error, done; + struct xfs_da_args *args = dac->da_args; + int error, done; /* * Unmap value blocks for this attr. @@ -719,12 +686,20 @@ __xfs_attr_rmtval_remove( if (error) return error; - error = xfs_defer_finish(&args->trans); - if (error) - return error; - - if (!done) + /* + * We don't need an explicit state here to pick up where we left off. We + * can figure it out using the !done return code. The actual value of + * attr->xattri_dela_state may be some value reminiscent of the calling + * function, but it's value is irrelevant with in the context of this + * function. Once we are done here, the next state is set as needed by + * the parent + */ + if (!done) { + dac->flags |= XFS_DAC_DEFER_FINISH; return -EAGAIN; + } - return error; + args->rmtblkno = 0; + args->rmtblkcnt = 0; + return 0; } diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index 9eee615da156..61b85b918db8 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -9,10 +9,12 @@ int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen); int xfs_attr_rmtval_get(struct xfs_da_args *args); -int xfs_attr_rmtval_set(struct xfs_da_args *args); -int xfs_attr_rmtval_remove(struct xfs_da_args *args); int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, xfs_buf_flags_t incore_flags); int xfs_attr_rmtval_invalidate(struct xfs_da_args *args); -int __xfs_attr_rmtval_remove(struct xfs_da_args *args); +int __xfs_attr_rmtval_remove(struct xfs_delattr_context *dac); +int xfs_attr_rmt_find_hole(struct xfs_da_args *args); +int xfs_attr_rmtval_set_value(struct xfs_da_args *args); +int xfs_attr_rmtval_set_blk(struct xfs_delattr_context *dac); +int 
xfs_attr_rmtval_find_space(struct xfs_delattr_context *dac); #endif /* __XFS_ATTR_REMOTE_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index a3e0e6f672d6..948092babb6a 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -31,6 +31,7 @@ #include "xfs_attr_leaf.h" #include "xfs_filestream.h" #include "xfs_rmap.h" +#include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_refcount.h" #include "xfs_icache.h" @@ -1028,7 +1029,7 @@ xfs_bmap_add_attrfork_local( /* * Set an inode attr fork offset based on the format of the data fork. */ -int +static int xfs_bmap_set_attrforkoff( struct xfs_inode *ip, int size, diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index f9a390ecfb1d..67641f669918 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -187,7 +187,6 @@ void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); unsigned int xfs_bmap_compute_attr_offset(struct xfs_mount *mp); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); -int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version); void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork); void __xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno, diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 5b6fcb9b44e2..be74a6b53689 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -21,6 +21,7 @@ #include "xfs_alloc.h" #include "xfs_log.h" #include "xfs_btree_staging.h" +#include "xfs_ag.h" /* * Cursor allocation zone. 
@@ -215,7 +216,7 @@ xfs_btree_check_sptr( { if (level <= 0) return false; - return xfs_verify_agbno(cur->bc_mp, cur->bc_ag.agno, agbno); + return xfs_verify_agbno(cur->bc_mp, cur->bc_ag.pag->pag_agno, agbno); } /* @@ -244,7 +245,7 @@ xfs_btree_check_ptr( return 0; xfs_err(cur->bc_mp, "AG %u: Corrupt btree %d pointer at level %d index %d.", - cur->bc_ag.agno, cur->bc_btnum, + cur->bc_ag.pag->pag_agno, cur->bc_btnum, level, index); } @@ -376,6 +377,8 @@ xfs_btree_del_cursor( XFS_FORCED_SHUTDOWN(cur->bc_mp)); if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) kmem_free(cur->bc_ops); + if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag) + xfs_perag_put(cur->bc_ag.pag); kmem_cache_free(xfs_btree_cur_zone, cur); } @@ -885,13 +888,13 @@ xfs_btree_readahead_sblock( if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.agno, + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno, left, 1, cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.agno, + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno, right, 1, cur->bc_ops->buf_ops); rval++; } @@ -949,7 +952,7 @@ xfs_btree_ptr_to_daddr( *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno); } else { agbno = be32_to_cpu(ptr->s); - *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.agno, + *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno, agbno); } @@ -1150,7 +1153,7 @@ xfs_btree_init_block_cur( if (cur->bc_flags & XFS_BTREE_LONG_PTRS) owner = cur->bc_ino.ip->i_ino; else - owner = cur->bc_ag.agno; + owner = cur->bc_ag.pag->pag_agno; xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, cur->bc_btnum, level, numrecs, diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 10e50cbacacf..4dbdc659c396 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -11,6 +11,7 @@ struct xfs_inode; struct xfs_mount; struct xfs_trans; struct 
xfs_ifork; +struct xfs_perag; extern kmem_zone_t *xfs_btree_cur_zone; @@ -180,11 +181,11 @@ union xfs_btree_irec { /* Per-AG btree information. */ struct xfs_btree_cur_ag { + struct xfs_perag *pag; union { struct xfs_buf *agbp; struct xbtree_afakeroot *afake; /* for staging cursor */ }; - xfs_agnumber_t agno; union { struct { unsigned long nr_ops; /* # record updates */ @@ -231,6 +232,13 @@ typedef struct xfs_btree_cur uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */ xfs_btnum_t bc_btnum; /* identifies which btree type */ int bc_statoff; /* offset of btre stats array */ + + /* + * Short btree pointers need an agno to be able to turn the pointers + * into physical addresses for IO, so the btree cursor switches between + * bc_ino and bc_ag based on whether XFS_BTREE_LONG_PTRS is set for the + * cursor. + */ union { struct xfs_btree_cur_ag bc_ag; struct xfs_btree_cur_ino bc_ino; diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 83ac9771bfb5..747ec77912c3 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -282,7 +282,7 @@ xfs_da3_node_read_verify( __this_address); break; } - /* fall through */ + fallthrough; case XFS_DA_NODE_MAGIC: fa = xfs_da3_node_verify(bp); if (fa) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index eefdb518fe64..57d9cb632983 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -10,7 +10,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_btree.h" @@ -27,6 +26,7 @@ #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_rmap.h" +#include "xfs_ag.h" /* * Lookup a record by ino in the btree given by cur. 
@@ -105,7 +105,7 @@ xfs_inobt_get_rec( int *stat) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_ag.agno; + xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno; union xfs_btree_rec *rec; int error; uint64_t realfree; @@ -172,18 +172,17 @@ xfs_inobt_insert( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, + struct xfs_perag *pag, xfs_agino_t newino, xfs_agino_t newlen, xfs_btnum_t btnum) { struct xfs_btree_cur *cur; - struct xfs_agi *agi = agbp->b_addr; - xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); xfs_agino_t thisino; int i; int error; - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum); + cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, btnum); for (thisino = newino; thisino < newino + newlen; @@ -215,10 +214,9 @@ xfs_inobt_insert( * Verify that the number of free inodes in the AGI is correct. */ #ifdef DEBUG -STATIC int +static int xfs_check_agi_freecount( - struct xfs_btree_cur *cur, - struct xfs_agi *agi) + struct xfs_btree_cur *cur) { if (cur->bc_nlevels == 1) { xfs_inobt_rec_incore_t rec; @@ -244,12 +242,12 @@ xfs_check_agi_freecount( } while (i == 1); if (!XFS_FORCED_SHUTDOWN(cur->bc_mp)) - ASSERT(freecount == be32_to_cpu(agi->agi_freecount)); + ASSERT(freecount == cur->bc_ag.pag->pagi_freecount); } return 0; } #else -#define xfs_check_agi_freecount(cur, agi) 0 +#define xfs_check_agi_freecount(cur) 0 #endif /* @@ -520,18 +518,17 @@ xfs_inobt_insert_sprec( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, + struct xfs_perag *pag, int btnum, struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. 
*/ bool merge) /* merge or replace */ { struct xfs_btree_cur *cur; - struct xfs_agi *agi = agbp->b_addr; - xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); int error; int i; struct xfs_inobt_rec_incore rec; - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum); + cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, btnum); /* the new record is pre-aligned so we know where to look */ error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i); @@ -578,14 +575,14 @@ xfs_inobt_insert_sprec( goto error; } - trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino, + trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino, rec.ir_holemask, nrec->ir_startino, nrec->ir_holemask); /* merge to nrec to output the updated record */ __xfs_inobt_rec_merge(nrec, &rec); - trace_xfs_irec_merge_post(mp, agno, nrec->ir_startino, + trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino, nrec->ir_holemask); error = xfs_inobt_rec_check_count(mp, nrec); @@ -606,28 +603,28 @@ error: } /* - * Allocate new inodes in the allocation group specified by agbp. - * Returns 0 if inodes were allocated in this AG; 1 if there was no space - * in this AG; or the usual negative error code. + * Allocate new inodes in the allocation group specified by agbp. Returns 0 if + * inodes were allocated in this AG; -EAGAIN if there was no space in this AG so + * the caller knows it can try another AG, a hard -ENOSPC when over the maximum + * inode count threshold, or the usual negative error code for other errors. */ STATIC int xfs_ialloc_ag_alloc( struct xfs_trans *tp, - struct xfs_buf *agbp) + struct xfs_buf *agbp, + struct xfs_perag *pag) { struct xfs_agi *agi; struct xfs_alloc_arg args; - xfs_agnumber_t agno; int error; xfs_agino_t newino; /* new first inode's number */ xfs_agino_t newlen; /* new number of inodes */ int isaligned = 0; /* inode allocation at stripe */ /* unit boundary */ /* init. 
to full chunk */ - uint16_t allocmask = (uint16_t) -1; struct xfs_inobt_rec_incore rec; - struct xfs_perag *pag; struct xfs_ino_geometry *igeo = M_IGEO(tp->t_mountp); + uint16_t allocmask = (uint16_t) -1; int do_sparse = 0; memset(&args, 0, sizeof(args)); @@ -660,14 +657,13 @@ xfs_ialloc_ag_alloc( */ agi = agbp->b_addr; newino = be32_to_cpu(agi->agi_newino); - agno = be32_to_cpu(agi->agi_seqno); args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + igeo->ialloc_blks; if (do_sparse) goto sparse_alloc; if (likely(newino != NULLAGINO && (args.agbno < be32_to_cpu(agi->agi_length)))) { - args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno); args.type = XFS_ALLOCTYPE_THIS_BNO; args.prod = 1; @@ -727,7 +723,7 @@ xfs_ialloc_ag_alloc( * For now, just allocate blocks up front. */ args.agbno = be32_to_cpu(agi->agi_root); - args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno); /* * Allocate a fixed-size extent of inodes. 
*/ @@ -748,7 +744,7 @@ xfs_ialloc_ag_alloc( if (isaligned && args.fsbno == NULLFSBLOCK) { args.type = XFS_ALLOCTYPE_NEAR_BNO; args.agbno = be32_to_cpu(agi->agi_root); - args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno); args.alignment = igeo->cluster_align; if ((error = xfs_alloc_vextent(&args))) return error; @@ -764,7 +760,7 @@ xfs_ialloc_ag_alloc( sparse_alloc: args.type = XFS_ALLOCTYPE_NEAR_BNO; args.agbno = be32_to_cpu(agi->agi_root); - args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + args.fsbno = XFS_AGB_TO_FSB(args.mp, pag->pag_agno, args.agbno); args.alignment = args.mp->m_sb.sb_spino_align; args.prod = 1; @@ -796,7 +792,7 @@ sparse_alloc: } if (args.fsbno == NULLFSBLOCK) - return 1; + return -EAGAIN; ASSERT(args.len == args.minlen); @@ -809,7 +805,7 @@ sparse_alloc: * rather than a linear progression to prevent the next generation * number from being easily guessable. */ - error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, agno, + error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag->pag_agno, args.agbno, args.len, prandom_u32()); if (error) @@ -836,12 +832,12 @@ sparse_alloc: * if necessary. If a merge does occur, rec is updated to the * merged record. */ - error = xfs_inobt_insert_sprec(args.mp, tp, agbp, XFS_BTNUM_INO, - &rec, true); + error = xfs_inobt_insert_sprec(args.mp, tp, agbp, pag, + XFS_BTNUM_INO, &rec, true); if (error == -EFSCORRUPTED) { xfs_alert(args.mp, "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u", - XFS_AGINO_TO_INO(args.mp, agno, + XFS_AGINO_TO_INO(args.mp, pag->pag_agno, rec.ir_startino), rec.ir_holemask, rec.ir_count); xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE); @@ -861,21 +857,20 @@ sparse_alloc: * existing record with this one. 
*/ if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { - error = xfs_inobt_insert_sprec(args.mp, tp, agbp, - XFS_BTNUM_FINO, &rec, - false); + error = xfs_inobt_insert_sprec(args.mp, tp, agbp, pag, + XFS_BTNUM_FINO, &rec, false); if (error) return error; } } else { /* full chunk - insert new records to both btrees */ - error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, + error = xfs_inobt_insert(args.mp, tp, agbp, pag, newino, newlen, XFS_BTNUM_INO); if (error) return error; if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { - error = xfs_inobt_insert(args.mp, tp, agbp, newino, + error = xfs_inobt_insert(args.mp, tp, agbp, pag, newino, newlen, XFS_BTNUM_FINO); if (error) return error; @@ -887,7 +882,6 @@ sparse_alloc: */ be32_add_cpu(&agi->agi_count, newlen); be32_add_cpu(&agi->agi_freecount, newlen); - pag = agbp->b_pag; pag->pagi_freecount += newlen; pag->pagi_count += newlen; agi->agi_newino = cpu_to_be32(newino); @@ -905,139 +899,6 @@ sparse_alloc: return 0; } -STATIC xfs_agnumber_t -xfs_ialloc_next_ag( - xfs_mount_t *mp) -{ - xfs_agnumber_t agno; - - spin_lock(&mp->m_agirotor_lock); - agno = mp->m_agirotor; - if (++mp->m_agirotor >= mp->m_maxagi) - mp->m_agirotor = 0; - spin_unlock(&mp->m_agirotor_lock); - - return agno; -} - -/* - * Select an allocation group to look for a free inode in, based on the parent - * inode and the mode. Return the allocation group buffer. 
- */ -STATIC xfs_agnumber_t -xfs_ialloc_ag_select( - xfs_trans_t *tp, /* transaction pointer */ - xfs_ino_t parent, /* parent directory inode number */ - umode_t mode) /* bits set to indicate file type */ -{ - xfs_agnumber_t agcount; /* number of ag's in the filesystem */ - xfs_agnumber_t agno; /* current ag number */ - int flags; /* alloc buffer locking flags */ - xfs_extlen_t ineed; /* blocks needed for inode allocation */ - xfs_extlen_t longest = 0; /* longest extent available */ - xfs_mount_t *mp; /* mount point structure */ - int needspace; /* file mode implies space allocated */ - xfs_perag_t *pag; /* per allocation group data */ - xfs_agnumber_t pagno; /* parent (starting) ag number */ - int error; - - /* - * Files of these types need at least one block if length > 0 - * (and they won't fit in the inode, but that's hard to figure out). - */ - needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode); - mp = tp->t_mountp; - agcount = mp->m_maxagi; - if (S_ISDIR(mode)) - pagno = xfs_ialloc_next_ag(mp); - else { - pagno = XFS_INO_TO_AGNO(mp, parent); - if (pagno >= agcount) - pagno = 0; - } - - ASSERT(pagno < agcount); - - /* - * Loop through allocation groups, looking for one with a little - * free space in it. Note we don't look for free inodes, exactly. - * Instead, we include whether there is a need to allocate inodes - * to mean that blocks must be allocated for them, - * if none are currently free. 
- */ - agno = pagno; - flags = XFS_ALLOC_FLAG_TRYLOCK; - for (;;) { - pag = xfs_perag_get(mp, agno); - if (!pag->pagi_inodeok) { - xfs_ialloc_next_ag(mp); - goto nextag; - } - - if (!pag->pagi_init) { - error = xfs_ialloc_pagi_init(mp, tp, agno); - if (error) - goto nextag; - } - - if (pag->pagi_freecount) { - xfs_perag_put(pag); - return agno; - } - - if (!pag->pagf_init) { - error = xfs_alloc_pagf_init(mp, tp, agno, flags); - if (error) - goto nextag; - } - - /* - * Check that there is enough free space for the file plus a - * chunk of inodes if we need to allocate some. If this is the - * first pass across the AGs, take into account the potential - * space needed for alignment of inode chunks when checking the - * longest contiguous free space in the AG - this prevents us - * from getting ENOSPC because we have free space larger than - * ialloc_blks but alignment constraints prevent us from using - * it. - * - * If we can't find an AG with space for full alignment slack to - * be taken into account, we must be near ENOSPC in all AGs. - * Hence we don't include alignment for the second pass and so - * if we fail allocation due to alignment issues then it is most - * likely a real ENOSPC condition. - */ - ineed = M_IGEO(mp)->ialloc_min_blks; - if (flags && ineed > 1) - ineed += M_IGEO(mp)->cluster_align; - longest = pag->pagf_longest; - if (!longest) - longest = pag->pagf_flcount > 0; - - if (pag->pagf_freeblks >= needspace + ineed && - longest >= ineed) { - xfs_perag_put(pag); - return agno; - } -nextag: - xfs_perag_put(pag); - /* - * No point in iterating over the rest, if we're shutting - * down. - */ - if (XFS_FORCED_SHUTDOWN(mp)) - return NULLAGNUMBER; - agno++; - if (agno >= agcount) - agno = 0; - if (agno == pagno) { - if (flags == 0) - return NULLAGNUMBER; - flags = 0; - } - } -} - /* * Try to retrieve the next record to the left/right from the current one. 
*/ @@ -1123,15 +984,14 @@ STATIC int xfs_dialloc_ag_inobt( struct xfs_trans *tp, struct xfs_buf *agbp, + struct xfs_perag *pag, xfs_ino_t parent, xfs_ino_t *inop) { struct xfs_mount *mp = tp->t_mountp; struct xfs_agi *agi = agbp->b_addr; - xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); - struct xfs_perag *pag = agbp->b_pag; struct xfs_btree_cur *cur, *tcur; struct xfs_inobt_rec_incore rec, trec; xfs_ino_t ino; @@ -1145,7 +1005,7 @@ xfs_dialloc_ag_inobt( ASSERT(pag->pagi_freecount > 0); restart_pagno: - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO); /* * If pagino is 0 (this is the root inode allocation) use newino. * This must work because we've just allocated some. @@ -1153,14 +1013,14 @@ xfs_dialloc_ag_inobt( if (!pagino) pagino = be32_to_cpu(agi->agi_newino); - error = xfs_check_agi_freecount(cur, agi); + error = xfs_check_agi_freecount(cur); if (error) goto error0; /* * If in the same AG as the parent, try to get near the parent. 
*/ - if (pagno == agno) { + if (pagno == pag->pag_agno) { int doneleft; /* done, to the left */ int doneright; /* done, to the right */ @@ -1363,7 +1223,7 @@ alloc_inode: ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); - ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); + ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset); rec.ir_free &= ~XFS_INOBT_MASK(offset); rec.ir_freecount--; error = xfs_inobt_update(cur, &rec); @@ -1373,7 +1233,7 @@ alloc_inode: xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); pag->pagi_freecount--; - error = xfs_check_agi_freecount(cur, agi); + error = xfs_check_agi_freecount(cur); if (error) goto error0; @@ -1568,16 +1428,16 @@ xfs_dialloc_ag_update_inobt( * The caller selected an AG for us, and made sure that free inodes are * available. */ -int +static int xfs_dialloc_ag( struct xfs_trans *tp, struct xfs_buf *agbp, + struct xfs_perag *pag, xfs_ino_t parent, xfs_ino_t *inop) { struct xfs_mount *mp = tp->t_mountp; struct xfs_agi *agi = agbp->b_addr; - xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); struct xfs_btree_cur *cur; /* finobt cursor */ @@ -1589,7 +1449,7 @@ xfs_dialloc_ag( int i; if (!xfs_sb_version_hasfinobt(&mp->m_sb)) - return xfs_dialloc_ag_inobt(tp, agbp, parent, inop); + return xfs_dialloc_ag_inobt(tp, agbp, pag, parent, inop); /* * If pagino is 0 (this is the root inode allocation) use newino. @@ -1598,9 +1458,9 @@ xfs_dialloc_ag( if (!pagino) pagino = be32_to_cpu(agi->agi_newino); - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO); + cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_FINO); - error = xfs_check_agi_freecount(cur, agi); + error = xfs_check_agi_freecount(cur); if (error) goto error_cur; @@ -1609,7 +1469,7 @@ xfs_dialloc_ag( * parent. If so, find the closest available inode to the parent. 
If * not, consider the agi hint or find the first free inode in the AG. */ - if (agno == pagno) + if (pag->pag_agno == pagno) error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec); else error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec); @@ -1621,7 +1481,7 @@ xfs_dialloc_ag( ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); - ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); + ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, rec.ir_startino + offset); /* * Modify or remove the finobt record. @@ -1641,9 +1501,9 @@ xfs_dialloc_ag( * the original freecount. If all is well, make the equivalent update to * the inobt using the finobt record and offset information. */ - icur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); + icur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO); - error = xfs_check_agi_freecount(icur, agi); + error = xfs_check_agi_freecount(icur); if (error) goto error_icur; @@ -1657,14 +1517,14 @@ xfs_dialloc_ag( */ be32_add_cpu(&agi->agi_freecount, -1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); - agbp->b_pag->pagi_freecount--; + pag->pagi_freecount--; xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); - error = xfs_check_agi_freecount(icur, agi); + error = xfs_check_agi_freecount(icur); if (error) goto error_icur; - error = xfs_check_agi_freecount(cur, agi); + error = xfs_check_agi_freecount(cur); if (error) goto error_icur; @@ -1708,54 +1568,195 @@ xfs_dialloc_roll( /* Re-attach the quota info that we detached from prev trx. */ tp->t_dqinfo = dqinfo; + /* + * Join the buffer even on commit error so that the buffer is released + * when the caller cancels the transaction and doesn't have to handle + * this error case specially. 
+ */ + xfs_trans_bjoin(tp, agibp); *tpp = tp; + return error; +} + +static xfs_agnumber_t +xfs_ialloc_next_ag( + xfs_mount_t *mp) +{ + xfs_agnumber_t agno; + + spin_lock(&mp->m_agirotor_lock); + agno = mp->m_agirotor; + if (++mp->m_agirotor >= mp->m_maxagi) + mp->m_agirotor = 0; + spin_unlock(&mp->m_agirotor_lock); + + return agno; +} + +static bool +xfs_dialloc_good_ag( + struct xfs_trans *tp, + struct xfs_perag *pag, + umode_t mode, + int flags, + bool ok_alloc) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_extlen_t ineed; + xfs_extlen_t longest = 0; + int needspace; + int error; + + if (!pag->pagi_inodeok) + return false; + + if (!pag->pagi_init) { + error = xfs_ialloc_pagi_init(mp, tp, pag->pag_agno); + if (error) + return false; + } + + if (pag->pagi_freecount) + return true; + if (!ok_alloc) + return false; + + if (!pag->pagf_init) { + error = xfs_alloc_pagf_init(mp, tp, pag->pag_agno, flags); + if (error) + return false; + } + + /* + * Check that there is enough free space for the file plus a chunk of + * inodes if we need to allocate some. If this is the first pass across + * the AGs, take into account the potential space needed for alignment + * of inode chunks when checking the longest contiguous free space in + * the AG - this prevents us from getting ENOSPC because we have free + * space larger than ialloc_blks but alignment constraints prevent us + * from using it. + * + * If we can't find an AG with space for full alignment slack to be + * taken into account, we must be near ENOSPC in all AGs. Hence we + * don't include alignment for the second pass and so if we fail + * allocation due to alignment issues then it is most likely a real + * ENOSPC condition. + * + * XXX(dgc): this calculation is now bogus thanks to the per-ag + * reservations that xfs_alloc_fix_freelist() now does via + * xfs_alloc_space_available(). 
When the AG fills up, pagf_freeblks will + * be more than large enough for the check below to succeed, but + * xfs_alloc_space_available() will fail because of the non-zero + * metadata reservation and hence we won't actually be able to allocate + * more inodes in this AG. We do soooo much unnecessary work near ENOSPC + * because of this. + */ + ineed = M_IGEO(mp)->ialloc_min_blks; + if (flags && ineed > 1) + ineed += M_IGEO(mp)->cluster_align; + longest = pag->pagf_longest; + if (!longest) + longest = pag->pagf_flcount > 0; + needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode); + + if (pag->pagf_freeblks < needspace + ineed || longest < ineed) + return false; + return true; +} + +static int +xfs_dialloc_try_ag( + struct xfs_trans **tpp, + struct xfs_perag *pag, + xfs_ino_t parent, + xfs_ino_t *new_ino, + bool ok_alloc) +{ + struct xfs_buf *agbp; + xfs_ino_t ino; + int error; + + /* + * Then read in the AGI buffer and recheck with the AGI buffer + * lock held. + */ + error = xfs_ialloc_read_agi(pag->pag_mount, *tpp, pag->pag_agno, &agbp); if (error) return error; - xfs_trans_bjoin(tp, agibp); - return 0; + + if (!pag->pagi_freecount) { + if (!ok_alloc) { + error = -EAGAIN; + goto out_release; + } + + error = xfs_ialloc_ag_alloc(*tpp, agbp, pag); + if (error < 0) + goto out_release; + + /* + * We successfully allocated space for an inode cluster in this + * AG. Roll the transaction so that we can allocate one of the + * new inodes. + */ + ASSERT(pag->pagi_freecount > 0); + error = xfs_dialloc_roll(tpp, agbp); + if (error) + goto out_release; + } + + /* Allocate an inode in the found AG */ + error = xfs_dialloc_ag(*tpp, agbp, pag, parent, &ino); + if (!error) + *new_ino = ino; + return error; + +out_release: + xfs_trans_brelse(*tpp, agbp); + return error; } /* - * Select and prepare an AG for inode allocation. + * Allocate an on-disk inode. * * Mode is used to tell whether the new inode is a directory and hence where to - * locate it. 
- * - * This function will ensure that the selected AG has free inodes available to - * allocate from. The selected AGI will be returned locked to the caller, and it - * will allocate more free inodes if required. If no free inodes are found or - * can be allocated, no AGI will be returned. + * locate it. The on-disk inode that is allocated will be returned in @new_ino + * on success, otherwise an error will be set to indicate the failure (e.g. + * -ENOSPC). */ int -xfs_dialloc_select_ag( +xfs_dialloc( struct xfs_trans **tpp, xfs_ino_t parent, umode_t mode, - struct xfs_buf **IO_agbp) + xfs_ino_t *new_ino) { struct xfs_mount *mp = (*tpp)->t_mountp; - struct xfs_buf *agbp; xfs_agnumber_t agno; - int error; - bool noroom = false; + int error = 0; xfs_agnumber_t start_agno; struct xfs_perag *pag; struct xfs_ino_geometry *igeo = M_IGEO(mp); - bool okalloc = true; - - *IO_agbp = NULL; + bool ok_alloc = true; + int flags; + xfs_ino_t ino; /* - * We do not have an agbp, so select an initial allocation - * group for inode allocation. + * Directories, symlinks, and regular files frequently allocate at least + * one block, so factor that potential expansion when we examine whether + * an AG has enough space for file creation. */ - start_agno = xfs_ialloc_ag_select(*tpp, parent, mode); - if (start_agno == NULLAGNUMBER) - return 0; + if (S_ISDIR(mode)) + start_agno = xfs_ialloc_next_ag(mp); + else { + start_agno = XFS_INO_TO_AGNO(mp, parent); + if (start_agno >= mp->m_maxagi) + start_agno = 0; + } /* * If we have already hit the ceiling of inode blocks then clear - * okalloc so we scan all available agi structures for a free + * ok_alloc so we scan all available agi structures for a free * inode. 
* * Read rough value of mp->m_icount by percpu_counter_read_positive, @@ -1764,8 +1765,7 @@ xfs_dialloc_select_ag( if (igeo->maxicount && percpu_counter_read_positive(&mp->m_icount) + igeo->ialloc_inos > igeo->maxicount) { - noroom = true; - okalloc = false; + ok_alloc = false; } /* @@ -1774,82 +1774,36 @@ xfs_dialloc_select_ag( * allocation groups upward, wrapping at the end. */ agno = start_agno; + flags = XFS_ALLOC_FLAG_TRYLOCK; for (;;) { pag = xfs_perag_get(mp, agno); - if (!pag->pagi_inodeok) { - xfs_ialloc_next_ag(mp); - goto nextag; - } - - if (!pag->pagi_init) { - error = xfs_ialloc_pagi_init(mp, *tpp, agno); - if (error) + if (xfs_dialloc_good_ag(*tpp, pag, mode, flags, ok_alloc)) { + error = xfs_dialloc_try_ag(tpp, pag, parent, + &ino, ok_alloc); + if (error != -EAGAIN) break; } - /* - * Do a first racy fast path check if this AG is usable. - */ - if (!pag->pagi_freecount && !okalloc) - goto nextag; - - /* - * Then read in the AGI buffer and recheck with the AGI buffer - * lock held. - */ - error = xfs_ialloc_read_agi(mp, *tpp, agno, &agbp); - if (error) - break; - - if (pag->pagi_freecount) { - xfs_perag_put(pag); - goto found_ag; - } - - if (!okalloc) - goto nextag_relse_buffer; - - error = xfs_ialloc_ag_alloc(*tpp, agbp); - if (error < 0) { - xfs_trans_brelse(*tpp, agbp); - - if (error == -ENOSPC) - error = 0; + if (XFS_FORCED_SHUTDOWN(mp)) { + error = -EFSCORRUPTED; break; } - - if (error == 0) { - /* - * We successfully allocated space for an inode cluster - * in this AG. Roll the transaction so that we can - * allocate one of the new inodes. 
- */ - ASSERT(pag->pagi_freecount > 0); - xfs_perag_put(pag); - - error = xfs_dialloc_roll(tpp, agbp); - if (error) { - xfs_buf_relse(agbp); - return error; + if (++agno == mp->m_maxagi) + agno = 0; + if (agno == start_agno) { + if (!flags) { + error = -ENOSPC; + break; } - goto found_ag; + flags = 0; } - -nextag_relse_buffer: - xfs_trans_brelse(*tpp, agbp); -nextag: xfs_perag_put(pag); - if (++agno == mp->m_sb.sb_agcount) - agno = 0; - if (agno == start_agno) - return noroom ? -ENOSPC : 0; } + if (!error) + *new_ino = ino; xfs_perag_put(pag); return error; -found_ag: - *IO_agbp = agbp; - return 0; } /* @@ -1935,12 +1889,12 @@ xfs_difree_inobt( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, + struct xfs_perag *pag, xfs_agino_t agino, struct xfs_icluster *xic, struct xfs_inobt_rec_incore *orec) { struct xfs_agi *agi = agbp->b_addr; - xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; int ilen; @@ -1954,9 +1908,9 @@ xfs_difree_inobt( /* * Initialize the cursor. 
*/ - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO); - error = xfs_check_agi_freecount(cur, agi); + error = xfs_check_agi_freecount(cur); if (error) goto error0; @@ -2005,7 +1959,8 @@ xfs_difree_inobt( struct xfs_perag *pag = agbp->b_pag; xic->deleted = true; - xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); + xic->first_ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, + rec.ir_startino); xic->alloc = xfs_inobt_irec_to_allocmask(&rec); /* @@ -2028,7 +1983,7 @@ xfs_difree_inobt( goto error0; } - xfs_difree_inode_chunk(tp, agno, &rec); + xfs_difree_inode_chunk(tp, pag->pag_agno, &rec); } else { xic->deleted = false; @@ -2044,11 +1999,11 @@ xfs_difree_inobt( */ be32_add_cpu(&agi->agi_freecount, 1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); - agbp->b_pag->pagi_freecount++; + pag->pagi_freecount++; xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); } - error = xfs_check_agi_freecount(cur, agi); + error = xfs_check_agi_freecount(cur); if (error) goto error0; @@ -2069,18 +2024,17 @@ xfs_difree_finobt( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, + struct xfs_perag *pag, xfs_agino_t agino, struct xfs_inobt_rec_incore *ibtrec) /* inobt record */ { - struct xfs_agi *agi = agbp->b_addr; - xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; int offset = agino - ibtrec->ir_startino; int error; int i; - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO); + cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_FINO); error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i); if (error) @@ -2158,7 +2112,7 @@ xfs_difree_finobt( } out: - error = xfs_check_agi_freecount(cur, agi); + error = xfs_check_agi_freecount(cur); if (error) goto error; @@ -2178,36 +2132,33 @@ error: */ int xfs_difree( - struct xfs_trans *tp, /* transaction pointer */ - xfs_ino_t inode, /* inode to be freed */ - 
struct xfs_icluster *xic) /* cluster info if deleted */ + struct xfs_trans *tp, + struct xfs_perag *pag, + xfs_ino_t inode, + struct xfs_icluster *xic) { /* REFERENCED */ xfs_agblock_t agbno; /* block number containing inode */ struct xfs_buf *agbp; /* buffer for allocation group header */ xfs_agino_t agino; /* allocation group inode number */ - xfs_agnumber_t agno; /* allocation group number */ int error; /* error return value */ - struct xfs_mount *mp; /* mount structure for filesystem */ + struct xfs_mount *mp = tp->t_mountp; struct xfs_inobt_rec_incore rec;/* btree record */ - mp = tp->t_mountp; - /* * Break up inode number into its components. */ - agno = XFS_INO_TO_AGNO(mp, inode); - if (agno >= mp->m_sb.sb_agcount) { - xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).", - __func__, agno, mp->m_sb.sb_agcount); + if (pag->pag_agno != XFS_INO_TO_AGNO(mp, inode)) { + xfs_warn(mp, "%s: agno != pag->pag_agno (%d != %d).", + __func__, XFS_INO_TO_AGNO(mp, inode), pag->pag_agno); ASSERT(0); return -EINVAL; } agino = XFS_INO_TO_AGINO(mp, inode); - if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) { + if (inode != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).", __func__, (unsigned long long)inode, - (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino)); + (unsigned long long)XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)); ASSERT(0); return -EINVAL; } @@ -2221,7 +2172,7 @@ xfs_difree( /* * Get the allocation group header. */ - error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, &agbp); if (error) { xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.", __func__, error); @@ -2231,7 +2182,7 @@ xfs_difree( /* * Fix up the inode allocation btree. 
*/ - error = xfs_difree_inobt(mp, tp, agbp, agino, xic, &rec); + error = xfs_difree_inobt(mp, tp, agbp, pag, agino, xic, &rec); if (error) goto error0; @@ -2239,7 +2190,7 @@ xfs_difree( * Fix up the free inode btree. */ if (xfs_sb_version_hasfinobt(&mp->m_sb)) { - error = xfs_difree_finobt(mp, tp, agbp, agino, &rec); + error = xfs_difree_finobt(mp, tp, agbp, pag, agino, &rec); if (error) goto error0; } @@ -2254,7 +2205,7 @@ STATIC int xfs_imap_lookup( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_agino_t agino, xfs_agblock_t agbno, xfs_agblock_t *chunk_agbno, @@ -2267,11 +2218,11 @@ xfs_imap_lookup( int error; int i; - error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, &agbp); if (error) { xfs_alert(mp, "%s: xfs_ialloc_read_agi() returned error %d, agno %d", - __func__, error, agno); + __func__, error, pag->pag_agno); return error; } @@ -2281,7 +2232,7 @@ xfs_imap_lookup( * we have a record, we need to ensure it contains the inode number * we are looking up. 
*/ - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_INO); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); if (!error) { if (i) @@ -2315,42 +2266,44 @@ xfs_imap_lookup( */ int xfs_imap( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_ino_t ino, /* inode to locate */ - struct xfs_imap *imap, /* location map structure */ - uint flags) /* flags for inode btree lookup */ + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_ino_t ino, /* inode to locate */ + struct xfs_imap *imap, /* location map structure */ + uint flags) /* flags for inode btree lookup */ { - xfs_agblock_t agbno; /* block number of inode in the alloc group */ - xfs_agino_t agino; /* inode number within alloc group */ - xfs_agnumber_t agno; /* allocation group number */ - xfs_agblock_t chunk_agbno; /* first block in inode chunk */ - xfs_agblock_t cluster_agbno; /* first block in inode cluster */ - int error; /* error code */ - int offset; /* index of inode in its buffer */ - xfs_agblock_t offset_agbno; /* blks from chunk start to inode */ + xfs_agblock_t agbno; /* block number of inode in the alloc group */ + xfs_agino_t agino; /* inode number within alloc group */ + xfs_agblock_t chunk_agbno; /* first block in inode chunk */ + xfs_agblock_t cluster_agbno; /* first block in inode cluster */ + int error; /* error code */ + int offset; /* index of inode in its buffer */ + xfs_agblock_t offset_agbno; /* blks from chunk start to inode */ + struct xfs_perag *pag; ASSERT(ino != NULLFSINO); /* * Split up the inode number into its parts. 
*/ - agno = XFS_INO_TO_AGNO(mp, ino); + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); agino = XFS_INO_TO_AGINO(mp, ino); agbno = XFS_AGINO_TO_AGBNO(mp, agino); - if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks || - ino != XFS_AGINO_TO_INO(mp, agno, agino)) { + if (!pag || agbno >= mp->m_sb.sb_agblocks || + ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { + error = -EINVAL; #ifdef DEBUG /* * Don't output diagnostic information for untrusted inodes * as they can be invalid without implying corruption. */ if (flags & XFS_IGET_UNTRUSTED) - return -EINVAL; - if (agno >= mp->m_sb.sb_agcount) { + goto out_drop; + if (!pag) { xfs_alert(mp, "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)", - __func__, agno, mp->m_sb.sb_agcount); + __func__, XFS_INO_TO_AGNO(mp, ino), + mp->m_sb.sb_agcount); } if (agbno >= mp->m_sb.sb_agblocks) { xfs_alert(mp, @@ -2358,15 +2311,15 @@ xfs_imap( __func__, (unsigned long long)agbno, (unsigned long)mp->m_sb.sb_agblocks); } - if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) { + if (pag && ino != XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)) { xfs_alert(mp, "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)", __func__, ino, - XFS_AGINO_TO_INO(mp, agno, agino)); + XFS_AGINO_TO_INO(mp, pag->pag_agno, agino)); } xfs_stack_trace(); #endif /* DEBUG */ - return -EINVAL; + goto out_drop; } /* @@ -2377,10 +2330,10 @@ xfs_imap( * in all cases where an untrusted inode number is passed. 
*/ if (flags & XFS_IGET_UNTRUSTED) { - error = xfs_imap_lookup(mp, tp, agno, agino, agbno, + error = xfs_imap_lookup(mp, tp, pag, agino, agbno, &chunk_agbno, &offset_agbno, flags); if (error) - return error; + goto out_drop; goto out_map; } @@ -2392,11 +2345,12 @@ xfs_imap( offset = XFS_INO_TO_OFFSET(mp, ino); ASSERT(offset < mp->m_sb.sb_inopblock); - imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno); + imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, agbno); imap->im_len = XFS_FSB_TO_BB(mp, 1); imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); - return 0; + error = 0; + goto out_drop; } /* @@ -2408,10 +2362,10 @@ xfs_imap( offset_agbno = agbno & M_IGEO(mp)->inoalign_mask; chunk_agbno = agbno - offset_agbno; } else { - error = xfs_imap_lookup(mp, tp, agno, agino, agbno, + error = xfs_imap_lookup(mp, tp, pag, agino, agbno, &chunk_agbno, &offset_agbno, flags); if (error) - return error; + goto out_drop; } out_map: @@ -2422,7 +2376,7 @@ out_map: offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + XFS_INO_TO_OFFSET(mp, ino); - imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno); + imap->im_blkno = XFS_AGB_TO_DADDR(mp, pag->pag_agno, cluster_agbno); imap->im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster); imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); @@ -2439,9 +2393,14 @@ out_map: __func__, (unsigned long long) imap->im_blkno, (unsigned long long) imap->im_len, XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); - return -EINVAL; + error = -EINVAL; + goto out_drop; } - return 0; + error = 0; +out_drop: + if (pag) + xfs_perag_put(pag); + return error; } /* diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 3511086a7ae1..9df7c80408ff 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -33,42 +33,14 @@ xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o) } /* - * Allocate an inode on disk. 
- * Mode is used to tell whether the new inode will need space, and whether - * it is a directory. - * - * There are two phases to inode allocation: selecting an AG and ensuring - * that it contains free inodes, followed by allocating one of the free - * inodes. xfs_dialloc_select_ag() does the former and returns a locked AGI - * to the caller, ensuring that followup call to xfs_dialloc_ag() will - * have free inodes to allocate from. xfs_dialloc_ag() will return the inode - * number of the free inode we allocated. + * Allocate an inode on disk. Mode is used to tell whether the new inode will + * need space, and whether it is a directory. */ -int /* error */ -xfs_dialloc_select_ag( - struct xfs_trans **tpp, /* double pointer of transaction */ - xfs_ino_t parent, /* parent inode (directory) */ - umode_t mode, /* mode bits for new inode */ - struct xfs_buf **IO_agbp); - -int -xfs_dialloc_ag( - struct xfs_trans *tp, - struct xfs_buf *agbp, - xfs_ino_t parent, - xfs_ino_t *inop); +int xfs_dialloc(struct xfs_trans **tpp, xfs_ino_t parent, umode_t mode, + xfs_ino_t *new_ino); -/* - * Free disk inode. Carefully avoids touching the incore inode, all - * manipulations incore are the caller's responsibility. - * The on-disk inode is not changed by this operation, only the - * btree (free inode mask) is changed. - */ -int /* error */ -xfs_difree( - struct xfs_trans *tp, /* transaction pointer */ - xfs_ino_t inode, /* inode to be freed */ - struct xfs_icluster *ifree); /* cluster info if deleted */ +int xfs_difree(struct xfs_trans *tp, struct xfs_perag *pag, + xfs_ino_t ino, struct xfs_icluster *ifree); /* * Return the location of the inode in imap, for mapping it into a buffer. 
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 4c5831646bd9..823a038939f8 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -20,6 +20,7 @@ #include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_rmap.h" +#include "xfs_ag.h" STATIC int xfs_inobt_get_minrecs( @@ -34,8 +35,7 @@ xfs_inobt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_ag.agbp, cur->bc_ag.agno, - cur->bc_btnum); + cur->bc_ag.agbp, cur->bc_ag.pag, cur->bc_btnum); } STATIC void @@ -102,7 +102,7 @@ __xfs_inobt_alloc_block( args.tp = cur->bc_tp; args.mp = cur->bc_mp; args.oinfo = XFS_RMAP_OINFO_INOBT; - args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_ag.agno, sbno); + args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_ag.pag->pag_agno, sbno); args.minlen = 1; args.maxlen = 1; args.prod = 1; @@ -235,7 +235,7 @@ xfs_inobt_init_ptr_from_cur( { struct xfs_agi *agi = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.agno == be32_to_cpu(agi->agi_seqno)); + ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agi->agi_seqno)); ptr->s = agi->agi_root; } @@ -247,7 +247,7 @@ xfs_finobt_init_ptr_from_cur( { struct xfs_agi *agi = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.agno == be32_to_cpu(agi->agi_seqno)); + ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agi->agi_seqno)); ptr->s = agi->agi_free_root; } @@ -427,7 +427,7 @@ static struct xfs_btree_cur * xfs_inobt_init_common( struct xfs_mount *mp, /* file system mount point */ struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ + struct xfs_perag *pag, xfs_btnum_t btnum) /* ialloc or free ino btree */ { struct xfs_btree_cur *cur; @@ -449,7 +449,9 @@ xfs_inobt_init_common( if (xfs_sb_version_hascrc(&mp->m_sb)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - cur->bc_ag.agno = agno; + /* take a reference for the cursor */ + atomic_inc(&pag->pag_ref); + cur->bc_ag.pag = pag; return cur; } @@ -459,13 +461,13 @@ 
xfs_inobt_init_cursor( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_btnum_t btnum) { struct xfs_btree_cur *cur; struct xfs_agi *agi = agbp->b_addr; - cur = xfs_inobt_init_common(mp, tp, agno, btnum); + cur = xfs_inobt_init_common(mp, tp, pag, btnum); if (btnum == XFS_BTNUM_INO) cur->bc_nlevels = be32_to_cpu(agi->agi_level); else @@ -479,12 +481,12 @@ struct xfs_btree_cur * xfs_inobt_stage_cursor( struct xfs_mount *mp, struct xbtree_afakeroot *afake, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_btnum_t btnum) { struct xfs_btree_cur *cur; - cur = xfs_inobt_init_common(mp, NULL, agno, btnum); + cur = xfs_inobt_init_common(mp, NULL, pag, btnum); xfs_btree_stage_afakeroot(cur, afake); return cur; } @@ -656,7 +658,7 @@ int xfs_inobt_cur( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_btnum_t which, struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp) @@ -667,11 +669,11 @@ xfs_inobt_cur( ASSERT(*agi_bpp == NULL); ASSERT(*curpp == NULL); - error = xfs_ialloc_read_agi(mp, tp, agno, agi_bpp); + error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, agi_bpp); if (error) return error; - cur = xfs_inobt_init_cursor(mp, tp, *agi_bpp, agno, which); + cur = xfs_inobt_init_cursor(mp, tp, *agi_bpp, pag, which); *curpp = cur; return 0; } @@ -680,7 +682,7 @@ static int xfs_inobt_count_blocks( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_btnum_t btnum, xfs_extlen_t *tree_blocks) { @@ -688,7 +690,7 @@ xfs_inobt_count_blocks( struct xfs_btree_cur *cur = NULL; int error; - error = xfs_inobt_cur(mp, tp, agno, btnum, &cur, &agbp); + error = xfs_inobt_cur(mp, tp, pag, btnum, &cur, &agbp); if (error) return error; @@ -704,14 +706,14 @@ static int xfs_finobt_read_blocks( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_extlen_t *tree_blocks) { struct xfs_buf *agbp; struct 
xfs_agi *agi; int error; - error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, &agbp); if (error) return error; @@ -728,7 +730,7 @@ int xfs_finobt_calc_reserves( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used) { @@ -739,14 +741,14 @@ xfs_finobt_calc_reserves( return 0; if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) - error = xfs_finobt_read_blocks(mp, tp, agno, &tree_len); + error = xfs_finobt_read_blocks(mp, tp, pag, &tree_len); else - error = xfs_inobt_count_blocks(mp, tp, agno, XFS_BTNUM_FINO, + error = xfs_inobt_count_blocks(mp, tp, pag, XFS_BTNUM_FINO, &tree_len); if (error) return error; - *ask += xfs_inobt_max_size(mp, agno); + *ask += xfs_inobt_max_size(mp, pag->pag_agno); *used += tree_len; return 0; } diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index 35bbd978c272..e530c82b2217 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -13,6 +13,7 @@ struct xfs_buf; struct xfs_btree_cur; struct xfs_mount; +struct xfs_perag; /* * Btree block header size depends on a superblock flag. 
@@ -45,11 +46,11 @@ struct xfs_mount; (maxrecs) * sizeof(xfs_inobt_key_t) + \ ((index) - 1) * sizeof(xfs_inobt_ptr_t))) -extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *, - struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t, - xfs_btnum_t); +extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *mp, + struct xfs_trans *tp, struct xfs_buf *agbp, + struct xfs_perag *pag, xfs_btnum_t btnum); struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_mount *mp, - struct xbtree_afakeroot *afake, xfs_agnumber_t agno, + struct xbtree_afakeroot *afake, struct xfs_perag *pag, xfs_btnum_t btnum); extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); @@ -64,11 +65,11 @@ int xfs_inobt_rec_check_count(struct xfs_mount *, #endif /* DEBUG */ int xfs_finobt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); + struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used); extern xfs_extlen_t xfs_iallocbt_calc_size(struct xfs_mount *mp, unsigned long long len); int xfs_inobt_cur(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, xfs_btnum_t btnum, + struct xfs_perag *pag, xfs_btnum_t btnum, struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp); void xfs_inobt_commit_staged_btree(struct xfs_btree_cur *cur, diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index f3254a4f4cb4..04ce361688f7 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -612,7 +612,7 @@ xfs_inode_validate_extsize( */ if (rt_flag) - blocksize_bytes = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog; + blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); else blocksize_bytes = mp->m_sb.sb_blocksize; diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 3e15ea29fb8d..d548ea4b6aab 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -34,9 +34,6 @@ typedef uint32_t xlog_tid_t; 
#define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */ #define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */ #define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */ -#define XLOG_BTOLSUNIT(log, b) (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \ - (log)->l_mp->m_sb.sb_logsunit) -#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit) #define XLOG_HEADER_SIZE 512 diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 2037b9f23069..860a0c9801ba 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -22,6 +22,7 @@ #include "xfs_bit.h" #include "xfs_refcount.h" #include "xfs_rmap.h" +#include "xfs_ag.h" /* Allowable refcount adjustment amounts. */ enum xfs_refc_adjust_op { @@ -46,7 +47,7 @@ xfs_refcount_lookup_le( xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno, XFS_LOOKUP_LE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; @@ -63,7 +64,7 @@ xfs_refcount_lookup_ge( xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno, XFS_LOOKUP_GE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; @@ -80,7 +81,7 @@ xfs_refcount_lookup_eq( xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.pag->pag_agno, bno, XFS_LOOKUP_LE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; @@ -108,7 +109,7 @@ xfs_refcount_get_rec( int *stat) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_ag.agno; + xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno; union xfs_btree_rec *rec; int error; xfs_agblock_t realstart; @@ -119,7 +120,7 @@ xfs_refcount_get_rec( xfs_refcount_btrec_to_irec(rec, irec); - agno = cur->bc_ag.agno; + agno = 
cur->bc_ag.pag->pag_agno; if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN) goto out_bad_rec; @@ -144,7 +145,7 @@ xfs_refcount_get_rec( if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT) goto out_bad_rec; - trace_xfs_refcount_get(cur->bc_mp, cur->bc_ag.agno, irec); + trace_xfs_refcount_get(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); return 0; out_bad_rec: @@ -169,14 +170,14 @@ xfs_refcount_update( union xfs_btree_rec rec; int error; - trace_xfs_refcount_update(cur->bc_mp, cur->bc_ag.agno, irec); + trace_xfs_refcount_update(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); rec.refc.rc_startblock = cpu_to_be32(irec->rc_startblock); rec.refc.rc_blockcount = cpu_to_be32(irec->rc_blockcount); rec.refc.rc_refcount = cpu_to_be32(irec->rc_refcount); error = xfs_btree_update(cur, &rec); if (error) trace_xfs_refcount_update_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -193,7 +194,7 @@ xfs_refcount_insert( { int error; - trace_xfs_refcount_insert(cur->bc_mp, cur->bc_ag.agno, irec); + trace_xfs_refcount_insert(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); cur->bc_rec.rc.rc_startblock = irec->rc_startblock; cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount; cur->bc_rec.rc.rc_refcount = irec->rc_refcount; @@ -208,7 +209,7 @@ xfs_refcount_insert( out_error: if (error) trace_xfs_refcount_insert_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -234,7 +235,7 @@ xfs_refcount_delete( error = -EFSCORRUPTED; goto out_error; } - trace_xfs_refcount_delete(cur->bc_mp, cur->bc_ag.agno, &irec); + trace_xfs_refcount_delete(cur->bc_mp, cur->bc_ag.pag->pag_agno, &irec); error = xfs_btree_delete(cur, i); if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) { error = -EFSCORRUPTED; @@ -246,7 +247,7 @@ xfs_refcount_delete( out_error: if (error) trace_xfs_refcount_delete_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + 
cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -366,7 +367,7 @@ xfs_refcount_split_extent( return 0; *shape_changed = true; - trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_ag.agno, + trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, &rcext, agbno); /* Establish the right extent. */ @@ -391,7 +392,7 @@ xfs_refcount_split_extent( out_error: trace_xfs_refcount_split_extent_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -411,7 +412,7 @@ xfs_refcount_merge_center_extents( int found_rec; trace_xfs_refcount_merge_center_extents(cur->bc_mp, - cur->bc_ag.agno, left, center, right); + cur->bc_ag.pag->pag_agno, left, center, right); /* * Make sure the center and right extents are not in the btree. @@ -468,7 +469,7 @@ xfs_refcount_merge_center_extents( out_error: trace_xfs_refcount_merge_center_extents_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -487,7 +488,7 @@ xfs_refcount_merge_left_extent( int found_rec; trace_xfs_refcount_merge_left_extent(cur->bc_mp, - cur->bc_ag.agno, left, cleft); + cur->bc_ag.pag->pag_agno, left, cleft); /* If the extent at agbno (cleft) wasn't synthesized, remove it. 
*/ if (cleft->rc_refcount > 1) { @@ -530,7 +531,7 @@ xfs_refcount_merge_left_extent( out_error: trace_xfs_refcount_merge_left_extent_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -548,7 +549,7 @@ xfs_refcount_merge_right_extent( int found_rec; trace_xfs_refcount_merge_right_extent(cur->bc_mp, - cur->bc_ag.agno, cright, right); + cur->bc_ag.pag->pag_agno, cright, right); /* * If the extent ending at agbno+aglen (cright) wasn't synthesized, @@ -594,7 +595,7 @@ xfs_refcount_merge_right_extent( out_error: trace_xfs_refcount_merge_right_extent_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -679,13 +680,13 @@ xfs_refcount_find_left_extents( cleft->rc_blockcount = aglen; cleft->rc_refcount = 1; } - trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_ag.agno, + trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, left, cleft, agbno); return error; out_error: trace_xfs_refcount_find_left_extent_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -768,13 +769,13 @@ xfs_refcount_find_right_extents( cright->rc_blockcount = aglen; cright->rc_refcount = 1; } - trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_ag.agno, + trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, cright, right, agbno + aglen); return error; out_error: trace_xfs_refcount_find_right_extent_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -952,7 +953,7 @@ xfs_refcount_adjust_extents( ext.rc_startblock - *agbno); tmp.rc_refcount = 1 + adj; trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_ag.agno, &tmp); + cur->bc_ag.pag->pag_agno, &tmp); /* * Either cover the hole (increment) or @@ -971,7 +972,7 @@ xfs_refcount_adjust_extents( cur->bc_ag.refc.nr_ops++; } else { fsbno 
= XFS_AGB_TO_FSB(cur->bc_mp, - cur->bc_ag.agno, + cur->bc_ag.pag->pag_agno, tmp.rc_startblock); xfs_bmap_add_free(cur->bc_tp, fsbno, tmp.rc_blockcount, oinfo); @@ -998,7 +999,7 @@ xfs_refcount_adjust_extents( goto skip; ext.rc_refcount += adj; trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_ag.agno, &ext); + cur->bc_ag.pag->pag_agno, &ext); if (ext.rc_refcount > 1) { error = xfs_refcount_update(cur, &ext); if (error) @@ -1016,7 +1017,7 @@ xfs_refcount_adjust_extents( goto advloop; } else { fsbno = XFS_AGB_TO_FSB(cur->bc_mp, - cur->bc_ag.agno, + cur->bc_ag.pag->pag_agno, ext.rc_startblock); xfs_bmap_add_free(cur->bc_tp, fsbno, ext.rc_blockcount, oinfo); @@ -1035,7 +1036,7 @@ advloop: return error; out_error: trace_xfs_refcount_modify_extent_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -1057,10 +1058,10 @@ xfs_refcount_adjust( *new_agbno = agbno; *new_aglen = aglen; if (adj == XFS_REFCOUNT_ADJUST_INCREASE) - trace_xfs_refcount_increase(cur->bc_mp, cur->bc_ag.agno, + trace_xfs_refcount_increase(cur->bc_mp, cur->bc_ag.pag->pag_agno, agbno, aglen); else - trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_ag.agno, + trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_ag.pag->pag_agno, agbno, aglen); /* @@ -1099,7 +1100,7 @@ xfs_refcount_adjust( return 0; out_error: - trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_ag.agno, + trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -1142,30 +1143,30 @@ xfs_refcount_finish_one( struct xfs_btree_cur *rcur; struct xfs_buf *agbp = NULL; int error = 0; - xfs_agnumber_t agno; xfs_agblock_t bno; xfs_agblock_t new_agbno; unsigned long nr_ops = 0; int shape_changes = 0; + struct xfs_perag *pag; - agno = XFS_FSB_TO_AGNO(mp, startblock); - ASSERT(agno != NULLAGNUMBER); + pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock)); bno = XFS_FSB_TO_AGBNO(mp, startblock); trace_xfs_refcount_deferred(mp, 
XFS_FSB_TO_AGNO(mp, startblock), type, XFS_FSB_TO_AGBNO(mp, startblock), blockcount); - if (XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_REFCOUNT_FINISH_ONE)) - return -EIO; + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) { + error = -EIO; + goto out_drop; + } /* * If we haven't gotten a cursor or the cursor AG doesn't match * the startblock, get one now. */ rcur = *pcur; - if (rcur != NULL && rcur->bc_ag.agno != agno) { + if (rcur != NULL && rcur->bc_ag.pag != pag) { nr_ops = rcur->bc_ag.refc.nr_ops; shape_changes = rcur->bc_ag.refc.shape_changes; xfs_refcount_finish_one_cleanup(tp, rcur, 0); @@ -1173,12 +1174,12 @@ xfs_refcount_finish_one( *pcur = NULL; } if (rcur == NULL) { - error = xfs_alloc_read_agf(tp->t_mountp, tp, agno, + error = xfs_alloc_read_agf(tp->t_mountp, tp, pag->pag_agno, XFS_ALLOC_FLAG_FREEING, &agbp); if (error) - return error; + goto out_drop; - rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno); + rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); rcur->bc_ag.refc.nr_ops = nr_ops; rcur->bc_ag.refc.shape_changes = shape_changes; } @@ -1188,12 +1189,12 @@ xfs_refcount_finish_one( case XFS_REFCOUNT_INCREASE: error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, new_len, XFS_REFCOUNT_ADJUST_INCREASE, NULL); - *new_fsb = XFS_AGB_TO_FSB(mp, agno, new_agbno); + *new_fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); break; case XFS_REFCOUNT_DECREASE: error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, new_len, XFS_REFCOUNT_ADJUST_DECREASE, NULL); - *new_fsb = XFS_AGB_TO_FSB(mp, agno, new_agbno); + *new_fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); break; case XFS_REFCOUNT_ALLOC_COW: *new_fsb = startblock + blockcount; @@ -1210,8 +1211,10 @@ xfs_refcount_finish_one( error = -EFSCORRUPTED; } if (!error && *new_len > 0) - trace_xfs_refcount_finish_one_leftover(mp, agno, type, + trace_xfs_refcount_finish_one_leftover(mp, pag->pag_agno, type, bno, blockcount, new_agbno, *new_len); +out_drop: + 
xfs_perag_put(pag); return error; } @@ -1294,7 +1297,7 @@ xfs_refcount_find_shared( int have; int error; - trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_ag.agno, + trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_ag.pag->pag_agno, agbno, aglen); /* By default, skip the whole range */ @@ -1374,12 +1377,12 @@ xfs_refcount_find_shared( done: trace_xfs_refcount_find_shared_result(cur->bc_mp, - cur->bc_ag.agno, *fbno, *flen); + cur->bc_ag.pag->pag_agno, *fbno, *flen); out_error: if (error) trace_xfs_refcount_find_shared_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -1476,7 +1479,7 @@ xfs_refcount_adjust_cow_extents( tmp.rc_blockcount = aglen; tmp.rc_refcount = 1; trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_ag.agno, &tmp); + cur->bc_ag.pag->pag_agno, &tmp); error = xfs_refcount_insert(cur, &tmp, &found_tmp); @@ -1504,7 +1507,7 @@ xfs_refcount_adjust_cow_extents( ext.rc_refcount = 0; trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_ag.agno, &ext); + cur->bc_ag.pag->pag_agno, &ext); error = xfs_refcount_delete(cur, &found_rec); if (error) goto out_error; @@ -1520,7 +1523,7 @@ xfs_refcount_adjust_cow_extents( return error; out_error: trace_xfs_refcount_modify_extent_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -1566,7 +1569,7 @@ xfs_refcount_adjust_cow( return 0; out_error: - trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_ag.agno, + trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -1580,7 +1583,7 @@ __xfs_refcount_cow_alloc( xfs_agblock_t agbno, xfs_extlen_t aglen) { - trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_ag.agno, + trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno, aglen); /* Add refcount btree reservation */ @@ -1597,7 +1600,7 @@ __xfs_refcount_cow_free( xfs_agblock_t agbno, xfs_extlen_t 
aglen) { - trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_ag.agno, + trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno, aglen); /* Remove refcount btree reservation */ @@ -1672,7 +1675,7 @@ xfs_refcount_recover_extent( int xfs_refcount_recover_cow_leftovers( struct xfs_mount *mp, - xfs_agnumber_t agno) + struct xfs_perag *pag) { struct xfs_trans *tp; struct xfs_btree_cur *cur; @@ -1704,10 +1707,10 @@ xfs_refcount_recover_cow_leftovers( if (error) return error; - error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); + error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp); if (error) goto out_trans; - cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno); + cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); /* Find all the leftover CoW staging extents. */ memset(&low, 0, sizeof(low)); @@ -1729,11 +1732,12 @@ xfs_refcount_recover_cow_leftovers( if (error) goto out_free; - trace_xfs_refcount_recover_extent(mp, agno, &rr->rr_rrec); + trace_xfs_refcount_recover_extent(mp, pag->pag_agno, + &rr->rr_rrec); /* Free the orphan record */ agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START; - fsb = XFS_AGB_TO_FSB(mp, agno, agbno); + fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno); xfs_refcount_free_cow_extent(tp, fsb, rr->rr_rrec.rc_blockcount); diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 209795539c8d..9f6e9aae4da0 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -6,6 +6,13 @@ #ifndef __XFS_REFCOUNT_H__ #define __XFS_REFCOUNT_H__ +struct xfs_trans; +struct xfs_mount; +struct xfs_perag; +struct xfs_btree_cur; +struct xfs_bmbt_irec; +struct xfs_refcount_irec; + extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno, int *stat); extern int xfs_refcount_lookup_ge(struct xfs_btree_cur *cur, @@ -50,7 +57,7 @@ void xfs_refcount_alloc_cow_extent(struct xfs_trans *tp, xfs_fsblock_t fsb, void xfs_refcount_free_cow_extent(struct xfs_trans *tp, xfs_fsblock_t 
fsb, xfs_extlen_t len); extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, - xfs_agnumber_t agno); + struct xfs_perag *pag); /* * While we're adjusting the refcounts records of an extent, we have diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index a6ac60ae9421..92d336c17e83 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -9,7 +9,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_btree.h" #include "xfs_btree_staging.h" @@ -20,13 +19,14 @@ #include "xfs_trans.h" #include "xfs_bit.h" #include "xfs_rmap.h" +#include "xfs_ag.h" static struct xfs_btree_cur * xfs_refcountbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_refcountbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_ag.agbp, cur->bc_ag.agno); + cur->bc_ag.agbp, cur->bc_ag.pag); } STATIC void @@ -65,7 +65,7 @@ xfs_refcountbt_alloc_block( args.tp = cur->bc_tp; args.mp = cur->bc_mp; args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.agno, + args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, xfs_refc_block(args.mp)); args.oinfo = XFS_RMAP_OINFO_REFC; args.minlen = args.maxlen = args.prod = 1; @@ -74,13 +74,13 @@ xfs_refcountbt_alloc_block( error = xfs_alloc_vextent(&args); if (error) goto out_error; - trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_ag.agno, + trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_ag.pag->pag_agno, args.agbno, 1); if (args.fsbno == NULLFSBLOCK) { *stat = 0; return 0; } - ASSERT(args.agno == cur->bc_ag.agno); + ASSERT(args.agno == cur->bc_ag.pag->pag_agno); ASSERT(args.len == 1); new->s = cpu_to_be32(args.agbno); @@ -105,7 +105,7 @@ xfs_refcountbt_free_block( xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); int error; - trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.agno, + 
trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.pag->pag_agno, XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1); be32_add_cpu(&agf->agf_refcount_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); @@ -170,7 +170,7 @@ xfs_refcountbt_init_ptr_from_cur( { struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno)); ptr->s = agf->agf_refcount_root; } @@ -316,12 +316,11 @@ static struct xfs_btree_cur * xfs_refcountbt_init_common( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno) + struct xfs_perag *pag) { struct xfs_btree_cur *cur; - ASSERT(agno != NULLAGNUMBER); - ASSERT(agno < mp->m_sb.sb_agcount); + ASSERT(pag->pag_agno < mp->m_sb.sb_agcount); cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL); cur->bc_tp = tp; @@ -330,9 +329,12 @@ xfs_refcountbt_init_common( cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2); - cur->bc_ag.agno = agno; cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; + /* take a reference for the cursor */ + atomic_inc(&pag->pag_ref); + cur->bc_ag.pag = pag; + cur->bc_ag.refc.nr_ops = 0; cur->bc_ag.refc.shape_changes = 0; cur->bc_ops = &xfs_refcountbt_ops; @@ -345,12 +347,12 @@ xfs_refcountbt_init_cursor( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno) + struct xfs_perag *pag) { struct xfs_agf *agf = agbp->b_addr; struct xfs_btree_cur *cur; - cur = xfs_refcountbt_init_common(mp, tp, agno); + cur = xfs_refcountbt_init_common(mp, tp, pag); cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); cur->bc_ag.agbp = agbp; return cur; @@ -361,11 +363,11 @@ struct xfs_btree_cur * xfs_refcountbt_stage_cursor( struct xfs_mount *mp, struct xbtree_afakeroot *afake, - xfs_agnumber_t agno) + struct xfs_perag *pag) { struct xfs_btree_cur *cur; - cur = xfs_refcountbt_init_common(mp, NULL, agno); + cur = 
xfs_refcountbt_init_common(mp, NULL, pag); xfs_btree_stage_afakeroot(cur, afake); return cur; } @@ -450,7 +452,7 @@ int xfs_refcountbt_calc_reserves( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used) { @@ -463,8 +465,7 @@ xfs_refcountbt_calc_reserves( if (!xfs_sb_version_hasreflink(&mp->m_sb)) return 0; - - error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); + error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp); if (error) return error; @@ -479,7 +480,7 @@ xfs_refcountbt_calc_reserves( * expansion. We therefore can pretend the space isn't there. */ if (mp->m_sb.sb_logstart && - XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno) + XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == pag->pag_agno) agblocks -= mp->m_sb.sb_logblocks; *ask += xfs_refcountbt_max_size(mp, agblocks); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h index 69dc515db671..bd9ed9e1e41f 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.h +++ b/fs/xfs/libxfs/xfs_refcount_btree.h @@ -13,6 +13,7 @@ struct xfs_buf; struct xfs_btree_cur; struct xfs_mount; +struct xfs_perag; struct xbtree_afakeroot; /* @@ -46,9 +47,9 @@ struct xbtree_afakeroot; extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno); + struct xfs_perag *pag); struct xfs_btree_cur *xfs_refcountbt_stage_cursor(struct xfs_mount *mp, - struct xbtree_afakeroot *afake, xfs_agnumber_t agno); + struct xbtree_afakeroot *afake, struct xfs_perag *pag); extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf); extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp); @@ -58,7 +59,7 @@ extern xfs_extlen_t xfs_refcountbt_max_size(struct xfs_mount *mp, xfs_agblock_t agblocks); extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp, - struct xfs_trans *tp, xfs_agnumber_t agno, xfs_extlen_t *ask, + struct xfs_trans *tp, struct 
xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used); void xfs_refcountbt_commit_staged_btree(struct xfs_btree_cur *cur, diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 10e0cf9949a2..d1dfad0204e3 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -11,6 +11,7 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" +#include "xfs_sb.h" #include "xfs_defer.h" #include "xfs_btree.h" #include "xfs_trans.h" @@ -21,6 +22,7 @@ #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_inode.h" +#include "xfs_ag.h" /* * Lookup the first record less than or equal to [bno, len, owner, offset] @@ -79,7 +81,7 @@ xfs_rmap_update( union xfs_btree_rec rec; int error; - trace_xfs_rmap_update(cur->bc_mp, cur->bc_ag.agno, + trace_xfs_rmap_update(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec->rm_startblock, irec->rm_blockcount, irec->rm_owner, irec->rm_offset, irec->rm_flags); @@ -91,7 +93,7 @@ xfs_rmap_update( error = xfs_btree_update(cur, &rec); if (error) trace_xfs_rmap_update_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -107,7 +109,7 @@ xfs_rmap_insert( int i; int error; - trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_ag.agno, agbno, + trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno, len, owner, offset, flags); error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i); @@ -133,7 +135,7 @@ xfs_rmap_insert( done: if (error) trace_xfs_rmap_insert_error(rcur->bc_mp, - rcur->bc_ag.agno, error, _RET_IP_); + rcur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -149,7 +151,7 @@ xfs_rmap_delete( int i; int error; - trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_ag.agno, agbno, + trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_ag.pag->pag_agno, agbno, len, owner, offset, flags); error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i); @@ -170,7 +172,7 @@ xfs_rmap_delete( done: if (error) 
trace_xfs_rmap_delete_error(rcur->bc_mp, - rcur->bc_ag.agno, error, _RET_IP_); + rcur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -197,7 +199,7 @@ xfs_rmap_get_rec( int *stat) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_ag.agno; + xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno; union xfs_btree_rec *rec; int error; @@ -260,7 +262,7 @@ xfs_rmap_find_left_neighbor_helper( struct xfs_find_left_neighbor_info *info = priv; trace_xfs_rmap_find_left_neighbor_candidate(cur->bc_mp, - cur->bc_ag.agno, rec->rm_startblock, + cur->bc_ag.pag->pag_agno, rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, rec->rm_offset, rec->rm_flags); @@ -312,7 +314,7 @@ xfs_rmap_find_left_neighbor( info.stat = stat; trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp, - cur->bc_ag.agno, bno, 0, owner, offset, flags); + cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags); error = xfs_rmap_query_range(cur, &info.high, &info.high, xfs_rmap_find_left_neighbor_helper, &info); @@ -320,7 +322,7 @@ xfs_rmap_find_left_neighbor( error = 0; if (*stat) trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, - cur->bc_ag.agno, irec->rm_startblock, + cur->bc_ag.pag->pag_agno, irec->rm_startblock, irec->rm_blockcount, irec->rm_owner, irec->rm_offset, irec->rm_flags); return error; @@ -336,7 +338,7 @@ xfs_rmap_lookup_le_range_helper( struct xfs_find_left_neighbor_info *info = priv; trace_xfs_rmap_lookup_le_range_candidate(cur->bc_mp, - cur->bc_ag.agno, rec->rm_startblock, + cur->bc_ag.pag->pag_agno, rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, rec->rm_offset, rec->rm_flags); @@ -385,14 +387,14 @@ xfs_rmap_lookup_le_range( info.stat = stat; trace_xfs_rmap_lookup_le_range(cur->bc_mp, - cur->bc_ag.agno, bno, 0, owner, offset, flags); + cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags); error = xfs_rmap_query_range(cur, &info.high, &info.high, xfs_rmap_lookup_le_range_helper, &info); if (error == -ECANCELED) error = 0; if (*stat) 
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_ag.agno, irec->rm_startblock, + cur->bc_ag.pag->pag_agno, irec->rm_startblock, irec->rm_blockcount, irec->rm_owner, irec->rm_offset, irec->rm_flags); return error; @@ -498,7 +500,7 @@ xfs_rmap_unmap( (flags & XFS_RMAP_BMBT_BLOCK); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_unmap(mp, cur->bc_ag.agno, bno, len, + trace_xfs_rmap_unmap(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); /* @@ -522,7 +524,7 @@ xfs_rmap_unmap( goto out_error; } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_ag.agno, ltrec.rm_startblock, + cur->bc_ag.pag->pag_agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset, ltrec.rm_flags); ltoff = ltrec.rm_offset; @@ -588,7 +590,7 @@ xfs_rmap_unmap( if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) { /* exact match, simply remove the record from rmap tree */ - trace_xfs_rmap_delete(mp, cur->bc_ag.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset, ltrec.rm_flags); @@ -666,7 +668,7 @@ xfs_rmap_unmap( else cur->bc_rec.r.rm_offset = offset + len; cur->bc_rec.r.rm_flags = flags; - trace_xfs_rmap_insert(mp, cur->bc_ag.agno, + trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, cur->bc_rec.r.rm_startblock, cur->bc_rec.r.rm_blockcount, cur->bc_rec.r.rm_owner, @@ -678,11 +680,11 @@ xfs_rmap_unmap( } out_done: - trace_xfs_rmap_unmap_done(mp, cur->bc_ag.agno, bno, len, + trace_xfs_rmap_unmap_done(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); out_error: if (error) - trace_xfs_rmap_unmap_error(mp, cur->bc_ag.agno, + trace_xfs_rmap_unmap_error(mp, cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -694,7 +696,7 @@ int xfs_rmap_free( struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len, const struct xfs_owner_info *oinfo) @@ -706,7 +708,7 @@ 
xfs_rmap_free( if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) return 0; - cur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno); + cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); error = xfs_rmap_unmap(cur, bno, len, false, oinfo); @@ -773,7 +775,7 @@ xfs_rmap_map( (flags & XFS_RMAP_BMBT_BLOCK); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_map(mp, cur->bc_ag.agno, bno, len, + trace_xfs_rmap_map(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); ASSERT(!xfs_rmap_should_skip_owner_update(oinfo)); @@ -795,7 +797,7 @@ xfs_rmap_map( goto out_error; } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_ag.agno, ltrec.rm_startblock, + cur->bc_ag.pag->pag_agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset, ltrec.rm_flags); @@ -831,7 +833,7 @@ xfs_rmap_map( goto out_error; } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_ag.agno, gtrec.rm_startblock, + cur->bc_ag.pag->pag_agno, gtrec.rm_startblock, gtrec.rm_blockcount, gtrec.rm_owner, gtrec.rm_offset, gtrec.rm_flags); if (!xfs_rmap_is_mergeable(&gtrec, owner, flags)) @@ -870,7 +872,7 @@ xfs_rmap_map( * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr| */ ltrec.rm_blockcount += gtrec.rm_blockcount; - trace_xfs_rmap_delete(mp, cur->bc_ag.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, gtrec.rm_startblock, gtrec.rm_blockcount, gtrec.rm_owner, @@ -921,7 +923,7 @@ xfs_rmap_map( cur->bc_rec.r.rm_owner = owner; cur->bc_rec.r.rm_offset = offset; cur->bc_rec.r.rm_flags = flags; - trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, len, + trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len, owner, offset, flags); error = xfs_btree_insert(cur, &i); if (error) @@ -932,11 +934,11 @@ xfs_rmap_map( } } - trace_xfs_rmap_map_done(mp, cur->bc_ag.agno, bno, len, + trace_xfs_rmap_map_done(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); out_error: if (error) - trace_xfs_rmap_map_error(mp, cur->bc_ag.agno, + trace_xfs_rmap_map_error(mp, cur->bc_ag.pag->pag_agno,
error, _RET_IP_); return error; } @@ -948,7 +950,7 @@ int xfs_rmap_alloc( struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len, const struct xfs_owner_info *oinfo) @@ -960,7 +962,7 @@ xfs_rmap_alloc( if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) return 0; - cur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno); + cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); error = xfs_rmap_map(cur, bno, len, false, oinfo); xfs_btree_del_cursor(cur, error); @@ -1010,7 +1012,7 @@ xfs_rmap_convert( (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))); oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0; new_endoff = offset + len; - trace_xfs_rmap_convert(mp, cur->bc_ag.agno, bno, len, + trace_xfs_rmap_convert(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); /* @@ -1034,7 +1036,7 @@ xfs_rmap_convert( goto done; } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_ag.agno, PREV.rm_startblock, + cur->bc_ag.pag->pag_agno, PREV.rm_startblock, PREV.rm_blockcount, PREV.rm_owner, PREV.rm_offset, PREV.rm_flags); @@ -1076,7 +1078,7 @@ xfs_rmap_convert( goto done; } trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, - cur->bc_ag.agno, LEFT.rm_startblock, + cur->bc_ag.pag->pag_agno, LEFT.rm_startblock, LEFT.rm_blockcount, LEFT.rm_owner, LEFT.rm_offset, LEFT.rm_flags); if (LEFT.rm_startblock + LEFT.rm_blockcount == bno && @@ -1114,7 +1116,7 @@ xfs_rmap_convert( goto done; } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_ag.agno, RIGHT.rm_startblock, + cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, RIGHT.rm_flags); if (bno + len == RIGHT.rm_startblock && @@ -1132,7 +1134,7 @@ xfs_rmap_convert( RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX) state &= ~RMAP_RIGHT_CONTIG; - trace_xfs_rmap_convert_state(mp, cur->bc_ag.agno, state, + trace_xfs_rmap_convert_state(mp, cur->bc_ag.pag->pag_agno, state, _RET_IP_); /* reset the cursor back to PREV */ @@ -1162,7 
+1164,7 @@ xfs_rmap_convert( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_delete(mp, cur->bc_ag.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, RIGHT.rm_flags); @@ -1180,7 +1182,7 @@ xfs_rmap_convert( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_delete(mp, cur->bc_ag.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, PREV.rm_startblock, PREV.rm_blockcount, PREV.rm_owner, PREV.rm_offset, PREV.rm_flags); @@ -1210,7 +1212,7 @@ xfs_rmap_convert( * Setting all of a previous oldext extent to newext. * The left neighbor is contiguous, the right is not. */ - trace_xfs_rmap_delete(mp, cur->bc_ag.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, PREV.rm_startblock, PREV.rm_blockcount, PREV.rm_owner, PREV.rm_offset, PREV.rm_flags); @@ -1247,7 +1249,7 @@ xfs_rmap_convert( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_delete(mp, cur->bc_ag.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, RIGHT.rm_flags); @@ -1326,7 +1328,7 @@ xfs_rmap_convert( NEW.rm_blockcount = len; NEW.rm_flags = newext; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, + trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) @@ -1383,7 +1385,7 @@ xfs_rmap_convert( NEW.rm_blockcount = len; NEW.rm_flags = newext; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, + trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) @@ -1414,7 +1416,7 @@ xfs_rmap_convert( NEW = PREV; NEW.rm_blockcount = offset - PREV.rm_offset; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_ag.agno, + trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, NEW.rm_startblock, NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset, 
NEW.rm_flags); @@ -1441,7 +1443,7 @@ xfs_rmap_convert( /* new middle extent - newext */ cur->bc_rec.r.rm_flags &= ~XFS_RMAP_UNWRITTEN; cur->bc_rec.r.rm_flags |= newext; - trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, len, + trace_xfs_rmap_insert(mp, cur->bc_ag.pag->pag_agno, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) @@ -1465,12 +1467,12 @@ xfs_rmap_convert( ASSERT(0); } - trace_xfs_rmap_convert_done(mp, cur->bc_ag.agno, bno, len, + trace_xfs_rmap_convert_done(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); done: if (error) trace_xfs_rmap_convert_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -1506,7 +1508,7 @@ xfs_rmap_convert_shared( (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))); oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0; new_endoff = offset + len; - trace_xfs_rmap_convert(mp, cur->bc_ag.agno, bno, len, + trace_xfs_rmap_convert(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); /* @@ -1573,7 +1575,7 @@ xfs_rmap_convert_shared( goto done; } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_ag.agno, RIGHT.rm_startblock, + cur->bc_ag.pag->pag_agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, RIGHT.rm_flags); if (xfs_rmap_is_mergeable(&RIGHT, owner, newext)) @@ -1589,7 +1591,7 @@ xfs_rmap_convert_shared( RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX) state &= ~RMAP_RIGHT_CONTIG; - trace_xfs_rmap_convert_state(mp, cur->bc_ag.agno, state, + trace_xfs_rmap_convert_state(mp, cur->bc_ag.pag->pag_agno, state, _RET_IP_); /* * Switch out based on the FILLING and CONTIG state bits. 
@@ -1880,12 +1882,12 @@ xfs_rmap_convert_shared( ASSERT(0); } - trace_xfs_rmap_convert_done(mp, cur->bc_ag.agno, bno, len, + trace_xfs_rmap_convert_done(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); done: if (error) trace_xfs_rmap_convert_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -1923,7 +1925,7 @@ xfs_rmap_unmap_shared( xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_unmap(mp, cur->bc_ag.agno, bno, len, + trace_xfs_rmap_unmap(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); /* @@ -2072,12 +2074,12 @@ xfs_rmap_unmap_shared( goto out_error; } - trace_xfs_rmap_unmap_done(mp, cur->bc_ag.agno, bno, len, + trace_xfs_rmap_unmap_done(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); out_error: if (error) trace_xfs_rmap_unmap_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -2112,7 +2114,7 @@ xfs_rmap_map_shared( xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_map(mp, cur->bc_ag.agno, bno, len, + trace_xfs_rmap_map(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); /* Is there a left record that abuts our range? 
*/ @@ -2138,7 +2140,7 @@ xfs_rmap_map_shared( goto out_error; } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_ag.agno, gtrec.rm_startblock, + cur->bc_ag.pag->pag_agno, gtrec.rm_startblock, gtrec.rm_blockcount, gtrec.rm_owner, gtrec.rm_offset, gtrec.rm_flags); @@ -2231,12 +2233,12 @@ xfs_rmap_map_shared( goto out_error; } - trace_xfs_rmap_map_done(mp, cur->bc_ag.agno, bno, len, + trace_xfs_rmap_map_done(mp, cur->bc_ag.pag->pag_agno, bno, len, unwritten, oinfo); out_error: if (error) trace_xfs_rmap_map_error(cur->bc_mp, - cur->bc_ag.agno, error, _RET_IP_); + cur->bc_ag.pag->pag_agno, error, _RET_IP_); return error; } @@ -2362,31 +2364,32 @@ xfs_rmap_finish_one( struct xfs_btree_cur **pcur) { struct xfs_mount *mp = tp->t_mountp; + struct xfs_perag *pag; struct xfs_btree_cur *rcur; struct xfs_buf *agbp = NULL; int error = 0; - xfs_agnumber_t agno; struct xfs_owner_info oinfo; xfs_agblock_t bno; bool unwritten; - agno = XFS_FSB_TO_AGNO(mp, startblock); - ASSERT(agno != NULLAGNUMBER); + pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, startblock)); bno = XFS_FSB_TO_AGBNO(mp, startblock); - trace_xfs_rmap_deferred(mp, agno, type, bno, owner, whichfork, + trace_xfs_rmap_deferred(mp, pag->pag_agno, type, bno, owner, whichfork, startoff, blockcount, state); - if (XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_RMAP_FINISH_ONE)) - return -EIO; + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) { + error = -EIO; + goto out_drop; + } + /* * If we haven't gotten a cursor or the cursor AG doesn't match * the startblock, get one now. */ rcur = *pcur; - if (rcur != NULL && rcur->bc_ag.agno != agno) { + if (rcur != NULL && rcur->bc_ag.pag != pag) { xfs_rmap_finish_one_cleanup(tp, rcur, 0); rcur = NULL; *pcur = NULL; @@ -2397,13 +2400,15 @@ xfs_rmap_finish_one( * rmapbt, because a shape change could cause us to * allocate blocks. 
*/ - error = xfs_free_extent_fix_freelist(tp, agno, &agbp); + error = xfs_free_extent_fix_freelist(tp, pag, &agbp); if (error) - return error; - if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) - return -EFSCORRUPTED; + goto out_drop; + if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) { + error = -EFSCORRUPTED; + goto out_drop; + } - rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, agno); + rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); } *pcur = rcur; @@ -2441,6 +2446,8 @@ xfs_rmap_finish_one( ASSERT(0); error = -EFSCORRUPTED; } +out_drop: + xfs_perag_put(pag); return error; } diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index abe633403fd1..f2423cf7f1e2 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -6,6 +6,8 @@ #ifndef __XFS_RMAP_H__ #define __XFS_RMAP_H__ +struct xfs_perag; + static inline void xfs_rmap_ino_bmbt_owner( struct xfs_owner_info *oi, @@ -113,10 +115,10 @@ xfs_owner_info_pack( } int xfs_rmap_alloc(struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, + struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len, const struct xfs_owner_info *oinfo); int xfs_rmap_free(struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, + struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len, const struct xfs_owner_info *oinfo); int xfs_rmap_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno, diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 9f5bcbd834c3..f29bc71b9950 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -9,7 +9,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_trans.h" #include "xfs_alloc.h" @@ -20,6 +19,7 @@ #include "xfs_trace.h" #include "xfs_error.h" #include "xfs_extent_busy.h" +#include "xfs_ag.h" #include "xfs_ag_resv.h" /* @@ -52,7 +52,7 @@ 
xfs_rmapbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_ag.agbp, cur->bc_ag.agno); + cur->bc_ag.agbp, cur->bc_ag.pag); } STATIC void @@ -64,13 +64,12 @@ xfs_rmapbt_set_root( struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; int btnum = cur->bc_btnum; - struct xfs_perag *pag = agbp->b_pag; ASSERT(ptr->s != 0); agf->agf_roots[btnum] = ptr->s; be32_add_cpu(&agf->agf_levels[btnum], inc); - pag->pagf_levels[btnum] += inc; + cur->bc_ag.pag->pagf_levels[btnum] += inc; xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); } @@ -84,6 +83,7 @@ xfs_rmapbt_alloc_block( { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; + struct xfs_perag *pag = cur->bc_ag.pag; int error; xfs_agblock_t bno; @@ -93,21 +93,19 @@ xfs_rmapbt_alloc_block( if (error) return error; - trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_ag.agno, - bno, 1); + trace_xfs_rmapbt_alloc_block(cur->bc_mp, pag->pag_agno, bno, 1); if (bno == NULLAGBLOCK) { *stat = 0; return 0; } - xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1, - false); + xfs_extent_busy_reuse(cur->bc_mp, pag, bno, 1, false); new->s = cpu_to_be32(bno); be32_add_cpu(&agf->agf_rmap_blocks, 1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); - xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_ag.agno); + xfs_ag_resv_rmapbt_alloc(cur->bc_mp, pag->pag_agno); *stat = 1; return 0; @@ -120,12 +118,12 @@ xfs_rmapbt_free_block( { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; - struct xfs_perag *pag; + struct xfs_perag *pag = cur->bc_ag.pag; xfs_agblock_t bno; int error; bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); - trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_ag.agno, + trace_xfs_rmapbt_free_block(cur->bc_mp, pag->pag_agno, bno, 1); be32_add_cpu(&agf->agf_rmap_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); @@ -133,10 +131,9 @@ 
xfs_rmapbt_free_block( if (error) return error; - xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, + xfs_extent_busy_insert(cur->bc_tp, pag, bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); - pag = cur->bc_ag.agbp->b_pag; xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1); return 0; } @@ -215,7 +212,7 @@ xfs_rmapbt_init_ptr_from_cur( { struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_ag.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno)); ptr->s = agf->agf_roots[cur->bc_btnum]; } @@ -450,7 +447,7 @@ static struct xfs_btree_cur * xfs_rmapbt_init_common( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno) + struct xfs_perag *pag) { struct xfs_btree_cur *cur; @@ -462,9 +459,12 @@ xfs_rmapbt_init_common( cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING; cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); - cur->bc_ag.agno = agno; cur->bc_ops = &xfs_rmapbt_ops; + /* take a reference for the cursor */ + atomic_inc(&pag->pag_ref); + cur->bc_ag.pag = pag; + return cur; } @@ -474,12 +474,12 @@ xfs_rmapbt_init_cursor( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, - xfs_agnumber_t agno) + struct xfs_perag *pag) { struct xfs_agf *agf = agbp->b_addr; struct xfs_btree_cur *cur; - cur = xfs_rmapbt_init_common(mp, tp, agno); + cur = xfs_rmapbt_init_common(mp, tp, pag); cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); cur->bc_ag.agbp = agbp; return cur; @@ -490,11 +490,11 @@ struct xfs_btree_cur * xfs_rmapbt_stage_cursor( struct xfs_mount *mp, struct xbtree_afakeroot *afake, - xfs_agnumber_t agno) + struct xfs_perag *pag) { struct xfs_btree_cur *cur; - cur = xfs_rmapbt_init_common(mp, NULL, agno); + cur = xfs_rmapbt_init_common(mp, NULL, pag); xfs_btree_stage_afakeroot(cur, afake); return cur; } @@ -596,7 +596,7 @@ int xfs_rmapbt_calc_reserves( struct xfs_mount *mp, struct xfs_trans *tp, - 
xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used) { @@ -609,7 +609,7 @@ xfs_rmapbt_calc_reserves( if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) return 0; - error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); + error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp); if (error) return error; @@ -624,7 +624,7 @@ xfs_rmapbt_calc_reserves( * expansion. We therefore can pretend the space isn't there. */ if (mp->m_sb.sb_logstart && - XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno) + XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == pag->pag_agno) agblocks -= mp->m_sb.sb_logblocks; /* Reserve 1% of the AG or enough for 1 block per record. */ diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index 115c3455a734..88d8d18788a2 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -43,9 +43,9 @@ struct xbtree_afakeroot; struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *bp, - xfs_agnumber_t agno); + struct xfs_perag *pag); struct xfs_btree_cur *xfs_rmapbt_stage_cursor(struct xfs_mount *mp, - struct xbtree_afakeroot *afake, xfs_agnumber_t agno); + struct xbtree_afakeroot *afake, struct xfs_perag *pag); void xfs_rmapbt_commit_staged_btree(struct xfs_btree_cur *cur, struct xfs_trans *tp, struct xfs_buf *agbp); int xfs_rmapbt_maxrecs(int blocklen, int leaf); @@ -57,6 +57,6 @@ extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp, xfs_agblock_t agblocks); extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); + struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used); #endif /* __XFS_RMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index dfbbcbd448c1..04f5386446db 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -15,7 +15,6 @@ #include "xfs_ialloc.h" #include "xfs_alloc.h" #include 
"xfs_error.h" -#include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_bmap_btree.h" @@ -25,72 +24,12 @@ #include "xfs_refcount_btree.h" #include "xfs_da_format.h" #include "xfs_health.h" +#include "xfs_ag.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. */ -/* - * Reference counting access wrappers to the perag structures. - * Because we never free per-ag structures, the only thing we - * have to protect against changes is the tree structure itself. - */ -struct xfs_perag * -xfs_perag_get( - struct xfs_mount *mp, - xfs_agnumber_t agno) -{ - struct xfs_perag *pag; - int ref = 0; - - rcu_read_lock(); - pag = radix_tree_lookup(&mp->m_perag_tree, agno); - if (pag) { - ASSERT(atomic_read(&pag->pag_ref) >= 0); - ref = atomic_inc_return(&pag->pag_ref); - } - rcu_read_unlock(); - trace_xfs_perag_get(mp, agno, ref, _RET_IP_); - return pag; -} - -/* - * search from @first to find the next perag with the given tag set. - */ -struct xfs_perag * -xfs_perag_get_tag( - struct xfs_mount *mp, - xfs_agnumber_t first, - int tag) -{ - struct xfs_perag *pag; - int found; - int ref; - - rcu_read_lock(); - found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, - (void **)&pag, first, 1, tag); - if (found <= 0) { - rcu_read_unlock(); - return NULL; - } - ref = atomic_inc_return(&pag->pag_ref); - rcu_read_unlock(); - trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_); - return pag; -} - -void -xfs_perag_put( - struct xfs_perag *pag) -{ - int ref; - - ASSERT(atomic_read(&pag->pag_ref) > 0); - ref = atomic_dec_return(&pag->pag_ref); - trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_); -} - /* Check all the superblock fields we care about when reading one in. 
*/ STATIC int xfs_validate_sb_read( @@ -842,78 +781,6 @@ xfs_sb_mount_common( } /* - * xfs_initialize_perag_data - * - * Read in each per-ag structure so we can count up the number of - * allocated inodes, free inodes and used filesystem blocks as this - * information is no longer persistent in the superblock. Once we have - * this information, write it into the in-core superblock structure. - */ -int -xfs_initialize_perag_data( - struct xfs_mount *mp, - xfs_agnumber_t agcount) -{ - xfs_agnumber_t index; - xfs_perag_t *pag; - xfs_sb_t *sbp = &mp->m_sb; - uint64_t ifree = 0; - uint64_t ialloc = 0; - uint64_t bfree = 0; - uint64_t bfreelst = 0; - uint64_t btree = 0; - uint64_t fdblocks; - int error = 0; - - for (index = 0; index < agcount; index++) { - /* - * read the agf, then the agi. This gets us - * all the information we need and populates the - * per-ag structures for us. - */ - error = xfs_alloc_pagf_init(mp, NULL, index, 0); - if (error) - return error; - - error = xfs_ialloc_pagi_init(mp, NULL, index); - if (error) - return error; - pag = xfs_perag_get(mp, index); - ifree += pag->pagi_freecount; - ialloc += pag->pagi_count; - bfree += pag->pagf_freeblks; - bfreelst += pag->pagf_flcount; - btree += pag->pagf_btreeblks; - xfs_perag_put(pag); - } - fdblocks = bfree + bfreelst + btree; - - /* - * If the new summary counts are obviously incorrect, fail the - * mount operation because that implies the AGFs are also corrupt. - * Clear FS_COUNTERS so that we don't unmount with a dirty log, which - * will prevent xfs_repair from fixing anything. - */ - if (fdblocks > sbp->sb_dblocks || ifree > ialloc) { - xfs_alert(mp, "AGF corruption. 
Please run xfs_repair."); - error = -EFSCORRUPTED; - goto out; - } - - /* Overwrite incore superblock counters with just-read data */ - spin_lock(&mp->m_sb_lock); - sbp->sb_ifree = ifree; - sbp->sb_icount = ialloc; - sbp->sb_fdblocks = fdblocks; - spin_unlock(&mp->m_sb_lock); - - xfs_reinit_percpu_counters(mp); -out: - xfs_fs_mark_healthy(mp, XFS_SICK_FS_COUNTERS); - return error; -} - -/* * xfs_log_sb() can be used to copy arbitrary changes to the in-core superblock * into the superblock buffer to be logged. It does not provide the higher * level of locking that is needed to protect the in-core superblock from @@ -989,17 +856,18 @@ int xfs_update_secondary_sbs( struct xfs_mount *mp) { - xfs_agnumber_t agno; + struct xfs_perag *pag; + xfs_agnumber_t agno = 1; int saved_error = 0; int error = 0; LIST_HEAD (buffer_list); /* update secondary superblocks. */ - for (agno = 1; agno < mp->m_sb.sb_agcount; agno++) { + for_each_perag_from(mp, agno, pag) { struct xfs_buf *bp; error = xfs_buf_get(mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_SB_DADDR), + XFS_AG_DADDR(mp, pag->pag_agno, XFS_SB_DADDR), XFS_FSS_TO_BB(mp, 1), &bp); /* * If we get an error reading or writing alternate superblocks, @@ -1011,7 +879,7 @@ xfs_update_secondary_sbs( if (error) { xfs_warn(mp, "error allocating secondary superblock for ag %d", - agno); + pag->pag_agno); if (!saved_error) saved_error = error; continue; @@ -1032,7 +900,7 @@ xfs_update_secondary_sbs( if (error) { xfs_warn(mp, "write error %d updating a secondary superblock near ag %d", - error, agno); + error, pag->pag_agno); if (!saved_error) saved_error = error; continue; diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index f79f9dc632b6..0c1602d9b53d 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -13,15 +13,6 @@ struct xfs_trans; struct xfs_fsop_geom; struct xfs_perag; -/* - * perag get/put wrappers for ref counting - */ -extern struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t); -extern 
struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t, - int tag); -extern void xfs_perag_put(struct xfs_perag *pag); -extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t); - extern void xfs_log_sb(struct xfs_trans *tp); extern int xfs_sync_sb(struct xfs_mount *mp, bool wait); extern int xfs_sync_sb_buf(struct xfs_mount *mp); diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 782fdd08f759..25c4cab58851 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -22,30 +22,26 @@ struct xfs_inode; * Buffer verifier operations are widely used, including userspace tools */ extern const struct xfs_buf_ops xfs_agf_buf_ops; -extern const struct xfs_buf_ops xfs_agi_buf_ops; -extern const struct xfs_buf_ops xfs_agf_buf_ops; extern const struct xfs_buf_ops xfs_agfl_buf_ops; -extern const struct xfs_buf_ops xfs_bnobt_buf_ops; -extern const struct xfs_buf_ops xfs_cntbt_buf_ops; -extern const struct xfs_buf_ops xfs_rmapbt_buf_ops; -extern const struct xfs_buf_ops xfs_refcountbt_buf_ops; +extern const struct xfs_buf_ops xfs_agi_buf_ops; extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops; extern const struct xfs_buf_ops xfs_bmbt_buf_ops; +extern const struct xfs_buf_ops xfs_bnobt_buf_ops; +extern const struct xfs_buf_ops xfs_cntbt_buf_ops; extern const struct xfs_buf_ops xfs_da3_node_buf_ops; extern const struct xfs_buf_ops xfs_dquot_buf_ops; -extern const struct xfs_buf_ops xfs_symlink_buf_ops; -extern const struct xfs_buf_ops xfs_agi_buf_ops; -extern const struct xfs_buf_ops xfs_inobt_buf_ops; +extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops; extern const struct xfs_buf_ops xfs_finobt_buf_ops; +extern const struct xfs_buf_ops xfs_inobt_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ra_ops; -extern const struct xfs_buf_ops xfs_dquot_buf_ops; -extern const struct xfs_buf_ops 
xfs_dquot_buf_ra_ops; +extern const struct xfs_buf_ops xfs_refcountbt_buf_ops; +extern const struct xfs_buf_ops xfs_rmapbt_buf_ops; +extern const struct xfs_buf_ops xfs_rtbuf_ops; extern const struct xfs_buf_ops xfs_sb_buf_ops; extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; extern const struct xfs_buf_ops xfs_symlink_buf_ops; -extern const struct xfs_buf_ops xfs_rtbuf_ops; /* log size calculation functions */ int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes); diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c index 04801362e1a7..e8f4abee7892 100644 --- a/fs/xfs/libxfs/xfs_types.c +++ b/fs/xfs/libxfs/xfs_types.c @@ -11,6 +11,7 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" +#include "xfs_ag.h" /* Find the size of the AG, in blocks. */ inline xfs_agblock_t @@ -222,12 +223,13 @@ xfs_icount_range( unsigned long long *max) { unsigned long long nr_inos = 0; + struct xfs_perag *pag; xfs_agnumber_t agno; /* root, rtbitmap, rtsum all live in the first chunk */ *min = XFS_INODES_PER_CHUNK; - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + for_each_perag(mp, agno, pag) { xfs_agino_t first, last; xfs_agino_range(mp, agno, &first, &last); diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 064bd6e8c922..0870ef6f933d 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -21,6 +21,7 @@ typedef int32_t xfs_suminfo_t; /* type of bitmap summary info */ typedef uint32_t xfs_rtword_t; /* word type for bitmap manipulations */ typedef int64_t xfs_lsn_t; /* log sequence number */ +typedef int64_t xfs_csn_t; /* CIL sequence number */ typedef uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ typedef uint32_t xfs_dahash_t; /* dir/attr hash value */ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 7a2f9b5f2db5..be1a7e1e65f7 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -14,6 +14,7 @@ #include "xfs_alloc.h" #include 
"xfs_ialloc.h" #include "xfs_rmap.h" +#include "xfs_ag.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -86,6 +87,7 @@ xchk_superblock( case -ENOSYS: case -EFBIG: error = -EFSCORRUPTED; + fallthrough; default: break; } diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 23690f824ffa..e95f8c98f0f7 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -20,6 +20,7 @@ #include "xfs_rmap.h" #include "xfs_rmap_btree.h" #include "xfs_refcount_btree.h" +#include "xfs_ag.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -245,8 +246,8 @@ xrep_agf_calc_from_btrees( int error; /* Update the AGF counters from the bnobt. */ - cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno, - XFS_BTNUM_BNO); + cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, + sc->sa.pag, XFS_BTNUM_BNO); error = xfs_alloc_query_all(cur, xrep_agf_walk_allocbt, &raa); if (error) goto err; @@ -259,8 +260,8 @@ xrep_agf_calc_from_btrees( agf->agf_longest = cpu_to_be32(raa.longest); /* Update the AGF counters from the cntbt. */ - cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno, - XFS_BTNUM_CNT); + cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, + sc->sa.pag, XFS_BTNUM_CNT); error = xfs_btree_count_blocks(cur, &blocks); if (error) goto err; @@ -268,7 +269,7 @@ xrep_agf_calc_from_btrees( btreeblks += blocks - 1; /* Update the AGF counters from the rmapbt. */ - cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno); + cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); error = xfs_btree_count_blocks(cur, &blocks); if (error) goto err; @@ -281,7 +282,7 @@ xrep_agf_calc_from_btrees( /* Update the AGF counters from the refcountbt. 
*/ if (xfs_sb_version_hasreflink(&mp->m_sb)) { cur = xfs_refcountbt_init_cursor(mp, sc->tp, agf_bp, - sc->sa.agno); + sc->sa.pag); error = xfs_btree_count_blocks(cur, &blocks); if (error) goto err; @@ -453,7 +454,7 @@ xrep_agfl_walk_rmap( /* Record all the OWN_AG blocks. */ if (rec->rm_owner == XFS_RMAP_OWN_AG) { - fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.agno, + fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, rec->rm_startblock); error = xbitmap_set(ra->freesp, fsb, rec->rm_blockcount); if (error) @@ -489,23 +490,23 @@ xrep_agfl_collect_blocks( xbitmap_init(&ra.agmetablocks); /* Find all space used by the free space btrees & rmapbt. */ - cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno); + cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); error = xfs_rmap_query_all(cur, xrep_agfl_walk_rmap, &ra); if (error) goto err; xfs_btree_del_cursor(cur, error); /* Find all blocks currently being used by the bnobt. */ - cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno, - XFS_BTNUM_BNO); + cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, + sc->sa.pag, XFS_BTNUM_BNO); error = xbitmap_set_btblocks(&ra.agmetablocks, cur); if (error) goto err; xfs_btree_del_cursor(cur, error); /* Find all blocks currently being used by the cntbt. 
*/ - cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno, - XFS_BTNUM_CNT); + cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, + sc->sa.pag, XFS_BTNUM_CNT); error = xbitmap_set_btblocks(&ra.agmetablocks, cur); if (error) goto err; @@ -805,8 +806,8 @@ xrep_agi_calc_from_btrees( xfs_agino_t freecount; int error; - cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp, sc->sa.agno, - XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp, + sc->sa.pag, XFS_BTNUM_INO); error = xfs_ialloc_count_inodes(cur, &count, &freecount); if (error) goto err; @@ -827,8 +828,8 @@ xrep_agi_calc_from_btrees( xfs_sb_version_hasinobtcounts(&mp->m_sb)) { xfs_agblock_t blocks; - cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp, sc->sa.agno, - XFS_BTNUM_FINO); + cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp, + sc->sa.pag, XFS_BTNUM_FINO); error = xfs_btree_count_blocks(cur, &blocks); if (error) goto err; diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 2720bd7fe53b..d5741980094a 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -15,6 +15,7 @@ #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" +#include "xfs_ag.h" /* * Set us up to scrub free space btrees. @@ -93,7 +94,7 @@ xchk_allocbt_rec( union xfs_btree_rec *rec) { struct xfs_mount *mp = bs->cur->bc_mp; - xfs_agnumber_t agno = bs->cur->bc_ag.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; xfs_agblock_t bno; xfs_extlen_t len; diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index b5ebf1d1b4db..1d146c9d9de1 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -22,6 +22,7 @@ #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" +#include "xfs_ag.h" /* Set us up with an inode's bmap. 
*/ int @@ -271,7 +272,7 @@ xchk_bmap_iextent_xref( case XFS_DATA_FORK: if (xfs_is_reflink_inode(info->sc->ip)) break; - /* fall through */ + fallthrough; case XFS_ATTR_FORK: xchk_xref_is_not_shared(info->sc, agbno, irec->br_blockcount); @@ -514,7 +515,7 @@ xchk_bmap_check_rmap( xchk_fblock_set_corrupt(sc, sbcri->whichfork, rec->rm_offset); if (irec.br_startblock != XFS_AGB_TO_FSB(sc->mp, - cur->bc_ag.agno, rec->rm_startblock)) + cur->bc_ag.pag->pag_agno, rec->rm_startblock)) xchk_fblock_set_corrupt(sc, sbcri->whichfork, rec->rm_offset); if (irec.br_blockcount > rec->rm_blockcount) @@ -544,18 +545,18 @@ STATIC int xchk_bmap_check_ag_rmaps( struct xfs_scrub *sc, int whichfork, - xfs_agnumber_t agno) + struct xfs_perag *pag) { struct xchk_bmap_check_rmap_info sbcri; struct xfs_btree_cur *cur; struct xfs_buf *agf; int error; - error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf); + error = xfs_alloc_read_agf(sc->mp, sc->tp, pag->pag_agno, 0, &agf); if (error) return error; - cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf, agno); + cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf, pag); sbcri.sc = sc; sbcri.whichfork = whichfork; @@ -575,6 +576,7 @@ xchk_bmap_check_rmaps( int whichfork) { struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, whichfork); + struct xfs_perag *pag; xfs_agnumber_t agno; bool zero_size; int error; @@ -607,15 +609,16 @@ xchk_bmap_check_rmaps( (zero_size || ifp->if_nextents > 0)) return 0; - for (agno = 0; agno < sc->mp->m_sb.sb_agcount; agno++) { - error = xchk_bmap_check_ag_rmaps(sc, whichfork, agno); + for_each_perag(sc->mp, agno, pag) { + error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag); if (error) - return error; + break; if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) break; } - - return 0; + if (pag) + xfs_perag_put(pag); + return error; } /* diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index a94bd8122c60..bd1172358964 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -44,7 +44,7 @@ 
__xchk_btree_process_error( /* Note the badness but don't abort. */ sc->sm->sm_flags |= errflag; *error = 0; - /* fall through */ + fallthrough; default: if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) trace_xchk_ifork_btree_op_error(sc, cur, level, diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index be38c960da85..8558ca05e11d 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -12,7 +12,6 @@ #include "xfs_btree.h" #include "xfs_log_format.h" #include "xfs_trans.h" -#include "xfs_sb.h" #include "xfs_inode.h" #include "xfs_icache.h" #include "xfs_alloc.h" @@ -26,6 +25,7 @@ #include "xfs_trans_priv.h" #include "xfs_attr.h" #include "xfs_reflink.h" +#include "xfs_ag.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -83,7 +83,7 @@ __xchk_process_error( /* Note the badness but don't abort. */ sc->sm->sm_flags |= errflag; *error = 0; - /* fall through */ + fallthrough; default: trace_xchk_op_error(sc, agno, bno, *error, ret_ip); @@ -136,7 +136,7 @@ __xchk_fblock_process_error( /* Note the badness but don't abort. */ sc->sm->sm_flags |= errflag; *error = 0; - /* fall through */ + fallthrough; default: trace_xchk_file_op_error(sc, whichfork, offset, *error, ret_ip); @@ -460,49 +460,48 @@ xchk_ag_btcur_init( struct xchk_ag *sa) { struct xfs_mount *mp = sc->mp; - xfs_agnumber_t agno = sa->agno; xchk_perag_get(sc->mp, sa); if (sa->agf_bp && xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_BNO)) { /* Set up a bnobt cursor for cross-referencing. */ sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, - agno, XFS_BTNUM_BNO); + sa->pag, XFS_BTNUM_BNO); } if (sa->agf_bp && xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_CNT)) { /* Set up a cntbt cursor for cross-referencing. */ sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, - agno, XFS_BTNUM_CNT); + sa->pag, XFS_BTNUM_CNT); } /* Set up a inobt cursor for cross-referencing. 
*/ if (sa->agi_bp && xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_INO)) { sa->ino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp, - agno, XFS_BTNUM_INO); + sa->pag, XFS_BTNUM_INO); } /* Set up a finobt cursor for cross-referencing. */ if (sa->agi_bp && xfs_sb_version_hasfinobt(&mp->m_sb) && xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) { sa->fino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp, - agno, XFS_BTNUM_FINO); + sa->pag, XFS_BTNUM_FINO); } /* Set up a rmapbt cursor for cross-referencing. */ if (sa->agf_bp && xfs_sb_version_hasrmapbt(&mp->m_sb) && xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_RMAP)) { sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp, - agno); + sa->pag); } /* Set up a refcountbt cursor for cross-referencing. */ if (sa->agf_bp && xfs_sb_version_hasreflink(&mp->m_sb) && xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_REFC)) { sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp, - sa->agf_bp, agno); + sa->agf_bp, sa->pag); } } @@ -696,7 +695,7 @@ xchk_get_inode( if (error) return -ENOENT; error = -EFSCORRUPTED; - /* fall through */ + fallthrough; default: trace_xchk_op_error(sc, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino), diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index 653f3280e1c1..9f0dbb47c82c 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -47,7 +47,7 @@ xchk_da_process_error( /* Note the badness but don't abort. 
*/ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; *error = 0; - /* fall through */ + fallthrough; default: trace_xchk_file_op_error(sc, ds->dargs.whichfork, xfs_dir2_da_to_db(ds->dargs.geo, diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index f1d1a8c58853..fd7941e04ae1 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -9,11 +9,11 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_sb.h" #include "xfs_alloc.h" #include "xfs_ialloc.h" #include "xfs_health.h" #include "xfs_btree.h" +#include "xfs_ag.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -71,11 +71,11 @@ xchk_fscount_warmup( xfs_agnumber_t agno; int error = 0; - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - pag = xfs_perag_get(mp, agno); - + for_each_perag(mp, agno, pag) { + if (xchk_should_terminate(sc, &error)) + break; if (pag->pagi_init && pag->pagf_init) - goto next_loop_perag; + continue; /* Lock both AG headers. */ error = xfs_ialloc_read_agi(mp, sc->tp, agno, &agi_bp); @@ -89,21 +89,15 @@ xchk_fscount_warmup( * These are supposed to be initialized by the header read * function. */ - error = -EFSCORRUPTED; - if (!pag->pagi_init || !pag->pagf_init) + if (!pag->pagi_init || !pag->pagf_init) { + error = -EFSCORRUPTED; break; + } xfs_buf_relse(agf_bp); agf_bp = NULL; xfs_buf_relse(agi_bp); agi_bp = NULL; -next_loop_perag: - xfs_perag_put(pag); - pag = NULL; - error = 0; - - if (xchk_should_terminate(sc, &error)) - break; } if (agf_bp) @@ -196,13 +190,14 @@ retry: fsc->ifree = 0; fsc->fdblocks = 0; - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - pag = xfs_perag_get(mp, agno); + for_each_perag(mp, agno, pag) { + if (xchk_should_terminate(sc, &error)) + break; /* This somehow got unset since the warmup? 
*/ if (!pag->pagi_init || !pag->pagf_init) { - xfs_perag_put(pag); - return -EFSCORRUPTED; + error = -EFSCORRUPTED; + break; } /* Count all the inodes */ @@ -216,10 +211,8 @@ retry: fsc->fdblocks += pag->pagf_btreeblks; } else { error = xchk_fscount_btreeblks(sc, fsc, agno); - if (error) { - xfs_perag_put(pag); + if (error) break; - } } /* @@ -229,12 +222,9 @@ retry: fsc->fdblocks -= pag->pag_meta_resv.ar_reserved; fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved; - xfs_perag_put(pag); - - if (xchk_should_terminate(sc, &error)) - break; } - + if (pag) + xfs_perag_put(pag); if (error) return error; diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index 3de59b5c2ce6..2e61df3bca83 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -8,7 +8,7 @@ #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_btree.h" -#include "xfs_sb.h" +#include "xfs_ag.h" #include "xfs_health.h" #include "scrub/scrub.h" #include "scrub/health.h" diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 8d9f3fb0cd22..30e568596b79 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -21,6 +21,7 @@ #include "scrub/common.h" #include "scrub/btree.h" #include "scrub/trace.h" +#include "xfs_ag.h" /* * Set us up to scrub inode btrees. @@ -103,7 +104,7 @@ xchk_iallocbt_chunk( xfs_extlen_t len) { struct xfs_mount *mp = bs->cur->bc_mp; - xfs_agnumber_t agno = bs->cur->bc_ag.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; xfs_agblock_t bno; bno = XFS_AGINO_TO_AGBNO(mp, agino); @@ -163,7 +164,7 @@ xchk_iallocbt_check_cluster_ifree( * the record, compute which fs inode we're talking about. 
*/ agino = irec->ir_startino + irec_ino; - fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_ag.agno, agino); + fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_ag.pag->pag_agno, agino); irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino)); if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC || @@ -213,7 +214,7 @@ xchk_iallocbt_check_cluster( struct xfs_mount *mp = bs->cur->bc_mp; struct xfs_buf *cluster_bp; unsigned int nr_inodes; - xfs_agnumber_t agno = bs->cur->bc_ag.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; xfs_agblock_t agbno; unsigned int cluster_index; uint16_t cluster_mask = 0; @@ -423,7 +424,7 @@ xchk_iallocbt_rec( struct xchk_iallocbt *iabt = bs->private; struct xfs_inobt_rec_incore irec; uint64_t holes; - xfs_agnumber_t agno = bs->cur->bc_ag.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; xfs_agino_t agino; xfs_extlen_t len; int holecount; diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 744530a66c0c..7014b7408bad 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -13,6 +13,7 @@ #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" +#include "xfs_ag.h" /* * Set us up to scrub reference count btrees. 
@@ -333,7 +334,7 @@ xchk_refcountbt_rec( { struct xfs_mount *mp = bs->cur->bc_mp; xfs_agblock_t *cow_blocks = bs->private; - xfs_agnumber_t agno = bs->cur->bc_ag.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; xfs_agblock_t bno; xfs_extlen_t len; xfs_nlink_t refcount; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index c2857d854c83..ebb0e245aa72 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -22,6 +22,7 @@ #include "xfs_rmap_btree.h" #include "xfs_refcount_btree.h" #include "xfs_extent_busy.h" +#include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_quota.h" #include "scrub/scrub.h" @@ -303,7 +304,7 @@ xrep_alloc_ag_block( return error; if (bno == NULLAGBLOCK) return -ENOSPC; - xfs_extent_busy_reuse(sc->mp, sc->sa.agno, bno, + xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno, 1, false); *fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, bno); if (resv == XFS_AG_RESV_RMAPBT) @@ -508,7 +509,7 @@ xrep_put_freelist( * create an rmap for the block prior to merging it or else other * parts will break. */ - error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, agbno, 1, + error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1, &XFS_RMAP_OINFO_AG); if (error) return error; @@ -518,7 +519,7 @@ xrep_put_freelist( agbno, 0); if (error) return error; - xfs_extent_busy_insert(sc->tp, sc->sa.agno, agbno, 1, + xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); return 0; @@ -554,7 +555,7 @@ xrep_reap_block( } else { agf_bp = sc->sa.agf_bp; } - cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, agno); + cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, sc->sa.pag); /* Can we find any other rmappings? */ error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap); @@ -576,7 +577,8 @@ xrep_reap_block( * to run xfs_repair. 
*/ if (has_other_rmap) - error = xfs_rmap_free(sc->tp, agf_bp, agno, agbno, 1, oinfo); + error = xfs_rmap_free(sc->tp, agf_bp, sc->sa.pag, agbno, + 1, oinfo); else if (resv == XFS_AG_RESV_AGFL) error = xrep_put_freelist(sc, agbno); else @@ -891,7 +893,7 @@ xrep_find_ag_btree_roots( fab->height = 0; } - cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno); + cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); error = xfs_rmap_query_all(cur, xrep_findroot_rmap, &ri); xfs_btree_del_cursor(cur, error); @@ -947,7 +949,7 @@ xrep_ino_dqattach( xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot) xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); - /* fall through */ + fallthrough; case -ESRCH: error = 0; break; diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index a4f17477c5d1..fc306573f0ac 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -15,6 +15,7 @@ #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" +#include "xfs_ag.h" /* * Set us up to scrub reverse mapping btrees. @@ -91,7 +92,7 @@ xchk_rmapbt_rec( { struct xfs_mount *mp = bs->cur->bc_mp; struct xfs_rmap_irec irec; - xfs_agnumber_t agno = bs->cur->bc_ag.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; bool non_inode; bool is_unwritten; bool is_bmbt; diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 2c6c248be823..03882a605a3c 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -13,6 +13,7 @@ #include "xfs_inode.h" #include "xfs_btree.h" #include "scrub/scrub.h" +#include "xfs_ag.h" /* Figure out which block the btree cursor was pointing to. 
*/ static inline xfs_fsblock_t @@ -26,7 +27,7 @@ xchk_btree_cur_fsbno( cur->bc_flags & XFS_BTREE_LONG_PTRS) return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_ino.ip->i_ino); else if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS)) - return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.agno, 0); + return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, 0); return NULLFSBLOCK; } diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 826caa6b4a5a..cb4e0fcf4c76 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -561,7 +561,7 @@ const struct address_space_operations xfs_address_space_operations = { .readahead = xfs_vm_readahead, .writepage = xfs_vm_writepage, .writepages = xfs_vm_writepages, - .set_page_dirty = iomap_set_page_dirty, + .set_page_dirty = __set_page_dirty_nobuffers, .releasepage = iomap_releasepage, .invalidatepage = iomap_invalidatepage, .bmap = xfs_vm_bmap, @@ -575,7 +575,7 @@ const struct address_space_operations xfs_address_space_operations = { const struct address_space_operations xfs_dax_aops = { .writepages = xfs_dax_writepages, .direct_IO = noop_direct_IO, - .set_page_dirty = noop_set_page_dirty, + .set_page_dirty = __set_page_dirty_no_writeback, .invalidatepage = noop_invalidatepage, .swap_activate = xfs_iomap_swapfile_activate, }; diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index bfad669e6b2f..aaa7e66c42d7 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -15,10 +15,10 @@ #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_inode.h" +#include "xfs_attr.h" #include "xfs_attr_remote.h" #include "xfs_trans.h" #include "xfs_bmap.h" -#include "xfs_attr.h" #include "xfs_attr_leaf.h" #include "xfs_quota.h" #include "xfs_dir2.h" diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c index 17f36db2f792..667e297f59b1 100644 --- a/fs/xfs/xfs_bio_io.c +++ b/fs/xfs/xfs_bio_io.c @@ -9,6 +9,41 @@ static inline unsigned int bio_max_vecs(unsigned int count) return bio_max_segs(howmany(count, PAGE_SIZE)); } 
+static void +xfs_flush_bdev_async_endio( + struct bio *bio) +{ + complete(bio->bi_private); +} + +/* + * Submit a request for an async cache flush to run. If the request queue does + * not require flush operations, just skip it altogether. If the caller needs + * to wait for the flush completion at a later point in time, they must supply a + * valid completion. This will be signalled when the flush completes. The + * caller never sees the bio that is issued here. + */ +void +xfs_flush_bdev_async( + struct bio *bio, + struct block_device *bdev, + struct completion *done) +{ + struct request_queue *q = bdev->bd_disk->queue; + + if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { + complete(done); + return; + } + + bio_init(bio, NULL, 0); + bio_set_dev(bio, bdev); + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; + bio->bi_private = done; + bio->bi_end_io = xfs_flush_bdev_async_endio; + + submit_bio(bio); +} int xfs_rw_bdev( struct block_device *bdev, diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 0936f3a96fe6..213a97a921bb 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -286,7 +286,7 @@ xfs_bmap_count_blocks( */ *count += btblocks - 1; - /* fall through */ + fallthrough; case XFS_DINODE_FMT_EXTENTS: *nextents = xfs_bmap_count_leaves(ifp, count); break; @@ -945,7 +945,7 @@ xfs_flush_unmap_range( xfs_off_t rounding, start, end; int error; - rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE); + rounding = max_t(xfs_off_t, mp->m_sb.sb_blocksize, PAGE_SIZE); start = round_down(offset, rounding); end = round_up(offset + len, rounding) - 1; @@ -1053,9 +1053,9 @@ xfs_prepare_shift( * extent (after split) during the shift and corrupt the file. Start * with the block just prior to the start to stabilize the boundary. 
*/ - offset = round_down(offset, 1 << mp->m_sb.sb_blocklog); + offset = round_down(offset, mp->m_sb.sb_blocksize); if (offset) - offset -= (1 << mp->m_sb.sb_blocklog); + offset -= mp->m_sb.sb_blocksize; /* * Writeback and invalidate cache for the remainder of the file as we're diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 592800c8852f..8ff42b3585e0 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -10,7 +10,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_trace.h" #include "xfs_log.h" @@ -19,12 +18,10 @@ #include "xfs_buf_item.h" #include "xfs_errortag.h" #include "xfs_error.h" +#include "xfs_ag.h" static kmem_zone_t *xfs_buf_zone; -#define xb_to_gfp(flags) \ - ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN) - /* * Locking orders * @@ -79,7 +76,7 @@ static inline int xfs_buf_vmap_len( struct xfs_buf *bp) { - return (bp->b_page_count * PAGE_SIZE) - bp->b_offset; + return (bp->b_page_count * PAGE_SIZE); } /* @@ -272,51 +269,30 @@ _xfs_buf_alloc( return 0; } -/* - * Allocate a page array capable of holding a specified number - * of pages, and point the page buf at it. 
- */ -STATIC int -_xfs_buf_get_pages( - struct xfs_buf *bp, - int page_count) +static void +xfs_buf_free_pages( + struct xfs_buf *bp) { - /* Make sure that we have a page list */ - if (bp->b_pages == NULL) { - bp->b_page_count = page_count; - if (page_count <= XB_PAGES) { - bp->b_pages = bp->b_page_array; - } else { - bp->b_pages = kmem_alloc(sizeof(struct page *) * - page_count, KM_NOFS); - if (bp->b_pages == NULL) - return -ENOMEM; - } - memset(bp->b_pages, 0, sizeof(struct page *) * page_count); + uint i; + + ASSERT(bp->b_flags & _XBF_PAGES); + + if (xfs_buf_is_vmapped(bp)) + vm_unmap_ram(bp->b_addr, bp->b_page_count); + + for (i = 0; i < bp->b_page_count; i++) { + if (bp->b_pages[i]) + __free_page(bp->b_pages[i]); } - return 0; -} + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += bp->b_page_count; -/* - * Frees b_pages if it was allocated. - */ -STATIC void -_xfs_buf_free_pages( - struct xfs_buf *bp) -{ - if (bp->b_pages != bp->b_page_array) { + if (bp->b_pages != bp->b_page_array) kmem_free(bp->b_pages); - bp->b_pages = NULL; - } + bp->b_pages = NULL; + bp->b_flags &= ~_XBF_PAGES; } -/* - * Releases the specified buffer. - * - * The modification state of any associated pages is left unchanged. 
- * The buffer must not be on any hash - use xfs_buf_rele instead for - * hashed and refcounted buffers - */ static void xfs_buf_free( struct xfs_buf *bp) @@ -325,137 +301,103 @@ xfs_buf_free( ASSERT(list_empty(&bp->b_lru)); - if (bp->b_flags & _XBF_PAGES) { - uint i; - - if (xfs_buf_is_vmapped(bp)) - vm_unmap_ram(bp->b_addr - bp->b_offset, - bp->b_page_count); - - for (i = 0; i < bp->b_page_count; i++) { - struct page *page = bp->b_pages[i]; - - __free_page(page); - } - if (current->reclaim_state) - current->reclaim_state->reclaimed_slab += - bp->b_page_count; - } else if (bp->b_flags & _XBF_KMEM) + if (bp->b_flags & _XBF_PAGES) + xfs_buf_free_pages(bp); + else if (bp->b_flags & _XBF_KMEM) kmem_free(bp->b_addr); - _xfs_buf_free_pages(bp); + xfs_buf_free_maps(bp); kmem_cache_free(xfs_buf_zone, bp); } -/* - * Allocates all the pages for buffer in question and builds it's page list. - */ -STATIC int -xfs_buf_allocate_memory( - struct xfs_buf *bp, - uint flags) +static int +xfs_buf_alloc_kmem( + struct xfs_buf *bp, + xfs_buf_flags_t flags) { - size_t size; - size_t nbytes, offset; - gfp_t gfp_mask = xb_to_gfp(flags); - unsigned short page_count, i; - xfs_off_t start, end; - int error; - xfs_km_flags_t kmflag_mask = 0; + int align_mask = xfs_buftarg_dma_alignment(bp->b_target); + xfs_km_flags_t kmflag_mask = KM_NOFS; + size_t size = BBTOB(bp->b_length); - /* - * assure zeroed buffer for non-read cases. - */ - if (!(flags & XBF_READ)) { + /* Assure zeroed buffer for non-read cases. */ + if (!(flags & XBF_READ)) kmflag_mask |= KM_ZERO; - gfp_mask |= __GFP_ZERO; - } - /* - * for buffers that are contained within a single page, just allocate - * the memory from the heap - there's no need for the complexity of - * page arrays to keep allocation down to order 0. 
- */ - size = BBTOB(bp->b_length); - if (size < PAGE_SIZE) { - int align_mask = xfs_buftarg_dma_alignment(bp->b_target); - bp->b_addr = kmem_alloc_io(size, align_mask, - KM_NOFS | kmflag_mask); - if (!bp->b_addr) { - /* low memory - use alloc_page loop instead */ - goto use_alloc_page; - } + bp->b_addr = kmem_alloc_io(size, align_mask, kmflag_mask); + if (!bp->b_addr) + return -ENOMEM; - if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) != - ((unsigned long)bp->b_addr & PAGE_MASK)) { - /* b_addr spans two pages - use alloc_page instead */ - kmem_free(bp->b_addr); - bp->b_addr = NULL; - goto use_alloc_page; - } - bp->b_offset = offset_in_page(bp->b_addr); - bp->b_pages = bp->b_page_array; - bp->b_pages[0] = kmem_to_page(bp->b_addr); - bp->b_page_count = 1; - bp->b_flags |= _XBF_KMEM; - return 0; + if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) != + ((unsigned long)bp->b_addr & PAGE_MASK)) { + /* b_addr spans two pages - use alloc_page instead */ + kmem_free(bp->b_addr); + bp->b_addr = NULL; + return -ENOMEM; } + bp->b_offset = offset_in_page(bp->b_addr); + bp->b_pages = bp->b_page_array; + bp->b_pages[0] = kmem_to_page(bp->b_addr); + bp->b_page_count = 1; + bp->b_flags |= _XBF_KMEM; + return 0; +} -use_alloc_page: - start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT; - end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1) - >> PAGE_SHIFT; - page_count = end - start; - error = _xfs_buf_get_pages(bp, page_count); - if (unlikely(error)) - return error; +static int +xfs_buf_alloc_pages( + struct xfs_buf *bp, + xfs_buf_flags_t flags) +{ + gfp_t gfp_mask = __GFP_NOWARN; + long filled = 0; - offset = bp->b_offset; + if (flags & XBF_READ_AHEAD) + gfp_mask |= __GFP_NORETRY; + else + gfp_mask |= GFP_NOFS; + + /* Make sure that we have a page list */ + bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE); + if (bp->b_page_count <= XB_PAGES) { + bp->b_pages = bp->b_page_array; + } else { + bp->b_pages = kzalloc(sizeof(struct page *) * 
bp->b_page_count, + gfp_mask); + if (!bp->b_pages) + return -ENOMEM; + } bp->b_flags |= _XBF_PAGES; - for (i = 0; i < bp->b_page_count; i++) { - struct page *page; - uint retries = 0; -retry: - page = alloc_page(gfp_mask); - if (unlikely(page == NULL)) { - if (flags & XBF_READ_AHEAD) { - bp->b_page_count = i; - error = -ENOMEM; - goto out_free_pages; - } + /* Assure zeroed buffer for non-read cases. */ + if (!(flags & XBF_READ)) + gfp_mask |= __GFP_ZERO; - /* - * This could deadlock. - * - * But until all the XFS lowlevel code is revamped to - * handle buffer allocation failures we can't do much. - */ - if (!(++retries % 100)) - xfs_err(NULL, - "%s(%u) possible memory allocation deadlock in %s (mode:0x%x)", - current->comm, current->pid, - __func__, gfp_mask); - - XFS_STATS_INC(bp->b_mount, xb_page_retries); - congestion_wait(BLK_RW_ASYNC, HZ/50); - goto retry; + /* + * Bulk filling of pages can take multiple calls. Not filling the entire + * array is not an allocation failure, so don't back off if we get at + * least one extra page. 
+ */ + for (;;) { + long last = filled; + + filled = alloc_pages_bulk_array(gfp_mask, bp->b_page_count, + bp->b_pages); + if (filled == bp->b_page_count) { + XFS_STATS_INC(bp->b_mount, xb_page_found); + break; } - XFS_STATS_INC(bp->b_mount, xb_page_found); + if (filled != last) + continue; - nbytes = min_t(size_t, size, PAGE_SIZE - offset); - size -= nbytes; - bp->b_pages[i] = page; - offset = 0; + if (flags & XBF_READ_AHEAD) { + xfs_buf_free_pages(bp); + return -ENOMEM; + } + + XFS_STATS_INC(bp->b_mount, xb_page_retries); + congestion_wait(BLK_RW_ASYNC, HZ / 50); } return 0; - -out_free_pages: - for (i = 0; i < bp->b_page_count; i++) - __free_page(bp->b_pages[i]); - bp->b_flags &= ~_XBF_PAGES; - return error; } /* @@ -469,7 +411,7 @@ _xfs_buf_map_pages( ASSERT(bp->b_flags & _XBF_PAGES); if (bp->b_page_count == 1) { /* A single page buffer is always mappable */ - bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; + bp->b_addr = page_address(bp->b_pages[0]); } else if (flags & XBF_UNMAPPED) { bp->b_addr = NULL; } else { @@ -496,7 +438,6 @@ _xfs_buf_map_pages( if (!bp->b_addr) return -ENOMEM; - bp->b_addr += bp->b_offset; } return 0; @@ -707,7 +648,7 @@ xfs_buf_get_map( { struct xfs_buf *bp; struct xfs_buf *new_bp; - int error = 0; + int error; *bpp = NULL; error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp); @@ -720,17 +661,22 @@ xfs_buf_get_map( if (error) return error; - error = xfs_buf_allocate_memory(new_bp, flags); - if (error) { - xfs_buf_free(new_bp); - return error; + /* + * For buffers that fit entirely within a single page, first attempt to + * allocate the memory from the heap to minimise memory usage. If we + * can't get heap memory for these small buffers, we fall back to using + * the page allocator. 
+ */ + if (BBTOB(new_bp->b_length) >= PAGE_SIZE || + xfs_buf_alloc_kmem(new_bp, flags) < 0) { + error = xfs_buf_alloc_pages(new_bp, flags); + if (error) + goto out_free_buf; } error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp); - if (error) { - xfs_buf_free(new_bp); - return error; - } + if (error) + goto out_free_buf; if (bp != new_bp) xfs_buf_free(new_bp); @@ -758,6 +704,9 @@ found: trace_xfs_buf_get(bp, flags, _RET_IP_); *bpp = bp; return 0; +out_free_buf: + xfs_buf_free(new_bp); + return error; } int @@ -950,8 +899,7 @@ xfs_buf_get_uncached( int flags, struct xfs_buf **bpp) { - unsigned long page_count; - int error, i; + int error; struct xfs_buf *bp; DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); @@ -960,41 +908,25 @@ xfs_buf_get_uncached( /* flags might contain irrelevant bits, pass only what we care about */ error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp); if (error) - goto fail; + return error; - page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT; - error = _xfs_buf_get_pages(bp, page_count); + error = xfs_buf_alloc_pages(bp, flags); if (error) goto fail_free_buf; - for (i = 0; i < page_count; i++) { - bp->b_pages[i] = alloc_page(xb_to_gfp(flags)); - if (!bp->b_pages[i]) { - error = -ENOMEM; - goto fail_free_mem; - } - } - bp->b_flags |= _XBF_PAGES; - error = _xfs_buf_map_pages(bp, 0); if (unlikely(error)) { xfs_warn(target->bt_mount, "%s: failed to map pages", __func__); - goto fail_free_mem; + goto fail_free_buf; } trace_xfs_buf_get_uncached(bp, _RET_IP_); *bpp = bp; return 0; - fail_free_mem: - while (--i >= 0) - __free_page(bp->b_pages[i]); - _xfs_buf_free_pages(bp); - fail_free_buf: - xfs_buf_free_maps(bp); - kmem_cache_free(xfs_buf_zone, bp); - fail: +fail_free_buf: + xfs_buf_free(bp); return error; } @@ -1722,7 +1654,6 @@ xfs_buf_offset( if (bp->b_addr) return bp->b_addr + offset; - offset += bp->b_offset; page = bp->b_pages[offset >> PAGE_SHIFT]; return page_address(page) + (offset & (PAGE_SIZE-1)); } 
@@ -1958,7 +1889,7 @@ xfs_free_buftarg( percpu_counter_destroy(&btp->bt_io_count); list_lru_destroy(&btp->bt_lru); - xfs_blkdev_issue_flush(btp); + blkdev_issue_flush(btp->bt_bdev); kmem_free(btp); } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 459ca34f26f5..464dc548fa23 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -167,7 +167,8 @@ struct xfs_buf { atomic_t b_pin_count; /* pin count */ atomic_t b_io_remaining; /* #outstanding I/O requests */ unsigned int b_page_count; /* size of page array */ - unsigned int b_offset; /* page offset in first page */ + unsigned int b_offset; /* page offset of b_addr, + only for _XBF_KMEM buffers */ int b_error; /* error code on I/O */ /* diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index fb69879e4b2b..2828ce45b701 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -74,14 +74,12 @@ xfs_buf_item_straddle( } /* - * This returns the number of log iovecs needed to log the - * given buf log item. + * Return the number of log iovecs and space needed to log the given buf log + * item segment. * - * It calculates this as 1 iovec for the buf log format structure - * and 1 for each stretch of non-contiguous chunks to be logged. - * Contiguous chunks are logged in a single iovec. - * - * If the XFS_BLI_STALE flag has been set, then log nothing. + * It calculates this as 1 iovec for the buf log format structure and 1 for each + * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged + * in a single iovec. */ STATIC void xfs_buf_item_size_segment( @@ -168,11 +166,8 @@ slow_scan: } /* - * This returns the number of log iovecs needed to log the given buf log item. - * - * It calculates this as 1 iovec for the buf log format structure and 1 for each - * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged - * in a single iovec. + * Return the number of log iovecs and space needed to log the given buf log + * item. 
* * Discontiguous buffers need a format structure per region that is being * logged. This makes the changes in the buffer appear to log recovery as though @@ -182,7 +177,11 @@ slow_scan: * what ends up on disk. * * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log - * format structures. + * format structures. If the item has previously been logged and has dirty + * regions, we do not relog them in stale buffers. This has the effect of + * reducing the size of the relogged item by the amount of dirty data tracked + * by the log item. This can result in the committing transaction reducing the + * amount of space being consumed by the CIL. */ STATIC void xfs_buf_item_size( @@ -199,9 +198,9 @@ xfs_buf_item_size( ASSERT(atomic_read(&bip->bli_refcount) > 0); if (bip->bli_flags & XFS_BLI_STALE) { /* - * The buffer is stale, so all we need to log - * is the buf log format structure with the - * cancel flag in it. + * The buffer is stale, so all we need to log is the buf log + * format structure with the cancel flag in it as we are never + * going to replay the changes tracked in the log item. */ trace_xfs_buf_item_size_stale(bip); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); @@ -216,9 +215,9 @@ xfs_buf_item_size( if (bip->bli_flags & XFS_BLI_ORDERED) { /* - * The buffer has been logged just to order it. - * It is not being included in the transaction - * commit, so no vectors are used at all. + * The buffer has been logged just to order it. It is not being + * included in the transaction commit, so no vectors are used at + * all. */ trace_xfs_buf_item_size_ordered(bip); *nvecs = XFS_LOG_VEC_ORDERED; @@ -475,17 +474,8 @@ xfs_buf_item_pin( } /* - * This is called to unpin the buffer associated with the buf log - * item which was previously pinned with a call to xfs_buf_item_pin(). - * - * Also drop the reference to the buf item for the current transaction. 
- * If the XFS_BLI_STALE flag is set and we are the last reference, - * then free up the buf log item and unlock the buffer. - * - * If the remove flag is set we are called from uncommit in the - * forced-shutdown path. If that is true and the reference count on - * the log item is going to drop to zero we need to free the item's - * descriptor in the transaction. + * This is called to unpin the buffer associated with the buf log item which + * was previously pinned with a call to xfs_buf_item_pin(). */ STATIC void xfs_buf_item_unpin( @@ -502,38 +492,35 @@ xfs_buf_item_unpin( trace_xfs_buf_item_unpin(bip); + /* + * Drop the bli ref associated with the pin and grab the hold required + * for the I/O simulation failure in the abort case. We have to do this + * before the pin count drops because the AIL doesn't acquire a bli + * reference. Therefore if the refcount drops to zero, the bli could + * still be AIL resident and the buffer submitted for I/O (and freed on + * completion) at any point before we return. This can be removed once + * the AIL properly holds a reference on the bli. + */ freed = atomic_dec_and_test(&bip->bli_refcount); - + if (freed && !stale && remove) + xfs_buf_hold(bp); if (atomic_dec_and_test(&bp->b_pin_count)) wake_up_all(&bp->b_waiters); - if (freed && stale) { + /* nothing to do but drop the pin count if the bli is active */ + if (!freed) + return; + + if (stale) { ASSERT(bip->bli_flags & XFS_BLI_STALE); ASSERT(xfs_buf_islocked(bp)); ASSERT(bp->b_flags & XBF_STALE); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); + ASSERT(list_empty(&lip->li_trans)); + ASSERT(!bp->b_transp); trace_xfs_buf_item_unpin_stale(bip); - if (remove) { - /* - * If we are in a transaction context, we have to - * remove the log item from the transaction as we are - * about to release our reference to the buffer. If we - * don't, the unlock that occurs later in - * xfs_trans_uncommit() will try to reference the - * buffer which we no longer have a hold on. 
- */ - if (!list_empty(&lip->li_trans)) - xfs_trans_del_item(lip); - - /* - * Since the transaction no longer refers to the buffer, - * the buffer should no longer refer to the transaction. - */ - bp->b_transp = NULL; - } - /* * If we get called here because of an IO error, we may or may * not have the item on the AIL. xfs_trans_ail_delete() will @@ -550,13 +537,13 @@ xfs_buf_item_unpin( ASSERT(bp->b_log_item == NULL); } xfs_buf_relse(bp); - } else if (freed && remove) { + } else if (remove) { /* * The buffer must be locked and held by the caller to simulate - * an async I/O failure. + * an async I/O failure. We acquired the hold for this case + * before the buffer was unpinned. */ xfs_buf_lock(bp); - xfs_buf_hold(bp); bp->b_flags |= XBF_ASYNC; xfs_buf_ioend_fail(bp); } @@ -714,7 +701,7 @@ xfs_buf_item_release( STATIC void xfs_buf_item_committing( struct xfs_log_item *lip, - xfs_lsn_t commit_lsn) + xfs_csn_t seq) { return xfs_buf_item_release(lip); } diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index f979d0d7e6cd..736df5660f1f 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -8,7 +8,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_btree.h" #include "xfs_alloc_btree.h" @@ -18,6 +17,7 @@ #include "xfs_extent_busy.h" #include "xfs_trace.h" #include "xfs_log.h" +#include "xfs_ag.h" STATIC int xfs_trim_extents( @@ -50,7 +50,7 @@ xfs_trim_extents( goto out_put_perag; agf = agbp->b_addr; - cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT); + cur = xfs_allocbt_init_cursor(mp, NULL, agbp, pag, XFS_BTNUM_CNT); /* * Look up the longest btree in the AGF and start with it. @@ -108,7 +108,7 @@ xfs_trim_extents( * If any blocks in the range are still busy, skip the * discard and try again the next time. 
*/ - if (xfs_extent_busy_search(mp, agno, fbno, flen)) { + if (xfs_extent_busy_search(mp, pag, fbno, flen)) { trace_xfs_discard_busy(mp, agno, fbno, flen); goto next_extent; } diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 8c1fdf37ee8f..8ed47b739b6c 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -188,7 +188,7 @@ xfs_qm_dquot_logitem_release( STATIC void xfs_qm_dquot_logitem_committing( struct xfs_log_item *lip, - xfs_lsn_t commit_lsn) + xfs_csn_t seq) { return xfs_qm_dquot_logitem_release(lip); } diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 465fd9e048d4..1da59bdff245 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -84,7 +84,7 @@ xfs_fs_encode_fh( case FILEID_INO32_GEN_PARENT: fid->i32.parent_ino = XFS_I(parent)->i_ino; fid->i32.parent_gen = parent->i_generation; - /*FALLTHRU*/ + fallthrough; case FILEID_INO32_GEN: fid->i32.ino = XFS_I(inode)->i_ino; fid->i32.gen = inode->i_generation; @@ -92,7 +92,7 @@ xfs_fs_encode_fh( case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: fid64->parent_ino = XFS_I(parent)->i_ino; fid64->parent_gen = parent->i_generation; - /*FALLTHRU*/ + fallthrough; case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: fid64->ino = XFS_I(inode)->i_ino; fid64->gen = inode->i_generation; diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index a4075685d9eb..ad22a003f959 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -11,39 +11,37 @@ #include "xfs_log_format.h" #include "xfs_shared.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_alloc.h" #include "xfs_extent_busy.h" #include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_log.h" +#include "xfs_ag.h" void xfs_extent_busy_insert( struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags) { struct xfs_extent_busy *new; struct xfs_extent_busy *busyp; - struct xfs_perag *pag; struct 
rb_node **rbp; struct rb_node *parent = NULL; new = kmem_zalloc(sizeof(struct xfs_extent_busy), 0); - new->agno = agno; + new->agno = pag->pag_agno; new->bno = bno; new->length = len; INIT_LIST_HEAD(&new->list); new->flags = flags; /* trace before insert to be able to see failed inserts */ - trace_xfs_extent_busy(tp->t_mountp, agno, bno, len); + trace_xfs_extent_busy(tp->t_mountp, pag->pag_agno, bno, len); - pag = xfs_perag_get(tp->t_mountp, new->agno); spin_lock(&pag->pagb_lock); rbp = &pag->pagb_tree.rb_node; while (*rbp) { @@ -66,7 +64,6 @@ xfs_extent_busy_insert( list_add(&new->list, &tp->t_busy); spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); } /* @@ -81,21 +78,17 @@ xfs_extent_busy_insert( int xfs_extent_busy_search( struct xfs_mount *mp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len) { - struct xfs_perag *pag; struct rb_node *rbp; struct xfs_extent_busy *busyp; int match = 0; - pag = xfs_perag_get(mp, agno); + /* find closest start bno overlap */ spin_lock(&pag->pagb_lock); - rbp = pag->pagb_tree.rb_node; - - /* find closest start bno overlap */ while (rbp) { busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node); if (bno < busyp->bno) { @@ -115,7 +108,6 @@ xfs_extent_busy_search( } } spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); return match; } @@ -281,17 +273,14 @@ out_force_log: void xfs_extent_busy_reuse( struct xfs_mount *mp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata) { - struct xfs_perag *pag; struct rb_node *rbp; ASSERT(flen > 0); - - pag = xfs_perag_get(mp, agno); spin_lock(&pag->pagb_lock); restart: rbp = pag->pagb_tree.rb_node; @@ -314,7 +303,6 @@ restart: goto restart; } spin_unlock(&pag->pagb_lock); - xfs_perag_put(pag); } /* @@ -605,12 +593,11 @@ void xfs_extent_busy_wait_all( struct xfs_mount *mp) { + struct xfs_perag *pag; DEFINE_WAIT (wait); xfs_agnumber_t agno; - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - struct xfs_perag 
*pag = xfs_perag_get(mp, agno); - + for_each_perag(mp, agno, pag) { do { prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE); if (RB_EMPTY_ROOT(&pag->pagb_tree)) @@ -618,8 +605,6 @@ xfs_extent_busy_wait_all( schedule(); } while (1); finish_wait(&pag->pagb_wait, &wait); - - xfs_perag_put(pag); } } diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h index 8aea07100092..4a118131059f 100644 --- a/fs/xfs/xfs_extent_busy.h +++ b/fs/xfs/xfs_extent_busy.h @@ -9,6 +9,7 @@ #define __XFS_EXTENT_BUSY_H__ struct xfs_mount; +struct xfs_perag; struct xfs_trans; struct xfs_alloc_arg; @@ -31,7 +32,7 @@ struct xfs_extent_busy { }; void -xfs_extent_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno, +xfs_extent_busy_insert(struct xfs_trans *tp, struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags); void @@ -39,11 +40,11 @@ xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list, bool do_discard); int -xfs_extent_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, +xfs_extent_busy_search(struct xfs_mount *mp, struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len); void -xfs_extent_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno, +xfs_extent_busy_reuse(struct xfs_mount *mp, struct xfs_perag *pag, xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata); bool diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 396ef36dcd0a..cc3cfb12df53 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -119,8 +119,8 @@ xfs_dir_fsync( return xfs_log_force_inode(ip); } -static xfs_lsn_t -xfs_fsync_lsn( +static xfs_csn_t +xfs_fsync_seq( struct xfs_inode *ip, bool datasync) { @@ -128,7 +128,7 @@ xfs_fsync_lsn( return 0; if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) return 0; - return ip->i_itemp->ili_last_lsn; + return ip->i_itemp->ili_commit_seq; } /* @@ -151,12 +151,12 @@ xfs_fsync_flush_log( int *log_flushed) { int error = 0; - xfs_lsn_t lsn; + xfs_csn_t seq; xfs_ilock(ip, XFS_ILOCK_SHARED); - lsn = 
xfs_fsync_lsn(ip, datasync); - if (lsn) { - error = xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, + seq = xfs_fsync_seq(ip, datasync); + if (seq) { + error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, log_flushed); spin_lock(&ip->i_itemp->ili_lock); @@ -197,9 +197,9 @@ xfs_file_fsync( * inode size in case of an extending write. */ if (XFS_IS_REALTIME_INODE(ip)) - xfs_blkdev_issue_flush(mp->m_rtdev_targp); + blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev); else if (mp->m_logdev_targp != mp->m_ddev_targp) - xfs_blkdev_issue_flush(mp->m_ddev_targp); + blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); /* * Any inode that has dirty modifications in the log is pinned. The @@ -219,7 +219,7 @@ xfs_file_fsync( */ if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) && mp->m_logdev_targp == mp->m_ddev_targp) - xfs_blkdev_issue_flush(mp->m_ddev_targp); + blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); return error; } @@ -384,21 +384,30 @@ restart: } goto restart; } + /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this - * write. If zeroing is needed and we are currently holding the - * iolock shared, we need to update it to exclusive which implies - * having to redo all checks before. + * write. If zeroing is needed and we are currently holding the iolock + * shared, we need to update it to exclusive which implies having to + * redo all checks before. + * + * We need to serialise against EOF updates that occur in IO completions + * here. We want to make sure that nobody is changing the size while we + * do this check until we have placed an IO barrier (i.e. hold the + * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The + * spinlock effectively forms a memory barrier once we have the + * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and + * hence be able to correctly determine if we need to run zeroing. 
* - * We need to serialise against EOF updates that occur in IO - * completions here. We want to make sure that nobody is changing the - * size while we do this check until we have placed an IO barrier (i.e. - * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. - * The spinlock effectively forms a memory barrier once we have the - * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value - * and hence be able to correctly determine if we need to run zeroing. + * We can do an unlocked check here safely as IO completion can only + * extend EOF. Truncate is locked out at this point, so the EOF can + * not move backwards, only forwards. Hence we only need to take the + * slow path and spin locks when we are at or beyond the current EOF. */ + if (iocb->ki_pos <= i_size_read(inode)) + goto out; + spin_lock(&ip->i_flags_lock); isize = i_size_read(inode); if (iocb->ki_pos > isize) { @@ -426,7 +435,7 @@ restart: drained_dio = true; goto restart; } - + trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); error = iomap_zero_range(inode, isize, iocb->ki_pos - isize, NULL, &xfs_buffered_write_iomap_ops); @@ -435,6 +444,7 @@ restart: } else spin_unlock(&ip->i_flags_lock); +out: return file_modified(file); } @@ -500,7 +510,17 @@ xfs_dio_write_end_io( * other IO completions here to update the EOF. Failing to serialise * here can result in EOF moving backwards and Bad Things Happen when * that occurs. + * + * As IO completion only ever extends EOF, we can do an unlocked check + * here to avoid taking the spinlock. If we land within the current EOF, + * then we do not need to do an extending update at all, and we don't + * need to take the lock to check this. If we race with an update moving + * EOF, then we'll either still be beyond EOF and need to take the lock, + * or we'll be within EOF and we don't need to take it at all. 
*/ + if (offset + size <= i_size_read(inode)) + goto out; + spin_lock(&ip->i_flags_lock); if (offset + size > i_size_read(inode)) { i_size_write(inode, offset + size); @@ -749,18 +769,18 @@ write_retry: */ if (ret == -EDQUOT && !cleared_space) { xfs_iunlock(ip, iolock); - xfs_blockgc_free_quota(ip, XFS_EOF_FLAGS_SYNC); + xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC); cleared_space = true; goto write_retry; } else if (ret == -ENOSPC && !cleared_space) { - struct xfs_eofblocks eofb = {0}; + struct xfs_icwalk icw = {0}; cleared_space = true; xfs_flush_inodes(ip->i_mount); xfs_iunlock(ip, iolock); - eofb.eof_flags = XFS_EOF_FLAGS_SYNC; - xfs_blockgc_free_space(ip->i_mount, &eofb); + icw.icw_flags = XFS_ICWALK_FLAG_SYNC; + xfs_blockgc_free_space(ip->i_mount, &icw); goto write_retry; } @@ -863,7 +883,7 @@ xfs_break_layouts( error = xfs_break_dax_layouts(inode, &retry); if (error || retry) break; - /* fall through */ + fallthrough; case BREAK_WRITE: error = xfs_break_leased_layouts(inode, iolock, &retry); break; diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index db23e455eb91..eed6ca5f8f91 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -9,13 +9,13 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_alloc.h" #include "xfs_mru_cache.h" #include "xfs_trace.h" +#include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_trans.h" #include "xfs_filestream.h" diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 34f2b971ce43..7d0b09c1366e 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -24,6 +24,7 @@ #include "xfs_refcount_btree.h" #include "xfs_alloc_btree.h" #include "xfs_rtalloc.h" +#include "xfs_ag.h" /* Convert an xfs_fsmap to an fsmap. 
*/ static void @@ -157,10 +158,10 @@ struct xfs_getfsmap_info { struct xfs_fsmap_head *head; struct fsmap *fsmap_recs; /* mapping records */ struct xfs_buf *agf_bp; /* AGF, for refcount queries */ + struct xfs_perag *pag; /* AG info, if applicable */ xfs_daddr_t next_daddr; /* next daddr we expect */ u64 missing_owner; /* owner of holes */ u32 dev; /* device id */ - xfs_agnumber_t agno; /* AG number, if applicable */ struct xfs_rmap_irec low; /* low rmap key */ struct xfs_rmap_irec high; /* high rmap key */ bool last; /* last extent? */ @@ -203,14 +204,13 @@ xfs_getfsmap_is_shared( *stat = false; if (!xfs_sb_version_hasreflink(&mp->m_sb)) return 0; - /* rt files will have agno set to NULLAGNUMBER */ - if (info->agno == NULLAGNUMBER) + /* rt files will have no perag structure */ + if (!info->pag) return 0; /* Are there any shared blocks here? */ flen = 0; - cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp, - info->agno); + cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp, info->pag); error = xfs_refcount_find_shared(cur, rec->rm_startblock, rec->rm_blockcount, &fbno, &flen, false); @@ -311,7 +311,8 @@ xfs_getfsmap_helper( if (info->head->fmh_entries >= info->head->fmh_count) return -ECANCELED; - trace_xfs_fsmap_mapping(mp, info->dev, info->agno, rec); + trace_xfs_fsmap_mapping(mp, info->dev, + info->pag ? 
info->pag->pag_agno : NULLAGNUMBER, rec); fmr.fmr_device = info->dev; fmr.fmr_physical = rec_daddr; @@ -354,7 +355,7 @@ xfs_getfsmap_datadev_helper( xfs_fsblock_t fsb; xfs_daddr_t rec_daddr; - fsb = XFS_AGB_TO_FSB(mp, cur->bc_ag.agno, rec->rm_startblock); + fsb = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno, rec->rm_startblock); rec_daddr = XFS_FSB_TO_DADDR(mp, fsb); return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr); @@ -372,7 +373,7 @@ xfs_getfsmap_datadev_bnobt_helper( struct xfs_rmap_irec irec; xfs_daddr_t rec_daddr; - rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_ag.agno, + rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_ag.pag->pag_agno, rec->ar_startblock); irec.rm_startblock = rec->ar_startblock; @@ -429,8 +430,8 @@ xfs_getfsmap_logdev( info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS; info->missing_owner = XFS_FMR_OWN_FREE; - trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low); - trace_xfs_fsmap_high_key(mp, info->dev, info->agno, &info->high); + trace_xfs_fsmap_low_key(mp, info->dev, NULLAGNUMBER, &info->low); + trace_xfs_fsmap_high_key(mp, info->dev, NULLAGNUMBER, &info->high); if (keys[0].fmr_physical > 0) return 0; @@ -508,8 +509,8 @@ __xfs_getfsmap_rtdev( info->high.rm_blockcount = 0; xfs_getfsmap_set_irec_flags(&info->high, &keys[1]); - trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low); - trace_xfs_fsmap_high_key(mp, info->dev, info->agno, &info->high); + trace_xfs_fsmap_low_key(mp, info->dev, NULLAGNUMBER, &info->low); + trace_xfs_fsmap_high_key(mp, info->dev, NULLAGNUMBER, &info->high); return query_fn(tp, info); } @@ -572,6 +573,7 @@ __xfs_getfsmap_datadev( void *priv) { struct xfs_mount *mp = tp->t_mountp; + struct xfs_perag *pag; struct xfs_btree_cur *bt_cur = NULL; xfs_fsblock_t start_fsb; xfs_fsblock_t end_fsb; @@ -610,20 +612,20 @@ __xfs_getfsmap_datadev( start_ag = XFS_FSB_TO_AGNO(mp, start_fsb); end_ag = XFS_FSB_TO_AGNO(mp, end_fsb); - /* Query each AG */ - for (info->agno = start_ag; info->agno <= end_ag; 
info->agno++) { + for_each_perag_range(mp, start_ag, end_ag, pag) { /* * Set the AG high key from the fsmap high key if this * is the last AG that we're querying. */ - if (info->agno == end_ag) { + info->pag = pag; + if (pag->pag_agno == end_ag) { info->high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsb); info->high.rm_offset = XFS_BB_TO_FSBT(mp, keys[1].fmr_offset); error = xfs_fsmap_owner_to_rmap(&info->high, &keys[1]); if (error) - goto err; + break; xfs_getfsmap_set_irec_flags(&info->high, &keys[1]); } @@ -634,38 +636,45 @@ __xfs_getfsmap_datadev( info->agf_bp = NULL; } - error = xfs_alloc_read_agf(mp, tp, info->agno, 0, + error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &info->agf_bp); if (error) - goto err; + break; - trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low); - trace_xfs_fsmap_high_key(mp, info->dev, info->agno, + trace_xfs_fsmap_low_key(mp, info->dev, pag->pag_agno, + &info->low); + trace_xfs_fsmap_high_key(mp, info->dev, pag->pag_agno, &info->high); error = query_fn(tp, info, &bt_cur, priv); if (error) - goto err; + break; /* * Set the AG low key to the start of the AG prior to * moving on to the next AG. */ - if (info->agno == start_ag) { + if (pag->pag_agno == start_ag) { info->low.rm_startblock = 0; info->low.rm_owner = 0; info->low.rm_offset = 0; info->low.rm_flags = 0; } - } - /* Report any gap at the end of the AG */ - info->last = true; - error = query_fn(tp, info, &bt_cur, priv); - if (error) - goto err; + /* + * If this is the last AG, report any gap at the end of it + * before we drop the reference to the perag when the loop + * terminates. + */ + if (pag->pag_agno == end_ag) { + info->last = true; + error = query_fn(tp, info, &bt_cur, priv); + if (error) + break; + } + info->pag = NULL; + } -err: if (bt_cur) xfs_btree_del_cursor(bt_cur, error < 0 ? 
XFS_BTREE_ERROR : XFS_BTREE_NOERROR); @@ -673,6 +682,13 @@ err: xfs_trans_brelse(tp, info->agf_bp); info->agf_bp = NULL; } + if (info->pag) { + xfs_perag_put(info->pag); + info->pag = NULL; + } else if (pag) { + /* loop termination case */ + xfs_perag_put(pag); + } return error; } @@ -691,7 +707,7 @@ xfs_getfsmap_datadev_rmapbt_query( /* Allocate cursor for this AG and query_range it. */ *curpp = xfs_rmapbt_init_cursor(tp->t_mountp, tp, info->agf_bp, - info->agno); + info->pag); return xfs_rmap_query_range(*curpp, &info->low, &info->high, xfs_getfsmap_datadev_helper, info); } @@ -724,7 +740,7 @@ xfs_getfsmap_datadev_bnobt_query( /* Allocate cursor for this AG and query_range it. */ *curpp = xfs_allocbt_init_cursor(tp->t_mountp, tp, info->agf_bp, - info->agno, XFS_BTNUM_BNO); + info->pag, XFS_BTNUM_BNO); key->ar_startblock = info->low.rm_startblock; key[1].ar_startblock = info->high.rm_startblock; return xfs_alloc_query_range(*curpp, key, &key[1], @@ -937,7 +953,7 @@ xfs_getfsmap( info.dev = handlers[i].dev; info.last = false; - info.agno = NULLAGNUMBER; + info.pag = NULL; error = handlers[i].fn(tp, dkeys, &info); if (error) break; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index be9cf88d2ad7..6ed29b158312 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -538,25 +538,25 @@ xfs_do_force_shutdown( if (flags & SHUTDOWN_FORCE_UMOUNT) { xfs_alert(mp, -"User initiated shutdown received. Shutting down filesystem"); +"User initiated shutdown (0x%x) received. Shutting down filesystem", + flags); return; } - xfs_notice(mp, -"%s(0x%x) called from line %d of file %s. Return address = "PTR_FMT, - __func__, flags, lnnum, fname, __return_address); - if (flags & SHUTDOWN_CORRUPT_INCORE) { xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT, -"Corruption of in-memory data detected. Shutting down filesystem"); +"Corruption of in-memory data (0x%x) detected at %pS (%s:%d). 
Shutting down filesystem", + flags, __return_address, fname, lnnum); if (XFS_ERRLEVEL_HIGH <= xfs_error_level) xfs_stack_trace(); } else if (logerror) { xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR, - "Log I/O Error Detected. Shutting down filesystem"); +"Log I/O error (0x%x) detected at %pS (%s:%d). Shutting down filesystem", + flags, __return_address, fname, lnnum); } else { xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, - "I/O Error Detected. Shutting down filesystem"); +"I/O error (0x%x) detected at %pS (%s:%d). Shutting down filesystem", + flags, __return_address, fname, lnnum); } xfs_alert(mp, @@ -576,10 +576,8 @@ xfs_fs_reserve_ag_blocks( int err2; mp->m_finobt_nores = false; - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - pag = xfs_perag_get(mp, agno); + for_each_perag(mp, agno, pag) { err2 = xfs_ag_resv_init(pag, NULL); - xfs_perag_put(pag); if (err2 && !error) error = err2; } @@ -605,10 +603,8 @@ xfs_fs_unreserve_ag_blocks( int error = 0; int err2; - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - pag = xfs_perag_get(mp, agno); + for_each_perag(mp, agno, pag) { err2 = xfs_ag_resv_free(pag); - xfs_perag_put(pag); if (err2 && !error) error = err2; } diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index 8e0cb05a7142..eb10eacabc8f 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -9,11 +9,11 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trace.h" #include "xfs_health.h" +#include "xfs_ag.h" /* * Warn about metadata corruption that we detected but haven't fixed, and @@ -34,14 +34,12 @@ xfs_health_unmount( return; /* Measure AG corruption levels. 
*/ - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - pag = xfs_perag_get(mp, agno); + for_each_perag(mp, agno, pag) { xfs_ag_measure_sickness(pag, &sick, &checked); if (sick) { trace_xfs_ag_unfixed_corruption(mp, agno, sick); warn = true; } - xfs_perag_put(pag); } /* Measure realtime volume corruption levels. */ @@ -231,6 +229,15 @@ xfs_inode_mark_sick( ip->i_sick |= mask; ip->i_checked |= mask; spin_unlock(&ip->i_flags_lock); + + /* + * Keep this inode around so we don't lose the sickness report. Scrub + * grabs inodes with DONTCACHE assuming that most inode are ok, which + * is not the case here. + */ + spin_lock(&VFS_I(ip)->i_lock); + VFS_I(ip)->i_state &= ~I_DONTCACHE; + spin_unlock(&VFS_I(ip)->i_lock); } /* Mark parts of an inode healed. */ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 3c81daca0e9a..6007683482c6 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -9,7 +9,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" @@ -23,9 +22,65 @@ #include "xfs_dquot.h" #include "xfs_reflink.h" #include "xfs_ialloc.h" +#include "xfs_ag.h" #include <linux/iversion.h> +/* Radix tree tags for incore inode tree. */ + +/* inode is to be reclaimed */ +#define XFS_ICI_RECLAIM_TAG 0 +/* Inode has speculative preallocations (posteof or cow) to clean. */ +#define XFS_ICI_BLOCKGC_TAG 1 + +/* + * The goal for walking incore inodes. These can correspond with incore inode + * radix tree tags when convenient. Avoid existing XFS_IWALK namespace. + */ +enum xfs_icwalk_goal { + /* Goals that are not related to tags; these must be < 0. */ + XFS_ICWALK_DQRELE = -1, + + /* Goals directly associated with tagged inodes. */ + XFS_ICWALK_BLOCKGC = XFS_ICI_BLOCKGC_TAG, + XFS_ICWALK_RECLAIM = XFS_ICI_RECLAIM_TAG, +}; + +#define XFS_ICWALK_NULL_TAG (-1U) + +/* Compute the inode radix tree tag for this goal. 
*/ +static inline unsigned int +xfs_icwalk_tag(enum xfs_icwalk_goal goal) +{ + return goal < 0 ? XFS_ICWALK_NULL_TAG : goal; +} + +static int xfs_icwalk(struct xfs_mount *mp, + enum xfs_icwalk_goal goal, struct xfs_icwalk *icw); +static int xfs_icwalk_ag(struct xfs_perag *pag, + enum xfs_icwalk_goal goal, struct xfs_icwalk *icw); + +/* + * Private inode cache walk flags for struct xfs_icwalk. Must not + * coincide with XFS_ICWALK_FLAGS_VALID. + */ +#define XFS_ICWALK_FLAG_DROP_UDQUOT (1U << 31) +#define XFS_ICWALK_FLAG_DROP_GDQUOT (1U << 30) +#define XFS_ICWALK_FLAG_DROP_PDQUOT (1U << 29) + +/* Stop scanning after icw_scan_limit inodes. */ +#define XFS_ICWALK_FLAG_SCAN_LIMIT (1U << 28) + +#define XFS_ICWALK_FLAG_RECLAIM_SICK (1U << 27) +#define XFS_ICWALK_FLAG_UNION (1U << 26) /* union filter algorithm */ + +#define XFS_ICWALK_PRIVATE_FLAGS (XFS_ICWALK_FLAG_DROP_UDQUOT | \ + XFS_ICWALK_FLAG_DROP_GDQUOT | \ + XFS_ICWALK_FLAG_DROP_PDQUOT | \ + XFS_ICWALK_FLAG_SCAN_LIMIT | \ + XFS_ICWALK_FLAG_RECLAIM_SICK | \ + XFS_ICWALK_FLAG_UNION) + /* * Allocate and initialise an xfs_inode. */ @@ -157,46 +212,94 @@ xfs_reclaim_work_queue( rcu_read_unlock(); } -static void -xfs_perag_set_reclaim_tag( +/* + * Background scanning to trim preallocated space. This is queued based on the + * 'speculative_prealloc_lifetime' tunable (5m by default). + */ +static inline void +xfs_blockgc_queue( struct xfs_perag *pag) { + rcu_read_lock(); + if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) + queue_delayed_work(pag->pag_mount->m_gc_workqueue, + &pag->pag_blockgc_work, + msecs_to_jiffies(xfs_blockgc_secs * 1000)); + rcu_read_unlock(); +} + +/* Set a tag on both the AG incore inode tree and the AG radix tree. 
*/ +static void +xfs_perag_set_inode_tag( + struct xfs_perag *pag, + xfs_agino_t agino, + unsigned int tag) +{ struct xfs_mount *mp = pag->pag_mount; + bool was_tagged; lockdep_assert_held(&pag->pag_ici_lock); - if (pag->pag_ici_reclaimable++) + + was_tagged = radix_tree_tagged(&pag->pag_ici_root, tag); + radix_tree_tag_set(&pag->pag_ici_root, agino, tag); + + if (tag == XFS_ICI_RECLAIM_TAG) + pag->pag_ici_reclaimable++; + + if (was_tagged) return; - /* propagate the reclaim tag up into the perag radix tree */ + /* propagate the tag up into the perag radix tree */ spin_lock(&mp->m_perag_lock); - radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, - XFS_ICI_RECLAIM_TAG); + radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, tag); spin_unlock(&mp->m_perag_lock); - /* schedule periodic background inode reclaim */ - xfs_reclaim_work_queue(mp); + /* start background work */ + switch (tag) { + case XFS_ICI_RECLAIM_TAG: + xfs_reclaim_work_queue(mp); + break; + case XFS_ICI_BLOCKGC_TAG: + xfs_blockgc_queue(pag); + break; + } - trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_); + trace_xfs_perag_set_inode_tag(mp, pag->pag_agno, tag, _RET_IP_); } +/* Clear a tag on both the AG incore inode tree and the AG radix tree. */ static void -xfs_perag_clear_reclaim_tag( - struct xfs_perag *pag) +xfs_perag_clear_inode_tag( + struct xfs_perag *pag, + xfs_agino_t agino, + unsigned int tag) { struct xfs_mount *mp = pag->pag_mount; lockdep_assert_held(&pag->pag_ici_lock); - if (--pag->pag_ici_reclaimable) + + /* + * Reclaim can signal (with a null agino) that it cleared its own tag + * by removing the inode from the radix tree. 
+ */ + if (agino != NULLAGINO) + radix_tree_tag_clear(&pag->pag_ici_root, agino, tag); + else + ASSERT(tag == XFS_ICI_RECLAIM_TAG); + + if (tag == XFS_ICI_RECLAIM_TAG) + pag->pag_ici_reclaimable--; + + if (radix_tree_tagged(&pag->pag_ici_root, tag)) return; - /* clear the reclaim tag from the perag radix tree */ + /* clear the tag from the perag radix tree */ spin_lock(&mp->m_perag_lock); - radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, - XFS_ICI_RECLAIM_TAG); + radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, tag); spin_unlock(&mp->m_perag_lock); - trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_); -} + trace_xfs_perag_clear_inode_tag(mp, pag->pag_agno, tag, _RET_IP_); +} /* * We set the inode flag atomically with the radix tree tag. @@ -204,7 +307,7 @@ xfs_perag_clear_reclaim_tag( * can go away. */ void -xfs_inode_set_reclaim_tag( +xfs_inode_mark_reclaimable( struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; @@ -214,9 +317,8 @@ xfs_inode_set_reclaim_tag( spin_lock(&pag->pag_ici_lock); spin_lock(&ip->i_flags_lock); - radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino), - XFS_ICI_RECLAIM_TAG); - xfs_perag_set_reclaim_tag(pag); + xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), + XFS_ICI_RECLAIM_TAG); __xfs_iflags_set(ip, XFS_IRECLAIMABLE); spin_unlock(&ip->i_flags_lock); @@ -224,18 +326,7 @@ xfs_inode_set_reclaim_tag( xfs_perag_put(pag); } -STATIC void -xfs_inode_clear_reclaim_tag( - struct xfs_perag *pag, - xfs_ino_t ino) -{ - radix_tree_tag_clear(&pag->pag_ici_root, - XFS_INO_TO_AGINO(pag->pag_mount, ino), - XFS_ICI_RECLAIM_TAG); - xfs_perag_clear_reclaim_tag(pag); -} - -static void +static inline void xfs_inew_wait( struct xfs_inode *ip) { @@ -264,14 +355,14 @@ xfs_reinit_inode( struct xfs_mount *mp, struct inode *inode) { - int error; - uint32_t nlink = inode->i_nlink; - uint32_t generation = inode->i_generation; - uint64_t version = inode_peek_iversion(inode); - umode_t mode = inode->i_mode; 
- dev_t dev = inode->i_rdev; - kuid_t uid = inode->i_uid; - kgid_t gid = inode->i_gid; + int error; + uint32_t nlink = inode->i_nlink; + uint32_t generation = inode->i_generation; + uint64_t version = inode_peek_iversion(inode); + umode_t mode = inode->i_mode; + dev_t dev = inode->i_rdev; + kuid_t uid = inode->i_uid; + kgid_t gid = inode->i_gid; error = inode_init_always(mp->m_super, inode); @@ -286,6 +377,74 @@ xfs_reinit_inode( } /* + * Carefully nudge an inode whose VFS state has been torn down back into a + * usable state. Drops the i_flags_lock and the rcu read lock. + */ +static int +xfs_iget_recycle( + struct xfs_perag *pag, + struct xfs_inode *ip) __releases(&ip->i_flags_lock) +{ + struct xfs_mount *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); + int error; + + trace_xfs_iget_recycle(ip); + + /* + * We need to make it look like the inode is being reclaimed to prevent + * the actual reclaim workers from stomping over us while we recycle + * the inode. We can't clear the radix tree tag yet as it requires + * pag_ici_lock to be held exclusive. + */ + ip->i_flags |= XFS_IRECLAIM; + + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + + ASSERT(!rwsem_is_locked(&inode->i_rwsem)); + error = xfs_reinit_inode(mp, inode); + if (error) { + bool wake; + + /* + * Re-initializing the inode failed, and we are in deep + * trouble. Try to re-add it to the reclaim list. + */ + rcu_read_lock(); + spin_lock(&ip->i_flags_lock); + wake = !!__xfs_iflags_test(ip, XFS_INEW); + ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); + if (wake) + wake_up_bit(&ip->i_flags, __XFS_INEW_BIT); + ASSERT(ip->i_flags & XFS_IRECLAIMABLE); + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + + trace_xfs_iget_recycle_fail(ip); + return error; + } + + spin_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + + /* + * Clear the per-lifetime state in the inode as we are now effectively + * a new inode and need to return to the initial state before reuse + * occurs. 
+ */ + ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; + ip->i_flags |= XFS_INEW; + xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + inode->i_state = I_NEW; + spin_unlock(&ip->i_flags_lock); + spin_unlock(&pag->pag_ici_lock); + + return 0; +} + +/* * If we are allocating a new inode, then check what was returned is * actually a free, empty inode. If we are not allocating an inode, * then check we didn't find a free inode. @@ -348,30 +507,21 @@ xfs_iget_cache_hit( * will not match, so check for that, too. */ spin_lock(&ip->i_flags_lock); - if (ip->i_ino != ino) { - trace_xfs_iget_skip(ip); - XFS_STATS_INC(mp, xs_ig_frecycle); - error = -EAGAIN; - goto out_error; - } - + if (ip->i_ino != ino) + goto out_skip; /* * If we are racing with another cache hit that is currently * instantiating this inode or currently recycling it out of - * reclaimabe state, wait for the initialisation to complete + * reclaimable state, wait for the initialisation to complete * before continuing. * * XXX(hch): eventually we should do something equivalent to * wait_on_inode to wait for these flags to be cleared * instead of polling for it. */ - if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { - trace_xfs_iget_skip(ip); - XFS_STATS_INC(mp, xs_ig_frecycle); - error = -EAGAIN; - goto out_error; - } + if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM)) + goto out_skip; /* * Check the inode free state is valid. This also detects lookup @@ -381,72 +531,21 @@ xfs_iget_cache_hit( if (error) goto out_error; - /* - * If IRECLAIMABLE is set, we've torn down the VFS inode already. - * Need to carefully get it back into useable state. - */ - if (ip->i_flags & XFS_IRECLAIMABLE) { - trace_xfs_iget_reclaim(ip); - - if (flags & XFS_IGET_INCORE) { - error = -EAGAIN; - goto out_error; - } - - /* - * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode - * from stomping over us while we recycle the inode. 
We can't - * clear the radix tree reclaimable tag yet as it requires - * pag_ici_lock to be held exclusive. - */ - ip->i_flags |= XFS_IRECLAIM; - - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); + /* Skip inodes that have no vfs state. */ + if ((flags & XFS_IGET_INCORE) && + (ip->i_flags & XFS_IRECLAIMABLE)) + goto out_skip; - ASSERT(!rwsem_is_locked(&inode->i_rwsem)); - error = xfs_reinit_inode(mp, inode); - if (error) { - bool wake; - /* - * Re-initializing the inode failed, and we are in deep - * trouble. Try to re-add it to the reclaim list. - */ - rcu_read_lock(); - spin_lock(&ip->i_flags_lock); - wake = !!__xfs_iflags_test(ip, XFS_INEW); - ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); - if (wake) - wake_up_bit(&ip->i_flags, __XFS_INEW_BIT); - ASSERT(ip->i_flags & XFS_IRECLAIMABLE); - trace_xfs_iget_reclaim_fail(ip); - goto out_error; - } - - spin_lock(&pag->pag_ici_lock); - spin_lock(&ip->i_flags_lock); - - /* - * Clear the per-lifetime state in the inode as we are now - * effectively a new inode and need to return to the initial - * state before reuse occurs. - */ - ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; - ip->i_flags |= XFS_INEW; - xfs_inode_clear_reclaim_tag(pag, ip->i_ino); - inode->i_state = I_NEW; - ip->i_sick = 0; - ip->i_checked = 0; - - spin_unlock(&ip->i_flags_lock); - spin_unlock(&pag->pag_ici_lock); + /* The inode fits the selection criteria; process it. */ + if (ip->i_flags & XFS_IRECLAIMABLE) { + /* Drops i_flags_lock and RCU read lock. */ + error = xfs_iget_recycle(pag, ip); + if (error) + return error; } else { /* If the VFS inode is being torn down, pause and try again. */ - if (!igrab(inode)) { - trace_xfs_iget_skip(ip); - error = -EAGAIN; - goto out_error; - } + if (!igrab(inode)) + goto out_skip; /* We've got a live one. 
*/ spin_unlock(&ip->i_flags_lock); @@ -463,13 +562,16 @@ xfs_iget_cache_hit( return 0; +out_skip: + trace_xfs_iget_skip(ip); + XFS_STATS_INC(mp, xs_ig_frecycle); + error = -EAGAIN; out_error: spin_unlock(&ip->i_flags_lock); rcu_read_unlock(); return error; } - static int xfs_iget_cache_miss( struct xfs_mount *mp, @@ -715,207 +817,96 @@ xfs_icache_inode_is_allocated( return 0; } -/* - * The inode lookup is done in batches to keep the amount of lock traffic and - * radix tree lookups to a minimum. The batch size is a trade off between - * lookup reduction and stack usage. This is in the reclaim path, so we can't - * be too greedy. - */ -#define XFS_LOOKUP_BATCH 32 - -/* - * Decide if the given @ip is eligible to be a part of the inode walk, and - * grab it if so. Returns true if it's ready to go or false if we should just - * ignore it. - */ -STATIC bool -xfs_inode_walk_ag_grab( - struct xfs_inode *ip, - int flags) +#ifdef CONFIG_XFS_QUOTA +/* Decide if we want to grab this inode to drop its dquots. */ +static bool +xfs_dqrele_igrab( + struct xfs_inode *ip) { - struct inode *inode = VFS_I(ip); - bool newinos = !!(flags & XFS_INODE_WALK_INEW_WAIT); + bool ret = false; ASSERT(rcu_read_lock_held()); /* Check for stale RCU freed inode */ spin_lock(&ip->i_flags_lock); if (!ip->i_ino) - goto out_unlock_noent; - - /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ - if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) || - __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM)) - goto out_unlock_noent; - spin_unlock(&ip->i_flags_lock); - - /* nothing to sync during shutdown */ - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return false; + goto out_unlock; - /* If we can't grab the inode, it must on it's way to reclaim. */ - if (!igrab(inode)) - return false; + /* + * Skip inodes that are anywhere in the reclaim machinery because we + * drop dquots before tagging an inode for reclamation. 
+ */ + if (ip->i_flags & (XFS_IRECLAIM | XFS_IRECLAIMABLE)) + goto out_unlock; - /* inode is valid */ - return true; + /* + * The inode looks alive; try to grab a VFS reference so that it won't + * get destroyed. If we got the reference, return true to say that + * we grabbed the inode. + * + * If we can't get the reference, then we know the inode had its VFS + * state torn down and hasn't yet entered the reclaim machinery. Since + * we also know that dquots are detached from an inode before it enters + * reclaim, we can skip the inode. + */ + ret = igrab(VFS_I(ip)) != NULL; -out_unlock_noent: +out_unlock: spin_unlock(&ip->i_flags_lock); - return false; + return ret; } -/* - * For a given per-AG structure @pag, grab, @execute, and rele all incore - * inodes with the given radix tree @tag. - */ -STATIC int -xfs_inode_walk_ag( - struct xfs_perag *pag, - int iter_flags, - int (*execute)(struct xfs_inode *ip, void *args), - void *args, - int tag) +/* Drop this inode's dquots. */ +static void +xfs_dqrele_inode( + struct xfs_inode *ip, + struct xfs_icwalk *icw) { - struct xfs_mount *mp = pag->pag_mount; - uint32_t first_index; - int last_error = 0; - int skipped; - bool done; - int nr_found; - -restart: - done = false; - skipped = 0; - first_index = 0; - nr_found = 0; - do { - struct xfs_inode *batch[XFS_LOOKUP_BATCH]; - int error = 0; - int i; - - rcu_read_lock(); - - if (tag == XFS_ICI_NO_TAG) - nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, - (void **)batch, first_index, - XFS_LOOKUP_BATCH); - else - nr_found = radix_tree_gang_lookup_tag( - &pag->pag_ici_root, - (void **) batch, first_index, - XFS_LOOKUP_BATCH, tag); - - if (!nr_found) { - rcu_read_unlock(); - break; - } - - /* - * Grab the inodes before we drop the lock. if we found - * nothing, nr == 0 and the loop will be skipped. 
- */ - for (i = 0; i < nr_found; i++) { - struct xfs_inode *ip = batch[i]; - - if (done || !xfs_inode_walk_ag_grab(ip, iter_flags)) - batch[i] = NULL; - - /* - * Update the index for the next lookup. Catch - * overflows into the next AG range which can occur if - * we have inodes in the last block of the AG and we - * are currently pointing to the last inode. - * - * Because we may see inodes that are from the wrong AG - * due to RCU freeing and reallocation, only update the - * index if it lies in this AG. It was a race that lead - * us to see this inode, so another lookup from the - * same index will not find it again. - */ - if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) - continue; - first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); - if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) - done = true; - } - - /* unlock now we've grabbed the inodes. */ - rcu_read_unlock(); - - for (i = 0; i < nr_found; i++) { - if (!batch[i]) - continue; - if ((iter_flags & XFS_INODE_WALK_INEW_WAIT) && - xfs_iflags_test(batch[i], XFS_INEW)) - xfs_inew_wait(batch[i]); - error = execute(batch[i], args); - xfs_irele(batch[i]); - if (error == -EAGAIN) { - skipped++; - continue; - } - if (error && last_error != -EFSCORRUPTED) - last_error = error; - } - - /* bail out if the filesystem is corrupted. */ - if (error == -EFSCORRUPTED) - break; + if (xfs_iflags_test(ip, XFS_INEW)) + xfs_inew_wait(ip); - cond_resched(); - - } while (nr_found && !done); - - if (skipped) { - delay(1); - goto restart; + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (icw->icw_flags & XFS_ICWALK_FLAG_DROP_UDQUOT) { + xfs_qm_dqrele(ip->i_udquot); + ip->i_udquot = NULL; } - return last_error; -} - -/* Fetch the next (possibly tagged) per-AG structure. 
*/ -static inline struct xfs_perag * -xfs_inode_walk_get_perag( - struct xfs_mount *mp, - xfs_agnumber_t agno, - int tag) -{ - if (tag == XFS_ICI_NO_TAG) - return xfs_perag_get(mp, agno); - return xfs_perag_get_tag(mp, agno, tag); + if (icw->icw_flags & XFS_ICWALK_FLAG_DROP_GDQUOT) { + xfs_qm_dqrele(ip->i_gdquot); + ip->i_gdquot = NULL; + } + if (icw->icw_flags & XFS_ICWALK_FLAG_DROP_PDQUOT) { + xfs_qm_dqrele(ip->i_pdquot); + ip->i_pdquot = NULL; + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_irele(ip); } /* - * Call the @execute function on all incore inodes matching the radix tree - * @tag. + * Detach all dquots from incore inodes if we can. The caller must already + * have dropped the relevant XFS_[UGP]QUOTA_ACTIVE flags so that dquots will + * not get reattached. */ int -xfs_inode_walk( +xfs_dqrele_all_inodes( struct xfs_mount *mp, - int iter_flags, - int (*execute)(struct xfs_inode *ip, void *args), - void *args, - int tag) + unsigned int qflags) { - struct xfs_perag *pag; - int error = 0; - int last_error = 0; - xfs_agnumber_t ag; + struct xfs_icwalk icw = { .icw_flags = 0 }; - ag = 0; - while ((pag = xfs_inode_walk_get_perag(mp, ag, tag))) { - ag = pag->pag_agno + 1; - error = xfs_inode_walk_ag(pag, iter_flags, execute, args, tag); - xfs_perag_put(pag); - if (error) { - last_error = error; - if (error == -EFSCORRUPTED) - break; - } - } - return last_error; + if (qflags & XFS_UQUOTA_ACCT) + icw.icw_flags |= XFS_ICWALK_FLAG_DROP_UDQUOT; + if (qflags & XFS_GQUOTA_ACCT) + icw.icw_flags |= XFS_ICWALK_FLAG_DROP_GDQUOT; + if (qflags & XFS_PQUOTA_ACCT) + icw.icw_flags |= XFS_ICWALK_FLAG_DROP_PDQUOT; + + return xfs_icwalk(mp, XFS_ICWALK_DQRELE, &icw); } +#else +# define xfs_dqrele_igrab(ip) (false) +# define xfs_dqrele_inode(ip, priv) ((void)0) +#endif /* CONFIG_XFS_QUOTA */ /* * Grab the inode for reclaim exclusively. @@ -935,8 +926,9 @@ xfs_inode_walk( * Return true if we grabbed it, false otherwise. 
*/ static bool -xfs_reclaim_inode_grab( - struct xfs_inode *ip) +xfs_reclaim_igrab( + struct xfs_inode *ip, + struct xfs_icwalk *icw) { ASSERT(rcu_read_lock_held()); @@ -947,6 +939,14 @@ xfs_reclaim_inode_grab( spin_unlock(&ip->i_flags_lock); return false; } + + /* Don't reclaim a sick inode unless the caller asked for it. */ + if (ip->i_sick && + (!icw || !(icw->icw_flags & XFS_ICWALK_FLAG_RECLAIM_SICK))) { + spin_unlock(&ip->i_flags_lock); + return false; + } + __xfs_iflags_set(ip, XFS_IRECLAIM); spin_unlock(&ip->i_flags_lock); return true; @@ -1002,6 +1002,8 @@ reclaim: spin_lock(&ip->i_flags_lock); ip->i_flags = XFS_IRECLAIM; ip->i_ino = 0; + ip->i_sick = 0; + ip->i_checked = 0; spin_unlock(&ip->i_flags_lock); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -1018,7 +1020,7 @@ reclaim: if (!radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(ip->i_mount, ino))) ASSERT(0); - xfs_perag_clear_reclaim_tag(pag); + xfs_perag_clear_inode_tag(pag, NULLAGINO, XFS_ICI_RECLAIM_TAG); spin_unlock(&pag->pag_ici_lock); /* @@ -1030,7 +1032,7 @@ reclaim: * unlocked after the lookup before we go ahead and free it. */ xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_qm_dqdetach(ip); + ASSERT(!ip->i_udquot && !ip->i_gdquot && !ip->i_pdquot); xfs_iunlock(ip, XFS_ILOCK_EXCL); ASSERT(xfs_inode_clean(ip)); @@ -1045,108 +1047,30 @@ out: xfs_iflags_clear(ip, XFS_IRECLAIM); } -/* - * Walk the AGs and reclaim the inodes in them. Even if the filesystem is - * corrupted, we still want to try to reclaim all the inodes. If we don't, - * then a shut down during filesystem unmount reclaim walk leak all the - * unreclaimed inodes. - * - * Returns non-zero if any AGs or inodes were skipped in the reclaim pass - * so that callers that want to block until all dirty inodes are written back - * and reclaimed can sanely loop. - */ -static void -xfs_reclaim_inodes_ag( - struct xfs_mount *mp, - int *nr_to_scan) +/* Reclaim sick inodes if we're unmounting or the fs went down. 
*/ +static inline bool +xfs_want_reclaim_sick( + struct xfs_mount *mp) { - struct xfs_perag *pag; - xfs_agnumber_t ag = 0; - - while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { - unsigned long first_index = 0; - int done = 0; - int nr_found = 0; - - ag = pag->pag_agno + 1; - - first_index = READ_ONCE(pag->pag_ici_reclaim_cursor); - do { - struct xfs_inode *batch[XFS_LOOKUP_BATCH]; - int i; - - rcu_read_lock(); - nr_found = radix_tree_gang_lookup_tag( - &pag->pag_ici_root, - (void **)batch, first_index, - XFS_LOOKUP_BATCH, - XFS_ICI_RECLAIM_TAG); - if (!nr_found) { - done = 1; - rcu_read_unlock(); - break; - } - - /* - * Grab the inodes before we drop the lock. if we found - * nothing, nr == 0 and the loop will be skipped. - */ - for (i = 0; i < nr_found; i++) { - struct xfs_inode *ip = batch[i]; - - if (done || !xfs_reclaim_inode_grab(ip)) - batch[i] = NULL; - - /* - * Update the index for the next lookup. Catch - * overflows into the next AG range which can - * occur if we have inodes in the last block of - * the AG and we are currently pointing to the - * last inode. - * - * Because we may see inodes that are from the - * wrong AG due to RCU freeing and - * reallocation, only update the index if it - * lies in this AG. It was a race that lead us - * to see this inode, so another lookup from - * the same index will not find it again. - */ - if (XFS_INO_TO_AGNO(mp, ip->i_ino) != - pag->pag_agno) - continue; - first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); - if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) - done = 1; - } - - /* unlock now we've grabbed the inodes. 
*/ - rcu_read_unlock(); - - for (i = 0; i < nr_found; i++) { - if (batch[i]) - xfs_reclaim_inode(batch[i], pag); - } - - *nr_to_scan -= XFS_LOOKUP_BATCH; - cond_resched(); - } while (nr_found && !done && *nr_to_scan > 0); - - if (done) - first_index = 0; - WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index); - xfs_perag_put(pag); - } + return (mp->m_flags & XFS_MOUNT_UNMOUNTING) || + (mp->m_flags & XFS_MOUNT_NORECOVERY) || + XFS_FORCED_SHUTDOWN(mp); } void xfs_reclaim_inodes( struct xfs_mount *mp) { - int nr_to_scan = INT_MAX; + struct xfs_icwalk icw = { + .icw_flags = 0, + }; + + if (xfs_want_reclaim_sick(mp)) + icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK; while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { xfs_ail_push_all_sync(mp->m_ail); - xfs_reclaim_inodes_ag(mp, &nr_to_scan); + xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw); } } @@ -1160,13 +1084,21 @@ xfs_reclaim_inodes( long xfs_reclaim_inodes_nr( struct xfs_mount *mp, - int nr_to_scan) + unsigned long nr_to_scan) { + struct xfs_icwalk icw = { + .icw_flags = XFS_ICWALK_FLAG_SCAN_LIMIT, + .icw_scan_limit = min_t(unsigned long, LONG_MAX, nr_to_scan), + }; + + if (xfs_want_reclaim_sick(mp)) + icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK; + /* kick background reclaimer and push the AIL */ xfs_reclaim_work_queue(mp); xfs_ail_push_all(mp->m_ail); - xfs_reclaim_inodes_ag(mp, &nr_to_scan); + xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw); return 0; } @@ -1174,13 +1106,13 @@ xfs_reclaim_inodes_nr( * Return the number of reclaimable inodes in the filesystem for * the shrinker to determine how much to reclaim. 
*/ -int +long xfs_reclaim_inodes_count( struct xfs_mount *mp) { struct xfs_perag *pag; xfs_agnumber_t ag = 0; - int reclaimable = 0; + long reclaimable = 0; while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { ag = pag->pag_agno + 1; @@ -1191,20 +1123,20 @@ xfs_reclaim_inodes_count( } STATIC bool -xfs_inode_match_id( +xfs_icwalk_match_id( struct xfs_inode *ip, - struct xfs_eofblocks *eofb) + struct xfs_icwalk *icw) { - if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && - !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) + if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) && + !uid_eq(VFS_I(ip)->i_uid, icw->icw_uid)) return false; - if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && - !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) + if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) && + !gid_eq(VFS_I(ip)->i_gid, icw->icw_gid)) return false; - if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && - ip->i_projid != eofb->eof_prid) + if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) && + ip->i_projid != icw->icw_prid) return false; return true; @@ -1215,20 +1147,20 @@ xfs_inode_match_id( * criteria match. This is for global/internal scans only. 
*/ STATIC bool -xfs_inode_match_id_union( +xfs_icwalk_match_id_union( struct xfs_inode *ip, - struct xfs_eofblocks *eofb) + struct xfs_icwalk *icw) { - if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && - uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) + if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) && + uid_eq(VFS_I(ip)->i_uid, icw->icw_uid)) return true; - if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && - gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) + if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) && + gid_eq(VFS_I(ip)->i_gid, icw->icw_gid)) return true; - if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && - ip->i_projid == eofb->eof_prid) + if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) && + ip->i_projid == icw->icw_prid) return true; return false; @@ -1236,29 +1168,29 @@ xfs_inode_match_id_union( /* * Is this inode @ip eligible for eof/cow block reclamation, given some - * filtering parameters @eofb? The inode is eligible if @eofb is null or + * filtering parameters @icw? The inode is eligible if @icw is null or * if the predicate functions match. 
*/ static bool -xfs_inode_matches_eofb( +xfs_icwalk_match( struct xfs_inode *ip, - struct xfs_eofblocks *eofb) + struct xfs_icwalk *icw) { bool match; - if (!eofb) + if (!icw) return true; - if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) - match = xfs_inode_match_id_union(ip, eofb); + if (icw->icw_flags & XFS_ICWALK_FLAG_UNION) + match = xfs_icwalk_match_id_union(ip, icw); else - match = xfs_inode_match_id(ip, eofb); + match = xfs_icwalk_match_id(ip, icw); if (!match) return false; /* skip the inode if the file size is too small */ - if ((eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE) && - XFS_ISIZE(ip) < eofb->eof_min_file_size) + if ((icw->icw_flags & XFS_ICWALK_FLAG_MINFILESIZE) && + XFS_ISIZE(ip) < icw->icw_min_file_size) return false; return true; @@ -1276,22 +1208,20 @@ xfs_reclaim_worker( { struct xfs_mount *mp = container_of(to_delayed_work(work), struct xfs_mount, m_reclaim_work); - int nr_to_scan = INT_MAX; - xfs_reclaim_inodes_ag(mp, &nr_to_scan); + xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL); xfs_reclaim_work_queue(mp); } STATIC int xfs_inode_free_eofblocks( struct xfs_inode *ip, - void *args, + struct xfs_icwalk *icw, unsigned int *lockflags) { - struct xfs_eofblocks *eofb = args; bool wait; - wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC); + wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC); if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS)) return 0; @@ -1303,7 +1233,7 @@ xfs_inode_free_eofblocks( if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) return 0; - if (!xfs_inode_matches_eofb(ip, eofb)) + if (!xfs_icwalk_match(ip, icw)) return 0; /* @@ -1326,22 +1256,6 @@ xfs_inode_free_eofblocks( return 0; } -/* - * Background scanning to trim preallocated space. This is queued based on the - * 'speculative_prealloc_lifetime' tunable (5m by default). 
- */ -static inline void -xfs_blockgc_queue( - struct xfs_perag *pag) -{ - rcu_read_lock(); - if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) - queue_delayed_work(pag->pag_mount->m_gc_workqueue, - &pag->pag_blockgc_work, - msecs_to_jiffies(xfs_blockgc_secs * 1000)); - rcu_read_unlock(); -} - static void xfs_blockgc_set_iflag( struct xfs_inode *ip, @@ -1349,7 +1263,6 @@ xfs_blockgc_set_iflag( { struct xfs_mount *mp = ip->i_mount; struct xfs_perag *pag; - int tagged; ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0); @@ -1366,24 +1279,8 @@ xfs_blockgc_set_iflag( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); spin_lock(&pag->pag_ici_lock); - tagged = radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG); - radix_tree_tag_set(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), - XFS_ICI_BLOCKGC_TAG); - if (!tagged) { - /* propagate the blockgc tag up into the perag radix tree */ - spin_lock(&ip->i_mount->m_perag_lock); - radix_tree_tag_set(&ip->i_mount->m_perag_tree, - XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - XFS_ICI_BLOCKGC_TAG); - spin_unlock(&ip->i_mount->m_perag_lock); - - /* kick off background trimming */ - xfs_blockgc_queue(pag); - - trace_xfs_perag_set_blockgc(ip->i_mount, pag->pag_agno, -1, - _RET_IP_); - } + xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), + XFS_ICI_BLOCKGC_TAG); spin_unlock(&pag->pag_ici_lock); xfs_perag_put(pag); @@ -1419,19 +1316,8 @@ xfs_blockgc_clear_iflag( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); spin_lock(&pag->pag_ici_lock); - radix_tree_tag_clear(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), - XFS_ICI_BLOCKGC_TAG); - if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) { - /* clear the blockgc tag from the perag radix tree */ - spin_lock(&ip->i_mount->m_perag_lock); - radix_tree_tag_clear(&ip->i_mount->m_perag_tree, - XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - XFS_ICI_BLOCKGC_TAG); - spin_unlock(&ip->i_mount->m_perag_lock); 
- trace_xfs_perag_clear_blockgc(ip->i_mount, pag->pag_agno, -1, - _RET_IP_); - } + xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), + XFS_ICI_BLOCKGC_TAG); spin_unlock(&pag->pag_ici_lock); xfs_perag_put(pag); @@ -1492,14 +1378,13 @@ xfs_prep_free_cowblocks( STATIC int xfs_inode_free_cowblocks( struct xfs_inode *ip, - void *args, + struct xfs_icwalk *icw, unsigned int *lockflags) { - struct xfs_eofblocks *eofb = args; bool wait; int ret = 0; - wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC); + wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC); if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS)) return 0; @@ -1507,7 +1392,7 @@ xfs_inode_free_cowblocks( if (!xfs_prep_free_cowblocks(ip)) return 0; - if (!xfs_inode_matches_eofb(ip, eofb)) + if (!xfs_icwalk_match(ip, icw)) return 0; /* @@ -1554,14 +1439,6 @@ xfs_inode_clear_cowblocks_tag( return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS); } -#define for_each_perag_tag(mp, next_agno, pag, tag) \ - for ((next_agno) = 0, (pag) = xfs_perag_get_tag((mp), 0, (tag)); \ - (pag) != NULL; \ - (next_agno) = (pag)->pag_agno + 1, \ - xfs_perag_put(pag), \ - (pag) = xfs_perag_get_tag((mp), (next_agno), (tag))) - - /* Disable post-EOF and CoW block auto-reclamation. */ void xfs_blockgc_stop( @@ -1586,23 +1463,66 @@ xfs_blockgc_start( xfs_blockgc_queue(pag); } +/* Don't try to run block gc on an inode that's in any of these states. */ +#define XFS_BLOCKGC_NOGRAB_IFLAGS (XFS_INEW | \ + XFS_IRECLAIMABLE | \ + XFS_IRECLAIM) +/* + * Decide if the given @ip is eligible for garbage collection of speculative + * preallocations, and grab it if so. Returns true if it's ready to go or + * false if we should just ignore it. 
+ */ +static bool +xfs_blockgc_igrab( + struct xfs_inode *ip) +{ + struct inode *inode = VFS_I(ip); + + ASSERT(rcu_read_lock_held()); + + /* Check for stale RCU freed inode */ + spin_lock(&ip->i_flags_lock); + if (!ip->i_ino) + goto out_unlock_noent; + + if (ip->i_flags & XFS_BLOCKGC_NOGRAB_IFLAGS) + goto out_unlock_noent; + spin_unlock(&ip->i_flags_lock); + + /* nothing to sync during shutdown */ + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return false; + + /* If we can't grab the inode, it must on it's way to reclaim. */ + if (!igrab(inode)) + return false; + + /* inode is valid */ + return true; + +out_unlock_noent: + spin_unlock(&ip->i_flags_lock); + return false; +} + /* Scan one incore inode for block preallocations that we can remove. */ static int xfs_blockgc_scan_inode( struct xfs_inode *ip, - void *args) + struct xfs_icwalk *icw) { unsigned int lockflags = 0; int error; - error = xfs_inode_free_eofblocks(ip, args, &lockflags); + error = xfs_inode_free_eofblocks(ip, icw, &lockflags); if (error) goto unlock; - error = xfs_inode_free_cowblocks(ip, args, &lockflags); + error = xfs_inode_free_cowblocks(ip, icw, &lockflags); unlock: if (lockflags) xfs_iunlock(ip, lockflags); + xfs_irele(ip); return error; } @@ -1618,8 +1538,7 @@ xfs_blockgc_worker( if (!sb_start_write_trylock(mp->m_super)) return; - error = xfs_inode_walk_ag(pag, 0, xfs_blockgc_scan_inode, NULL, - XFS_ICI_BLOCKGC_TAG); + error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL); if (error) xfs_info(mp, "AG %u preallocation gc worker failed, err=%d", pag->pag_agno, error); @@ -1633,12 +1552,11 @@ xfs_blockgc_worker( int xfs_blockgc_free_space( struct xfs_mount *mp, - struct xfs_eofblocks *eofb) + struct xfs_icwalk *icw) { - trace_xfs_blockgc_free_space(mp, eofb, _RET_IP_); + trace_xfs_blockgc_free_space(mp, icw, _RET_IP_); - return xfs_inode_walk(mp, 0, xfs_blockgc_scan_inode, eofb, - XFS_ICI_BLOCKGC_TAG); + return xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw); } /* @@ -1648,7 +1566,7 @@ 
xfs_blockgc_free_space( * scan. * * Callers must not hold any inode's ILOCK. If requesting a synchronous scan - * (XFS_EOF_FLAGS_SYNC), the caller also must not hold any inode's IOLOCK or + * (XFS_ICWALK_FLAG_SYNC), the caller also must not hold any inode's IOLOCK or * MMAPLOCK. */ int @@ -1657,9 +1575,9 @@ xfs_blockgc_free_dquots( struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, - unsigned int eof_flags) + unsigned int iwalk_flags) { - struct xfs_eofblocks eofb = {0}; + struct xfs_icwalk icw = {0}; bool do_work = false; if (!udqp && !gdqp && !pdqp) @@ -1669,40 +1587,260 @@ xfs_blockgc_free_dquots( * Run a scan to free blocks using the union filter to cover all * applicable quotas in a single scan. */ - eofb.eof_flags = XFS_EOF_FLAGS_UNION | eof_flags; + icw.icw_flags = XFS_ICWALK_FLAG_UNION | iwalk_flags; if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) { - eofb.eof_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id); - eofb.eof_flags |= XFS_EOF_FLAGS_UID; + icw.icw_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id); + icw.icw_flags |= XFS_ICWALK_FLAG_UID; do_work = true; } if (XFS_IS_UQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) { - eofb.eof_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id); - eofb.eof_flags |= XFS_EOF_FLAGS_GID; + icw.icw_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id); + icw.icw_flags |= XFS_ICWALK_FLAG_GID; do_work = true; } if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) { - eofb.eof_prid = pdqp->q_id; - eofb.eof_flags |= XFS_EOF_FLAGS_PRID; + icw.icw_prid = pdqp->q_id; + icw.icw_flags |= XFS_ICWALK_FLAG_PRID; do_work = true; } if (!do_work) return 0; - return xfs_blockgc_free_space(mp, &eofb); + return xfs_blockgc_free_space(mp, &icw); } /* Run cow/eofblocks scans on the quotas attached to the inode. 
*/ int xfs_blockgc_free_quota( struct xfs_inode *ip, - unsigned int eof_flags) + unsigned int iwalk_flags) { return xfs_blockgc_free_dquots(ip->i_mount, xfs_inode_dquot(ip, XFS_DQTYPE_USER), xfs_inode_dquot(ip, XFS_DQTYPE_GROUP), - xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), eof_flags); + xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), iwalk_flags); +} + +/* XFS Inode Cache Walking Code */ + +/* + * The inode lookup is done in batches to keep the amount of lock traffic and + * radix tree lookups to a minimum. The batch size is a trade off between + * lookup reduction and stack usage. This is in the reclaim path, so we can't + * be too greedy. + */ +#define XFS_LOOKUP_BATCH 32 + + +/* + * Decide if we want to grab this inode in anticipation of doing work towards + * the goal. + */ +static inline bool +xfs_icwalk_igrab( + enum xfs_icwalk_goal goal, + struct xfs_inode *ip, + struct xfs_icwalk *icw) +{ + switch (goal) { + case XFS_ICWALK_DQRELE: + return xfs_dqrele_igrab(ip); + case XFS_ICWALK_BLOCKGC: + return xfs_blockgc_igrab(ip); + case XFS_ICWALK_RECLAIM: + return xfs_reclaim_igrab(ip, icw); + default: + return false; + } +} + +/* + * Process an inode. Each processing function must handle any state changes + * made by the icwalk igrab function. Return -EAGAIN to skip an inode. + */ +static inline int +xfs_icwalk_process_inode( + enum xfs_icwalk_goal goal, + struct xfs_inode *ip, + struct xfs_perag *pag, + struct xfs_icwalk *icw) +{ + int error = 0; + + switch (goal) { + case XFS_ICWALK_DQRELE: + xfs_dqrele_inode(ip, icw); + break; + case XFS_ICWALK_BLOCKGC: + error = xfs_blockgc_scan_inode(ip, icw); + break; + case XFS_ICWALK_RECLAIM: + xfs_reclaim_inode(ip, pag); + break; + } + return error; +} + +/* + * For a given per-AG structure @pag and a goal, grab qualifying inodes and + * process them in some manner. 
+ */ +static int +xfs_icwalk_ag( + struct xfs_perag *pag, + enum xfs_icwalk_goal goal, + struct xfs_icwalk *icw) +{ + struct xfs_mount *mp = pag->pag_mount; + uint32_t first_index; + int last_error = 0; + int skipped; + bool done; + int nr_found; + +restart: + done = false; + skipped = 0; + if (goal == XFS_ICWALK_RECLAIM) + first_index = READ_ONCE(pag->pag_ici_reclaim_cursor); + else + first_index = 0; + nr_found = 0; + do { + struct xfs_inode *batch[XFS_LOOKUP_BATCH]; + unsigned int tag = xfs_icwalk_tag(goal); + int error = 0; + int i; + + rcu_read_lock(); + + if (tag == XFS_ICWALK_NULL_TAG) + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, + (void **)batch, first_index, + XFS_LOOKUP_BATCH); + else + nr_found = radix_tree_gang_lookup_tag( + &pag->pag_ici_root, + (void **) batch, first_index, + XFS_LOOKUP_BATCH, tag); + + if (!nr_found) { + done = true; + rcu_read_unlock(); + break; + } + + /* + * Grab the inodes before we drop the lock. if we found + * nothing, nr == 0 and the loop will be skipped. + */ + for (i = 0; i < nr_found; i++) { + struct xfs_inode *ip = batch[i]; + + if (done || !xfs_icwalk_igrab(goal, ip, icw)) + batch[i] = NULL; + + /* + * Update the index for the next lookup. Catch + * overflows into the next AG range which can occur if + * we have inodes in the last block of the AG and we + * are currently pointing to the last inode. + * + * Because we may see inodes that are from the wrong AG + * due to RCU freeing and reallocation, only update the + * index if it lies in this AG. It was a race that lead + * us to see this inode, so another lookup from the + * same index will not find it again. + */ + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) + continue; + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) + done = true; + } + + /* unlock now we've grabbed the inodes. 
*/ + rcu_read_unlock(); + + for (i = 0; i < nr_found; i++) { + if (!batch[i]) + continue; + error = xfs_icwalk_process_inode(goal, batch[i], pag, + icw); + if (error == -EAGAIN) { + skipped++; + continue; + } + if (error && last_error != -EFSCORRUPTED) + last_error = error; + } + + /* bail out if the filesystem is corrupted. */ + if (error == -EFSCORRUPTED) + break; + + cond_resched(); + + if (icw && (icw->icw_flags & XFS_ICWALK_FLAG_SCAN_LIMIT)) { + icw->icw_scan_limit -= XFS_LOOKUP_BATCH; + if (icw->icw_scan_limit <= 0) + break; + } + } while (nr_found && !done); + + if (goal == XFS_ICWALK_RECLAIM) { + if (done) + first_index = 0; + WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index); + } + + if (skipped) { + delay(1); + goto restart; + } + return last_error; +} + +/* Fetch the next (possibly tagged) per-AG structure. */ +static inline struct xfs_perag * +xfs_icwalk_get_perag( + struct xfs_mount *mp, + xfs_agnumber_t agno, + enum xfs_icwalk_goal goal) +{ + unsigned int tag = xfs_icwalk_tag(goal); + + if (tag == XFS_ICWALK_NULL_TAG) + return xfs_perag_get(mp, agno); + return xfs_perag_get_tag(mp, agno, tag); +} + +/* Walk all incore inodes to achieve a given goal. 
*/ +static int +xfs_icwalk( + struct xfs_mount *mp, + enum xfs_icwalk_goal goal, + struct xfs_icwalk *icw) +{ + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t agno = 0; + + while ((pag = xfs_icwalk_get_perag(mp, agno, goal))) { + agno = pag->pag_agno + 1; + error = xfs_icwalk_ag(pag, goal, icw); + xfs_perag_put(pag); + if (error) { + last_error = error; + if (error == -EFSCORRUPTED) + break; + } + } + return last_error; + BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID); } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index d1fddb152420..c751cc32dc46 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -9,22 +9,27 @@ struct xfs_mount; struct xfs_perag; -struct xfs_eofblocks { - __u32 eof_flags; - kuid_t eof_uid; - kgid_t eof_gid; - prid_t eof_prid; - __u64 eof_min_file_size; +struct xfs_icwalk { + __u32 icw_flags; + kuid_t icw_uid; + kgid_t icw_gid; + prid_t icw_prid; + __u64 icw_min_file_size; + long icw_scan_limit; }; -/* - * tags for inode radix tree - */ -#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup - in xfs_inode_walk */ -#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ -/* Inode has speculative preallocations (posteof or cow) to clean. */ -#define XFS_ICI_BLOCKGC_TAG 1 +/* Flags that reflect xfs_fs_eofblocks functionality. 
*/ +#define XFS_ICWALK_FLAG_SYNC (1U << 0) /* sync/wait mode scan */ +#define XFS_ICWALK_FLAG_UID (1U << 1) /* filter by uid */ +#define XFS_ICWALK_FLAG_GID (1U << 2) /* filter by gid */ +#define XFS_ICWALK_FLAG_PRID (1U << 3) /* filter by project id */ +#define XFS_ICWALK_FLAG_MINFILESIZE (1U << 4) /* filter by min file size */ + +#define XFS_ICWALK_FLAGS_VALID (XFS_ICWALK_FLAG_SYNC | \ + XFS_ICWALK_FLAG_UID | \ + XFS_ICWALK_FLAG_GID | \ + XFS_ICWALK_FLAG_PRID | \ + XFS_ICWALK_FLAG_MINFILESIZE) /* * Flags for xfs_iget() @@ -34,11 +39,6 @@ struct xfs_eofblocks { #define XFS_IGET_DONTCACHE 0x4 #define XFS_IGET_INCORE 0x8 /* don't read from disk or reinit */ -/* - * flags for AG inode iterator - */ -#define XFS_INODE_WALK_INEW_WAIT 0x1 /* wait on new inodes */ - int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, uint flags, uint lock_flags, xfs_inode_t **ipp); @@ -49,16 +49,16 @@ void xfs_inode_free(struct xfs_inode *ip); void xfs_reclaim_worker(struct work_struct *work); void xfs_reclaim_inodes(struct xfs_mount *mp); -int xfs_reclaim_inodes_count(struct xfs_mount *mp); -long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); +long xfs_reclaim_inodes_count(struct xfs_mount *mp); +long xfs_reclaim_inodes_nr(struct xfs_mount *mp, unsigned long nr_to_scan); -void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); +void xfs_inode_mark_reclaimable(struct xfs_inode *ip); int xfs_blockgc_free_dquots(struct xfs_mount *mp, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, - unsigned int eof_flags); -int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int eof_flags); -int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_eofblocks *eofb); + unsigned int iwalk_flags); +int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int iwalk_flags); +int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_icwalk *icm); void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct 
xfs_inode *ip); @@ -68,9 +68,11 @@ void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip); void xfs_blockgc_worker(struct work_struct *work); -int xfs_inode_walk(struct xfs_mount *mp, int iter_flags, - int (*execute)(struct xfs_inode *ip, void *args), - void *args, int tag); +#ifdef CONFIG_XFS_QUOTA +int xfs_dqrele_all_inodes(struct xfs_mount *mp, unsigned int qflags); +#else +# define xfs_dqrele_all_inodes(mp, qflags) (0) +#endif int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, bool *inuse); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index e4c2da4566f1..a835ceb79ba5 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -11,7 +11,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_inode.h" @@ -35,6 +34,7 @@ #include "xfs_log.h" #include "xfs_bmap_btree.h" #include "xfs_reflink.h" +#include "xfs_ag.h" kmem_zone_t *xfs_inode_zone; @@ -45,7 +45,8 @@ kmem_zone_t *xfs_inode_zone; #define XFS_ITRUNC_MAX_EXTENTS 2 STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *); -STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *); +STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag, + struct xfs_inode *); /* * helper function to extract extent size hint from inode @@ -778,7 +779,7 @@ xfs_inode_inherit_flags2( * Initialise a newly allocated inode and return the in-core inode to the * caller locked exclusively. 
*/ -static int +int xfs_init_new_inode( struct user_namespace *mnt_userns, struct xfs_trans *tp, @@ -877,7 +878,7 @@ xfs_init_new_inode( xfs_inode_inherit_flags(ip, pip); if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY)) xfs_inode_inherit_flags2(ip, pip); - /* FALLTHROUGH */ + fallthrough; case S_IFLNK: ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; ip->i_df.if_bytes = 0; @@ -915,57 +916,6 @@ xfs_init_new_inode( } /* - * Allocates a new inode from disk and return a pointer to the incore copy. This - * routine will internally commit the current transaction and allocate a new one - * if we needed to allocate more on-disk free inodes to perform the requested - * operation. - * - * If we are allocating quota inodes, we do not have a parent inode to attach to - * or associate with (i.e. dp == NULL) because they are not linked into the - * directory structure - they are attached directly to the superblock - and so - * have no parent. - */ -int -xfs_dir_ialloc( - struct user_namespace *mnt_userns, - struct xfs_trans **tpp, - struct xfs_inode *dp, - umode_t mode, - xfs_nlink_t nlink, - dev_t rdev, - prid_t prid, - bool init_xattrs, - struct xfs_inode **ipp) -{ - struct xfs_buf *agibp; - xfs_ino_t parent_ino = dp ? dp->i_ino : 0; - xfs_ino_t ino; - int error; - - ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES); - - /* - * Call the space management code to pick the on-disk inode to be - * allocated. - */ - error = xfs_dialloc_select_ag(tpp, parent_ino, mode, &agibp); - if (error) - return error; - - if (!agibp) - return -ENOSPC; - - /* Allocate an inode from the selected AG */ - error = xfs_dialloc_ag(*tpp, agibp, parent_ino, &ino); - if (error) - return error; - ASSERT(ino != NULLFSINO); - - return xfs_init_new_inode(mnt_userns, *tpp, dp, ino, mode, nlink, rdev, - prid, init_xattrs, ipp); -} - -/* * Decrement the link count on an inode & log the change. 
If this causes the * link count to go to zero, move the inode to AGI unlinked list so that it can * be freed when the last active reference goes away via xfs_inactive(). @@ -1022,6 +972,7 @@ xfs_create( struct xfs_dquot *pdqp = NULL; struct xfs_trans_res *tres; uint resblks; + xfs_ino_t ino; trace_xfs_create(dp, name); @@ -1078,14 +1029,16 @@ xfs_create( * entry pointing to them, but a directory also the "." entry * pointing to itself. */ - error = xfs_dir_ialloc(mnt_userns, &tp, dp, mode, is_dir ? 2 : 1, rdev, - prid, init_xattrs, &ip); + error = xfs_dialloc(&tp, dp->i_ino, mode, &ino); + if (!error) + error = xfs_init_new_inode(mnt_userns, tp, dp, ino, mode, + is_dir ? 2 : 1, rdev, prid, init_xattrs, &ip); if (error) goto out_trans_cancel; /* * Now we join the directory inode to the transaction. We do not do it - * earlier because xfs_dir_ialloc might commit the previous transaction + * earlier because xfs_dialloc might commit the previous transaction * (and release all the locks). An error from here on will result in * the transaction cancel unlocking dp so don't do it explicitly in the * error path. 
@@ -1175,6 +1128,7 @@ xfs_create_tmpfile( struct xfs_dquot *pdqp = NULL; struct xfs_trans_res *tres; uint resblks; + xfs_ino_t ino; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -1199,8 +1153,10 @@ xfs_create_tmpfile( if (error) goto out_release_dquots; - error = xfs_dir_ialloc(mnt_userns, &tp, dp, mode, 0, 0, prid, - false, &ip); + error = xfs_dialloc(&tp, dp->i_ino, mode, &ino); + if (!error) + error = xfs_init_new_inode(mnt_userns, tp, dp, ino, mode, + 0, 0, prid, false, &ip); if (error) goto out_trans_cancel; @@ -1315,7 +1271,11 @@ xfs_link( * Handle initial link state of O_TMPFILE inode */ if (VFS_I(sip)->i_nlink == 0) { - error = xfs_iunlink_remove(tp, sip); + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sip->i_ino)); + error = xfs_iunlink_remove(tp, pag, sip); + xfs_perag_put(pag); if (error) goto error_return; } @@ -1716,7 +1676,7 @@ xfs_inactive( */ if (VFS_I(ip)->i_mode == 0) { ASSERT(ip->i_df.if_broot_bytes == 0); - return; + goto out; } mp = ip->i_mount; @@ -1724,11 +1684,11 @@ xfs_inactive( /* If this is a read-only mount, don't do this (would generate I/O) */ if (mp->m_flags & XFS_MOUNT_RDONLY) - return; + goto out; /* Metadata inodes require explicit resource cleanup. */ if (xfs_is_metadata_inode(ip)) - return; + goto out; /* Try to clean out the cow blocks if there are any. 
*/ if (xfs_inode_has_cow_data(ip)) @@ -1747,7 +1707,7 @@ xfs_inactive( if (xfs_can_free_eofblocks(ip, true)) xfs_free_eofblocks(ip); - return; + goto out; } if (S_ISREG(VFS_I(ip)->i_mode) && @@ -1757,14 +1717,14 @@ xfs_inactive( error = xfs_qm_dqattach(ip); if (error) - return; + goto out; if (S_ISLNK(VFS_I(ip)->i_mode)) error = xfs_inactive_symlink(ip); else if (truncate) error = xfs_inactive_truncate(ip); if (error) - return; + goto out; /* * If there are attributes associated with the file then blow them away @@ -1774,7 +1734,7 @@ xfs_inactive( if (XFS_IFORK_Q(ip)) { error = xfs_attr_inactive(ip); if (error) - return; + goto out; } ASSERT(!ip->i_afp); @@ -1783,12 +1743,12 @@ xfs_inactive( /* * Free the inode. */ - error = xfs_inactive_ifree(ip); - if (error) - return; + xfs_inactive_ifree(ip); +out: /* - * Release the dquots held by inode, if any. + * We're done making metadata updates for this inode, so we can release + * the attached dquots. */ xfs_qm_dqdetach(ip); } @@ -2008,7 +1968,7 @@ xfs_iunlink_destroy( STATIC int xfs_iunlink_update_bucket( struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, struct xfs_buf *agibp, unsigned int bucket_index, xfs_agino_t new_agino) @@ -2017,10 +1977,10 @@ xfs_iunlink_update_bucket( xfs_agino_t old_value; int offset; - ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino)); + ASSERT(xfs_verify_agino_or_null(tp->t_mountp, pag->pag_agno, new_agino)); old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]); - trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index, + trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index, old_value, new_agino); /* @@ -2044,7 +2004,7 @@ xfs_iunlink_update_bucket( STATIC void xfs_iunlink_update_dinode( struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_agino_t agino, struct xfs_buf *ibp, struct xfs_dinode *dip, @@ -2054,9 +2014,9 @@ xfs_iunlink_update_dinode( struct xfs_mount *mp = tp->t_mountp; int offset; - 
ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); + ASSERT(xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino)); - trace_xfs_iunlink_update_dinode(mp, agno, agino, + trace_xfs_iunlink_update_dinode(mp, pag->pag_agno, agino, be32_to_cpu(dip->di_next_unlinked), next_agino); dip->di_next_unlinked = cpu_to_be32(next_agino); @@ -2074,7 +2034,7 @@ STATIC int xfs_iunlink_update_inode( struct xfs_trans *tp, struct xfs_inode *ip, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_agino_t next_agino, xfs_agino_t *old_next_agino) { @@ -2084,7 +2044,7 @@ xfs_iunlink_update_inode( xfs_agino_t old_value; int error; - ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); + ASSERT(xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino)); error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp); if (error) @@ -2093,7 +2053,7 @@ xfs_iunlink_update_inode( /* Make sure the old pointer isn't garbage. */ old_value = be32_to_cpu(dip->di_next_unlinked); - if (!xfs_verify_agino_or_null(mp, agno, old_value)) { + if (!xfs_verify_agino_or_null(mp, pag->pag_agno, old_value)) { xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, sizeof(*dip), __this_address); error = -EFSCORRUPTED; @@ -2116,7 +2076,7 @@ xfs_iunlink_update_inode( } /* Ok, update the new pointer. 
*/ - xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino), + xfs_iunlink_update_dinode(tp, pag, XFS_INO_TO_AGINO(mp, ip->i_ino), ibp, dip, &ip->i_imap, next_agino); return 0; out: @@ -2137,10 +2097,10 @@ xfs_iunlink( struct xfs_inode *ip) { struct xfs_mount *mp = tp->t_mountp; + struct xfs_perag *pag; struct xfs_agi *agi; struct xfs_buf *agibp; xfs_agino_t next_agino; - xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; int error; @@ -2149,10 +2109,12 @@ xfs_iunlink( ASSERT(VFS_I(ip)->i_mode != 0); trace_xfs_iunlink(ip); + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + /* Get the agi buffer first. It ensures lock ordering on the list. */ - error = xfs_read_agi(mp, tp, agno, &agibp); + error = xfs_read_agi(mp, tp, pag->pag_agno, &agibp); if (error) - return error; + goto out; agi = agibp->b_addr; /* @@ -2162,9 +2124,10 @@ xfs_iunlink( */ next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); if (next_agino == agino || - !xfs_verify_agino_or_null(mp, agno, next_agino)) { + !xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino)) { xfs_buf_mark_corrupt(agibp); - return -EFSCORRUPTED; + error = -EFSCORRUPTED; + goto out; } if (next_agino != NULLAGINO) { @@ -2174,23 +2137,26 @@ xfs_iunlink( * There is already another inode in the bucket, so point this * inode to the current head of the list. */ - error = xfs_iunlink_update_inode(tp, ip, agno, next_agino, + error = xfs_iunlink_update_inode(tp, ip, pag, next_agino, &old_agino); if (error) - return error; + goto out; ASSERT(old_agino == NULLAGINO); /* * agino has been unlinked, add a backref from the next inode * back to agino. */ - error = xfs_iunlink_add_backref(agibp->b_pag, agino, next_agino); + error = xfs_iunlink_add_backref(pag, agino, next_agino); if (error) - return error; + goto out; } /* Point the head of the list to point to this inode. 
*/ - return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino); + error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino); +out: + xfs_perag_put(pag); + return error; } /* Return the imap, dinode pointer, and buffer for an inode. */ @@ -2238,14 +2204,13 @@ xfs_iunlink_map_ino( STATIC int xfs_iunlink_map_prev( struct xfs_trans *tp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_agino_t head_agino, xfs_agino_t target_agino, xfs_agino_t *agino, struct xfs_imap *imap, struct xfs_dinode **dipp, - struct xfs_buf **bpp, - struct xfs_perag *pag) + struct xfs_buf **bpp) { struct xfs_mount *mp = tp->t_mountp; xfs_agino_t next_agino; @@ -2257,7 +2222,8 @@ xfs_iunlink_map_prev( /* See if our backref cache can find it faster. */ *agino = xfs_iunlink_lookup_backref(pag, target_agino); if (*agino != NULLAGINO) { - error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp); + error = xfs_iunlink_map_ino(tp, pag->pag_agno, *agino, imap, + dipp, bpp); if (error) return error; @@ -2273,7 +2239,7 @@ xfs_iunlink_map_prev( WARN_ON_ONCE(1); } - trace_xfs_iunlink_map_prev_fallback(mp, agno); + trace_xfs_iunlink_map_prev_fallback(mp, pag->pag_agno); /* Otherwise, walk the entire bucket until we find it. */ next_agino = head_agino; @@ -2284,8 +2250,8 @@ xfs_iunlink_map_prev( xfs_trans_brelse(tp, *bpp); *agino = next_agino; - error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp, - bpp); + error = xfs_iunlink_map_ino(tp, pag->pag_agno, next_agino, imap, + dipp, bpp); if (error) return error; @@ -2294,7 +2260,7 @@ xfs_iunlink_map_prev( * Make sure this pointer is valid and isn't an obvious * infinite loop. 
*/ - if (!xfs_verify_agino(mp, agno, unlinked_agino) || + if (!xfs_verify_agino(mp, pag->pag_agno, unlinked_agino) || next_agino == unlinked_agino) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, @@ -2314,6 +2280,7 @@ xfs_iunlink_map_prev( STATIC int xfs_iunlink_remove( struct xfs_trans *tp, + struct xfs_perag *pag, struct xfs_inode *ip) { struct xfs_mount *mp = tp->t_mountp; @@ -2321,7 +2288,6 @@ xfs_iunlink_remove( struct xfs_buf *agibp; struct xfs_buf *last_ibp; struct xfs_dinode *last_dip = NULL; - xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); xfs_agino_t next_agino; xfs_agino_t head_agino; @@ -2331,7 +2297,7 @@ xfs_iunlink_remove( trace_xfs_iunlink_remove(ip); /* Get the agi buffer first. It ensures lock ordering on the list. */ - error = xfs_read_agi(mp, tp, agno, &agibp); + error = xfs_read_agi(mp, tp, pag->pag_agno, &agibp); if (error) return error; agi = agibp->b_addr; @@ -2341,7 +2307,7 @@ xfs_iunlink_remove( * go on. Make sure the head pointer isn't garbage. */ head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); - if (!xfs_verify_agino(mp, agno, head_agino)) { + if (!xfs_verify_agino(mp, pag->pag_agno, head_agino)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi, sizeof(*agi)); return -EFSCORRUPTED; @@ -2352,7 +2318,7 @@ xfs_iunlink_remove( * the old pointer value so that we can update whatever was previous * to us in the list to point to whatever was next in the list. */ - error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino); + error = xfs_iunlink_update_inode(tp, ip, pag, NULLAGINO, &next_agino); if (error) return error; @@ -2364,8 +2330,7 @@ xfs_iunlink_remove( * this inode's backref to point from the next inode. 
*/ if (next_agino != NULLAGINO) { - error = xfs_iunlink_change_backref(agibp->b_pag, next_agino, - NULLAGINO); + error = xfs_iunlink_change_backref(pag, next_agino, NULLAGINO); if (error) return error; } @@ -2375,14 +2340,13 @@ xfs_iunlink_remove( xfs_agino_t prev_agino; /* We need to search the list for the inode being freed. */ - error = xfs_iunlink_map_prev(tp, agno, head_agino, agino, - &prev_agino, &imap, &last_dip, &last_ibp, - agibp->b_pag); + error = xfs_iunlink_map_prev(tp, pag, head_agino, agino, + &prev_agino, &imap, &last_dip, &last_ibp); if (error) return error; /* Point the previous inode on the list to the next inode. */ - xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp, + xfs_iunlink_update_dinode(tp, pag, prev_agino, last_ibp, last_dip, &imap, next_agino); /* @@ -2398,7 +2362,7 @@ xfs_iunlink_remove( } /* Point the head of the list to the next unlinked inode. */ - return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, + return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, next_agino); } @@ -2409,12 +2373,11 @@ xfs_iunlink_remove( */ static void xfs_ifree_mark_inode_stale( - struct xfs_buf *bp, + struct xfs_perag *pag, struct xfs_inode *free_ip, xfs_ino_t inum) { - struct xfs_mount *mp = bp->b_mount; - struct xfs_perag *pag = bp->b_pag; + struct xfs_mount *mp = pag->pag_mount; struct xfs_inode_log_item *iip; struct xfs_inode *ip; @@ -2504,10 +2467,11 @@ out_iflags_unlock: * inodes that are in memory - they all must be marked stale and attached to * the cluster buffer. */ -STATIC int +static int xfs_ifree_cluster( - struct xfs_inode *free_ip, struct xfs_trans *tp, + struct xfs_perag *pag, + struct xfs_inode *free_ip, struct xfs_icluster *xic) { struct xfs_mount *mp = free_ip->i_mount; @@ -2569,7 +2533,7 @@ xfs_ifree_cluster( * already marked XFS_ISTALE. 
*/ for (i = 0; i < igeo->inodes_per_cluster; i++) - xfs_ifree_mark_inode_stale(bp, free_ip, inum + i); + xfs_ifree_mark_inode_stale(pag, free_ip, inum + i); xfs_trans_stale_inode_buf(tp, bp); xfs_trans_binval(tp, bp); @@ -2592,9 +2556,11 @@ xfs_ifree( struct xfs_trans *tp, struct xfs_inode *ip) { - int error; + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; struct xfs_icluster xic = { 0 }; struct xfs_inode_log_item *iip = ip->i_itemp; + int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(VFS_I(ip)->i_nlink == 0); @@ -2602,16 +2568,18 @@ xfs_ifree( ASSERT(ip->i_disk_size == 0 || !S_ISREG(VFS_I(ip)->i_mode)); ASSERT(ip->i_nblocks == 0); + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + /* * Pull the on-disk inode from the AGI unlinked list. */ - error = xfs_iunlink_remove(tp, ip); + error = xfs_iunlink_remove(tp, pag, ip); if (error) - return error; + goto out; - error = xfs_difree(tp, ip->i_ino, &xic); + error = xfs_difree(tp, pag, ip->i_ino, &xic); if (error) - return error; + goto out; /* * Free any local-format data sitting around before we reset the @@ -2626,7 +2594,7 @@ xfs_ifree( VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ ip->i_diflags = 0; - ip->i_diflags2 = ip->i_mount->m_ino_geo.new_diflags2; + ip->i_diflags2 = mp->m_ino_geo.new_diflags2; ip->i_forkoff = 0; /* mark the attr fork not in use */ ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; if (xfs_iflags_test(ip, XFS_IPRESERVE_DM_FIELDS)) @@ -2645,8 +2613,9 @@ xfs_ifree( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (xic.deleted) - error = xfs_ifree_cluster(ip, tp, &xic); - + error = xfs_ifree_cluster(tp, pag, ip, &xic); +out: + xfs_perag_put(pag); return error; } @@ -2664,7 +2633,7 @@ xfs_iunpin( trace_xfs_inode_unpin_nowait(ip, _RET_IP_); /* Give the log a push to start the unpinning I/O */ - xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0, NULL); + xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL); } @@ -3250,8 +3219,13 @@ 
xfs_rename( * in future. */ if (wip) { + struct xfs_perag *pag; + ASSERT(VFS_I(wip)->i_nlink == 0); - error = xfs_iunlink_remove(tp, wip); + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, wip->i_ino)); + error = xfs_iunlink_remove(tp, pag, wip); + xfs_perag_put(pag); if (error) goto out_trans_cancel; @@ -3673,16 +3647,16 @@ int xfs_log_force_inode( struct xfs_inode *ip) { - xfs_lsn_t lsn = 0; + xfs_csn_t seq = 0; xfs_ilock(ip, XFS_ILOCK_SHARED); if (xfs_ipincount(ip)) - lsn = ip->i_itemp->ili_last_lsn; + seq = ip->i_itemp->ili_commit_seq; xfs_iunlock(ip, XFS_ILOCK_SHARED); - if (!lsn) + if (!seq) return 0; - return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL); + return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, NULL); } /* diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index ca826cfba91c..4b6703dbffb8 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -431,11 +431,10 @@ void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode, xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); -int xfs_dir_ialloc(struct user_namespace *mnt_userns, - struct xfs_trans **tpp, struct xfs_inode *dp, - umode_t mode, xfs_nlink_t nlink, dev_t dev, - prid_t prid, bool need_xattr, - struct xfs_inode **ipp); +int xfs_init_new_inode(struct user_namespace *mnt_userns, struct xfs_trans *tp, + struct xfs_inode *pip, xfs_ino_t ino, umode_t mode, + xfs_nlink_t nlink, dev_t rdev, prid_t prid, bool init_xattrs, + struct xfs_inode **ipp); static inline int xfs_itruncate_extents( diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 6764d12342da..35de30849fcc 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -28,6 +28,20 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip) return container_of(lip, struct xfs_inode_log_item, ili_item); } +/* + * The logged size of an inode fork is always the current size of the inode + * fork. 
This means that when an inode fork is relogged, the size of the logged + * region is determined by the current state, not the combination of the + * previously logged state + the current state. This is different relogging + * behaviour to most other log items which will retain the size of the + * previously logged changes when smaller regions are relogged. + * + * Hence operations that remove data from the inode fork (e.g. shortform + * dir/attr remove, extent form extent removal, etc), the size of the relogged + * inode gets -smaller- rather than stays the same size as the previously logged + * size and this can result in the committing transaction reducing the amount of + * space being consumed by the CIL. + */ STATIC void xfs_inode_item_data_fork_size( struct xfs_inode_log_item *iip, @@ -629,9 +643,9 @@ xfs_inode_item_committed( STATIC void xfs_inode_item_committing( struct xfs_log_item *lip, - xfs_lsn_t commit_lsn) + xfs_csn_t seq) { - INODE_ITEM(lip)->ili_last_lsn = commit_lsn; + INODE_ITEM(lip)->ili_commit_seq = seq; return xfs_inode_item_release(lip); } diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 4b926e32831c..403b45ab9aa2 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -33,7 +33,7 @@ struct xfs_inode_log_item { unsigned int ili_fields; /* fields to be logged */ unsigned int ili_fsync_fields; /* logged since last fsync */ xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ - xfs_lsn_t ili_last_lsn; /* lsn at last transaction */ + xfs_csn_t ili_commit_seq; /* last transaction commit */ }; static inline int xfs_inode_clean(struct xfs_inode *ip) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 1fe4c1fc0aea..65270e63c032 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -558,7 +558,7 @@ xfs_ioc_attrmulti_one( case ATTR_OP_REMOVE: value = NULL; *len = 0; - /* fall through */ + fallthrough; case ATTR_OP_SET: error = mnt_want_write_file(parfilp); if (error) @@ -1511,7 +1511,7 @@ xfs_ioc_getbmap( switch 
(cmd) { case XFS_IOC_GETBMAPA: bmx.bmv_iflags = BMV_IF_ATTRFORK; - /*FALLTHRU*/ + fallthrough; case XFS_IOC_GETBMAP: /* struct getbmap is a strict subset of struct getbmapx. */ recsize = sizeof(struct getbmap); @@ -1875,7 +1875,7 @@ out: static inline int xfs_fs_eofblocks_from_user( struct xfs_fs_eofblocks *src, - struct xfs_eofblocks *dst) + struct xfs_icwalk *dst) { if (src->eof_version != XFS_EOFBLOCKS_VERSION) return -EINVAL; @@ -1887,21 +1887,32 @@ xfs_fs_eofblocks_from_user( memchr_inv(src->pad64, 0, sizeof(src->pad64))) return -EINVAL; - dst->eof_flags = src->eof_flags; - dst->eof_prid = src->eof_prid; - dst->eof_min_file_size = src->eof_min_file_size; - - dst->eof_uid = INVALID_UID; + dst->icw_flags = 0; + if (src->eof_flags & XFS_EOF_FLAGS_SYNC) + dst->icw_flags |= XFS_ICWALK_FLAG_SYNC; + if (src->eof_flags & XFS_EOF_FLAGS_UID) + dst->icw_flags |= XFS_ICWALK_FLAG_UID; + if (src->eof_flags & XFS_EOF_FLAGS_GID) + dst->icw_flags |= XFS_ICWALK_FLAG_GID; + if (src->eof_flags & XFS_EOF_FLAGS_PRID) + dst->icw_flags |= XFS_ICWALK_FLAG_PRID; + if (src->eof_flags & XFS_EOF_FLAGS_MINFILESIZE) + dst->icw_flags |= XFS_ICWALK_FLAG_MINFILESIZE; + + dst->icw_prid = src->eof_prid; + dst->icw_min_file_size = src->eof_min_file_size; + + dst->icw_uid = INVALID_UID; if (src->eof_flags & XFS_EOF_FLAGS_UID) { - dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid); - if (!uid_valid(dst->eof_uid)) + dst->icw_uid = make_kuid(current_user_ns(), src->eof_uid); + if (!uid_valid(dst->icw_uid)) return -EINVAL; } - dst->eof_gid = INVALID_GID; + dst->icw_gid = INVALID_GID; if (src->eof_flags & XFS_EOF_FLAGS_GID) { - dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid); - if (!gid_valid(dst->eof_gid)) + dst->icw_gid = make_kgid(current_user_ns(), src->eof_gid); + if (!gid_valid(dst->icw_gid)) return -EINVAL; } return 0; @@ -2164,8 +2175,8 @@ xfs_file_ioctl( return xfs_errortag_clearall(mp); case XFS_IOC_FREE_EOFBLOCKS: { - struct xfs_fs_eofblocks eofb; - struct xfs_eofblocks 
keofb; + struct xfs_fs_eofblocks eofb; + struct xfs_icwalk icw; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -2176,14 +2187,14 @@ xfs_file_ioctl( if (copy_from_user(&eofb, arg, sizeof(eofb))) return -EFAULT; - error = xfs_fs_eofblocks_from_user(&eofb, &keofb); + error = xfs_fs_eofblocks_from_user(&eofb, &icw); if (error) return error; - trace_xfs_ioc_free_eofblocks(mp, &keofb, _RET_IP_); + trace_xfs_ioc_free_eofblocks(mp, &icw, _RET_IP_); sb_start_write(mp->m_super); - error = xfs_blockgc_free_space(mp, &keofb); + error = xfs_blockgc_free_space(mp, &icw); sb_end_write(mp->m_super); return error; } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index d154f42e2dc6..d8cd2583dedb 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1036,7 +1036,7 @@ retry: prealloc_blocks = 0; goto retry; } - /*FALLTHRU*/ + fallthrough; default: goto out_unlock; } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index dfe24b7f26e5..93c082db04b7 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -543,7 +543,7 @@ xfs_stat_blksize( * always return the realtime extent size. 
*/ if (XFS_IS_REALTIME_INODE(ip)) - return xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog; + return XFS_FSB_TO_B(mp, xfs_get_extsz_hint(ip)); /* * Allow large block sizes to be reported to userspace programs if the @@ -560,7 +560,7 @@ xfs_stat_blksize( */ if (mp->m_flags & XFS_MOUNT_LARGEIO) { if (mp->m_swidth) - return mp->m_swidth << mp->m_sb.sb_blocklog; + return XFS_FSB_TO_B(mp, mp->m_swidth); if (mp->m_flags & XFS_MOUNT_ALLOCSIZE) return 1U << mp->m_allocsize_log; } diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index c4a340f1f1e1..917d51eefee3 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -21,6 +21,7 @@ #include "xfs_health.h" #include "xfs_trans.h" #include "xfs_pwork.h" +#include "xfs_ag.h" /* * Walking Inodes in the Filesystem @@ -51,6 +52,7 @@ struct xfs_iwalk_ag { struct xfs_mount *mp; struct xfs_trans *tp; + struct xfs_perag *pag; /* Where do we start the traversal? */ xfs_ino_t startino; @@ -90,7 +92,7 @@ struct xfs_iwalk_ag { STATIC void xfs_iwalk_ichunk_ra( struct xfs_mount *mp, - xfs_agnumber_t agno, + struct xfs_perag *pag, struct xfs_inobt_rec_incore *irec) { struct xfs_ino_geometry *igeo = M_IGEO(mp); @@ -106,7 +108,7 @@ xfs_iwalk_ichunk_ra( imask = xfs_inobt_maskn(i, igeo->inodes_per_cluster); if (imask & ~irec->ir_free) { - xfs_btree_reada_bufs(mp, agno, agbno, + xfs_btree_reada_bufs(mp, pag->pag_agno, agbno, igeo->blocks_per_cluster, &xfs_inode_buf_ops); } @@ -174,26 +176,25 @@ xfs_iwalk_free( /* For each inuse inode in each cached inobt record, call our function. 
*/ STATIC int xfs_iwalk_ag_recs( - struct xfs_iwalk_ag *iwag) + struct xfs_iwalk_ag *iwag) { - struct xfs_mount *mp = iwag->mp; - struct xfs_trans *tp = iwag->tp; - xfs_ino_t ino; - unsigned int i, j; - xfs_agnumber_t agno; - int error; + struct xfs_mount *mp = iwag->mp; + struct xfs_trans *tp = iwag->tp; + struct xfs_perag *pag = iwag->pag; + xfs_ino_t ino; + unsigned int i, j; + int error; - agno = XFS_INO_TO_AGNO(mp, iwag->startino); for (i = 0; i < iwag->nr_recs; i++) { struct xfs_inobt_rec_incore *irec = &iwag->recs[i]; - trace_xfs_iwalk_ag_rec(mp, agno, irec); + trace_xfs_iwalk_ag_rec(mp, pag->pag_agno, irec); if (xfs_pwork_want_abort(&iwag->pwork)) return 0; if (iwag->inobt_walk_fn) { - error = iwag->inobt_walk_fn(mp, tp, agno, irec, + error = iwag->inobt_walk_fn(mp, tp, pag->pag_agno, irec, iwag->data); if (error) return error; @@ -211,7 +212,8 @@ xfs_iwalk_ag_recs( continue; /* Otherwise call our function. */ - ino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino + j); + ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, + irec->ir_startino + j); error = iwag->iwalk_fn(mp, tp, ino, iwag->data); if (error) return error; @@ -257,7 +259,6 @@ xfs_iwalk_del_inobt( STATIC int xfs_iwalk_ag_start( struct xfs_iwalk_ag *iwag, - xfs_agnumber_t agno, xfs_agino_t agino, struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp, @@ -265,12 +266,13 @@ xfs_iwalk_ag_start( { struct xfs_mount *mp = iwag->mp; struct xfs_trans *tp = iwag->tp; + struct xfs_perag *pag = iwag->pag; struct xfs_inobt_rec_incore *irec; int error; /* Set up a fresh cursor and empty the inobt cache. 
*/ iwag->nr_recs = 0; - error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, curpp, agi_bpp); + error = xfs_inobt_cur(mp, tp, pag, XFS_BTNUM_INO, curpp, agi_bpp); if (error) return error; @@ -304,7 +306,7 @@ xfs_iwalk_ag_start( if (XFS_IS_CORRUPT(mp, *has_more != 1)) return -EFSCORRUPTED; - iwag->lastino = XFS_AGINO_TO_INO(mp, agno, + iwag->lastino = XFS_AGINO_TO_INO(mp, pag->pag_agno, irec->ir_startino + XFS_INODES_PER_CHUNK - 1); /* @@ -345,7 +347,6 @@ out_advance: STATIC int xfs_iwalk_run_callbacks( struct xfs_iwalk_ag *iwag, - xfs_agnumber_t agno, struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp, int *has_more) @@ -376,7 +377,7 @@ xfs_iwalk_run_callbacks( return 0; /* ...and recreate the cursor just past where we left off. */ - error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, curpp, agi_bpp); + error = xfs_inobt_cur(mp, tp, iwag->pag, XFS_BTNUM_INO, curpp, agi_bpp); if (error) return error; @@ -390,17 +391,17 @@ xfs_iwalk_ag( { struct xfs_mount *mp = iwag->mp; struct xfs_trans *tp = iwag->tp; + struct xfs_perag *pag = iwag->pag; struct xfs_buf *agi_bp = NULL; struct xfs_btree_cur *cur = NULL; - xfs_agnumber_t agno; xfs_agino_t agino; int has_more; int error = 0; /* Set up our cursor at the right place in the inode btree. */ - agno = XFS_INO_TO_AGNO(mp, iwag->startino); + ASSERT(pag->pag_agno == XFS_INO_TO_AGNO(mp, iwag->startino)); agino = XFS_INO_TO_AGINO(mp, iwag->startino); - error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more); + error = xfs_iwalk_ag_start(iwag, agino, &cur, &agi_bp, &has_more); while (!error && has_more) { struct xfs_inobt_rec_incore *irec; @@ -417,7 +418,7 @@ xfs_iwalk_ag( break; /* Make sure that we always move forward. */ - rec_fsino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino); + rec_fsino = XFS_AGINO_TO_INO(mp, pag->pag_agno, irec->ir_startino); if (iwag->lastino != NULLFSINO && XFS_IS_CORRUPT(mp, iwag->lastino >= rec_fsino)) { error = -EFSCORRUPTED; @@ -438,7 +439,7 @@ xfs_iwalk_ag( * walking the inodes. 
*/ if (iwag->iwalk_fn) - xfs_iwalk_ichunk_ra(mp, agno, irec); + xfs_iwalk_ichunk_ra(mp, pag, irec); /* * If there's space in the buffer for more records, increment @@ -458,15 +459,14 @@ xfs_iwalk_ag( * we would be if we had been able to increment like above. */ ASSERT(has_more); - error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp, - &has_more); + error = xfs_iwalk_run_callbacks(iwag, &cur, &agi_bp, &has_more); } if (iwag->nr_recs == 0 || error) goto out; /* Walk the unprocessed records in the cache. */ - error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp, &has_more); + error = xfs_iwalk_run_callbacks(iwag, &cur, &agi_bp, &has_more); out: xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error); @@ -555,6 +555,7 @@ xfs_iwalk( .pwork = XFS_PWORK_SINGLE_THREADED, .lastino = NULLFSINO, }; + struct xfs_perag *pag; xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); int error; @@ -565,15 +566,19 @@ xfs_iwalk( if (error) return error; - for (; agno < mp->m_sb.sb_agcount; agno++) { + for_each_perag_from(mp, agno, pag) { + iwag.pag = pag; error = xfs_iwalk_ag(&iwag); if (error) break; iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0); if (flags & XFS_INOBT_WALK_SAME_AG) break; + iwag.pag = NULL; } + if (iwag.pag) + xfs_perag_put(pag); xfs_iwalk_free(&iwag); return error; } @@ -598,6 +603,7 @@ xfs_iwalk_ag_work( error = xfs_iwalk_ag(iwag); xfs_iwalk_free(iwag); out: + xfs_perag_put(iwag->pag); kmem_free(iwag); return error; } @@ -617,6 +623,7 @@ xfs_iwalk_threaded( void *data) { struct xfs_pwork_ctl pctl; + struct xfs_perag *pag; xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); int error; @@ -627,7 +634,7 @@ xfs_iwalk_threaded( if (error) return error; - for (; agno < mp->m_sb.sb_agcount; agno++) { + for_each_perag_from(mp, agno, pag) { struct xfs_iwalk_ag *iwag; if (xfs_pwork_ctl_want_abort(&pctl)) @@ -635,17 +642,25 @@ xfs_iwalk_threaded( iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0); iwag->mp = mp; + + /* + * perag is being handed off to async work, so take 
another + * reference for the async work to release. + */ + atomic_inc(&pag->pag_ref); + iwag->pag = pag; iwag->iwalk_fn = iwalk_fn; iwag->data = data; iwag->startino = startino; iwag->sz_recs = xfs_iwalk_prefetch(inode_records); iwag->lastino = NULLFSINO; xfs_pwork_queue(&pctl, &iwag->pwork); - startino = XFS_AGINO_TO_INO(mp, agno + 1, 0); + startino = XFS_AGINO_TO_INO(mp, pag->pag_agno + 1, 0); if (flags & XFS_INOBT_WALK_SAME_AG) break; } - + if (pag) + xfs_perag_put(pag); if (polled) xfs_pwork_poll(&pctl); return xfs_pwork_destroy(&pctl); @@ -715,6 +730,7 @@ xfs_inobt_walk( .pwork = XFS_PWORK_SINGLE_THREADED, .lastino = NULLFSINO, }; + struct xfs_perag *pag; xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); int error; @@ -725,15 +741,19 @@ xfs_inobt_walk( if (error) return error; - for (; agno < mp->m_sb.sb_agcount; agno++) { + for_each_perag_from(mp, agno, pag) { + iwag.pag = pag; error = xfs_iwalk_ag(&iwag); if (error) break; - iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0); + iwag.startino = XFS_AGINO_TO_INO(mp, pag->pag_agno + 1, 0); if (flags & XFS_INOBT_WALK_SAME_AG) break; + iwag.pag = NULL; } + if (iwag.pag) + xfs_perag_put(pag); xfs_iwalk_free(&iwag); return error; } diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 7688663b9773..c174262a074e 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -196,6 +196,8 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y) int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count, char *data, unsigned int op); +void xfs_flush_bdev_async(struct bio *bio, struct block_device *bdev, + struct completion *done); #define ASSERT_ALWAYS(expr) \ (likely(expr) ? 
(void)0 : assfail(NULL, #expr, __FILE__, __LINE__)) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index c19a82adea1e..36fa2650b081 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -502,6 +502,7 @@ __xlog_state_release_iclog( iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); xlog_verify_tail_lsn(log, iclog, tail_lsn); /* cycle incremented when incrementing curr_block */ + trace_xlog_iclog_syncing(iclog, _RET_IP_); return true; } @@ -513,13 +514,14 @@ __xlog_state_release_iclog( * Flush iclog to disk if this is the last reference to the given iclog and the * it is in the WANT_SYNC state. */ -static int +int xlog_state_release_iclog( struct xlog *log, struct xlog_in_core *iclog) { lockdep_assert_held(&log->l_icloglock); + trace_xlog_iclog_release(iclog, _RET_IP_); if (iclog->ic_state == XLOG_STATE_IOERROR) return -EIO; @@ -533,23 +535,6 @@ xlog_state_release_iclog( return 0; } -void -xfs_log_release_iclog( - struct xlog_in_core *iclog) -{ - struct xlog *log = iclog->ic_log; - bool sync = false; - - if (atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) { - if (iclog->ic_state != XLOG_STATE_IOERROR) - sync = __xlog_state_release_iclog(log, iclog); - spin_unlock(&log->l_icloglock); - } - - if (sync) - xlog_sync(log, iclog); -} - /* * Mount a log filesystem * @@ -770,6 +755,9 @@ xfs_log_mount_finish( if (readonly) mp->m_flags |= XFS_MOUNT_RDONLY; + /* Make sure the log is dead if we're returning failure. */ + ASSERT(!error || (mp->m_log->l_flags & XLOG_IO_ERROR)); + return error; } @@ -786,16 +774,19 @@ xfs_log_mount_cancel( } /* - * Wait for the iclog to be written disk, or return an error if the log has been - * shut down. + * Wait for the iclog and all prior iclogs to be written disk as required by the + * log force state machine. Waiting on ic_force_wait ensures iclog completions + * have been ordered and callbacks run before we are woken here, hence + * guaranteeing that all the iclogs up to this one are on stable storage. 
*/ -static int +int xlog_wait_on_iclog( struct xlog_in_core *iclog) __releases(iclog->ic_log->l_icloglock) { struct xlog *log = iclog->ic_log; + trace_xlog_iclog_wait_on(iclog, _RET_IP_); if (!XLOG_FORCED_SHUTDOWN(log) && iclog->ic_state != XLOG_STATE_ACTIVE && iclog->ic_state != XLOG_STATE_DIRTY) { @@ -818,9 +809,7 @@ xlog_wait_on_iclog( static int xlog_write_unmount_record( struct xlog *log, - struct xlog_ticket *ticket, - xfs_lsn_t *lsn, - uint flags) + struct xlog_ticket *ticket) { struct xfs_unmount_log_format ulf = { .magic = XLOG_UNMOUNT_TYPE, @@ -837,7 +826,15 @@ xlog_write_unmount_record( /* account for space used by record data */ ticket->t_curr_res -= sizeof(ulf); - return xlog_write(log, &vec, ticket, lsn, NULL, flags, false); + + /* + * For external log devices, we need to flush the data device cache + * first to ensure all metadata writeback is on stable storage before we + * stamp the tail LSN into the unmount record. + */ + if (log->l_targ != log->l_mp->m_ddev_targp) + blkdev_issue_flush(log->l_targ->bt_bdev); + return xlog_write(log, &vec, ticket, NULL, NULL, XLOG_UNMOUNT_TRANS); } /* @@ -851,15 +848,13 @@ xlog_unmount_write( struct xfs_mount *mp = log->l_mp; struct xlog_in_core *iclog; struct xlog_ticket *tic = NULL; - xfs_lsn_t lsn; - uint flags = XLOG_UNMOUNT_TRANS; int error; error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0); if (error) goto out_err; - error = xlog_write_unmount_record(log, tic, &lsn, flags); + error = xlog_write_unmount_record(log, tic); /* * At this point, we're umounting anyway, so there's no point in * transitioning log state to IOERROR. Just continue... @@ -876,6 +871,11 @@ out_err: else ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || iclog->ic_state == XLOG_STATE_IOERROR); + /* + * Ensure the journal is fully flushed and on stable storage once the + * iclog containing the unmount record is written. 
+ */ + iclog->ic_flags |= (XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); error = xlog_state_release_iclog(log, iclog); xlog_wait_on_iclog(iclog); @@ -1401,6 +1401,11 @@ xlog_alloc_log( xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0); log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ + if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) + log->l_iclog_roundoff = mp->m_sb.sb_logsunit; + else + log->l_iclog_roundoff = BBSIZE; + xlog_grant_head_init(&log->l_reserve_head); xlog_grant_head_init(&log->l_write_head); @@ -1479,7 +1484,6 @@ xlog_alloc_log( iclog->ic_state = XLOG_STATE_ACTIVE; iclog->ic_log = log; atomic_set(&iclog->ic_refcnt, 0); - spin_lock_init(&iclog->ic_callback_lock); INIT_LIST_HEAD(&iclog->ic_callbacks); iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; @@ -1546,8 +1550,7 @@ xlog_commit_record( if (XLOG_FORCED_SHUTDOWN(log)) return -EIO; - error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS, - false); + error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS); if (error) xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); return error; @@ -1753,10 +1756,10 @@ xlog_write_iclog( struct xlog *log, struct xlog_in_core *iclog, uint64_t bno, - unsigned int count, - bool need_flush) + unsigned int count) { ASSERT(bno < log->l_logBBsize); + trace_xlog_iclog_write(iclog, _RET_IP_); /* * We lock the iclogbufs here so that we can serialise against I/O @@ -1792,10 +1795,12 @@ xlog_write_iclog( * writeback throttle from throttling log writes behind background * metadata writeback and causing priority inversions. 
*/ - iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | - REQ_IDLE | REQ_FUA; - if (need_flush) + iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE; + if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH) iclog->ic_bio.bi_opf |= REQ_PREFLUSH; + if (iclog->ic_flags & XLOG_ICL_NEED_FUA) + iclog->ic_bio.bi_opf |= REQ_FUA; + iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) { xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); @@ -1854,29 +1859,15 @@ xlog_calc_iclog_size( uint32_t *roundoff) { uint32_t count_init, count; - bool use_lsunit; - - use_lsunit = xfs_sb_version_haslogv2(&log->l_mp->m_sb) && - log->l_mp->m_sb.sb_logsunit > 1; /* Add for LR header */ count_init = log->l_iclog_hsize + iclog->ic_offset; + count = roundup(count_init, log->l_iclog_roundoff); - /* Round out the log write size */ - if (use_lsunit) { - /* we have a v2 stripe unit to use */ - count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init)); - } else { - count = BBTOB(BTOBB(count_init)); - } - - ASSERT(count >= count_init); *roundoff = count - count_init; - if (use_lsunit) - ASSERT(*roundoff < log->l_mp->m_sb.sb_logsunit); - else - ASSERT(*roundoff < BBTOB(1)); + ASSERT(count >= count_init); + ASSERT(*roundoff < log->l_iclog_roundoff); return count; } @@ -1912,9 +1903,9 @@ xlog_sync( unsigned int roundoff; /* roundoff to BB or stripe */ uint64_t bno; unsigned int size; - bool need_flush = true, split = false; ASSERT(atomic_read(&iclog->ic_refcnt) == 0); + trace_xlog_iclog_sync(iclog, _RET_IP_); count = xlog_calc_iclog_size(log, iclog, &roundoff); @@ -1937,10 +1928,8 @@ xlog_sync( bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)); /* Do we need to split this write into 2 parts? 
*/ - if (bno + BTOBB(count) > log->l_logBBsize) { + if (bno + BTOBB(count) > log->l_logBBsize) xlog_split_iclog(log, &iclog->ic_header, bno, count); - split = true; - } /* calculcate the checksum */ iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, @@ -1961,22 +1950,8 @@ xlog_sync( be64_to_cpu(iclog->ic_header.h_lsn)); } #endif - - /* - * Flush the data device before flushing the log to make sure all meta - * data written back from the AIL actually made it to disk before - * stamping the new log tail LSN into the log buffer. For an external - * log we need to issue the flush explicitly, and unfortunately - * synchronously here; for an internal log we can simply use the block - * layer state machine for preflushes. - */ - if (log->l_targ != log->l_mp->m_ddev_targp || split) { - xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); - need_flush = false; - } - xlog_verify_iclog(log, iclog, count); - xlog_write_iclog(log, iclog, bno, count, need_flush); + xlog_write_iclog(log, iclog, bno, count); } /* @@ -2158,13 +2133,16 @@ static int xlog_write_calc_vec_length( struct xlog_ticket *ticket, struct xfs_log_vec *log_vector, - bool need_start_rec) + uint optype) { struct xfs_log_vec *lv; - int headers = need_start_rec ? 
1 : 0; + int headers = 0; int len = 0; int i; + if (optype & XLOG_START_TRANS) + headers++; + for (lv = log_vector; lv; lv = lv->lv_next) { /* we don't write ordered log vectors */ if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) @@ -2384,8 +2362,7 @@ xlog_write( struct xlog_ticket *ticket, xfs_lsn_t *start_lsn, struct xlog_in_core **commit_iclog, - uint flags, - bool need_start_rec) + uint optype) { struct xlog_in_core *iclog = NULL; struct xfs_log_vec *lv = log_vector; @@ -2413,8 +2390,9 @@ xlog_write( xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); } - len = xlog_write_calc_vec_length(ticket, log_vector, need_start_rec); - *start_lsn = 0; + len = xlog_write_calc_vec_length(ticket, log_vector, optype); + if (start_lsn) + *start_lsn = 0; while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { void *ptr; int log_offset; @@ -2427,8 +2405,8 @@ xlog_write( ASSERT(log_offset <= iclog->ic_size - 1); ptr = iclog->ic_datap + log_offset; - /* start_lsn is the first lsn written to. That's all we need. */ - if (!*start_lsn) + /* Start_lsn is the first lsn written to. */ + if (start_lsn && !*start_lsn) *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn); /* @@ -2441,6 +2419,7 @@ xlog_write( int copy_len; int copy_off; bool ordered = false; + bool wrote_start_rec = false; /* ordered log vectors have no regions to write */ if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) { @@ -2458,13 +2437,15 @@ xlog_write( * write a start record. Only do this for the first * iclog we write to. 
*/ - if (need_start_rec) { + if (optype & XLOG_START_TRANS) { xlog_write_start_rec(ptr, ticket); xlog_write_adv_cnt(&ptr, &len, &log_offset, sizeof(struct xlog_op_header)); + optype &= ~XLOG_START_TRANS; + wrote_start_rec = true; } - ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags); + ophdr = xlog_write_setup_ophdr(log, ptr, ticket, optype); if (!ophdr) return -EIO; @@ -2495,14 +2476,13 @@ xlog_write( } copy_len += sizeof(struct xlog_op_header); record_cnt++; - if (need_start_rec) { + if (wrote_start_rec) { copy_len += sizeof(struct xlog_op_header); record_cnt++; - need_start_rec = false; } data_cnt += contwr ? copy_len : 0; - error = xlog_write_copy_finish(log, iclog, flags, + error = xlog_write_copy_finish(log, iclog, optype, &record_cnt, &data_cnt, &partial_copy, &partial_copy_len, @@ -2546,7 +2526,7 @@ next_lv: spin_lock(&log->l_icloglock); xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); if (commit_iclog) { - ASSERT(flags & XLOG_COMMIT_TRANS); + ASSERT(optype & XLOG_COMMIT_TRANS); *commit_iclog = iclog; } else { error = xlog_state_release_iclog(log, iclog); @@ -2562,6 +2542,7 @@ xlog_state_activate_iclog( int *iclogs_changed) { ASSERT(list_empty_careful(&iclog->ic_callbacks)); + trace_xlog_iclog_activate(iclog, _RET_IP_); /* * If the number of ops in this iclog indicate it just contains the @@ -2626,6 +2607,7 @@ xlog_covered_state( case XLOG_STATE_COVER_IDLE: if (iclogs_changed == 1) return XLOG_STATE_COVER_IDLE; + fallthrough; case XLOG_STATE_COVER_NEED: case XLOG_STATE_COVER_NEED2: break; @@ -2651,6 +2633,8 @@ xlog_state_clean_iclog( { int iclogs_changed = 0; + trace_xlog_iclog_clean(dirty_iclog, _RET_IP_); + dirty_iclog->ic_state = XLOG_STATE_DIRTY; xlog_state_activate_iclogs(log, &iclogs_changed); @@ -2710,6 +2694,7 @@ xlog_state_set_callback( struct xlog_in_core *iclog, xfs_lsn_t header_lsn) { + trace_xlog_iclog_callback(iclog, _RET_IP_); iclog->ic_state = XLOG_STATE_CALLBACK; ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), @@ 
-2775,43 +2760,6 @@ xlog_state_iodone_process_iclog( } } -/* - * Keep processing entries in the iclog callback list until we come around and - * it is empty. We need to atomically see that the list is empty and change the - * state to DIRTY so that we don't miss any more callbacks being added. - * - * This function is called with the icloglock held and returns with it held. We - * drop it while running callbacks, however, as holding it over thousands of - * callbacks is unnecessary and causes excessive contention if we do. - */ -static void -xlog_state_do_iclog_callbacks( - struct xlog *log, - struct xlog_in_core *iclog) - __releases(&log->l_icloglock) - __acquires(&log->l_icloglock) -{ - spin_unlock(&log->l_icloglock); - spin_lock(&iclog->ic_callback_lock); - while (!list_empty(&iclog->ic_callbacks)) { - LIST_HEAD(tmp); - - list_splice_init(&iclog->ic_callbacks, &tmp); - - spin_unlock(&iclog->ic_callback_lock); - xlog_cil_process_committed(&tmp); - spin_lock(&iclog->ic_callback_lock); - } - - /* - * Pick up the icloglock while still holding the callback lock so we - * serialise against anyone trying to add more callbacks to this iclog - * now we've finished processing. - */ - spin_lock(&log->l_icloglock); - spin_unlock(&iclog->ic_callback_lock); -} - STATIC void xlog_state_do_callback( struct xlog *log) @@ -2840,6 +2788,8 @@ xlog_state_do_callback( repeats++; do { + LIST_HEAD(cb_list); + if (xlog_state_iodone_process_iclog(log, iclog, &ioerror)) break; @@ -2849,13 +2799,15 @@ xlog_state_do_callback( iclog = iclog->ic_next; continue; } + list_splice_init(&iclog->ic_callbacks, &cb_list); + spin_unlock(&log->l_icloglock); - /* - * Running callbacks will drop the icloglock which means - * we'll have to run at least one more complete loop. 
- */ + trace_xlog_iclog_callbacks_start(iclog, _RET_IP_); + xlog_cil_process_committed(&cb_list); + trace_xlog_iclog_callbacks_done(iclog, _RET_IP_); cycled_icloglock = true; - xlog_state_do_iclog_callbacks(log, iclog); + + spin_lock(&log->l_icloglock); if (XLOG_FORCED_SHUTDOWN(log)) wake_up_all(&iclog->ic_force_wait); else @@ -2901,6 +2853,7 @@ xlog_state_done_syncing( spin_lock(&log->l_icloglock); ASSERT(atomic_read(&iclog->ic_refcnt) == 0); + trace_xlog_iclog_sync_done(iclog, _RET_IP_); /* * If we got an error, either on the first buffer, or in the case of @@ -2974,6 +2927,8 @@ restart: atomic_inc(&iclog->ic_refcnt); /* prevents sync */ log_offset = iclog->ic_offset; + trace_xlog_iclog_get_space(iclog, _RET_IP_); + /* On the 1st write to an iclog, figure out lsn. This works * if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are * committing to. If the offset is set, that's how many blocks @@ -3139,6 +3094,7 @@ xlog_state_switch_iclogs( { ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); assert_spin_locked(&log->l_icloglock); + trace_xlog_iclog_switch(iclog, _RET_IP_); if (!eventual_size) eventual_size = iclog->ic_offset; @@ -3151,9 +3107,8 @@ xlog_state_switch_iclogs( log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize); /* Round up to next log-sunit */ - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && - log->l_mp->m_sb.sb_logsunit > 1) { - uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit); + if (log->l_iclog_roundoff > BBSIZE) { + uint32_t sunit_bb = BTOBB(log->l_iclog_roundoff); log->l_curr_block = roundup(log->l_curr_block, sunit_bb); } @@ -3222,6 +3177,8 @@ xfs_log_force( if (iclog->ic_state == XLOG_STATE_IOERROR) goto out_error; + trace_xlog_iclog_force(iclog, _RET_IP_); + if (iclog->ic_state == XLOG_STATE_DIRTY || (iclog->ic_state == XLOG_STATE_ACTIVE && atomic_read(&iclog->ic_refcnt) == 0 && iclog->ic_offset == 0)) { @@ -3280,14 +3237,13 @@ out_error: } static int -__xfs_log_force_lsn( - struct xfs_mount *mp, 
+xlog_force_lsn( + struct xlog *log, xfs_lsn_t lsn, uint flags, int *log_flushed, bool already_slept) { - struct xlog *log = mp->m_log; struct xlog_in_core *iclog; spin_lock(&log->l_icloglock); @@ -3296,6 +3252,7 @@ __xfs_log_force_lsn( goto out_error; while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { + trace_xlog_iclog_force_lsn(iclog, _RET_IP_); iclog = iclog->ic_next; if (iclog == log->l_iclog) goto out_unlock; @@ -3320,8 +3277,6 @@ __xfs_log_force_lsn( if (!already_slept && (iclog->ic_prev->ic_state == XLOG_STATE_WANT_SYNC || iclog->ic_prev->ic_state == XLOG_STATE_SYNCING)) { - XFS_STATS_INC(mp, xs_log_force_sleep); - xlog_wait(&iclog->ic_prev->ic_write_wait, &log->l_icloglock); return -EAGAIN; @@ -3359,25 +3314,29 @@ out_error: * to disk, that thread will wake up all threads waiting on the queue. */ int -xfs_log_force_lsn( +xfs_log_force_seq( struct xfs_mount *mp, - xfs_lsn_t lsn, + xfs_csn_t seq, uint flags, int *log_flushed) { + struct xlog *log = mp->m_log; + xfs_lsn_t lsn; int ret; - ASSERT(lsn != 0); + ASSERT(seq != 0); XFS_STATS_INC(mp, xs_log_force); - trace_xfs_log_force(mp, lsn, _RET_IP_); + trace_xfs_log_force(mp, seq, _RET_IP_); - lsn = xlog_cil_force_lsn(mp->m_log, lsn); + lsn = xlog_cil_force_seq(log, seq); if (lsn == NULLCOMMITLSN) return 0; - ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, false); - if (ret == -EAGAIN) - ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, true); + ret = xlog_force_lsn(log, lsn, flags, log_flushed, false); + if (ret == -EAGAIN) { + XFS_STATS_INC(mp, xs_log_force_sleep); + ret = xlog_force_lsn(log, lsn, flags, log_flushed, true); + } return ret; } @@ -3406,12 +3365,11 @@ xfs_log_ticket_get( * Figure out the total log space unit (in bytes) that would be * required for a log ticket. 
*/ -int -xfs_log_calc_unit_res( - struct xfs_mount *mp, +static int +xlog_calc_unit_res( + struct xlog *log, int unit_bytes) { - struct xlog *log = mp->m_log; int iclog_space; uint num_headers; @@ -3487,18 +3445,20 @@ xfs_log_calc_unit_res( /* for commit-rec LR header - note: padding will subsume the ophdr */ unit_bytes += log->l_iclog_hsize; - /* for roundoff padding for transaction data and one for commit record */ - if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) { - /* log su roundoff */ - unit_bytes += 2 * mp->m_sb.sb_logsunit; - } else { - /* BB roundoff */ - unit_bytes += 2 * BBSIZE; - } + /* roundoff padding for transaction data and one for commit record */ + unit_bytes += 2 * log->l_iclog_roundoff; return unit_bytes; } +int +xfs_log_calc_unit_res( + struct xfs_mount *mp, + int unit_bytes) +{ + return xlog_calc_unit_res(mp->m_log, unit_bytes); +} + /* * Allocate and initialise a new log ticket. */ @@ -3515,7 +3475,7 @@ xlog_ticket_alloc( tic = kmem_cache_zalloc(xfs_log_ticket_zone, GFP_NOFS | __GFP_NOFAIL); - unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes); + unit_res = xlog_calc_unit_res(log, unit_bytes); atomic_set(&tic->t_ref, 1); tic->t_task = current; diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 044e02cb8921..813b972e9788 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -106,7 +106,7 @@ struct xfs_item_ops; struct xfs_trans; int xfs_log_force(struct xfs_mount *mp, uint flags); -int xfs_log_force_lsn(struct xfs_mount *mp, xfs_lsn_t lsn, uint flags, +int xfs_log_force_seq(struct xfs_mount *mp, xfs_csn_t seq, uint flags, int *log_forced); int xfs_log_mount(struct xfs_mount *mp, struct xfs_buftarg *log_target, @@ -117,7 +117,6 @@ void xfs_log_mount_cancel(struct xfs_mount *); xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp); void xfs_log_space_wake(struct xfs_mount *mp); -void xfs_log_release_iclog(struct xlog_in_core *iclog); int 
xfs_log_reserve(struct xfs_mount *mp, int length, int count, @@ -132,8 +131,6 @@ bool xfs_log_writable(struct xfs_mount *mp); struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); -void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_lsn_t *commit_lsn, bool regrant); void xlog_cil_process_committed(struct list_head *list); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index b0ef071b3cb5..b128aaa9b870 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -656,6 +656,8 @@ xlog_cil_push_work( struct xfs_log_vec lvhdr = { NULL }; xfs_lsn_t commit_lsn; xfs_lsn_t push_seq; + struct bio bio; + DECLARE_COMPLETION_ONSTACK(bdev_flush); new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS); new_ctx->ticket = xlog_cil_ticket_alloc(log); @@ -668,9 +670,14 @@ xlog_cil_push_work( ASSERT(push_seq <= ctx->sequence); /* - * Wake up any background push waiters now this context is being pushed. + * As we are about to switch to a new, empty CIL context, we no longer + * need to throttle tasks on CIL space overruns. Wake any waiters that + * the hard push throttle may have caught so they can start committing + * to the new context. The ctx->xc_push_lock provides the serialisation + * necessary for safely using the lockless waitqueue_active() check in + * this context. */ - if (ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) + if (waitqueue_active(&cil->xc_push_wait)) wake_up_all(&cil->xc_push_wait); /* @@ -719,10 +726,19 @@ xlog_cil_push_work( spin_unlock(&cil->xc_push_lock); /* - * pull all the log vectors off the items in the CIL, and - * remove the items from the CIL. We don't need the CIL lock - * here because it's only needed on the transaction commit - * side which is currently locked out by the flush lock. 
+ * The CIL is stable at this point - nothing new will be added to it + * because we hold the flush lock exclusively. Hence we can now issue + * a cache flush to ensure all the completed metadata in the journal we + * are about to overwrite is on stable storage. + */ + xfs_flush_bdev_async(&bio, log->l_mp->m_ddev_targp->bt_bdev, + &bdev_flush); + + /* + * Pull all the log vectors off the items in the CIL, and remove the + * items from the CIL. We don't need the CIL lock here because it's only + * needed on the transaction commit side which is currently locked out + * by the flush lock. */ lv = NULL; num_iovecs = 0; @@ -772,7 +788,7 @@ xlog_cil_push_work( * that higher sequences will wait for us to write out a commit record * before they do. * - * xfs_log_force_lsn requires us to mirror the new sequence into the cil + * xfs_log_force_seq requires us to mirror the new sequence into the cil * structure atomically with the addition of this sequence to the * committing list. This also ensures that we can do unlocked checks * against the current sequence in log forces without risking @@ -806,7 +822,14 @@ xlog_cil_push_work( lvhdr.lv_iovecp = &lhdr; lvhdr.lv_next = ctx->lv_chain; - error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0, true); + /* + * Before we format and submit the first iclog, we have to ensure that + * the metadata writeback ordering cache flush is complete. + */ + wait_for_completion(&bdev_flush); + + error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, + XLOG_START_TRANS); if (error) goto out_abort_free_ticket; @@ -850,15 +873,21 @@ restart: xfs_log_ticket_ungrant(log, tic); - spin_lock(&commit_iclog->ic_callback_lock); + /* + * Once we attach the ctx to the iclog, a shutdown can process the + * iclog, run the callbacks and free the ctx. The only thing preventing + * this potential UAF situation here is that we are holding the + * icloglock. 
Hence we cannot access the ctx once we have attached the + * callbacks and dropped the icloglock. + */ + spin_lock(&log->l_icloglock); if (commit_iclog->ic_state == XLOG_STATE_IOERROR) { - spin_unlock(&commit_iclog->ic_callback_lock); + spin_unlock(&log->l_icloglock); goto out_abort; } ASSERT_ALWAYS(commit_iclog->ic_state == XLOG_STATE_ACTIVE || commit_iclog->ic_state == XLOG_STATE_WANT_SYNC); list_add_tail(&ctx->iclog_entry, &commit_iclog->ic_callbacks); - spin_unlock(&commit_iclog->ic_callback_lock); /* * now the checkpoint commit is complete and we've attached the @@ -870,8 +899,50 @@ restart: wake_up_all(&cil->xc_commit_wait); spin_unlock(&cil->xc_push_lock); - /* release the hounds! */ - xfs_log_release_iclog(commit_iclog); + /* + * If the checkpoint spans multiple iclogs, wait for all previous iclogs + * to complete before we submit the commit_iclog. We can't use state + * checks for this - ACTIVE can be either a past completed iclog or a + * future iclog being filled, while WANT_SYNC through DONE_SYNC can be a + * past or future iclog awaiting IO or ordered IO completion to be run. + * In the latter case, if it's a future iclog and we wait on it, then we + * will hang because it won't get processed through to ic_force_wait + * wakeup until this commit_iclog is written to disk. Hence we use the + * iclog header lsn and compare it to the commit lsn to determine if we + * need to wait on iclogs or not. + * + * NOTE: It is not safe to reference the ctx after this check as we drop + * the icloglock if we have to wait for completion of other iclogs. + */ + if (ctx->start_lsn != commit_lsn) { + xfs_lsn_t plsn; + + plsn = be64_to_cpu(commit_iclog->ic_prev->ic_header.h_lsn); + if (plsn && XFS_LSN_CMP(plsn, commit_lsn) < 0) { + /* + * Waiting on ic_force_wait orders the completion of + * iclogs older than ic_prev. Hence we only need to wait + * on the most recent older iclog here.
+ */ + xlog_wait_on_iclog(commit_iclog->ic_prev); + spin_lock(&log->l_icloglock); + } + + /* + * We need to issue a pre-flush so that the ordering for this + * checkpoint is correctly preserved down to stable storage. + */ + commit_iclog->ic_flags |= XLOG_ICL_NEED_FLUSH; + } + + /* + * The commit iclog must be written to stable storage to guarantee + * journal IO vs metadata writeback IO is correctly ordered on stable + * storage. + */ + commit_iclog->ic_flags |= XLOG_ICL_NEED_FUA; + xlog_state_release_iclog(log, commit_iclog); + spin_unlock(&log->l_icloglock); return; out_skip: @@ -907,7 +978,7 @@ xlog_cil_push_background( ASSERT(!list_empty(&cil->xc_cil)); /* - * don't do a background push if we haven't used up all the + * Don't do a background push if we haven't used up all the * space available yet. */ if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) { @@ -931,9 +1002,16 @@ xlog_cil_push_background( /* * If we are well over the space limit, throttle the work that is being - * done until the push work on this context has begun. + * done until the push work on this context has begun. Enforce the hard + * throttle on all transaction commits once it has been activated, even + * if the committing transactions have resulted in the space usage + * dipping back down under the hard limit. + * + * The ctx->xc_push_lock provides the serialisation necessary for safely + * using the lockless waitqueue_active() check in this context. */ - if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) { + if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log) || + waitqueue_active(&cil->xc_push_wait)) { trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket); ASSERT(cil->xc_ctx->space_used < log->l_logsize); xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock); @@ -1008,16 +1086,14 @@ xlog_cil_empty( * allowed again. 
*/ void -xfs_log_commit_cil( - struct xfs_mount *mp, +xlog_cil_commit( + struct xlog *log, struct xfs_trans *tp, - xfs_lsn_t *commit_lsn, + xfs_csn_t *commit_seq, bool regrant) { - struct xlog *log = mp->m_log; struct xfs_cil *cil = log->l_cilp; struct xfs_log_item *lip, *next; - xfs_lsn_t xc_commit_lsn; /* * Do all necessary memory allocation before we lock the CIL. @@ -1031,10 +1107,6 @@ xfs_log_commit_cil( xlog_cil_insert_items(log, tp); - xc_commit_lsn = cil->xc_ctx->sequence; - if (commit_lsn) - *commit_lsn = xc_commit_lsn; - if (regrant && !XLOG_FORCED_SHUTDOWN(log)) xfs_log_ticket_regrant(log, tp->t_ticket); else @@ -1057,8 +1129,10 @@ xfs_log_commit_cil( list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) { xfs_trans_del_item(lip); if (lip->li_ops->iop_committing) - lip->li_ops->iop_committing(lip, xc_commit_lsn); + lip->li_ops->iop_committing(lip, cil->xc_ctx->sequence); } + if (commit_seq) + *commit_seq = cil->xc_ctx->sequence; /* xlog_cil_push_background() releases cil->xc_ctx_lock */ xlog_cil_push_background(log); @@ -1075,9 +1149,9 @@ xfs_log_commit_cil( * iclog flush is necessary following this call. */ xfs_lsn_t -xlog_cil_force_lsn( +xlog_cil_force_seq( struct xlog *log, - xfs_lsn_t sequence) + xfs_csn_t sequence) { struct xfs_cil *cil = log->l_cilp; struct xfs_cil_ctx *ctx; @@ -1173,21 +1247,17 @@ bool xfs_log_item_in_current_chkpt( struct xfs_log_item *lip) { - struct xfs_cil_ctx *ctx; + struct xfs_cil_ctx *ctx = lip->li_mountp->m_log->l_cilp->xc_ctx; if (list_empty(&lip->li_cil)) return false; - ctx = lip->li_mountp->m_log->l_cilp->xc_ctx; - /* * li_seq is written on the first commit of a log item to record the * first checkpoint it is written to. Hence if it is different to the * current sequence, we're in a new checkpoint. 
*/ - if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0) - return false; - return true; + return lip->li_seq == ctx->sequence; } /* diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 1c6fdbf3d506..4c41bbfa33b0 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -50,6 +50,16 @@ enum xlog_iclog_state { XLOG_STATE_IOERROR, /* IO error happened in sync'ing log */ }; +#define XLOG_STATE_STRINGS \ + { XLOG_STATE_ACTIVE, "XLOG_STATE_ACTIVE" }, \ + { XLOG_STATE_WANT_SYNC, "XLOG_STATE_WANT_SYNC" }, \ + { XLOG_STATE_SYNCING, "XLOG_STATE_SYNCING" }, \ + { XLOG_STATE_DONE_SYNC, "XLOG_STATE_DONE_SYNC" }, \ + { XLOG_STATE_CALLBACK, "XLOG_STATE_CALLBACK" }, \ + { XLOG_STATE_DIRTY, "XLOG_STATE_DIRTY" }, \ + { XLOG_STATE_IOERROR, "XLOG_STATE_IOERROR" } + + /* * Log ticket flags */ @@ -133,6 +143,9 @@ enum xlog_iclog_state { #define XLOG_COVER_OPS 5 +#define XLOG_ICL_NEED_FLUSH (1 << 0) /* iclog needs REQ_PREFLUSH */ +#define XLOG_ICL_NEED_FUA (1 << 1) /* iclog needs REQ_FUA */ + /* Ticket reservation region accounting */ #define XLOG_TIC_LEN_MAX 15 @@ -201,10 +214,8 @@ typedef struct xlog_in_core { u32 ic_size; u32 ic_offset; enum xlog_iclog_state ic_state; + unsigned int ic_flags; char *ic_datap; /* pointer to iclog data */ - - /* Callback structures need their own cacheline */ - spinlock_t ic_callback_lock ____cacheline_aligned_in_smp; struct list_head ic_callbacks; /* reference counts need their own cacheline */ @@ -230,7 +241,7 @@ struct xfs_cil; struct xfs_cil_ctx { struct xfs_cil *cil; - xfs_lsn_t sequence; /* chkpt sequence # */ + xfs_csn_t sequence; /* chkpt sequence # */ xfs_lsn_t start_lsn; /* first LSN of chkpt commit */ xfs_lsn_t commit_lsn; /* chkpt commit record lsn */ struct xlog_ticket *ticket; /* chkpt ticket */ @@ -268,10 +279,10 @@ struct xfs_cil { struct xfs_cil_ctx *xc_ctx; spinlock_t xc_push_lock ____cacheline_aligned_in_smp; - xfs_lsn_t xc_push_seq; + xfs_csn_t xc_push_seq; struct list_head xc_committing; wait_queue_head_t 
xc_commit_wait; - xfs_lsn_t xc_current_sequence; + xfs_csn_t xc_current_sequence; struct work_struct xc_push_work; wait_queue_head_t xc_push_wait; /* background push throttle */ } ____cacheline_aligned_in_smp; @@ -436,6 +447,8 @@ struct xlog { #endif /* log recovery lsn tracking (for buffer submission */ xfs_lsn_t l_recovery_lsn; + + uint32_t l_iclog_roundoff;/* padding roundoff */ }; #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ @@ -478,13 +491,14 @@ void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); void xlog_print_trans(struct xfs_trans *); int xlog_write(struct xlog *log, struct xfs_log_vec *log_vector, struct xlog_ticket *tic, xfs_lsn_t *start_lsn, - struct xlog_in_core **commit_iclog, uint flags, - bool need_start_rec); + struct xlog_in_core **commit_iclog, uint optype); int xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket, struct xlog_in_core **iclog, xfs_lsn_t *lsn); void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket); void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket); +int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog); + /* * When we crack an atomic LSN, we sample it first so that the value will not * change while we are cracking it into the component values. 
This means we @@ -547,19 +561,18 @@ int xlog_cil_init(struct xlog *log); void xlog_cil_init_post_recovery(struct xlog *log); void xlog_cil_destroy(struct xlog *log); bool xlog_cil_empty(struct xlog *log); +void xlog_cil_commit(struct xlog *log, struct xfs_trans *tp, + xfs_csn_t *commit_seq, bool regrant); /* * CIL force routines */ -xfs_lsn_t -xlog_cil_force_lsn( - struct xlog *log, - xfs_lsn_t sequence); +xfs_lsn_t xlog_cil_force_seq(struct xlog *log, xfs_csn_t sequence); static inline void xlog_cil_force(struct xlog *log) { - xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence); + xlog_cil_force_seq(log, log->l_cilp->xc_current_sequence); } /* @@ -582,6 +595,8 @@ xlog_wait( remove_wait_queue(wq, &wait); } +int xlog_wait_on_iclog(struct xlog_in_core *iclog); + /* * The LSN is valid so long as it is behind the current LSN. If it isn't, this * means that the next log record that includes this metadata could have a diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index e5dd1c0c2f03..1721fce2ec94 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -25,6 +25,7 @@ #include "xfs_icache.h" #include "xfs_error.h" #include "xfs_buf_item.h" +#include "xfs_ag.h" #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) @@ -2457,8 +2458,10 @@ xlog_finish_defer_ops( error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres, dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp); - if (error) + if (error) { + xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); return error; + } /* * Transfer to this new transaction all the dfops we captured @@ -2741,21 +2744,17 @@ STATIC void xlog_recover_process_iunlinks( struct xlog *log) { - xfs_mount_t *mp; - xfs_agnumber_t agno; - xfs_agi_t *agi; - struct xfs_buf *agibp; - xfs_agino_t agino; - int bucket; - int error; - - mp = log->l_mp; + struct xfs_mount *mp = log->l_mp; + struct xfs_perag *pag; + xfs_agnumber_t agno; + struct xfs_agi *agi; + struct xfs_buf *agibp; + xfs_agino_t agino; + int bucket; + int error; - for (agno = 0; agno 
< mp->m_sb.sb_agcount; agno++) { - /* - * Find the agi for this ag. - */ - error = xfs_read_agi(mp, NULL, agno, &agibp); + for_each_perag(mp, agno, pag) { + error = xfs_read_agi(mp, NULL, pag->pag_agno, &agibp); if (error) { /* * AGI is b0rked. Don't process it. @@ -2781,7 +2780,7 @@ xlog_recover_process_iunlinks( agino = be32_to_cpu(agi->agi_unlinked[bucket]); while (agino != NULLAGINO) { agino = xlog_recover_process_one_iunlink(mp, - agno, agino, bucket); + pag->pag_agno, agino, bucket); cond_resched(); } } @@ -3452,6 +3451,7 @@ xlog_recover_finish( * this) before we get around to xfs_log_mount_cancel. */ xlog_recover_cancel_intents(log); + xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); xfs_alert(log->l_mp, "Failed to recover intents"); return error; } @@ -3493,27 +3493,28 @@ xlog_recover_cancel( */ STATIC void xlog_recover_check_summary( - struct xlog *log) + struct xlog *log) { - xfs_mount_t *mp; - struct xfs_buf *agfbp; - struct xfs_buf *agibp; - xfs_agnumber_t agno; - uint64_t freeblks; - uint64_t itotal; - uint64_t ifree; - int error; + struct xfs_mount *mp = log->l_mp; + struct xfs_perag *pag; + struct xfs_buf *agfbp; + struct xfs_buf *agibp; + xfs_agnumber_t agno; + uint64_t freeblks; + uint64_t itotal; + uint64_t ifree; + int error; mp = log->l_mp; freeblks = 0LL; itotal = 0LL; ifree = 0LL; - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - error = xfs_read_agf(mp, NULL, agno, 0, &agfbp); + for_each_perag(mp, agno, pag) { + error = xfs_read_agf(mp, NULL, pag->pag_agno, 0, &agfbp); if (error) { xfs_alert(mp, "%s agf read failed agno %d error %d", - __func__, agno, error); + __func__, pag->pag_agno, error); } else { struct xfs_agf *agfp = agfbp->b_addr; @@ -3522,10 +3523,10 @@ xlog_recover_check_summary( xfs_buf_relse(agfbp); } - error = xfs_read_agi(mp, NULL, agno, &agibp); + error = xfs_read_agi(mp, NULL, pag->pag_agno, &agibp); if (error) { xfs_alert(mp, "%s agi read failed agno %d error %d", - __func__, agno, error); + __func__, 
pag->pag_agno, error); } else { struct xfs_agi *agi = agibp->b_addr; diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index 7ec1a9207517..bb9860ec9a93 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -2,6 +2,8 @@ #ifndef __XFS_MESSAGE_H #define __XFS_MESSAGE_H 1 +#include <linux/once_lite.h> + struct xfs_mount; extern __printf(2, 3) @@ -41,16 +43,7 @@ do { \ } while (0) #define xfs_printk_once(func, dev, fmt, ...) \ -({ \ - static bool __section(".data.once") __print_once; \ - bool __ret_print_once = !__print_once; \ - \ - if (!__print_once) { \ - __print_once = true; \ - func(dev, fmt, ##__VA_ARGS__); \ - } \ - unlikely(__ret_print_once); \ -}) + DO_ONCE_LITE(func, dev, fmt, ##__VA_ARGS__) #define xfs_emerg_ratelimited(dev, fmt, ...) \ xfs_printk_ratelimited(xfs_emerg, dev, fmt, ##__VA_ARGS__) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index bdfee1943796..d0755494597f 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -32,6 +32,7 @@ #include "xfs_extent_busy.h" #include "xfs_health.h" #include "xfs_trace.h" +#include "xfs_ag.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; @@ -119,41 +120,6 @@ xfs_uuid_unmount( mutex_unlock(&xfs_uuid_table_mutex); } - -STATIC void -__xfs_free_perag( - struct rcu_head *head) -{ - struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); - - ASSERT(!delayed_work_pending(&pag->pag_blockgc_work)); - ASSERT(atomic_read(&pag->pag_ref) == 0); - kmem_free(pag); -} - -/* - * Free up the per-ag resources associated with the mount structure. 
- */ -STATIC void -xfs_free_perag( - xfs_mount_t *mp) -{ - xfs_agnumber_t agno; - struct xfs_perag *pag; - - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - spin_lock(&mp->m_perag_lock); - pag = radix_tree_delete(&mp->m_perag_tree, agno); - spin_unlock(&mp->m_perag_lock); - ASSERT(pag); - ASSERT(atomic_read(&pag->pag_ref) == 0); - cancel_delayed_work_sync(&pag->pag_blockgc_work); - xfs_iunlink_destroy(pag); - xfs_buf_hash_destroy(pag); - call_rcu(&pag->rcu_head, __xfs_free_perag); - } -} - /* * Check size of device based on the (data/realtime) block count. * Note: this check is used by the growfs code as well as mount. @@ -172,96 +138,6 @@ xfs_sb_validate_fsb_count( return 0; } -int -xfs_initialize_perag( - xfs_mount_t *mp, - xfs_agnumber_t agcount, - xfs_agnumber_t *maxagi) -{ - xfs_agnumber_t index; - xfs_agnumber_t first_initialised = NULLAGNUMBER; - xfs_perag_t *pag; - int error = -ENOMEM; - - /* - * Walk the current per-ag tree so we don't try to initialise AGs - * that already exist (growfs case). Allocate and insert all the - * AGs we don't find ready for initialisation. 
- */ - for (index = 0; index < agcount; index++) { - pag = xfs_perag_get(mp, index); - if (pag) { - xfs_perag_put(pag); - continue; - } - - pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); - if (!pag) { - error = -ENOMEM; - goto out_unwind_new_pags; - } - pag->pag_agno = index; - pag->pag_mount = mp; - spin_lock_init(&pag->pag_ici_lock); - INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker); - INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); - - error = xfs_buf_hash_init(pag); - if (error) - goto out_free_pag; - init_waitqueue_head(&pag->pagb_wait); - spin_lock_init(&pag->pagb_lock); - pag->pagb_count = 0; - pag->pagb_tree = RB_ROOT; - - error = radix_tree_preload(GFP_NOFS); - if (error) - goto out_hash_destroy; - - spin_lock(&mp->m_perag_lock); - if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { - WARN_ON_ONCE(1); - spin_unlock(&mp->m_perag_lock); - radix_tree_preload_end(); - error = -EEXIST; - goto out_hash_destroy; - } - spin_unlock(&mp->m_perag_lock); - radix_tree_preload_end(); - /* first new pag is fully initialized */ - if (first_initialised == NULLAGNUMBER) - first_initialised = index; - error = xfs_iunlink_init(pag); - if (error) - goto out_hash_destroy; - spin_lock_init(&pag->pag_state_lock); - } - - index = xfs_set_inode_alloc(mp, agcount); - - if (maxagi) - *maxagi = index; - - mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp); - return 0; - -out_hash_destroy: - xfs_buf_hash_destroy(pag); -out_free_pag: - kmem_free(pag); -out_unwind_new_pags: - /* unwind any prior newly initialized pags */ - for (index = first_initialised; index < agcount; index++) { - pag = radix_tree_delete(&mp->m_perag_tree, index); - if (!pag) - break; - xfs_buf_hash_destroy(pag); - xfs_iunlink_destroy(pag); - kmem_free(pag); - } - return error; -} - /* * xfs_readsb * @@ -983,9 +859,17 @@ xfs_mountfs( /* * Finish recovering the file system. This part needed to be delayed * until after the root and real-time bitmap inodes were consistently - * read in. + * read in. 
Temporarily create per-AG space reservations for metadata + * btree shape changes because space freeing transactions (for inode + * inactivation) require the per-AG reservation in lieu of reserving + * blocks. */ + error = xfs_fs_reserve_ag_blocks(mp); + if (error && error == -ENOSPC) + xfs_warn(mp, + "ENOSPC reserving per-AG metadata pool, log recovery may fail."); error = xfs_log_mount_finish(mp); + xfs_fs_unreserve_ag_blocks(mp); if (error) { xfs_warn(mp, "log mount finish failed"); goto out_rtunmount; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index bb67274ee23f..c78b63fe779a 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -12,6 +12,7 @@ struct xfs_mru_cache; struct xfs_ail; struct xfs_quotainfo; struct xfs_da_geometry; +struct xfs_perag; /* dynamic preallocation free space thresholds, 5% down to 1% */ enum { @@ -297,117 +298,12 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks); } -/* per-AG block reservation data structures*/ -struct xfs_ag_resv { - /* number of blocks originally reserved here */ - xfs_extlen_t ar_orig_reserved; - /* number of blocks reserved here */ - xfs_extlen_t ar_reserved; - /* number of blocks originally asked for */ - xfs_extlen_t ar_asked; -}; - -/* - * Per-ag incore structure, copies of information in agf and agi, to improve the - * performance of allocation group selection. 
- */ -typedef struct xfs_perag { - struct xfs_mount *pag_mount; /* owner filesystem */ - xfs_agnumber_t pag_agno; /* AG this structure belongs to */ - atomic_t pag_ref; /* perag reference count */ - char pagf_init; /* this agf's entry is initialized */ - char pagi_init; /* this agi's entry is initialized */ - char pagf_metadata; /* the agf is preferred to be metadata */ - char pagi_inodeok; /* The agi is ok for inodes */ - uint8_t pagf_levels[XFS_BTNUM_AGF]; - /* # of levels in bno & cnt btree */ - bool pagf_agflreset; /* agfl requires reset before use */ - uint32_t pagf_flcount; /* count of blocks in freelist */ - xfs_extlen_t pagf_freeblks; /* total free blocks */ - xfs_extlen_t pagf_longest; /* longest free space */ - uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */ - xfs_agino_t pagi_freecount; /* number of free inodes */ - xfs_agino_t pagi_count; /* number of allocated inodes */ - - /* - * Inode allocation search lookup optimisation. - * If the pagino matches, the search for new inodes - * doesn't need to search the near ones again straight away - */ - xfs_agino_t pagl_pagino; - xfs_agino_t pagl_leftrec; - xfs_agino_t pagl_rightrec; - - /* - * Bitsets of per-ag metadata that have been checked and/or are sick. - * Callers should hold pag_state_lock before accessing this field. 
- */ - uint16_t pag_checked; - uint16_t pag_sick; - spinlock_t pag_state_lock; - - spinlock_t pagb_lock; /* lock for pagb_tree */ - struct rb_root pagb_tree; /* ordered tree of busy extents */ - unsigned int pagb_gen; /* generation count for pagb_tree */ - wait_queue_head_t pagb_wait; /* woken when pagb_gen changes */ - - atomic_t pagf_fstrms; /* # of filestreams active in this AG */ - - spinlock_t pag_ici_lock; /* incore inode cache lock */ - struct radix_tree_root pag_ici_root; /* incore inode cache root */ - int pag_ici_reclaimable; /* reclaimable inodes */ - unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ - - /* buffer cache index */ - spinlock_t pag_buf_lock; /* lock for pag_buf_hash */ - struct rhashtable pag_buf_hash; - - /* for rcu-safe freeing */ - struct rcu_head rcu_head; - int pagb_count; /* pagb slots in use */ - - /* Blocks reserved for all kinds of metadata. */ - struct xfs_ag_resv pag_meta_resv; - /* Blocks reserved for the reverse mapping btree. */ - struct xfs_ag_resv pag_rmapbt_resv; - - /* background prealloc block trimming */ - struct delayed_work pag_blockgc_work; - - /* reference count */ - uint8_t pagf_refcount_level; - - /* - * Unlinked inode information. This incore information reflects - * data stored in the AGI, so callers must hold the AGI buffer lock - * or have some other means to control concurrency. 
- */ - struct rhashtable pagi_unlinked_hash; -} xfs_perag_t; - -static inline struct xfs_ag_resv * -xfs_perag_resv( - struct xfs_perag *pag, - enum xfs_ag_resv_type type) -{ - switch (type) { - case XFS_AG_RESV_METADATA: - return &pag->pag_meta_resv; - case XFS_AG_RESV_RMAPBT: - return &pag->pag_rmapbt_resv; - default: - return NULL; - } -} - -int xfs_buf_hash_init(xfs_perag_t *pag); -void xfs_buf_hash_destroy(xfs_perag_t *pag); +int xfs_buf_hash_init(struct xfs_perag *pag); +void xfs_buf_hash_destroy(struct xfs_perag *pag); extern void xfs_uuid_table_free(void); extern uint64_t xfs_default_resblks(xfs_mount_t *mp); extern int xfs_mountfs(xfs_mount_t *mp); -extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount, - xfs_agnumber_t *maxagi); extern void xfs_unmountfs(xfs_mount_t *); extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 4bf949a89d0d..fe341f3fd419 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -23,6 +23,8 @@ #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_error.h" +#include "xfs_ag.h" +#include "xfs_ialloc.h" /* * The global quota manager. 
There is only one of these for the entire @@ -787,8 +789,12 @@ xfs_qm_qino_alloc( return error; if (need_alloc) { - error = xfs_dir_ialloc(&init_user_ns, &tp, NULL, S_IFREG, 1, 0, - 0, false, ipp); + xfs_ino_t ino; + + error = xfs_dialloc(&tp, 0, S_IFREG, &ino); + if (!error) + error = xfs_init_new_inode(&init_user_ns, tp, NULL, ino, + S_IFREG, 1, 0, 0, false, ipp); if (error) { xfs_trans_cancel(tp); return error; diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index e3dabab44097..ebbb484c49dc 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -142,7 +142,6 @@ extern void xfs_qm_destroy_quotainfo(struct xfs_mount *); /* dquot stuff */ extern void xfs_qm_dqpurge_all(struct xfs_mount *, uint); -extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint); /* quota ops */ extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint); diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 11f1e2fbf22f..13a56e1ea15c 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -201,7 +201,8 @@ xfs_qm_scall_quotaoff( * depend on the quota inodes (and other things) being valid as long as * we keep the lock(s). */ - xfs_qm_dqrele_all_inodes(mp, flags); + error = xfs_dqrele_all_inodes(mp, flags); + ASSERT(!error); /* * Next we make the changes in the quota flag in the mount struct. 
@@ -747,54 +748,3 @@ xfs_qm_scall_getquota_next( xfs_qm_dqput(dqp); return error; } - -STATIC int -xfs_dqrele_inode( - struct xfs_inode *ip, - void *args) -{ - uint *flags = args; - - /* skip quota inodes */ - if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || - ip == ip->i_mount->m_quotainfo->qi_gquotaip || - ip == ip->i_mount->m_quotainfo->qi_pquotaip) { - ASSERT(ip->i_udquot == NULL); - ASSERT(ip->i_gdquot == NULL); - ASSERT(ip->i_pdquot == NULL); - return 0; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - if ((*flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { - xfs_qm_dqrele(ip->i_udquot); - ip->i_udquot = NULL; - } - if ((*flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) { - xfs_qm_dqrele(ip->i_gdquot); - ip->i_gdquot = NULL; - } - if ((*flags & XFS_PQUOTA_ACCT) && ip->i_pdquot) { - xfs_qm_dqrele(ip->i_pdquot); - ip->i_pdquot = NULL; - } - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return 0; -} - - -/* - * Go thru all the inodes in the file system, releasing their dquots. - * - * Note that the mount structure gets modified to indicate that quotas are off - * AFTER this, in the case of quotaoff. 
- */ -void -xfs_qm_dqrele_all_inodes( - struct xfs_mount *mp, - uint flags) -{ - ASSERT(mp->m_quotainfo); - xfs_inode_walk(mp, XFS_INODE_WALK_INEW_WAIT, xfs_dqrele_inode, - &flags, XFS_ICI_NO_TAG); -} diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 060695d6d56a..c256104772cb 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -27,7 +27,7 @@ #include "xfs_quota.h" #include "xfs_reflink.h" #include "xfs_iomap.h" -#include "xfs_sb.h" +#include "xfs_ag.h" #include "xfs_ag_resv.h" /* @@ -144,7 +144,7 @@ xfs_reflink_find_shared( if (error) return error; - cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno); + cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agbp->b_pag); error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen, find_end_of_shared); @@ -755,16 +755,19 @@ int xfs_reflink_recover_cow( struct xfs_mount *mp) { + struct xfs_perag *pag; xfs_agnumber_t agno; int error = 0; if (!xfs_sb_version_hasreflink(&mp->m_sb)) return 0; - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - error = xfs_refcount_recover_cow_leftovers(mp, agno); - if (error) + for_each_perag(mp, agno, pag) { + error = xfs_refcount_recover_cow_leftovers(mp, pag); + if (error) { + xfs_perag_put(pag); break; + } } return error; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index a2dab05332ac..2c9e26a44546 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -36,6 +36,7 @@ #include "xfs_bmap_item.h" #include "xfs_reflink.h" #include "xfs_pwork.h" +#include "xfs_ag.h" #include <linux/magic.h> #include <linux/fs_context.h> @@ -339,13 +340,6 @@ xfs_blkdev_put( blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } -void -xfs_blkdev_issue_flush( - xfs_buftarg_t *buftarg) -{ - blkdev_issue_flush(buftarg->bt_bdev); -} - STATIC void xfs_close_devices( struct xfs_mount *mp) @@ -667,7 +661,7 @@ xfs_fs_destroy_inode( * reclaim path handles this more efficiently than we can here, so * simply let background reclaim tear down all inodes. 
*/ - xfs_inode_set_reclaim_tag(ip); + xfs_inode_mark_reclaimable(ip); } static void diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index d2b40dc60dfc..167d23f92ffe 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -87,7 +87,6 @@ struct xfs_buftarg; struct block_device; extern void xfs_flush_inodes(struct xfs_mount *mp); -extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *, xfs_agnumber_t agcount); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 99fbec32c10a..1525636f4065 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -21,6 +21,7 @@ #include "xfs_trans_space.h" #include "xfs_trace.h" #include "xfs_trans.h" +#include "xfs_ialloc.h" /* ----- Kernel only functions below ----- */ int @@ -161,6 +162,7 @@ xfs_symlink( struct xfs_dquot *gdqp = NULL; struct xfs_dquot *pdqp = NULL; uint resblks; + xfs_ino_t ino; *ipp = NULL; @@ -223,8 +225,11 @@ xfs_symlink( /* * Allocate an inode for the symlink. 
*/ - error = xfs_dir_ialloc(mnt_userns, &tp, dp, S_IFLNK | (mode & ~S_IFMT), - 1, 0, prid, false, &ip); + error = xfs_dialloc(&tp, dp->i_ino, S_IFLNK, &ino); + if (!error) + error = xfs_init_new_inode(mnt_userns, tp, dp, ino, + S_IFLNK | (mode & ~S_IFMT), 1, 0, prid, + false, &ip); if (error) goto out_trans_cancel; diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 9b8d703dc9fd..7e01e00550ac 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -30,6 +30,8 @@ #include "xfs_fsmap.h" #include "xfs_btree_staging.h" #include "xfs_icache.h" +#include "xfs_ag.h" +#include "xfs_ag_resv.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 808ae337b222..f9d8d605f9b1 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -24,6 +24,7 @@ struct xlog_ticket; struct xlog_recover; struct xlog_recover_item; struct xlog_rec_header; +struct xlog_in_core; struct xfs_buf_log_format; struct xfs_inode_log_format; struct xfs_bmbt_irec; @@ -37,7 +38,7 @@ struct xfs_trans_res; struct xfs_inobt_rec_incore; union xfs_btree_ptr; struct xfs_dqtrx; -struct xfs_eofblocks; +struct xfs_icwalk; #define XFS_ATTR_FILTER_FLAGS \ { XFS_ATTR_ROOT, "ROOT" }, \ @@ -153,10 +154,8 @@ DEFINE_EVENT(xfs_perag_class, name, \ DEFINE_PERAG_REF_EVENT(xfs_perag_get); DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_put); -DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); -DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); -DEFINE_PERAG_REF_EVENT(xfs_perag_set_blockgc); -DEFINE_PERAG_REF_EVENT(xfs_perag_clear_blockgc); +DEFINE_PERAG_REF_EVENT(xfs_perag_set_inode_tag); +DEFINE_PERAG_REF_EVENT(xfs_perag_clear_inode_tag); DECLARE_EVENT_CLASS(xfs_ag_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), @@ -632,8 +631,8 @@ DEFINE_EVENT(xfs_inode_class, name, \ TP_PROTO(struct xfs_inode *ip), \ TP_ARGS(ip)) DEFINE_INODE_EVENT(xfs_iget_skip); -DEFINE_INODE_EVENT(xfs_iget_reclaim); 
-DEFINE_INODE_EVENT(xfs_iget_reclaim_fail); +DEFINE_INODE_EVENT(xfs_iget_recycle); +DEFINE_INODE_EVENT(xfs_iget_recycle_fail); DEFINE_INODE_EVENT(xfs_iget_hit); DEFINE_INODE_EVENT(xfs_iget_miss); @@ -1914,7 +1913,6 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_add); DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old); DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new); DEFINE_ATTR_EVENT(xfs_attr_leaf_add_work); -DEFINE_ATTR_EVENT(xfs_attr_leaf_addname); DEFINE_ATTR_EVENT(xfs_attr_leaf_create); DEFINE_ATTR_EVENT(xfs_attr_leaf_compact); DEFINE_ATTR_EVENT(xfs_attr_leaf_get); @@ -1944,7 +1942,6 @@ DEFINE_ATTR_EVENT(xfs_attr_refillstate); DEFINE_ATTR_EVENT(xfs_attr_rmtval_get); DEFINE_ATTR_EVENT(xfs_attr_rmtval_set); -DEFINE_ATTR_EVENT(xfs_attr_rmtval_remove); #define DEFINE_DA_EVENT(name) \ DEFINE_EVENT(xfs_da_class, name, \ @@ -3730,7 +3727,7 @@ TRACE_EVENT(xfs_btree_commit_afakeroot, TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; __entry->btnum = cur->bc_btnum; - __entry->agno = cur->bc_ag.agno; + __entry->agno = cur->bc_ag.pag->pag_agno; __entry->agbno = cur->bc_ag.afake->af_root; __entry->levels = cur->bc_ag.afake->af_levels; __entry->blocks = cur->bc_ag.afake->af_blocks; @@ -3845,7 +3842,7 @@ TRACE_EVENT(xfs_btree_bload_block, __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsb); __entry->agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsb); } else { - __entry->agno = cur->bc_ag.agno; + __entry->agno = cur->bc_ag.pag->pag_agno; __entry->agbno = be32_to_cpu(ptr->s); } __entry->nr_records = nr_records; @@ -3887,10 +3884,10 @@ DEFINE_EVENT(xfs_timestamp_range_class, name, \ DEFINE_TIMESTAMP_RANGE_EVENT(xfs_inode_timestamp_range); DEFINE_TIMESTAMP_RANGE_EVENT(xfs_quota_expiry_range); -DECLARE_EVENT_CLASS(xfs_eofblocks_class, - TP_PROTO(struct xfs_mount *mp, struct xfs_eofblocks *eofb, +DECLARE_EVENT_CLASS(xfs_icwalk_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_icwalk *icw, unsigned long caller_ip), - TP_ARGS(mp, eofb, caller_ip), + TP_ARGS(mp, icw, caller_ip), TP_STRUCT__entry( __field(dev_t, 
dev) __field(__u32, flags) @@ -3898,35 +3895,97 @@ DECLARE_EVENT_CLASS(xfs_eofblocks_class, __field(uint32_t, gid) __field(prid_t, prid) __field(__u64, min_file_size) + __field(long, scan_limit) __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; - __entry->flags = eofb ? eofb->eof_flags : 0; - __entry->uid = eofb ? from_kuid(mp->m_super->s_user_ns, - eofb->eof_uid) : 0; - __entry->gid = eofb ? from_kgid(mp->m_super->s_user_ns, - eofb->eof_gid) : 0; - __entry->prid = eofb ? eofb->eof_prid : 0; - __entry->min_file_size = eofb ? eofb->eof_min_file_size : 0; + __entry->flags = icw ? icw->icw_flags : 0; + __entry->uid = icw ? from_kuid(mp->m_super->s_user_ns, + icw->icw_uid) : 0; + __entry->gid = icw ? from_kgid(mp->m_super->s_user_ns, + icw->icw_gid) : 0; + __entry->prid = icw ? icw->icw_prid : 0; + __entry->min_file_size = icw ? icw->icw_min_file_size : 0; + __entry->scan_limit = icw ? icw->icw_scan_limit : 0; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d flags 0x%x uid %u gid %u prid %u minsize %llu caller %pS", + TP_printk("dev %d:%d flags 0x%x uid %u gid %u prid %u minsize %llu scan_limit %ld caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->flags, __entry->uid, __entry->gid, __entry->prid, __entry->min_file_size, + __entry->scan_limit, (char *)__entry->caller_ip) ); -#define DEFINE_EOFBLOCKS_EVENT(name) \ -DEFINE_EVENT(xfs_eofblocks_class, name, \ - TP_PROTO(struct xfs_mount *mp, struct xfs_eofblocks *eofb, \ +#define DEFINE_ICWALK_EVENT(name) \ +DEFINE_EVENT(xfs_icwalk_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_icwalk *icw, \ unsigned long caller_ip), \ - TP_ARGS(mp, eofb, caller_ip)) -DEFINE_EOFBLOCKS_EVENT(xfs_ioc_free_eofblocks); -DEFINE_EOFBLOCKS_EVENT(xfs_blockgc_free_space); + TP_ARGS(mp, icw, caller_ip)) +DEFINE_ICWALK_EVENT(xfs_ioc_free_eofblocks); +DEFINE_ICWALK_EVENT(xfs_blockgc_free_space); + +TRACE_DEFINE_ENUM(XLOG_STATE_ACTIVE); +TRACE_DEFINE_ENUM(XLOG_STATE_WANT_SYNC); 
+TRACE_DEFINE_ENUM(XLOG_STATE_SYNCING); +TRACE_DEFINE_ENUM(XLOG_STATE_DONE_SYNC); +TRACE_DEFINE_ENUM(XLOG_STATE_CALLBACK); +TRACE_DEFINE_ENUM(XLOG_STATE_DIRTY); +TRACE_DEFINE_ENUM(XLOG_STATE_IOERROR); + +DECLARE_EVENT_CLASS(xlog_iclog_class, + TP_PROTO(struct xlog_in_core *iclog, unsigned long caller_ip), + TP_ARGS(iclog, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(uint32_t, state) + __field(int32_t, refcount) + __field(uint32_t, offset) + __field(unsigned long long, lsn) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = iclog->ic_log->l_mp->m_super->s_dev; + __entry->state = iclog->ic_state; + __entry->refcount = atomic_read(&iclog->ic_refcnt); + __entry->offset = iclog->ic_offset; + __entry->lsn = be64_to_cpu(iclog->ic_header.h_lsn); + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d state %s refcnt %d offset %u lsn 0x%llx caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->state, XLOG_STATE_STRINGS), + __entry->refcount, + __entry->offset, + __entry->lsn, + (char *)__entry->caller_ip) + +); + +#define DEFINE_ICLOG_EVENT(name) \ +DEFINE_EVENT(xlog_iclog_class, name, \ + TP_PROTO(struct xlog_in_core *iclog, unsigned long caller_ip), \ + TP_ARGS(iclog, caller_ip)) + +DEFINE_ICLOG_EVENT(xlog_iclog_activate); +DEFINE_ICLOG_EVENT(xlog_iclog_clean); +DEFINE_ICLOG_EVENT(xlog_iclog_callback); +DEFINE_ICLOG_EVENT(xlog_iclog_callbacks_start); +DEFINE_ICLOG_EVENT(xlog_iclog_callbacks_done); +DEFINE_ICLOG_EVENT(xlog_iclog_force); +DEFINE_ICLOG_EVENT(xlog_iclog_force_lsn); +DEFINE_ICLOG_EVENT(xlog_iclog_get_space); +DEFINE_ICLOG_EVENT(xlog_iclog_release); +DEFINE_ICLOG_EVENT(xlog_iclog_switch); +DEFINE_ICLOG_EVENT(xlog_iclog_sync); +DEFINE_ICLOG_EVENT(xlog_iclog_syncing); +DEFINE_ICLOG_EVENT(xlog_iclog_sync_done); +DEFINE_ICLOG_EVENT(xlog_iclog_want_sync); +DEFINE_ICLOG_EVENT(xlog_iclog_wait_on); +DEFINE_ICLOG_EVENT(xlog_iclog_write); #endif /* _TRACE_XFS_H */ diff --git 
a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 586f2992b789..87bffd12c20c 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -839,7 +839,7 @@ __xfs_trans_commit( bool regrant) { struct xfs_mount *mp = tp->t_mountp; - xfs_lsn_t commit_lsn = -1; + xfs_csn_t commit_seq = 0; int error = 0; int sync = tp->t_flags & XFS_TRANS_SYNC; @@ -881,7 +881,7 @@ __xfs_trans_commit( xfs_trans_apply_sb_deltas(tp); xfs_trans_apply_dquot_deltas(tp); - xfs_log_commit_cil(mp, tp, &commit_lsn, regrant); + xlog_cil_commit(mp->m_log, tp, &commit_seq, regrant); xfs_trans_free(tp); @@ -890,7 +890,7 @@ __xfs_trans_commit( * log out now and wait for it. */ if (sync) { - error = xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL); + error = xfs_log_force_seq(mp, commit_seq, XFS_LOG_SYNC, NULL); XFS_STATS_INC(mp, xs_trans_sync); } else { XFS_STATS_INC(mp, xs_trans_async); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index ee42d98d9011..50da47f23a07 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -43,7 +43,7 @@ struct xfs_log_item { struct list_head li_cil; /* CIL pointers */ struct xfs_log_vec *li_lv; /* active log vector */ struct xfs_log_vec *li_lv_shadow; /* standby vector */ - xfs_lsn_t li_seq; /* CIL commit seq */ + xfs_csn_t li_seq; /* CIL commit seq */ }; /* @@ -69,7 +69,7 @@ struct xfs_item_ops { void (*iop_pin)(struct xfs_log_item *); void (*iop_unpin)(struct xfs_log_item *, int remove); uint (*iop_push)(struct xfs_log_item *, struct list_head *); - void (*iop_committing)(struct xfs_log_item *, xfs_lsn_t commit_lsn); + void (*iop_committing)(struct xfs_log_item *lip, xfs_csn_t seq); void (*iop_release)(struct xfs_log_item *); xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t); int (*iop_recover)(struct xfs_log_item *lip, diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 9aced0a00003..d11d032da0b4 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -294,7 +294,7 @@ xfs_trans_read_buf_map( default: if (tp && 
(tp->t_flags & XFS_TRANS_DIRTY)) xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); - /* fall through */ + fallthrough; case -ENOMEM: case -EAGAIN: return error; diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index cd145d318b17..dbf03635869c 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -5,7 +5,7 @@ * Copyright (C) 2019 Western Digital Corporation or its affiliates. */ #include <linux/module.h> -#include <linux/fs.h> +#include <linux/pagemap.h> #include <linux/magic.h> #include <linux/iomap.h> #include <linux/init.h> @@ -185,7 +185,7 @@ static const struct address_space_operations zonefs_file_aops = { .readahead = zonefs_readahead, .writepage = zonefs_writepage, .writepages = zonefs_writepages, - .set_page_dirty = iomap_set_page_dirty, + .set_page_dirty = __set_page_dirty_nobuffers, .releasepage = iomap_releasepage, .invalidatepage = iomap_invalidatepage, .migratepage = iomap_migrate_page, |