diff options
Diffstat (limited to 'fs')
219 files changed, 3714 insertions, 1823 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 7bf835f85bc8..eadc894faea2 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -449,14 +449,14 @@ static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end, if (retval) return retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); fid = filp->private_data; v9fs_blank_wstat(&wstat); retval = p9_client_wstat(fid, &wstat); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return retval; } @@ -472,13 +472,13 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, if (retval) return retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); fid = filp->private_data; retval = p9_client_fsync(fid, datasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return retval; } diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index ea4aba56f29d..fadf408bdd46 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -44,24 +44,24 @@ struct adfs_dir_ops; */ struct adfs_sb_info { union { struct { - struct adfs_discmap *s_map; /* bh list containing map */ - const struct adfs_dir_ops *s_dir; /* directory operations */ + struct adfs_discmap *s_map; /* bh list containing map */ + const struct adfs_dir_ops *s_dir; /* directory operations */ }; - struct rcu_head rcu; /* used only at shutdown time */ + struct rcu_head rcu; /* used only at shutdown time */ }; - kuid_t s_uid; /* owner uid */ - kgid_t s_gid; /* owner gid */ - umode_t s_owner_mask; /* ADFS owner perm -> unix perm */ - umode_t s_other_mask; /* ADFS other perm -> unix perm */ + kuid_t s_uid; /* owner uid */ + kgid_t s_gid; /* owner gid */ + umode_t s_owner_mask; /* ADFS owner perm -> unix perm */ + umode_t s_other_mask; /* ADFS other perm -> unix perm */ int s_ftsuffix; /* ,xyz hex filetype suffix option */ - __u32 s_ids_per_zone; /* max. no ids in one zone */ - __u32 s_idlen; /* length of ID in map */ - __u32 s_map_size; /* sector size of a map */ - unsigned long s_size; /* total size (in blocks) of this fs */ - signed int s_map2blk; /* shift left by this for map->sector */ - unsigned int s_log2sharesize;/* log2 share size */ - __le32 s_version; /* disc format version */ + __u32 s_ids_per_zone; /* max. no ids in one zone */ + __u32 s_idlen; /* length of ID in map */ + __u32 s_map_size; /* sector size of a map */ + unsigned long s_size; /* total size (in blocks) of this fs */ + signed int s_map2blk; /* shift left by this for map->sector*/ + unsigned int s_log2sharesize;/* log2 share size */ + __le32 s_version; /* disc format version */ unsigned int s_namelen; /* maximum number of characters in name */ }; diff --git a/fs/affs/file.c b/fs/affs/file.c index 659c579c4588..0548c53f41d5 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -33,11 +33,11 @@ affs_file_release(struct inode *inode, struct file *filp) inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); if (atomic_dec_and_test(&AFFS_I(inode)->i_opencnt)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (inode->i_size != AFFS_I(inode)->mmu_private) affs_truncate(inode); affs_free_prealloc(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; @@ -958,12 +958,12 @@ int affs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = write_inode_now(inode, 0); err = sync_blockdev(inode->i_sb->s_bdev); if (!ret) ret = err; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } const struct file_operations affs_file_operations = { diff --git a/fs/afs/flock.c b/fs/afs/flock.c index 4baf1d2b39e4..d91a9c9cfbd0 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -483,7 +483,7 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl) fl->fl_type = F_UNLCK; - mutex_lock(&vnode->vfs_inode.i_mutex); + inode_lock(&vnode->vfs_inode); /* check local lock records first */ ret = 0; @@ -505,7 +505,7 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl) } error: - mutex_unlock(&vnode->vfs_inode.i_mutex); + inode_unlock(&vnode->vfs_inode); _leave(" = %d [%hd]", ret, fl->fl_type); return ret; } diff --git a/fs/afs/write.c b/fs/afs/write.c index 0714abcd7f32..dfef94f70667 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -693,7 +693,7 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* use a writeback record as a marker in the queue - when this reaches * the front of the queue, all the outstanding writes are either @@ -735,7 +735,7 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) afs_put_writeback(wb); _leave(" = %d", ret); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/attr.c b/fs/attr.c index 6530ced19697..25b24d0f6c88 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -195,7 +195,7 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de struct timespec now; unsigned int ia_valid = attr->ia_valid; - WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(inode)); if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) { if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 3a93755e880f..051ea4809c14 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -491,6 +491,7 @@ static inline int arch_elf_pt_proc(struct elfhdr *ehdr, * arch_check_elf() - check an ELF executable * @ehdr: The main ELF header * @has_interp: True if the ELF has an interpreter, else false. + * @interp_ehdr: The interpreter's ELF header * @state: Architecture-specific state preserved throughout the process * of loading the ELF. * @@ -502,6 +503,7 @@ static inline int arch_elf_pt_proc(struct elfhdr *ehdr, * with that return code. */ static inline int arch_check_elf(struct elfhdr *ehdr, bool has_interp, + struct elfhdr *interp_ehdr, struct arch_elf_state *state) { /* Dummy implementation, always proceed */ @@ -829,7 +831,9 @@ static int load_elf_binary(struct linux_binprm *bprm) * still possible to return an error to the code that invoked * the exec syscall. */ - retval = arch_check_elf(&loc->elf_ex, !!interpreter, &arch_state); + retval = arch_check_elf(&loc->elf_ex, + !!interpreter, &loc->interp_elf_ex, + &arch_state); if (retval) goto out_free_dentry; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 78f005f37847..3a3ced779fc7 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -638,11 +638,11 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, case 3: /* Delete this handler. */ root = dget(file->f_path.dentry->d_sb->s_root); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); kill_node(e); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(root); break; default: @@ -675,7 +675,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, return PTR_ERR(e); root = dget(sb->s_root); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); dentry = lookup_one_len(e->name, root, strlen(e->name)); err = PTR_ERR(dentry); if (IS_ERR(dentry)) @@ -711,7 +711,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, out2: dput(dentry); out: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(root); if (err) { @@ -754,12 +754,12 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer, case 3: /* Delete all handlers. */ root = dget(file->f_path.dentry->d_sb->s_root); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); while (!list_empty(&entries)) kill_node(list_entry(entries.next, Node, list)); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(root); break; default: diff --git a/fs/block_dev.c b/fs/block_dev.c index 530145b607c4..afb437484362 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -75,7 +75,7 @@ void kill_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_inode->i_mapping; - if (mapping->nrpages == 0 && mapping->nrshadows == 0) + if (mapping->nrpages == 0 && mapping->nrexceptional == 0) return; invalidate_bh_lrus(); @@ -346,9 +346,9 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence) struct inode *bd_inode = bdev_file_inode(file); loff_t retval; - mutex_lock(&bd_inode->i_mutex); + inode_lock(bd_inode); retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode)); - mutex_unlock(&bd_inode->i_mutex); + inode_unlock(bd_inode); return retval; } @@ -400,7 +400,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return result; - result = blk_queue_enter(bdev->bd_queue, GFP_KERNEL); + result = blk_queue_enter(bdev->bd_queue, false); if (result) return result; result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ); @@ -437,7 +437,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; - result = blk_queue_enter(bdev->bd_queue, GFP_NOIO); + result = blk_queue_enter(bdev->bd_queue, false); if (result) return result; @@ -700,7 +700,7 @@ static struct block_device *bd_acquire(struct inode *inode) spin_lock(&bdev_lock); bdev = inode->i_bdev; if (bdev) { - ihold(bdev->bd_inode); + bdgrab(bdev); spin_unlock(&bdev_lock); return bdev; } @@ -716,7 +716,7 @@ static struct block_device *bd_acquire(struct inode *inode) * So, we can access it via ->i_mapping always * without igrab(). */ - ihold(bdev->bd_inode); + bdgrab(bdev); inode->i_bdev = bdev; inode->i_mapping = bdev->bd_inode->i_mapping; list_add(&inode->i_devices, &bdev->bd_inodes); @@ -739,7 +739,7 @@ void bd_forget(struct inode *inode) spin_unlock(&bdev_lock); if (bdev) - iput(bdev->bd_inode); + bdput(bdev); } /** @@ -1142,9 +1142,9 @@ void bd_set_size(struct block_device *bdev, loff_t size) { unsigned bsize = bdev_logical_block_size(bdev); - mutex_lock(&bdev->bd_inode->i_mutex); + inode_lock(bdev->bd_inode); i_size_write(bdev->bd_inode, size); - mutex_unlock(&bdev->bd_inode->i_mutex); + inode_unlock(bdev->bd_inode); while (bsize < PAGE_CACHE_SIZE) { if (size & bsize) break; @@ -1736,37 +1736,13 @@ static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL); } -static void blkdev_vm_open(struct vm_area_struct *vma) -{ - struct inode *bd_inode = bdev_file_inode(vma->vm_file); - struct block_device *bdev = I_BDEV(bd_inode); - - mutex_lock(&bd_inode->i_mutex); - bdev->bd_map_count++; - mutex_unlock(&bd_inode->i_mutex); -} - -static void blkdev_vm_close(struct vm_area_struct *vma) -{ - struct inode *bd_inode = bdev_file_inode(vma->vm_file); - struct block_device *bdev = I_BDEV(bd_inode); - - mutex_lock(&bd_inode->i_mutex); - bdev->bd_map_count--; - mutex_unlock(&bd_inode->i_mutex); -} - static const struct vm_operations_struct blkdev_dax_vm_ops = { - .open = blkdev_vm_open, - .close = blkdev_vm_close, .fault = blkdev_dax_fault, .pmd_fault = blkdev_dax_pmd_fault, .pfn_mkwrite = blkdev_dax_fault, }; static const struct vm_operations_struct blkdev_default_vm_ops = { - .open = blkdev_vm_open, - .close = blkdev_vm_close, .fault = filemap_fault, .map_pages = filemap_map_pages, }; @@ -1774,18 +1750,14 @@ static const struct vm_operations_struct blkdev_default_vm_ops = { static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *bd_inode = bdev_file_inode(file); - struct block_device *bdev = I_BDEV(bd_inode); file_accessed(file); - mutex_lock(&bd_inode->i_mutex); - bdev->bd_map_count++; if (IS_DAX(bd_inode)) { vma->vm_ops = &blkdev_dax_vm_ops; vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; } else { vma->vm_ops = &blkdev_default_vm_ops; } - mutex_unlock(&bd_inode->i_mutex); return 0; } diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 88d9af3d4581..5fb60ea7eee2 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -328,8 +328,8 @@ static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq, list_add_tail(&work->ordered_list, &wq->ordered_list); spin_unlock_irqrestore(&wq->list_lock, flags); } - queue_work(wq->normal_wq, &work->normal_work); trace_btrfs_work_queued(work); + queue_work(wq->normal_wq, &work->normal_work); } void btrfs_queue_work(struct btrfs_workqueue *wq, diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 08405a3da6b1..b90cd3776f8e 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -560,13 +560,13 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, */ static void __merge_refs(struct list_head *head, int mode) { - struct __prelim_ref *ref1; + struct __prelim_ref *pos1; - list_for_each_entry(ref1, head, list) { - struct __prelim_ref *ref2 = ref1, *tmp; + list_for_each_entry(pos1, head, list) { + struct __prelim_ref *pos2 = pos1, *tmp; - list_for_each_entry_safe_continue(ref2, tmp, head, list) { - struct __prelim_ref *xchg; + list_for_each_entry_safe_continue(pos2, tmp, head, list) { + struct __prelim_ref *xchg, *ref1 = pos1, *ref2 = pos2; struct extent_inode_elem *eie; if (!ref_for_same_block(ref1, ref2)) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 97ad9bbeb35d..bfe4a337fb4d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1614,7 +1614,7 @@ struct btrfs_fs_info { spinlock_t delayed_iput_lock; struct list_head delayed_iputs; - struct rw_semaphore delayed_iput_sem; + struct mutex cleaner_delayed_iput_mutex; /* this protects tree_mod_seq_list */ spinlock_t tree_mod_seq_lock; @@ -3641,6 +3641,7 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, int __get_raid_index(u64 flags); int btrfs_start_write_no_snapshoting(struct btrfs_root *root); void btrfs_end_write_no_snapshoting(struct btrfs_root *root); +void btrfs_wait_for_snapshot_creation(struct btrfs_root *root); void check_system_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *root, const u64 type); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 1e668fb7dd4c..cbb7dbfb3fff 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -614,7 +614,7 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( em = lookup_extent_mapping(em_tree, start, (u64)-1); if (!em) break; - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) if (srcdev == map->stripes[i].dev) map->stripes[i].dev = tgtdev; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e99ccd6ffb2c..4545e2e2ad45 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -55,6 +55,12 @@ #include <asm/cpufeature.h> #endif +#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ + BTRFS_HEADER_FLAG_RELOC |\ + BTRFS_SUPER_FLAG_ERROR |\ + BTRFS_SUPER_FLAG_SEEDING |\ + BTRFS_SUPER_FLAG_METADUMP) + static const struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); @@ -176,6 +182,7 @@ static struct btrfs_lockdep_keyset { { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" }, { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" }, { .id = BTRFS_UUID_TREE_OBJECTID, .name_stem = "uuid" }, + { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, .name_stem = "free-space" }, { .id = 0, .name_stem = "tree" }, }; @@ -1583,8 +1590,23 @@ int btrfs_init_fs_root(struct btrfs_root *root) ret = get_anon_bdev(&root->anon_dev); if (ret) goto free_writers; + + mutex_lock(&root->objectid_mutex); + ret = btrfs_find_highest_objectid(root, + &root->highest_objectid); + if (ret) { + mutex_unlock(&root->objectid_mutex); + goto free_root_dev; + } + + ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); + + mutex_unlock(&root->objectid_mutex); + return 0; +free_root_dev: + free_anon_bdev(root->anon_dev); free_writers: btrfs_free_subvolume_writers(root->subv_writers); fail: @@ -1766,7 +1788,6 @@ static int cleaner_kthread(void *arg) int again; struct btrfs_trans_handle *trans; - set_freezable(); do { again = 0; @@ -1786,7 +1807,10 @@ static int cleaner_kthread(void *arg) goto sleep; } + mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex); btrfs_run_delayed_iputs(root); + mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex); + again = btrfs_clean_one_deleted_snapshot(root); mutex_unlock(&root->fs_info->cleaner_mutex); @@ -2556,8 +2580,8 @@ int open_ctree(struct super_block *sb, mutex_init(&fs_info->delete_unused_bgs_mutex); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); + mutex_init(&fs_info->cleaner_delayed_iput_mutex); seqlock_init(&fs_info->profiles_lock); - init_rwsem(&fs_info->delayed_iput_sem); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); @@ -2742,26 +2766,6 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } - /* - * Leafsize and nodesize were always equal, this is only a sanity check. - */ - if (le32_to_cpu(disk_super->__unused_leafsize) != - btrfs_super_nodesize(disk_super)) { - printk(KERN_ERR "BTRFS: couldn't mount because metadata " - "blocksizes don't match. node %d leaf %d\n", - btrfs_super_nodesize(disk_super), - le32_to_cpu(disk_super->__unused_leafsize)); - err = -EINVAL; - goto fail_alloc; - } - if (btrfs_super_nodesize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) { - printk(KERN_ERR "BTRFS: couldn't mount because metadata " - "blocksize (%d) was too large\n", - btrfs_super_nodesize(disk_super)); - err = -EINVAL; - goto fail_alloc; - } - features = btrfs_super_incompat_flags(disk_super); features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO) @@ -2833,17 +2837,6 @@ int open_ctree(struct super_block *sb, sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); - if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) { - printk(KERN_ERR "BTRFS: valid FS not found on %s\n", sb->s_id); - goto fail_sb_buffer; - } - - if (sectorsize != PAGE_SIZE) { - printk(KERN_ERR "BTRFS: incompatible sector size (%lu) " - "found on %s\n", (unsigned long)sectorsize, sb->s_id); - goto fail_sb_buffer; - } - mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_sys_array(tree_root); mutex_unlock(&fs_info->chunk_mutex); @@ -2915,6 +2908,18 @@ retry_root_backup: tree_root->commit_root = btrfs_root_node(tree_root); btrfs_set_root_refs(&tree_root->root_item, 1); + mutex_lock(&tree_root->objectid_mutex); + ret = btrfs_find_highest_objectid(tree_root, + &tree_root->highest_objectid); + if (ret) { + mutex_unlock(&tree_root->objectid_mutex); + goto recovery_tree_root; + } + + ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); + + mutex_unlock(&tree_root->objectid_mutex); + ret = btrfs_read_roots(fs_info, tree_root); if (ret) goto recovery_tree_root; @@ -4018,8 +4023,17 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only) { struct btrfs_super_block *sb = fs_info->super_copy; + u64 nodesize = btrfs_super_nodesize(sb); + u64 sectorsize = btrfs_super_sectorsize(sb); int ret = 0; + if (btrfs_super_magic(sb) != BTRFS_MAGIC) { + printk(KERN_ERR "BTRFS: no valid FS found\n"); + ret = -EINVAL; + } + if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) + printk(KERN_WARNING "BTRFS: unrecognized super flag: %llu\n", + btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { printk(KERN_ERR "BTRFS: tree_root level too big: %d >= %d\n", btrfs_super_root_level(sb), BTRFS_MAX_LEVEL); @@ -4037,31 +4051,46 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, } /* - * The common minimum, we don't know if we can trust the nodesize/sectorsize - * items yet, they'll be verified later. Issue just a warning. + * Check sectorsize and nodesize first, other check will need it. + * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here. */ - if (!IS_ALIGNED(btrfs_super_root(sb), 4096)) + if (!is_power_of_2(sectorsize) || sectorsize < 4096 || + sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) { + printk(KERN_ERR "BTRFS: invalid sectorsize %llu\n", sectorsize); + ret = -EINVAL; + } + /* Only PAGE SIZE is supported yet */ + if (sectorsize != PAGE_CACHE_SIZE) { + printk(KERN_ERR "BTRFS: sectorsize %llu not supported yet, only support %lu\n", + sectorsize, PAGE_CACHE_SIZE); + ret = -EINVAL; + } + if (!is_power_of_2(nodesize) || nodesize < sectorsize || + nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) { + printk(KERN_ERR "BTRFS: invalid nodesize %llu\n", nodesize); + ret = -EINVAL; + } + if (nodesize != le32_to_cpu(sb->__unused_leafsize)) { + printk(KERN_ERR "BTRFS: invalid leafsize %u, should be %llu\n", + le32_to_cpu(sb->__unused_leafsize), + nodesize); + ret = -EINVAL; + } + + /* Root alignment check */ + if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) { printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", btrfs_super_root(sb)); - if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096)) + ret = -EINVAL; + } + if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) { printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n", btrfs_super_chunk_root(sb)); - if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096)) - printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n", - btrfs_super_log_root(sb)); - - /* - * Check the lower bound, the alignment and other constraints are - * checked later. - */ - if (btrfs_super_nodesize(sb) < 4096) { - printk(KERN_ERR "BTRFS: nodesize too small: %u < 4096\n", - btrfs_super_nodesize(sb)); ret = -EINVAL; } - if (btrfs_super_sectorsize(sb) < 4096) { - printk(KERN_ERR "BTRFS: sectorsize too small: %u < 4096\n", - btrfs_super_sectorsize(sb)); + if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) { + printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n", + btrfs_super_log_root(sb)); ret = -EINVAL; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 60cc1399c64f..e2287c7c10be 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4139,8 +4139,10 @@ commit_trans: !atomic_read(&root->fs_info->open_ioctl_trans)) { need_commit--; - if (need_commit > 0) + if (need_commit > 0) { + btrfs_start_delalloc_roots(fs_info, 0, -1); btrfs_wait_ordered_roots(fs_info, -1); + } trans = btrfs_join_transaction(root); if (IS_ERR(trans)) @@ -4153,11 +4155,12 @@ commit_trans: if (ret) return ret; /* - * make sure that all running delayed iput are - * done + * The cleaner kthread might still be doing iput + * operations. Wait for it to finish so that + * more space is released. */ - down_write(&root->fs_info->delayed_iput_sem); - up_write(&root->fs_info->delayed_iput_sem); + mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex); + mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex); goto again; } else { btrfs_end_transaction(trans, root); @@ -10399,7 +10402,7 @@ btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info, * more device items and remove one chunk item), but this is done at * btrfs_remove_chunk() through a call to check_system_chunk(). */ - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; num_items = 3 + map->num_stripes; free_extent_map(em); @@ -10586,7 +10589,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) - return 1; + return -EINVAL; features = btrfs_super_incompat_flags(disk_super); if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) @@ -10816,3 +10819,23 @@ int btrfs_start_write_no_snapshoting(struct btrfs_root *root) } return 1; } + +static int wait_snapshoting_atomic_t(atomic_t *a) +{ + schedule(); + return 0; +} + +void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) +{ + while (true) { + int ret; + + ret = btrfs_start_write_no_snapshoting(root); + if (ret) + break; + wait_on_atomic_t(&root->will_be_snapshoted, + wait_snapshoting_atomic_t, + TASK_UNINTERRUPTIBLE); + } +} diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 6a98bddd8f33..84fb56d5c018 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -76,7 +76,7 @@ void free_extent_map(struct extent_map *em) WARN_ON(extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) - kfree(em->bdev); + kfree(em->map_lookup); kmem_cache_free(extent_map_cache, em); } } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index b2991fd8583e..eb8b8fae036b 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -32,7 +32,15 @@ struct extent_map { u64 block_len; u64 generation; unsigned long flags; - struct block_device *bdev; + union { + struct block_device *bdev; + + /* + * used for chunk mappings + * flags & EXTENT_FLAG_FS_MAPPING must be set + */ + struct map_lookup *map_lookup; + }; atomic_t refs; unsigned int compress_type; struct list_head list; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 83d7859d7619..098bb8f690c9 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -406,8 +406,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) /* simple helper to fault in pages and copy. This should go away * and be replaced with calls into generic code. */ -static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, - size_t write_bytes, +static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, struct page **prepared_pages, struct iov_iter *i) { @@ -1588,8 +1587,7 @@ again: ret = 0; } - copied = btrfs_copy_from_user(pos, num_pages, - write_bytes, pages, i); + copied = btrfs_copy_from_user(pos, write_bytes, pages, i); /* * if we have trouble faulting in the pages, fall @@ -1764,17 +1762,17 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, loff_t pos; size_t count; - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = generic_write_checks(iocb, from); if (err <= 0) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } current->backing_dev_info = inode_to_bdi(inode); err = file_remove_privs(file); if (err) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } @@ -1785,7 +1783,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, * to stop this write operation to ensure FS consistency. */ if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); err = -EROFS; goto out; } @@ -1806,7 +1804,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, end_pos = round_up(pos + count, root->sectorsize); err = btrfs_cont_expand(inode, i_size_read(inode), end_pos); if (err) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } } @@ -1822,7 +1820,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, iocb->ki_pos = pos + num_written; } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * We also have to set last_sub_trans to the current log transid, @@ -1911,7 +1909,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); atomic_inc(&root->log_batch); full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); @@ -1963,7 +1961,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = start_ordered_ops(inode, start, end); } if (ret) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } atomic_inc(&root->log_batch); @@ -2009,7 +2007,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } @@ -2033,7 +2031,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } trans->sync = true; @@ -2056,7 +2054,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. */ - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * If any of the ordered extents had an error, just return it to user @@ -2305,7 +2303,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); ret = find_first_non_hole(inode, &offset, &len); if (ret < 0) @@ -2345,7 +2343,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) truncated_page = true; ret = btrfs_truncate_page(inode, offset, 0, 0); if (ret) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } } @@ -2421,7 +2419,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) ret = btrfs_wait_ordered_range(inode, lockstart, lockend - lockstart + 1); if (ret) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } } @@ -2576,7 +2574,7 @@ out_only_mutex: ret = btrfs_end_transaction(trans, root); } } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret && !err) err = ret; return err; @@ -2660,7 +2658,7 @@ static long btrfs_fallocate(struct file *file, int mode, if (ret < 0) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = inode_newsize_ok(inode, alloc_end); if (ret) goto out; @@ -2818,7 +2816,7 @@ out: * So this is completely used as cleanup. */ btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* Let go of our reservation. */ btrfs_free_reserved_data_space(inode, alloc_start, alloc_end - alloc_start); @@ -2894,7 +2892,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file->f_mapping->host; int ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (whence) { case SEEK_END: case SEEK_CUR: @@ -2903,20 +2901,20 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) case SEEK_DATA: case SEEK_HOLE: if (offset >= i_size_read(inode)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } ret = find_desired_extent(inode, &offset, whence); if (ret) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } } offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return offset; } diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 393e36bd5845..53dbeaf6ce94 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -153,6 +153,20 @@ static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize) static unsigned long *alloc_bitmap(u32 bitmap_size) { + void *mem; + + /* + * The allocation size varies, observed numbers were < 4K up to 16K. + * Using vmalloc unconditionally would be too heavy, we'll try + * contiguous allocations first. + */ + if (bitmap_size <= PAGE_SIZE) + return kzalloc(bitmap_size, GFP_NOFS); + + mem = kzalloc(bitmap_size, GFP_NOFS | __GFP_NOWARN); + if (mem) + return mem; + return __vmalloc(bitmap_size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); } @@ -289,7 +303,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, ret = 0; out: - vfree(bitmap); + kvfree(bitmap); if (ret) btrfs_abort_transaction(trans, root, ret); return ret; @@ -438,7 +452,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, ret = 0; out: - vfree(bitmap); + kvfree(bitmap); if (ret) btrfs_abort_transaction(trans, root, ret); return ret; diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 8b57c17b3fb3..e50316c4af15 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -515,7 +515,7 @@ out: return ret; } -static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) +int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) { struct btrfs_path *path; int ret; @@ -555,13 +555,6 @@ int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid) int ret; mutex_lock(&root->objectid_mutex); - if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) { - ret = btrfs_find_highest_objectid(root, - &root->highest_objectid); - if (ret) - goto out; - } - if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) { ret = -ENOSPC; goto out; diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h index ddb347bfee23..c8e864b2d530 100644 --- a/fs/btrfs/inode-map.h +++ b/fs/btrfs/inode-map.h @@ -9,5 +9,6 @@ int btrfs_save_ino_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans); int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid); +int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid); #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 247830107686..5f06eb1f4384 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3134,7 +3134,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - down_read(&fs_info->delayed_iput_sem); spin_lock(&fs_info->delayed_iput_lock); while (!list_empty(&fs_info->delayed_iputs)) { struct btrfs_inode *inode; @@ -3153,7 +3152,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) spin_lock(&fs_info->delayed_iput_lock); } spin_unlock(&fs_info->delayed_iput_lock); - up_read(&root->fs_info->delayed_iput_sem); } /* @@ -4874,26 +4872,6 @@ next: return err; } -static int wait_snapshoting_atomic_t(atomic_t *a) -{ - schedule(); - return 0; -} - -static void wait_for_snapshot_creation(struct btrfs_root *root) -{ - while (true) { - int ret; - - ret = btrfs_start_write_no_snapshoting(root); - if (ret) - break; - wait_on_atomic_t(&root->will_be_snapshoted, - wait_snapshoting_atomic_t, - TASK_UNINTERRUPTIBLE); - } -} - static int btrfs_setsize(struct inode *inode, struct iattr *attr) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4925,7 +4903,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * truncation, it must capture all writes that happened before * this truncation. */ - wait_for_snapshot_creation(root); + btrfs_wait_for_snapshot_creation(root); ret = btrfs_cont_expand(inode, oldsize, newsize); if (ret) { btrfs_end_write_no_snapshoting(root); @@ -7138,21 +7116,41 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, if (ret) return ERR_PTR(ret); - em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, - ins.offset, ins.offset, ins.offset, 0); - if (IS_ERR(em)) { - btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); - return em; - } - + /* + * Create the ordered extent before the extent map. This is to avoid + * races with the fast fsync path that would lead to it logging file + * extent items that point to disk extents that were not yet written to. + * The fast fsync path collects ordered extents into a local list and + * then collects all the new extent maps, so we must create the ordered + * extent first and make sure the fast fsync path collects any new + * ordered extents after collecting new extent maps as well. + * The fsync path simply can not rely on inode_dio_wait() because it + * causes deadlock with AIO. + */ ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, ins.offset, ins.offset, 0); if (ret) { btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); - free_extent_map(em); return ERR_PTR(ret); } + em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, + ins.offset, ins.offset, ins.offset, 0); + if (IS_ERR(em)) { + struct btrfs_ordered_extent *oe; + + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); + oe = btrfs_lookup_ordered_extent(inode, start); + ASSERT(oe); + if (WARN_ON(!oe)) + return em; + set_bit(BTRFS_ORDERED_IOERR, &oe->flags); + set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags); + btrfs_remove_ordered_extent(inode, oe); + /* Once for our lookup and once for the ordered extents tree. */ + btrfs_put_ordered_extent(oe); + btrfs_put_ordered_extent(oe); + } return em; } @@ -8469,7 +8467,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * not unlock the i_mutex at this case. */ if (offset + count <= inode->i_size) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); relock = true; } ret = btrfs_delalloc_reserve_space(inode, offset, count); @@ -8526,7 +8524,7 @@ out: if (wakeup) inode_dio_end(inode); if (relock) - mutex_lock(&inode->i_mutex); + inode_lock(inode); return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2a47a3148ec8..952172ca7e45 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -240,7 +240,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ip_oldflags = ip->flags; i_oldflags = inode->i_flags; @@ -358,7 +358,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mnt_drop_write_file(file); return ret; } @@ -568,6 +568,10 @@ static noinline int create_subvol(struct inode *dir, goto fail; } + mutex_lock(&new_root->objectid_mutex); + new_root->highest_objectid = new_dirid; + mutex_unlock(&new_root->objectid_mutex); + /* * insert the directory item */ @@ -877,7 +881,7 @@ out_up_read: out_dput: dput(dentry); out_unlock: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return error; } @@ -1389,18 +1393,18 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ra_index += cluster; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) BTRFS_I(inode)->force_compress = compress_type; ret = cluster_pages_for_defrag(inode, pages, i, cluster); if (ret < 0) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out_ra; } defrag_count += ret; balance_dirty_pages_ratelimited(inode->i_mapping); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (newer_than) { if (newer_off == (u64)-1) @@ -1461,9 +1465,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, out_ra: if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } if (!file) kfree(ra); @@ -2426,7 +2430,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out_dput; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * Don't allow to delete a subvolume with send in progress. This is @@ -2539,7 +2543,7 @@ out_up_write: spin_unlock(&dest->root_item_lock); } out_unlock_inode: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!err) { d_invalidate(dentry); btrfs_invalidate_inodes(dest); @@ -2555,7 +2559,7 @@ out_unlock_inode: out_dput: dput(dentry); out_unlock_dir: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); out_drop_write: mnt_drop_write_file(file); out: @@ -2853,8 +2857,8 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2) { - mutex_unlock(&inode1->i_mutex); - mutex_unlock(&inode2->i_mutex); + inode_unlock(inode1); + inode_unlock(inode2); } static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) @@ -2862,8 +2866,8 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) if (inode1 < inode2) swap(inode1, inode2); - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(inode1, I_MUTEX_PARENT); + inode_lock_nested(inode2, I_MUTEX_CHILD); } static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, @@ -3022,7 +3026,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, return 0; if (same_inode) { - mutex_lock(&src->i_mutex); + inode_lock(src); ret = extent_same_check_offsets(src, loff, &len, olen); if (ret) @@ -3097,7 +3101,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, btrfs_cmp_data_free(&cmp); out_unlock: if (same_inode) - mutex_unlock(&src->i_mutex); + inode_unlock(src); else btrfs_double_inode_unlock(src, dst); @@ -3745,7 +3749,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, if (!same_inode) { btrfs_double_inode_lock(src, inode); } else { - mutex_lock(&src->i_mutex); + inode_lock(src); } /* determine range to clone */ @@ -3816,7 +3820,7 @@ out_unlock: if (!same_inode) btrfs_double_inode_unlock(src, inode); else - mutex_unlock(&src->i_mutex); + inode_unlock(src); return ret; } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 6d707545f775..55161369fab1 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -609,13 +609,28 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, return 1; } +static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe, + int index) +{ + return stripe * rbio->stripe_npages + index; +} + +/* + * these are just the pages from the rbio array, not from anything + * the FS sent down to us + */ +static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, + int index) +{ + return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)]; +} + /* * helper to index into the pstripe */ static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) { - index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; - return rbio->stripe_pages[index]; + return rbio_stripe_page(rbio, rbio->nr_data, index); } /* @@ -626,10 +641,7 @@ static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) { if (rbio->nr_data + 1 == rbio->real_stripes) return NULL; - - index += ((rbio->nr_data + 1) * rbio->stripe_len) >> - PAGE_CACHE_SHIFT; - return rbio->stripe_pages[index]; + return rbio_stripe_page(rbio, rbio->nr_data + 1, index); } /* @@ -889,6 +901,7 @@ static void raid_write_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; int err = bio->bi_error; + int max_errors; if (err) fail_bio_stripe(rbio, bio); @@ -901,7 +914,9 @@ static void raid_write_end_io(struct bio *bio) err = 0; /* OK, we have read all the stripes we need to. */ - if (atomic_read(&rbio->error) > rbio->bbio->max_errors) + max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? + 0 : rbio->bbio->max_errors; + if (atomic_read(&rbio->error) > max_errors) err = -EIO; rbio_orig_end_io(rbio, err); @@ -947,8 +962,7 @@ static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, */ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) { - unsigned long nr = stripe_len * nr_stripes; - return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE); + return DIV_ROUND_UP(stripe_len, PAGE_CACHE_SIZE) * nr_stripes; } /* @@ -966,8 +980,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, void *p; rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 + - DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8), - GFP_NOFS); + DIV_ROUND_UP(stripe_npages, BITS_PER_LONG) * + sizeof(long), GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); @@ -1021,18 +1035,17 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) if (!page) return -ENOMEM; rbio->stripe_pages[i] = page; - ClearPageUptodate(page); } return 0; } -/* allocate pages for just the p/q stripes */ +/* only allocate pages for p/q stripes */ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) { int i; struct page *page; - i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; + i = rbio_stripe_page_index(rbio, rbio->nr_data, 0); for (; i < rbio->nr_pages; i++) { if (rbio->stripe_pages[i]) @@ -1121,18 +1134,6 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) } /* - * these are just the pages from the rbio array, not from anything - * the FS sent down to us - */ -static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page) -{ - int index; - index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT); - index += page; - return rbio->stripe_pages[index]; -} - -/* * helper function to walk our bio list and populate the bio_pages array with * the result. This seems expensive, but it is faster than constantly * searching through the bio list as we setup the IO in finish_rmw or stripe @@ -1175,7 +1176,6 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) { struct btrfs_bio *bbio = rbio->bbio; void *pointers[rbio->real_stripes]; - int stripe_len = rbio->stripe_len; int nr_data = rbio->nr_data; int stripe; int pagenr; @@ -1183,7 +1183,6 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) int q_stripe = -1; struct bio_list bio_list; struct bio *bio; - int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT; int ret; bio_list_init(&bio_list); @@ -1226,7 +1225,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) else clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *p; /* first collect one page from each data stripe */ for (stripe = 0; stripe < nr_data; stripe++) { @@ -1268,7 +1267,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) * everything else. */ for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *page; if (stripe < rbio->nr_data) { page = page_in_rbio(rbio, stripe, pagenr, 1); @@ -1292,7 +1291,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) if (!bbio->tgtdev_map[stripe]) continue; - for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *page; if (stripe < rbio->nr_data) { page = page_in_rbio(rbio, stripe, pagenr, 1); @@ -1506,7 +1505,6 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); int pagenr; int stripe; struct bio *bio; @@ -1525,7 +1523,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) * stripe */ for (stripe = 0; stripe < rbio->nr_data; stripe++) { - for (pagenr = 0; pagenr < nr_pages; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *page; /* * we want to find all the pages missing from @@ -1801,7 +1799,6 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) int pagenr, stripe; void **pointers; int faila = -1, failb = -1; - int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); struct page *page; int err; int i; @@ -1824,7 +1821,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) index_rbio_pages(rbio); - for (pagenr = 0; pagenr < nr_pages; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { /* * Now we just use bitmap to mark the horizontal stripes in * which we have data when doing parity scrub. @@ -1935,7 +1932,7 @@ pstripe: * other endio functions will fiddle the uptodate bits */ if (rbio->operation == BTRFS_RBIO_WRITE) { - for (i = 0; i < nr_pages; i++) { + for (i = 0; i < rbio->stripe_npages; i++) { if (faila != -1) { page = rbio_stripe_page(rbio, faila, i); SetPageUptodate(page); @@ -2031,7 +2028,6 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); int pagenr; int stripe; struct bio *bio; @@ -2055,7 +2051,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) continue; } - for (pagenr = 0; pagenr < nr_pages; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *p; /* @@ -2279,37 +2275,11 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) if (!page) return -ENOMEM; rbio->stripe_pages[index] = page; - ClearPageUptodate(page); } } return 0; } -/* - * end io function used by finish_rmw. When we finally - * get here, we've written a full stripe - */ -static void raid_write_parity_end_io(struct bio *bio) -{ - struct btrfs_raid_bio *rbio = bio->bi_private; - int err = bio->bi_error; - - if (bio->bi_error) - fail_bio_stripe(rbio, bio); - - bio_put(bio); - - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; - - err = 0; - - if (atomic_read(&rbio->error)) - err = -EIO; - - rbio_orig_end_io(rbio, err); -} - static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) { @@ -2462,7 +2432,7 @@ submit_write: break; bio->bi_private = rbio; - bio->bi_end_io = raid_write_parity_end_io; + bio->bi_end_io = raid_write_end_io; submit_bio(WRITE, bio); } return; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index ef6d8fc85853..2bd0011450df 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -575,7 +575,8 @@ static int is_cowonly_root(u64 root_objectid) root_objectid == BTRFS_TREE_LOG_OBJECTID || root_objectid == BTRFS_CSUM_TREE_OBJECTID || root_objectid == BTRFS_UUID_TREE_OBJECTID || - root_objectid == BTRFS_QUOTA_TREE_OBJECTID) + root_objectid == BTRFS_QUOTA_TREE_OBJECTID || + root_objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) return 1; return 0; } @@ -3030,7 +3031,7 @@ int prealloc_file_extent_cluster(struct inode *inode, int ret = 0; BUG_ON(cluster->start != cluster->boundary[0]); - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = btrfs_check_data_free_space(inode, cluster->start, cluster->end + 1 - cluster->start); @@ -3057,7 +3058,7 @@ int prealloc_file_extent_cluster(struct inode *inode, btrfs_free_reserved_data_space(inode, cluster->start, cluster->end + 1 - cluster->start); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 0c981ebe2acb..92bf5ee732fb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2813,7 +2813,7 @@ out: static inline int scrub_calc_parity_bitmap_len(int nsectors) { - return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8); + return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long); } static void scrub_parity_get(struct scrub_parity *sparity) @@ -3458,7 +3458,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, return ret; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; if (em->start != chunk_offset) goto out; @@ -4279,7 +4279,7 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, return PTR_ERR(inode); /* Avoid truncate/dio/punch hole.. */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); inode_dio_wait(inode); physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; @@ -4358,7 +4358,7 @@ next_page: } ret = COPY_COMPLETE; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); return ret; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9b9eab6d048e..d41e09fe8e38 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -383,6 +383,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) int ret = 0; char *compress_type; bool compress_force = false; + enum btrfs_compression_type saved_compress_type; + bool saved_compress_force; + int no_compress = 0; cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE)) @@ -462,6 +465,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) /* Fallthrough */ case Opt_compress: case Opt_compress_type: + saved_compress_type = btrfs_test_opt(root, COMPRESS) ? + info->compress_type : BTRFS_COMPRESS_NONE; + saved_compress_force = + btrfs_test_opt(root, FORCE_COMPRESS); if (token == Opt_compress || token == Opt_compress_force || strcmp(args[0].from, "zlib") == 0) { @@ -470,6 +477,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); + no_compress = 0; } else if (strcmp(args[0].from, "lzo") == 0) { compress_type = "lzo"; info->compress_type = BTRFS_COMPRESS_LZO; @@ -477,25 +485,21 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); btrfs_set_fs_incompat(info, COMPRESS_LZO); + no_compress = 0; } else if (strncmp(args[0].from, "no", 2) == 0) { compress_type = "no"; btrfs_clear_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); compress_force = false; + no_compress++; } else { ret = -EINVAL; goto out; } if (compress_force) { - btrfs_set_and_info(root, FORCE_COMPRESS, - "force %s compression", - compress_type); + btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); } else { - if (!btrfs_test_opt(root, COMPRESS)) - btrfs_info(root->fs_info, - "btrfs: use %s compression", - compress_type); /* * If we remount from compress-force=xxx to * compress=xxx, we need clear FORCE_COMPRESS @@ -504,6 +508,17 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) */ btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); } + if ((btrfs_test_opt(root, COMPRESS) && + (info->compress_type != saved_compress_type || + compress_force != saved_compress_force)) || + (!btrfs_test_opt(root, COMPRESS) && + no_compress == 1)) { + btrfs_info(root->fs_info, + "%s %s compression", + (compress_force) ? "force" : "use", + compress_type); + } + compress_force = false; break; case Opt_ssd: btrfs_set_and_info(root, SSD, diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index e0ac85949067..539e7b5e3f86 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -202,6 +202,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF); BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56); BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA); BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); +BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(mixed_backref), @@ -213,6 +214,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(raid56), BTRFS_FEAT_ATTR_PTR(skinny_metadata), BTRFS_FEAT_ATTR_PTR(no_holes), + BTRFS_FEAT_ATTR_PTR(free_space_tree), NULL }; @@ -780,6 +782,39 @@ failure: return error; } + +/* + * Change per-fs features in /sys/fs/btrfs/UUID/features to match current + * values in superblock. Call after any changes to incompat/compat_ro flags + */ +void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, + u64 bit, enum btrfs_feature_set set) +{ + struct btrfs_fs_devices *fs_devs; + struct kobject *fsid_kobj; + u64 features; + int ret; + + if (!fs_info) + return; + + features = get_features(fs_info, set); + ASSERT(bit & supported_feature_masks[set]); + + fs_devs = fs_info->fs_devices; + fsid_kobj = &fs_devs->fsid_kobj; + + if (!fsid_kobj->state_initialized) + return; + + /* + * FIXME: this is too heavy to update just one value, ideally we'd like + * to use sysfs_update_group but some refactoring is needed first. + */ + sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group); + ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group); +} + static int btrfs_init_debugfs(void) { #ifdef CONFIG_DEBUG_FS diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 9c09522125a6..d7da1a4c2f6c 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -56,7 +56,7 @@ static struct btrfs_feature_attr btrfs_attr_##_name = { \ #define BTRFS_FEAT_ATTR_COMPAT(name, feature) \ BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature) #define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \ - BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT, feature) + BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT_RO, feature) #define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \ BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature) @@ -90,4 +90,7 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, struct kobject *parent); int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); +void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, + u64 bit, enum btrfs_feature_set set); + #endif /* _BTRFS_SYSFS_H_ */ diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index b1d920b30070..0e1e61a7ec23 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -82,18 +82,18 @@ void btrfs_destroy_test_fs(void) struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void) { struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info), - GFP_NOFS); + GFP_KERNEL); if (!fs_info) return fs_info; fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices), - GFP_NOFS); + GFP_KERNEL); if (!fs_info->fs_devices) { kfree(fs_info); return NULL; } fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block), - GFP_NOFS); + GFP_KERNEL); if (!fs_info->super_copy) { kfree(fs_info->fs_devices); kfree(fs_info); @@ -180,11 +180,11 @@ btrfs_alloc_dummy_block_group(unsigned long length) { struct btrfs_block_group_cache *cache; - cache = kzalloc(sizeof(*cache), GFP_NOFS); + cache = kzalloc(sizeof(*cache), GFP_KERNEL); if (!cache) return NULL; cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), - GFP_NOFS); + GFP_KERNEL); if (!cache->free_space_ctl) { kfree(cache); return NULL; diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index e29fa297e053..669b58201e36 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -94,7 +94,7 @@ static int test_find_delalloc(void) * test. */ for (index = 0; index < (total_dirty >> PAGE_CACHE_SHIFT); index++) { - page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL); if (!page) { test_msg("Failed to allocate test page\n"); ret = -ENOMEM; @@ -113,7 +113,7 @@ static int test_find_delalloc(void) * |--- delalloc ---| * |--- search ---| */ - set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_NOFS); + set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_KERNEL); start = 0; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -144,7 +144,7 @@ static int test_find_delalloc(void) test_msg("Couldn't find the locked page\n"); goto out_bits; } - set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_NOFS); + set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_KERNEL); start = test_start; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -199,7 +199,7 @@ static int test_find_delalloc(void) * * We are re-using our test_start from above since it works out well. */ - set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_NOFS); + set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_KERNEL); start = test_start; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -262,7 +262,7 @@ static int test_find_delalloc(void) } ret = 0; out_bits: - clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_NOFS); + clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_KERNEL); out: if (locked_page) page_cache_release(locked_page); @@ -360,7 +360,7 @@ static int test_eb_bitmaps(void) test_msg("Running extent buffer bitmap tests\n"); - bitmap = kmalloc(len, GFP_NOFS); + bitmap = kmalloc(len, GFP_KERNEL); if (!bitmap) { test_msg("Couldn't allocate test bitmap\n"); return -ENOMEM; diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 5de55fdd28bc..e2d3da02deee 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -974,7 +974,7 @@ static int test_extent_accounting(void) (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095, EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0, - NULL, GFP_NOFS); + NULL, GFP_KERNEL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); goto out; @@ -1045,7 +1045,7 @@ static int test_extent_accounting(void) BTRFS_MAX_EXTENT_SIZE+8191, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, - NULL, GFP_NOFS); + NULL, GFP_KERNEL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); goto out; @@ -1079,7 +1079,7 @@ static int test_extent_accounting(void) ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, - NULL, GFP_NOFS); + NULL, GFP_KERNEL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); goto out; @@ -1096,7 +1096,7 @@ out: clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, - NULL, GFP_NOFS); + NULL, GFP_KERNEL); iput(inode); btrfs_free_dummy_root(root); return ret; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 323e12cc9d2f..978c3a810893 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4127,7 +4127,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *path, struct list_head *logged_list, - struct btrfs_log_ctx *ctx) + struct btrfs_log_ctx *ctx, + const u64 start, + const u64 end) { struct extent_map *em, *n; struct list_head extents; @@ -4166,7 +4168,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, } list_sort(NULL, &extents, extent_cmp); - + /* + * Collect any new ordered extents within the range. This is to + * prevent logging file extent items without waiting for the disk + * location they point to being written. We do this only to deal + * with races against concurrent lockless direct IO writes. + */ + btrfs_get_logged_extents(inode, logged_list, start, end); process: while (!list_empty(&extents)) { em = list_entry(extents.next, struct extent_map, list); @@ -4701,7 +4709,7 @@ log_extents: goto out_unlock; } ret = btrfs_log_changed_extents(trans, root, inode, dst_path, - &logged_list, ctx); + &logged_list, ctx, start, end); if (ret) { err = ret; goto out_unlock; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c32abbca9d77..366b335946fa 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -108,7 +108,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { }, }; -const u64 const btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { +const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10, [BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1, [BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP, @@ -233,6 +233,7 @@ static struct btrfs_device *__alloc_device(void) spin_lock_init(&dev->reada_lock); atomic_set(&dev->reada_in_flight, 0); atomic_set(&dev->dev_stats_ccnt, 0); + btrfs_device_data_ordered_init(dev); INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); @@ -1183,7 +1184,7 @@ again: struct map_lookup *map; int i; - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { u64 end; @@ -2755,7 +2756,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, free_extent_map(em); return -EINVAL; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; lock_chunks(root->fs_info->chunk_root); check_system_chunk(trans, extent_root, map->type); unlock_chunks(root->fs_info->chunk_root); @@ -3751,7 +3752,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, if (btrfs_get_num_tolerated_disk_barrier_failures(bctl->meta.target) < btrfs_get_num_tolerated_disk_barrier_failures(bctl->data.target)) { btrfs_warn(fs_info, - "metatdata profile 0x%llx has lower redundancy than data profile 0x%llx", + "metadata profile 0x%llx has lower redundancy than data profile 0x%llx", bctl->meta.target, bctl->data.target); } @@ -4718,7 +4719,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, goto error; } set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); - em->bdev = (struct block_device *)map; + em->map_lookup = map; em->start = start; em->len = num_bytes; em->block_start = 0; @@ -4813,7 +4814,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, return -EINVAL; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; item_size = btrfs_chunk_item_size(map->num_stripes); stripe_size = em->orig_block_len; @@ -4968,7 +4969,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) if (!em) return 1; - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { if (map->stripes[i].dev->missing) { miss_ndevs++; @@ -5048,7 +5049,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) return 1; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) ret = map->num_stripes; else if (map->type & BTRFS_BLOCK_GROUP_RAID10) @@ -5084,7 +5085,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root, BUG_ON(!em); BUG_ON(em->start > logical || em->start + em->len < logical); - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) len = map->stripe_len * nr_data_stripes(map); free_extent_map(em); @@ -5105,7 +5106,7 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, BUG_ON(!em); BUG_ON(em->start > logical || em->start + em->len < logical); - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) ret = 1; free_extent_map(em); @@ -5264,7 +5265,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, return -EINVAL; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; offset = logical - em->start; stripe_len = map->stripe_len; @@ -5378,35 +5379,33 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, * target drive. */ for (i = 0; i < tmp_num_stripes; i++) { - if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) { - /* - * In case of DUP, in order to keep it - * simple, only add the mirror with the - * lowest physical address - */ - if (found && - physical_of_found <= - tmp_bbio->stripes[i].physical) - continue; - index_srcdev = i; - found = 1; - physical_of_found = - tmp_bbio->stripes[i].physical; - } + if (tmp_bbio->stripes[i].dev->devid != srcdev_devid) + continue; + + /* + * In case of DUP, in order to keep it simple, only add + * the mirror with the lowest physical address + */ + if (found && + physical_of_found <= tmp_bbio->stripes[i].physical) + continue; + + index_srcdev = i; + found = 1; + physical_of_found = tmp_bbio->stripes[i].physical; } - if (found) { - mirror_num = index_srcdev + 1; - patch_the_first_stripe_for_dev_replace = 1; - physical_to_patch_in_first_stripe = physical_of_found; - } else { + btrfs_put_bbio(tmp_bbio); + + if (!found) { WARN_ON(1); ret = -EIO; - btrfs_put_bbio(tmp_bbio); goto out; } - btrfs_put_bbio(tmp_bbio); + mirror_num = index_srcdev + 1; + patch_the_first_stripe_for_dev_replace = 1; + physical_to_patch_in_first_stripe = physical_of_found; } else if (mirror_num > map->num_stripes) { mirror_num = 0; } @@ -5806,7 +5805,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, free_extent_map(em); return -EIO; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; length = em->len; rmap_len = map->stripe_len; @@ -6069,7 +6068,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, bbio->fs_info = root->fs_info; atomic_set(&bbio->stripes_pending, bbio->num_stripes); - if (bbio->raid_map) { + if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && + ((rw & WRITE) || (mirror_num > 1))) { /* In this case, map_length has been set to the length of a single stripe; not the whole write */ if (rw & WRITE) { @@ -6210,6 +6210,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, struct extent_map *em; u64 logical; u64 length; + u64 stripe_len; u64 devid; u8 uuid[BTRFS_UUID_SIZE]; int num_stripes; @@ -6218,6 +6219,37 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, logical = key->offset; length = btrfs_chunk_length(leaf, chunk); + stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + /* Validation check */ + if (!num_stripes) { + btrfs_err(root->fs_info, "invalid chunk num_stripes: %u", + num_stripes); + return -EIO; + } + if (!IS_ALIGNED(logical, root->sectorsize)) { + btrfs_err(root->fs_info, + "invalid chunk logical %llu", logical); + return -EIO; + } + if (!length || !IS_ALIGNED(length, root->sectorsize)) { + btrfs_err(root->fs_info, + "invalid chunk length %llu", length); + return -EIO; + } + if (!is_power_of_2(stripe_len)) { + btrfs_err(root->fs_info, "invalid chunk stripe length: %llu", + stripe_len); + return -EIO; + } + if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & + btrfs_chunk_type(leaf, chunk)) { + btrfs_err(root->fs_info, "unrecognized chunk type: %llu", + ~(BTRFS_BLOCK_GROUP_TYPE_MASK | + BTRFS_BLOCK_GROUP_PROFILE_MASK) & + btrfs_chunk_type(leaf, chunk)); + return -EIO; + } read_lock(&map_tree->map_tree.lock); em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); @@ -6234,7 +6266,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, em = alloc_extent_map(); if (!em) return -ENOMEM; - num_stripes = btrfs_chunk_num_stripes(leaf, chunk); map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); if (!map) { free_extent_map(em); @@ -6242,7 +6273,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, } set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); - em->bdev = (struct block_device *)map; + em->map_lookup = map; em->start = logical; em->len = length; em->orig_start = 0; @@ -6944,7 +6975,7 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, /* In order to kick the device replace finish process */ lock_chunks(root); list_for_each_entry(em, &transaction->pending_chunks, list) { - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { dev = map->stripes[i].dev; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index fd953c361a43..6c68d6356197 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -126,7 +126,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans, * locks the inode's i_mutex before calling setxattr or removexattr. */ if (flags & XATTR_REPLACE) { - ASSERT(mutex_is_locked(&inode->i_mutex)); + ASSERT(inode_is_locked(inode)); di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), name, name_len, 0); if (!di) diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index afa023dded5b..675a3332d72f 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -446,7 +446,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object) return 0; cachefiles_begin_secure(cache, &saved_cred); - mutex_lock(&d_inode(object->backer)->i_mutex); + inode_lock(d_inode(object->backer)); /* if there's an extension to a partial page at the end of the backing * file, we need to discard the partial page so that we pick up new @@ -465,7 +465,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object) ret = notify_change(object->backer, &newattrs, NULL); truncate_failed: - mutex_unlock(&d_inode(object->backer)->i_mutex); + inode_unlock(d_inode(object->backer)); cachefiles_end_secure(cache, saved_cred); if (ret == -EIO) { diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index c4b893453e0e..1c2334c163dd 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -295,7 +295,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, cachefiles_mark_object_buried(cache, rep, why); } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); if (ret == -EIO) cachefiles_io_error(cache, "Unlink failed"); @@ -306,7 +306,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, /* directories have to be moved to the graveyard */ _debug("move stale object to graveyard"); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); try_again: /* first step is to make up a grave dentry in the graveyard */ @@ -423,13 +423,13 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, dir = dget_parent(object->dentry); - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); if (test_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->fscache.flags)) { /* object allocation for the same key preemptively deleted this * object's file so that it could create its own file */ _debug("object preemptively buried"); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); ret = 0; } else { /* we need to check that our parent is _still_ our parent - it @@ -442,7 +442,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, /* it got moved, presumably by cachefilesd culling it, * so it's no longer in the key path and we can ignore * it */ - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); ret = 0; } } @@ -501,7 +501,7 @@ lookup_again: /* search the current directory for the element name */ _debug("lookup '%s'", name); - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); start = jiffies; next = lookup_one_len(name, dir, nlen); @@ -585,7 +585,7 @@ lookup_again: /* process the next component */ if (key) { _debug("advance"); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(dir); dir = next; next = NULL; @@ -623,7 +623,7 @@ lookup_again: /* note that we're now using this object */ ret = cachefiles_mark_object_active(cache, object); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(dir); dir = NULL; @@ -705,7 +705,7 @@ lookup_error: cachefiles_io_error(cache, "Lookup failed"); next = NULL; error: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(next); error_out2: dput(dir); @@ -729,7 +729,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, _enter(",,%s", dirname); /* search the current directory for the element name */ - mutex_lock(&d_inode(dir)->i_mutex); + inode_lock(d_inode(dir)); start = jiffies; subdir = lookup_one_len(dirname, dir, strlen(dirname)); @@ -768,7 +768,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, d_backing_inode(subdir)->i_ino); } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); /* we need to make sure the subdir is a directory */ ASSERT(d_backing_inode(subdir)); @@ -800,19 +800,19 @@ check_error: return ERR_PTR(ret); mkdir_error: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(subdir); pr_err("mkdir %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); lookup_error: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); ret = PTR_ERR(subdir); pr_err("Lookup %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); nomem_d_alloc: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); _leave(" = -ENOMEM"); return ERR_PTR(-ENOMEM); } @@ -837,7 +837,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, // dir, filename); /* look up the victim */ - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); start = jiffies; victim = lookup_one_len(filename, dir, strlen(filename)); @@ -852,7 +852,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, * at the netfs's request whilst the cull was in progress */ if (d_is_negative(victim)) { - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(victim); _leave(" = -ENOENT [absent]"); return ERR_PTR(-ENOENT); @@ -881,13 +881,13 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, object_in_use: read_unlock(&cache->active_lock); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(victim); //_leave(" = -EBUSY [in use]"); return ERR_PTR(-EBUSY); lookup_error: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); ret = PTR_ERR(victim); if (ret == -ENOENT) { /* file or dir now absent - probably retired by netfs */ @@ -947,7 +947,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, return 0; error_unlock: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); error: dput(victim); if (ret == -ENOENT) { @@ -982,7 +982,7 @@ int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, if (IS_ERR(victim)) return PTR_ERR(victim); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(victim); //_leave(" = 0"); return 0; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index b7d218a168fb..c22213789090 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1108,7 +1108,7 @@ retry_locked: return 0; /* past end of file? */ - i_size = inode->i_size; /* caller holds i_mutex */ + i_size = i_size_read(inode); if (page_off >= i_size || (pos_in_page == 0 && (pos+len) >= i_size && @@ -1149,7 +1149,6 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, page = grab_cache_page_write_begin(mapping, index, 0); if (!page) return -ENOMEM; - *pagep = page; dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len); @@ -1184,8 +1183,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, zero_user_segment(page, from+copied, len); /* did file size increase? */ - /* (no need for i_size_read(); we caller holds i_mutex */ - if (pos+copied > inode->i_size) + if (pos+copied > i_size_read(inode)) check_cap = ceph_inode_set_size(inode, pos+copied); if (!PageUptodate(page)) @@ -1378,11 +1376,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = VM_FAULT_NOPAGE; if ((off > size) || - (page->mapping != inode->i_mapping)) + (page->mapping != inode->i_mapping)) { + unlock_page(page); goto out; + } ret = ceph_update_writeable_page(vma->vm_file, off, len, page); - if (ret == 0) { + if (ret >= 0) { /* success. we'll keep the page locked. */ set_page_dirty(page); ret = VM_FAULT_LOCKED; @@ -1393,8 +1393,6 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = VM_FAULT_SIGBUS; } out: - if (ret != VM_FAULT_LOCKED) - unlock_page(page); if (ret == VM_FAULT_LOCKED || ci->i_inline_version != CEPH_INLINE_NONE) { int dirty; diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index a4766ded1ba7..a351480dbabc 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -106,7 +106,7 @@ static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data, memset(&aux, 0, sizeof(aux)); aux.mtime = inode->i_mtime; - aux.size = inode->i_size; + aux.size = i_size_read(inode); memcpy(buffer, &aux, sizeof(aux)); @@ -117,9 +117,7 @@ static void ceph_fscache_inode_get_attr(const void *cookie_netfs_data, uint64_t *size) { const struct ceph_inode_info* ci = cookie_netfs_data; - const struct inode* inode = &ci->vfs_inode; - - *size = inode->i_size; + *size = i_size_read(&ci->vfs_inode); } static enum fscache_checkaux ceph_fscache_inode_check_aux( @@ -134,7 +132,7 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux( memset(&aux, 0, sizeof(aux)); aux.mtime = inode->i_mtime; - aux.size = inode->i_size; + aux.size = i_size_read(inode); if (memcmp(data, &aux, sizeof(aux)) != 0) return FSCACHE_CHECKAUX_OBSOLETE; @@ -197,7 +195,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, return; /* Avoid multiple racing open requests */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (ci->fscache) goto done; @@ -207,7 +205,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, ci, true); fscache_check_consistency(ci->fscache); done: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index c69e1253b47b..cdbf8cf3d52c 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2030,7 +2030,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (datasync) goto out; - mutex_lock(&inode->i_mutex); + inode_lock(inode); dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); @@ -2046,7 +2046,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out: dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); return ret; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 9314b4ea2375..fd11fb231a2e 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -507,7 +507,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset); loff_t retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = -EINVAL; switch (whence) { case SEEK_CUR: @@ -542,7 +542,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) } } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return retval; } diff --git a/fs/ceph/export.c b/fs/ceph/export.c index fe02ae7f056a..3b3172357326 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -215,7 +215,7 @@ static int ceph_get_name(struct dentry *parent, char *name, if (IS_ERR(req)) return PTR_ERR(req); - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); req->r_inode = d_inode(child); ihold(d_inode(child)); @@ -224,7 +224,7 @@ static int ceph_get_name(struct dentry *parent, char *name, req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); if (!err) { struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3c68e6aee2f0..86a9c383955e 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -397,8 +397,9 @@ int ceph_release(struct inode *inode, struct file *file) } enum { - CHECK_EOF = 1, - READ_INLINE = 2, + HAVE_RETRIED = 1, + CHECK_EOF = 2, + READ_INLINE = 3, }; /* @@ -411,17 +412,15 @@ enum { static int striped_read(struct inode *inode, u64 off, u64 len, struct page **pages, int num_pages, - int *checkeof, bool o_direct, - unsigned long buf_align) + int *checkeof) { struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u64 pos, this_len, left; - int io_align, page_align; - int pages_left; - int read; + loff_t i_size; + int page_align, pages_left; + int read, ret; struct page **page_pos; - int ret; bool hit_stripe, was_short; /* @@ -432,13 +431,9 @@ static int striped_read(struct inode *inode, page_pos = pages; pages_left = num_pages; read = 0; - io_align = off & ~PAGE_MASK; more: - if (o_direct) - page_align = (pos - io_align + buf_align) & ~PAGE_MASK; - else - page_align = pos & ~PAGE_MASK; + page_align = pos & ~PAGE_MASK; this_len = left; ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), &ci->i_layout, pos, &this_len, @@ -452,13 +447,12 @@ more: dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read, ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); + i_size = i_size_read(inode); if (ret >= 0) { int didpages; - if (was_short && (pos + ret < inode->i_size)) { - int zlen = min(this_len - ret, - inode->i_size - pos - ret); - int zoff = (o_direct ? buf_align : io_align) + - read + ret; + if (was_short && (pos + ret < i_size)) { + int zlen = min(this_len - ret, i_size - pos - ret); + int zoff = (off & ~PAGE_MASK) + read + ret; dout(" zero gap %llu to %llu\n", pos + ret, pos + ret + zlen); ceph_zero_page_vector_range(zoff, zlen, pages); @@ -473,14 +467,14 @@ more: pages_left -= didpages; /* hit stripe and need continue*/ - if (left && hit_stripe && pos < inode->i_size) + if (left && hit_stripe && pos < i_size) goto more; } if (read > 0) { ret = read; /* did we bounce off eof? */ - if (pos + left > inode->i_size) + if (pos + left > i_size) *checkeof = CHECK_EOF; } @@ -521,54 +515,28 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, if (ret < 0) return ret; - if (iocb->ki_flags & IOCB_DIRECT) { - while (iov_iter_count(i)) { - size_t start; - ssize_t n; - - n = dio_get_pagev_size(i); - pages = dio_get_pages_alloc(i, n, &start, &num_pages); - if (IS_ERR(pages)) - return PTR_ERR(pages); - - ret = striped_read(inode, off, n, - pages, num_pages, checkeof, - 1, start); - - ceph_put_page_vector(pages, num_pages, true); - - if (ret <= 0) - break; - off += ret; - iov_iter_advance(i, ret); - if (ret < n) + num_pages = calc_pages_for(off, len); + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); + if (IS_ERR(pages)) + return PTR_ERR(pages); + ret = striped_read(inode, off, len, pages, + num_pages, checkeof); + if (ret > 0) { + int l, k = 0; + size_t left = ret; + + while (left) { + size_t page_off = off & ~PAGE_MASK; + size_t copy = min_t(size_t, left, + PAGE_SIZE - page_off); + l = copy_page_to_iter(pages[k++], page_off, copy, i); + off += l; + left -= l; + if (l < copy) break; } - } else { - num_pages = calc_pages_for(off, len); - pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); - if (IS_ERR(pages)) - return PTR_ERR(pages); - ret = striped_read(inode, off, len, pages, - num_pages, checkeof, 0, 0); - if (ret > 0) { - int l, k = 0; - size_t left = ret; - - while (left) { - size_t page_off = off & ~PAGE_MASK; - size_t copy = min_t(size_t, - PAGE_SIZE - page_off, left); - l = copy_page_to_iter(pages[k++], page_off, - copy, i); - off += l; - left -= l; - if (l < copy) - break; - } - } - ceph_release_page_vector(pages, num_pages); } + ceph_release_page_vector(pages, num_pages); if (off > iocb->ki_pos) { ret = off - iocb->ki_pos; @@ -579,6 +547,193 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, return ret; } +struct ceph_aio_request { + struct kiocb *iocb; + size_t total_len; + int write; + int error; + struct list_head osd_reqs; + unsigned num_reqs; + atomic_t pending_reqs; + struct timespec mtime; + struct ceph_cap_flush *prealloc_cf; +}; + +struct ceph_aio_work { + struct work_struct work; + struct ceph_osd_request *req; +}; + +static void ceph_aio_retry_work(struct work_struct *work); + +static void ceph_aio_complete(struct inode *inode, + struct ceph_aio_request *aio_req) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + if (!atomic_dec_and_test(&aio_req->pending_reqs)) + return; + + ret = aio_req->error; + if (!ret) + ret = aio_req->total_len; + + dout("ceph_aio_complete %p rc %d\n", inode, ret); + + if (ret >= 0 && aio_req->write) { + int dirty; + + loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len; + if (endoff > i_size_read(inode)) { + if (ceph_inode_set_size(inode, endoff)) + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); + } + + spin_lock(&ci->i_ceph_lock); + ci->i_inline_version = CEPH_INLINE_NONE; + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, + &aio_req->prealloc_cf); + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + + } + + ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR : + CEPH_CAP_FILE_RD)); + + aio_req->iocb->ki_complete(aio_req->iocb, ret, 0); + + ceph_free_cap_flush(aio_req->prealloc_cf); + kfree(aio_req); +} + +static void ceph_aio_complete_req(struct ceph_osd_request *req, + struct ceph_msg *msg) +{ + int rc = req->r_result; + struct inode *inode = req->r_inode; + struct ceph_aio_request *aio_req = req->r_priv; + struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); + int num_pages = calc_pages_for((u64)osd_data->alignment, + osd_data->length); + + dout("ceph_aio_complete_req %p rc %d bytes %llu\n", + inode, rc, osd_data->length); + + if (rc == -EOLDSNAPC) { + struct ceph_aio_work *aio_work; + BUG_ON(!aio_req->write); + + aio_work = kmalloc(sizeof(*aio_work), GFP_NOFS); + if (aio_work) { + INIT_WORK(&aio_work->work, ceph_aio_retry_work); + aio_work->req = req; + queue_work(ceph_inode_to_client(inode)->wb_wq, + &aio_work->work); + return; + } + rc = -ENOMEM; + } else if (!aio_req->write) { + if (rc == -ENOENT) + rc = 0; + if (rc >= 0 && osd_data->length > rc) { + int zoff = osd_data->alignment + rc; + int zlen = osd_data->length - rc; + /* + * If read is satisfied by single OSD request, + * it can pass EOF. Otherwise read is within + * i_size. + */ + if (aio_req->num_reqs == 1) { + loff_t i_size = i_size_read(inode); + loff_t endoff = aio_req->iocb->ki_pos + rc; + if (endoff < i_size) + zlen = min_t(size_t, zlen, + i_size - endoff); + aio_req->total_len = rc + zlen; + } + + if (zlen > 0) + ceph_zero_page_vector_range(zoff, zlen, + osd_data->pages); + } + } + + ceph_put_page_vector(osd_data->pages, num_pages, false); + ceph_osdc_put_request(req); + + if (rc < 0) + cmpxchg(&aio_req->error, 0, rc); + + ceph_aio_complete(inode, aio_req); + return; +} + +static void ceph_aio_retry_work(struct work_struct *work) +{ + struct ceph_aio_work *aio_work = + container_of(work, struct ceph_aio_work, work); + struct ceph_osd_request *orig_req = aio_work->req; + struct ceph_aio_request *aio_req = orig_req->r_priv; + struct inode *inode = orig_req->r_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_snap_context *snapc; + struct ceph_osd_request *req; + int ret; + + spin_lock(&ci->i_ceph_lock); + if (__ceph_have_pending_cap_snap(ci)) { + struct ceph_cap_snap *capsnap = + list_last_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + snapc = ceph_get_snap_context(capsnap->context); + } else { + BUG_ON(!ci->i_head_snapc); + snapc = ceph_get_snap_context(ci->i_head_snapc); + } + spin_unlock(&ci->i_ceph_lock); + + req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2, + false, GFP_NOFS); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + req = orig_req; + goto out; + } + + req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | + CEPH_OSD_FLAG_ONDISK | + CEPH_OSD_FLAG_WRITE; + req->r_base_oloc = orig_req->r_base_oloc; + req->r_base_oid = orig_req->r_base_oid; + + req->r_ops[0] = orig_req->r_ops[0]; + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); + + ceph_osdc_build_request(req, req->r_ops[0].extent.offset, + snapc, CEPH_NOSNAP, &aio_req->mtime); + + ceph_put_snap_context(snapc); + ceph_osdc_put_request(orig_req); + + req->r_callback = ceph_aio_complete_req; + req->r_inode = inode; + req->r_priv = aio_req; + + ret = ceph_osdc_start_request(req->r_osdc, req, false); +out: + if (ret < 0) { + BUG_ON(ret == -EOLDSNAPC); + req->r_result = ret; + ceph_aio_complete_req(req, NULL); + } + + kfree(aio_work); +} + /* * Write commit request unsafe callback, called to tell us when a * request is unsafe (that is, in flight--has been handed to the @@ -612,16 +767,10 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) } -/* - * Synchronous write, straight from __user pointer or user pages. - * - * If write spans object boundary, just do multiple writes. (For a - * correct atomic write, we should e.g. take write locks on all - * objects, rollback on failure, etc.) - */ static ssize_t -ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, - struct ceph_snap_context *snapc) +ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, + struct ceph_snap_context *snapc, + struct ceph_cap_flush **pcf) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); @@ -630,44 +779,52 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, struct ceph_vino vino; struct ceph_osd_request *req; struct page **pages; - int num_pages; - int written = 0; + struct ceph_aio_request *aio_req = NULL; + int num_pages = 0; int flags; - int check_caps = 0; int ret; struct timespec mtime = CURRENT_TIME; - size_t count = iov_iter_count(from); + size_t count = iov_iter_count(iter); + loff_t pos = iocb->ki_pos; + bool write = iov_iter_rw(iter) == WRITE; - if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) + if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; - dout("sync_direct_write on file %p %lld~%u\n", file, pos, - (unsigned)count); + dout("sync_direct_read_write (%s) on file %p %lld~%u\n", + (write ? "write" : "read"), file, pos, (unsigned)count); ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); if (ret < 0) return ret; - ret = invalidate_inode_pages2_range(inode->i_mapping, - pos >> PAGE_CACHE_SHIFT, - (pos + count) >> PAGE_CACHE_SHIFT); - if (ret < 0) - dout("invalidate_inode_pages2_range returned %d\n", ret); + if (write) { + ret = invalidate_inode_pages2_range(inode->i_mapping, + pos >> PAGE_CACHE_SHIFT, + (pos + count) >> PAGE_CACHE_SHIFT); + if (ret < 0) + dout("invalidate_inode_pages2_range returned %d\n", ret); - flags = CEPH_OSD_FLAG_ORDERSNAP | - CEPH_OSD_FLAG_ONDISK | - CEPH_OSD_FLAG_WRITE; + flags = CEPH_OSD_FLAG_ORDERSNAP | + CEPH_OSD_FLAG_ONDISK | + CEPH_OSD_FLAG_WRITE; + } else { + flags = CEPH_OSD_FLAG_READ; + } - while (iov_iter_count(from) > 0) { - u64 len = dio_get_pagev_size(from); - size_t start; - ssize_t n; + while (iov_iter_count(iter) > 0) { + u64 size = dio_get_pagev_size(iter); + size_t start = 0; + ssize_t len; vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - vino, pos, &len, 0, - 2,/*include a 'startsync' command*/ - CEPH_OSD_OP_WRITE, flags, snapc, + vino, pos, &size, 0, + /*include a 'startsync' command*/ + write ? 2 : 1, + write ? CEPH_OSD_OP_WRITE : + CEPH_OSD_OP_READ, + flags, snapc, ci->i_truncate_seq, ci->i_truncate_size, false); @@ -676,10 +833,8 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, break; } - osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); - - n = len; - pages = dio_get_pages_alloc(from, len, &start, &num_pages); + len = size; + pages = dio_get_pages_alloc(iter, len, &start, &num_pages); if (IS_ERR(pages)) { ceph_osdc_put_request(req); ret = PTR_ERR(pages); @@ -687,47 +842,128 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, } /* - * throw out any page cache pages in this range. this - * may block. + * To simplify error handling, allow AIO when IO within i_size + * or IO can be satisfied by single OSD request. */ - truncate_inode_pages_range(inode->i_mapping, pos, - (pos+n) | (PAGE_CACHE_SIZE-1)); - osd_req_op_extent_osd_data_pages(req, 0, pages, n, start, - false, false); + if (pos == iocb->ki_pos && !is_sync_kiocb(iocb) && + (len == count || pos + count <= i_size_read(inode))) { + aio_req = kzalloc(sizeof(*aio_req), GFP_KERNEL); + if (aio_req) { + aio_req->iocb = iocb; + aio_req->write = write; + INIT_LIST_HEAD(&aio_req->osd_reqs); + if (write) { + aio_req->mtime = mtime; + swap(aio_req->prealloc_cf, *pcf); + } + } + /* ignore error */ + } + + if (write) { + /* + * throw out any page cache pages in this range. this + * may block. + */ + truncate_inode_pages_range(inode->i_mapping, pos, + (pos+len) | (PAGE_CACHE_SIZE - 1)); + + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); + } + + + osd_req_op_extent_osd_data_pages(req, 0, pages, len, start, + false, false); - /* BUG_ON(vino.snap != CEPH_NOSNAP); */ ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); - ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (aio_req) { + aio_req->total_len += len; + aio_req->num_reqs++; + atomic_inc(&aio_req->pending_reqs); + + req->r_callback = ceph_aio_complete_req; + req->r_inode = inode; + req->r_priv = aio_req; + list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs); + + pos += len; + iov_iter_advance(iter, len); + continue; + } + + ret = ceph_osdc_start_request(req->r_osdc, req, false); if (!ret) ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + size = i_size_read(inode); + if (!write) { + if (ret == -ENOENT) + ret = 0; + if (ret >= 0 && ret < len && pos + ret < size) { + int zlen = min_t(size_t, len - ret, + size - pos - ret); + ceph_zero_page_vector_range(start + ret, zlen, + pages); + ret += zlen; + } + if (ret >= 0) + len = ret; + } + ceph_put_page_vector(pages, num_pages, false); ceph_osdc_put_request(req); - if (ret) + if (ret < 0) break; - pos += n; - written += n; - iov_iter_advance(from, n); - if (pos > i_size_read(inode)) { - check_caps = ceph_inode_set_size(inode, pos); - if (check_caps) + pos += len; + iov_iter_advance(iter, len); + + if (!write && pos >= size) + break; + + if (write && pos > size) { + if (ceph_inode_set_size(inode, pos)) ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); } } - if (ret != -EOLDSNAPC && written > 0) { + if (aio_req) { + if (aio_req->num_reqs == 0) { + kfree(aio_req); + return ret; + } + + ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR : + CEPH_CAP_FILE_RD); + + while (!list_empty(&aio_req->osd_reqs)) { + req = list_first_entry(&aio_req->osd_reqs, + struct ceph_osd_request, + r_unsafe_item); + list_del_init(&req->r_unsafe_item); + if (ret >= 0) + ret = ceph_osdc_start_request(req->r_osdc, + req, false); + if (ret < 0) { + BUG_ON(ret == -EOLDSNAPC); + req->r_result = ret; + ceph_aio_complete_req(req, NULL); + } + } + return -EIOCBQUEUED; + } + + if (ret != -EOLDSNAPC && pos > iocb->ki_pos) { + ret = pos - iocb->ki_pos; iocb->ki_pos = pos; - ret = written; } return ret; } - /* * Synchronous write, straight from __user pointer or user pages. * @@ -897,8 +1133,14 @@ again: ceph_cap_string(got)); if (ci->i_inline_version == CEPH_INLINE_NONE) { - /* hmm, this isn't really async... */ - ret = ceph_sync_read(iocb, to, &retry_op); + if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { + ret = ceph_direct_read_write(iocb, to, + NULL, NULL); + if (ret >= 0 && ret < len) + retry_op = CHECK_EOF; + } else { + ret = ceph_sync_read(iocb, to, &retry_op); + } } else { retry_op = READ_INLINE; } @@ -916,7 +1158,7 @@ again: pinned_page = NULL; } ceph_put_cap_refs(ci, got); - if (retry_op && ret >= 0) { + if (retry_op > HAVE_RETRIED && ret >= 0) { int statret; struct page *page = NULL; loff_t i_size; @@ -968,12 +1210,11 @@ again: if (retry_op == CHECK_EOF && iocb->ki_pos < i_size && ret < len) { dout("sync_read hit hole, ppos %lld < size %lld" - ", reading more\n", iocb->ki_pos, - inode->i_size); + ", reading more\n", iocb->ki_pos, i_size); read += ret; len -= ret; - retry_op = 0; + retry_op = HAVE_RETRIED; goto again; } } @@ -1014,7 +1255,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) if (!prealloc_cf) return -ENOMEM; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); @@ -1052,7 +1293,7 @@ retry_snap: } dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n", - inode, ceph_vinop(inode), pos, count, inode->i_size); + inode, ceph_vinop(inode), pos, count, i_size_read(inode)); if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; else @@ -1070,7 +1311,7 @@ retry_snap: (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) { struct ceph_snap_context *snapc; struct iov_iter data; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); spin_lock(&ci->i_ceph_lock); if (__ceph_have_pending_cap_snap(ci)) { @@ -1088,8 +1329,8 @@ retry_snap: /* we might need to revert back to that point */ data = *from; if (iocb->ki_flags & IOCB_DIRECT) - written = ceph_sync_direct_write(iocb, &data, pos, - snapc); + written = ceph_direct_read_write(iocb, &data, snapc, + &prealloc_cf); else written = ceph_sync_write(iocb, &data, pos, snapc); if (written == -EOLDSNAPC) { @@ -1097,14 +1338,14 @@ retry_snap: "got EOLDSNAPC, retrying\n", inode, ceph_vinop(inode), pos, (unsigned)count); - mutex_lock(&inode->i_mutex); + inode_lock(inode); goto retry_snap; } if (written > 0) iov_iter_advance(from, written); ceph_put_snap_context(snapc); } else { - loff_t old_size = inode->i_size; + loff_t old_size = i_size_read(inode); /* * No need to acquire the i_truncate_mutex. Because * the MDS revokes Fwb caps before sending truncate @@ -1115,9 +1356,9 @@ retry_snap: written = generic_perform_write(file, from, pos); if (likely(written >= 0)) iocb->ki_pos = pos + written; - if (inode->i_size > old_size) + if (i_size_read(inode) > old_size) ceph_fscache_update_objectsize(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } if (written >= 0) { @@ -1147,7 +1388,7 @@ retry_snap: goto out_unlocked; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out_unlocked: ceph_free_cap_flush(prealloc_cf); current->backing_dev_info = NULL; @@ -1160,9 +1401,10 @@ out_unlocked: static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; + loff_t i_size; int ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); @@ -1172,9 +1414,10 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) } } + i_size = i_size_read(inode); switch (whence) { case SEEK_END: - offset += inode->i_size; + offset += i_size; break; case SEEK_CUR: /* @@ -1190,24 +1433,24 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) offset += file->f_pos; break; case SEEK_DATA: - if (offset >= inode->i_size) { + if (offset >= i_size) { ret = -ENXIO; goto out; } break; case SEEK_HOLE: - if (offset >= inode->i_size) { + if (offset >= i_size) { ret = -ENXIO; goto out; } - offset = inode->i_size; + offset = i_size; break; } offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return offset; } @@ -1363,7 +1606,7 @@ static long ceph_fallocate(struct file *file, int mode, if (!prealloc_cf) return -ENOMEM; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (ceph_snap(inode) != CEPH_NOSNAP) { ret = -EROFS; @@ -1418,7 +1661,7 @@ static long ceph_fallocate(struct file *file, int mode, ceph_put_cap_refs(ci, got); unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ceph_free_cap_flush(prealloc_cf); return ret; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index da55eb8bcffa..fb4ba2e4e2a5 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -548,7 +548,7 @@ int ceph_fill_file_size(struct inode *inode, int issued, if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 || (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) { dout("size %lld -> %llu\n", inode->i_size, size); - inode->i_size = size; + i_size_write(inode, size); inode->i_blocks = (size + (1<<9) - 1) >> 9; ci->i_reported_size = size; if (truncate_seq != ci->i_truncate_seq) { @@ -808,7 +808,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, spin_unlock(&ci->i_ceph_lock); err = -EINVAL; - if (WARN_ON(symlen != inode->i_size)) + if (WARN_ON(symlen != i_size_read(inode))) goto out; err = -ENOMEM; @@ -1549,7 +1549,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) spin_lock(&ci->i_ceph_lock); dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); - inode->i_size = size; + i_size_write(inode, size); inode->i_blocks = (size + (1 << 9) - 1) >> 9; /* tell the MDS if we are approaching max_size */ @@ -1911,7 +1911,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) inode->i_size, attr->ia_size); if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size > inode->i_size) { - inode->i_size = attr->ia_size; + i_size_write(inode, attr->ia_size); inode->i_blocks = (attr->ia_size + (1 << 9) - 1) >> 9; inode->i_ctime = attr->ia_ctime; diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 7febcf2475c5..50b268483302 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -50,7 +50,7 @@ void cifs_vfs_err(const char *fmt, ...) vaf.fmt = fmt; vaf.va = &args; - pr_err("CIFS VFS: %pV", &vaf); + pr_err_ratelimited("CIFS VFS: %pV", &vaf); va_end(args); } diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index f40fbaca1b2a..66cf0f9fff89 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -51,14 +51,13 @@ __printf(1, 2) void cifs_vfs_err(const char *fmt, ...); /* information message: e.g., configuration, major event */ #define cifs_dbg(type, fmt, ...) \ do { \ - if (type == FYI) { \ - if (cifsFYI & CIFS_INFO) { \ - pr_debug("%s: " fmt, __FILE__, ##__VA_ARGS__); \ - } \ + if (type == FYI && cifsFYI & CIFS_INFO) { \ + pr_debug_ratelimited("%s: " \ + fmt, __FILE__, ##__VA_ARGS__); \ } else if (type == VFS) { \ cifs_vfs_err(fmt, ##__VA_ARGS__); \ } else if (type == NOISY && type != 0) { \ - pr_debug(fmt, ##__VA_ARGS__); \ + pr_debug_ratelimited(fmt, ##__VA_ARGS__); \ } \ } while (0) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index c4c1169814b2..c48ca13673e3 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -507,6 +507,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",rsize=%u", cifs_sb->rsize); seq_printf(s, ",wsize=%u", cifs_sb->wsize); + seq_printf(s, ",echo_interval=%lu", + tcon->ses->server->echo_interval / HZ); /* convert actimeo and display it in seconds */ seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ); @@ -640,9 +642,9 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) while (*s && *s != sep) s++; - mutex_lock(&dir->i_mutex); + inode_lock(dir); child = lookup_one_len(p, dentry, s - p); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); dput(dentry); dentry = child; } while (!IS_ERR(dentry)); @@ -752,6 +754,9 @@ cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter) ssize_t rc; struct inode *inode = file_inode(iocb->ki_filp); + if (iocb->ki_filp->f_flags & O_DIRECT) + return cifs_user_readv(iocb, iter); + rc = cifs_revalidate_mapping(inode); if (rc) return rc; @@ -766,6 +771,18 @@ static ssize_t cifs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t written; int rc; + if (iocb->ki_filp->f_flags & O_DIRECT) { + written = cifs_user_writev(iocb, from); + if (written > 0 && CIFS_CACHE_READ(cinode)) { + cifs_zap_mapping(inode); + cifs_dbg(FYI, + "Set no oplock for inode=%p after a write operation\n", + inode); + cinode->oplock = 0; + } + return written; + } + written = cifs_get_writer(cinode); if (written) return written; diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 2b510c537a0d..a25b2513f146 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -70,8 +70,10 @@ #define SERVER_NAME_LENGTH 40 #define SERVER_NAME_LEN_WITH_NULL (SERVER_NAME_LENGTH + 1) -/* SMB echo "timeout" -- FIXME: tunable? */ -#define SMB_ECHO_INTERVAL (60 * HZ) +/* echo interval in seconds */ +#define SMB_ECHO_INTERVAL_MIN 1 +#define SMB_ECHO_INTERVAL_MAX 600 +#define SMB_ECHO_INTERVAL_DEFAULT 60 #include "cifspdu.h" @@ -225,7 +227,7 @@ struct smb_version_operations { void (*print_stats)(struct seq_file *m, struct cifs_tcon *); void (*dump_share_caps)(struct seq_file *, struct cifs_tcon *); /* verify the message */ - int (*check_message)(char *, unsigned int); + int (*check_message)(char *, unsigned int, struct TCP_Server_Info *); bool (*is_oplock_break)(char *, struct TCP_Server_Info *); void (*downgrade_oplock)(struct TCP_Server_Info *, struct cifsInodeInfo *, bool); @@ -507,6 +509,7 @@ struct smb_vol { struct sockaddr_storage dstaddr; /* destination address */ struct sockaddr_storage srcaddr; /* allow binding to a local IP */ struct nls_table *local_nls; + unsigned int echo_interval; /* echo interval in secs */ }; #define CIFS_MOUNT_MASK (CIFS_MOUNT_NO_PERM | CIFS_MOUNT_SET_UID | \ @@ -627,7 +630,9 @@ struct TCP_Server_Info { #ifdef CONFIG_CIFS_SMB2 unsigned int max_read; unsigned int max_write; + __u8 preauth_hash[512]; #endif /* CONFIG_CIFS_SMB2 */ + unsigned long echo_interval; }; static inline unsigned int @@ -809,7 +814,10 @@ struct cifs_ses { bool need_reconnect:1; /* connection reset, uid now invalid */ #ifdef CONFIG_CIFS_SMB2 __u16 session_flags; - char smb3signingkey[SMB3_SIGN_KEY_SIZE]; /* for signing smb3 packets */ + __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE]; + __u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE]; + __u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE]; + __u8 preauth_hash[512]; #endif /* CONFIG_CIFS_SMB2 */ }; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index c63fd1dde25b..eed7ff50faf0 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -102,7 +102,7 @@ extern int SendReceiveBlockingLock(const unsigned int xid, struct smb_hdr *out_buf, int *bytes_returned); extern int cifs_reconnect(struct TCP_Server_Info *server); -extern int checkSMB(char *buf, unsigned int length); +extern int checkSMB(char *buf, unsigned int len, struct TCP_Server_Info *srvr); extern bool is_valid_oplock_break(char *, struct TCP_Server_Info *); extern bool backup_cred(struct cifs_sb_info *); extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); @@ -439,7 +439,8 @@ extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *); extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *); extern void cifs_crypto_shash_release(struct TCP_Server_Info *); extern int calc_seckey(struct cifs_ses *); -extern int generate_smb3signingkey(struct cifs_ses *); +extern int generate_smb30signingkey(struct cifs_ses *); +extern int generate_smb311signingkey(struct cifs_ses *); #ifdef CONFIG_CIFS_WEAK_PW_HASH extern int calc_lanman_hash(const char *password, const char *cryptkey, diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index ecb0803bdb0e..4fbd92d2e113 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -95,6 +95,7 @@ enum { Opt_cruid, Opt_gid, Opt_file_mode, Opt_dirmode, Opt_port, Opt_rsize, Opt_wsize, Opt_actimeo, + Opt_echo_interval, /* Mount options which take string value */ Opt_user, Opt_pass, Opt_ip, @@ -188,6 +189,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_rsize, "rsize=%s" }, { Opt_wsize, "wsize=%s" }, { Opt_actimeo, "actimeo=%s" }, + { Opt_echo_interval, "echo_interval=%s" }, { Opt_blank_user, "user=" }, { Opt_blank_user, "username=" }, @@ -368,7 +370,6 @@ cifs_reconnect(struct TCP_Server_Info *server) server->session_key.response = NULL; server->session_key.len = 0; server->lstrp = jiffies; - mutex_unlock(&server->srv_mutex); /* mark submitted MIDs for retry and issue callback */ INIT_LIST_HEAD(&retry_list); @@ -381,6 +382,7 @@ cifs_reconnect(struct TCP_Server_Info *server) list_move(&mid_entry->qhead, &retry_list); } spin_unlock(&GlobalMid_Lock); + mutex_unlock(&server->srv_mutex); cifs_dbg(FYI, "%s: issuing mid callbacks\n", __func__); list_for_each_safe(tmp, tmp2, &retry_list) { @@ -418,6 +420,7 @@ cifs_echo_request(struct work_struct *work) int rc; struct TCP_Server_Info *server = container_of(work, struct TCP_Server_Info, echo.work); + unsigned long echo_interval = server->echo_interval; /* * We cannot send an echo if it is disabled or until the @@ -427,7 +430,7 @@ cifs_echo_request(struct work_struct *work) */ if (!server->ops->need_neg || server->ops->need_neg(server) || (server->ops->can_echo && !server->ops->can_echo(server)) || - time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ)) + time_before(jiffies, server->lstrp + echo_interval - HZ)) goto requeue_echo; rc = server->ops->echo ? server->ops->echo(server) : -ENOSYS; @@ -436,7 +439,7 @@ cifs_echo_request(struct work_struct *work) server->hostname); requeue_echo: - queue_delayed_work(cifsiod_wq, &server->echo, SMB_ECHO_INTERVAL); + queue_delayed_work(cifsiod_wq, &server->echo, echo_interval); } static bool @@ -487,9 +490,9 @@ server_unresponsive(struct TCP_Server_Info *server) * a response in >60s. */ if (server->tcpStatus == CifsGood && - time_after(jiffies, server->lstrp + 2 * SMB_ECHO_INTERVAL)) { - cifs_dbg(VFS, "Server %s has not responded in %d seconds. Reconnecting...\n", - server->hostname, (2 * SMB_ECHO_INTERVAL) / HZ); + time_after(jiffies, server->lstrp + 2 * server->echo_interval)) { + cifs_dbg(VFS, "Server %s has not responded in %lu seconds. Reconnecting...\n", + server->hostname, (2 * server->echo_interval) / HZ); cifs_reconnect(server); wake_up(&server->response_q); return true; @@ -828,7 +831,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) * 48 bytes is enough to display the header and a little bit * into the payload for debugging purposes. */ - length = server->ops->check_message(buf, server->total_read); + length = server->ops->check_message(buf, server->total_read, server); if (length != 0) cifs_dump_mem("Bad SMB: ", buf, min_t(unsigned int, server->total_read, 48)); @@ -1624,6 +1627,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, goto cifs_parse_mount_err; } break; + case Opt_echo_interval: + if (get_option_ul(args, &option)) { + cifs_dbg(VFS, "%s: Invalid echo interval value\n", + __func__); + goto cifs_parse_mount_err; + } + vol->echo_interval = option; + break; /* String Arguments */ @@ -2089,6 +2100,9 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol) if (!match_security(server, vol)) return 0; + if (server->echo_interval != vol->echo_interval) + return 0; + return 1; } @@ -2208,6 +2222,12 @@ cifs_get_tcp_session(struct smb_vol *volume_info) tcp_ses->tcpStatus = CifsNew; ++tcp_ses->srv_count; + if (volume_info->echo_interval >= SMB_ECHO_INTERVAL_MIN && + volume_info->echo_interval <= SMB_ECHO_INTERVAL_MAX) + tcp_ses->echo_interval = volume_info->echo_interval * HZ; + else + tcp_ses->echo_interval = SMB_ECHO_INTERVAL_DEFAULT * HZ; + rc = ip_connect(tcp_ses); if (rc < 0) { cifs_dbg(VFS, "Error connecting to socket. Aborting operation.\n"); @@ -2237,7 +2257,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) cifs_fscache_get_client_cookie(tcp_ses); /* queue echo request delayed work */ - queue_delayed_work(cifsiod_wq, &tcp_ses->echo, SMB_ECHO_INTERVAL); + queue_delayed_work(cifsiod_wq, &tcp_ses->echo, tcp_ses->echo_interval); return tcp_ses; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 0a2752b79e72..ff882aeaccc6 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2267,7 +2267,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, rc = filemap_write_and_wait_range(inode->i_mapping, start, end); if (rc) return rc; - mutex_lock(&inode->i_mutex); + inode_lock(inode); xid = get_xid(); @@ -2292,7 +2292,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, } free_xid(xid); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return rc; } @@ -2309,7 +2309,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) rc = filemap_write_and_wait_range(inode->i_mapping, start, end); if (rc) return rc; - mutex_lock(&inode->i_mutex); + inode_lock(inode); xid = get_xid(); @@ -2326,7 +2326,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) } free_xid(xid); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return rc; } @@ -2672,7 +2672,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) * with a brlock that prevents writing. */ down_read(&cinode->lock_sem); - mutex_lock(&inode->i_mutex); + inode_lock(inode); rc = generic_write_checks(iocb, from); if (rc <= 0) @@ -2685,7 +2685,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) else rc = -EACCES; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (rc > 0) { ssize_t err = generic_write_sync(file, iocb->ki_pos - rc, rc); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a329f5ba35aa..aeb26dbfa1bf 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -814,8 +814,21 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, } } else fattr.cf_uniqueid = iunique(sb, ROOT_I); - } else - fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid; + } else { + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) && + validinum == false && server->ops->get_srv_inum) { + /* + * Pass a NULL tcon to ensure we don't make a round + * trip to the server. This only works for SMB2+. + */ + tmprc = server->ops->get_srv_inum(xid, + NULL, cifs_sb, full_path, + &fattr.cf_uniqueid, data); + if (tmprc) + fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid; + } else + fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid; + } /* query for SFU type info if supported and needed */ if (fattr.cf_cifsattrs & ATTR_SYSTEM && @@ -856,6 +869,13 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, } else { /* we already have inode, update it */ + /* if uniqueid is different, return error */ + if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM && + CIFS_I(*inode)->uniqueid != fattr.cf_uniqueid)) { + rc = -ESTALE; + goto cgii_exit; + } + /* if filetype is different, return error */ if (unlikely(((*inode)->i_mode & S_IFMT) != (fattr.cf_mode & S_IFMT))) { diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 8442b8b8e0be..813fe13c2ae1 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -310,7 +310,7 @@ check_smb_hdr(struct smb_hdr *smb) } int -checkSMB(char *buf, unsigned int total_read) +checkSMB(char *buf, unsigned int total_read, struct TCP_Server_Info *server) { struct smb_hdr *smb = (struct smb_hdr *)buf; __u32 rfclen = be32_to_cpu(smb->smb_buf_length); diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 0557c45e9c33..b30a4a6d98a0 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -847,6 +847,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) * if buggy server returns . and .. late do we want to * check for that here? */ + *tmp_buf = 0; rc = cifs_filldir(current_entry, file, ctx, tmp_buf, max_len); if (rc) { diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 1c5907019045..389fb9f8c84e 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -38,7 +38,7 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid) * Make sure that this really is an SMB, that it is a response, * and that the message ids match. */ - if ((*(__le32 *)hdr->ProtocolId == SMB2_PROTO_NUMBER) && + if ((hdr->ProtocolId == SMB2_PROTO_NUMBER) && (mid == wire_mid)) { if (hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR) return 0; @@ -50,9 +50,9 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid) cifs_dbg(VFS, "Received Request not response\n"); } } else { /* bad signature or mid */ - if (*(__le32 *)hdr->ProtocolId != SMB2_PROTO_NUMBER) + if (hdr->ProtocolId != SMB2_PROTO_NUMBER) cifs_dbg(VFS, "Bad protocol string signature header %x\n", - *(unsigned int *) hdr->ProtocolId); + le32_to_cpu(hdr->ProtocolId)); if (mid != wire_mid) cifs_dbg(VFS, "Mids do not match: %llu and %llu\n", mid, wire_mid); @@ -93,11 +93,11 @@ static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = { }; int -smb2_check_message(char *buf, unsigned int length) +smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr) { struct smb2_hdr *hdr = (struct smb2_hdr *)buf; struct smb2_pdu *pdu = (struct smb2_pdu *)hdr; - __u64 mid = le64_to_cpu(hdr->MessageId); + __u64 mid; __u32 len = get_rfc1002_length(buf); __u32 clc_len; /* calculated length */ int command; @@ -111,6 +111,30 @@ smb2_check_message(char *buf, unsigned int length) * ie Validate the wct via smb2_struct_sizes table above */ + if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) { + struct smb2_transform_hdr *thdr = + (struct smb2_transform_hdr *)buf; + struct cifs_ses *ses = NULL; + struct list_head *tmp; + + /* decrypt frame now that it is completely read in */ + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp, &srvr->smb_ses_list) { + ses = list_entry(tmp, struct cifs_ses, smb_ses_list); + if (ses->Suid == thdr->SessionId) + break; + + ses = NULL; + } + spin_unlock(&cifs_tcp_ses_lock); + if (ses == NULL) { + cifs_dbg(VFS, "no decryption - session id not found\n"); + return 1; + } + } + + + mid = le64_to_cpu(hdr->MessageId); if (length < sizeof(struct smb2_pdu)) { if ((length >= sizeof(struct smb2_hdr)) && (hdr->Status != 0)) { pdu->StructureSize2 = 0; @@ -322,7 +346,7 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) /* return pointer to beginning of data area, ie offset from SMB start */ if ((*off != 0) && (*len != 0)) - return (char *)(&hdr->ProtocolId[0]) + *off; + return (char *)(&hdr->ProtocolId) + *off; else return NULL; } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 53ccdde6ff18..3525ed756173 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -182,6 +182,11 @@ smb2_find_mid(struct TCP_Server_Info *server, char *buf) struct smb2_hdr *hdr = (struct smb2_hdr *)buf; __u64 wire_mid = le64_to_cpu(hdr->MessageId); + if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) { + cifs_dbg(VFS, "encrypted frame parsing not supported yet"); + return NULL; + } + spin_lock(&GlobalMid_Lock); list_for_each_entry(mid, &server->pending_mid_q, qhead) { if ((mid->mid == wire_mid) && @@ -1692,7 +1697,7 @@ struct smb_version_operations smb30_operations = { .get_lease_key = smb2_get_lease_key, .set_lease_key = smb2_set_lease_key, .new_lease_key = smb2_new_lease_key, - .generate_signingkey = generate_smb3signingkey, + .generate_signingkey = generate_smb30signingkey, .calc_signature = smb3_calc_signature, .set_integrity = smb3_set_integrity, .is_read_op = smb21_is_read_op, @@ -1779,7 +1784,7 @@ struct smb_version_operations smb311_operations = { .get_lease_key = smb2_get_lease_key, .set_lease_key = smb2_set_lease_key, .new_lease_key = smb2_new_lease_key, - .generate_signingkey = generate_smb3signingkey, + .generate_signingkey = generate_smb311signingkey, .calc_signature = smb3_calc_signature, .set_integrity = smb3_set_integrity, .is_read_op = smb21_is_read_op, @@ -1838,7 +1843,7 @@ struct smb_version_values smb21_values = { struct smb_version_values smb30_values = { .version_string = SMB30_VERSION_STRING, .protocol_id = SMB30_PROT_ID, - .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES, + .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, @@ -1858,7 +1863,7 @@ struct smb_version_values smb30_values = { struct smb_version_values smb302_values = { .version_string = SMB302_VERSION_STRING, .protocol_id = SMB302_PROT_ID, - .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES, + .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 767555518d40..10f8d5cf5681 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -97,10 +97,7 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ , hdr->smb2_buf_length = cpu_to_be32(parmsize + sizeof(struct smb2_hdr) - 4 /* RFC 1001 length field itself not counted */); - hdr->ProtocolId[0] = 0xFE; - hdr->ProtocolId[1] = 'S'; - hdr->ProtocolId[2] = 'M'; - hdr->ProtocolId[3] = 'B'; + hdr->ProtocolId = SMB2_PROTO_NUMBER; hdr->StructureSize = cpu_to_le16(64); hdr->Command = smb2_cmd; hdr->CreditRequest = cpu_to_le16(2); /* BB make this dynamic */ @@ -1573,7 +1570,8 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, goto ioctl_exit; } - memcpy(*out_data, rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset), + memcpy(*out_data, + (char *)&rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset), *plen); ioctl_exit: free_rsp_buf(resp_buftype, rsp); @@ -2093,7 +2091,7 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, } if (*buf) { - memcpy(*buf, (char *)rsp->hdr.ProtocolId + rsp->DataOffset, + memcpy(*buf, (char *)&rsp->hdr.ProtocolId + rsp->DataOffset, *nbytes); free_rsp_buf(resp_buftype, iov[0].iov_base); } else if (resp_buftype != CIFS_NO_BUFFER) { diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 4af52780ec35..ff88d9feb01e 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -86,6 +86,7 @@ #define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */ #define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) +#define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd) /* * SMB2 Header Definition @@ -102,7 +103,7 @@ struct smb2_hdr { __be32 smb2_buf_length; /* big endian on wire */ /* length is only two or three bytes - with one or two byte type preceding it that MBZ */ - __u8 ProtocolId[4]; /* 0xFE 'S' 'M' 'B' */ + __le32 ProtocolId; /* 0xFE 'S' 'M' 'B' */ __le16 StructureSize; /* 64 */ __le16 CreditCharge; /* MBZ */ __le32 Status; /* Error from server */ @@ -128,11 +129,10 @@ struct smb2_transform_hdr { one or two byte type preceding it that MBZ */ __u8 ProtocolId[4]; /* 0xFD 'S' 'M' 'B' */ __u8 Signature[16]; - __u8 Nonce[11]; - __u8 Reserved[5]; + __u8 Nonce[16]; __le32 OriginalMessageSize; __u16 Reserved1; - __le16 EncryptionAlgorithm; + __le16 Flags; /* EncryptionAlgorithm */ __u64 SessionId; } __packed; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 79dc650c18b2..4f07dc93608d 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -34,7 +34,8 @@ struct smb_rqst; ***************************************************************** */ extern int map_smb2_to_linux_error(char *buf, bool log_err); -extern int smb2_check_message(char *buf, unsigned int length); +extern int smb2_check_message(char *buf, unsigned int length, + struct TCP_Server_Info *server); extern unsigned int smb2_calc_size(void *buf); extern char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr); extern __le16 *cifs_convert_path_to_utf16(const char *from, diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index d4c5b6f109a7..8732a43b1008 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -222,8 +222,8 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) return rc; } -int -generate_smb3signingkey(struct cifs_ses *ses) +static int generate_key(struct cifs_ses *ses, struct kvec label, + struct kvec context, __u8 *key, unsigned int key_size) { unsigned char zero = 0x0; __u8 i[4] = {0, 0, 0, 1}; @@ -233,7 +233,7 @@ generate_smb3signingkey(struct cifs_ses *ses) unsigned char *hashptr = prfhash; memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE); - memset(ses->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE); + memset(key, 0x0, key_size); rc = smb3_crypto_shash_allocate(ses->server); if (rc) { @@ -262,7 +262,7 @@ generate_smb3signingkey(struct cifs_ses *ses) } rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, - "SMB2AESCMAC", 12); + label.iov_base, label.iov_len); if (rc) { cifs_dbg(VFS, "%s: Could not update with label\n", __func__); goto smb3signkey_ret; @@ -276,7 +276,7 @@ generate_smb3signingkey(struct cifs_ses *ses) } rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, - "SmbSign", 8); + context.iov_base, context.iov_len); if (rc) { cifs_dbg(VFS, "%s: Could not update with context\n", __func__); goto smb3signkey_ret; @@ -296,12 +296,102 @@ generate_smb3signingkey(struct cifs_ses *ses) goto smb3signkey_ret; } - memcpy(ses->smb3signingkey, hashptr, SMB3_SIGNKEY_SIZE); + memcpy(key, hashptr, key_size); smb3signkey_ret: return rc; } +struct derivation { + struct kvec label; + struct kvec context; +}; + +struct derivation_triplet { + struct derivation signing; + struct derivation encryption; + struct derivation decryption; +}; + +static int +generate_smb3signingkey(struct cifs_ses *ses, + const struct derivation_triplet *ptriplet) +{ + int rc; + + rc = generate_key(ses, ptriplet->signing.label, + ptriplet->signing.context, ses->smb3signingkey, + SMB3_SIGN_KEY_SIZE); + if (rc) + return rc; + + rc = generate_key(ses, ptriplet->encryption.label, + ptriplet->encryption.context, ses->smb3encryptionkey, + SMB3_SIGN_KEY_SIZE); + if (rc) + return rc; + + return generate_key(ses, ptriplet->decryption.label, + ptriplet->decryption.context, + ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE); +} + +int +generate_smb30signingkey(struct cifs_ses *ses) + +{ + struct derivation_triplet triplet; + struct derivation *d; + + d = &triplet.signing; + d->label.iov_base = "SMB2AESCMAC"; + d->label.iov_len = 12; + d->context.iov_base = "SmbSign"; + d->context.iov_len = 8; + + d = &triplet.encryption; + d->label.iov_base = "SMB2AESCCM"; + d->label.iov_len = 11; + d->context.iov_base = "ServerIn "; + d->context.iov_len = 10; + + d = &triplet.decryption; + d->label.iov_base = "SMB2AESCCM"; + d->label.iov_len = 11; + d->context.iov_base = "ServerOut"; + d->context.iov_len = 10; + + return generate_smb3signingkey(ses, &triplet); +} + +int +generate_smb311signingkey(struct cifs_ses *ses) + +{ + struct derivation_triplet triplet; + struct derivation *d; + + d = &triplet.signing; + d->label.iov_base = "SMB2AESCMAC"; + d->label.iov_len = 12; + d->context.iov_base = "SmbSign"; + d->context.iov_len = 8; + + d = &triplet.encryption; + d->label.iov_base = "SMB2AESCCM"; + d->label.iov_len = 11; + d->context.iov_base = "ServerIn "; + d->context.iov_len = 10; + + d = &triplet.decryption; + d->label.iov_base = "SMB2AESCCM"; + d->label.iov_len = 11; + d->context.iov_base = "ServerOut"; + d->context.iov_len = 10; + + return generate_smb3signingkey(ses, &triplet); +} + int smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) { diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 2a24c524fb9a..87abe8ed074c 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -576,14 +576,16 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, cifs_in_send_dec(server); cifs_save_when_sent(mid); - if (rc < 0) + if (rc < 0) { server->sequence_number -= 2; + cifs_delete_mid(mid); + } + mutex_unlock(&server->srv_mutex); if (rc == 0) return 0; - cifs_delete_mid(mid); add_credits_and_wake_if(server, credits, optype); return rc; } diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h index f829fe963f5b..5104d84c4f64 100644 --- a/fs/coda/coda_linux.h +++ b/fs/coda/coda_linux.h @@ -72,8 +72,7 @@ void coda_sysctl_clean(void); } while (0) -#define CODA_FREE(ptr,size) \ - do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0) +#define CODA_FREE(ptr, size) kvfree((ptr)) /* inode to cnode access functions */ diff --git a/fs/coda/dir.c b/fs/coda/dir.c index fda9f4311212..42e731b8c80a 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -427,13 +427,13 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx) if (host_file->f_op->iterate) { struct inode *host_inode = file_inode(host_file); - mutex_lock(&host_inode->i_mutex); + inode_lock(host_inode); ret = -ENOENT; if (!IS_DEADDIR(host_inode)) { ret = host_file->f_op->iterate(host_file, ctx); file_accessed(host_file); } - mutex_unlock(&host_inode->i_mutex); + inode_unlock(host_inode); return ret; } /* Venus: we must read Venus dirents from a file */ diff --git a/fs/coda/file.c b/fs/coda/file.c index 1da3805f3ddc..f47c7483863b 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -71,12 +71,12 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) host_file = cfi->cfi_container; file_start_write(host_file); - mutex_lock(&coda_inode->i_mutex); + inode_lock(coda_inode); ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos); coda_inode->i_size = file_inode(host_file)->i_size; coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; coda_inode->i_mtime = coda_inode->i_ctime = CURRENT_TIME_SEC; - mutex_unlock(&coda_inode->i_mutex); + inode_unlock(coda_inode); file_end_write(host_file); return ret; } @@ -203,7 +203,7 @@ int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync) err = filemap_write_and_wait_range(coda_inode->i_mapping, start, end); if (err) return err; - mutex_lock(&coda_inode->i_mutex); + inode_lock(coda_inode); cfi = CODA_FTOC(coda_file); BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); @@ -212,7 +212,7 @@ int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync) err = vfs_fsync(host_file, datasync); if (!err && !datasync) err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); - mutex_unlock(&coda_inode->i_mutex); + inode_unlock(coda_inode); return err; } diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 7ae97e83f121..f419519ec41f 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -640,13 +640,13 @@ static void detach_groups(struct config_group *group) child = sd->s_dentry; - mutex_lock(&d_inode(child)->i_mutex); + inode_lock(d_inode(child)); configfs_detach_group(sd->s_element); d_inode(child)->i_flags |= S_DEAD; dont_mount(child); - mutex_unlock(&d_inode(child)->i_mutex); + inode_unlock(d_inode(child)); d_delete(child); dput(child); @@ -834,11 +834,11 @@ static int configfs_attach_item(struct config_item *parent_item, * the VFS may already have hit and used them. Thus, * we must lock them as rmdir() would. */ - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); configfs_remove_dir(item); d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); d_delete(dentry); } } @@ -874,7 +874,7 @@ static int configfs_attach_group(struct config_item *parent_item, * We must also lock the inode to remove it safely in case of * error, as rmdir() would. */ - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); configfs_adjust_dir_dirent_depth_before_populate(sd); ret = populate_groups(to_config_group(item)); if (ret) { @@ -883,7 +883,7 @@ static int configfs_attach_group(struct config_item *parent_item, dont_mount(dentry); } configfs_adjust_dir_dirent_depth_after_populate(sd); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); if (ret) d_delete(dentry); } @@ -1070,11 +1070,55 @@ out: return ret; } +static int configfs_do_depend_item(struct dentry *subsys_dentry, + struct config_item *target) +{ + struct configfs_dirent *p; + int ret; + + spin_lock(&configfs_dirent_lock); + /* Scan the tree, return 0 if found */ + ret = configfs_depend_prep(subsys_dentry, target); + if (ret) + goto out_unlock_dirent_lock; + + /* + * We are sure that the item is not about to be removed by rmdir(), and + * not in the middle of attachment by mkdir(). + */ + p = target->ci_dentry->d_fsdata; + p->s_dependent_count += 1; + +out_unlock_dirent_lock: + spin_unlock(&configfs_dirent_lock); + + return ret; +} + +static inline struct configfs_dirent * +configfs_find_subsys_dentry(struct configfs_dirent *root_sd, + struct config_item *subsys_item) +{ + struct configfs_dirent *p; + struct configfs_dirent *ret = NULL; + + list_for_each_entry(p, &root_sd->s_children, s_sibling) { + if (p->s_type & CONFIGFS_DIR && + p->s_element == subsys_item) { + ret = p; + break; + } + } + + return ret; +} + + int configfs_depend_item(struct configfs_subsystem *subsys, struct config_item *target) { int ret; - struct configfs_dirent *p, *root_sd, *subsys_sd = NULL; + struct configfs_dirent *subsys_sd; struct config_item *s_item = &subsys->su_group.cg_item; struct dentry *root; @@ -1091,43 +1135,19 @@ int configfs_depend_item(struct configfs_subsystem *subsys, * subsystem is really registered, and so we need to lock out * configfs_[un]register_subsystem(). */ - mutex_lock(&d_inode(root)->i_mutex); - - root_sd = root->d_fsdata; - - list_for_each_entry(p, &root_sd->s_children, s_sibling) { - if (p->s_type & CONFIGFS_DIR) { - if (p->s_element == s_item) { - subsys_sd = p; - break; - } - } - } + inode_lock(d_inode(root)); + subsys_sd = configfs_find_subsys_dentry(root->d_fsdata, s_item); if (!subsys_sd) { ret = -ENOENT; goto out_unlock_fs; } /* Ok, now we can trust subsys/s_item */ + ret = configfs_do_depend_item(subsys_sd->s_dentry, target); - spin_lock(&configfs_dirent_lock); - /* Scan the tree, return 0 if found */ - ret = configfs_depend_prep(subsys_sd->s_dentry, target); - if (ret) - goto out_unlock_dirent_lock; - - /* - * We are sure that the item is not about to be removed by rmdir(), and - * not in the middle of attachment by mkdir(). - */ - p = target->ci_dentry->d_fsdata; - p->s_dependent_count += 1; - -out_unlock_dirent_lock: - spin_unlock(&configfs_dirent_lock); out_unlock_fs: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); /* * If we succeeded, the fs is pinned via other methods. If not, @@ -1144,8 +1164,7 @@ EXPORT_SYMBOL(configfs_depend_item); * configfs_depend_item() because we know that that the client driver is * pinned, thus the subsystem is pinned, and therefore configfs is pinned. */ -void configfs_undepend_item(struct configfs_subsystem *subsys, - struct config_item *target) +void configfs_undepend_item(struct config_item *target) { struct configfs_dirent *sd; @@ -1168,6 +1187,79 @@ void configfs_undepend_item(struct configfs_subsystem *subsys, } EXPORT_SYMBOL(configfs_undepend_item); +/* + * caller_subsys is a caller's subsystem not target's. This is used to + * determine if we should lock root and check subsys or not. When we are + * in the same subsystem as our target there is no need to do locking as + * we know that subsys is valid and is not unregistered during this function + * as we are called from callback of one of his children and VFS holds a lock + * on some inode. Otherwise we have to lock our root to ensure that target's + * subsystem it is not unregistered during this function. + */ +int configfs_depend_item_unlocked(struct configfs_subsystem *caller_subsys, + struct config_item *target) +{ + struct configfs_subsystem *target_subsys; + struct config_group *root, *parent; + struct configfs_dirent *subsys_sd; + int ret = -ENOENT; + + /* Disallow this function for configfs root */ + if (configfs_is_root(target)) + return -EINVAL; + + parent = target->ci_group; + /* + * This may happen when someone is trying to depend root + * directory of some subsystem + */ + if (configfs_is_root(&parent->cg_item)) { + target_subsys = to_configfs_subsystem(to_config_group(target)); + root = parent; + } else { + target_subsys = parent->cg_subsys; + /* Find a cofnigfs root as we may need it for locking */ + for (root = parent; !configfs_is_root(&root->cg_item); + root = root->cg_item.ci_group) + ; + } + + if (target_subsys != caller_subsys) { + /* + * We are in other configfs subsystem, so we have to do + * additional locking to prevent other subsystem from being + * unregistered + */ + inode_lock(d_inode(root->cg_item.ci_dentry)); + + /* + * As we are trying to depend item from other subsystem + * we have to check if this subsystem is still registered + */ + subsys_sd = configfs_find_subsys_dentry( + root->cg_item.ci_dentry->d_fsdata, + &target_subsys->su_group.cg_item); + if (!subsys_sd) + goto out_root_unlock; + } else { + subsys_sd = target_subsys->su_group.cg_item.ci_dentry->d_fsdata; + } + + /* Now we can execute core of depend item */ + ret = configfs_do_depend_item(subsys_sd->s_dentry, target); + + if (target_subsys != caller_subsys) +out_root_unlock: + /* + * We were called from subsystem other than our target so we + * took some locks so now it's time to release them + */ + inode_unlock(d_inode(root->cg_item.ci_dentry)); + + return ret; +} +EXPORT_SYMBOL(configfs_depend_item_unlocked); + static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { int ret = 0; @@ -1469,7 +1561,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name) down_write(&configfs_rename_sem); parent = item->parent->dentry; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); new_dentry = lookup_one_len(new_name, parent, strlen(new_name)); if (!IS_ERR(new_dentry)) { @@ -1485,7 +1577,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name) error = -EEXIST; dput(new_dentry); } - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); up_write(&configfs_rename_sem); return error; @@ -1498,7 +1590,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file) struct configfs_dirent * parent_sd = dentry->d_fsdata; int err; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); /* * Fake invisibility if dir belongs to a group/default groups hierarchy * being attached @@ -1511,7 +1603,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file) else err = 0; } - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return err; } @@ -1521,11 +1613,11 @@ static int configfs_dir_close(struct inode *inode, struct file *file) struct dentry * dentry = file->f_path.dentry; struct configfs_dirent * cursor = file->private_data; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); spin_lock(&configfs_dirent_lock); list_del_init(&cursor->s_sibling); spin_unlock(&configfs_dirent_lock); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); release_configfs_dirent(cursor); @@ -1606,7 +1698,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry * dentry = file->f_path.dentry; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); switch (whence) { case 1: offset += file->f_pos; @@ -1614,7 +1706,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) if (offset >= 0) break; default: - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return -EINVAL; } if (offset != file->f_pos) { @@ -1640,7 +1732,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) spin_unlock(&configfs_dirent_lock); } } - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return offset; } @@ -1675,14 +1767,14 @@ int configfs_register_group(struct config_group *parent_group, parent = parent_group->cg_item.ci_dentry; - mutex_lock_nested(&d_inode(parent)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); ret = create_default_group(parent_group, group); if (!ret) { spin_lock(&configfs_dirent_lock); configfs_dir_set_ready(group->cg_item.ci_dentry->d_fsdata); spin_unlock(&configfs_dirent_lock); } - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); return ret; } EXPORT_SYMBOL(configfs_register_group); @@ -1699,7 +1791,7 @@ void configfs_unregister_group(struct config_group *group) struct dentry *dentry = group->cg_item.ci_dentry; struct dentry *parent = group->cg_item.ci_parent->ci_dentry; - mutex_lock_nested(&d_inode(parent)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); spin_lock(&configfs_dirent_lock); configfs_detach_prep(dentry, NULL); spin_unlock(&configfs_dirent_lock); @@ -1708,7 +1800,7 @@ void configfs_unregister_group(struct config_group *group) d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); d_delete(dentry); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); dput(dentry); @@ -1780,7 +1872,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) sd = root->d_fsdata; link_group(to_config_group(sd->s_element), group); - mutex_lock_nested(&d_inode(root)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(root), I_MUTEX_PARENT); err = -ENOMEM; dentry = d_alloc_name(root, group->cg_item.ci_name); @@ -1800,7 +1892,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) } } - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); if (err) { unlink_group(group); @@ -1821,9 +1913,9 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) return; } - mutex_lock_nested(&d_inode(root)->i_mutex, + inode_lock_nested(d_inode(root), I_MUTEX_PARENT); - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); mutex_lock(&configfs_symlink_mutex); spin_lock(&configfs_dirent_lock); if (configfs_detach_prep(dentry, NULL)) { @@ -1834,11 +1926,11 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) configfs_detach_group(&group->cg_item); d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); d_delete(dentry); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(dentry); diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 3687187c8ea5..33b7ee34eda5 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -540,10 +540,10 @@ int configfs_create_file(struct config_item * item, const struct configfs_attrib umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG; int error = 0; - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_NORMAL); + inode_lock_nested(d_inode(dir), I_MUTEX_NORMAL); error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, CONFIGFS_ITEM_ATTR); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); return error; } @@ -562,10 +562,10 @@ int configfs_create_bin_file(struct config_item *item, umode_t mode = (bin_attr->cb_attr.ca_mode & S_IALLUGO) | S_IFREG; int error = 0; - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL); + inode_lock_nested(dir->d_inode, I_MUTEX_NORMAL); error = configfs_make_dirent(parent_sd, NULL, (void *) bin_attr, mode, CONFIGFS_ITEM_BIN_ATTR); - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); return error; } diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 0cc810e9dccc..cee087d8f7e0 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -255,7 +255,7 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name) /* no inode means this hasn't been made visible yet */ return; - mutex_lock(&d_inode(dir)->i_mutex); + inode_lock(d_inode(dir)); list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { if (!sd->s_element) continue; @@ -268,5 +268,5 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name) break; } } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); } diff --git a/fs/coredump.c b/fs/coredump.c index b3c153ca435d..9ea87e9fdccf 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -118,6 +118,26 @@ int cn_esc_printf(struct core_name *cn, const char *fmt, ...) ret = cn_vprintf(cn, fmt, arg); va_end(arg); + if (ret == 0) { + /* + * Ensure that this coredump name component can't cause the + * resulting corefile path to consist of a ".." or ".". + */ + if ((cn->used - cur == 1 && cn->corename[cur] == '.') || + (cn->used - cur == 2 && cn->corename[cur] == '.' + && cn->corename[cur+1] == '.')) + cn->corename[cur] = '!'; + + /* + * Empty names are fishy and could be used to create a "//" in a + * corefile name, causing the coredump to happen one directory + * level too high. Enforce that all components of the core + * pattern are at least one character long. + */ + if (cn->used == cur) + ret = cn_printf(cn, "!"); + } + for (; cur < cn->used; ++cur) { if (cn->corename[cur] == '/') cn->corename[cur] = '!'; @@ -24,6 +24,7 @@ #include <linux/memcontrol.h> #include <linux/mm.h> #include <linux/mutex.h> +#include <linux/pagevec.h> #include <linux/pmem.h> #include <linux/sched.h> #include <linux/uio.h> @@ -57,6 +58,26 @@ static void dax_unmap_atomic(struct block_device *bdev, blk_queue_exit(bdev->bd_queue); } +struct page *read_dax_sector(struct block_device *bdev, sector_t n) +{ + struct page *page = alloc_pages(GFP_KERNEL, 0); + struct blk_dax_ctl dax = { + .size = PAGE_SIZE, + .sector = n & ~((((int) PAGE_SIZE) / 512) - 1), + }; + long rc; + + if (!page) + return ERR_PTR(-ENOMEM); + + rc = dax_map_atomic(bdev, &dax); + if (rc < 0) + return ERR_PTR(rc); + memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE); + dax_unmap_atomic(bdev, &dax); + return page; +} + /* * dax_clear_blocks() is called from within transaction context from XFS, * and hence this means the stack from this point must follow GFP_NOFS @@ -245,13 +266,14 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, loff_t end = pos + iov_iter_count(iter); memset(&bh, 0, sizeof(bh)); + bh.b_bdev = inode->i_sb->s_bdev; if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) { struct address_space *mapping = inode->i_mapping; - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = filemap_write_and_wait_range(mapping, pos, end - 1); if (retval) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } } @@ -263,7 +285,7 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, retval = dax_io(inode, iter, pos, end, get_block, &bh); if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if ((retval > 0) && end_io) end_io(iocb, pos, retval, bh.b_private); @@ -324,6 +346,199 @@ static int copy_user_bh(struct page *to, struct inode *inode, return 0; } +#define NO_SECTOR -1 +#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_CACHE_SHIFT)) + +static int dax_radix_entry(struct address_space *mapping, pgoff_t index, + sector_t sector, bool pmd_entry, bool dirty) +{ + struct radix_tree_root *page_tree = &mapping->page_tree; + pgoff_t pmd_index = DAX_PMD_INDEX(index); + int type, error = 0; + void *entry; + + WARN_ON_ONCE(pmd_entry && !dirty); + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + + spin_lock_irq(&mapping->tree_lock); + + entry = radix_tree_lookup(page_tree, pmd_index); + if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) { + index = pmd_index; + goto dirty; + } + + entry = radix_tree_lookup(page_tree, index); + if (entry) { + type = RADIX_DAX_TYPE(entry); + if (WARN_ON_ONCE(type != RADIX_DAX_PTE && + type != RADIX_DAX_PMD)) { + error = -EIO; + goto unlock; + } + + if (!pmd_entry || type == RADIX_DAX_PMD) + goto dirty; + + /* + * We only insert dirty PMD entries into the radix tree. This + * means we don't need to worry about removing a dirty PTE + * entry and inserting a clean PMD entry, thus reducing the + * range we would flush with a follow-up fsync/msync call. + */ + radix_tree_delete(&mapping->page_tree, index); + mapping->nrexceptional--; + } + + if (sector == NO_SECTOR) { + /* + * This can happen during correct operation if our pfn_mkwrite + * fault raced against a hole punch operation. If this + * happens the pte that was hole punched will have been + * unmapped and the radix tree entry will have been removed by + * the time we are called, but the call will still happen. We + * will return all the way up to wp_pfn_shared(), where the + * pte_same() check will fail, eventually causing page fault + * to be retried by the CPU. + */ + goto unlock; + } + + error = radix_tree_insert(page_tree, index, + RADIX_DAX_ENTRY(sector, pmd_entry)); + if (error) + goto unlock; + + mapping->nrexceptional++; + dirty: + if (dirty) + radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); + unlock: + spin_unlock_irq(&mapping->tree_lock); + return error; +} + +static int dax_writeback_one(struct block_device *bdev, + struct address_space *mapping, pgoff_t index, void *entry) +{ + struct radix_tree_root *page_tree = &mapping->page_tree; + int type = RADIX_DAX_TYPE(entry); + struct radix_tree_node *node; + struct blk_dax_ctl dax; + void **slot; + int ret = 0; + + spin_lock_irq(&mapping->tree_lock); + /* + * Regular page slots are stabilized by the page lock even + * without the tree itself locked. These unlocked entries + * need verification under the tree lock. + */ + if (!__radix_tree_lookup(page_tree, index, &node, &slot)) + goto unlock; + if (*slot != entry) + goto unlock; + + /* another fsync thread may have already written back this entry */ + if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) + goto unlock; + + if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) { + ret = -EIO; + goto unlock; + } + + dax.sector = RADIX_DAX_SECTOR(entry); + dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE); + spin_unlock_irq(&mapping->tree_lock); + + /* + * We cannot hold tree_lock while calling dax_map_atomic() because it + * eventually calls cond_resched(). + */ + ret = dax_map_atomic(bdev, &dax); + if (ret < 0) + return ret; + + if (WARN_ON_ONCE(ret < dax.size)) { + ret = -EIO; + goto unmap; + } + + wb_cache_pmem(dax.addr, dax.size); + + spin_lock_irq(&mapping->tree_lock); + radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); + spin_unlock_irq(&mapping->tree_lock); + unmap: + dax_unmap_atomic(bdev, &dax); + return ret; + + unlock: + spin_unlock_irq(&mapping->tree_lock); + return ret; +} + +/* + * Flush the mapping to the persistent domain within the byte range of [start, + * end]. This is required by data integrity operations to ensure file data is + * on persistent storage prior to completion of the operation. + */ +int dax_writeback_mapping_range(struct address_space *mapping, loff_t start, + loff_t end) +{ + struct inode *inode = mapping->host; + struct block_device *bdev = inode->i_sb->s_bdev; + pgoff_t start_index, end_index, pmd_index; + pgoff_t indices[PAGEVEC_SIZE]; + struct pagevec pvec; + bool done = false; + int i, ret = 0; + void *entry; + + if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) + return -EIO; + + start_index = start >> PAGE_CACHE_SHIFT; + end_index = end >> PAGE_CACHE_SHIFT; + pmd_index = DAX_PMD_INDEX(start_index); + + rcu_read_lock(); + entry = radix_tree_lookup(&mapping->page_tree, pmd_index); + rcu_read_unlock(); + + /* see if the start of our range is covered by a PMD entry */ + if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) + start_index = pmd_index; + + tag_pages_for_writeback(mapping, start_index, end_index); + + pagevec_init(&pvec, 0); + while (!done) { + pvec.nr = find_get_entries_tag(mapping, start_index, + PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, + pvec.pages, indices); + + if (pvec.nr == 0) + break; + + for (i = 0; i < pvec.nr; i++) { + if (indices[i] > end_index) { + done = true; + break; + } + + ret = dax_writeback_one(bdev, mapping, indices[i], + pvec.pages[i]); + if (ret < 0) + return ret; + } + } + wmb_pmem(); + return 0; +} +EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); + static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -363,6 +578,11 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, } dax_unmap_atomic(bdev, &dax); + error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false, + vmf->flags & FAULT_FLAG_WRITE); + if (error) + goto out; + error = vm_insert_mixed(vma, vaddr, dax.pfn); out: @@ -408,6 +628,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, memset(&bh, 0, sizeof(bh)); block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); + bh.b_bdev = inode->i_sb->s_bdev; bh.b_size = PAGE_SIZE; repeat: @@ -487,6 +708,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, delete_from_page_cache(page); unlock_page(page); page_cache_release(page); + page = NULL; } /* @@ -590,7 +812,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, struct block_device *bdev; pgoff_t size, pgoff; sector_t block; - int result = 0; + int error, result = 0; + bool alloc = false; /* dax pmd mappings require pfn_t_devmap() */ if (!IS_ENABLED(CONFIG_FS_DAX_PMD)) @@ -624,13 +847,21 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, } memset(&bh, 0, sizeof(bh)); + bh.b_bdev = inode->i_sb->s_bdev; block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); bh.b_size = PMD_SIZE; - if (get_block(inode, block, &bh, write) != 0) + + if (get_block(inode, block, &bh, 0) != 0) return VM_FAULT_SIGBUS; + + if (!buffer_mapped(&bh) && write) { + if (get_block(inode, block, &bh, 1) != 0) + return VM_FAULT_SIGBUS; + alloc = true; + } + bdev = bh.b_bdev; - i_mmap_lock_read(mapping); /* * If the filesystem isn't willing to tell us the length of a hole, @@ -639,19 +870,22 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, */ if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) { dax_pmd_dbg(&bh, address, "allocated block too small"); - goto fallback; + return VM_FAULT_FALLBACK; } /* * If we allocated new storage, make sure no process has any * zero pages covering this hole */ - if (buffer_new(&bh)) { - i_mmap_unlock_read(mapping); - unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0); - i_mmap_lock_read(mapping); + if (alloc) { + loff_t lstart = pgoff << PAGE_SHIFT; + loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */ + + truncate_pagecache_range(inode, lstart, lend); } + i_mmap_lock_read(mapping); + /* * If a truncate happened while we were allocating blocks, we may * leave blocks allocated to the file that are beyond EOF. We can't @@ -664,7 +898,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, goto out; } if ((pgoff | PG_PMD_COLOUR) >= size) { - dax_pmd_dbg(&bh, address, "pgoff unaligned"); + dax_pmd_dbg(&bh, address, + "offset + huge page size > file size"); goto fallback; } @@ -732,6 +967,31 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, } dax_unmap_atomic(bdev, &dax); + /* + * For PTE faults we insert a radix tree entry for reads, and + * leave it clean. Then on the first write we dirty the radix + * tree entry via the dax_pfn_mkwrite() path. This sequence + * allows the dax_pfn_mkwrite() call to be simpler and avoid a + * call into get_block() to translate the pgoff to a sector in + * order to be able to create a new radix tree entry. + * + * The PMD path doesn't have an equivalent to + * dax_pfn_mkwrite(), though, so for a read followed by a + * write we traverse all the way through __dax_pmd_fault() + * twice. This means we can just skip inserting a radix tree + * entry completely on the initial read and just wait until + * the write to insert a dirty entry. + */ + if (write) { + error = dax_radix_entry(mapping, pgoff, dax.sector, + true, true); + if (error) { + dax_pmd_dbg(&bh, address, + "PMD radix insertion failed"); + goto fallback; + } + } + dev_dbg(part_to_dev(bdev->bd_part), "%s: %s addr: %lx pfn: %lx sect: %llx\n", __func__, current->comm, address, @@ -790,15 +1050,20 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault); * dax_pfn_mkwrite - handle first write to DAX page * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault - * */ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - struct super_block *sb = file_inode(vma->vm_file)->i_sb; + struct file *file = vma->vm_file; - sb_start_pagefault(sb); - file_update_time(vma->vm_file); - sb_end_pagefault(sb); + /* + * We pass NO_SECTOR to dax_radix_entry() because we expect that a + * RADIX_DAX_PTE entry already exists in the radix tree from a + * previous call to __dax_fault(). We just want to look up that PTE + * entry using vmf->pgoff and make sure the dirty tag is set. This + * saves us from having to make a call to get_block() here to look + * up the sector. + */ + dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); @@ -835,6 +1100,7 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, BUG_ON((offset + length) > PAGE_CACHE_SIZE); memset(&bh, 0, sizeof(bh)); + bh.b_bdev = inode->i_sb->s_bdev; bh.b_size = PAGE_CACHE_SIZE; err = get_block(inode, index, &bh, 0); if (err < 0) diff --git a/fs/dcache.c b/fs/dcache.c index b4539e84e577..92d5140de851 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2462,7 +2462,7 @@ EXPORT_SYMBOL(d_rehash); */ void dentry_update_name_case(struct dentry *dentry, struct qstr *name) { - BUG_ON(!mutex_is_locked(&dentry->d_parent->d_inode->i_mutex)); + BUG_ON(!inode_is_locked(dentry->d_parent->d_inode)); BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */ spin_lock(&dentry->d_lock); @@ -2738,7 +2738,7 @@ static int __d_unalias(struct inode *inode, if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex)) goto out_err; m1 = &dentry->d_sb->s_vfs_rename_mutex; - if (!mutex_trylock(&alias->d_parent->d_inode->i_mutex)) + if (!inode_trylock(alias->d_parent->d_inode)) goto out_err; m2 = &alias->d_parent->d_inode->i_mutex; out_unalias: diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index b7fcc0de0b2f..bece948b363d 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -265,7 +265,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) if (!parent) parent = debugfs_mount->mnt_root; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry) && d_really_is_positive(dentry)) { dput(dentry); @@ -273,7 +273,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) } if (IS_ERR(dentry)) { - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); simple_release_fs(&debugfs_mount, &debugfs_mount_count); } @@ -282,7 +282,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) static struct dentry *failed_creating(struct dentry *dentry) { - mutex_unlock(&d_inode(dentry->d_parent)->i_mutex); + inode_unlock(d_inode(dentry->d_parent)); dput(dentry); simple_release_fs(&debugfs_mount, &debugfs_mount_count); return NULL; @@ -290,7 +290,7 @@ static struct dentry *failed_creating(struct dentry *dentry) static struct dentry *end_creating(struct dentry *dentry) { - mutex_unlock(&d_inode(dentry->d_parent)->i_mutex); + inode_unlock(d_inode(dentry->d_parent)); return dentry; } @@ -560,9 +560,9 @@ void debugfs_remove(struct dentry *dentry) if (!parent || d_really_is_negative(parent)) return; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); ret = __debugfs_remove(dentry, parent); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); if (!ret) simple_release_fs(&debugfs_mount, &debugfs_mount_count); } @@ -594,7 +594,7 @@ void debugfs_remove_recursive(struct dentry *dentry) parent = dentry; down: - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); loop: /* * The parent->d_subdirs is protected by the d_lock. Outside that @@ -609,7 +609,7 @@ void debugfs_remove_recursive(struct dentry *dentry) /* perhaps simple_empty(child) makes more sense */ if (!list_empty(&child->d_subdirs)) { spin_unlock(&parent->d_lock); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); parent = child; goto down; } @@ -630,10 +630,10 @@ void debugfs_remove_recursive(struct dentry *dentry) } spin_unlock(&parent->d_lock); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); child = parent; parent = parent->d_parent; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); if (child != dentry) /* go up */ @@ -641,7 +641,7 @@ void debugfs_remove_recursive(struct dentry *dentry) if (!__debugfs_remove(child, parent)) simple_release_fs(&debugfs_mount, &debugfs_mount_count); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); } EXPORT_SYMBOL_GPL(debugfs_remove_recursive); diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index c35ffdc12bba..1f107fd51328 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -255,7 +255,7 @@ static int mknod_ptmx(struct super_block *sb) if (!uid_valid(root_uid) || !gid_valid(root_gid)) return -EINVAL; - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); /* If we have already created ptmx node, return */ if (fsi->ptmx_dentry) { @@ -292,7 +292,7 @@ static int mknod_ptmx(struct super_block *sb) fsi->ptmx_dentry = dentry; rc = 0; out: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); return rc; } @@ -615,7 +615,7 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, sprintf(s, "%d", index); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); dentry = d_alloc_name(root, s); if (dentry) { @@ -626,7 +626,7 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, inode = ERR_PTR(-ENOMEM); } - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); return inode; } @@ -671,7 +671,7 @@ void devpts_pty_kill(struct inode *inode) BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); dentry = d_find_alias(inode); @@ -680,7 +680,7 @@ void devpts_pty_kill(struct inode *inode) dput(dentry); /* d_alloc_name() in devpts_pty_new() */ dput(dentry); /* d_find_alias above */ - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); } static int __init init_devpts_fs(void) diff --git a/fs/direct-io.c b/fs/direct-io.c index 602e8441bc0f..1b2f7ffc8b84 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1157,12 +1157,12 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, iocb->ki_filp->f_mapping; /* will be released by direct_io_worker */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = filemap_write_and_wait_range(mapping, offset, end - 1); if (retval) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); kmem_cache_free(dio_cache, dio); goto out; } @@ -1173,7 +1173,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, dio->i_size = i_size_read(inode); if (iov_iter_rw(iter) == READ && offset >= dio->i_size) { if (dio->flags & DIO_LOCKING) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); kmem_cache_free(dio_cache, dio); retval = 0; goto out; @@ -1295,7 +1295,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, * of protecting us from looking up uninitialized blocks. */ if (iov_iter_rw(iter) == READ && (dio->flags & DIO_LOCKING)) - mutex_unlock(&dio->inode->i_mutex); + inode_unlock(dio->inode); /* * The only time we want to leave bios in flight is when a successful diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 1925d6d222b8..58c2f4a21b7f 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -516,7 +516,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, return -EINVAL; kbuf = memdup_user_nul(buf, count); - if (!IS_ERR(kbuf)) + if (IS_ERR(kbuf)) return PTR_ERR(kbuf); if (check_version(kbuf)) { diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 040aa879d634..4e685ac1024d 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -41,13 +41,13 @@ static struct dentry *lock_parent(struct dentry *dentry) struct dentry *dir; dir = dget_parent(dentry); - mutex_lock_nested(&(d_inode(dir)->i_mutex), I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); return dir; } static void unlock_dir(struct dentry *dir) { - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(dir); } @@ -397,11 +397,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, int rc = 0; lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); - mutex_lock(&d_inode(lower_dir_dentry)->i_mutex); + inode_lock(d_inode(lower_dir_dentry)); lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, lower_dir_dentry, ecryptfs_dentry->d_name.len); - mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex); + inode_unlock(d_inode(lower_dir_dentry)); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " @@ -426,11 +426,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, "filename; rc = [%d]\n", __func__, rc); goto out; } - mutex_lock(&d_inode(lower_dir_dentry)->i_mutex); + inode_lock(d_inode(lower_dir_dentry)); lower_dentry = lookup_one_len(encrypted_and_encoded_name, lower_dir_dentry, encrypted_and_encoded_name_size); - mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex); + inode_unlock(d_inode(lower_dir_dentry)); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " @@ -869,9 +869,9 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) if (!rc && lower_ia.ia_valid & ATTR_SIZE) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = notify_change(lower_dentry, &lower_ia, NULL); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); } return rc; } @@ -970,9 +970,9 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) lower_ia.ia_valid &= ~ATTR_MODE; - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = notify_change(lower_dentry, &lower_ia, NULL); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); out: fsstack_copy_attr_all(inode, lower_inode); return rc; @@ -1048,10 +1048,10 @@ ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name, rc = -EOPNOTSUPP; goto out; } - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = d_inode(lower_dentry)->i_op->getxattr(lower_dentry, name, value, size); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); out: return rc; } @@ -1075,9 +1075,9 @@ ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size) rc = -EOPNOTSUPP; goto out; } - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = d_inode(lower_dentry)->i_op->listxattr(lower_dentry, list, size); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); out: return rc; } @@ -1092,9 +1092,9 @@ static int ecryptfs_removexattr(struct dentry *dentry, const char *name) rc = -EOPNOTSUPP; goto out; } - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = d_inode(lower_dentry)->i_op->removexattr(lower_dentry, name); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); out: return rc; } diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index caba848ac763..c6ced4cbf0cf 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -436,7 +436,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) rc = -ENOMEM; goto out; } - mutex_lock(&lower_inode->i_mutex); + inode_lock(lower_inode); size = lower_inode->i_op->getxattr(lower_dentry, ECRYPTFS_XATTR_NAME, xattr_virt, PAGE_CACHE_SIZE); if (size < 0) @@ -444,7 +444,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt); rc = lower_inode->i_op->setxattr(lower_dentry, ECRYPTFS_XATTR_NAME, xattr_virt, size, 0); - mutex_unlock(&lower_inode->i_mutex); + inode_unlock(lower_inode); if (rc) printk(KERN_ERR "Error whilst attempting to write inode size " "to lower file xattr; rc = [%d]\n", rc); diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index 90001da9abfd..c424e4813ec8 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -50,9 +50,9 @@ static ssize_t efivarfs_file_write(struct file *file, d_delete(file->f_path.dentry); dput(file->f_path.dentry); } else { - mutex_lock(&inode->i_mutex); + inode_lock(inode); i_size_write(inode, datasize + sizeof(attributes)); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } bytes = count; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 86a2121828c3..b8a564f29107 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -160,10 +160,10 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, efivar_entry_size(entry, &size); efivar_entry_add(entry, &efivarfs_list); - mutex_lock(&inode->i_mutex); + inode_lock(inode); inode->i_private = entry; i_size_write(inode, size + sizeof(entry->var.Attributes)); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); d_add(dentry, inode); return 0; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 1e009cad8d5c..ae1dbcf47e97 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -92,7 +92,7 @@ */ /* Epoll private bits inside the event mask */ -#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET) +#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE) /* Maximum number of nesting allowed inside epoll sets */ #define EP_MAX_NESTS 4 @@ -1002,6 +1002,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k unsigned long flags; struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; + int ewake = 0; if ((unsigned long)key & POLLFREE) { ep_pwq_from_wait(wait)->whead = NULL; @@ -1066,8 +1067,10 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k * Wake up ( if active ) both the eventpoll wait list and the ->poll() * wait list. */ - if (waitqueue_active(&ep->wq)) + if (waitqueue_active(&ep->wq)) { + ewake = 1; wake_up_locked(&ep->wq); + } if (waitqueue_active(&ep->poll_wait)) pwake++; @@ -1078,6 +1081,9 @@ out_unlock: if (pwake) ep_poll_safewake(&ep->poll_wait); + if (epi->event.events & EPOLLEXCLUSIVE) + return ewake; + return 1; } @@ -1095,7 +1101,10 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); pwq->whead = whead; pwq->base = epi; - add_wait_queue(whead, &pwq->wait); + if (epi->event.events & EPOLLEXCLUSIVE) + add_wait_queue_exclusive(whead, &pwq->wait); + else + add_wait_queue(whead, &pwq->wait); list_add_tail(&pwq->llink, &epi->pwqlist); epi->nwait++; } else { @@ -1862,6 +1871,15 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, goto error_tgt_fput; /* + * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only, + * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation. + * Also, we do not currently supported nested exclusive wakeups. + */ + if ((epds.events & EPOLLEXCLUSIVE) && (op == EPOLL_CTL_MOD || + (op == EPOLL_CTL_ADD && is_file_epoll(tf.file)))) + goto error_tgt_fput; + + /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ diff --git a/fs/exec.c b/fs/exec.c index 828ec5f07de0..dcd4ac7d3f1e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1307,13 +1307,13 @@ static void bprm_fill_uid(struct linux_binprm *bprm) return; /* Be careful if suid/sgid is set */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* reload atomically mode/uid/gid now that lock held */ mode = inode->i_mode; uid = inode->i_uid; gid = inode->i_gid; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* We ignore suid/sgid if there are no mappings for them in the ns */ if (!kuid_has_mapping(bprm->cred->user_ns, uid) || diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 906de66e8e7e..28645f0640f7 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -52,9 +52,9 @@ static int exofs_file_fsync(struct file *filp, loff_t start, loff_t end, if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = sync_inode_metadata(filp->f_mapping->host, 1); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 714cd37a6ba3..c46f1a190b8d 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -124,10 +124,10 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, int err; parent = ERR_PTR(-EACCES); - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock(dentry->d_inode); if (mnt->mnt_sb->s_export_op->get_parent) parent = mnt->mnt_sb->s_export_op->get_parent(dentry); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); if (IS_ERR(parent)) { dprintk("%s: get_parent of %ld failed, err %d\n", @@ -143,9 +143,9 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, if (err) goto out_err; dprintk("%s: found name: %s\n", __func__, nbuf); - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); tmp = lookup_one_len(nbuf, parent, strlen(nbuf)); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); if (IS_ERR(tmp)) { dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp)); goto out_err; @@ -503,10 +503,10 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, */ err = exportfs_get_name(mnt, target_dir, nbuf, result); if (!err) { - mutex_lock(&target_dir->d_inode->i_mutex); + inode_lock(target_dir->d_inode); nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf)); - mutex_unlock(&target_dir->d_inode->i_mutex); + inode_unlock(target_dir->d_inode); if (!IS_ERR(nresult)) { if (nresult->d_inode) { dput(result); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 11a42c5a09ae..2c88d683cd91 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -102,8 +102,8 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, { struct inode *inode = file_inode(vma->vm_file); struct ext2_inode_info *ei = EXT2_I(inode); - int ret = VM_FAULT_NOPAGE; loff_t size; + int ret; sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); @@ -113,6 +113,8 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; + else + ret = dax_pfn_mkwrite(vma, vmf); up_read(&ei->dax_sem); sb_end_pagefault(inode->i_sb); diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 5d46c09863f0..b386af2e45f4 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -51,10 +51,10 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) flags = ext2_mask_flags(inode->i_mode, flags); - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Is it quota file? Do not allow user to mess with it */ if (IS_NOQUOTA(inode)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ret = -EPERM; goto setflags_out; } @@ -68,7 +68,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) */ if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ret = -EPERM; goto setflags_out; } @@ -80,7 +80,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) ext2_set_inode_flags(inode); inode->i_ctime = CURRENT_TIME_SEC; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mark_inode_dirty(inode); setflags_out: @@ -102,10 +102,10 @@ setflags_out: goto setversion_out; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); inode->i_ctime = CURRENT_TIME_SEC; inode->i_generation = generation; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mark_inode_dirty(inode); setversion_out: diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 1a0835073663..c8021208a7eb 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -384,14 +384,12 @@ int ext4_decrypt(struct page *page) EXT4_DECRYPT, page->index, page, page); } -int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) +int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, ext4_lblk_t len) { struct ext4_crypto_ctx *ctx; struct page *ciphertext_page = NULL; struct bio *bio; - ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); - ext4_fsblk_t pblk = ext4_ext_pblock(ex); - unsigned int len = ext4_ext_get_actual_len(ex); int ret, err = 0; #if 0 diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index c5882b36e558..9a16d1e75a49 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -213,9 +213,11 @@ retry: res = -ENOKEY; goto out; } + down_read(&keyring_key->sem); ukp = user_key_payload(keyring_key); if (ukp->datalen != sizeof(struct ext4_encryption_key)) { res = -EINVAL; + up_read(&keyring_key->sem); goto out; } master_key = (struct ext4_encryption_key *)ukp->data; @@ -226,10 +228,12 @@ retry: "ext4: key size incorrect: %d\n", master_key->size); res = -ENOKEY; + up_read(&keyring_key->sem); goto out; } res = ext4_derive_key_aes(ctx.nonce, master_key->raw, raw_key); + up_read(&keyring_key->sem); if (res) goto out; got_key: diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index cc7ca4e87144..0662b285dc8a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -378,14 +378,22 @@ struct flex_groups { #define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ -#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x004380FF /* User modifiable flags */ +#define EXT4_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE 0x204380FF /* User modifiable flags */ + +#define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \ + EXT4_IMMUTABLE_FL | \ + EXT4_APPEND_FL | \ + EXT4_NODUMP_FL | \ + EXT4_NOATIME_FL | \ + EXT4_PROJINHERIT_FL) /* Flags that should be inherited by new inodes from their parent. */ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ - EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) + EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\ + EXT4_PROJINHERIT_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). */ #define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL)) @@ -555,10 +563,12 @@ enum { #define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 /* Request will not result in inode size update (user for fallocate) */ #define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 - /* Do not take i_data_sem locking in ext4_map_blocks */ -#define EXT4_GET_BLOCKS_NO_LOCK 0x0100 /* Convert written extents to unwritten */ -#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0200 +#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0100 + /* Write zeros to newly created written extents */ +#define EXT4_GET_BLOCKS_ZERO 0x0200 +#define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\ + EXT4_GET_BLOCKS_ZERO) /* * The bit position of these flags must not overlap with any of the @@ -616,6 +626,46 @@ enum { #define EXT4_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) #define EXT4_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct ext4_encryption_policy) +#ifndef FS_IOC_FSGETXATTR +/* Until the uapi changes get merged for project quota... */ + +#define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr) +#define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) + +/* + * Structure for FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR. + */ +struct fsxattr { + __u32 fsx_xflags; /* xflags field value (get/set) */ + __u32 fsx_extsize; /* extsize field value (get/set)*/ + __u32 fsx_nextents; /* nextents field value (get) */ + __u32 fsx_projid; /* project identifier (get/set) */ + unsigned char fsx_pad[12]; +}; + +/* + * Flags for the fsx_xflags field + */ +#define FS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */ +#define FS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */ +#define FS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ +#define FS_XFLAG_APPEND 0x00000010 /* all writes append */ +#define FS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ +#define FS_XFLAG_NOATIME 0x00000040 /* do not update access time */ +#define FS_XFLAG_NODUMP 0x00000080 /* do not include in backups */ +#define FS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */ +#define FS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ +#define FS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */ +#define FS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ +#define FS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ +#define FS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ +#define FS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ +#define FS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ +#endif /* !defined(FS_IOC_FSGETXATTR) */ + +#define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR +#define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR + #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* * ioctl commands in 32 bit emulation @@ -910,6 +960,15 @@ struct ext4_inode_info { * by other means, so we have i_data_sem. */ struct rw_semaphore i_data_sem; + /* + * i_mmap_sem is for serializing page faults with truncate / punch hole + * operations. We have to make sure that new page cannot be faulted in + * a section of the inode that is being punched. We cannot easily use + * i_data_sem for this since we need protection for the whole punch + * operation and i_data_sem ranks below transaction start so we have + * to occasionally drop it. + */ + struct rw_semaphore i_mmap_sem; struct inode vfs_inode; struct jbd2_inode *jinode; @@ -993,6 +1052,7 @@ struct ext4_inode_info { /* Encryption params */ struct ext4_crypt_info *i_crypt_info; #endif + kprojid_t i_projid; }; /* @@ -1248,7 +1308,7 @@ struct ext4_super_block { #endif /* Number of quota types we support */ -#define EXT4_MAXQUOTAS 2 +#define EXT4_MAXQUOTAS 3 /* * fourth extended-fs super-block data in memory @@ -1754,7 +1814,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ - EXT4_FEATURE_RO_COMPAT_QUOTA) + EXT4_FEATURE_RO_COMPAT_QUOTA |\ + EXT4_FEATURE_RO_COMPAT_PROJECT) #define EXTN_FEATURE_FUNCS(ver) \ static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ @@ -1796,6 +1857,11 @@ static inline bool ext4_has_incompat_features(struct super_block *sb) #define EXT4_DEF_RESUID 0 #define EXT4_DEF_RESGID 0 +/* + * Default project ID + */ +#define EXT4_DEF_PROJID 0 + #define EXT4_DEF_INODE_READAHEAD_BLKS 32 /* @@ -2234,7 +2300,8 @@ void ext4_restore_control_page(struct page *data_page); struct page *ext4_encrypt(struct inode *inode, struct page *plaintext_page); int ext4_decrypt(struct page *page); -int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex); +int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, ext4_lblk_t len); #ifdef CONFIG_EXT4_FS_ENCRYPTION int ext4_init_crypto(void); @@ -2440,8 +2507,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); -int ext4_get_block_dax(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); +int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, @@ -2484,9 +2551,13 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); +extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); +extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); +extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, ext4_lblk_t len); /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, @@ -2825,7 +2896,7 @@ do { \ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) { WARN_ON_ONCE(S_ISREG(inode->i_mode) && - !mutex_is_locked(&inode->i_mutex)); + !inode_is_locked(inode)); down_write(&EXT4_I(inode)->i_data_sem); if (newsize > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = newsize; @@ -2848,6 +2919,9 @@ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) return changed; } +int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, + loff_t len); + struct ext4_group_info { unsigned long bb_state; struct rb_root bb_free_root; @@ -2986,8 +3060,7 @@ extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, struct page *page); extern int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, - struct inode *inode); + struct inode *dir, struct inode *inode); extern int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, struct inode *inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 551353b1b17a..0ffabaf90aa5 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3119,19 +3119,11 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) { ext4_fsblk_t ee_pblock; unsigned int ee_len; - int ret; ee_len = ext4_ext_get_actual_len(ex); ee_pblock = ext4_ext_pblock(ex); - - if (ext4_encrypted_inode(inode)) - return ext4_encrypted_zeroout(inode, ex); - - ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS); - if (ret > 0) - ret = 0; - - return ret; + return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock, + ee_len); } /* @@ -4052,6 +4044,14 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, } /* IO end_io complete, convert the filled extent to written */ if (flags & EXT4_GET_BLOCKS_CONVERT) { + if (flags & EXT4_GET_BLOCKS_ZERO) { + if (allocated > map->m_len) + allocated = map->m_len; + err = ext4_issue_zeroout(inode, map->m_lblk, newblock, + allocated); + if (err < 0) + goto out2; + } ret = ext4_convert_unwritten_extents_endio(handle, inode, map, ppath); if (ret >= 0) { @@ -4685,10 +4685,6 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, if (len <= EXT_UNWRITTEN_MAX_LEN) flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; - /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); - inode_dio_wait(inode); - /* * credits to insert 1 extent into extent tree */ @@ -4752,8 +4748,6 @@ retry: goto retry; } - ext4_inode_resume_unlocked_dio(inode); - return ret > 0 ? ret2 : ret; } @@ -4770,7 +4764,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, int partial_begin, partial_end; loff_t start, end; ext4_lblk_t lblk; - struct address_space *mapping = inode->i_mapping; unsigned int blkbits = inode->i_blkbits; trace_ext4_zero_range(inode, offset, len, mode); @@ -4786,17 +4779,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, } /* - * Write out all dirty pages to avoid race conditions - * Then release them. - */ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - ret = filemap_write_and_wait_range(mapping, offset, - offset + len - 1); - if (ret) - return ret; - } - - /* * Round up offset. This is not fallocate, we neet to zero out * blocks, so convert interior block aligned part of the range to * unwritten and possibly manually zero out unaligned parts of the @@ -4817,7 +4799,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, else max_blocks -= lblk; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * Indirect files do not support unwritten extnets @@ -4839,6 +4821,10 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + /* Preallocate the range including the unaligned edges */ if (partial_begin || partial_end) { ret = ext4_alloc_file_blocks(file, @@ -4847,7 +4833,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, round_down(offset, 1 << blkbits)) >> blkbits, new_size, flags, mode); if (ret) - goto out_mutex; + goto out_dio; } @@ -4856,16 +4842,23 @@ static long ext4_zero_range(struct file *file, loff_t offset, flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE); - /* Now release the pages and zero block aligned part of pages*/ + /* + * Prevent page faults from reinstantiating pages we have + * released from page cache. + */ + down_write(&EXT4_I(inode)->i_mmap_sem); + ret = ext4_update_disksize_before_punch(inode, offset, len); + if (ret) { + up_write(&EXT4_I(inode)->i_mmap_sem); + goto out_dio; + } + /* Now release the pages and zero block aligned part of pages */ truncate_pagecache_range(inode, start, end - 1); inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); - inode_dio_wait(inode); - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags, mode); + up_write(&EXT4_I(inode)->i_mmap_sem); if (ret) goto out_dio; } @@ -4909,7 +4902,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, out_dio: ext4_inode_resume_unlocked_dio(inode); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -4980,7 +4973,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * We only support preallocation for extent-based files only @@ -4998,8 +4991,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) goto out; } + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags, mode); + ext4_inode_resume_unlocked_dio(inode); if (ret) goto out; @@ -5008,7 +5006,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) EXT4_I(inode)->i_sync_tid); } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); return ret; } @@ -5494,21 +5492,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) return ret; } - /* - * Need to round down offset to be aligned with page size boundary - * for page size > block size. - */ - ioffset = round_down(offset, PAGE_SIZE); - - /* Write out all dirty pages */ - ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, - LLONG_MAX); - if (ret) - return ret; - - /* Take mutex lock */ - mutex_lock(&inode->i_mutex); - + inode_lock(inode); /* * There is no need to overlap collapse range with EOF, in which case * it is effectively a truncate operation @@ -5524,17 +5508,43 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) goto out_mutex; } - truncate_pagecache(inode, ioffset); - /* Wait for existing dio to complete */ ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); + /* + * Prevent page faults from reinstantiating pages we have released from + * page cache. + */ + down_write(&EXT4_I(inode)->i_mmap_sem); + /* + * Need to round down offset to be aligned with page size boundary + * for page size > block size. + */ + ioffset = round_down(offset, PAGE_SIZE); + /* + * Write tail of the last page before removed range since it will get + * removed from the page cache below. + */ + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset); + if (ret) + goto out_mmap; + /* + * Write data that will be shifted to preserve them when discarding + * page cache below. We are also protected from pages becoming dirty + * by i_mmap_sem. + */ + ret = filemap_write_and_wait_range(inode->i_mapping, offset + len, + LLONG_MAX); + if (ret) + goto out_mmap; + truncate_pagecache(inode, ioffset); + credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - goto out_dio; + goto out_mmap; } down_write(&EXT4_I(inode)->i_data_sem); @@ -5573,10 +5583,11 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) out_stop: ext4_journal_stop(handle); -out_dio: +out_mmap: + up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -5627,21 +5638,7 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) return ret; } - /* - * Need to round down to align start offset to page size boundary - * for page size > block size. - */ - ioffset = round_down(offset, PAGE_SIZE); - - /* Write out all dirty pages */ - ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, - LLONG_MAX); - if (ret) - return ret; - - /* Take mutex lock */ - mutex_lock(&inode->i_mutex); - + inode_lock(inode); /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { ret = -EOPNOTSUPP; @@ -5660,17 +5657,32 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) goto out_mutex; } - truncate_pagecache(inode, ioffset); - /* Wait for existing dio to complete */ ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); + /* + * Prevent page faults from reinstantiating pages we have released from + * page cache. + */ + down_write(&EXT4_I(inode)->i_mmap_sem); + /* + * Need to round down to align start offset to page size boundary + * for page size > block size. + */ + ioffset = round_down(offset, PAGE_SIZE); + /* Write out all dirty pages */ + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, + LLONG_MAX); + if (ret) + goto out_mmap; + truncate_pagecache(inode, ioffset); + credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - goto out_dio; + goto out_mmap; } /* Expand file to avoid data loss if there is error while shifting */ @@ -5741,10 +5753,11 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) out_stop: ext4_journal_stop(handle); -out_dio: +out_mmap: + up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -5779,8 +5792,8 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem)); BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem)); - BUG_ON(!mutex_is_locked(&inode1->i_mutex)); - BUG_ON(!mutex_is_locked(&inode2->i_mutex)); + BUG_ON(!inode_is_locked(inode1)); + BUG_ON(!inode_is_locked(inode2)); *erp = ext4_es_remove_extent(inode1, lblk1, count); if (unlikely(*erp)) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 113837e7ba98..1126436dada1 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -113,7 +113,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ext4_unwritten_wait(inode); } - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret <= 0) goto out; @@ -169,7 +169,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } ret = __generic_file_write_iter(iocb, from); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret > 0) { ssize_t err; @@ -186,50 +186,42 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return ret; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (aio_mutex) mutex_unlock(aio_mutex); return ret; } #ifdef CONFIG_FS_DAX -static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) -{ - struct inode *inode = bh->b_assoc_map->host; - /* XXX: breaks on 32-bit > 16TB. Is that even supported? */ - loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; - int err; - if (!uptodate) - return; - WARN_ON(!buffer_unwritten(bh)); - err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); -} - static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { int result; handle_t *handle = NULL; - struct super_block *sb = file_inode(vma->vm_file)->i_sb; + struct inode *inode = file_inode(vma->vm_file); + struct super_block *sb = inode->i_sb; bool write = vmf->flags & FAULT_FLAG_WRITE; if (write) { sb_start_pagefault(sb); file_update_time(vma->vm_file); + down_read(&EXT4_I(inode)->i_mmap_sem); handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, EXT4_DATA_TRANS_BLOCKS(sb)); - } + } else + down_read(&EXT4_I(inode)->i_mmap_sem); if (IS_ERR(handle)) result = VM_FAULT_SIGBUS; else - result = __dax_fault(vma, vmf, ext4_get_block_dax, - ext4_end_io_unwritten); + result = __dax_fault(vma, vmf, ext4_dax_mmap_get_block, NULL); if (write) { if (!IS_ERR(handle)) ext4_journal_stop(handle); + up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(sb); - } + } else + up_read(&EXT4_I(inode)->i_mmap_sem); return result; } @@ -246,44 +238,88 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, if (write) { sb_start_pagefault(sb); file_update_time(vma->vm_file); + down_read(&EXT4_I(inode)->i_mmap_sem); handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, ext4_chunk_trans_blocks(inode, PMD_SIZE / PAGE_SIZE)); - } + } else + down_read(&EXT4_I(inode)->i_mmap_sem); if (IS_ERR(handle)) result = VM_FAULT_SIGBUS; else result = __dax_pmd_fault(vma, addr, pmd, flags, - ext4_get_block_dax, ext4_end_io_unwritten); + ext4_dax_mmap_get_block, NULL); if (write) { if (!IS_ERR(handle)) ext4_journal_stop(handle); + up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(sb); - } + } else + up_read(&EXT4_I(inode)->i_mmap_sem); return result; } static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_mkwrite(vma, vmf, ext4_get_block_dax, - ext4_end_io_unwritten); + int err; + struct inode *inode = file_inode(vma->vm_file); + + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + down_read(&EXT4_I(inode)->i_mmap_sem); + err = __dax_mkwrite(vma, vmf, ext4_dax_mmap_get_block, NULL); + up_read(&EXT4_I(inode)->i_mmap_sem); + sb_end_pagefault(inode->i_sb); + + return err; +} + +/* + * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_mkwrite() + * handler we check for races agaist truncate. Note that since we cycle through + * i_mmap_sem, we are sure that also any hole punching that began before we + * were called is finished by now and so if it included part of the file we + * are working on, our pte will get unmapped and the check for pte_same() in + * wp_pfn_shared() fails. Thus fault gets retried and things work out as + * desired. + */ +static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + struct super_block *sb = inode->i_sb; + loff_t size; + int ret; + + sb_start_pagefault(sb); + file_update_time(vma->vm_file); + down_read(&EXT4_I(inode)->i_mmap_sem); + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (vmf->pgoff >= size) + ret = VM_FAULT_SIGBUS; + else + ret = dax_pfn_mkwrite(vma, vmf); + up_read(&EXT4_I(inode)->i_mmap_sem); + sb_end_pagefault(sb); + + return ret; } static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, .pmd_fault = ext4_dax_pmd_fault, .page_mkwrite = ext4_dax_mkwrite, - .pfn_mkwrite = dax_pfn_mkwrite, + .pfn_mkwrite = ext4_dax_pfn_mkwrite, }; #else #define ext4_dax_vm_ops ext4_file_vm_ops #endif static const struct vm_operations_struct ext4_file_vm_ops = { - .fault = filemap_fault, + .fault = ext4_filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, }; @@ -527,11 +563,11 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) int blkbits; int ret = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); if (offset >= isize) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } @@ -579,7 +615,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) dataoff = (loff_t)last << blkbits; } while (last <= end); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (dataoff > isize) return -ENXIO; @@ -600,11 +636,11 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) int blkbits; int ret = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); if (offset >= isize) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } @@ -655,7 +691,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) break; } while (last <= end); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (holeoff > isize) holeoff = isize; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 1b8024d26f65..3fcfd50a2e8a 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -799,6 +799,13 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, inode->i_gid = dir->i_gid; } else inode_init_owner(inode, dir, mode); + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) && + ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) + ei->i_projid = EXT4_I(dir)->i_projid; + else + ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID); + err = dquot_initialize(inode); if (err) goto out; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index d884989cc83d..dfe3b9bafc0d 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -995,12 +995,11 @@ void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, */ static int ext4_add_dirent_to_inline(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, + struct inode *dir, struct inode *inode, struct ext4_iloc *iloc, void *inline_start, int inline_size) { - struct inode *dir = d_inode(dentry->d_parent); int err; struct ext4_dir_entry_2 *de; @@ -1245,12 +1244,11 @@ out: * the new created block. */ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, struct inode *inode) + struct inode *dir, struct inode *inode) { int ret, inline_size; void *inline_start; struct ext4_iloc iloc; - struct inode *dir = d_inode(dentry->d_parent); ret = ext4_get_inode_loc(dir, &iloc); if (ret) @@ -1264,7 +1262,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; - ret = ext4_add_dirent_to_inline(handle, fname, dentry, inode, &iloc, + ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc, inline_start, inline_size); if (ret != -ENOSPC) goto out; @@ -1285,7 +1283,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, if (inline_size) { inline_start = ext4_get_inline_xattr_pos(dir, &iloc); - ret = ext4_add_dirent_to_inline(handle, fname, dentry, + ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc, inline_start, inline_size); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b3bd912df6bf..83bc8bfb3bea 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -383,6 +383,21 @@ static int __check_block_validity(struct inode *inode, const char *func, return 0; } +int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, + ext4_lblk_t len) +{ + int ret; + + if (ext4_encrypted_inode(inode)) + return ext4_encrypted_zeroout(inode, lblk, pblk, len); + + ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); + if (ret > 0) + ret = 0; + + return ret; +} + #define check_block_validity(inode, map) \ __check_block_validity((inode), __func__, __LINE__, (map)) @@ -403,8 +418,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, * out taking i_data_sem. So at the time the unwritten extent * could be converted. */ - if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) - down_read(&EXT4_I(inode)->i_data_sem); + down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, flags & EXT4_GET_BLOCKS_KEEP_SIZE); @@ -412,8 +426,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, retval = ext4_ind_map_blocks(handle, inode, map, flags & EXT4_GET_BLOCKS_KEEP_SIZE); } - if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) - up_read((&EXT4_I(inode)->i_data_sem)); + up_read((&EXT4_I(inode)->i_data_sem)); /* * We don't check m_len because extent will be collpased in status @@ -509,8 +522,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * Try to see if we can get the block without requesting a new * file system block. */ - if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) - down_read(&EXT4_I(inode)->i_data_sem); + down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, flags & EXT4_GET_BLOCKS_KEEP_SIZE); @@ -541,8 +553,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, if (ret < 0) retval = ret; } - if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) - up_read((&EXT4_I(inode)->i_data_sem)); + up_read((&EXT4_I(inode)->i_data_sem)); found: if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { @@ -626,13 +637,29 @@ found: } /* + * We have to zeroout blocks before inserting them into extent + * status tree. Otherwise someone could look them up there and + * use them before they are really zeroed. + */ + if (flags & EXT4_GET_BLOCKS_ZERO && + map->m_flags & EXT4_MAP_MAPPED && + map->m_flags & EXT4_MAP_NEW) { + ret = ext4_issue_zeroout(inode, map->m_lblk, + map->m_pblk, map->m_len); + if (ret) { + retval = ret; + goto out_sem; + } + } + + /* * If the extent has been zeroed out, we don't need to update * extent status tree. */ if ((flags & EXT4_GET_BLOCKS_PRE_IO) && ext4_es_lookup_extent(inode, map->m_lblk, &es)) { if (ext4_es_is_written(&es)) - goto has_zeroout; + goto out_sem; } status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; @@ -643,11 +670,13 @@ found: status |= EXTENT_STATUS_DELAYED; ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); - if (ret < 0) + if (ret < 0) { retval = ret; + goto out_sem; + } } -has_zeroout: +out_sem: up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); @@ -674,7 +703,7 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; - if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) { + if (flags && !handle) { /* Direct IO write... */ if (map.m_len > DIO_MAX_BLOCKS) map.m_len = DIO_MAX_BLOCKS; @@ -694,16 +723,6 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; - if (IS_DAX(inode) && buffer_unwritten(bh)) { - /* - * dgc: I suspect unwritten conversion on ext4+DAX is - * fundamentally broken here when there are concurrent - * read/write in progress on this inode. - */ - WARN_ON_ONCE(io_end); - bh->b_assoc_map = inode->i_mapping; - bh->b_private = (void *)(unsigned long)iblock; - } if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) set_buffer_defer_completion(bh); bh->b_size = inode->i_sb->s_blocksize * map.m_len; @@ -879,9 +898,6 @@ int do_journal_get_write_access(handle_t *handle, return ret; } -static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); - #ifdef CONFIG_EXT4_FS_ENCRYPTION static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, get_block_t *get_block) @@ -3054,25 +3070,96 @@ int ext4_get_block_write(struct inode *inode, sector_t iblock, EXT4_GET_BLOCKS_IO_CREATE_EXT); } -static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, +static int ext4_get_block_overwrite(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n", + int ret; + + ext4_debug("ext4_get_block_overwrite: inode %lu, create flag %d\n", inode->i_ino, create); - return _ext4_get_block(inode, iblock, bh_result, - EXT4_GET_BLOCKS_NO_LOCK); + ret = _ext4_get_block(inode, iblock, bh_result, 0); + /* + * Blocks should have been preallocated! ext4_file_write_iter() checks + * that. + */ + WARN_ON_ONCE(!buffer_mapped(bh_result)); + + return ret; } -int ext4_get_block_dax(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +#ifdef CONFIG_FS_DAX +int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) { - int flags = EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_UNWRIT_EXT; - if (create) - flags |= EXT4_GET_BLOCKS_CREATE; - ext4_debug("ext4_get_block_dax: inode %lu, create flag %d\n", + int ret, err; + int credits; + struct ext4_map_blocks map; + handle_t *handle = NULL; + int flags = 0; + + ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n", inode->i_ino, create); - return _ext4_get_block(inode, iblock, bh_result, flags); + map.m_lblk = iblock; + map.m_len = bh_result->b_size >> inode->i_blkbits; + credits = ext4_chunk_trans_blocks(inode, map.m_len); + if (create) { + flags |= EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_CREATE_ZERO; + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + return ret; + } + } + + ret = ext4_map_blocks(handle, inode, &map, flags); + if (create) { + err = ext4_journal_stop(handle); + if (ret >= 0 && err < 0) + ret = err; + } + if (ret <= 0) + goto out; + if (map.m_flags & EXT4_MAP_UNWRITTEN) { + int err2; + + /* + * We are protected by i_mmap_sem so we know block cannot go + * away from under us even though we dropped i_data_sem. + * Convert extent to written and write zeros there. + * + * Note: We may get here even when create == 0. + */ + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + err = ext4_map_blocks(handle, inode, &map, + EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_CREATE_ZERO); + if (err < 0) + ret = err; + err2 = ext4_journal_stop(handle); + if (err2 < 0 && ret > 0) + ret = err2; + } +out: + WARN_ON_ONCE(ret == 0 && create); + if (ret > 0) { + map_bh(bh_result, inode->i_sb, map.m_pblk); + bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) | + map.m_flags; + /* + * At least for now we have to clear BH_New so that DAX code + * doesn't attempt to zero blocks again in a racy way. + */ + bh_result->b_state &= ~(1 << BH_New); + bh_result->b_size = map.m_len << inode->i_blkbits; + ret = 0; + } + return ret; } +#endif static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ssize_t size, void *private) @@ -3143,10 +3230,8 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, /* If we do a overwrite dio, i_mutex locking can be released */ overwrite = *((int *)iocb->private); - if (overwrite) { - down_read(&EXT4_I(inode)->i_data_sem); - mutex_unlock(&inode->i_mutex); - } + if (overwrite) + inode_unlock(inode); /* * We could direct write to holes and fallocate. @@ -3189,7 +3274,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, } if (overwrite) { - get_block_func = ext4_get_block_write_nolock; + get_block_func = ext4_get_block_overwrite; } else { get_block_func = ext4_get_block_write; dio_flags = DIO_LOCKING; @@ -3245,10 +3330,8 @@ retake_lock: if (iov_iter_rw(iter) == WRITE) inode_dio_end(inode); /* take i_mutex locking again if we do a ovewrite dio */ - if (overwrite) { - up_read(&EXT4_I(inode)->i_data_sem); - mutex_lock(&inode->i_mutex); - } + if (overwrite) + inode_lock(inode); return ret; } @@ -3559,6 +3642,35 @@ int ext4_can_truncate(struct inode *inode) } /* + * We have to make sure i_disksize gets properly updated before we truncate + * page cache due to hole punching or zero range. Otherwise i_disksize update + * can get lost as it may have been postponed to submission of writeback but + * that will never happen after we truncate page cache. + */ +int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, + loff_t len) +{ + handle_t *handle; + loff_t size = i_size_read(inode); + + WARN_ON(!inode_is_locked(inode)); + if (offset > size || offset + len < size) + return 0; + + if (EXT4_I(inode)->i_disksize >= size) + return 0; + + handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ext4_update_i_disksize(inode, size); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + + return 0; +} + +/* * ext4_punch_hole: punches a hole in a file by releaseing the blocks * associated with the given offset and length * @@ -3595,7 +3707,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) return ret; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* No need to punch hole beyond i_size */ if (offset >= inode->i_size) @@ -3623,17 +3735,26 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) } + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + /* + * Prevent page faults from reinstantiating pages we have released from + * page cache. + */ + down_write(&EXT4_I(inode)->i_mmap_sem); first_block_offset = round_up(offset, sb->s_blocksize); last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; /* Now release the pages and zero block aligned part of pages*/ - if (last_block_offset > first_block_offset) + if (last_block_offset > first_block_offset) { + ret = ext4_update_disksize_before_punch(inode, offset, length); + if (ret) + goto out_dio; truncate_pagecache_range(inode, first_block_offset, last_block_offset); - - /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); - inode_dio_wait(inode); + } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); @@ -3680,19 +3801,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) if (IS_SYNC(inode)) ext4_handle_sync(handle); - /* Now release the pages again to reduce race window */ - if (last_block_offset > first_block_offset) - truncate_pagecache_range(inode, first_block_offset, - last_block_offset); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); out_stop: ext4_journal_stop(handle); out_dio: + up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -3762,7 +3879,7 @@ void ext4_truncate(struct inode *inode) * have i_mutex locked because it's not necessary. */ if (!(inode->i_state & (I_NEW|I_FREEING))) - WARN_ON(!mutex_is_locked(&inode->i_mutex)); + WARN_ON(!inode_is_locked(inode)); trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) @@ -4076,6 +4193,14 @@ static inline void ext4_iget_extra_inode(struct inode *inode, EXT4_I(inode)->i_inline_off = 0; } +int ext4_get_projid(struct inode *inode, kprojid_t *projid) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_PROJECT)) + return -EOPNOTSUPP; + *projid = EXT4_I(inode)->i_projid; + return 0; +} + struct inode *ext4_iget(struct super_block *sb, unsigned long ino) { struct ext4_iloc iloc; @@ -4087,6 +4212,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) int block; uid_t i_uid; gid_t i_gid; + projid_t i_projid; inode = iget_locked(sb, ino); if (!inode) @@ -4136,12 +4262,20 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_mode = le16_to_cpu(raw_inode->i_mode); i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) && + EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) + i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid); + else + i_projid = EXT4_DEF_PROJID; + if (!(test_opt(inode->i_sb, NO_UID32))) { i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } i_uid_write(inode, i_uid); i_gid_write(inode, i_gid); + ei->i_projid = make_kprojid(&init_user_ns, i_projid); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ @@ -4440,6 +4574,7 @@ static int ext4_do_update_inode(handle_t *handle, int need_datasync = 0, set_large_file = 0; uid_t i_uid; gid_t i_gid; + projid_t i_projid; spin_lock(&ei->i_raw_lock); @@ -4452,6 +4587,7 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_mode = cpu_to_le16(inode->i_mode); i_uid = i_uid_read(inode); i_gid = i_gid_read(inode); + i_projid = from_kprojid(&init_user_ns, ei->i_projid); if (!(test_opt(inode->i_sb, NO_UID32))) { raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); @@ -4529,6 +4665,15 @@ static int ext4_do_update_inode(handle_t *handle, cpu_to_le16(ei->i_extra_isize); } } + + BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_PROJECT) && + i_projid != EXT4_DEF_PROJID); + + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) + raw_inode->i_projid = cpu_to_le32(i_projid); + ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); if (inode->i_sb->s_flags & MS_LAZYTIME) @@ -4824,6 +4969,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) } else ext4_wait_for_tail_page_commit(inode); } + down_write(&EXT4_I(inode)->i_mmap_sem); /* * Truncate pagecache after we've waited for commit * in data=journal mode to make pages freeable. @@ -4831,6 +4977,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) truncate_pagecache(inode, inode->i_size); if (shrink) ext4_truncate(inode); + up_write(&EXT4_I(inode)->i_mmap_sem); } if (!rc) { @@ -5279,6 +5426,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); + + down_read(&EXT4_I(inode)->i_mmap_sem); /* Delalloc case is easy... */ if (test_opt(inode->i_sb, DELALLOC) && !ext4_should_journal_data(inode) && @@ -5348,6 +5497,19 @@ retry_alloc: out_ret: ret = block_page_mkwrite_return(ret); out: + up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(inode->i_sb); return ret; } + +int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + int err; + + down_read(&EXT4_I(inode)->i_mmap_sem); + err = filemap_fault(vma, vmf); + up_read(&EXT4_I(inode)->i_mmap_sem); + + return err; +} diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 5e872fd40e5e..0f6c36922c24 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -14,6 +14,7 @@ #include <linux/mount.h> #include <linux/file.h> #include <linux/random.h> +#include <linux/quotaops.h> #include <asm/uaccess.h> #include "ext4_jbd2.h" #include "ext4.h" @@ -202,6 +203,238 @@ static int uuid_is_zero(__u8 u[16]) return 1; } +static int ext4_ioctl_setflags(struct inode *inode, + unsigned int flags) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + handle_t *handle = NULL; + int err = EPERM, migrate = 0; + struct ext4_iloc iloc; + unsigned int oldflags, mask, i; + unsigned int jflag; + + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) + goto flags_out; + + oldflags = ei->i_flags; + + /* The JOURNAL_DATA flag is modifiable only by root */ + jflag = flags & EXT4_JOURNAL_DATA_FL; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + * + * This test looks nicer. Thanks to Pauline Middelink + */ + if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) + goto flags_out; + } + + /* + * The JOURNAL_DATA flag can only be changed by + * the relevant capability. + */ + if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { + if (!capable(CAP_SYS_RESOURCE)) + goto flags_out; + } + if ((flags ^ oldflags) & EXT4_EXTENTS_FL) + migrate = 1; + + if (flags & EXT4_EOFBLOCKS_FL) { + /* we don't support adding EOFBLOCKS flag */ + if (!(oldflags & EXT4_EOFBLOCKS_FL)) { + err = -EOPNOTSUPP; + goto flags_out; + } + } else if (oldflags & EXT4_EOFBLOCKS_FL) + ext4_truncate(inode); + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto flags_out; + } + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto flags_err; + + for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { + if (!(mask & EXT4_FL_USER_MODIFIABLE)) + continue; + if (mask & flags) + ext4_set_inode_flag(inode, i); + else + ext4_clear_inode_flag(inode, i); + } + + ext4_set_inode_flags(inode); + inode->i_ctime = ext4_current_time(inode); + + err = ext4_mark_iloc_dirty(handle, inode, &iloc); +flags_err: + ext4_journal_stop(handle); + if (err) + goto flags_out; + + if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) + err = ext4_change_inode_journal_flag(inode, jflag); + if (err) + goto flags_out; + if (migrate) { + if (flags & EXT4_EXTENTS_FL) + err = ext4_ext_migrate(inode); + else + err = ext4_ind_migrate(inode); + } + +flags_out: + return err; +} + +#ifdef CONFIG_QUOTA +static int ext4_ioctl_setproject(struct file *filp, __u32 projid) +{ + struct inode *inode = file_inode(filp); + struct super_block *sb = inode->i_sb; + struct ext4_inode_info *ei = EXT4_I(inode); + int err, rc; + handle_t *handle; + kprojid_t kprojid; + struct ext4_iloc iloc; + struct ext4_inode *raw_inode; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_PROJECT)) { + if (projid != EXT4_DEF_PROJID) + return -EOPNOTSUPP; + else + return 0; + } + + if (EXT4_INODE_SIZE(sb) <= EXT4_GOOD_OLD_INODE_SIZE) + return -EOPNOTSUPP; + + kprojid = make_kprojid(&init_user_ns, (projid_t)projid); + + if (projid_eq(kprojid, EXT4_I(inode)->i_projid)) + return 0; + + err = mnt_want_write_file(filp); + if (err) + return err; + + err = -EPERM; + inode_lock(inode); + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) + goto out_unlock; + + err = ext4_get_inode_loc(inode, &iloc); + if (err) + goto out_unlock; + + raw_inode = ext4_raw_inode(&iloc); + if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) { + err = -EOVERFLOW; + brelse(iloc.bh); + goto out_unlock; + } + brelse(iloc.bh); + + dquot_initialize(inode); + + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, + EXT4_QUOTA_INIT_BLOCKS(sb) + + EXT4_QUOTA_DEL_BLOCKS(sb) + 3); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto out_unlock; + } + + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_stop; + + if (sb_has_quota_limits_enabled(sb, PRJQUOTA)) { + struct dquot *transfer_to[MAXQUOTAS] = { }; + + transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); + if (transfer_to[PRJQUOTA]) { + err = __dquot_transfer(inode, transfer_to); + dqput(transfer_to[PRJQUOTA]); + if (err) + goto out_dirty; + } + } + EXT4_I(inode)->i_projid = kprojid; + inode->i_ctime = ext4_current_time(inode); +out_dirty: + rc = ext4_mark_iloc_dirty(handle, inode, &iloc); + if (!err) + err = rc; +out_stop: + ext4_journal_stop(handle); +out_unlock: + inode_unlock(inode); + mnt_drop_write_file(filp); + return err; +} +#else +static int ext4_ioctl_setproject(struct file *filp, __u32 projid) +{ + if (projid != EXT4_DEF_PROJID) + return -EOPNOTSUPP; + return 0; +} +#endif + +/* Transfer internal flags to xflags */ +static inline __u32 ext4_iflags_to_xflags(unsigned long iflags) +{ + __u32 xflags = 0; + + if (iflags & EXT4_SYNC_FL) + xflags |= FS_XFLAG_SYNC; + if (iflags & EXT4_IMMUTABLE_FL) + xflags |= FS_XFLAG_IMMUTABLE; + if (iflags & EXT4_APPEND_FL) + xflags |= FS_XFLAG_APPEND; + if (iflags & EXT4_NODUMP_FL) + xflags |= FS_XFLAG_NODUMP; + if (iflags & EXT4_NOATIME_FL) + xflags |= FS_XFLAG_NOATIME; + if (iflags & EXT4_PROJINHERIT_FL) + xflags |= FS_XFLAG_PROJINHERIT; + return xflags; +} + +/* Transfer xflags flags to internal */ +static inline unsigned long ext4_xflags_to_iflags(__u32 xflags) +{ + unsigned long iflags = 0; + + if (xflags & FS_XFLAG_SYNC) + iflags |= EXT4_SYNC_FL; + if (xflags & FS_XFLAG_IMMUTABLE) + iflags |= EXT4_IMMUTABLE_FL; + if (xflags & FS_XFLAG_APPEND) + iflags |= EXT4_APPEND_FL; + if (xflags & FS_XFLAG_NODUMP) + iflags |= EXT4_NODUMP_FL; + if (xflags & FS_XFLAG_NOATIME) + iflags |= EXT4_NOATIME_FL; + if (xflags & FS_XFLAG_PROJINHERIT) + iflags |= EXT4_PROJINHERIT_FL; + + return iflags; +} + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -217,11 +450,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) flags = ei->i_flags & EXT4_FL_USER_VISIBLE; return put_user(flags, (int __user *) arg); case EXT4_IOC_SETFLAGS: { - handle_t *handle = NULL; - int err, migrate = 0; - struct ext4_iloc iloc; - unsigned int oldflags, mask, i; - unsigned int jflag; + int err; if (!inode_owner_or_capable(inode)) return -EACCES; @@ -235,90 +464,9 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) flags = ext4_mask_flags(inode->i_mode, flags); - err = -EPERM; - mutex_lock(&inode->i_mutex); - /* Is it quota file? Do not allow user to mess with it */ - if (IS_NOQUOTA(inode)) - goto flags_out; - - oldflags = ei->i_flags; - - /* The JOURNAL_DATA flag is modifiable only by root */ - jflag = flags & EXT4_JOURNAL_DATA_FL; - - /* - * The IMMUTABLE and APPEND_ONLY flags can only be changed by - * the relevant capability. - * - * This test looks nicer. Thanks to Pauline Middelink - */ - if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) - goto flags_out; - } - - /* - * The JOURNAL_DATA flag can only be changed by - * the relevant capability. - */ - if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { - if (!capable(CAP_SYS_RESOURCE)) - goto flags_out; - } - if ((flags ^ oldflags) & EXT4_EXTENTS_FL) - migrate = 1; - - if (flags & EXT4_EOFBLOCKS_FL) { - /* we don't support adding EOFBLOCKS flag */ - if (!(oldflags & EXT4_EOFBLOCKS_FL)) { - err = -EOPNOTSUPP; - goto flags_out; - } - } else if (oldflags & EXT4_EOFBLOCKS_FL) - ext4_truncate(inode); - - handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto flags_out; - } - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - err = ext4_reserve_inode_write(handle, inode, &iloc); - if (err) - goto flags_err; - - for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { - if (!(mask & EXT4_FL_USER_MODIFIABLE)) - continue; - if (mask & flags) - ext4_set_inode_flag(inode, i); - else - ext4_clear_inode_flag(inode, i); - } - - ext4_set_inode_flags(inode); - inode->i_ctime = ext4_current_time(inode); - - err = ext4_mark_iloc_dirty(handle, inode, &iloc); -flags_err: - ext4_journal_stop(handle); - if (err) - goto flags_out; - - if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) - err = ext4_change_inode_journal_flag(inode, jflag); - if (err) - goto flags_out; - if (migrate) { - if (flags & EXT4_EXTENTS_FL) - err = ext4_ext_migrate(inode); - else - err = ext4_ind_migrate(inode); - } - -flags_out: - mutex_unlock(&inode->i_mutex); + inode_lock(inode); + err = ext4_ioctl_setflags(inode, flags); + inode_unlock(inode); mnt_drop_write_file(filp); return err; } @@ -349,7 +497,7 @@ flags_out: goto setversion_out; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); @@ -364,7 +512,7 @@ flags_out: ext4_journal_stop(handle); unlock_out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); setversion_out: mnt_drop_write_file(filp); return err; @@ -510,9 +658,9 @@ group_add_out: * ext4_ext_swap_inode_data before we switch the * inode format to prevent read. */ - mutex_lock(&(inode->i_mutex)); + inode_lock((inode)); err = ext4_ext_migrate(inode); - mutex_unlock(&(inode->i_mutex)); + inode_unlock((inode)); mnt_drop_write_file(filp); return err; } @@ -689,6 +837,60 @@ encryption_policy_out: return -EOPNOTSUPP; #endif } + case EXT4_IOC_FSGETXATTR: + { + struct fsxattr fa; + + memset(&fa, 0, sizeof(struct fsxattr)); + ext4_get_inode_flags(ei); + fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & EXT4_FL_USER_VISIBLE); + + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_PROJECT)) { + fa.fsx_projid = (__u32)from_kprojid(&init_user_ns, + EXT4_I(inode)->i_projid); + } + + if (copy_to_user((struct fsxattr __user *)arg, + &fa, sizeof(fa))) + return -EFAULT; + return 0; + } + case EXT4_IOC_FSSETXATTR: + { + struct fsxattr fa; + int err; + + if (copy_from_user(&fa, (struct fsxattr __user *)arg, + sizeof(fa))) + return -EFAULT; + + /* Make sure caller has proper permission */ + if (!inode_owner_or_capable(inode)) + return -EACCES; + + err = mnt_want_write_file(filp); + if (err) + return err; + + flags = ext4_xflags_to_iflags(fa.fsx_xflags); + flags = ext4_mask_flags(inode->i_mode, flags); + + inode_lock(inode); + flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) | + (flags & EXT4_FL_XFLAG_VISIBLE); + err = ext4_ioctl_setflags(inode, flags); + inode_unlock(inode); + mnt_drop_write_file(filp); + if (err) + return err; + + err = ext4_ioctl_setproject(filp, fa.fsx_projid); + if (err) + return err; + + return 0; + } default: return -ENOTTY; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f27e0c2598c5..06574dd77614 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -273,7 +273,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir); static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, struct inode *inode); + struct inode *dir, struct inode *inode); /* checksumming functions */ void initialize_dirent_tail(struct ext4_dir_entry_tail *t, @@ -1928,10 +1928,9 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, * directory, and adds the dentry to the indexed directory. */ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, + struct inode *dir, struct inode *inode, struct buffer_head *bh) { - struct inode *dir = d_inode(dentry->d_parent); struct buffer_head *bh2; struct dx_root *root; struct dx_frame frames[2], *frame; @@ -2086,8 +2085,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, return retval; if (ext4_has_inline_data(dir)) { - retval = ext4_try_add_inline_entry(handle, &fname, - dentry, inode); + retval = ext4_try_add_inline_entry(handle, &fname, dir, inode); if (retval < 0) goto out; if (retval == 1) { @@ -2097,7 +2095,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, } if (is_dx(dir)) { - retval = ext4_dx_add_entry(handle, &fname, dentry, inode); + retval = ext4_dx_add_entry(handle, &fname, dir, inode); if (!retval || (retval != ERR_BAD_DX_DIR)) goto out; ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); @@ -2119,7 +2117,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, if (blocks == 1 && !dx_fallback && ext4_has_feature_dir_index(sb)) { - retval = make_indexed_dir(handle, &fname, dentry, + retval = make_indexed_dir(handle, &fname, dir, inode, bh); bh = NULL; /* make_indexed_dir releases bh */ goto out; @@ -2154,12 +2152,11 @@ out: * Returns 0 for success, or a negative error value */ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, struct inode *inode) + struct inode *dir, struct inode *inode) { struct dx_frame frames[2], *frame; struct dx_entry *entries, *at; struct buffer_head *bh; - struct inode *dir = d_inode(dentry->d_parent); struct super_block *sb = dir->i_sb; struct ext4_dir_entry_2 *de; int err; @@ -2756,7 +2753,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) return 0; WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && - !mutex_is_locked(&inode->i_mutex)); + !inode_is_locked(inode)); /* * Exit early if inode already is on orphan list. This is a big speedup * since we don't have to contend on the global s_orphan_lock. @@ -2838,7 +2835,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) return 0; WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && - !mutex_is_locked(&inode->i_mutex)); + !inode_is_locked(inode)); /* Do this quick check before taking global s_orphan_lock. */ if (list_empty(&ei->i_orphan)) return 0; @@ -3212,6 +3209,12 @@ static int ext4_link(struct dentry *old_dentry, if (ext4_encrypted_inode(dir) && !ext4_is_child_context_consistent_with_parent(dir, inode)) return -EPERM; + + if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) && + (!projid_eq(EXT4_I(dir)->i_projid, + EXT4_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + err = dquot_initialize(dir); if (err) return err; @@ -3492,6 +3495,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, int credits; u8 old_file_type; + if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT)) && + (!projid_eq(EXT4_I(new_dir)->i_projid, + EXT4_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + retval = dquot_initialize(old.dir); if (retval) return retval; @@ -3701,6 +3709,14 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, new.inode))) return -EPERM; + if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) && + !projid_eq(EXT4_I(new_dir)->i_projid, + EXT4_I(old_dentry->d_inode)->i_projid)) || + (ext4_test_inode_flag(old_dir, EXT4_INODE_PROJINHERIT) && + !projid_eq(EXT4_I(old_dir)->i_projid, + EXT4_I(new_dentry->d_inode)->i_projid))) + return -EXDEV; + retval = dquot_initialize(old.dir); if (retval) return retval; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f1b56ff01208..3ed01ec011d7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -80,6 +80,36 @@ static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); +/* + * Lock ordering + * + * Note the difference between i_mmap_sem (EXT4_I(inode)->i_mmap_sem) and + * i_mmap_rwsem (inode->i_mmap_rwsem)! + * + * page fault path: + * mmap_sem -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start -> + * page lock -> i_data_sem (rw) + * + * buffered write path: + * sb_start_write -> i_mutex -> mmap_sem + * sb_start_write -> i_mutex -> transaction start -> page lock -> + * i_data_sem (rw) + * + * truncate: + * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) -> + * i_mmap_rwsem (w) -> page lock + * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) -> + * transaction start -> i_data_sem (rw) + * + * direct IO: + * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) -> mmap_sem + * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) -> + * transaction start -> i_data_sem (rw) + * + * writepages: + * transaction start -> page lock(s) -> i_data_sem (rw) + */ + #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) static struct file_system_type ext2_fs_type = { .owner = THIS_MODULE, @@ -958,6 +988,7 @@ static void init_once(void *foo) INIT_LIST_HEAD(&ei->i_orphan); init_rwsem(&ei->xattr_sem); init_rwsem(&ei->i_data_sem); + init_rwsem(&ei->i_mmap_sem); inode_init_once(&ei->vfs_inode); } @@ -1066,8 +1097,8 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page, } #ifdef CONFIG_QUOTA -#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group") -#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) +static char *quotatypes[] = INITQFNAMES; +#define QTYPE2NAME(t) (quotatypes[t]) static int ext4_write_dquot(struct dquot *dquot); static int ext4_acquire_dquot(struct dquot *dquot); @@ -1100,6 +1131,7 @@ static const struct dquot_operations ext4_quota_operations = { .write_info = ext4_write_info, .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, + .get_projid = ext4_get_projid, }; static const struct quotactl_ops ext4_qctl_operations = { @@ -2254,10 +2286,10 @@ static void ext4_orphan_cleanup(struct super_block *sb, __func__, inode->i_ino, inode->i_size); jbd_debug(2, "truncating inode %lu to %lld bytes\n", inode->i_ino, inode->i_size); - mutex_lock(&inode->i_mutex); + inode_lock(inode); truncate_inode_pages(inode->i_mapping, inode->i_size); ext4_truncate(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); nr_truncates++; } else { if (test_opt(sb, DEBUG)) @@ -2526,6 +2558,12 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) "without CONFIG_QUOTA"); return 0; } + if (ext4_has_feature_project(sb) && !readonly) { + ext4_msg(sb, KERN_ERR, + "Filesystem with project quota feature cannot be mounted RDWR " + "without CONFIG_QUOTA"); + return 0; + } #endif /* CONFIG_QUOTA */ return 1; } @@ -3654,7 +3692,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_qcop = &dquot_quotactl_sysfile_ops; else sb->s_qcop = &ext4_qctl_operations; - sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); @@ -4790,6 +4828,48 @@ restore_opts: return err; } +#ifdef CONFIG_QUOTA +static int ext4_statfs_project(struct super_block *sb, + kprojid_t projid, struct kstatfs *buf) +{ + struct kqid qid; + struct dquot *dquot; + u64 limit; + u64 curblock; + + qid = make_kqid_projid(projid); + dquot = dqget(sb, qid); + if (IS_ERR(dquot)) + return PTR_ERR(dquot); + spin_lock(&dq_data_lock); + + limit = (dquot->dq_dqb.dqb_bsoftlimit ? + dquot->dq_dqb.dqb_bsoftlimit : + dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits; + if (limit && buf->f_blocks > limit) { + curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits; + buf->f_blocks = limit; + buf->f_bfree = buf->f_bavail = + (buf->f_blocks > curblock) ? + (buf->f_blocks - curblock) : 0; + } + + limit = dquot->dq_dqb.dqb_isoftlimit ? + dquot->dq_dqb.dqb_isoftlimit : + dquot->dq_dqb.dqb_ihardlimit; + if (limit && buf->f_files > limit) { + buf->f_files = limit; + buf->f_ffree = + (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? + (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; + } + + spin_unlock(&dq_data_lock); + dqput(dquot); + return 0; +} +#endif + static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; @@ -4822,6 +4902,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; +#ifdef CONFIG_QUOTA + if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) && + sb_has_quota_limits_enabled(sb, PRJQUOTA)) + ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf); +#endif return 0; } @@ -4986,7 +5071,8 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, struct inode *qf_inode; unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), - le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) + le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), + le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) }; BUG_ON(!ext4_has_feature_quota(sb)); @@ -5014,7 +5100,8 @@ static int ext4_enable_quotas(struct super_block *sb) int type, err = 0; unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), - le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) + le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), + le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) }; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h index 011ba6670d99..c70d06a383e2 100644 --- a/fs/ext4/truncate.h +++ b/fs/ext4/truncate.h @@ -10,8 +10,10 @@ */ static inline void ext4_truncate_failed_write(struct inode *inode) { + down_write(&EXT4_I(inode)->i_mmap_sem); truncate_inode_pages(inode->i_mapping, inode->i_size); ext4_truncate(inode); + up_write(&EXT4_I(inode)->i_mmap_sem); } /* diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ac9e7c6aac74..5c06db17e41f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -794,7 +794,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); if (start >= isize) @@ -860,7 +860,7 @@ out: if (ret == 1) ret = 0; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 18ddb1e5182a..ea272be62677 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -333,7 +333,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) loff_t isize; int err = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); if (offset >= isize) @@ -388,10 +388,10 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) found: if (whence == SEEK_HOLE && data_ofs > isize) data_ofs = isize; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return vfs_setpos(file, data_ofs, maxbytes); fail: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } @@ -1219,7 +1219,7 @@ static long f2fs_fallocate(struct file *file, int mode, FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (mode & FALLOC_FL_PUNCH_HOLE) { if (offset >= inode->i_size) @@ -1243,7 +1243,7 @@ static long f2fs_fallocate(struct file *file, int mode, } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); trace_f2fs_fallocate(inode, mode, offset, len, ret); return ret; @@ -1307,13 +1307,13 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) flags = f2fs_mask_flags(inode->i_mode, flags); - mutex_lock(&inode->i_mutex); + inode_lock(inode); oldflags = fi->i_flags; if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ret = -EPERM; goto out; } @@ -1322,7 +1322,7 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) flags = flags & FS_FL_USER_MODIFIABLE; flags |= oldflags & ~FS_FL_USER_MODIFIABLE; fi->i_flags = flags; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); f2fs_set_inode_flags(inode); inode->i_ctime = CURRENT_TIME; @@ -1667,7 +1667,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, f2fs_balance_fs(sbi, true); - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* writeback all dirty pages in the range */ err = filemap_write_and_wait_range(inode->i_mapping, range->start, @@ -1778,7 +1778,7 @@ do_map: clear_out: clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!err) range->len = (u64)total << PAGE_CACHE_SHIFT; return err; diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 93fc62232ec2..5d384921524d 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -301,15 +301,59 @@ static int fat_bmap_cluster(struct inode *inode, int cluster) return dclus; } -int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, - unsigned long *mapped_blocks, int create) +int fat_get_mapped_cluster(struct inode *inode, sector_t sector, + sector_t last_block, + unsigned long *mapped_blocks, sector_t *bmap) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); + int cluster, offset; + + cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits); + offset = sector & (sbi->sec_per_clus - 1); + cluster = fat_bmap_cluster(inode, cluster); + if (cluster < 0) + return cluster; + else if (cluster) { + *bmap = fat_clus_to_blknr(sbi, cluster) + offset; + *mapped_blocks = sbi->sec_per_clus - offset; + if (*mapped_blocks > last_block - sector) + *mapped_blocks = last_block - sector; + } + + return 0; +} + +static int is_exceed_eof(struct inode *inode, sector_t sector, + sector_t *last_block, int create) +{ + struct super_block *sb = inode->i_sb; const unsigned long blocksize = sb->s_blocksize; const unsigned char blocksize_bits = sb->s_blocksize_bits; + + *last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits; + if (sector >= *last_block) { + if (!create) + return 1; + + /* + * ->mmu_private can access on only allocation path. + * (caller must hold ->i_mutex) + */ + *last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1)) + >> blocksize_bits; + if (sector >= *last_block) + return 1; + } + + return 0; +} + +int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, + unsigned long *mapped_blocks, int create, bool from_bmap) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); sector_t last_block; - int cluster, offset; *phys = 0; *mapped_blocks = 0; @@ -321,31 +365,16 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, return 0; } - last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits; - if (sector >= last_block) { - if (!create) + if (!from_bmap) { + if (is_exceed_eof(inode, sector, &last_block, create)) return 0; - - /* - * ->mmu_private can access on only allocation path. - * (caller must hold ->i_mutex) - */ - last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1)) - >> blocksize_bits; + } else { + last_block = inode->i_blocks >> + (inode->i_sb->s_blocksize_bits - 9); if (sector >= last_block) return 0; } - cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits); - offset = sector & (sbi->sec_per_clus - 1); - cluster = fat_bmap_cluster(inode, cluster); - if (cluster < 0) - return cluster; - else if (cluster) { - *phys = fat_clus_to_blknr(sbi, cluster) + offset; - *mapped_blocks = sbi->sec_per_clus - offset; - if (*mapped_blocks > last_block - sector) - *mapped_blocks = last_block - sector; - } - return 0; + return fat_get_mapped_cluster(inode, sector, last_block, mapped_blocks, + phys); } diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 8b2127ffb226..d0b95c95079b 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -91,7 +91,7 @@ next: *bh = NULL; iblock = *pos >> sb->s_blocksize_bits; - err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0); + err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0, false); if (err || !phys) return -1; /* beyond EOF or error */ @@ -769,7 +769,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *file, buf.dirent = dirent; buf.result = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); buf.ctx.pos = file->f_pos; ret = -ENOENT; if (!IS_DEADDIR(inode)) { @@ -777,7 +777,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *file, short_only, both ? &buf : NULL); file->f_pos = buf.ctx.pos; } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret >= 0) ret = buf.result; return ret; diff --git a/fs/fat/fat.h b/fs/fat/fat.h index be5e15323bab..e6b764a17a9c 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -87,7 +87,7 @@ struct msdos_sb_info { unsigned int vol_id; /*volume ID*/ int fatent_shift; - struct fatent_operations *fatent_ops; + const struct fatent_operations *fatent_ops; struct inode *fat_inode; struct inode *fsinfo_inode; @@ -285,8 +285,11 @@ static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len) extern void fat_cache_inval_inode(struct inode *inode); extern int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus); +extern int fat_get_mapped_cluster(struct inode *inode, sector_t sector, + sector_t last_block, + unsigned long *mapped_blocks, sector_t *bmap); extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, - unsigned long *mapped_blocks, int create); + unsigned long *mapped_blocks, int create, bool from_bmap); /* fat/dir.c */ extern const struct file_operations fat_dir_operations; @@ -384,6 +387,7 @@ static inline unsigned long fat_dir_hash(int logstart) { return hash_32(logstart, FAT_HASH_BITS); } +extern int fat_add_cluster(struct inode *inode); /* fat/misc.c */ extern __printf(3, 4) __cold diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 8226557130a2..1d9a8c4e9de0 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -99,7 +99,7 @@ err: static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent, int offset, sector_t blocknr) { - struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; + const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); fatent->fat_inode = MSDOS_SB(sb)->fat_inode; @@ -246,7 +246,7 @@ static int fat32_ent_next(struct fat_entry *fatent) return 0; } -static struct fatent_operations fat12_ops = { +static const struct fatent_operations fat12_ops = { .ent_blocknr = fat12_ent_blocknr, .ent_set_ptr = fat12_ent_set_ptr, .ent_bread = fat12_ent_bread, @@ -255,7 +255,7 @@ static struct fatent_operations fat12_ops = { .ent_next = fat12_ent_next, }; -static struct fatent_operations fat16_ops = { +static const struct fatent_operations fat16_ops = { .ent_blocknr = fat_ent_blocknr, .ent_set_ptr = fat16_ent_set_ptr, .ent_bread = fat_ent_bread, @@ -264,7 +264,7 @@ static struct fatent_operations fat16_ops = { .ent_next = fat16_ent_next, }; -static struct fatent_operations fat32_ops = { +static const struct fatent_operations fat32_ops = { .ent_blocknr = fat_ent_blocknr, .ent_set_ptr = fat32_ent_set_ptr, .ent_bread = fat_ent_bread, @@ -320,7 +320,7 @@ static inline int fat_ent_update_ptr(struct super_block *sb, int offset, sector_t blocknr) { struct msdos_sb_info *sbi = MSDOS_SB(sb); - struct fatent_operations *ops = sbi->fatent_ops; + const struct fatent_operations *ops = sbi->fatent_ops; struct buffer_head **bhs = fatent->bhs; /* Is this fatent's blocks including this entry? */ @@ -349,7 +349,7 @@ int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); - struct fatent_operations *ops = sbi->fatent_ops; + const struct fatent_operations *ops = sbi->fatent_ops; int err, offset; sector_t blocknr; @@ -407,7 +407,7 @@ int fat_ent_write(struct inode *inode, struct fat_entry *fatent, int new, int wait) { struct super_block *sb = inode->i_sb; - struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; + const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; int err; ops->ent_put(fatent, new); @@ -432,7 +432,7 @@ static inline int fat_ent_next(struct msdos_sb_info *sbi, static inline int fat_ent_read_block(struct super_block *sb, struct fat_entry *fatent) { - struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; + const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; sector_t blocknr; int offset; @@ -463,7 +463,7 @@ int fat_alloc_clusters(struct inode *inode, int *cluster, int nr_cluster) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); - struct fatent_operations *ops = sbi->fatent_ops; + const struct fatent_operations *ops = sbi->fatent_ops; struct fat_entry fatent, prev_ent; struct buffer_head *bhs[MAX_BUF_PER_PAGE]; int i, count, err, nr_bhs, idx_clus; @@ -551,7 +551,7 @@ int fat_free_clusters(struct inode *inode, int cluster) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); - struct fatent_operations *ops = sbi->fatent_ops; + const struct fatent_operations *ops = sbi->fatent_ops; struct fat_entry fatent; struct buffer_head *bhs[MAX_BUF_PER_PAGE]; int i, err, nr_bhs; @@ -636,7 +636,7 @@ EXPORT_SYMBOL_GPL(fat_free_clusters); static void fat_ent_reada(struct super_block *sb, struct fat_entry *fatent, unsigned long reada_blocks) { - struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; + const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; sector_t blocknr; int i, offset; @@ -649,7 +649,7 @@ static void fat_ent_reada(struct super_block *sb, struct fat_entry *fatent, int fat_count_free_clusters(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); - struct fatent_operations *ops = sbi->fatent_ops; + const struct fatent_operations *ops = sbi->fatent_ops; struct fat_entry fatent; unsigned long reada_blocks, reada_mask, cur_block; int err = 0, free; diff --git a/fs/fat/file.c b/fs/fat/file.c index a08f1039909a..f70185668832 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -14,15 +14,19 @@ #include <linux/backing-dev.h> #include <linux/fsnotify.h> #include <linux/security.h> +#include <linux/falloc.h> #include "fat.h" +static long fat_fallocate(struct file *file, int mode, + loff_t offset, loff_t len); + static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr) { u32 attr; - mutex_lock(&inode->i_mutex); + inode_lock(inode); attr = fat_make_attrs(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return put_user(attr, user_attr); } @@ -43,7 +47,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) err = mnt_want_write_file(file); if (err) goto out; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * ATTR_VOLUME and ATTR_DIR cannot be changed; this also @@ -105,7 +109,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) fat_save_attrs(inode, attr); mark_inode_dirty(inode); out_unlock_inode: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mnt_drop_write_file(file); out: return err; @@ -177,6 +181,7 @@ const struct file_operations fat_file_operations = { #endif .fsync = fat_file_fsync, .splice_read = generic_file_splice_read, + .fallocate = fat_fallocate, }; static int fat_cont_expand(struct inode *inode, loff_t size) @@ -215,6 +220,62 @@ out: return err; } +/* + * Preallocate space for a file. This implements fat's fallocate file + * operation, which gets called from sys_fallocate system call. User + * space requests len bytes at offset. If FALLOC_FL_KEEP_SIZE is set + * we just allocate clusters without zeroing them out. Otherwise we + * allocate and zero out clusters via an expanding truncate. + */ +static long fat_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + int nr_cluster; /* Number of clusters to be allocated */ + loff_t mm_bytes; /* Number of bytes to be allocated for file */ + loff_t ondisksize; /* block aligned on-disk size in bytes*/ + struct inode *inode = file->f_mapping->host; + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int err = 0; + + /* No support for hole punch or other fallocate flags. */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + /* No support for dir */ + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + inode_lock(inode); + if (mode & FALLOC_FL_KEEP_SIZE) { + ondisksize = inode->i_blocks << 9; + if ((offset + len) <= ondisksize) + goto error; + + /* First compute the number of clusters to be allocated */ + mm_bytes = offset + len - ondisksize; + nr_cluster = (mm_bytes + (sbi->cluster_size - 1)) >> + sbi->cluster_bits; + + /* Start the allocation.We are not zeroing out the clusters */ + while (nr_cluster-- > 0) { + err = fat_add_cluster(inode); + if (err) + goto error; + } + } else { + if ((offset + len) <= i_size_read(inode)) + goto error; + + /* This is just an expanding truncate */ + err = fat_cont_expand(inode, (offset + len)); + } + +error: + inode_unlock(inode); + return err; +} + /* Free all clusters after the skip'th cluster. */ static int fat_free(struct inode *inode, int skip) { diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 6aece96df19f..a5599052116c 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -93,7 +93,7 @@ static struct fat_floppy_defaults { }, }; -static int fat_add_cluster(struct inode *inode) +int fat_add_cluster(struct inode *inode) { int err, cluster; @@ -115,10 +115,10 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock, struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); unsigned long mapped_blocks; - sector_t phys; + sector_t phys, last_block; int err, offset; - err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create); + err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false); if (err) return err; if (phys) { @@ -135,8 +135,14 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock, return -EIO; } + last_block = inode->i_blocks >> (sb->s_blocksize_bits - 9); offset = (unsigned long)iblock & (sbi->sec_per_clus - 1); - if (!offset) { + /* + * allocate a cluster according to the following. + * 1) no more available blocks + * 2) not part of fallocate region + */ + if (!offset && !(iblock < last_block)) { /* TODO: multiple cluster allocation would be desirable. */ err = fat_add_cluster(inode); if (err) @@ -148,7 +154,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock, *max_blocks = min(mapped_blocks, *max_blocks); MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits; - err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create); + err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false); if (err) return err; @@ -273,13 +279,38 @@ static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter, return ret; } +static int fat_get_block_bmap(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + struct super_block *sb = inode->i_sb; + unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; + int err; + sector_t bmap; + unsigned long mapped_blocks; + + BUG_ON(create != 0); + + err = fat_bmap(inode, iblock, &bmap, &mapped_blocks, create, true); + if (err) + return err; + + if (bmap) { + map_bh(bh_result, sb, bmap); + max_blocks = min(mapped_blocks, max_blocks); + } + + bh_result->b_size = max_blocks << sb->s_blocksize_bits; + + return 0; +} + static sector_t _fat_bmap(struct address_space *mapping, sector_t block) { sector_t blocknr; /* fat_get_cluster() assumes the requested blocknr isn't truncated. */ down_read(&MSDOS_I(mapping->host)->truncate_lock); - blocknr = generic_block_bmap(mapping, block, fat_get_block); + blocknr = generic_block_bmap(mapping, block, fat_get_block_bmap); up_read(&MSDOS_I(mapping->host)->truncate_lock); return blocknr; @@ -449,6 +480,24 @@ static int fat_calc_dir_size(struct inode *inode) return 0; } +static int fat_validate_dir(struct inode *dir) +{ + struct super_block *sb = dir->i_sb; + + if (dir->i_nlink < 2) { + /* Directory should have "."/".." entries at least. */ + fat_fs_error(sb, "corrupted directory (invalid entries)"); + return -EIO; + } + if (MSDOS_I(dir)->i_start == 0 || + MSDOS_I(dir)->i_start == MSDOS_SB(sb)->root_cluster) { + /* Directory should point valid cluster. */ + fat_fs_error(sb, "corrupted directory (invalid i_start)"); + return -EIO; + } + return 0; +} + /* doesn't deal with root inode */ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) { @@ -475,6 +524,10 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) MSDOS_I(inode)->mmu_private = inode->i_size; set_nlink(inode, fat_subdirs(inode)); + + error = fat_validate_dir(inode); + if (error < 0) + return error; } else { /* not a directory */ inode->i_generation |= 1; inode->i_mode = fat_make_mode(sbi, de->attr, @@ -553,13 +606,43 @@ out: EXPORT_SYMBOL_GPL(fat_build_inode); +static int __fat_write_inode(struct inode *inode, int wait); + +static void fat_free_eofblocks(struct inode *inode) +{ + /* Release unwritten fallocated blocks on inode eviction. */ + if ((inode->i_blocks << 9) > + round_up(MSDOS_I(inode)->mmu_private, + MSDOS_SB(inode->i_sb)->cluster_size)) { + int err; + + fat_truncate_blocks(inode, MSDOS_I(inode)->mmu_private); + /* Fallocate results in updating the i_start/iogstart + * for the zero byte file. So, make it return to + * original state during evict and commit it to avoid + * any corruption on the next access to the cluster + * chain for the file. + */ + err = __fat_write_inode(inode, inode_needs_sync(inode)); + if (err) { + fat_msg(inode->i_sb, KERN_WARNING, "Failed to " + "update on disk inode for unused " + "fallocated blocks, inode could be " + "corrupted. Please run fsck"); + } + + } +} + static void fat_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); if (!inode->i_nlink) { inode->i_size = 0; fat_truncate_blocks(inode, 0); - } + } else + fat_free_eofblocks(inode); + invalidate_inode_buffers(inode); clear_inode(inode); fat_cache_inval_inode(inode); @@ -1146,7 +1229,12 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, case Opt_time_offset: if (match_int(&args[0], &option)) return -EINVAL; - if (option < -12 * 60 || option > 12 * 60) + /* + * GMT+-12 zones may have DST corrections so at least + * 13 hours difference is needed. Make the limit 24 + * just in case someone invents something unusual. + */ + if (option < -24 * 60 || option > 24 * 60) return -EINVAL; opts->tz_set = 1; opts->time_offset = option; diff --git a/fs/filesystems.c b/fs/filesystems.c index 5797d45a78cb..c5618db110be 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -46,9 +46,9 @@ void put_filesystem(struct file_system_type *fs) static struct file_system_type **find_filesystem(const char *name, unsigned len) { struct file_system_type **p; - for (p=&file_systems; *p; p=&(*p)->next) - if (strlen((*p)->name) == len && - strncmp((*p)->name, name, len) == 0) + for (p = &file_systems; *p; p = &(*p)->next) + if (strncmp((*p)->name, name, len) == 0 && + !(*p)->name[len]) break; return p; } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 712601f299b8..4b855b65d457 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -944,7 +944,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, if (!parent) return -ENOENT; - mutex_lock(&parent->i_mutex); + inode_lock(parent); if (!S_ISDIR(parent->i_mode)) goto unlock; @@ -962,7 +962,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, fuse_invalidate_entry(entry); if (child_nodeid != 0 && d_really_is_positive(entry)) { - mutex_lock(&d_inode(entry)->i_mutex); + inode_lock(d_inode(entry)); if (get_node_id(d_inode(entry)) != child_nodeid) { err = -ENOENT; goto badentry; @@ -983,7 +983,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, clear_nlink(d_inode(entry)); err = 0; badentry: - mutex_unlock(&d_inode(entry)->i_mutex); + inode_unlock(d_inode(entry)); if (!err) d_delete(entry); } else { @@ -992,7 +992,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, dput(entry); unlock: - mutex_unlock(&parent->i_mutex); + inode_unlock(parent); iput(parent); return err; } @@ -1504,7 +1504,7 @@ void fuse_set_nowrite(struct inode *inode) struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - BUG_ON(!mutex_is_locked(&inode->i_mutex)); + BUG_ON(!inode_is_locked(inode)); spin_lock(&fc->lock); BUG_ON(fi->writectr < 0); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 570ca4053c80..b03d253ece15 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -207,7 +207,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) return err; if (lock_inode) - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = fuse_do_open(fc, get_node_id(inode), file, isdir); @@ -215,7 +215,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) fuse_finish_open(inode, file); if (lock_inode) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } @@ -413,9 +413,9 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); fuse_sync_writes(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); req = fuse_get_req_nofail_nopages(fc, file); memset(&inarg, 0, sizeof(inarg)); @@ -450,7 +450,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, if (is_bad_inode(inode)) return -EIO; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * Start writeback against all dirty pages of the inode, then @@ -486,7 +486,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, err = 0; } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } @@ -1160,7 +1160,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return generic_file_write_iter(iocb, from); } - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); @@ -1210,7 +1210,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } out: current->backing_dev_info = NULL; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return written ? written : err; } @@ -1322,10 +1322,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { if (!write) - mutex_lock(&inode->i_mutex); + inode_lock(inode); fuse_sync_writes(inode); if (!write) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } while (count) { @@ -1413,14 +1413,14 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) return -EIO; /* Don't allow parallel writes to the same file */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); res = generic_write_checks(iocb, from); if (res > 0) res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); fuse_invalidate_attr(inode); if (res > 0) fuse_write_update_size(inode, iocb->ki_pos); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return res; } @@ -2231,20 +2231,77 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) return err ? 0 : outarg.block; } +static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_file *ff = file->private_data; + FUSE_ARGS(args); + struct fuse_lseek_in inarg = { + .fh = ff->fh, + .offset = offset, + .whence = whence + }; + struct fuse_lseek_out outarg; + int err; + + if (fc->no_lseek) + goto fallback; + + args.in.h.opcode = FUSE_LSEEK; + args.in.h.nodeid = ff->nodeid; + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.out.numargs = 1; + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; + err = fuse_simple_request(fc, &args); + if (err) { + if (err == -ENOSYS) { + fc->no_lseek = 1; + goto fallback; + } + return err; + } + + return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes); + +fallback: + err = fuse_update_attributes(inode, NULL, file, NULL); + if (!err) + return generic_file_llseek(file, offset, whence); + else + return err; +} + static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) { loff_t retval; struct inode *inode = file_inode(file); - /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */ - if (whence == SEEK_CUR || whence == SEEK_SET) - return generic_file_llseek(file, offset, whence); - - mutex_lock(&inode->i_mutex); - retval = fuse_update_attributes(inode, NULL, file, NULL); - if (!retval) + switch (whence) { + case SEEK_SET: + case SEEK_CUR: + /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */ retval = generic_file_llseek(file, offset, whence); - mutex_unlock(&inode->i_mutex); + break; + case SEEK_END: + inode_lock(inode); + retval = fuse_update_attributes(inode, NULL, file, NULL); + if (!retval) + retval = generic_file_llseek(file, offset, whence); + inode_unlock(inode); + break; + case SEEK_HOLE: + case SEEK_DATA: + inode_lock(inode); + retval = fuse_lseek(file, offset, whence); + inode_unlock(inode); + break; + default: + retval = -EINVAL; + } return retval; } @@ -2887,7 +2944,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, return -EOPNOTSUPP; if (lock_inode) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (mode & FALLOC_FL_PUNCH_HOLE) { loff_t endbyte = offset + length - 1; err = filemap_write_and_wait_range(inode->i_mapping, @@ -2933,7 +2990,7 @@ out: clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); if (lock_inode) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 405113101db8..ce394b5fe6b4 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -605,6 +605,9 @@ struct fuse_conn { /** Does the filesystem support asynchronous direct-IO submission? */ unsigned async_dio:1; + /** Is lseek not implemented by fs? */ + unsigned no_lseek:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 7412863cda1e..c9384f932975 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -914,7 +914,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t le if ((mode & ~FALLOC_FL_KEEP_SIZE) || gfs2_is_jdata(ip)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); @@ -946,7 +946,7 @@ out_unlock: gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 3e94400d587c..352f958769e1 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -2067,7 +2067,7 @@ static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); if (ret) @@ -2094,7 +2094,7 @@ static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, gfs2_glock_dq_uninit(&gh); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index be6d9c450b22..a39891344259 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -888,7 +888,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) return -ENOMEM; sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL); - mutex_lock(&ip->i_inode.i_mutex); + inode_lock(&ip->i_inode); for (qx = 0; qx < num_qd; qx++) { error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, &ghs[qx]); @@ -953,7 +953,7 @@ out_alloc: out: while (qx--) gfs2_glock_dq_uninit(&ghs[qx]); - mutex_unlock(&ip->i_inode.i_mutex); + inode_unlock(&ip->i_inode); kfree(ghs); gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl, NORMAL_FLUSH); return error; @@ -1674,7 +1674,7 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, if (error) goto out_put; - mutex_lock(&ip->i_inode.i_mutex); + inode_lock(&ip->i_inode); error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, 0, &q_gh); if (error) goto out_unlockput; @@ -1739,7 +1739,7 @@ out_i: out_q: gfs2_glock_dq_uninit(&q_gh); out_unlockput: - mutex_unlock(&ip->i_inode.i_mutex); + inode_unlock(&ip->i_inode); out_put: qd_put(qd); return error; diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index db458ee3a546..1eb5d415d434 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -214,7 +214,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, struct qstr *str) { struct super_block *sb; struct hfs_find_data fd; - struct list_head *pos; + struct hfs_readdir_data *rd; int res, type; hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); @@ -240,9 +240,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, struct qstr *str) } } - list_for_each(pos, &HFS_I(dir)->open_dir_list) { - struct hfs_readdir_data *rd = - list_entry(pos, struct hfs_readdir_data, list); + list_for_each_entry(rd, &HFS_I(dir)->open_dir_list, list) { if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0) rd->file->f_pos--; } diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 70788e03820a..e9f2b855f831 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -173,9 +173,9 @@ static int hfs_dir_release(struct inode *inode, struct file *file) { struct hfs_readdir_data *rd = file->private_data; if (rd) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); list_del(&rd->list); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); kfree(rd); } return 0; diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index b99ebddb10cb..6686bf39a5b5 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -570,13 +570,13 @@ static int hfs_file_release(struct inode *inode, struct file *file) if (HFS_IS_RSRC(inode)) inode = HFS_I(inode)->rsrc_inode; if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); hfs_file_truncate(inode); //if (inode->i_flags & S_DEAD) { // hfs_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL); // hfs_delete_inode(inode); //} - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; } @@ -656,7 +656,7 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end, ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* sync the inode to buffers */ ret = write_inode_now(inode, 0); @@ -668,7 +668,7 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end, err = sync_blockdev(sb->s_bdev); if (!ret) ret = err; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index d0f39dcbb58e..a4e867e08947 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -284,9 +284,9 @@ static int hfsplus_dir_release(struct inode *inode, struct file *file) { struct hfsplus_readdir_data *rd = file->private_data; if (rd) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); list_del(&rd->list); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); kfree(rd); } return 0; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 19b33f8151f1..1a6394cdb54e 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -229,14 +229,14 @@ static int hfsplus_file_release(struct inode *inode, struct file *file) if (HFSPLUS_IS_RSRC(inode)) inode = HFSPLUS_I(inode)->rsrc_inode; if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); hfsplus_file_truncate(inode); if (inode->i_flags & S_DEAD) { hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb)->hidden_dir, NULL); hfsplus_delete_inode(inode); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; } @@ -286,7 +286,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, error = filemap_write_and_wait_range(inode->i_mapping, start, end); if (error) return error; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * Sync inode metadata into the catalog and extent trees. @@ -327,7 +327,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index 0624ce4e0702..32a49e292b6a 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -93,7 +93,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) goto out_drop_write; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) || inode->i_flags & (S_IMMUTABLE|S_APPEND)) { @@ -126,7 +126,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) mark_inode_dirty(inode); out_unlock_inode: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out_drop_write: mnt_drop_write_file(file); out: diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index cfaa18c7a337..d1abbee281d1 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -378,9 +378,9 @@ static int hostfs_fsync(struct file *file, loff_t start, loff_t end, if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = fsync_file(HOSTFS_I(inode)->fd, datasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index dc540bfcee1d..e57a53c13d86 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -33,7 +33,7 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence) if (whence == SEEK_DATA || whence == SEEK_HOLE) return -EINVAL; - mutex_lock(&i->i_mutex); + inode_lock(i); hpfs_lock(s); /*pr_info("dir lseek\n");*/ @@ -48,12 +48,12 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence) ok: filp->f_pos = new_off; hpfs_unlock(s); - mutex_unlock(&i->i_mutex); + inode_unlock(i); return new_off; fail: /*pr_warn("illegal lseek: %016llx\n", new_off);*/ hpfs_unlock(s); - mutex_unlock(&i->i_mutex); + inode_unlock(i); return -ESPIPE; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8bbf7f3e2a27..e1f465a389d5 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -141,7 +141,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vma_len = (loff_t)(vma->vm_end - vma->vm_start); - mutex_lock(&inode->i_mutex); + inode_lock(inode); file_accessed(file); ret = -ENOMEM; @@ -157,7 +157,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_flags & VM_WRITE && inode->i_size < len) inode->i_size = len; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -530,7 +530,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (hole_end > hole_start) { struct address_space *mapping = inode->i_mapping; - mutex_lock(&inode->i_mutex); + inode_lock(inode); i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap)) hugetlb_vmdelete_list(&mapping->i_mmap, @@ -538,7 +538,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) hole_end >> PAGE_SHIFT); i_mmap_unlock_write(mapping); remove_inode_hugepages(inode, hole_start, hole_end); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; @@ -572,7 +572,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, start = offset >> hpage_shift; end = (offset + len + hpage_size - 1) >> hpage_shift; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ error = inode_newsize_ok(inode, offset + len); @@ -659,7 +659,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, i_size_write(inode, offset + len); inode->i_ctime = CURRENT_TIME; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } diff --git a/fs/inode.c b/fs/inode.c index e491e54d2430..9f62db3bcc3e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -495,7 +495,7 @@ void clear_inode(struct inode *inode) */ spin_lock_irq(&inode->i_data.tree_lock); BUG_ON(inode->i_data.nrpages); - BUG_ON(inode->i_data.nrshadows); + BUG_ON(inode->i_data.nrexceptional); spin_unlock_irq(&inode->i_data.tree_lock); BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); @@ -966,9 +966,9 @@ void lock_two_nondirectories(struct inode *inode1, struct inode *inode2) swap(inode1, inode2); if (inode1 && !S_ISDIR(inode1->i_mode)) - mutex_lock(&inode1->i_mutex); + inode_lock(inode1); if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_NONDIR2); + inode_lock_nested(inode2, I_MUTEX_NONDIR2); } EXPORT_SYMBOL(lock_two_nondirectories); @@ -980,9 +980,9 @@ EXPORT_SYMBOL(lock_two_nondirectories); void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2) { if (inode1 && !S_ISDIR(inode1->i_mode)) - mutex_unlock(&inode1->i_mutex); + inode_unlock(inode1); if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) - mutex_unlock(&inode2->i_mutex); + inode_unlock(inode2); } EXPORT_SYMBOL(unlock_two_nondirectories); diff --git a/fs/ioctl.c b/fs/ioctl.c index 29466c380958..116a333e9c77 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -434,9 +434,9 @@ int generic_block_fiemap(struct inode *inode, u64 len, get_block_t *get_block) { int ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } EXPORT_SYMBOL(generic_block_fiemap); diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c index a3750f902adc..0ae91ad6df2d 100644 --- a/fs/jffs2/build.c +++ b/fs/jffs2/build.c @@ -17,6 +17,7 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mtd/mtd.h> +#include <linux/mm.h> /* kvfree() */ #include "nodelist.h" static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *, @@ -383,12 +384,7 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c) return 0; out_free: -#ifndef __ECOS - if (jffs2_blocks_use_vmalloc(c)) - vfree(c->blocks); - else -#endif - kfree(c->blocks); + kvfree(c->blocks); return ret; } diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index f509f62e12f6..c5ac5944bc1b 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -39,10 +39,10 @@ int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Trigger GC to flush any pending writes for this inode */ jffs2_flush_wbuf_gc(c, inode->i_ino); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 2caf1682036d..bead25ae8fe4 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -596,10 +596,7 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) out_root: jffs2_free_ino_caches(c); jffs2_free_raw_node_refs(c); - if (jffs2_blocks_use_vmalloc(c)) - vfree(c->blocks); - else - kfree(c->blocks); + kvfree(c->blocks); out_inohash: jffs2_clear_xattr_subsystem(c); kfree(c->inocache_list); diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index bb080c272149..0a9a114bb9d1 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -331,10 +331,7 @@ static void jffs2_put_super (struct super_block *sb) jffs2_free_ino_caches(c); jffs2_free_raw_node_refs(c); - if (jffs2_blocks_use_vmalloc(c)) - vfree(c->blocks); - else - kfree(c->blocks); + kvfree(c->blocks); jffs2_flash_cleanup(c); kfree(c->inocache_list); jffs2_clear_xattr_subsystem(c); diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 0e026a7bdcd4..4ce7735dd042 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -38,17 +38,17 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (rc) return rc; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (!(inode->i_state & I_DIRTY_ALL) || (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) { /* Make sure committed changes hit the disk */ jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return rc; } rc |= jfs_commit_inode(inode, 1); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return rc ? -EIO : 0; } diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index 8db8b7d61e40..8653cac7e12e 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -96,7 +96,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } /* Lock against other parallel changes of flags */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); jfs_get_inode_flags(jfs_inode); oldflags = jfs_inode->mode2; @@ -109,7 +109,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) ((flags ^ oldflags) & (JFS_APPEND_FL | JFS_IMMUTABLE_FL))) { if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); err = -EPERM; goto setflags_out; } @@ -120,7 +120,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) jfs_inode->mode2 = flags; jfs_set_inode_flags(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); setflags_out: diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 900925b5eb8c..4f5d85ba8e23 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -792,7 +792,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type, struct buffer_head tmp_bh; struct buffer_head *bh; - mutex_lock(&inode->i_mutex); + inode_lock(inode); while (towrite > 0) { tocopy = sb->s_blocksize - offset < towrite ? sb->s_blocksize - offset : towrite; @@ -824,7 +824,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type, } out: if (len == towrite) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } if (inode->i_size < off+len-towrite) @@ -832,7 +832,7 @@ out: inode->i_version++; inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return len - towrite; } diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 821973853340..996b7742c90b 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -1511,9 +1511,9 @@ static loff_t kernfs_dir_fop_llseek(struct file *file, loff_t offset, struct inode *inode = file_inode(file); loff_t ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = generic_file_llseek(file, offset, whence); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/libfs.c b/fs/libfs.c index 01491299f348..0ca80b2af420 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -89,7 +89,7 @@ EXPORT_SYMBOL(dcache_dir_close); loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry *dentry = file->f_path.dentry; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); switch (whence) { case 1: offset += file->f_pos; @@ -97,7 +97,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) if (offset >= 0) break; default: - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return -EINVAL; } if (offset != file->f_pos) { @@ -124,7 +124,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) spin_unlock(&dentry->d_lock); } } - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return offset; } EXPORT_SYMBOL(dcache_dir_lseek); @@ -941,7 +941,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = sync_mapping_buffers(inode->i_mapping); if (!(inode->i_state & I_DIRTY_ALL)) goto out; @@ -953,7 +953,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, ret = err; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } EXPORT_SYMBOL(__generic_file_fsync); diff --git a/fs/locks.c b/fs/locks.c index af1ed74a657f..7c5f91be9b65 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1650,12 +1650,12 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr * bother, maybe that's a sign this just isn't a good file to * hand out a delegation on. */ - if (is_deleg && !mutex_trylock(&inode->i_mutex)) + if (is_deleg && !inode_trylock(inode)) return -EAGAIN; if (is_deleg && arg == F_WRLCK) { /* Write delegations are not currently supported: */ - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); WARN_ON_ONCE(1); return -EINVAL; } @@ -1732,7 +1732,7 @@ out: spin_unlock(&ctx->flc_lock); locks_dispose_list(&dispose); if (is_deleg) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!error && !my_fl) *flp = NULL; return error; diff --git a/fs/logfs/file.c b/fs/logfs/file.c index 1a6f0167b16a..61eaeb1b6cac 100644 --- a/fs/logfs/file.c +++ b/fs/logfs/file.c @@ -204,12 +204,12 @@ long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); oldflags = li->li_flags; flags &= LOGFS_FL_USER_MODIFIABLE; flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE; li->li_flags = flags; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); inode->i_ctime = CURRENT_TIME; mark_inode_dirty_sync(inode); @@ -230,11 +230,11 @@ int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); logfs_get_wblocks(sb, NULL, WF_LOCK); logfs_write_anchor(sb); logfs_put_wblocks(sb, NULL, WF_LOCK); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index 39d91f86cd35..27d040e35faa 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h @@ -485,7 +485,7 @@ static inline int logfs_get_sb_bdev(struct logfs_super *s, #endif /* dev_mtd.c */ -#ifdef CONFIG_MTD +#if IS_ENABLED(CONFIG_MTD) int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr); #else static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) diff --git a/fs/namei.c b/fs/namei.c index bceefd5588a2..f624d132e01e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1629,9 +1629,9 @@ static int lookup_slow(struct nameidata *nd, struct path *path) parent = nd->path.dentry; BUG_ON(nd->inode != parent->d_inode); - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); dentry = __lookup_hash(&nd->last, parent, nd->flags); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); if (IS_ERR(dentry)) return PTR_ERR(dentry); path->mnt = nd->path.mnt; @@ -2229,10 +2229,10 @@ struct dentry *kern_path_locked(const char *name, struct path *path) putname(filename); return ERR_PTR(-EINVAL); } - mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); d = __lookup_hash(&last, path->dentry, 0); if (IS_ERR(d)) { - mutex_unlock(&path->dentry->d_inode->i_mutex); + inode_unlock(path->dentry->d_inode); path_put(path); } putname(filename); @@ -2282,7 +2282,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) unsigned int c; int err; - WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(base->d_inode)); this.name = name; this.len = len; @@ -2380,9 +2380,9 @@ struct dentry *lookup_one_len_unlocked(const char *name, if (ret) return ret; - mutex_lock(&base->d_inode->i_mutex); + inode_lock(base->d_inode); ret = __lookup_hash(&this, base, 0); - mutex_unlock(&base->d_inode->i_mutex); + inode_unlock(base->d_inode); return ret; } EXPORT_SYMBOL(lookup_one_len_unlocked); @@ -2463,7 +2463,7 @@ mountpoint_last(struct nameidata *nd, struct path *path) goto done; } - mutex_lock(&dir->d_inode->i_mutex); + inode_lock(dir->d_inode); dentry = d_lookup(dir, &nd->last); if (!dentry) { /* @@ -2473,16 +2473,16 @@ mountpoint_last(struct nameidata *nd, struct path *path) */ dentry = d_alloc(dir, &nd->last); if (!dentry) { - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); return -ENOMEM; } dentry = lookup_real(dir->d_inode, dentry, nd->flags); if (IS_ERR(dentry)) { - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); return PTR_ERR(dentry); } } - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); done: if (d_is_negative(dentry)) { @@ -2672,7 +2672,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) struct dentry *p; if (p1 == p2) { - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); return NULL; } @@ -2680,29 +2680,29 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) p = d_ancestor(p2, p1); if (p) { - mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); + inode_lock_nested(p1->d_inode, I_MUTEX_CHILD); return p; } p = d_ancestor(p1, p2); if (p) { - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); + inode_lock_nested(p2->d_inode, I_MUTEX_CHILD); return p; } - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT2); + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); + inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); return NULL; } EXPORT_SYMBOL(lock_rename); void unlock_rename(struct dentry *p1, struct dentry *p2) { - mutex_unlock(&p1->d_inode->i_mutex); + inode_unlock(p1->d_inode); if (p1 != p2) { - mutex_unlock(&p2->d_inode->i_mutex); + inode_unlock(p2->d_inode); mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex); } } @@ -3141,9 +3141,9 @@ retry_lookup: * dropping this one anyway. */ } - mutex_lock(&dir->d_inode->i_mutex); + inode_lock(dir->d_inode); error = lookup_open(nd, &path, file, op, got_write, opened); - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); if (error <= 0) { if (error) @@ -3489,7 +3489,7 @@ static struct dentry *filename_create(int dfd, struct filename *name, * Do the final lookup. */ lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL; - mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); dentry = __lookup_hash(&last, path->dentry, lookup_flags); if (IS_ERR(dentry)) goto unlock; @@ -3518,7 +3518,7 @@ fail: dput(dentry); dentry = ERR_PTR(error); unlock: - mutex_unlock(&path->dentry->d_inode->i_mutex); + inode_unlock(path->dentry->d_inode); if (!err2) mnt_drop_write(path->mnt); out: @@ -3538,7 +3538,7 @@ EXPORT_SYMBOL(kern_path_create); void done_path_create(struct path *path, struct dentry *dentry) { dput(dentry); - mutex_unlock(&path->dentry->d_inode->i_mutex); + inode_unlock(path->dentry->d_inode); mnt_drop_write(path->mnt); path_put(path); } @@ -3735,7 +3735,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) return -EPERM; dget(dentry); - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock(dentry->d_inode); error = -EBUSY; if (is_local_mountpoint(dentry)) @@ -3755,7 +3755,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) detach_mounts(dentry); out: - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); dput(dentry); if (!error) d_delete(dentry); @@ -3794,7 +3794,7 @@ retry: if (error) goto exit1; - mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); dentry = __lookup_hash(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) @@ -3810,7 +3810,7 @@ retry: exit3: dput(dentry); exit2: - mutex_unlock(&path.dentry->d_inode->i_mutex); + inode_unlock(path.dentry->d_inode); mnt_drop_write(path.mnt); exit1: path_put(&path); @@ -3856,7 +3856,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate if (!dir->i_op->unlink) return -EPERM; - mutex_lock(&target->i_mutex); + inode_lock(target); if (is_local_mountpoint(dentry)) error = -EBUSY; else { @@ -3873,7 +3873,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate } } out: - mutex_unlock(&target->i_mutex); + inode_unlock(target); /* We don't d_delete() NFS sillyrenamed files--they still exist. */ if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) { @@ -3916,7 +3916,7 @@ retry: if (error) goto exit1; retry_deleg: - mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); dentry = __lookup_hash(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { @@ -3934,7 +3934,7 @@ retry_deleg: exit2: dput(dentry); } - mutex_unlock(&path.dentry->d_inode->i_mutex); + inode_unlock(path.dentry->d_inode); if (inode) iput(inode); /* truncate the inode here */ inode = NULL; @@ -4086,7 +4086,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de if (error) return error; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Make sure we don't allow creating hardlink to an unlinked file */ if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) error = -ENOENT; @@ -4103,7 +4103,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de inode->i_state &= ~I_LINKABLE; spin_unlock(&inode->i_lock); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!error) fsnotify_link(dir, inode, new_dentry); return error; @@ -4303,7 +4303,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!is_dir || (flags & RENAME_EXCHANGE)) lock_two_nondirectories(source, target); else if (target) - mutex_lock(&target->i_mutex); + inode_lock(target); error = -EBUSY; if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry)) @@ -4356,7 +4356,7 @@ out: if (!is_dir || (flags & RENAME_EXCHANGE)) unlock_two_nondirectories(source, target); else if (target) - mutex_unlock(&target->i_mutex); + inode_unlock(target); dput(new_dentry); if (!error) { fsnotify_move(old_dir, new_dir, old_name, is_dir, diff --git a/fs/namespace.c b/fs/namespace.c index a830e1463704..4fb1691b4355 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1961,9 +1961,9 @@ static struct mountpoint *lock_mount(struct path *path) struct vfsmount *mnt; struct dentry *dentry = path->dentry; retry: - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock(dentry->d_inode); if (unlikely(cant_mount(dentry))) { - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); return ERR_PTR(-ENOENT); } namespace_lock(); @@ -1974,13 +1974,13 @@ retry: mp = new_mountpoint(dentry); if (IS_ERR(mp)) { namespace_unlock(); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); return mp; } return mp; } namespace_unlock(); - mutex_unlock(&path->dentry->d_inode->i_mutex); + inode_unlock(path->dentry->d_inode); path_put(path); path->mnt = mnt; dentry = path->dentry = dget(mnt->mnt_root); @@ -1992,7 +1992,7 @@ static void unlock_mount(struct mountpoint *where) struct dentry *dentry = where->m_dentry; put_mountpoint(where); namespace_unlock(); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); } static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index f0e3e9e747dd..26c2de2de13f 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -369,7 +369,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags) if (!res) { struct inode *inode = d_inode(dentry); - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (finfo.i.dirEntNum == NCP_FINFO(inode)->dirEntNum) { ncp_new_dentry(dentry); val=1; @@ -377,7 +377,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags) ncp_dbg(2, "found, but dirEntNum changed\n"); ncp_update_inode2(inode, &finfo); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } finished: @@ -639,9 +639,9 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx, } else { struct inode *inode = d_inode(newdent); - mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(inode, I_MUTEX_CHILD); ncp_update_inode2(inode, entry); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } if (ctl.idx >= NCP_DIRCACHE_SIZE) { diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 011324ce9df2..dd38ca1f2ecb 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -224,10 +224,10 @@ ncp_file_write_iter(struct kiocb *iocb, struct iov_iter *from) iocb->ki_pos = pos; if (pos > i_size_read(inode)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (pos > i_size_read(inode)) i_size_write(inode, pos); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } ncp_dbg(1, "exit %pD2\n", file); outrel: diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c82a21228a34..9cce67043f92 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -940,7 +940,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n", filp, offset, whence); - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (whence) { case 1: offset += filp->f_pos; @@ -957,7 +957,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) dir_ctx->duped = 0; } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return offset; } @@ -972,9 +972,9 @@ static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end, dfprintk(FILE, "NFS: fsync dir(%pD2) datasync %d\n", filp, datasync); - mutex_lock(&inode->i_mutex); + inode_lock(inode); nfs_inc_stats(inode, NFSIOS_VFSFSYNC); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 7ab7ec9f4eed..7a0cfd3266e5 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -580,7 +580,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, if (!count) goto out; - mutex_lock(&inode->i_mutex); + inode_lock(inode); result = nfs_sync_mapping(mapping); if (result) goto out_unlock; @@ -608,7 +608,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, NFS_I(inode)->read_io += count; result = nfs_direct_read_schedule_iovec(dreq, iter, pos); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!result) { result = nfs_direct_wait(dreq); @@ -622,7 +622,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, out_release: nfs_direct_req_release(dreq); out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out: return result; } @@ -1005,7 +1005,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) pos = iocb->ki_pos; end = (pos + iov_iter_count(iter) - 1) >> PAGE_CACHE_SHIFT; - mutex_lock(&inode->i_mutex); + inode_lock(inode); result = nfs_sync_mapping(mapping); if (result) @@ -1045,7 +1045,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) pos >> PAGE_CACHE_SHIFT, end); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!result) { result = nfs_direct_wait(dreq); @@ -1066,7 +1066,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) out_release: nfs_direct_req_release(dreq); out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return result; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 4ef8f5addcad..748bb813b8ec 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -278,9 +278,9 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) break; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = nfs_file_fsync_commit(file, start, end, datasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * If nfs_file_fsync_commit detected a server reboot, then * resend all dirty pages that might have been covered by diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index bb1f4e7a3270..3384dc8e6683 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -971,7 +971,7 @@ filelayout_mark_request_commit(struct nfs_page *req, u32 i, j; if (fl->commit_through_mds) { - nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo); + nfs_request_add_commit_list(req, cinfo); } else { /* Note that we are calling nfs4_fl_calc_j_index on each page * that ends up being committed to a data server. An attractive diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 6594e9f903a0..5bcd92d50e82 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1948,11 +1948,9 @@ ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo, start = xdr_reserve_space(xdr, 4); BUG_ON(!start); - if (ff_layout_encode_ioerr(flo, xdr, args)) - goto out; - + ff_layout_encode_ioerr(flo, xdr, args); ff_layout_encode_iostats(flo, xdr, args); -out: + *start = cpu_to_be32((xdr->p - start - 1) * 4); dprintk("%s: Return\n", __func__); } diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index bd0327541366..29898a9550fa 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -218,63 +218,55 @@ static void extend_ds_error(struct nfs4_ff_layout_ds_err *err, err->length = end - err->offset; } -static bool ds_error_can_merge(struct nfs4_ff_layout_ds_err *err, u64 offset, - u64 length, int status, enum nfs_opnum4 opnum, - nfs4_stateid *stateid, - struct nfs4_deviceid *deviceid) +static int +ff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1, + const struct nfs4_ff_layout_ds_err *e2) { - return err->status == status && err->opnum == opnum && - nfs4_stateid_match(&err->stateid, stateid) && - !memcmp(&err->deviceid, deviceid, sizeof(*deviceid)) && - end_offset(err->offset, err->length) >= offset && - err->offset <= end_offset(offset, length); -} - -static bool merge_ds_error(struct nfs4_ff_layout_ds_err *old, - struct nfs4_ff_layout_ds_err *new) -{ - if (!ds_error_can_merge(old, new->offset, new->length, new->status, - new->opnum, &new->stateid, &new->deviceid)) - return false; - - extend_ds_error(old, new->offset, new->length); - return true; + int ret; + + if (e1->opnum != e2->opnum) + return e1->opnum < e2->opnum ? -1 : 1; + if (e1->status != e2->status) + return e1->status < e2->status ? -1 : 1; + ret = memcmp(&e1->stateid, &e2->stateid, sizeof(e1->stateid)); + if (ret != 0) + return ret; + ret = memcmp(&e1->deviceid, &e2->deviceid, sizeof(e1->deviceid)); + if (ret != 0) + return ret; + if (end_offset(e1->offset, e1->length) < e2->offset) + return -1; + if (e1->offset > end_offset(e2->offset, e2->length)) + return 1; + /* If ranges overlap or are contiguous, they are the same */ + return 0; } -static bool +static void ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo, struct nfs4_ff_layout_ds_err *dserr) { - struct nfs4_ff_layout_ds_err *err; - - list_for_each_entry(err, &flo->error_list, list) { - if (merge_ds_error(err, dserr)) { - return true; - } - } - - list_add(&dserr->list, &flo->error_list); - return false; -} - -static bool -ff_layout_update_ds_error(struct nfs4_flexfile_layout *flo, u64 offset, - u64 length, int status, enum nfs_opnum4 opnum, - nfs4_stateid *stateid, struct nfs4_deviceid *deviceid) -{ - bool found = false; - struct nfs4_ff_layout_ds_err *err; - - list_for_each_entry(err, &flo->error_list, list) { - if (ds_error_can_merge(err, offset, length, status, opnum, - stateid, deviceid)) { - found = true; - extend_ds_error(err, offset, length); + struct nfs4_ff_layout_ds_err *err, *tmp; + struct list_head *head = &flo->error_list; + int match; + + /* Do insertion sort w/ merges */ + list_for_each_entry_safe(err, tmp, &flo->error_list, list) { + match = ff_ds_error_match(err, dserr); + if (match < 0) + continue; + if (match > 0) { + /* Add entry "dserr" _before_ entry "err" */ + head = &err->list; break; } + /* Entries match, so merge "err" into "dserr" */ + extend_ds_error(dserr, err->offset, err->length); + list_del(&err->list); + kfree(err); } - return found; + list_add_tail(&dserr->list, head); } int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, @@ -283,7 +275,6 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, gfp_t gfp_flags) { struct nfs4_ff_layout_ds_err *dserr; - bool needfree; if (status == 0) return 0; @@ -291,14 +282,6 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, if (mirror->mirror_ds == NULL) return -EINVAL; - spin_lock(&flo->generic_hdr.plh_inode->i_lock); - if (ff_layout_update_ds_error(flo, offset, length, status, opnum, - &mirror->stateid, - &mirror->mirror_ds->id_node.deviceid)) { - spin_unlock(&flo->generic_hdr.plh_inode->i_lock); - return 0; - } - spin_unlock(&flo->generic_hdr.plh_inode->i_lock); dserr = kmalloc(sizeof(*dserr), gfp_flags); if (!dserr) return -ENOMEM; @@ -313,10 +296,8 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, NFS4_DEVICEID4_SIZE); spin_lock(&flo->generic_hdr.plh_inode->i_lock); - needfree = ff_layout_add_ds_error_locked(flo, dserr); + ff_layout_add_ds_error_locked(flo, dserr); spin_unlock(&flo->generic_hdr.plh_inode->i_lock); - if (needfree) - kfree(dserr); return 0; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 8e24d886d2c5..86faecf8f328 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -661,9 +661,9 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) trace_nfs_getattr_enter(inode); /* Flush out writes to the server in order to update c/mtime. */ if (S_ISREG(inode->i_mode)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = nfs_sync_inode(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (err) goto out; } @@ -1178,9 +1178,9 @@ static int __nfs_revalidate_mapping(struct inode *inode, spin_unlock(&inode->i_lock); trace_nfs_invalidate_mapping_enter(inode); if (may_lock) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = nfs_invalidate_mapping(inode, mapping); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } else ret = nfs_invalidate_mapping(inode, mapping); trace_nfs_invalidate_mapping_exit(inode, ret); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 4e8cc942336c..9a547aa3ec8e 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -484,7 +484,7 @@ void nfs_retry_commit(struct list_head *page_list, struct nfs_commit_info *cinfo, u32 ds_commit_idx); void nfs_commitdata_release(struct nfs_commit_data *data); -void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, +void nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo); void nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst, diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 6e8174930a48..bd25dc7077f7 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -101,13 +101,13 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = nfs42_proc_fallocate(&msg, filep, offset, len); if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } @@ -123,7 +123,7 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) return -EOPNOTSUPP; nfs_wb_all(inode); - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = nfs42_proc_fallocate(&msg, filep, offset, len); if (err == 0) @@ -131,7 +131,7 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 26f9a23e2b25..57ca1c8039c1 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -141,11 +141,11 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) break; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = nfs_file_fsync_commit(file, start, end, datasync); if (!ret) ret = pnfs_sync_inode(inode, !!datasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * If nfs_file_fsync_commit detected a server reboot, then * resend all dirty pages that might have been covered by @@ -219,13 +219,13 @@ static int nfs42_clone_file_range(struct file *src_file, loff_t src_off, /* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? */ if (same_inode) { - mutex_lock(&src_inode->i_mutex); + inode_lock(src_inode); } else if (dst_inode < src_inode) { - mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(dst_inode, I_MUTEX_PARENT); + inode_lock_nested(src_inode, I_MUTEX_CHILD); } else { - mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(src_inode, I_MUTEX_PARENT); + inode_lock_nested(dst_inode, I_MUTEX_CHILD); } /* flush all pending writes on both src and dst so that server @@ -246,13 +246,13 @@ static int nfs42_clone_file_range(struct file *src_file, loff_t src_off, out_unlock: if (same_inode) { - mutex_unlock(&src_inode->i_mutex); + inode_unlock(src_inode); } else if (dst_inode < src_inode) { - mutex_unlock(&src_inode->i_mutex); - mutex_unlock(&dst_inode->i_mutex); + inode_unlock(src_inode); + inode_unlock(dst_inode); } else { - mutex_unlock(&dst_inode->i_mutex); - mutex_unlock(&src_inode->i_mutex); + inode_unlock(dst_inode); + inode_unlock(src_inode); } out: return ret; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ce43cd6d88c6..5754835a2886 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -830,11 +830,10 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked); * holding the nfs_page lock. */ void -nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, - struct nfs_commit_info *cinfo) +nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo) { spin_lock(cinfo->lock); - nfs_request_add_commit_list_locked(req, dst, cinfo); + nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo); spin_unlock(cinfo->lock); nfs_mark_page_unstable(req->wb_page, cinfo); } @@ -892,7 +891,7 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, { if (pnfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx)) return; - nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo); + nfs_request_add_commit_list(req, cinfo); } static void diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 819ad812c71b..4cba7865f496 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -55,10 +55,10 @@ nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u struct inode *inode = d_inode(resfh->fh_dentry); int status; - mutex_lock(&inode->i_mutex); + inode_lock(inode); status = security_inode_setsecctx(resfh->fh_dentry, label->data, label->len); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (status) /* diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 79f0307a5ec8..dc8ebecf5618 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -192,7 +192,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) dir = nn->rec_file->f_path.dentry; /* lock the parent */ - mutex_lock(&d_inode(dir)->i_mutex); + inode_lock(d_inode(dir)); dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1); if (IS_ERR(dentry)) { @@ -213,7 +213,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) out_put: dput(dentry); out_unlock: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); if (status == 0) { if (nn->in_grace) { crp = nfs4_client_to_reclaim(dname, nn); @@ -286,7 +286,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) } status = iterate_dir(nn->rec_file, &ctx.ctx); - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); list_for_each_entry_safe(entry, tmp, &ctx.names, list) { if (!status) { @@ -302,7 +302,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) list_del(&entry->list); kfree(entry); } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); nfs4_reset_creds(original_cred); list_for_each_entry_safe(entry, tmp, &ctx.names, list) { @@ -322,7 +322,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn) dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name); dir = nn->rec_file->f_path.dentry; - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); dentry = lookup_one_len(name, dir, namlen); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); @@ -335,7 +335,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn) out: dput(dentry); out_unlock: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); return status; } diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 0770bcb543c8..f84fe6bf9aee 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -288,7 +288,7 @@ fh_lock_nested(struct svc_fh *fhp, unsigned int subclass) } inode = d_inode(dentry); - mutex_lock_nested(&inode->i_mutex, subclass); + inode_lock_nested(inode, subclass); fill_pre_wcc(fhp); fhp->fh_locked = true; } @@ -307,7 +307,7 @@ fh_unlock(struct svc_fh *fhp) { if (fhp->fh_locked) { fill_post_wcc(fhp); - mutex_unlock(&d_inode(fhp->fh_dentry)->i_mutex); + inode_unlock(d_inode(fhp->fh_dentry)); fhp->fh_locked = false; } } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 6739077f17fe..5d2a57e4c03a 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -493,9 +493,9 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, dentry = fhp->fh_dentry; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); host_error = security_inode_setsecctx(dentry, label->data, label->len); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return nfserrno(host_error); } #else diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 10b22527a617..21a1e2e0d92f 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -1003,7 +1003,7 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); @@ -1113,6 +1113,6 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret == 1) ret = 0; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index aba43811d6ef..e8fe24882b5b 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -158,7 +158,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, flags = nilfs_mask_flags(inode->i_mode, flags); - mutex_lock(&inode->i_mutex); + inode_lock(inode); oldflags = NILFS_I(inode)->i_flags; @@ -186,7 +186,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, nilfs_mark_inode_dirty(inode); ret = nilfs_transaction_commit(inode->i_sb); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mnt_drop_write_file(filp); return ret; } diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 9e38dafa3bc7..b2eff5816adc 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1509,7 +1509,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, err = filemap_write_and_wait_range(vi->i_mapping, start, end); if (err) return err; - mutex_lock(&vi->i_mutex); + inode_lock(vi); BUG_ON(!S_ISDIR(vi->i_mode)); /* If the bitmap attribute inode is in memory sync it, too. */ @@ -1532,7 +1532,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, else ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " "%u.", datasync ? "data" : "", vi->i_ino, -ret); - mutex_unlock(&vi->i_mutex); + inode_unlock(vi); return ret; } diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 9d383e5eff0e..bed4d427dfae 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1944,14 +1944,14 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t written = 0; ssize_t err; - mutex_lock(&vi->i_mutex); + inode_lock(vi); /* We can write back this queue in page reclaim. */ current->backing_dev_info = inode_to_bdi(vi); err = ntfs_prepare_file_for_write(iocb, from); if (iov_iter_count(from) && !err) written = ntfs_perform_write(file, from, iocb->ki_pos); current->backing_dev_info = NULL; - mutex_unlock(&vi->i_mutex); + inode_unlock(vi); if (likely(written > 0)) { err = generic_write_sync(file, iocb->ki_pos, written); if (err < 0) @@ -1996,7 +1996,7 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, err = filemap_write_and_wait_range(vi->i_mapping, start, end); if (err) return err; - mutex_lock(&vi->i_mutex); + inode_lock(vi); BUG_ON(S_ISDIR(vi->i_mode)); if (!datasync || !NInoNonResident(NTFS_I(vi))) @@ -2015,7 +2015,7 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, else ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " "%u.", datasync ? "data" : "", vi->i_ino, -ret); - mutex_unlock(&vi->i_mutex); + inode_unlock(vi); return ret; } diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c index d80e3315cab0..9793e68ba1dd 100644 --- a/fs/ntfs/quota.c +++ b/fs/ntfs/quota.c @@ -48,7 +48,7 @@ bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol) ntfs_error(vol->sb, "Quota inodes are not open."); return false; } - mutex_lock(&vol->quota_q_ino->i_mutex); + inode_lock(vol->quota_q_ino); ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino)); if (!ictx) { ntfs_error(vol->sb, "Failed to get index context."); @@ -98,7 +98,7 @@ bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol) ntfs_index_entry_mark_dirty(ictx); set_done: ntfs_index_ctx_put(ictx); - mutex_unlock(&vol->quota_q_ino->i_mutex); + inode_unlock(vol->quota_q_ino); /* * We set the flag so we do not try to mark the quotas out of date * again on remount. @@ -110,7 +110,7 @@ done: err_out: if (ictx) ntfs_index_ctx_put(ictx); - mutex_unlock(&vol->quota_q_ino->i_mutex); + inode_unlock(vol->quota_q_ino); return false; } diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 2f77f8dfb861..1b38abdaa3ed 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -1284,10 +1284,10 @@ static int check_windows_hibernation_status(ntfs_volume *vol) * Find the inode number for the hibernation file by looking up the * filename hiberfil.sys in the root directory. */ - mutex_lock(&vol->root_ino->i_mutex); + inode_lock(vol->root_ino); mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12, &name); - mutex_unlock(&vol->root_ino->i_mutex); + inode_unlock(vol->root_ino); if (IS_ERR_MREF(mref)) { ret = MREF_ERR(mref); /* If the file does not exist, Windows is not hibernated. */ @@ -1377,10 +1377,10 @@ static bool load_and_init_quota(ntfs_volume *vol) * Find the inode number for the quota file by looking up the filename * $Quota in the extended system files directory $Extend. */ - mutex_lock(&vol->extend_ino->i_mutex); + inode_lock(vol->extend_ino); mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6, &name); - mutex_unlock(&vol->extend_ino->i_mutex); + inode_unlock(vol->extend_ino); if (IS_ERR_MREF(mref)) { /* * If the file does not exist, quotas are disabled and have @@ -1460,10 +1460,10 @@ static bool load_and_init_usnjrnl(ntfs_volume *vol) * Find the inode number for the transaction log file by looking up the * filename $UsnJrnl in the extended system files directory $Extend. */ - mutex_lock(&vol->extend_ino->i_mutex); + inode_lock(vol->extend_ino); mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8, &name); - mutex_unlock(&vol->extend_ino->i_mutex); + inode_unlock(vol->extend_ino); if (IS_ERR_MREF(mref)) { /* * If the file does not exist, transaction logging is disabled, diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index a3ded88718c9..d002579c6f2b 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5719,7 +5719,7 @@ int ocfs2_remove_btree_range(struct inode *inode, goto bail; } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); @@ -5776,7 +5776,7 @@ int ocfs2_remove_btree_range(struct inode *inode, out_commit: ocfs2_commit_trans(osb, handle); out: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); bail: if (meta_ac) ocfs2_free_alloc_context(meta_ac); @@ -5832,7 +5832,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, struct ocfs2_dinode *di; struct ocfs2_truncate_log *tl; - BUG_ON(mutex_trylock(&tl_inode->i_mutex)); + BUG_ON(inode_trylock(tl_inode)); start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); @@ -5980,7 +5980,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) struct ocfs2_dinode *di; struct ocfs2_truncate_log *tl; - BUG_ON(mutex_trylock(&tl_inode->i_mutex)); + BUG_ON(inode_trylock(tl_inode)); di = (struct ocfs2_dinode *) tl_bh->b_data; @@ -6008,7 +6008,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) goto out; } - mutex_lock(&data_alloc_inode->i_mutex); + inode_lock(data_alloc_inode); status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1); if (status < 0) { @@ -6035,7 +6035,7 @@ out_unlock: ocfs2_inode_unlock(data_alloc_inode, 1); out_mutex: - mutex_unlock(&data_alloc_inode->i_mutex); + inode_unlock(data_alloc_inode); iput(data_alloc_inode); out: @@ -6047,9 +6047,9 @@ int ocfs2_flush_truncate_log(struct ocfs2_super *osb) int status; struct inode *tl_inode = osb->osb_tl_inode; - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); status = __ocfs2_flush_truncate_log(osb); - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); return status; } @@ -6208,7 +6208,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, (unsigned long long)le64_to_cpu(tl_copy->i_blkno), num_recs); - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); for(i = 0; i < num_recs; i++) { if (ocfs2_truncate_log_needs_flush(osb)) { status = __ocfs2_flush_truncate_log(osb); @@ -6239,7 +6239,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, } bail_up: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); return status; } @@ -6346,7 +6346,7 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb, goto out; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = ocfs2_inode_lock(inode, &di_bh, 1); if (ret) { @@ -6395,7 +6395,7 @@ out_unlock: ocfs2_inode_unlock(inode, 1); brelse(di_bh); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); out: while(head) { @@ -6439,7 +6439,7 @@ static int ocfs2_free_cached_clusters(struct ocfs2_super *osb, handle_t *handle; int ret = 0; - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); while (head) { if (ocfs2_truncate_log_needs_flush(osb)) { @@ -6471,7 +6471,7 @@ static int ocfs2_free_cached_clusters(struct ocfs2_super *osb, } } - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); while (head) { /* Premature exit may have left some dangling items. */ @@ -7355,7 +7355,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0); if (ret < 0) { @@ -7422,7 +7422,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 0); brelse(main_bm_bh); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: return ret; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 7f604727f487..794fd1587f34 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2046,9 +2046,9 @@ static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb, int ret = 0; unsigned int truncated_clusters; - mutex_lock(&osb->osb_tl_inode->i_mutex); + inode_lock(osb->osb_tl_inode); truncated_clusters = osb->truncated_clusters; - mutex_unlock(&osb->osb_tl_inode->i_mutex); + inode_unlock(osb->osb_tl_inode); /* * Check whether we can succeed in allocating if we free diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 72afdca3cea7..ebe543894db0 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -757,7 +757,7 @@ int o2nm_depend_item(struct config_item *item) void o2nm_undepend_item(struct config_item *item) { - configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item); + configfs_undepend_item(item); } int o2nm_depend_this_node(void) diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index ffecf89c8c1c..e1adf285fc31 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -4361,7 +4361,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir, mlog_errno(ret); goto out; } - mutex_lock(&dx_alloc_inode->i_mutex); + inode_lock(dx_alloc_inode); ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1); if (ret) { @@ -4410,7 +4410,7 @@ out_unlock: ocfs2_inode_unlock(dx_alloc_inode, 1); out_mutex: - mutex_unlock(&dx_alloc_inode->i_mutex); + inode_unlock(dx_alloc_inode); brelse(dx_alloc_bh); out: iput(dx_alloc_inode); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index f92612e4b9d6..474e57f834e6 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1390,6 +1390,7 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb, unsigned int gen; int noqueue_attempted = 0; int dlm_locked = 0; + int kick_dc = 0; if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) { mlog_errno(-EINVAL); @@ -1524,7 +1525,12 @@ update_holders: unlock: lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); + /* ocfs2_unblock_lock reques on seeing OCFS2_LOCK_UPCONVERT_FINISHING */ + kick_dc = (lockres->l_flags & OCFS2_LOCK_BLOCKED); + spin_unlock_irqrestore(&lockres->l_lock, flags); + if (kick_dc) + ocfs2_wake_downconvert_thread(osb); out: /* * This is helping work around a lock inversion between the page lock diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index d63127932509..7cb38fdca229 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1872,7 +1872,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return -EROFS; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * This prevents concurrent writes on other nodes @@ -1991,7 +1991,7 @@ out_rw_unlock: ocfs2_rw_unlock(inode, 1); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -2299,7 +2299,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0; direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); relock: /* @@ -2435,7 +2435,7 @@ out: ocfs2_rw_unlock(inode, rw_level); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (written) ret = written; @@ -2547,7 +2547,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file->f_mapping->host; int ret = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (whence) { case SEEK_SET: @@ -2585,7 +2585,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret) return ret; return offset; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 97a563bab9a8..36294446d960 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -630,10 +630,10 @@ static int ocfs2_remove_inode(struct inode *inode, goto bail; } - mutex_lock(&inode_alloc_inode->i_mutex); + inode_lock(inode_alloc_inode); status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1); if (status < 0) { - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); mlog_errno(status); goto bail; @@ -680,7 +680,7 @@ bail_commit: ocfs2_commit_trans(osb, handle); bail_unlock: ocfs2_inode_unlock(inode_alloc_inode, 1); - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); brelse(inode_alloc_bh); bail: iput(inode_alloc_inode); @@ -751,10 +751,10 @@ static int ocfs2_wipe_inode(struct inode *inode, /* Lock the orphan dir. The lock will be held for the entire * delete_inode operation. We do this now to avoid races with * recovery completion on other nodes. */ - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (status < 0) { - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); mlog_errno(status); goto bail; @@ -803,7 +803,7 @@ bail_unlock_dir: return status; ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); brelse(orphan_dir_bh); bail: iput(orphan_dir_inode); diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 16b0bb482ea7..4506ec5ec2ea 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -86,7 +86,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, unsigned oldflags; int status; - mutex_lock(&inode->i_mutex); + inode_lock(inode); status = ocfs2_inode_lock(inode, &bh, 1); if (status < 0) { @@ -135,7 +135,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, bail_unlock: ocfs2_inode_unlock(inode, 1); bail: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); brelse(bh); @@ -287,7 +287,7 @@ static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, struct ocfs2_dinode *dinode_alloc = NULL; if (inode_alloc) - mutex_lock(&inode_alloc->i_mutex); + inode_lock(inode_alloc); if (o2info_coherent(&fi->ifi_req)) { status = ocfs2_inode_lock(inode_alloc, &bh, 0); @@ -317,7 +317,7 @@ bail: ocfs2_inode_unlock(inode_alloc, 0); if (inode_alloc) - mutex_unlock(&inode_alloc->i_mutex); + inode_unlock(inode_alloc); brelse(bh); @@ -547,7 +547,7 @@ static int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, struct ocfs2_dinode *gb_dinode = NULL; if (gb_inode) - mutex_lock(&gb_inode->i_mutex); + inode_lock(gb_inode); if (o2info_coherent(&ffg->iff_req)) { status = ocfs2_inode_lock(gb_inode, &bh, 0); @@ -604,7 +604,7 @@ bail: ocfs2_inode_unlock(gb_inode, 0); if (gb_inode) - mutex_unlock(&gb_inode->i_mutex); + inode_unlock(gb_inode); iput(gb_inode); brelse(bh); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 3772a2dbb980..61b833b721d8 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -2088,7 +2088,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, return status; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0); if (status < 0) { mlog_errno(status); @@ -2106,7 +2106,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, out_cluster: ocfs2_inode_unlock(orphan_dir_inode, 0); out: - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); return status; } @@ -2196,7 +2196,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, oi->ip_next_orphan = NULL; if (oi->ip_flags & OCFS2_INODE_DIO_ORPHAN_ENTRY) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = ocfs2_rw_lock(inode, 1); if (ret < 0) { mlog_errno(ret); @@ -2235,7 +2235,7 @@ unlock_inode: unlock_rw: ocfs2_rw_unlock(inode, 1); unlock_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* clear dio flag in ocfs2_inode_info */ oi->ip_flags &= ~OCFS2_INODE_DIO_ORPHAN_ENTRY; diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index e9c99e35f5ea..7d62c43a2c3e 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -414,7 +414,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (status < 0) { @@ -468,7 +468,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: @@ -506,7 +506,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, goto bail; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); status = ocfs2_read_inode_block_full(inode, &alloc_bh, OCFS2_BH_IGNORE_CACHE); @@ -539,7 +539,7 @@ bail: brelse(alloc_bh); if (inode) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); } @@ -571,7 +571,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb, goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (status < 0) { @@ -601,7 +601,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); brelse(main_bm_bh); @@ -643,7 +643,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, goto bail; } - mutex_lock(&local_alloc_inode->i_mutex); + inode_lock(local_alloc_inode); /* * We must double check state and allocator bits because @@ -709,7 +709,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, status = 0; bail: if (status < 0 && local_alloc_inode) { - mutex_unlock(&local_alloc_inode->i_mutex); + inode_unlock(local_alloc_inode); iput(local_alloc_inode); } diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 124471d26a73..e3d05d9901a3 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -276,7 +276,7 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; */ - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); @@ -338,7 +338,7 @@ out_commit: ocfs2_commit_trans(osb, handle); out_unlock_mutex: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); if (context->data_ac) { ocfs2_free_alloc_context(context->data_ac); @@ -632,7 +632,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, goto out; } - mutex_lock(&gb_inode->i_mutex); + inode_lock(gb_inode); ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1); if (ret) { @@ -640,7 +640,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, goto out_unlock_gb_mutex; } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { @@ -708,11 +708,11 @@ out_commit: brelse(gd_bh); out_unlock_tl_inode: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); ocfs2_inode_unlock(gb_inode, 1); out_unlock_gb_mutex: - mutex_unlock(&gb_inode->i_mutex); + inode_unlock(gb_inode); brelse(gb_bh); iput(gb_inode); @@ -905,7 +905,7 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return -EROFS; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * This prevents concurrent writes from other nodes @@ -969,7 +969,7 @@ out_inode_unlock: out_rw_unlock: ocfs2_rw_unlock(inode, 1); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return status; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index ab42c38031b1..6b3e87189a64 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1045,7 +1045,7 @@ leave: if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); iput(orphan_dir); } @@ -1664,7 +1664,7 @@ bail: if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); iput(orphan_dir); } @@ -2121,11 +2121,11 @@ static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb, return ret; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (ret < 0) { - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); mlog_errno(ret); @@ -2226,7 +2226,7 @@ out: if (ret) { ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); } @@ -2495,7 +2495,7 @@ out: ocfs2_free_alloc_context(inode_ac); /* Unroll orphan dir locking */ - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); ocfs2_inode_unlock(orphan_dir, 1); iput(orphan_dir); } @@ -2602,7 +2602,7 @@ leave: if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); iput(orphan_dir); } @@ -2689,7 +2689,7 @@ int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, bail_unlock_orphan: ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); ocfs2_free_dir_lookup_result(&orphan_insert); @@ -2721,10 +2721,10 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, goto bail; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (status < 0) { - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); mlog_errno(status); goto bail; @@ -2770,7 +2770,7 @@ bail_commit: bail_unlock_orphan: ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); brelse(orphan_dir_bh); iput(orphan_dir_inode); @@ -2834,12 +2834,12 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, goto leave; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (status < 0) { mlog_errno(status); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); goto leave; } @@ -2901,7 +2901,7 @@ out_commit: ocfs2_commit_trans(osb, handle); orphan_unlock: ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); leave: diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index fde9ef18cff3..9c9dd30bc945 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -308,7 +308,7 @@ int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) WARN_ON(bh != oinfo->dqi_gqi_bh); spin_unlock(&dq_data_lock); if (ex) { - mutex_lock(&oinfo->dqi_gqinode->i_mutex); + inode_lock(oinfo->dqi_gqinode); down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); } else { down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); @@ -320,7 +320,7 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) { if (ex) { up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); - mutex_unlock(&oinfo->dqi_gqinode->i_mutex); + inode_unlock(oinfo->dqi_gqinode); } else { up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); } diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 252119860e6c..3eff031aaf26 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -807,7 +807,7 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) mlog_errno(ret); goto out; } - mutex_lock(&alloc_inode->i_mutex); + inode_lock(alloc_inode); ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1); if (ret) { @@ -867,7 +867,7 @@ out_unlock: } out_mutex: if (alloc_inode) { - mutex_unlock(&alloc_inode->i_mutex); + inode_unlock(alloc_inode); iput(alloc_inode); } out: @@ -4197,7 +4197,7 @@ static int __ocfs2_reflink(struct dentry *old_dentry, goto out; } - mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(new_inode, I_MUTEX_CHILD); ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1, OI_LS_REFLINK_TARGET); if (ret) { @@ -4231,7 +4231,7 @@ inode_unlock: ocfs2_inode_unlock(new_inode, 1); brelse(new_bh); out_unlock: - mutex_unlock(&new_inode->i_mutex); + inode_unlock(new_inode); out: if (!ret) { ret = filemap_fdatawait(inode->i_mapping); @@ -4402,11 +4402,11 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, return error; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = dquot_initialize(dir); if (!error) error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!error) fsnotify_create(dir, new_dentry); return error; diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index 79b8021302b3..576b9a04873f 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -301,7 +301,7 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (ret < 0) { @@ -375,7 +375,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: @@ -486,7 +486,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (ret < 0) { @@ -590,7 +590,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index fc6d25f6d444..2f19aeec5482 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -141,7 +141,7 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) if (ac->ac_which != OCFS2_AC_USE_LOCAL) ocfs2_inode_unlock(inode, 1); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); ac->ac_inode = NULL; @@ -797,11 +797,11 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, return -EINVAL; } - mutex_lock(&alloc_inode->i_mutex); + inode_lock(alloc_inode); status = ocfs2_inode_lock(alloc_inode, &bh, 1); if (status < 0) { - mutex_unlock(&alloc_inode->i_mutex); + inode_unlock(alloc_inode); iput(alloc_inode); mlog_errno(status); @@ -2875,10 +2875,10 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) goto bail; } - mutex_lock(&inode_alloc_inode->i_mutex); + inode_lock(inode_alloc_inode); status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); if (status < 0) { - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); iput(inode_alloc_inode); mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n", (u32)suballoc_slot, status); @@ -2891,7 +2891,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) mlog(ML_ERROR, "test suballoc bit failed %d\n", status); ocfs2_inode_unlock(inode_alloc_inode, 0); - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); iput(inode_alloc_inode); brelse(alloc_bh); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index f0e241ffd94f..7d3d979f57d9 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2524,7 +2524,7 @@ static int ocfs2_xattr_free_block(struct inode *inode, mlog_errno(ret); goto out; } - mutex_lock(&xb_alloc_inode->i_mutex); + inode_lock(xb_alloc_inode); ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1); if (ret < 0) { @@ -2549,7 +2549,7 @@ out_unlock: ocfs2_inode_unlock(xb_alloc_inode, 1); brelse(xb_alloc_bh); out_mutex: - mutex_unlock(&xb_alloc_inode->i_mutex); + inode_unlock(xb_alloc_inode); iput(xb_alloc_inode); out: brelse(blk_bh); @@ -3619,17 +3619,17 @@ int ocfs2_xattr_set(struct inode *inode, } } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); if (ret < 0) { - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); mlog_errno(ret); goto cleanup; } } - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis, &xbs, &ctxt, ref_meta, &credits); @@ -5460,7 +5460,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode, return ret; } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); @@ -5504,7 +5504,7 @@ out_commit: out: ocfs2_schedule_truncate_log_flush(osb, 1); - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); if (meta_ac) ocfs2_free_alloc_context(meta_ac); diff --git a/fs/open.c b/fs/open.c index b25b1542c530..55bdc75e2172 100644 --- a/fs/open.c +++ b/fs/open.c @@ -58,10 +58,10 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, if (ret) newattrs.ia_valid |= ret | ATTR_FORCE; - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock(dentry->d_inode); /* Note any delegations or leases have already been broken: */ ret = notify_change(dentry, &newattrs, NULL); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); return ret; } @@ -510,7 +510,7 @@ static int chmod_common(struct path *path, umode_t mode) if (error) return error; retry_deleg: - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = security_path_chmod(path, mode); if (error) goto out_unlock; @@ -518,7 +518,7 @@ retry_deleg: newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; error = notify_change(path->dentry, &newattrs, &delegated_inode); out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) @@ -593,11 +593,11 @@ retry_deleg: if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = security_path_chown(path, uid, gid); if (!error) error = notify_change(path->dentry, &newattrs, &delegated_inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 0a8983492d91..d894e7cd9a86 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -22,9 +22,9 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new) { - ssize_t list_size, size; - char *buf, *name, *value; - int error; + ssize_t list_size, size, value_size = 0; + char *buf, *name, *value = NULL; + int uninitialized_var(error); if (!old->d_inode->i_op->getxattr || !new->d_inode->i_op->getxattr) @@ -41,29 +41,40 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new) if (!buf) return -ENOMEM; - error = -ENOMEM; - value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL); - if (!value) - goto out; - list_size = vfs_listxattr(old, buf, list_size); if (list_size <= 0) { error = list_size; - goto out_free_value; + goto out; } for (name = buf; name < (buf + list_size); name += strlen(name) + 1) { - size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX); - if (size <= 0) { +retry: + size = vfs_getxattr(old, name, value, value_size); + if (size == -ERANGE) + size = vfs_getxattr(old, name, NULL, 0); + + if (size < 0) { error = size; - goto out_free_value; + break; + } + + if (size > value_size) { + void *new; + + new = krealloc(value, size, GFP_KERNEL); + if (!new) { + error = -ENOMEM; + break; + } + value = new; + value_size = size; + goto retry; } + error = vfs_setxattr(new, name, value, size, 0); if (error) - goto out_free_value; + break; } - -out_free_value: kfree(value); out: kfree(buf); @@ -237,9 +248,9 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, if (err) goto out_cleanup; - mutex_lock(&newdentry->d_inode->i_mutex); + inode_lock(newdentry->d_inode); err = ovl_set_attr(newdentry, stat); - mutex_unlock(&newdentry->d_inode->i_mutex); + inode_unlock(newdentry->d_inode); if (err) goto out_cleanup; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 692ceda3bc21..ed95272d57a6 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -167,7 +167,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, struct dentry *newdentry; int err; - mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(udir, I_MUTEX_PARENT); newdentry = lookup_one_len(dentry->d_name.name, upperdir, dentry->d_name.len); err = PTR_ERR(newdentry); @@ -185,7 +185,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, out_dput: dput(newdentry); out_unlock: - mutex_unlock(&udir->i_mutex); + inode_unlock(udir); return err; } @@ -258,9 +258,9 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, if (err) goto out_cleanup; - mutex_lock(&opaquedir->d_inode->i_mutex); + inode_lock(opaquedir->d_inode); err = ovl_set_attr(opaquedir, &stat); - mutex_unlock(&opaquedir->d_inode->i_mutex); + inode_unlock(opaquedir->d_inode); if (err) goto out_cleanup; @@ -599,7 +599,7 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) struct dentry *upper = ovl_dentry_upper(dentry); int err; - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); err = -ESTALE; if (upper->d_parent == upperdir) { /* Don't let d_delete() think it can reset d_inode */ @@ -619,7 +619,7 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) * now. */ d_drop(dentry); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return err; } diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 964a60fa7afc..49e204560655 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -42,6 +42,19 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) int err; struct dentry *upperdentry; + /* + * Check for permissions before trying to copy-up. This is redundant + * since it will be rechecked later by ->setattr() on upper dentry. But + * without this, copy-up can be triggered by just about anybody. + * + * We don't initialize inode->size, which just means that + * inode_newsize_ok() will always check against MAX_LFS_FILESIZE and not + * check for a swapfile (which this won't be anyway). + */ + err = inode_change_ok(dentry->d_inode, attr); + if (err) + return err; + err = ovl_want_write(dentry); if (err) goto out; @@ -50,9 +63,9 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) if (!err) { upperdentry = ovl_dentry_upper(dentry); - mutex_lock(&upperdentry->d_inode->i_mutex); + inode_lock(upperdentry->d_inode); err = notify_change(upperdentry, attr, NULL); - mutex_unlock(&upperdentry->d_inode->i_mutex); + inode_unlock(upperdentry->d_inode); } ovl_drop_write(dentry); out: @@ -95,6 +108,29 @@ int ovl_permission(struct inode *inode, int mask) realdentry = ovl_entry_real(oe, &is_upper); + if (ovl_is_default_permissions(inode)) { + struct kstat stat; + struct path realpath = { .dentry = realdentry }; + + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + + realpath.mnt = ovl_entry_mnt_real(oe, inode, is_upper); + + err = vfs_getattr(&realpath, &stat); + if (err) + return err; + + if ((stat.mode ^ inode->i_mode) & S_IFMT) + return -ESTALE; + + inode->i_mode = stat.mode; + inode->i_uid = stat.uid; + inode->i_gid = stat.gid; + + return generic_permission(inode, mask); + } + /* Careful in RCU walk mode */ realinode = ACCESS_ONCE(realdentry->d_inode); if (!realinode) { diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index e17154aeaae4..99b4168c36ff 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -142,7 +142,10 @@ struct dentry *ovl_dentry_upper(struct dentry *dentry); struct dentry *ovl_dentry_lower(struct dentry *dentry); struct dentry *ovl_dentry_real(struct dentry *dentry); struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper); +struct vfsmount *ovl_entry_mnt_real(struct ovl_entry *oe, struct inode *inode, + bool is_upper); struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry); +bool ovl_is_default_permissions(struct inode *inode); void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache); struct dentry *ovl_workdir(struct dentry *dentry); int ovl_want_write(struct dentry *dentry); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 70e9af551600..fdaf28f75e12 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -228,7 +228,7 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd) dput(dentry); } } - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); } revert_creds(old_cred); put_cred(override_cred); @@ -399,7 +399,7 @@ static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) loff_t res; struct ovl_dir_file *od = file->private_data; - mutex_lock(&file_inode(file)->i_mutex); + inode_lock(file_inode(file)); if (!file->f_pos) ovl_dir_reset(file); @@ -429,7 +429,7 @@ static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) res = offset; } out_unlock: - mutex_unlock(&file_inode(file)->i_mutex); + inode_unlock(file_inode(file)); return res; } @@ -454,10 +454,10 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, ovl_path_upper(dentry, &upperpath); realfile = ovl_path_open(&upperpath, O_RDONLY); smp_mb__before_spinlock(); - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (!od->upperfile) { if (IS_ERR(realfile)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return PTR_ERR(realfile); } od->upperfile = realfile; @@ -467,7 +467,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, fput(realfile); realfile = od->upperfile; } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } } @@ -479,9 +479,9 @@ static int ovl_dir_release(struct inode *inode, struct file *file) struct ovl_dir_file *od = file->private_data; if (od->cache) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); ovl_cache_put(od, file->f_path.dentry); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } fput(od->realfile); if (od->upperfile) @@ -557,7 +557,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) { struct ovl_cache_entry *p; - mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(upper->d_inode, I_MUTEX_CHILD); list_for_each_entry(p, list, l_node) { struct dentry *dentry; @@ -571,8 +571,9 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) (int) PTR_ERR(dentry)); continue; } - ovl_cleanup(upper->d_inode, dentry); + if (dentry->d_inode) + ovl_cleanup(upper->d_inode, dentry); dput(dentry); } - mutex_unlock(&upper->d_inode->i_mutex); + inode_unlock(upper->d_inode); } diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index e38ee0fed24a..8d826bd56b26 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -9,12 +9,14 @@ #include <linux/fs.h> #include <linux/namei.h> +#include <linux/pagemap.h> #include <linux/xattr.h> #include <linux/security.h> #include <linux/mount.h> #include <linux/slab.h> #include <linux/parser.h> #include <linux/module.h> +#include <linux/pagemap.h> #include <linux/sched.h> #include <linux/statfs.h> #include <linux/seq_file.h> @@ -24,12 +26,11 @@ MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Overlay filesystem"); MODULE_LICENSE("GPL"); -#define OVERLAYFS_SUPER_MAGIC 0x794c7630 - struct ovl_config { char *lowerdir; char *upperdir; char *workdir; + bool default_permissions; }; /* private information held for overlayfs's superblock */ @@ -154,6 +155,18 @@ struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper) return realdentry; } +struct vfsmount *ovl_entry_mnt_real(struct ovl_entry *oe, struct inode *inode, + bool is_upper) +{ + if (is_upper) { + struct ovl_fs *ofs = inode->i_sb->s_fs_info; + + return ofs->upper_mnt; + } else { + return oe->numlower ? oe->lowerstack[0].mnt : NULL; + } +} + struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry) { struct ovl_entry *oe = dentry->d_fsdata; @@ -161,6 +174,13 @@ struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry) return oe->cache; } +bool ovl_is_default_permissions(struct inode *inode) +{ + struct ovl_fs *ofs = inode->i_sb->s_fs_info; + + return ofs->config.default_permissions; +} + void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache) { struct ovl_entry *oe = dentry->d_fsdata; @@ -209,7 +229,7 @@ void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry) { struct ovl_entry *oe = dentry->d_fsdata; - WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex)); + WARN_ON(!inode_is_locked(upperdentry->d_parent->d_inode)); WARN_ON(oe->__upperdentry); BUG_ON(!upperdentry->d_inode); /* @@ -224,7 +244,7 @@ void ovl_dentry_version_inc(struct dentry *dentry) { struct ovl_entry *oe = dentry->d_fsdata; - WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); + WARN_ON(!inode_is_locked(dentry->d_inode)); oe->version++; } @@ -232,7 +252,7 @@ u64 ovl_dentry_version_get(struct dentry *dentry) { struct ovl_entry *oe = dentry->d_fsdata; - WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); + WARN_ON(!inode_is_locked(dentry->d_inode)); return oe->version; } @@ -355,9 +375,9 @@ static inline struct dentry *ovl_lookup_real(struct dentry *dir, { struct dentry *dentry; - mutex_lock(&dir->d_inode->i_mutex); + inode_lock(dir->d_inode); dentry = lookup_one_len(name->name, dir, name->len); - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); if (IS_ERR(dentry)) { if (PTR_ERR(dentry) == -ENOENT) @@ -594,6 +614,8 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) seq_show_option(m, "upperdir", ufs->config.upperdir); seq_show_option(m, "workdir", ufs->config.workdir); } + if (ufs->config.default_permissions) + seq_puts(m, ",default_permissions"); return 0; } @@ -618,6 +640,7 @@ enum { OPT_LOWERDIR, OPT_UPPERDIR, OPT_WORKDIR, + OPT_DEFAULT_PERMISSIONS, OPT_ERR, }; @@ -625,6 +648,7 @@ static const match_table_t ovl_tokens = { {OPT_LOWERDIR, "lowerdir=%s"}, {OPT_UPPERDIR, "upperdir=%s"}, {OPT_WORKDIR, "workdir=%s"}, + {OPT_DEFAULT_PERMISSIONS, "default_permissions"}, {OPT_ERR, NULL} }; @@ -685,6 +709,10 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) return -ENOMEM; break; + case OPT_DEFAULT_PERMISSIONS: + config->default_permissions = true; + break; + default: pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p); return -EINVAL; @@ -716,7 +744,7 @@ static struct dentry *ovl_workdir_create(struct vfsmount *mnt, if (err) return ERR_PTR(err); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); retry: work = lookup_one_len(OVL_WORKDIR_NAME, dentry, strlen(OVL_WORKDIR_NAME)); @@ -742,7 +770,7 @@ retry: goto out_dput; } out_unlock: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); mnt_drop_write(mnt); return work; @@ -910,6 +938,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) } sb->s_stack_depth = 0; + sb->s_maxbytes = MAX_LFS_FILESIZE; if (ufs->config.upperdir) { if (!ufs->config.workdir) { pr_err("overlayfs: missing 'workdir'\n"); @@ -1053,6 +1082,9 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) root_dentry->d_fsdata = oe; + ovl_copyattr(ovl_dentry_real(root_dentry)->d_inode, + root_dentry->d_inode); + sb->s_magic = OVERLAYFS_SUPER_MAGIC; sb->s_op = &ovl_super_operations; sb->s_root = root_dentry; diff --git a/fs/pipe.c b/fs/pipe.c index 42cf8ddf0e55..ab8dad3ccb6a 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -38,6 +38,12 @@ unsigned int pipe_max_size = 1048576; */ unsigned int pipe_min_size = PAGE_SIZE; +/* Maximum allocatable pages per user. Hard limit is unset by default, soft + * matches default values. + */ +unsigned long pipe_user_pages_hard; +unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; + /* * We use a start+len construction, which provides full use of the * allocated memory. @@ -583,20 +589,49 @@ pipe_fasync(int fd, struct file *filp, int on) return retval; } +static void account_pipe_buffers(struct pipe_inode_info *pipe, + unsigned long old, unsigned long new) +{ + atomic_long_add(new - old, &pipe->user->pipe_bufs); +} + +static bool too_many_pipe_buffers_soft(struct user_struct *user) +{ + return pipe_user_pages_soft && + atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_soft; +} + +static bool too_many_pipe_buffers_hard(struct user_struct *user) +{ + return pipe_user_pages_hard && + atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_hard; +} + struct pipe_inode_info *alloc_pipe_info(void) { struct pipe_inode_info *pipe; pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); if (pipe) { - pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL); + unsigned long pipe_bufs = PIPE_DEF_BUFFERS; + struct user_struct *user = get_current_user(); + + if (!too_many_pipe_buffers_hard(user)) { + if (too_many_pipe_buffers_soft(user)) + pipe_bufs = 1; + pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * pipe_bufs, GFP_KERNEL); + } + if (pipe->bufs) { init_waitqueue_head(&pipe->wait); pipe->r_counter = pipe->w_counter = 1; - pipe->buffers = PIPE_DEF_BUFFERS; + pipe->buffers = pipe_bufs; + pipe->user = user; + account_pipe_buffers(pipe, 0, pipe_bufs); mutex_init(&pipe->mutex); return pipe; } + free_uid(user); kfree(pipe); } @@ -607,6 +642,8 @@ void free_pipe_info(struct pipe_inode_info *pipe) { int i; + account_pipe_buffers(pipe, pipe->buffers, 0); + free_uid(pipe->user); for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) @@ -998,6 +1035,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer)); } + account_pipe_buffers(pipe, pipe->buffers, nr_pages); pipe->curbuf = 0; kfree(pipe->bufs); pipe->bufs = bufs; @@ -1069,6 +1107,11 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) { ret = -EPERM; goto out; + } else if ((too_many_pipe_buffers_hard(pipe->user) || + too_many_pipe_buffers_soft(pipe->user)) && + !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; } ret = pipe_set_size(pipe, nr_pages); break; diff --git a/fs/proc/array.c b/fs/proc/array.c index d73291f5f0fc..b6c00ce0e29e 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -395,7 +395,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, state = *get_task_state(task); vsize = eip = esp = 0; - permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT); + permitted = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS | PTRACE_MODE_NOAUDIT); mm = get_task_mm(task); if (mm) { vsize = task_vsize(mm); diff --git a/fs/proc/base.c b/fs/proc/base.c index 2cf5d7e37375..4f764c2ac1a5 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -403,7 +403,7 @@ static const struct file_operations proc_pid_cmdline_ops = { static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ); + struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (mm && !IS_ERR(mm)) { unsigned int nwords = 0; do { @@ -430,7 +430,8 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, wchan = get_wchan(task); - if (wchan && ptrace_may_access(task, PTRACE_MODE_READ) && !lookup_symbol_name(wchan, symname)) + if (wchan && ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS) + && !lookup_symbol_name(wchan, symname)) seq_printf(m, "%s", symname); else seq_putc(m, '0'); @@ -444,7 +445,7 @@ static int lock_trace(struct task_struct *task) int err = mutex_lock_killable(&task->signal->cred_guard_mutex); if (err) return err; - if (!ptrace_may_access(task, PTRACE_MODE_ATTACH)) { + if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) { mutex_unlock(&task->signal->cred_guard_mutex); return -EPERM; } @@ -697,7 +698,7 @@ static int proc_fd_access_allowed(struct inode *inode) */ task = get_proc_task(inode); if (task) { - allowed = ptrace_may_access(task, PTRACE_MODE_READ); + allowed = ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); put_task_struct(task); } return allowed; @@ -732,7 +733,7 @@ static bool has_pid_permissions(struct pid_namespace *pid, return true; if (in_group_p(pid->pid_gid)) return true; - return ptrace_may_access(task, PTRACE_MODE_READ); + return ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); } @@ -809,7 +810,7 @@ struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode) struct mm_struct *mm = ERR_PTR(-ESRCH); if (task) { - mm = mm_access(task, mode); + mm = mm_access(task, mode | PTRACE_MODE_FSCREDS); put_task_struct(task); if (!IS_ERR_OR_NULL(mm)) { @@ -952,6 +953,7 @@ static ssize_t environ_read(struct file *file, char __user *buf, unsigned long src = *ppos; int ret = 0; struct mm_struct *mm = file->private_data; + unsigned long env_start, env_end; if (!mm) return 0; @@ -963,19 +965,25 @@ static ssize_t environ_read(struct file *file, char __user *buf, ret = 0; if (!atomic_inc_not_zero(&mm->mm_users)) goto free; + + down_read(&mm->mmap_sem); + env_start = mm->env_start; + env_end = mm->env_end; + up_read(&mm->mmap_sem); + while (count > 0) { size_t this_len, max_len; int retval; - if (src >= (mm->env_end - mm->env_start)) + if (src >= (env_end - env_start)) break; - this_len = mm->env_end - (mm->env_start + src); + this_len = env_end - (env_start + src); max_len = min_t(size_t, PAGE_SIZE, count); this_len = min(max_len, this_len); - retval = access_remote_vm(mm, (mm->env_start + src), + retval = access_remote_vm(mm, (env_start + src), page, this_len, 0); if (retval <= 0) { @@ -1860,7 +1868,7 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) if (!task) goto out_notask; - mm = mm_access(task, PTRACE_MODE_READ); + mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (IS_ERR_OR_NULL(mm)) goto out; @@ -2013,7 +2021,7 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, goto out; result = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out_put_task; result = -ENOENT; @@ -2066,7 +2074,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) goto out; ret = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out_put_task; ret = 0; @@ -2533,7 +2541,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh if (result) return result; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) { + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { result = -EACCES; goto out_unlock; } diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 92e6726f6e37..a939f5ed7f89 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -552,9 +552,9 @@ static int open_kcore(struct inode *inode, struct file *filp) if (kcore_need_update) kcore_update_ram(); if (i_size_read(inode) != proc_root_kcore->size) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); i_size_write(inode, proc_root_kcore->size); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; } diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 1dece8781f91..276f12431dbf 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -46,7 +46,7 @@ static const char *proc_ns_get_link(struct dentry *dentry, if (!task) return error; - if (ptrace_may_access(task, PTRACE_MODE_READ)) { + if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { error = ns_get_path(&ns_path, task, ns_ops); if (!error) nd_jump_link(&ns_path); @@ -67,7 +67,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl if (!task) return res; - if (ptrace_may_access(task, PTRACE_MODE_READ)) { + if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { res = ns_get_name(name, sizeof(name), task, ns_ops); if (res >= 0) res = readlink_copy(buffer, buflen, name); diff --git a/fs/proc/self.c b/fs/proc/self.c index 67e8db442cf0..b6a8d3529fea 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -50,7 +50,7 @@ int proc_setup_self(struct super_block *s) struct pid_namespace *ns = s->s_fs_info; struct dentry *self; - mutex_lock(&root_inode->i_mutex); + inode_lock(root_inode); self = d_alloc_name(s->s_root, "self"); if (self) { struct inode *inode = new_inode_pseudo(s); @@ -69,7 +69,7 @@ int proc_setup_self(struct super_block *s) } else { self = ERR_PTR(-ENOMEM); } - mutex_unlock(&root_inode->i_mutex); + inode_unlock(root_inode); if (IS_ERR(self)) { pr_err("proc_fill_super: can't allocate /proc/self\n"); return PTR_ERR(self); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 65a1b6c69c11..85d16c67c33e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -468,7 +468,7 @@ struct mem_size_stats { static void smaps_account(struct mem_size_stats *mss, struct page *page, bool compound, bool young, bool dirty) { - int i, nr = compound ? HPAGE_PMD_NR : 1; + int i, nr = compound ? 1 << compound_order(page) : 1; unsigned long size = nr * PAGE_SIZE; if (PageAnon(page)) @@ -602,7 +602,8 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma, &ptl)) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { smaps_pmd_entry(pmd, addr, walk); spin_unlock(ptl); return 0; @@ -913,7 +914,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, spinlock_t *ptl; struct page *page; - if (pmd_trans_huge_lock(pmd, vma, &ptl)) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty_pmd(vma, addr, pmd); goto out; @@ -1187,7 +1189,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, int err = 0; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (pmd_trans_huge_lock(pmdp, vma, &ptl)) { + ptl = pmd_trans_huge_lock(pmdp, vma); + if (ptl) { u64 flags = 0, frame = 0; pmd_t pmd = *pmdp; @@ -1519,7 +1522,8 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, pte_t *orig_pte; pte_t *pte; - if (pmd_trans_huge_lock(pmd, vma, &ptl)) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { pte_t huge_pte = *(pte_t *)pmd; struct page *page; diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index 9eacd59e0360..e58a31e8fb2a 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -52,7 +52,7 @@ int proc_setup_thread_self(struct super_block *s) struct pid_namespace *ns = s->s_fs_info; struct dentry *thread_self; - mutex_lock(&root_inode->i_mutex); + inode_lock(root_inode); thread_self = d_alloc_name(s->s_root, "thread-self"); if (thread_self) { struct inode *inode = new_inode_pseudo(s); @@ -71,7 +71,7 @@ int proc_setup_thread_self(struct super_block *s) } else { thread_self = ERR_PTR(-ENOMEM); } - mutex_unlock(&root_inode->i_mutex); + inode_unlock(root_inode); if (IS_ERR(thread_self)) { pr_err("proc_fill_super: can't allocate /proc/thread_self\n"); return PTR_ERR(thread_self); diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index d8c439d813ce..dc645b66cd79 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -377,7 +377,7 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, break; } - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); dentry = d_alloc_name(root, name); if (!dentry) @@ -397,12 +397,12 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, list_add(&private->list, &allpstore); spin_unlock_irqrestore(&allpstore_lock, flags); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); return 0; fail_lockedalloc: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); kfree(private); fail_alloc: iput(inode); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index fbd70af98820..3c3b81bb6dfe 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -682,9 +682,9 @@ int dquot_quota_sync(struct super_block *sb, int type) continue; if (!sb_has_quota_active(sb, cnt)) continue; - mutex_lock(&dqopt->files[cnt]->i_mutex); + inode_lock(dqopt->files[cnt]); truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); - mutex_unlock(&dqopt->files[cnt]->i_mutex); + inode_unlock(dqopt->files[cnt]); } mutex_unlock(&dqopt->dqonoff_mutex); @@ -2162,12 +2162,12 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) /* If quota was reenabled in the meantime, we have * nothing to do */ if (!sb_has_quota_loaded(sb, cnt)) { - mutex_lock(&toputinode[cnt]->i_mutex); + inode_lock(toputinode[cnt]); toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | S_NOATIME | S_NOQUOTA); truncate_inode_pages(&toputinode[cnt]->i_data, 0); - mutex_unlock(&toputinode[cnt]->i_mutex); + inode_unlock(toputinode[cnt]); mark_inode_dirty_sync(toputinode[cnt]); } mutex_unlock(&dqopt->dqonoff_mutex); @@ -2258,11 +2258,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, /* We don't want quota and atime on quota files (deadlocks * possible) Also nobody should write to the file - we use * special IO operations which ignore the immutable bit. */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA); inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * When S_NOQUOTA is set, remove dquot references as no more * references can be added @@ -2305,12 +2305,12 @@ out_file_init: iput(inode); out_lock: if (oldflags != -1) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Set the flags back (in the case of accidental quotaon() * on a wrong file we don't want to mess up the flags) */ inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); inode->i_flags |= oldflags; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } mutex_unlock(&dqopt->dqonoff_mutex); out_fmt: @@ -2430,9 +2430,9 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name, struct dentry *dentry; int error; - mutex_lock(&d_inode(sb->s_root)->i_mutex); + inode_lock(d_inode(sb->s_root)); dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name)); - mutex_unlock(&d_inode(sb->s_root)->i_mutex); + inode_unlock(d_inode(sb->s_root)); if (IS_ERR(dentry)) return PTR_ERR(dentry); diff --git a/fs/read_write.c b/fs/read_write.c index 06b07d5a08fe..324ec271cc4e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -238,7 +238,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file_inode(file); loff_t retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (whence) { case SEEK_END: offset += i_size_read(inode); @@ -283,7 +283,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int whence) retval = offset; } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return retval; } EXPORT_SYMBOL(default_llseek); @@ -1656,6 +1656,9 @@ next_file: mnt_drop_write_file(dst_file); next_loop: fdput(dst_fd); + + if (fatal_signal_pending(current)) + goto out; } out: diff --git a/fs/readdir.c b/fs/readdir.c index ced679179cac..e69ef3b79787 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -44,7 +44,7 @@ int iterate_dir(struct file *file, struct dir_context *ctx) fsnotify_access(file); file_accessed(file); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out: return res; } diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 4a024e2ceb9f..3abd4004184b 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -38,11 +38,11 @@ static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end, if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); reiserfs_write_lock(inode->i_sb); err = reiserfs_commit_for_inode(inode); reiserfs_write_unlock(inode->i_sb); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (err < 0) return err; return 0; diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 96a1bcf33db4..9424a4ba93a9 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -158,7 +158,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end, if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); BUG_ON(!S_ISREG(inode->i_mode)); err = sync_mapping_buffers(inode->i_mapping); reiserfs_write_lock(inode->i_sb); @@ -166,7 +166,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end, reiserfs_write_unlock(inode->i_sb); if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (barrier_done < 0) return barrier_done; return (err < 0) ? -EIO : 0; diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 6ec8a30a0911..036a1fc0a8c3 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -224,7 +224,7 @@ out_unlock: page_cache_release(page); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); reiserfs_write_unlock(inode->i_sb); return retval; } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 05db7473bcb5..c0306ec8ed7b 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -288,7 +288,7 @@ static int finish_unfinished(struct super_block *s) pathrelse(&path); inode = reiserfs_iget(s, &obj_key); - if (!inode) { + if (IS_ERR_OR_NULL(inode)) { /* * the unlink almost completed, it just did not * manage to remove "save" link and release objectid diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index e5ddb4e5ea94..57e0b2310532 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -64,14 +64,14 @@ #ifdef CONFIG_REISERFS_FS_XATTR static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) { - BUG_ON(!mutex_is_locked(&dir->i_mutex)); + BUG_ON(!inode_is_locked(dir)); return dir->i_op->create(dir, dentry, mode, true); } #endif static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { - BUG_ON(!mutex_is_locked(&dir->i_mutex)); + BUG_ON(!inode_is_locked(dir)); return dir->i_op->mkdir(dir, dentry, mode); } @@ -85,11 +85,11 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry) { int error; - BUG_ON(!mutex_is_locked(&dir->i_mutex)); + BUG_ON(!inode_is_locked(dir)); - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); error = dir->i_op->unlink(dir, dentry); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); if (!error) d_delete(dentry); @@ -100,13 +100,13 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry) { int error; - BUG_ON(!mutex_is_locked(&dir->i_mutex)); + BUG_ON(!inode_is_locked(dir)); - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); error = dir->i_op->rmdir(dir, dentry); if (!error) d_inode(dentry)->i_flags |= S_DEAD; - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); if (!error) d_delete(dentry); @@ -123,7 +123,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags) if (d_really_is_negative(privroot)) return ERR_PTR(-ENODATA); - mutex_lock_nested(&d_inode(privroot)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(privroot), I_MUTEX_XATTR); xaroot = dget(REISERFS_SB(sb)->xattr_root); if (!xaroot) @@ -139,7 +139,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags) } } - mutex_unlock(&d_inode(privroot)->i_mutex); + inode_unlock(d_inode(privroot)); return xaroot; } @@ -156,7 +156,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags) le32_to_cpu(INODE_PKEY(inode)->k_objectid), inode->i_generation); - mutex_lock_nested(&d_inode(xaroot)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(xaroot), I_MUTEX_XATTR); xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf)); if (!IS_ERR(xadir) && d_really_is_negative(xadir)) { @@ -170,7 +170,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags) } } - mutex_unlock(&d_inode(xaroot)->i_mutex); + inode_unlock(d_inode(xaroot)); dput(xaroot); return xadir; } @@ -195,7 +195,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, container_of(ctx, struct reiserfs_dentry_buf, ctx); struct dentry *dentry; - WARN_ON_ONCE(!mutex_is_locked(&d_inode(dbuf->xadir)->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(d_inode(dbuf->xadir))); if (dbuf->count == ARRAY_SIZE(dbuf->dentries)) return -ENOSPC; @@ -254,7 +254,7 @@ static int reiserfs_for_each_xattr(struct inode *inode, goto out_dir; } - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(dir), I_MUTEX_XATTR); buf.xadir = dir; while (1) { @@ -276,7 +276,7 @@ static int reiserfs_for_each_xattr(struct inode *inode, break; buf.count = 0; } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); cleanup_dentry_buf(&buf); @@ -298,13 +298,13 @@ static int reiserfs_for_each_xattr(struct inode *inode, if (!err) { int jerror; - mutex_lock_nested(&d_inode(dir->d_parent)->i_mutex, + inode_lock_nested(d_inode(dir->d_parent), I_MUTEX_XATTR); err = action(dir, data); reiserfs_write_lock(inode->i_sb); jerror = journal_end(&th); reiserfs_write_unlock(inode->i_sb); - mutex_unlock(&d_inode(dir->d_parent)->i_mutex); + inode_unlock(d_inode(dir->d_parent)); err = jerror ?: err; } } @@ -384,7 +384,7 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name, if (IS_ERR(xadir)) return ERR_CAST(xadir); - mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR); xafile = lookup_one_len(name, xadir, strlen(name)); if (IS_ERR(xafile)) { err = PTR_ERR(xafile); @@ -404,7 +404,7 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name, if (err) dput(xafile); out: - mutex_unlock(&d_inode(xadir)->i_mutex); + inode_unlock(d_inode(xadir)); dput(xadir); if (err) return ERR_PTR(err); @@ -469,7 +469,7 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name) if (IS_ERR(xadir)) return PTR_ERR(xadir); - mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR); dentry = lookup_one_len(name, xadir, strlen(name)); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); @@ -483,7 +483,7 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name) dput(dentry); out_dput: - mutex_unlock(&d_inode(xadir)->i_mutex); + inode_unlock(d_inode(xadir)); dput(xadir); return err; } @@ -580,11 +580,11 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, .ia_valid = ATTR_SIZE | ATTR_CTIME, }; - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(dentry), I_MUTEX_XATTR); inode_dio_wait(d_inode(dentry)); err = reiserfs_setattr(dentry, &newattrs); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); } else update_ctime(inode); out_unlock: @@ -888,9 +888,9 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) goto out; } - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(dir), I_MUTEX_XATTR); err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); if (!err) err = buf.pos; @@ -905,7 +905,7 @@ static int create_privroot(struct dentry *dentry) int err; struct inode *inode = d_inode(dentry->d_parent); - WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(inode)); err = xattr_mkdir(inode, dentry, 0700); if (err || d_really_is_negative(dentry)) { @@ -995,7 +995,7 @@ int reiserfs_lookup_privroot(struct super_block *s) int err = 0; /* If we don't have the privroot located yet - go find it */ - mutex_lock(&d_inode(s->s_root)->i_mutex); + inode_lock(d_inode(s->s_root)); dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, strlen(PRIVROOT_NAME)); if (!IS_ERR(dentry)) { @@ -1005,7 +1005,7 @@ int reiserfs_lookup_privroot(struct super_block *s) d_inode(dentry)->i_flags |= S_PRIVATE; } else err = PTR_ERR(dentry); - mutex_unlock(&d_inode(s->s_root)->i_mutex); + inode_unlock(d_inode(s->s_root)); return err; } @@ -1025,14 +1025,14 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) goto error; if (d_really_is_negative(privroot) && !(mount_flags & MS_RDONLY)) { - mutex_lock(&d_inode(s->s_root)->i_mutex); + inode_lock(d_inode(s->s_root)); err = create_privroot(REISERFS_SB(s)->priv_root); - mutex_unlock(&d_inode(s->s_root)->i_mutex); + inode_unlock(d_inode(s->s_root)); } if (d_really_is_positive(privroot)) { s->s_xattr = reiserfs_xattr_handlers; - mutex_lock(&d_inode(privroot)->i_mutex); + inode_lock(d_inode(privroot)); if (!REISERFS_SB(s)->xattr_root) { struct dentry *dentry; @@ -1043,7 +1043,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) else err = PTR_ERR(dentry); } - mutex_unlock(&d_inode(privroot)->i_mutex); + inode_unlock(d_inode(privroot)); } error: diff --git a/fs/timerfd.c b/fs/timerfd.c index b94fa6c3c6eb..053818dd6c18 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -153,7 +153,7 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) if (isalarm(ctx)) remaining = alarm_expires_remaining(&ctx->t.alarm); else - remaining = hrtimer_expires_remaining(&ctx->t.tmr); + remaining = hrtimer_expires_remaining_adjusted(&ctx->t.tmr); return remaining.tv64 < 0 ? ktime_set(0, 0): remaining; } diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index c66f2423e1f5..4a0e48f92104 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -84,9 +84,9 @@ static int tracefs_syscall_mkdir(struct inode *inode, struct dentry *dentry, umo * the files within the tracefs system. It is up to the individual * mkdir routine to handle races. */ - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ret = tracefs_ops.mkdir(name); - mutex_lock(&inode->i_mutex); + inode_lock(inode); kfree(name); @@ -109,13 +109,13 @@ static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry) * This time we need to unlock not only the parent (inode) but * also the directory that is being deleted. */ - mutex_unlock(&inode->i_mutex); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(inode); + inode_unlock(dentry->d_inode); ret = tracefs_ops.rmdir(name); - mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock_nested(inode, I_MUTEX_PARENT); + inode_lock(dentry->d_inode); kfree(name); @@ -334,7 +334,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) if (!parent) parent = tracefs_mount->mnt_root; - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry) && dentry->d_inode) { dput(dentry); @@ -342,7 +342,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) } if (IS_ERR(dentry)) { - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); simple_release_fs(&tracefs_mount, &tracefs_mount_count); } @@ -351,7 +351,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) static struct dentry *failed_creating(struct dentry *dentry) { - mutex_unlock(&dentry->d_parent->d_inode->i_mutex); + inode_unlock(dentry->d_parent->d_inode); dput(dentry); simple_release_fs(&tracefs_mount, &tracefs_mount_count); return NULL; @@ -359,7 +359,7 @@ static struct dentry *failed_creating(struct dentry *dentry) static struct dentry *end_creating(struct dentry *dentry) { - mutex_unlock(&dentry->d_parent->d_inode->i_mutex); + inode_unlock(dentry->d_parent->d_inode); return dentry; } @@ -544,9 +544,9 @@ void tracefs_remove(struct dentry *dentry) if (!parent || !parent->d_inode) return; - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); ret = __tracefs_remove(dentry, parent); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); if (!ret) simple_release_fs(&tracefs_mount, &tracefs_mount_count); } @@ -572,7 +572,7 @@ void tracefs_remove_recursive(struct dentry *dentry) parent = dentry; down: - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); loop: /* * The parent->d_subdirs is protected by the d_lock. Outside that @@ -587,7 +587,7 @@ void tracefs_remove_recursive(struct dentry *dentry) /* perhaps simple_empty(child) makes more sense */ if (!list_empty(&child->d_subdirs)) { spin_unlock(&parent->d_lock); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); parent = child; goto down; } @@ -608,10 +608,10 @@ void tracefs_remove_recursive(struct dentry *dentry) } spin_unlock(&parent->d_lock); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); child = parent; parent = parent->d_parent; - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); if (child != dentry) /* go up */ @@ -619,7 +619,7 @@ void tracefs_remove_recursive(struct dentry *dentry) if (!__tracefs_remove(child, parent)) simple_release_fs(&tracefs_mount, &tracefs_mount_count); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); } /** diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index e49bd2808bf3..795992a8321e 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -515,8 +515,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, dbg_gen("dent '%pd' to ino %lu (nlink %d) in dir ino %lu", dentry, inode->i_ino, inode->i_nlink, dir->i_ino); - ubifs_assert(mutex_is_locked(&dir->i_mutex)); - ubifs_assert(mutex_is_locked(&inode->i_mutex)); + ubifs_assert(inode_is_locked(dir)); + ubifs_assert(inode_is_locked(inode)); err = dbg_check_synced_i_size(c, inode); if (err) @@ -572,8 +572,8 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) dbg_gen("dent '%pd' from ino %lu (nlink %d) in dir ino %lu", dentry, inode->i_ino, inode->i_nlink, dir->i_ino); - ubifs_assert(mutex_is_locked(&dir->i_mutex)); - ubifs_assert(mutex_is_locked(&inode->i_mutex)); + ubifs_assert(inode_is_locked(dir)); + ubifs_assert(inode_is_locked(inode)); err = dbg_check_synced_i_size(c, inode); if (err) return err; @@ -661,8 +661,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) dbg_gen("directory '%pd', ino %lu in dir ino %lu", dentry, inode->i_ino, dir->i_ino); - ubifs_assert(mutex_is_locked(&dir->i_mutex)); - ubifs_assert(mutex_is_locked(&inode->i_mutex)); + ubifs_assert(inode_is_locked(dir)); + ubifs_assert(inode_is_locked(inode)); err = check_dir_empty(c, d_inode(dentry)); if (err) return err; @@ -996,10 +996,10 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu", old_dentry, old_inode->i_ino, old_dir->i_ino, new_dentry, new_dir->i_ino); - ubifs_assert(mutex_is_locked(&old_dir->i_mutex)); - ubifs_assert(mutex_is_locked(&new_dir->i_mutex)); + ubifs_assert(inode_is_locked(old_dir)); + ubifs_assert(inode_is_locked(new_dir)); if (unlink) - ubifs_assert(mutex_is_locked(&new_inode->i_mutex)); + ubifs_assert(inode_is_locked(new_inode)); if (unlink && is_dir) { diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index eff62801acbf..065c88f8e4b8 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1317,7 +1317,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) err = filemap_write_and_wait_range(inode->i_mapping, start, end); if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Synchronize the inode unless this is a 'datasync()' call. */ if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) { @@ -1332,7 +1332,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) */ err = ubifs_sync_wbufs_by_inode(c, inode); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index e53292d0c21b..c7f4d434d098 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -313,7 +313,7 @@ static int setxattr(struct inode *host, const char *name, const void *value, union ubifs_key key; int err, type; - ubifs_assert(mutex_is_locked(&host->i_mutex)); + ubifs_assert(inode_is_locked(host)); if (size > UBIFS_MAX_INO_DATA) return -ERANGE; @@ -550,7 +550,7 @@ int ubifs_removexattr(struct dentry *dentry, const char *name) dbg_gen("xattr '%s', ino %lu ('%pd')", name, host->i_ino, dentry); - ubifs_assert(mutex_is_locked(&host->i_mutex)); + ubifs_assert(inode_is_locked(host)); err = check_namespace(&nm); if (err < 0) diff --git a/fs/udf/file.c b/fs/udf/file.c index bddf3d071dae..1af98963d860 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -122,7 +122,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct udf_inode_info *iinfo = UDF_I(inode); int err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = generic_write_checks(iocb, from); if (retval <= 0) @@ -136,7 +136,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) (udf_file_entry_alloc_offset(inode) + end)) { err = udf_expand_file_adinicb(inode); if (err) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); udf_debug("udf_expand_adinicb: err=%d\n", err); return err; } @@ -149,7 +149,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) retval = __generic_file_write_iter(iocb, from); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (retval > 0) { mark_inode_dirty(inode); @@ -223,12 +223,12 @@ static int udf_release_file(struct inode *inode, struct file *filp) * Grab i_mutex to avoid races with writes changing i_size * while we are running. */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); down_write(&UDF_I(inode)->i_data_sem); udf_discard_prealloc(inode); udf_truncate_tail_extent(inode); up_write(&UDF_I(inode)->i_data_sem); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 87dc16d15572..166d3ed32c39 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -262,7 +262,7 @@ int udf_expand_file_adinicb(struct inode *inode) .nr_to_write = 1, }; - WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(inode)); if (!iinfo->i_lenAlloc) { if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; diff --git a/fs/udf/super.c b/fs/udf/super.c index 0fbb4c7c72e8..a522c15a0bfd 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -279,17 +279,12 @@ static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) { int i; int nr_groups = bitmap->s_nr_groups; - int size = sizeof(struct udf_bitmap) + (sizeof(struct buffer_head *) * - nr_groups); for (i = 0; i < nr_groups; i++) if (bitmap->s_block_bitmap[i]) brelse(bitmap->s_block_bitmap[i]); - if (size <= PAGE_SIZE) - kfree(bitmap); - else - vfree(bitmap); + kvfree(bitmap); } static void udf_free_partition(struct udf_part_map *map) diff --git a/fs/utimes.c b/fs/utimes.c index aa138d64560a..85c40f4f373d 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -103,9 +103,9 @@ static int utimes_common(struct path *path, struct timespec *times) } } retry_deleg: - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = notify_change(path->dentry, &newattrs, &delegated_inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) diff --git a/fs/xattr.c b/fs/xattr.c index d5dd6c8b82a7..07d0e47f6a7f 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -129,7 +129,7 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (error) return error; - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = security_inode_setxattr(dentry, name, value, size, flags); if (error) goto out; @@ -137,7 +137,7 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value, error = __vfs_setxattr_noperm(dentry, name, value, size, flags); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } EXPORT_SYMBOL_GPL(vfs_setxattr); @@ -277,7 +277,7 @@ vfs_removexattr(struct dentry *dentry, const char *name) if (error) return error; - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = security_inode_removexattr(dentry, name); if (error) goto out; @@ -290,7 +290,7 @@ vfs_removexattr(struct dentry *dentry, const char *name) } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } EXPORT_SYMBOL_GPL(vfs_removexattr); diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index e2536bb1c760..dc97eb21af07 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -984,8 +984,6 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) /* * Values for di_flags - * There should be a one-to-one correspondence between these flags and the - * XFS_XFLAG_s. */ #define XFS_DIFLAG_REALTIME_BIT 0 /* file's blocks come from rt area */ #define XFS_DIFLAG_PREALLOC_BIT 1 /* file space has been preallocated */ @@ -1026,6 +1024,15 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM) /* + * Values for di_flags2 These start by being exposed to userspace in the upper + * 16 bits of the XFS_XFLAG_s range. + */ +#define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */ +#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) + +#define XFS_DIFLAG2_ANY (XFS_DIFLAG2_DAX) + +/* * Inode number format: * low inopblog bits - offset in block * next agblklog bits - block number in ag diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index b2b73a998d42..fffe3d01bd9f 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -36,40 +36,6 @@ struct dioattr { #endif /* - * Structure for XFS_IOC_FSGETXATTR[A] and XFS_IOC_FSSETXATTR. - */ -#ifndef HAVE_FSXATTR -struct fsxattr { - __u32 fsx_xflags; /* xflags field value (get/set) */ - __u32 fsx_extsize; /* extsize field value (get/set)*/ - __u32 fsx_nextents; /* nextents field value (get) */ - __u32 fsx_projid; /* project identifier (get/set) */ - unsigned char fsx_pad[12]; -}; -#endif - -/* - * Flags for the bs_xflags/fsx_xflags field - * There should be a one-to-one correspondence between these flags and the - * XFS_DIFLAG_s. - */ -#define XFS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */ -#define XFS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */ -#define XFS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ -#define XFS_XFLAG_APPEND 0x00000010 /* all writes append */ -#define XFS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ -#define XFS_XFLAG_NOATIME 0x00000040 /* do not update access time */ -#define XFS_XFLAG_NODUMP 0x00000080 /* do not include in backups */ -#define XFS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */ -#define XFS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ -#define XFS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */ -#define XFS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ -#define XFS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ -#define XFS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ -#define XFS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ -#define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ - -/* * Structure for XFS_IOC_GETBMAP. * On input, fill in bmv_offset and bmv_length of the first structure * to indicate the area of interest in the file, and bmv_entries with @@ -514,8 +480,8 @@ typedef struct xfs_swapext #define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64) #define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64) #define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr) -#define XFS_IOC_FSGETXATTR _IOR ('X', 31, struct fsxattr) -#define XFS_IOC_FSSETXATTR _IOW ('X', 32, struct fsxattr) +#define XFS_IOC_FSGETXATTR FS_IOC_FSGETXATTR +#define XFS_IOC_FSSETXATTR FS_IOC_FSSETXATTR #define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64) #define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64) #define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index daed4bfb85b2..435c7de42e5f 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1527,6 +1527,16 @@ xfs_wait_buftarg( LIST_HEAD(dispose); int loop = 0; + /* + * We need to flush the buffer workqueue to ensure that all IO + * completion processing is 100% done. Just waiting on buffer locks is + * not sufficient for async IO as the reference count held over IO is + * not released until after the buffer lock is dropped. Hence we need to + * ensure here that all reference counts have been dropped before we + * start walking the LRU list. + */ + drain_workqueue(btp->bt_mount->m_buf_workqueue); + /* loop until there is nothing left on the lru list. */ while (list_lru_count(&btp->bt_lru)) { list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index ebe9b8290a70..52883ac3cf84 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -55,7 +55,7 @@ xfs_rw_ilock( int type) { if (type & XFS_IOLOCK_EXCL) - mutex_lock(&VFS_I(ip)->i_mutex); + inode_lock(VFS_I(ip)); xfs_ilock(ip, type); } @@ -66,7 +66,7 @@ xfs_rw_iunlock( { xfs_iunlock(ip, type); if (type & XFS_IOLOCK_EXCL) - mutex_unlock(&VFS_I(ip)->i_mutex); + inode_unlock(VFS_I(ip)); } static inline void @@ -76,7 +76,7 @@ xfs_rw_ilock_demote( { xfs_ilock_demote(ip, type); if (type & XFS_IOLOCK_EXCL) - mutex_unlock(&VFS_I(ip)->i_mutex); + inode_unlock(VFS_I(ip)); } /* @@ -1610,9 +1610,8 @@ xfs_filemap_pmd_fault( /* * pfn_mkwrite was originally inteneded to ensure we capture time stamp * updates on write faults. In reality, it's need to serialise against - * truncate similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite() - * here and cycle the XFS_MMAPLOCK_SHARED to ensure we serialise the fault - * barrier in place. + * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED + * to ensure we serialise the fault barrier in place. */ static int xfs_filemap_pfn_mkwrite( @@ -1635,6 +1634,8 @@ xfs_filemap_pfn_mkwrite( size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; + else if (IS_DAX(inode)) + ret = dax_pfn_mkwrite(vma, vmf); xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); sb_end_pagefault(inode->i_sb); return ret; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ae3758a90ed6..ceba1a83cacc 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -610,60 +610,69 @@ __xfs_iflock( STATIC uint _xfs_dic2xflags( - __uint16_t di_flags) + __uint16_t di_flags, + uint64_t di_flags2, + bool has_attr) { uint flags = 0; if (di_flags & XFS_DIFLAG_ANY) { if (di_flags & XFS_DIFLAG_REALTIME) - flags |= XFS_XFLAG_REALTIME; + flags |= FS_XFLAG_REALTIME; if (di_flags & XFS_DIFLAG_PREALLOC) - flags |= XFS_XFLAG_PREALLOC; + flags |= FS_XFLAG_PREALLOC; if (di_flags & XFS_DIFLAG_IMMUTABLE) - flags |= XFS_XFLAG_IMMUTABLE; + flags |= FS_XFLAG_IMMUTABLE; if (di_flags & XFS_DIFLAG_APPEND) - flags |= XFS_XFLAG_APPEND; + flags |= FS_XFLAG_APPEND; if (di_flags & XFS_DIFLAG_SYNC) - flags |= XFS_XFLAG_SYNC; + flags |= FS_XFLAG_SYNC; if (di_flags & XFS_DIFLAG_NOATIME) - flags |= XFS_XFLAG_NOATIME; + flags |= FS_XFLAG_NOATIME; if (di_flags & XFS_DIFLAG_NODUMP) - flags |= XFS_XFLAG_NODUMP; + flags |= FS_XFLAG_NODUMP; if (di_flags & XFS_DIFLAG_RTINHERIT) - flags |= XFS_XFLAG_RTINHERIT; + flags |= FS_XFLAG_RTINHERIT; if (di_flags & XFS_DIFLAG_PROJINHERIT) - flags |= XFS_XFLAG_PROJINHERIT; + flags |= FS_XFLAG_PROJINHERIT; if (di_flags & XFS_DIFLAG_NOSYMLINKS) - flags |= XFS_XFLAG_NOSYMLINKS; + flags |= FS_XFLAG_NOSYMLINKS; if (di_flags & XFS_DIFLAG_EXTSIZE) - flags |= XFS_XFLAG_EXTSIZE; + flags |= FS_XFLAG_EXTSIZE; if (di_flags & XFS_DIFLAG_EXTSZINHERIT) - flags |= XFS_XFLAG_EXTSZINHERIT; + flags |= FS_XFLAG_EXTSZINHERIT; if (di_flags & XFS_DIFLAG_NODEFRAG) - flags |= XFS_XFLAG_NODEFRAG; + flags |= FS_XFLAG_NODEFRAG; if (di_flags & XFS_DIFLAG_FILESTREAM) - flags |= XFS_XFLAG_FILESTREAM; + flags |= FS_XFLAG_FILESTREAM; } + if (di_flags2 & XFS_DIFLAG2_ANY) { + if (di_flags2 & XFS_DIFLAG2_DAX) + flags |= FS_XFLAG_DAX; + } + + if (has_attr) + flags |= FS_XFLAG_HASATTR; + return flags; } uint xfs_ip2xflags( - xfs_inode_t *ip) + struct xfs_inode *ip) { - xfs_icdinode_t *dic = &ip->i_d; + struct xfs_icdinode *dic = &ip->i_d; - return _xfs_dic2xflags(dic->di_flags) | - (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0); + return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip)); } uint xfs_dic2xflags( - xfs_dinode_t *dip) + struct xfs_dinode *dip) { - return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) | - (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0); + return _xfs_dic2xflags(be16_to_cpu(dip->di_flags), + be64_to_cpu(dip->di_flags2), XFS_DFORK_Q(dip)); } /* @@ -862,7 +871,8 @@ xfs_ialloc( case S_IFREG: case S_IFDIR: if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { - uint di_flags = 0; + uint64_t di_flags2 = 0; + uint di_flags = 0; if (S_ISDIR(mode)) { if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) @@ -898,7 +908,11 @@ xfs_ialloc( di_flags |= XFS_DIFLAG_NODEFRAG; if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) di_flags |= XFS_DIFLAG_FILESTREAM; + if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) + di_flags2 |= XFS_DIFLAG2_DAX; + ip->i_d.di_flags |= di_flags; + ip->i_d.di_flags2 |= di_flags2; } /* FALLTHROUGH */ case S_IFLNK: diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index d42738deec6d..478d04e07f95 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -859,25 +859,25 @@ xfs_merge_ioc_xflags( unsigned int xflags = start; if (flags & FS_IMMUTABLE_FL) - xflags |= XFS_XFLAG_IMMUTABLE; + xflags |= FS_XFLAG_IMMUTABLE; else - xflags &= ~XFS_XFLAG_IMMUTABLE; + xflags &= ~FS_XFLAG_IMMUTABLE; if (flags & FS_APPEND_FL) - xflags |= XFS_XFLAG_APPEND; + xflags |= FS_XFLAG_APPEND; else - xflags &= ~XFS_XFLAG_APPEND; + xflags &= ~FS_XFLAG_APPEND; if (flags & FS_SYNC_FL) - xflags |= XFS_XFLAG_SYNC; + xflags |= FS_XFLAG_SYNC; else - xflags &= ~XFS_XFLAG_SYNC; + xflags &= ~FS_XFLAG_SYNC; if (flags & FS_NOATIME_FL) - xflags |= XFS_XFLAG_NOATIME; + xflags |= FS_XFLAG_NOATIME; else - xflags &= ~XFS_XFLAG_NOATIME; + xflags &= ~FS_XFLAG_NOATIME; if (flags & FS_NODUMP_FL) - xflags |= XFS_XFLAG_NODUMP; + xflags |= FS_XFLAG_NODUMP; else - xflags &= ~XFS_XFLAG_NODUMP; + xflags &= ~FS_XFLAG_NODUMP; return xflags; } @@ -945,40 +945,51 @@ xfs_set_diflags( unsigned int xflags) { unsigned int di_flags; + uint64_t di_flags2; /* can't set PREALLOC this way, just preserve it */ di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); - if (xflags & XFS_XFLAG_IMMUTABLE) + if (xflags & FS_XFLAG_IMMUTABLE) di_flags |= XFS_DIFLAG_IMMUTABLE; - if (xflags & XFS_XFLAG_APPEND) + if (xflags & FS_XFLAG_APPEND) di_flags |= XFS_DIFLAG_APPEND; - if (xflags & XFS_XFLAG_SYNC) + if (xflags & FS_XFLAG_SYNC) di_flags |= XFS_DIFLAG_SYNC; - if (xflags & XFS_XFLAG_NOATIME) + if (xflags & FS_XFLAG_NOATIME) di_flags |= XFS_DIFLAG_NOATIME; - if (xflags & XFS_XFLAG_NODUMP) + if (xflags & FS_XFLAG_NODUMP) di_flags |= XFS_DIFLAG_NODUMP; - if (xflags & XFS_XFLAG_NODEFRAG) + if (xflags & FS_XFLAG_NODEFRAG) di_flags |= XFS_DIFLAG_NODEFRAG; - if (xflags & XFS_XFLAG_FILESTREAM) + if (xflags & FS_XFLAG_FILESTREAM) di_flags |= XFS_DIFLAG_FILESTREAM; if (S_ISDIR(ip->i_d.di_mode)) { - if (xflags & XFS_XFLAG_RTINHERIT) + if (xflags & FS_XFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_RTINHERIT; - if (xflags & XFS_XFLAG_NOSYMLINKS) + if (xflags & FS_XFLAG_NOSYMLINKS) di_flags |= XFS_DIFLAG_NOSYMLINKS; - if (xflags & XFS_XFLAG_EXTSZINHERIT) + if (xflags & FS_XFLAG_EXTSZINHERIT) di_flags |= XFS_DIFLAG_EXTSZINHERIT; - if (xflags & XFS_XFLAG_PROJINHERIT) + if (xflags & FS_XFLAG_PROJINHERIT) di_flags |= XFS_DIFLAG_PROJINHERIT; } else if (S_ISREG(ip->i_d.di_mode)) { - if (xflags & XFS_XFLAG_REALTIME) + if (xflags & FS_XFLAG_REALTIME) di_flags |= XFS_DIFLAG_REALTIME; - if (xflags & XFS_XFLAG_EXTSIZE) + if (xflags & FS_XFLAG_EXTSIZE) di_flags |= XFS_DIFLAG_EXTSIZE; } - ip->i_d.di_flags = di_flags; + + /* diflags2 only valid for v3 inodes. */ + if (ip->i_d.di_version < 3) + return; + + di_flags2 = 0; + if (xflags & FS_XFLAG_DAX) + di_flags2 |= XFS_DIFLAG2_DAX; + + ip->i_d.di_flags2 = di_flags2; + } STATIC void @@ -988,22 +999,27 @@ xfs_diflags_to_linux( struct inode *inode = VFS_I(ip); unsigned int xflags = xfs_ip2xflags(ip); - if (xflags & XFS_XFLAG_IMMUTABLE) + if (xflags & FS_XFLAG_IMMUTABLE) inode->i_flags |= S_IMMUTABLE; else inode->i_flags &= ~S_IMMUTABLE; - if (xflags & XFS_XFLAG_APPEND) + if (xflags & FS_XFLAG_APPEND) inode->i_flags |= S_APPEND; else inode->i_flags &= ~S_APPEND; - if (xflags & XFS_XFLAG_SYNC) + if (xflags & FS_XFLAG_SYNC) inode->i_flags |= S_SYNC; else inode->i_flags &= ~S_SYNC; - if (xflags & XFS_XFLAG_NOATIME) + if (xflags & FS_XFLAG_NOATIME) inode->i_flags |= S_NOATIME; else inode->i_flags &= ~S_NOATIME; + if (xflags & FS_XFLAG_DAX) + inode->i_flags |= S_DAX; + else + inode->i_flags &= ~S_DAX; + } static int @@ -1016,11 +1032,11 @@ xfs_ioctl_setattr_xflags( /* Can't change realtime flag if any extents are allocated. */ if ((ip->i_d.di_nextents || ip->i_delayed_blks) && - XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & XFS_XFLAG_REALTIME)) + XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME)) return -EINVAL; /* If realtime flag is set then must have realtime device */ - if (fa->fsx_xflags & XFS_XFLAG_REALTIME) { + if (fa->fsx_xflags & FS_XFLAG_REALTIME) { if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 || (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) return -EINVAL; @@ -1031,7 +1047,7 @@ xfs_ioctl_setattr_xflags( * we have appropriate permission. */ if (((ip->i_d.di_flags & (XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND)) || - (fa->fsx_xflags & (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) && + (fa->fsx_xflags & (FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND))) && !capable(CAP_LINUX_IMMUTABLE)) return -EPERM; @@ -1095,8 +1111,8 @@ out_cancel: * extent size hint validation is somewhat cumbersome. Rules are: * * 1. extent size hint is only valid for directories and regular files - * 2. XFS_XFLAG_EXTSIZE is only valid for regular files - * 3. XFS_XFLAG_EXTSZINHERIT is only valid for directories. + * 2. FS_XFLAG_EXTSIZE is only valid for regular files + * 3. FS_XFLAG_EXTSZINHERIT is only valid for directories. * 4. can only be changed on regular files if no extents are allocated * 5. can be changed on directories at any time * 6. extsize hint of 0 turns off hints, clears inode flags. @@ -1112,10 +1128,10 @@ xfs_ioctl_setattr_check_extsize( { struct xfs_mount *mp = ip->i_mount; - if ((fa->fsx_xflags & XFS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode)) + if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode)) return -EINVAL; - if ((fa->fsx_xflags & XFS_XFLAG_EXTSZINHERIT) && + if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) && !S_ISDIR(ip->i_d.di_mode)) return -EINVAL; @@ -1132,7 +1148,7 @@ xfs_ioctl_setattr_check_extsize( return -EINVAL; if (XFS_IS_REALTIME_INODE(ip) || - (fa->fsx_xflags & XFS_XFLAG_REALTIME)) { + (fa->fsx_xflags & FS_XFLAG_REALTIME)) { size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog; } else { size = mp->m_sb.sb_blocksize; @@ -1143,7 +1159,7 @@ xfs_ioctl_setattr_check_extsize( if (fa->fsx_extsize % size) return -EINVAL; } else - fa->fsx_xflags &= ~(XFS_XFLAG_EXTSIZE | XFS_XFLAG_EXTSZINHERIT); + fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT); return 0; } @@ -1168,7 +1184,7 @@ xfs_ioctl_setattr_check_projid( if (xfs_get_projid(ip) != fa->fsx_projid) return -EINVAL; - if ((fa->fsx_xflags & XFS_XFLAG_PROJINHERIT) != + if ((fa->fsx_xflags & FS_XFLAG_PROJINHERIT) != (ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)) return -EINVAL; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 06eafafe636e..76b71a1c6c32 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1205,8 +1205,8 @@ xfs_diflags_to_iflags( inode->i_flags |= S_SYNC; if (flags & XFS_DIFLAG_NOATIME) inode->i_flags |= S_NOATIME; - /* XXX: Also needs an on-disk per inode flag! */ - if (ip->i_mount->m_flags & XFS_MOUNT_DAX) + if (ip->i_mount->m_flags & XFS_MOUNT_DAX || + ip->i_d.di_flags2 & XFS_DIFLAG2_DAX) inode->i_flags |= S_DAX; } diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index dc6221942b85..ade236e90bb3 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -42,11 +42,11 @@ xfs_break_layouts( while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { xfs_iunlock(ip, *iolock); if (with_imutex && (*iolock & XFS_IOLOCK_EXCL)) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); error = break_layout(inode, true); *iolock = XFS_IOLOCK_EXCL; if (with_imutex) - mutex_lock(&inode->i_mutex); + inode_lock(inode); xfs_ilock(ip, *iolock); } diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index aa67339b9537..4f18fd92ca13 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -497,7 +497,6 @@ xfsaild( long tout = 0; /* milliseconds */ current->flags |= PF_MEMALLOC; - set_freezable(); while (!kthread_should_stop()) { if (tout && tout <= 20) |