diff options
Diffstat (limited to 'fs')
36 files changed, 686 insertions, 322 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7fa9bb79ad08..89422aa8e9d1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3164,6 +3164,7 @@ int __cold open_ctree(struct super_block *sb, /* do not make disk changes in broken FS or nologreplay is given */ if (btrfs_super_log_root(disk_super) != 0 && !btrfs_test_opt(fs_info, NOLOGREPLAY)) { + btrfs_info(fs_info, "start tree-log replay"); ret = btrfs_replay_log(fs_info, fs_devices); if (ret) { err = ret; diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 6f417ff68980..bd6229fb2b6f 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -237,6 +237,17 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) struct extent_map *merge = NULL; struct rb_node *rb; + /* + * We can't modify an extent map that is in the tree and that is being + * used by another task, as it can cause that other task to see it in + * inconsistent state during the merging. We always have 1 reference for + * the tree and 1 for this task (which is unpinning the extent map or + * clearing the logging flag), so anything > 2 means it's being used by + * other tasks too. + */ + if (refcount_read(&em->refs) > 2) + return; + if (em->start != 0) { rb = rb_prev(&em->rb_node); if (rb) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5b3ec93ff911..7d26b4bfb2c6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4085,6 +4085,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u64 bytes_deleted = 0; bool be_nice = false; bool should_throttle = false; + const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); + struct extent_state *cached_state = NULL; BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); @@ -4101,6 +4103,9 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, return -ENOMEM; path->reada = READA_BACK; + lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, + &cached_state); + /* * We want to drop from the next block forward in case this new size is * not block aligned since we will be keeping the last block of the @@ -4367,6 +4372,9 @@ out: btrfs_ordered_update_i_size(inode, last_size, NULL); } + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, + &cached_state); + btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index b57f3618e58e..454a1015d026 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -744,6 +744,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, */ be = add_block_entry(fs_info, bytenr, num_bytes, ref_root); if (IS_ERR(be)) { + kfree(ref); kfree(ra); ret = PTR_ERR(be); goto out; @@ -757,6 +758,8 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, "re-allocated a block that still has references to it!"); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); + kfree(ref); + kfree(ra); goto out_unlock; } @@ -819,6 +822,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, "dropping a ref for a existing root that doesn't have a ref on the block"); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); + kfree(ref); kfree(ra); goto out_unlock; } @@ -834,6 +838,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, "attempting to add another ref for an existing ref on a tree block"); dump_block_entry(fs_info, be); dump_ref_action(fs_info, ra); + kfree(ref); kfree(ra); goto out_unlock; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0616a5434793..67c63858812a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1834,6 +1834,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } if (btrfs_super_log_root(fs_info->super_copy) != 0) { + btrfs_warn(fs_info, + "mount required to replay tree-log, cannot remount read-write"); ret = -EINVAL; goto restore; } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 7436422194da..3c10e78924d0 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -901,6 +901,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) { + if (fs_devs->devinfo_kobj) { + kobject_del(fs_devs->devinfo_kobj); + kobject_put(fs_devs->devinfo_kobj); + fs_devs->devinfo_kobj = NULL; + } + if (fs_devs->devices_kobj) { kobject_del(fs_devs->devices_kobj); kobject_put(fs_devs->devices_kobj); @@ -1289,7 +1295,7 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, init_completion(&dev->kobj_unregister); error = kobject_init_and_add(&dev->devid_kobj, &devid_ktype, - fs_devices->devices_kobj, "%llu", + fs_devices->devinfo_kobj, "%llu", dev->devid); if (error) { kobject_put(&dev->devid_kobj); @@ -1369,6 +1375,15 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs) return -ENOMEM; } + fs_devs->devinfo_kobj = kobject_create_and_add("devinfo", + &fs_devs->fsid_kobj); + if (!fs_devs->devinfo_kobj) { + btrfs_err(fs_devs->fs_info, + "failed to init sysfs devinfo kobject"); + btrfs_sysfs_remove_fsid(fs_devs); + return -ENOMEM; + } + return 0; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 409f4816fb89..f01552a0785e 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -258,6 +258,7 @@ struct btrfs_fs_devices { /* sysfs kobjects */ struct kobject fsid_kobj; struct kobject *devices_kobj; + struct kobject *devinfo_kobj; struct completion kobj_unregister; }; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index c3b8e8e0bf17..7e0190b1f821 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1418,6 +1418,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) struct ceph_cap_flush *prealloc_cf; ssize_t count, written = 0; int err, want, got; + bool direct_lock = false; loff_t pos; loff_t limit = max(i_size_read(inode), fsc->max_file_size); @@ -1428,8 +1429,11 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) if (!prealloc_cf) return -ENOMEM; + if ((iocb->ki_flags & (IOCB_DIRECT | IOCB_APPEND)) == IOCB_DIRECT) + direct_lock = true; + retry_snap: - if (iocb->ki_flags & IOCB_DIRECT) + if (direct_lock) ceph_start_io_direct(inode); else ceph_start_io_write(inode); @@ -1519,14 +1523,15 @@ retry_snap: /* we might need to revert back to that point */ data = *from; - if (iocb->ki_flags & IOCB_DIRECT) { + if (iocb->ki_flags & IOCB_DIRECT) written = ceph_direct_read_write(iocb, &data, snapc, &prealloc_cf); - ceph_end_io_direct(inode); - } else { + else written = ceph_sync_write(iocb, &data, pos, snapc); + if (direct_lock) + ceph_end_io_direct(inode); + else ceph_end_io_write(inode); - } if (written > 0) iov_iter_advance(from, written); ceph_put_snap_context(snapc); @@ -1577,7 +1582,7 @@ retry_snap: goto out_unlocked; out: - if (iocb->ki_flags & IOCB_DIRECT) + if (direct_lock) ceph_end_io_direct(inode); else ceph_end_io_write(inode); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 1d9f083b8a11..c7f150686a53 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -203,6 +203,26 @@ struct ceph_parse_opts_ctx { }; /* + * Remove adjacent slashes and then the trailing slash, unless it is + * the only remaining character. + * + * E.g. "//dir1////dir2///" --> "/dir1/dir2", "///" --> "/". + */ +static void canonicalize_path(char *path) +{ + int i, j = 0; + + for (i = 0; path[i] != '\0'; i++) { + if (path[i] != '/' || j < 1 || path[j - 1] != '/') + path[j++] = path[i]; + } + + if (j > 1 && path[j - 1] == '/') + j--; + path[j] = '\0'; +} + +/* * Parse the source parameter. Distinguish the server list from the path. * * The source will look like: @@ -224,15 +244,16 @@ static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc) dev_name_end = strchr(dev_name, '/'); if (dev_name_end) { - kfree(fsopt->server_path); - /* * The server_path will include the whole chars from userland * including the leading '/'. */ + kfree(fsopt->server_path); fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL); if (!fsopt->server_path) return -ENOMEM; + + canonicalize_path(fsopt->server_path); } else { dev_name_end = dev_name + strlen(dev_name); } @@ -456,73 +477,6 @@ static int strcmp_null(const char *s1, const char *s2) return strcmp(s1, s2); } -/** - * path_remove_extra_slash - Remove the extra slashes in the server path - * @server_path: the server path and could be NULL - * - * Return NULL if the path is NULL or only consists of "/", or a string - * without any extra slashes including the leading slash(es) and the - * slash(es) at the end of the server path, such as: - * "//dir1////dir2///" --> "dir1/dir2" - */ -static char *path_remove_extra_slash(const char *server_path) -{ - const char *path = server_path; - const char *cur, *end; - char *buf, *p; - int len; - - /* if the server path is omitted */ - if (!path) - return NULL; - - /* remove all the leading slashes */ - while (*path == '/') - path++; - - /* if the server path only consists of slashes */ - if (*path == '\0') - return NULL; - - len = strlen(path); - - buf = kmalloc(len + 1, GFP_KERNEL); - if (!buf) - return ERR_PTR(-ENOMEM); - - end = path + len; - p = buf; - do { - cur = strchr(path, '/'); - if (!cur) - cur = end; - - len = cur - path; - - /* including one '/' */ - if (cur != end) - len += 1; - - memcpy(p, path, len); - p += len; - - while (cur <= end && *cur == '/') - cur++; - path = cur; - } while (path < end); - - *p = '\0'; - - /* - * remove the last slash if there has and just to make sure that - * we will get something like "dir1/dir2" - */ - if (*(--p) == '/') - *p = '\0'; - - return buf; -} - static int compare_mount_options(struct ceph_mount_options *new_fsopt, struct ceph_options *new_opt, struct ceph_fs_client *fsc) @@ -530,7 +484,6 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt, struct ceph_mount_options *fsopt1 = new_fsopt; struct ceph_mount_options *fsopt2 = fsc->mount_options; int ofs = offsetof(struct ceph_mount_options, snapdir_name); - char *p1, *p2; int ret; ret = memcmp(fsopt1, fsopt2, ofs); @@ -540,21 +493,12 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt, ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name); if (ret) return ret; + ret = strcmp_null(fsopt1->mds_namespace, fsopt2->mds_namespace); if (ret) return ret; - p1 = path_remove_extra_slash(fsopt1->server_path); - if (IS_ERR(p1)) - return PTR_ERR(p1); - p2 = path_remove_extra_slash(fsopt2->server_path); - if (IS_ERR(p2)) { - kfree(p1); - return PTR_ERR(p2); - } - ret = strcmp_null(p1, p2); - kfree(p1); - kfree(p2); + ret = strcmp_null(fsopt1->server_path, fsopt2->server_path); if (ret) return ret; @@ -957,7 +901,9 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, mutex_lock(&fsc->client->mount_mutex); if (!fsc->sb->s_root) { - const char *path, *p; + const char *path = fsc->mount_options->server_path ? + fsc->mount_options->server_path + 1 : ""; + err = __ceph_open_session(fsc->client, started); if (err < 0) goto out; @@ -969,22 +915,11 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, goto out; } - p = path_remove_extra_slash(fsc->mount_options->server_path); - if (IS_ERR(p)) { - err = PTR_ERR(p); - goto out; - } - /* if the server path is omitted or just consists of '/' */ - if (!p) - path = ""; - else - path = p; dout("mount opening path '%s'\n", path); ceph_fs_debugfs_init(fsc); root = open_root_dentry(fsc, path, started); - kfree(p); if (IS_ERR(root)) { err = PTR_ERR(root); goto out; @@ -1097,10 +1032,6 @@ static int ceph_get_tree(struct fs_context *fc) if (!fc->source) return invalfc(fc, "No source"); -#ifdef CONFIG_CEPH_FS_POSIX_ACL - fc->sb_flags |= SB_POSIXACL; -#endif - /* create client (which we may/may not use) */ fsc = create_fs_client(pctx->opts, pctx->copts); pctx->opts = NULL; @@ -1223,6 +1154,10 @@ static int ceph_init_fs_context(struct fs_context *fc) fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; fsopt->congestion_kb = default_congestion_kb(); +#ifdef CONFIG_CEPH_FS_POSIX_ACL + fc->sb_flags |= SB_POSIXACL; +#endif + fc->fs_private = pctx; fc->ops = &ceph_context_ops; return 0; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 1e456a9011bb..037cdfb2ad4f 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -91,7 +91,7 @@ struct ceph_mount_options { char *snapdir_name; /* default ".snap" */ char *mds_namespace; /* default NULL */ - char *server_path; /* default "/" */ + char *server_path; /* default NULL (means "/") */ char *fscache_uniq; /* default NULL */ }; diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 440828afcdde..716574aab3b6 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -601,7 +601,7 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode, ((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS)) *pmode |= (S_IXUGO & (*pbits_to_set)); - cifs_dbg(NOISY, "access flags 0x%x mode now 0x%x\n", flags, *pmode); + cifs_dbg(NOISY, "access flags 0x%x mode now %04o\n", flags, *pmode); return; } @@ -630,7 +630,7 @@ static void mode_to_access_flags(umode_t mode, umode_t bits_to_use, if (mode & S_IXUGO) *pace_flags |= SET_FILE_EXEC_RIGHTS; - cifs_dbg(NOISY, "mode: 0x%x, access flags now 0x%x\n", + cifs_dbg(NOISY, "mode: %04o, access flags now 0x%x\n", mode, *pace_flags); return; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index febab27cd838..46ebaf3f0824 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -414,7 +414,7 @@ cifs_show_security(struct seq_file *s, struct cifs_ses *ses) seq_puts(s, "ntlm"); break; case Kerberos: - seq_printf(s, "krb5,cruid=%u", from_kuid_munged(&init_user_ns,ses->cred_uid)); + seq_puts(s, "krb5"); break; case RawNTLMSSP: seq_puts(s, "ntlmssp"); @@ -427,6 +427,10 @@ cifs_show_security(struct seq_file *s, struct cifs_ses *ses) if (ses->sign) seq_puts(s, "i"); + + if (ses->sectype == Kerberos) + seq_printf(s, ",cruid=%u", + from_kuid_munged(&init_user_ns, ses->cred_uid)); } static void diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index a941ac7a659d..4804d1df8c1c 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -4151,7 +4151,7 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info, cifs_sb->mnt_gid = pvolume_info->linux_gid; cifs_sb->mnt_file_mode = pvolume_info->file_mode; cifs_sb->mnt_dir_mode = pvolume_info->dir_mode; - cifs_dbg(FYI, "file mode: 0x%hx dir mode: 0x%hx\n", + cifs_dbg(FYI, "file mode: %04ho dir mode: %04ho\n", cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode); cifs_sb->actimeo = pvolume_info->actimeo; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 9ba623b601ec..b5e6635c578e 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1648,7 +1648,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode) struct TCP_Server_Info *server; char *full_path; - cifs_dbg(FYI, "In cifs_mkdir, mode = 0x%hx inode = 0x%p\n", + cifs_dbg(FYI, "In cifs_mkdir, mode = %04ho inode = 0x%p\n", mode, inode); cifs_sb = CIFS_SB(inode->i_sb); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index baa825f4cec0..e47190cae163 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1116,7 +1116,8 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, void *data[1]; struct smb2_file_full_ea_info *ea = NULL; struct kvec close_iov[1]; - int rc; + struct smb2_query_info_rsp *rsp; + int rc, used_len = 0; if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; @@ -1139,6 +1140,38 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, cifs_sb); if (rc == -ENODATA) goto sea_exit; + } else { + /* If we are adding a attribute we should first check + * if there will be enough space available to store + * the new EA. If not we should not add it since we + * would not be able to even read the EAs back. + */ + rc = smb2_query_info_compound(xid, tcon, utf16_path, + FILE_READ_EA, + FILE_FULL_EA_INFORMATION, + SMB2_O_INFO_FILE, + CIFSMaxBufSize - + MAX_SMB2_CREATE_RESPONSE_SIZE - + MAX_SMB2_CLOSE_RESPONSE_SIZE, + &rsp_iov[1], &resp_buftype[1], cifs_sb); + if (rc == 0) { + rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; + used_len = le32_to_cpu(rsp->OutputBufferLength); + } + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + resp_buftype[1] = CIFS_NO_BUFFER; + memset(&rsp_iov[1], 0, sizeof(rsp_iov[1])); + rc = 0; + + /* Use a fudge factor of 256 bytes in case we collide + * with a different set_EAs command. + */ + if(CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE - + MAX_SMB2_CLOSE_RESPONSE_SIZE - 256 < + used_len + ea_name_len + ea_value_len + 1) { + rc = -ENOSPC; + goto sea_exit; + } } } @@ -4795,6 +4828,7 @@ struct smb_version_operations smb21_operations = { .wp_retry_size = smb2_wp_retry_size, .dir_needs_close = smb2_dir_needs_close, .enum_snapshots = smb3_enum_snapshots, + .notify = smb3_notify, .get_dfs_refer = smb2_get_dfs_refer, .select_sectype = smb2_select_sectype, #ifdef CONFIG_CIFS_XATTR @@ -937,12 +937,11 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, * on persistent storage prior to completion of the operation. */ int dax_writeback_mapping_range(struct address_space *mapping, - struct block_device *bdev, struct writeback_control *wbc) + struct dax_device *dax_dev, struct writeback_control *wbc) { XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT); struct inode *inode = mapping->host; pgoff_t end_index = wbc->range_end >> PAGE_SHIFT; - struct dax_device *dax_dev; void *entry; int ret = 0; unsigned int scanned = 0; @@ -953,10 +952,6 @@ int dax_writeback_mapping_range(struct address_space *mapping, if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) return 0; - dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); - if (!dax_dev) - return -EIO; - trace_dax_writeback_range(inode, xas.xa_index, end_index); tag_pages_for_writeback(mapping, xas.xa_index, end_index); @@ -977,7 +972,6 @@ int dax_writeback_mapping_range(struct address_space *mapping, xas_lock_irq(&xas); } xas_unlock_irq(&xas); - put_dax(dax_dev); trace_dax_writeback_range_done(inode, xas.xa_index, end_index); return ret; } @@ -1207,6 +1201,9 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, lockdep_assert_held(&inode->i_rwsem); } + if (iocb->ki_flags & IOCB_NOWAIT) + flags |= IOMAP_NOWAIT; + while (iov_iter_count(iter)) { ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops, iter, dax_iomap_actor); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 119667e65890..c885cf7d724b 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -960,8 +960,9 @@ ext2_writepages(struct address_space *mapping, struct writeback_control *wbc) static int ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc) { - return dax_writeback_mapping_range(mapping, - mapping->host->i_sb->s_bdev, wbc); + struct ext2_sb_info *sbi = EXT2_SB(mapping->host->i_sb); + + return dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc); } const struct address_space_operations ext2_aops = { diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 1ee04e76bbe0..0a734ffb4310 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -207,6 +207,7 @@ static int ext4_protect_reserved_inode(struct super_block *sb, return PTR_ERR(inode); num = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; while (i < num) { + cond_resched(); map.m_lblk = i; map.m_len = num - i; n = ext4_map_blocks(NULL, inode, &map, 0); diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 1f340743c9a8..9aa1f75409b0 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -129,12 +129,14 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) if (err != ERR_BAD_DX_DIR) { return err; } - /* - * We don't set the inode dirty flag since it's not - * critical that it get flushed back to the disk. - */ - ext4_clear_inode_flag(file_inode(file), - EXT4_INODE_INDEX); + /* Can we just clear INDEX flag to ignore htree information? */ + if (!ext4_has_metadata_csum(sb)) { + /* + * We don't set the inode dirty flag since it's not + * critical that it gets flushed back to the disk. + */ + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); + } } if (ext4_has_inline_data(inode)) { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9a2ee2428ecc..4441331d06cc 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2544,8 +2544,11 @@ void ext4_insert_dentry(struct inode *inode, struct ext4_filename *fname); static inline void ext4_update_dx_flag(struct inode *inode) { - if (!ext4_has_feature_dir_index(inode->i_sb)) + if (!ext4_has_feature_dir_index(inode->i_sb)) { + /* ext4_iget() should have caught this... */ + WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb)); ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); + } } static const unsigned char ext4_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3313168b680f..e60aca791d3f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2867,7 +2867,7 @@ static int ext4_dax_writepages(struct address_space *mapping, percpu_down_read(&sbi->s_journal_flag_rwsem); trace_ext4_writepages(inode, wbc); - ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, wbc); + ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc); trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); percpu_up_read(&sbi->s_journal_flag_rwsem); @@ -4644,6 +4644,18 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ret = -EFSCORRUPTED; goto bad_inode; } + /* + * If dir_index is not enabled but there's dir with INDEX flag set, + * we'd normally treat htree data as empty space. But with metadata + * checksumming that corrupts checksums so forbid that. + */ + if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) && + ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { + ext4_error_inode(inode, function, line, 0, + "iget: Dir with htree data on filesystem without dir_index feature."); + ret = -EFSCORRUPTED; + goto bad_inode; + } ei->i_disksize = inode->i_size; #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 1c44b1a32001..87f7551c5132 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -120,10 +120,10 @@ void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, { __ext4_warning(sb, function, line, "%s", msg); __ext4_warning(sb, function, line, - "MMP failure info: last update time: %llu, last update " - "node: %s, last update device: %s", - (long long unsigned int) le64_to_cpu(mmp->mmp_time), - mmp->mmp_nodename, mmp->mmp_bdevname); + "MMP failure info: last update time: %llu, last update node: %.*s, last update device: %.*s", + (unsigned long long)le64_to_cpu(mmp->mmp_time), + (int)sizeof(mmp->mmp_nodename), mmp->mmp_nodename, + (int)sizeof(mmp->mmp_bdevname), mmp->mmp_bdevname); } /* @@ -154,6 +154,7 @@ static int kmmpd(void *data) mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval, EXT4_MMP_MIN_CHECK_INTERVAL); mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); + BUILD_BUG_ON(sizeof(mmp->mmp_bdevname) < BDEVNAME_SIZE); bdevname(bh->b_bdev, mmp->mmp_bdevname); memcpy(mmp->mmp_nodename, init_utsname()->nodename, @@ -379,7 +380,8 @@ skip: /* * Start a kernel thread to update the MMP block periodically. */ - EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s", + EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%.*s", + (int)sizeof(mmp->mmp_bdevname), bdevname(bh->b_bdev, mmp->mmp_bdevname)); if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 129d2ebae00d..ceff4b4b1877 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2213,6 +2213,13 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, retval = ext4_dx_add_entry(handle, &fname, dir, inode); if (!retval || (retval != ERR_BAD_DX_DIR)) goto out; + /* Can we just ignore htree data? */ + if (ext4_has_metadata_csum(sb)) { + EXT4_ERROR_INODE(dir, + "Directory has corrupted htree index."); + retval = -EFSCORRUPTED; + goto out; + } ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); dx_fallback++; ext4_mark_inode_dirty(handle, dir); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8434217549b3..f464dff09774 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3009,17 +3009,11 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) return 0; } -#ifndef CONFIG_QUOTA - if (ext4_has_feature_quota(sb) && !readonly) { +#if !defined(CONFIG_QUOTA) || !defined(CONFIG_QFMT_V2) + if (!readonly && (ext4_has_feature_quota(sb) || + ext4_has_feature_project(sb))) { ext4_msg(sb, KERN_ERR, - "Filesystem with quota feature cannot be mounted RDWR " - "without CONFIG_QUOTA"); - return 0; - } - if (ext4_has_feature_project(sb) && !readonly) { - ext4_msg(sb, KERN_ERR, - "Filesystem with project quota feature cannot be mounted RDWR " - "without CONFIG_QUOTA"); + "The kernel was not built with CONFIG_QUOTA and CONFIG_QFMT_V2"); return 0; } #endif /* CONFIG_QUOTA */ @@ -3814,6 +3808,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) */ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; + blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); + if (blocksize < EXT4_MIN_BLOCK_SIZE || + blocksize > EXT4_MAX_BLOCK_SIZE) { + ext4_msg(sb, KERN_ERR, + "Unsupported filesystem blocksize %d (%d log_block_size)", + blocksize, le32_to_cpu(es->s_log_block_size)); + goto failed_mount; + } + if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; @@ -3831,6 +3834,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ext4_msg(sb, KERN_ERR, "unsupported inode size: %d", sbi->s_inode_size); + ext4_msg(sb, KERN_ERR, "blocksize: %d", blocksize); goto failed_mount; } /* @@ -4033,14 +4037,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (!ext4_feature_set_ok(sb, (sb_rdonly(sb)))) goto failed_mount; - blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); - if (blocksize < EXT4_MIN_BLOCK_SIZE || - blocksize > EXT4_MAX_BLOCK_SIZE) { - ext4_msg(sb, KERN_ERR, - "Unsupported filesystem blocksize %d (%d log_block_size)", - blocksize, le32_to_cpu(es->s_log_block_size)); - goto failed_mount; - } if (le32_to_cpu(es->s_log_block_size) > (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { ext4_msg(sb, KERN_ERR, @@ -5585,10 +5581,7 @@ static int ext4_statfs_project(struct super_block *sb, return PTR_ERR(dquot); spin_lock(&dquot->dq_dqb_lock); - limit = 0; - if (dquot->dq_dqb.dqb_bsoftlimit && - (!limit || dquot->dq_dqb.dqb_bsoftlimit < limit)) - limit = dquot->dq_dqb.dqb_bsoftlimit; + limit = dquot->dq_dqb.dqb_bsoftlimit; if (dquot->dq_dqb.dqb_bhardlimit && (!limit || dquot->dq_dqb.dqb_bhardlimit < limit)) limit = dquot->dq_dqb.dqb_bhardlimit; @@ -5603,10 +5596,7 @@ static int ext4_statfs_project(struct super_block *sb, (buf->f_blocks - curblock) : 0; } - limit = 0; - if (dquot->dq_dqb.dqb_isoftlimit && - (!limit || dquot->dq_dqb.dqb_isoftlimit < limit)) - limit = dquot->dq_dqb.dqb_isoftlimit; + limit = dquot->dq_dqb.dqb_isoftlimit; if (dquot->dq_dqb.dqb_ihardlimit && (!limit || dquot->dq_dqb.dqb_ihardlimit < limit)) limit = dquot->dq_dqb.dqb_ihardlimit; diff --git a/fs/io-wq.c b/fs/io-wq.c index cb60a42b9fdf..0a5ab1a8f69a 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -16,6 +16,7 @@ #include <linux/slab.h> #include <linux/kthread.h> #include <linux/rculist_nulls.h> +#include <linux/fs_struct.h> #include "io-wq.h" @@ -59,6 +60,7 @@ struct io_worker { const struct cred *cur_creds; const struct cred *saved_creds; struct files_struct *restore_files; + struct fs_struct *restore_fs; }; #if BITS_PER_LONG == 64 @@ -151,6 +153,9 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) task_unlock(current); } + if (current->fs != worker->restore_fs) + current->fs = worker->restore_fs; + /* * If we have an active mm, we need to drop the wq lock before unusing * it. If we do, return true and let the caller retry the idle loop. @@ -311,6 +316,7 @@ static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker) worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); worker->restore_files = current->files; + worker->restore_fs = current->fs; io_wqe_inc_running(wqe, worker); } @@ -481,6 +487,8 @@ next: current->files = work->files; task_unlock(current); } + if (work->fs && current->fs != work->fs) + current->fs = work->fs; if (work->mm != worker->mm) io_wq_switch_mm(worker, work); if (worker->cur_creds != work->creds) @@ -691,11 +699,16 @@ static int io_wq_manager(void *data) /* create fixed workers */ refcount_set(&wq->refs, workers_to_create); for_each_node(node) { + if (!node_online(node)) + continue; if (!create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND)) goto err; workers_to_create--; } + while (workers_to_create--) + refcount_dec(&wq->refs); + complete(&wq->done); while (!kthread_should_stop()) { @@ -703,6 +716,9 @@ static int io_wq_manager(void *data) struct io_wqe *wqe = wq->wqes[node]; bool fork_worker[2] = { false, false }; + if (!node_online(node)) + continue; + spin_lock_irq(&wqe->lock); if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND)) fork_worker[IO_WQ_ACCT_BOUND] = true; @@ -821,7 +837,9 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe, list_for_each_entry_rcu(worker, &wqe->all_list, all_list) { if (io_worker_get(worker)) { - ret = func(worker, data); + /* no task if node is/was offline */ + if (worker->task) + ret = func(worker, data); io_worker_release(worker); if (ret) break; @@ -929,17 +947,19 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, return ret; } +struct work_match { + bool (*fn)(struct io_wq_work *, void *data); + void *data; +}; + static bool io_wq_worker_cancel(struct io_worker *worker, void *data) { - struct io_wq_work *work = data; + struct work_match *match = data; unsigned long flags; bool ret = false; - if (worker->cur_work != work) - return false; - spin_lock_irqsave(&worker->lock, flags); - if (worker->cur_work == work && + if (match->fn(worker->cur_work, match->data) && !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL)) { send_sig(SIGINT, worker->task, 1); ret = true; @@ -950,15 +970,13 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) } static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, - struct io_wq_work *cwork) + struct work_match *match) { struct io_wq_work_node *node, *prev; struct io_wq_work *work; unsigned long flags; bool found = false; - cwork->flags |= IO_WQ_WORK_CANCEL; - /* * First check pending list, if we're lucky we can just remove it * from there. CANCEL_OK means that the work is returned as-new, @@ -968,7 +986,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, wq_list_for_each(node, prev, &wqe->work_list) { work = container_of(node, struct io_wq_work, list); - if (work == cwork) { + if (match->fn(work, match->data)) { wq_node_del(&wqe->work_list, node, prev); found = true; break; @@ -989,20 +1007,60 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, * completion will run normally in this case. */ rcu_read_lock(); - found = io_wq_for_each_worker(wqe, io_wq_worker_cancel, cwork); + found = io_wq_for_each_worker(wqe, io_wq_worker_cancel, match); rcu_read_unlock(); return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND; } +static bool io_wq_work_match(struct io_wq_work *work, void *data) +{ + return work == data; +} + enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork) { + struct work_match match = { + .fn = io_wq_work_match, + .data = cwork + }; enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; int node; + cwork->flags |= IO_WQ_WORK_CANCEL; + for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; - ret = io_wqe_cancel_work(wqe, cwork); + ret = io_wqe_cancel_work(wqe, &match); + if (ret != IO_WQ_CANCEL_NOTFOUND) + break; + } + + return ret; +} + +static bool io_wq_pid_match(struct io_wq_work *work, void *data) +{ + pid_t pid = (pid_t) (unsigned long) data; + + if (work) + return work->task_pid == pid; + return false; +} + +enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid) +{ + struct work_match match = { + .fn = io_wq_pid_match, + .data = (void *) (unsigned long) pid + }; + enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; + int node; + + for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; + + ret = io_wqe_cancel_work(wqe, &match); if (ret != IO_WQ_CANCEL_NOTFOUND) break; } @@ -1036,6 +1094,8 @@ void io_wq_flush(struct io_wq *wq) for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; + if (!node_online(node)) + continue; init_completion(&data.done); INIT_IO_WORK(&data.work, io_wq_flush_func); data.work.flags |= IO_WQ_WORK_INTERNAL; @@ -1067,12 +1127,15 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) for_each_node(node) { struct io_wqe *wqe; + int alloc_node = node; - wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, node); + if (!node_online(alloc_node)) + alloc_node = NUMA_NO_NODE; + wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, alloc_node); if (!wqe) goto err; wq->wqes[node] = wqe; - wqe->node = node; + wqe->node = alloc_node; wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0); if (wq->user) { @@ -1080,7 +1143,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) task_rlimit(current, RLIMIT_NPROC); } atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0); - wqe->node = node; wqe->wq = wq; spin_lock_init(&wqe->lock); INIT_WQ_LIST(&wqe->work_list); diff --git a/fs/io-wq.h b/fs/io-wq.h index 50b3378febf2..ccc7d84af57d 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -74,17 +74,20 @@ struct io_wq_work { struct files_struct *files; struct mm_struct *mm; const struct cred *creds; + struct fs_struct *fs; unsigned flags; + pid_t task_pid; }; #define INIT_IO_WORK(work, _func) \ do { \ (work)->list.next = NULL; \ (work)->func = _func; \ - (work)->flags = 0; \ (work)->files = NULL; \ (work)->mm = NULL; \ (work)->creds = NULL; \ + (work)->fs = NULL; \ + (work)->flags = 0; \ } while (0) \ typedef void (get_work_fn)(struct io_wq_work *); @@ -107,6 +110,7 @@ void io_wq_flush(struct io_wq *wq); void io_wq_cancel_all(struct io_wq *wq); enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork); +enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid); typedef bool (work_cancel_fn)(struct io_wq_work *, void *); diff --git a/fs/io_uring.c b/fs/io_uring.c index 77f22c3da30f..5a826017ebb8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -75,6 +75,7 @@ #include <linux/fsnotify.h> #include <linux/fadvise.h> #include <linux/eventpoll.h> +#include <linux/fs_struct.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> @@ -204,11 +205,11 @@ struct io_ring_ctx { struct { unsigned int flags; - int compat: 1; - int account_mem: 1; - int cq_overflow_flushed: 1; - int drain_next: 1; - int eventfd_async: 1; + unsigned int compat: 1; + unsigned int account_mem: 1; + unsigned int cq_overflow_flushed: 1; + unsigned int drain_next: 1; + unsigned int eventfd_async: 1; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -441,6 +442,7 @@ struct io_async_msghdr { struct iovec *iov; struct sockaddr __user *uaddr; struct msghdr msg; + struct sockaddr_storage addr; }; struct io_async_rw { @@ -450,17 +452,12 @@ struct io_async_rw { ssize_t size; }; -struct io_async_open { - struct filename *filename; -}; - struct io_async_ctx { union { struct io_async_rw rw; struct io_async_msghdr msg; struct io_async_connect connect; struct io_timeout_data timeout; - struct io_async_open open; }; }; @@ -483,6 +480,8 @@ enum { REQ_F_MUST_PUNT_BIT, REQ_F_TIMEOUT_NOSEQ_BIT, REQ_F_COMP_LOCKED_BIT, + REQ_F_NEED_CLEANUP_BIT, + REQ_F_OVERFLOW_BIT, }; enum { @@ -521,6 +520,10 @@ enum { REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT), /* completion under lock */ REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT), + /* needs cleanup */ + REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), + /* in overflow list */ + REQ_F_OVERFLOW = BIT(REQ_F_OVERFLOW_BIT), }; /* @@ -553,7 +556,6 @@ struct io_kiocb { * llist_node is only used for poll deferred completions */ struct llist_node llist_node; - bool has_user; bool in_async; bool needs_fixed_file; u8 opcode; @@ -614,6 +616,8 @@ struct io_op_def { unsigned not_supported : 1; /* needs file table */ unsigned file_table : 1; + /* needs ->fs */ + unsigned needs_fs : 1; }; static const struct io_op_def io_op_defs[] = { @@ -656,12 +660,14 @@ static const struct io_op_def io_op_defs[] = { .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .needs_fs = 1, }, [IORING_OP_RECVMSG] = { .async_ctx = 1, .needs_mm = 1, .needs_file = 1, .unbound_nonreg_file = 1, + .needs_fs = 1, }, [IORING_OP_TIMEOUT] = { .async_ctx = 1, @@ -692,6 +698,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .fd_non_neg = 1, .file_table = 1, + .needs_fs = 1, }, [IORING_OP_CLOSE] = { .needs_file = 1, @@ -705,6 +712,7 @@ static const struct io_op_def io_op_defs[] = { .needs_mm = 1, .needs_file = 1, .fd_non_neg = 1, + .needs_fs = 1, }, [IORING_OP_READ] = { .needs_mm = 1, @@ -736,6 +744,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .fd_non_neg = 1, .file_table = 1, + .needs_fs = 1, }, [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, @@ -754,6 +763,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, unsigned nr_args); static int io_grab_files(struct io_kiocb *req); static void io_ring_file_ref_flush(struct fixed_file_data *data); +static void io_cleanup_req(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -909,6 +919,18 @@ static inline void io_req_work_grab_env(struct io_kiocb *req, } if (!req->work.creds) req->work.creds = get_current_cred(); + if (!req->work.fs && def->needs_fs) { + spin_lock(¤t->fs->lock); + if (!current->fs->in_exec) { + req->work.fs = current->fs; + req->work.fs->users++; + } else { + req->work.flags |= IO_WQ_WORK_CANCEL; + } + spin_unlock(¤t->fs->lock); + } + if (!req->work.task_pid) + req->work.task_pid = task_pid_vnr(current); } static inline void io_req_work_drop_env(struct io_kiocb *req) @@ -921,6 +943,16 @@ static inline void io_req_work_drop_env(struct io_kiocb *req) put_cred(req->work.creds); req->work.creds = NULL; } + if (req->work.fs) { + struct fs_struct *fs = req->work.fs; + + spin_lock(&req->work.fs->lock); + if (--fs->users) + fs = NULL; + spin_unlock(&req->work.fs->lock); + if (fs) + free_fs_struct(fs); + } } static inline bool io_prep_async_work(struct io_kiocb *req, @@ -1074,6 +1106,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb, list); list_move(&req->list, &list); + req->flags &= ~REQ_F_OVERFLOW; if (cqe) { WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, req->result); @@ -1126,6 +1159,7 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res) set_bit(0, &ctx->sq_check_overflow); set_bit(0, &ctx->cq_check_overflow); } + req->flags |= REQ_F_OVERFLOW; refcount_inc(&req->refs); req->result = res; list_add_tail(&req->list, &ctx->cq_overflow_list); @@ -1241,6 +1275,9 @@ static void __io_free_req(struct io_kiocb *req) { __io_req_aux_free(req); + if (req->flags & REQ_F_NEED_CLEANUP) + io_cleanup_req(req); + if (req->flags & REQ_F_INFLIGHT) { struct io_ring_ctx *ctx = req->ctx; unsigned long flags; @@ -2056,9 +2093,6 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, return iorw->size; } - if (!req->has_user) - return -EFAULT; - #ifdef CONFIG_COMPAT if (req->ctx->compat) return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV, @@ -2137,6 +2171,8 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size, req->io->rw.iov = req->io->rw.fast_iov; memcpy(req->io->rw.iov, fast_iov, sizeof(struct iovec) * iter->nr_segs); + } else { + req->flags |= REQ_F_NEED_CLEANUP; } } @@ -2148,17 +2184,6 @@ static int io_alloc_async_ctx(struct io_kiocb *req) return req->io == NULL; } -static void io_rw_async(struct io_wq_work **workptr) -{ - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - struct iovec *iov = NULL; - - if (req->io->rw.iov != req->io->rw.fast_iov) - iov = req->io->rw.iov; - io_wq_submit_work(workptr); - kfree(iov); -} - static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, struct iovec *iovec, struct iovec *fast_iov, struct iov_iter *iter) @@ -2171,7 +2196,6 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, io_req_map_rw(req, io_size, iovec, fast_iov, iter); } - req->work.func = io_rw_async; return 0; } @@ -2189,7 +2213,8 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (unlikely(!(req->file->f_mode & FMODE_READ))) return -EBADF; - if (!req->io) + /* either don't need iovec imported or already have it */ + if (!req->io || req->flags & REQ_F_NEED_CLEANUP) return 0; io = req->io; @@ -2258,8 +2283,8 @@ copy_iov: } } out_free: - if (!io_wq_current_is_worker()) - kfree(iovec); + kfree(iovec); + req->flags &= ~REQ_F_NEED_CLEANUP; return ret; } @@ -2277,7 +2302,8 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (unlikely(!(req->file->f_mode & FMODE_WRITE))) return -EBADF; - if (!req->io) + /* either don't need iovec imported or already have it */ + if (!req->io || req->flags & REQ_F_NEED_CLEANUP) return 0; io = req->io; @@ -2352,6 +2378,12 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, ret2 = call_write_iter(req->file, kiocb, &iter); else ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); + /* + * Raw bdev writes will -EOPNOTSUPP for IOCB_NOWAIT. Just + * retry them without IOCB_NOWAIT. + */ + if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) + ret2 = -EAGAIN; if (!force_nonblock || ret2 != -EAGAIN) { kiocb_done(kiocb, ret2, nxt, req->in_async); } else { @@ -2364,8 +2396,8 @@ copy_iov: } } out_free: - if (!io_wq_current_is_worker()) - kfree(iovec); + req->flags &= ~REQ_F_NEED_CLEANUP; + kfree(iovec); return ret; } @@ -2534,6 +2566,10 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (sqe->ioprio || sqe->buf_index) return -EINVAL; + if (sqe->flags & IOSQE_FIXED_FILE) + return -EBADF; + if (req->flags & REQ_F_NEED_CLEANUP) + return 0; req->open.dfd = READ_ONCE(sqe->fd); req->open.how.mode = READ_ONCE(sqe->len); @@ -2547,6 +2583,7 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return ret; } + req->flags |= REQ_F_NEED_CLEANUP; return 0; } @@ -2559,6 +2596,10 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (sqe->ioprio || sqe->buf_index) return -EINVAL; + if (sqe->flags & IOSQE_FIXED_FILE) + return -EBADF; + if (req->flags & REQ_F_NEED_CLEANUP) + return 0; req->open.dfd = READ_ONCE(sqe->fd); fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -2583,6 +2624,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return ret; } + req->flags |= REQ_F_NEED_CLEANUP; return 0; } @@ -2614,6 +2656,7 @@ static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt, } err: putname(req->open.filename); + req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); @@ -2754,6 +2797,10 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (sqe->ioprio || sqe->buf_index) return -EINVAL; + if (sqe->flags & IOSQE_FIXED_FILE) + return -EBADF; + if (req->flags & REQ_F_NEED_CLEANUP) + return 0; req->open.dfd = READ_ONCE(sqe->fd); req->open.mask = READ_ONCE(sqe->len); @@ -2771,6 +2818,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return ret; } + req->flags |= REQ_F_NEED_CLEANUP; return 0; } @@ -2808,6 +2856,7 @@ retry: ret = cp_statx(&stat, ctx->buffer); err: putname(ctx->filename); + req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); @@ -2827,7 +2876,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sqe->rw_flags || sqe->buf_index) return -EINVAL; if (sqe->flags & IOSQE_FIXED_FILE) - return -EINVAL; + return -EBADF; req->close.fd = READ_ONCE(sqe->fd); if (req->file->f_op == &io_uring_fops || @@ -2837,24 +2886,25 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } +/* only called when __close_fd_get_file() is done */ +static void __io_close_finish(struct io_kiocb *req, struct io_kiocb **nxt) +{ + int ret; + + ret = filp_close(req->close.put_file, req->work.files); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + fput(req->close.put_file); + io_put_req_find_next(req, nxt); +} + static void io_close_finish(struct io_wq_work **workptr) { struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); struct io_kiocb *nxt = NULL; - /* Invoked with files, we need to do the close */ - if (req->work.files) { - int ret; - - ret = filp_close(req->close.put_file, req->work.files); - if (ret < 0) - req_set_fail_links(req); - io_cqring_add_event(req, ret); - } - - fput(req->close.put_file); - - io_put_req_find_next(req, &nxt); + __io_close_finish(req, &nxt); if (nxt) io_wq_assign_next(workptr, nxt); } @@ -2877,22 +2927,8 @@ static int io_close(struct io_kiocb *req, struct io_kiocb **nxt, * No ->flush(), safely close from here and just punt the * fput() to async context. */ - ret = filp_close(req->close.put_file, current->files); - - if (ret < 0) - req_set_fail_links(req); - io_cqring_add_event(req, ret); - - if (io_wq_current_is_worker()) { - struct io_wq_work *old_work, *work; - - old_work = work = &req->work; - io_close_finish(&work); - if (work && work != old_work) - *nxt = container_of(work, struct io_kiocb, work); - return 0; - } - + __io_close_finish(req, nxt); + return 0; eagain: req->work.func = io_close_finish; /* @@ -2960,24 +2996,12 @@ static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt, return 0; } -#if defined(CONFIG_NET) -static void io_sendrecv_async(struct io_wq_work **workptr) -{ - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - struct iovec *iov = NULL; - - if (req->io->rw.iov != req->io->rw.fast_iov) - iov = req->io->msg.iov; - io_wq_submit_work(workptr); - kfree(iov); -} -#endif - static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_NET) struct io_sr_msg *sr = &req->sr_msg; struct io_async_ctx *io = req->io; + int ret; sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -2985,10 +3009,16 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!io || req->opcode == IORING_OP_SEND) return 0; + /* iovec is already imported */ + if (req->flags & REQ_F_NEED_CLEANUP) + return 0; io->msg.iov = io->msg.fast_iov; - return sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, + ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, &io->msg.iov); + if (!ret) + req->flags |= REQ_F_NEED_CLEANUP; + return ret; #else return -EOPNOTSUPP; #endif @@ -3008,12 +3038,11 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, sock = sock_from_file(req->file, &ret); if (sock) { struct io_async_ctx io; - struct sockaddr_storage addr; unsigned flags; if (req->io) { kmsg = &req->io->msg; - kmsg->msg.msg_name = &addr; + kmsg->msg.msg_name = &req->io->msg.addr; /* if iov is set, it's allocated already */ if (!kmsg->iov) kmsg->iov = kmsg->fast_iov; @@ -3022,7 +3051,7 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, struct io_sr_msg *sr = &req->sr_msg; kmsg = &io.msg; - kmsg->msg.msg_name = &addr; + kmsg->msg.msg_name = &io.msg.addr; io.msg.iov = io.msg.fast_iov; ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg, @@ -3041,18 +3070,22 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, if (force_nonblock && ret == -EAGAIN) { if (req->io) return -EAGAIN; - if (io_alloc_async_ctx(req)) + if (io_alloc_async_ctx(req)) { + if (kmsg && kmsg->iov != kmsg->fast_iov) + kfree(kmsg->iov); return -ENOMEM; + } + req->flags |= REQ_F_NEED_CLEANUP; memcpy(&req->io->msg, &io.msg, sizeof(io.msg)); - req->work.func = io_sendrecv_async; return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; } - if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov) + if (kmsg && kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); + req->flags &= ~REQ_F_NEED_CLEANUP; io_cqring_add_event(req, ret); if (ret < 0) req_set_fail_links(req); @@ -3120,6 +3153,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, #if defined(CONFIG_NET) struct io_sr_msg *sr = &req->sr_msg; struct io_async_ctx *io = req->io; + int ret; sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -3127,10 +3161,16 @@ static int io_recvmsg_prep(struct io_kiocb *req, if (!io || req->opcode == IORING_OP_RECV) return 0; + /* iovec is already imported */ + if (req->flags & REQ_F_NEED_CLEANUP) + return 0; io->msg.iov = io->msg.fast_iov; - return recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, + ret = recvmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, &io->msg.uaddr, &io->msg.iov); + if (!ret) + req->flags |= REQ_F_NEED_CLEANUP; + return ret; #else return -EOPNOTSUPP; #endif @@ -3150,12 +3190,11 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, sock = sock_from_file(req->file, &ret); if (sock) { struct io_async_ctx io; - struct sockaddr_storage addr; unsigned flags; if (req->io) { kmsg = &req->io->msg; - kmsg->msg.msg_name = &addr; + kmsg->msg.msg_name = &req->io->msg.addr; /* if iov is set, it's allocated already */ if (!kmsg->iov) kmsg->iov = kmsg->fast_iov; @@ -3164,7 +3203,7 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, struct io_sr_msg *sr = &req->sr_msg; kmsg = &io.msg; - kmsg->msg.msg_name = &addr; + kmsg->msg.msg_name = &io.msg.addr; io.msg.iov = io.msg.fast_iov; ret = recvmsg_copy_msghdr(&io.msg.msg, sr->msg, @@ -3185,18 +3224,22 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, if (force_nonblock && ret == -EAGAIN) { if (req->io) return -EAGAIN; - if (io_alloc_async_ctx(req)) + if (io_alloc_async_ctx(req)) { + if (kmsg && kmsg->iov != kmsg->fast_iov) + kfree(kmsg->iov); return -ENOMEM; + } memcpy(&req->io->msg, &io.msg, sizeof(io.msg)); - req->work.func = io_sendrecv_async; + req->flags |= REQ_F_NEED_CLEANUP; return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; } - if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov) + if (kmsg && kmsg->iov != kmsg->fast_iov) kfree(kmsg->iov); + req->flags &= ~REQ_F_NEED_CLEANUP; io_cqring_add_event(req, ret); if (ret < 0) req_set_fail_links(req); @@ -4207,6 +4250,35 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EIOCBQUEUED; } +static void io_cleanup_req(struct io_kiocb *req) +{ + struct io_async_ctx *io = req->io; + + switch (req->opcode) { + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + case IORING_OP_READ: + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: + if (io->rw.iov != io->rw.fast_iov) + kfree(io->rw.iov); + break; + case IORING_OP_SENDMSG: + case IORING_OP_RECVMSG: + if (io->msg.iov != io->msg.fast_iov) + kfree(io->msg.iov); + break; + case IORING_OP_OPENAT: + case IORING_OP_OPENAT2: + case IORING_OP_STATX: + putname(req->open.filename); + break; + } + + req->flags &= ~REQ_F_NEED_CLEANUP; +} + static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_kiocb **nxt, bool force_nonblock) { @@ -4446,7 +4518,6 @@ static void io_wq_submit_work(struct io_wq_work **workptr) } if (!ret) { - req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0; req->in_async = true; do { ret = io_issue_sqe(req, NULL, &nxt, false); @@ -4479,7 +4550,7 @@ static int io_req_needs_file(struct io_kiocb *req, int fd) { if (!io_op_defs[req->opcode].needs_file) return 0; - if (fd == -1 && io_op_defs[req->opcode].fd_non_neg) + if ((fd == -1 || fd == AT_FDCWD) && io_op_defs[req->opcode].fd_non_neg) return 0; return 1; } @@ -4950,6 +5021,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, for (i = 0; i < nr; i++) { const struct io_uring_sqe *sqe; struct io_kiocb *req; + int err; req = io_get_req(ctx, statep); if (unlikely(!req)) { @@ -4966,20 +5038,23 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, submitted++; if (unlikely(req->opcode >= IORING_OP_LAST)) { - io_cqring_add_event(req, -EINVAL); + err = -EINVAL; +fail_req: + io_cqring_add_event(req, err); io_double_put_req(req); break; } if (io_op_defs[req->opcode].needs_mm && !*mm) { mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm); - if (!mm_fault) { - use_mm(ctx->sqo_mm); - *mm = ctx->sqo_mm; + if (unlikely(mm_fault)) { + err = -EFAULT; + goto fail_req; } + use_mm(ctx->sqo_mm); + *mm = ctx->sqo_mm; } - req->has_user = *mm != NULL; req->in_async = async; req->needs_fixed_file = async; trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, @@ -6301,7 +6376,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head != ctx->rings->sq_ring_entries) mask |= EPOLLOUT | EPOLLWRNORM; - if (READ_ONCE(ctx->rings->cq.head) != ctx->cached_cq_tail) + if (io_cqring_events(ctx, false)) mask |= EPOLLIN | EPOLLRDNORM; return mask; @@ -6393,6 +6468,29 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, if (!cancel_req) break; + if (cancel_req->flags & REQ_F_OVERFLOW) { + spin_lock_irq(&ctx->completion_lock); + list_del(&cancel_req->list); + cancel_req->flags &= ~REQ_F_OVERFLOW; + if (list_empty(&ctx->cq_overflow_list)) { + clear_bit(0, &ctx->sq_check_overflow); + clear_bit(0, &ctx->cq_check_overflow); + } + spin_unlock_irq(&ctx->completion_lock); + + WRITE_ONCE(ctx->rings->cq_overflow, + atomic_inc_return(&ctx->cached_cq_overflow)); + + /* + * Put inflight ref and overflow ref. If that's + * all we had, then we're done with this request. + */ + if (refcount_sub_and_test(2, &cancel_req->refs)) { + io_put_req(cancel_req); + continue; + } + } + io_wq_cancel_work(ctx->io_wq, &cancel_req->work); io_put_req(cancel_req); schedule(); @@ -6405,6 +6503,13 @@ static int io_uring_flush(struct file *file, void *data) struct io_ring_ctx *ctx = file->private_data; io_uring_cancel_files(ctx, data); + + /* + * If the task is going away, cancel work it may have pending + */ + if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) + io_wq_cancel_pid(ctx->io_wq, task_pid_vnr(current)); + return 0; } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 2494095e0340..27373f5792a4 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -976,29 +976,33 @@ restart_loop: * it. */ /* - * A buffer which has been freed while still being journaled by - * a previous transaction. - */ - if (buffer_freed(bh)) { + * A buffer which has been freed while still being journaled + * by a previous transaction, refile the buffer to BJ_Forget of + * the running transaction. If the just committed transaction + * contains "add to orphan" operation, we can completely + * invalidate the buffer now. We are rather through in that + * since the buffer may be still accessible when blocksize < + * pagesize and it is attached to the last partial page. + */ + if (buffer_freed(bh) && !jh->b_next_transaction) { + struct address_space *mapping; + + clear_buffer_freed(bh); + clear_buffer_jbddirty(bh); + /* - * If the running transaction is the one containing - * "add to orphan" operation (b_next_transaction != - * NULL), we have to wait for that transaction to - * commit before we can really get rid of the buffer. - * So just clear b_modified to not confuse transaction - * credit accounting and refile the buffer to - * BJ_Forget of the running transaction. If the just - * committed transaction contains "add to orphan" - * operation, we can completely invalidate the buffer - * now. We are rather through in that since the - * buffer may be still accessible when blocksize < - * pagesize and it is attached to the last partial - * page. + * Block device buffers need to stay mapped all the + * time, so it is enough to clear buffer_jbddirty and + * buffer_freed bits. For the file mapping buffers (i.e. + * journalled data) we need to unmap buffer and clear + * more bits. We also need to be careful about the check + * because the data page mapping can get cleared under + * out hands, which alse need not to clear more bits + * because the page and buffers will be freed and can + * never be reused once we are done with them. */ - jh->b_modified = 0; - if (!jh->b_next_transaction) { - clear_buffer_freed(bh); - clear_buffer_jbddirty(bh); + mapping = READ_ONCE(bh->b_page->mapping); + if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) { clear_buffer_mapped(bh); clear_buffer_new(bh); clear_buffer_req(bh); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e77a5a0b4e46..2dd848a743ed 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2329,14 +2329,16 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, return -EBUSY; } /* - * OK, buffer won't be reachable after truncate. We just set - * j_next_transaction to the running transaction (if there is - * one) and mark buffer as freed so that commit code knows it - * should clear dirty bits when it is done with the buffer. + * OK, buffer won't be reachable after truncate. We just clear + * b_modified to not confuse transaction credit accounting, and + * set j_next_transaction to the running transaction (if there + * is one) and mark buffer as freed so that commit code knows + * it should clear dirty bits when it is done with the buffer. */ set_buffer_freed(bh); if (journal->j_running_transaction && buffer_jbddirty(bh)) jh->b_next_transaction = journal->j_running_transaction; + jh->b_modified = 0; spin_unlock(&journal->j_list_lock); spin_unlock(&jh->b_state_lock); write_unlock(&journal->j_state_lock); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 4a841071d8a7..1865322de142 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -42,13 +42,27 @@ static void nfs_mark_delegation_revoked(struct nfs_delegation *delegation) if (!test_and_set_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) { delegation->stateid.type = NFS4_INVALID_STATEID_TYPE; atomic_long_dec(&nfs_active_delegations); + if (!test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) + nfs_clear_verifier_delegated(delegation->inode); } } +static struct nfs_delegation *nfs_get_delegation(struct nfs_delegation *delegation) +{ + refcount_inc(&delegation->refcount); + return delegation; +} + +static void nfs_put_delegation(struct nfs_delegation *delegation) +{ + if (refcount_dec_and_test(&delegation->refcount)) + __nfs_free_delegation(delegation); +} + static void nfs_free_delegation(struct nfs_delegation *delegation) { nfs_mark_delegation_revoked(delegation); - __nfs_free_delegation(delegation); + nfs_put_delegation(delegation); } /** @@ -241,13 +255,18 @@ void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred, static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) { + const struct cred *cred; int res = 0; - if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) - res = nfs4_proc_delegreturn(inode, - delegation->cred, + if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) { + spin_lock(&delegation->lock); + cred = get_cred(delegation->cred); + spin_unlock(&delegation->lock); + res = nfs4_proc_delegreturn(inode, cred, &delegation->stateid, issync); + put_cred(cred); + } return res; } @@ -273,9 +292,13 @@ nfs_start_delegation_return_locked(struct nfs_inode *nfsi) if (delegation == NULL) goto out; spin_lock(&delegation->lock); - if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) - ret = delegation; + if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { + /* Refcount matched in nfs_end_delegation_return() */ + ret = nfs_get_delegation(delegation); + } spin_unlock(&delegation->lock); + if (ret) + nfs_clear_verifier_delegated(&nfsi->vfs_inode); out: return ret; } @@ -393,6 +416,7 @@ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred, if (delegation == NULL) return -ENOMEM; nfs4_stateid_copy(&delegation->stateid, stateid); + refcount_set(&delegation->refcount, 1); delegation->type = type; delegation->pagemod_limit = pagemod_limit; delegation->change_attr = inode_peek_iversion_raw(inode); @@ -492,6 +516,8 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation err = nfs_do_return_delegation(inode, delegation, issync); out: + /* Refcount matched in nfs_start_delegation_return_locked() */ + nfs_put_delegation(delegation); return err; } @@ -686,9 +712,12 @@ void nfs4_inode_return_delegation_on_close(struct inode *inode) list_empty(&NFS_I(inode)->open_files) && !test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags); - ret = delegation; + /* Refcount matched in nfs_end_delegation_return() */ + ret = nfs_get_delegation(delegation); } spin_unlock(&delegation->lock); + if (ret) + nfs_clear_verifier_delegated(inode); } out: rcu_read_unlock(); @@ -1088,10 +1117,11 @@ restart: delegation = nfs_start_delegation_return_locked(NFS_I(inode)); rcu_read_unlock(); if (delegation != NULL) { - delegation = nfs_detach_delegation(NFS_I(inode), - delegation, server); - if (delegation != NULL) + if (nfs_detach_delegation(NFS_I(inode), delegation, + server) != NULL) nfs_free_delegation(delegation); + /* Match nfs_start_delegation_return_locked */ + nfs_put_delegation(delegation); } iput(inode); nfs_sb_deactive(server->super); diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 31b84604d383..9b00a0b7f832 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -22,6 +22,7 @@ struct nfs_delegation { unsigned long pagemod_limit; __u64 change_attr; unsigned long flags; + refcount_t refcount; spinlock_t lock; struct rcu_head rcu; }; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 1320288ff9ec..193d6fb363b7 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -155,6 +155,7 @@ typedef struct { loff_t current_index; decode_dirent_t decode; + unsigned long dir_verifier; unsigned long timestamp; unsigned long gencount; unsigned int cache_entry_index; @@ -353,6 +354,7 @@ int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, again: timestamp = jiffies; gencount = nfs_inc_attr_generation_counter(); + desc->dir_verifier = nfs_save_change_attribute(inode); error = NFS_PROTO(inode)->readdir(file_dentry(file), cred, entry->cookie, pages, NFS_SERVER(inode)->dtsize, desc->plus); if (error < 0) { @@ -455,13 +457,13 @@ void nfs_force_use_readdirplus(struct inode *dir) } static -void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) +void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry, + unsigned long dir_verifier) { struct qstr filename = QSTR_INIT(entry->name, entry->len); DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); struct dentry *dentry; struct dentry *alias; - struct inode *dir = d_inode(parent); struct inode *inode; int status; @@ -500,7 +502,7 @@ again: if (nfs_same_file(dentry, entry)) { if (!entry->fh->size) goto out; - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + nfs_set_verifier(dentry, dir_verifier); status = nfs_refresh_inode(d_inode(dentry), entry->fattr); if (!status) nfs_setsecurity(d_inode(dentry), entry->fattr, entry->label); @@ -526,7 +528,7 @@ again: dput(dentry); dentry = alias; } - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + nfs_set_verifier(dentry, dir_verifier); out: dput(dentry); } @@ -564,7 +566,8 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en count++; if (desc->plus) - nfs_prime_dcache(file_dentry(desc->file), entry); + nfs_prime_dcache(file_dentry(desc->file), entry, + desc->dir_verifier); status = nfs_readdir_add_to_array(entry, page); if (status != 0) @@ -983,14 +986,113 @@ static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end, * full lookup on all child dentries of 'dir' whenever a change occurs * on the server that might have invalidated our dcache. * + * Note that we reserve bit '0' as a tag to let us know when a dentry + * was revalidated while holding a delegation on its inode. + * * The caller should be holding dir->i_lock */ void nfs_force_lookup_revalidate(struct inode *dir) { - NFS_I(dir)->cache_change_attribute++; + NFS_I(dir)->cache_change_attribute += 2; } EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate); +/** + * nfs_verify_change_attribute - Detects NFS remote directory changes + * @dir: pointer to parent directory inode + * @verf: previously saved change attribute + * + * Return "false" if the verifiers doesn't match the change attribute. + * This would usually indicate that the directory contents have changed on + * the server, and that any dentries need revalidating. + */ +static bool nfs_verify_change_attribute(struct inode *dir, unsigned long verf) +{ + return (verf & ~1UL) == nfs_save_change_attribute(dir); +} + +static void nfs_set_verifier_delegated(unsigned long *verf) +{ + *verf |= 1UL; +} + +#if IS_ENABLED(CONFIG_NFS_V4) +static void nfs_unset_verifier_delegated(unsigned long *verf) +{ + *verf &= ~1UL; +} +#endif /* IS_ENABLED(CONFIG_NFS_V4) */ + +static bool nfs_test_verifier_delegated(unsigned long verf) +{ + return verf & 1; +} + +static bool nfs_verifier_is_delegated(struct dentry *dentry) +{ + return nfs_test_verifier_delegated(dentry->d_time); +} + +static void nfs_set_verifier_locked(struct dentry *dentry, unsigned long verf) +{ + struct inode *inode = d_inode(dentry); + + if (!nfs_verifier_is_delegated(dentry) && + !nfs_verify_change_attribute(d_inode(dentry->d_parent), verf)) + goto out; + if (inode && NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) + nfs_set_verifier_delegated(&verf); +out: + dentry->d_time = verf; +} + +/** + * nfs_set_verifier - save a parent directory verifier in the dentry + * @dentry: pointer to dentry + * @verf: verifier to save + * + * Saves the parent directory verifier in @dentry. If the inode has + * a delegation, we also tag the dentry as having been revalidated + * while holding a delegation so that we know we don't have to + * look it up again after a directory change. + */ +void nfs_set_verifier(struct dentry *dentry, unsigned long verf) +{ + + spin_lock(&dentry->d_lock); + nfs_set_verifier_locked(dentry, verf); + spin_unlock(&dentry->d_lock); +} +EXPORT_SYMBOL_GPL(nfs_set_verifier); + +#if IS_ENABLED(CONFIG_NFS_V4) +/** + * nfs_clear_verifier_delegated - clear the dir verifier delegation tag + * @inode: pointer to inode + * + * Iterates through the dentries in the inode alias list and clears + * the tag used to indicate that the dentry has been revalidated + * while holding a delegation. + * This function is intended for use when the delegation is being + * returned or revoked. + */ +void nfs_clear_verifier_delegated(struct inode *inode) +{ + struct dentry *alias; + + if (!inode) + return; + spin_lock(&inode->i_lock); + hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { + spin_lock(&alias->d_lock); + nfs_unset_verifier_delegated(&alias->d_time); + spin_unlock(&alias->d_lock); + } + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL_GPL(nfs_clear_verifier_delegated); +#endif /* IS_ENABLED(CONFIG_NFS_V4) */ + /* * A check for whether or not the parent directory has changed. * In the case it has, we assume that the dentries are untrustworthy @@ -1159,6 +1261,7 @@ nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry, struct nfs_fh *fhandle; struct nfs_fattr *fattr; struct nfs4_label *label; + unsigned long dir_verifier; int ret; ret = -ENOMEM; @@ -1168,6 +1271,7 @@ nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry, if (fhandle == NULL || fattr == NULL || IS_ERR(label)) goto out; + dir_verifier = nfs_save_change_attribute(dir); ret = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr, label); if (ret < 0) { switch (ret) { @@ -1188,7 +1292,7 @@ nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry, goto out; nfs_setsecurity(inode, fattr, label); - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + nfs_set_verifier(dentry, dir_verifier); /* set a readdirplus hint that we had a cache miss */ nfs_force_use_readdirplus(dir); @@ -1230,7 +1334,7 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, goto out_bad; } - if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ)) + if (nfs_verifier_is_delegated(dentry)) return nfs_lookup_revalidate_delegated(dir, dentry, inode); /* Force a full look up iff the parent directory has changed */ @@ -1415,6 +1519,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in struct nfs_fh *fhandle = NULL; struct nfs_fattr *fattr = NULL; struct nfs4_label *label = NULL; + unsigned long dir_verifier; int error; dfprintk(VFS, "NFS: lookup(%pd2)\n", dentry); @@ -1440,6 +1545,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in if (IS_ERR(label)) goto out; + dir_verifier = nfs_save_change_attribute(dir); trace_nfs_lookup_enter(dir, dentry, flags); error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr, label); if (error == -ENOENT) @@ -1463,7 +1569,7 @@ no_entry: goto out_label; dentry = res; } - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + nfs_set_verifier(dentry, dir_verifier); out_label: trace_nfs_lookup_exit(dir, dentry, flags, error); nfs4_label_free(label); @@ -1668,7 +1774,7 @@ nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, if (inode == NULL) goto full_reval; - if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ)) + if (nfs_verifier_is_delegated(dentry)) return nfs_lookup_revalidate_delegated(dir, dentry, inode); /* NFS only supports OPEN on regular files */ diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 1309e6f47f3d..11bf15800ac9 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -2114,6 +2114,7 @@ static void init_once(void *foo) init_rwsem(&nfsi->rmdir_sem); mutex_init(&nfsi->commit_mutex); nfs4_init_once(nfsi); + nfsi->cache_change_attribute = 0; } static int __init nfs_init_inodecache(void) diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index be4eb720d5b6..1297919e0fce 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -87,7 +87,6 @@ nfs4_file_open(struct inode *inode, struct file *filp) if (inode != d_inode(dentry)) goto out_drop; - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); nfs_file_set_open_context(filp, ctx); nfs_fscache_open_file(inode, filp); err = 0; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 95d07a3dc5d1..69b7ab7a5815 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2974,10 +2974,13 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, struct dentry *dentry; struct nfs4_state *state; fmode_t acc_mode = _nfs4_ctx_to_accessmode(ctx); + struct inode *dir = d_inode(opendata->dir); + unsigned long dir_verifier; unsigned int seq; int ret; seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); + dir_verifier = nfs_save_change_attribute(dir); ret = _nfs4_proc_open(opendata, ctx); if (ret != 0) @@ -3005,8 +3008,19 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, dput(ctx->dentry); ctx->dentry = dentry = alias; } - nfs_set_verifier(dentry, - nfs_save_change_attribute(d_inode(opendata->dir))); + } + + switch(opendata->o_arg.claim) { + default: + break; + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + case NFS4_OPEN_CLAIM_DELEGATE_PREV: + if (!opendata->rpc_done) + break; + if (opendata->o_res.delegation_type != 0) + dir_verifier = nfs_save_change_attribute(dir); + nfs_set_verifier(dentry, dir_verifier); } /* Parse layoutget results before we check for access */ @@ -5322,7 +5336,7 @@ static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr, hdr->timestamp = jiffies; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE]; - nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 1, 0); + nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0); nfs4_state_protect_write(server->nfs_client, clnt, msg, hdr); } diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3a688eb5c5ae..58e937be24ce 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -587,7 +587,7 @@ xfs_dax_writepages( xfs_iflags_clear(ip, XFS_ITRUNCATED); return dax_writeback_mapping_range(mapping, - xfs_inode_buftarg(ip)->bt_bdev, wbc); + xfs_inode_buftarg(ip)->bt_daxdev, wbc); } STATIC sector_t |