diff options
Diffstat (limited to 'fs')
45 files changed, 319 insertions, 241 deletions
diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c index df415c05939e..de1ae0bead3b 100644 --- a/fs/afs/addr_list.c +++ b/fs/afs/addr_list.c @@ -19,7 +19,7 @@ void afs_put_addrlist(struct afs_addr_list *alist) { if (alist && refcount_dec_and_test(&alist->usage)) - call_rcu(&alist->rcu, (rcu_callback_t)kfree); + kfree_rcu(alist, rcu); } /* diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 1d81fc4c3058..35f951ac296f 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -81,7 +81,7 @@ enum afs_call_state { * List of server addresses. */ struct afs_addr_list { - struct rcu_head rcu; /* Must be first */ + struct rcu_head rcu; refcount_t usage; u32 version; /* Version */ unsigned char max_addrs; diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 404e050ce8ee..7f09147872dc 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -856,9 +856,9 @@ static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags) found_raid1c34 = true; up_read(&sinfo->groups_sem); } - if (found_raid56) + if (!found_raid56) btrfs_clear_fs_incompat(fs_info, RAID56); - if (found_raid1c34) + if (!found_raid1c34) btrfs_clear_fs_incompat(fs_info, RAID1C34); } } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1ccb3f8d528d..d267eb5caa7b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7783,6 +7783,7 @@ static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode, { struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio); + u16 csum_size; blk_status_t ret; /* @@ -7802,7 +7803,8 @@ static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode, file_offset -= dip->logical_offset; file_offset >>= inode->i_sb->s_blocksize_bits; - io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset); + csum_size = btrfs_super_csum_size(btrfs_sb(inode->i_sb)->super_copy); + io_bio->csum = orig_io_bio->csum + csum_size * file_offset; return 0; } @@ -9494,6 +9496,10 @@ out_fail: ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx); if (ret) commit_transaction = true; + } else if (sync_log) { + mutex_lock(&root->log_mutex); + list_del(&ctx.list); + mutex_unlock(&root->log_mutex); } if (commit_transaction) { ret = btrfs_commit_transaction(trans); diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 606f26d862dc..cc3ada12848d 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -324,6 +324,8 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) if (full_path == NULL) goto cdda_exit; + convert_delimiter(full_path, '\\'); + cifs_dbg(FYI, "%s: full_path: %s\n", __func__, full_path); if (!cifs_sb_master_tlink(cifs_sb)) { diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 46ebaf3f0824..fa77fe5258b0 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -530,6 +530,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) if (tcon->seal) seq_puts(s, ",seal"); + else if (tcon->ses->server->ignore_signature) + seq_puts(s, ",signloosely"); if (tcon->nocase) seq_puts(s, ",nocase"); if (tcon->local_lease) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index de82cfa44b1a..0d956360e984 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1281,6 +1281,7 @@ struct cifs_fid { __u64 volatile_fid; /* volatile file id for smb2 */ __u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for smb2 */ __u8 create_guid[16]; + __u32 access; struct cifs_pending_open *pending_open; unsigned int epoch; #ifdef CONFIG_CIFS_DEBUG2 @@ -1741,6 +1742,12 @@ static inline bool is_retryable_error(int error) return false; } + +/* cifs_get_writable_file() flags */ +#define FIND_WR_ANY 0 +#define FIND_WR_FSUID_ONLY 1 +#define FIND_WR_WITH_DELETE 2 + #define MID_FREE 0 #define MID_REQUEST_ALLOCATED 1 #define MID_REQUEST_SUBMITTED 2 diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 89eaaf46d1ca..e5cb681ec138 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -134,11 +134,12 @@ extern bool backup_cred(struct cifs_sb_info *); extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, unsigned int bytes_written); -extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool); +extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, int); extern int cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, - bool fsuid_only, + int flags, struct cifsFileInfo **ret_file); extern int cifs_get_writable_path(struct cifs_tcon *tcon, const char *name, + int flags, struct cifsFileInfo **ret_file); extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); extern int cifs_get_readable_path(struct cifs_tcon *tcon, const char *name, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 3c89569e7210..6f6fb3606a5d 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1492,6 +1492,7 @@ openRetry: *oplock = rsp->OplockLevel; /* cifs fid stays in le */ oparms->fid->netfid = rsp->Fid; + oparms->fid->access = desired_access; /* Let caller know file was created so we can set the mode. */ /* Do we care about the CreateAction in any other cases? */ @@ -2115,7 +2116,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata) wdata2->tailsz = tailsz; wdata2->bytes = cur_len; - rc = cifs_get_writable_file(CIFS_I(inode), false, + rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY, &wdata2->cfile); if (!wdata2->cfile) { cifs_dbg(VFS, "No writable handle to retry writepages rc=%d\n", diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 0ef099442f20..36e7b2fd2190 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -555,7 +555,6 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, if (server->ops->close) server->ops->close(xid, tcon, &fid); cifs_del_pending_open(&open); - fput(file); rc = -ENOMEM; } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index bc9516ab4b34..8f9d849a0012 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1169,7 +1169,8 @@ try_again: rc = posix_lock_file(file, flock, NULL); up_write(&cinode->lock_sem); if (rc == FILE_LOCK_DEFERRED) { - rc = wait_event_interruptible(flock->fl_wait, !flock->fl_blocker); + rc = wait_event_interruptible(flock->fl_wait, + list_empty(&flock->fl_blocked_member)); if (!rc) goto try_again; locks_delete_block(flock); @@ -1958,7 +1959,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, /* Return -EBADF if no handle is found and general rc otherwise */ int -cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only, +cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, int flags, struct cifsFileInfo **ret_file) { struct cifsFileInfo *open_file, *inv_file = NULL; @@ -1966,7 +1967,8 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only, bool any_available = false; int rc = -EBADF; unsigned int refind = 0; - + bool fsuid_only = flags & FIND_WR_FSUID_ONLY; + bool with_delete = flags & FIND_WR_WITH_DELETE; *ret_file = NULL; /* @@ -1998,6 +2000,8 @@ refind_writable: continue; if (fsuid_only && !uid_eq(open_file->uid, current_fsuid())) continue; + if (with_delete && !(open_file->fid.access & DELETE)) + continue; if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) { if (!open_file->invalidHandle) { /* found a good writable file */ @@ -2045,12 +2049,12 @@ refind_writable: } struct cifsFileInfo * -find_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only) +find_writable_file(struct cifsInodeInfo *cifs_inode, int flags) { struct cifsFileInfo *cfile; int rc; - rc = cifs_get_writable_file(cifs_inode, fsuid_only, &cfile); + rc = cifs_get_writable_file(cifs_inode, flags, &cfile); if (rc) cifs_dbg(FYI, "couldn't find writable handle rc=%d", rc); @@ -2059,6 +2063,7 @@ find_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only) int cifs_get_writable_path(struct cifs_tcon *tcon, const char *name, + int flags, struct cifsFileInfo **ret_file) { struct list_head *tmp; @@ -2085,7 +2090,7 @@ cifs_get_writable_path(struct cifs_tcon *tcon, const char *name, kfree(full_path); cinode = CIFS_I(d_inode(cfile->dentry)); spin_unlock(&tcon->open_file_lock); - return cifs_get_writable_file(cinode, 0, ret_file); + return cifs_get_writable_file(cinode, flags, ret_file); } spin_unlock(&tcon->open_file_lock); @@ -2162,7 +2167,8 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) if (mapping->host->i_size - offset < (loff_t)to) to = (unsigned)(mapping->host->i_size - offset); - rc = cifs_get_writable_file(CIFS_I(mapping->host), false, &open_file); + rc = cifs_get_writable_file(CIFS_I(mapping->host), FIND_WR_ANY, + &open_file); if (!rc) { bytes_written = cifs_write(open_file, open_file->pid, write_data, to - from, &offset); @@ -2355,7 +2361,7 @@ retry: if (cfile) cifsFileInfo_put(cfile); - rc = cifs_get_writable_file(CIFS_I(inode), false, &cfile); + rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY, &cfile); /* in case of an error store it to return later */ if (rc) diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index b5e6635c578e..b16f8d23e97b 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -653,8 +653,8 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, */ if ((fattr->cf_nlink < 1) && !tcon->unix_ext && !info->DeletePending) { - cifs_dbg(1, "bogus file nlink value %u\n", - fattr->cf_nlink); + cifs_dbg(VFS, "bogus file nlink value %u\n", + fattr->cf_nlink); fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK; } } @@ -2073,6 +2073,7 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry) struct inode *inode = d_inode(dentry); struct super_block *sb = dentry->d_sb; char *full_path = NULL; + int count = 0; if (inode == NULL) return -ENOENT; @@ -2094,15 +2095,18 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry) full_path, inode, inode->i_count.counter, dentry, cifs_get_time(dentry), jiffies); +again: if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext) rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid); else rc = cifs_get_inode_info(&inode, full_path, NULL, sb, xid, NULL); - + if (rc == -EAGAIN && count++ < 10) + goto again; out: kfree(full_path); free_xid(xid); + return rc; } @@ -2187,7 +2191,7 @@ int cifs_getattr(const struct path *path, struct kstat *stat, if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)) stat->gid = current_fsgid(); } - return rc; + return 0; } int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start, @@ -2278,7 +2282,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, * writebehind data than the SMB timeout for the SetPathInfo * request would allow */ - open_file = find_writable_file(cifsInode, true); + open_file = find_writable_file(cifsInode, FIND_WR_FSUID_ONLY); if (open_file) { tcon = tlink_tcon(open_file->tlink); server = tcon->ses->server; @@ -2428,7 +2432,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) args->ctime = NO_CHANGE_64; args->device = 0; - open_file = find_writable_file(cifsInode, true); + open_file = find_writable_file(cifsInode, FIND_WR_FSUID_ONLY); if (open_file) { u16 nfid = open_file->fid.netfid; u32 npid = open_file->pid; @@ -2531,7 +2535,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) rc = 0; if (attrs->ia_valid & ATTR_MTIME) { - rc = cifs_get_writable_file(cifsInode, false, &wfile); + rc = cifs_get_writable_file(cifsInode, FIND_WR_ANY, &wfile); if (!rc) { tcon = tlink_tcon(wfile->tlink); rc = tcon->ses->server->ops->flush(xid, tcon, &wfile->fid); diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index eb994e313c6a..b130efaf8feb 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -766,7 +766,7 @@ smb_set_file_info(struct inode *inode, const char *full_path, struct cifs_tcon *tcon; /* if the file is already open for write, just use that fileid */ - open_file = find_writable_file(cinode, true); + open_file = find_writable_file(cinode, FIND_WR_FSUID_ONLY); if (open_file) { fid.netfid = open_file->fid.netfid; netpid = open_file->pid; diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 1cf207564ff9..a8c301ae00ed 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -521,7 +521,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name, cifs_i = CIFS_I(inode); dosattrs = cifs_i->cifsAttrs | ATTR_READONLY; data.Attributes = cpu_to_le32(dosattrs); - cifs_get_writable_path(tcon, name, &cfile); + cifs_get_writable_path(tcon, name, FIND_WR_ANY, &cfile); tmprc = smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, CREATE_NOT_FILE, ACL_NO_MODE, @@ -577,7 +577,7 @@ smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon, { struct cifsFileInfo *cfile; - cifs_get_writable_path(tcon, from_name, &cfile); + cifs_get_writable_path(tcon, from_name, FIND_WR_WITH_DELETE, &cfile); return smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb, DELETE, SMB2_OP_RENAME, cfile); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index e47190cae163..cfe9b800ea8c 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1364,6 +1364,7 @@ smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock) cfile->fid.persistent_fid = fid->persistent_fid; cfile->fid.volatile_fid = fid->volatile_fid; + cfile->fid.access = fid->access; #ifdef CONFIG_CIFS_DEBUG2 cfile->fid.mid = fid->mid; #endif /* CIFS_DEBUG2 */ @@ -2221,6 +2222,8 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, goto qdf_free; } + atomic_inc(&tcon->num_remote_opens); + qd_rsp = (struct smb2_query_directory_rsp *)rsp_iov[1].iov_base; if (qd_rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) { trace_smb3_query_dir_done(xid, fid->persistent_fid, @@ -3327,7 +3330,7 @@ static loff_t smb3_llseek(struct file *file, struct cifs_tcon *tcon, loff_t offs * some servers (Windows2016) will not reflect recent writes in * QUERY_ALLOCATED_RANGES until SMB2_flush is called. */ - wrcfile = find_writable_file(cifsi, false); + wrcfile = find_writable_file(cifsi, FIND_WR_ANY); if (wrcfile) { filemap_write_and_wait(inode->i_mapping); smb2_flush_file(xid, tcon, &wrcfile->fid); @@ -3416,7 +3419,7 @@ static int smb3_fiemap(struct cifs_tcon *tcon, if (rc) goto out; - if (out_data_len < sizeof(struct file_allocated_range_buffer)) { + if (out_data_len && out_data_len < sizeof(struct file_allocated_range_buffer)) { rc = -EINVAL; goto out; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 1234f9ccab03..28c0be5e69b7 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2771,6 +2771,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, atomic_inc(&tcon->num_remote_opens); oparms->fid->persistent_fid = rsp->PersistentFileId; oparms->fid->volatile_fid = rsp->VolatileFileId; + oparms->fid->access = oparms->desired_access; #ifdef CONFIG_CIFS_DEBUG2 oparms->fid->mid = le64_to_cpu(rsp->sync_hdr.MessageId); #endif /* CIFS_DEBUG2 */ diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 65cb09fa6ead..08c9f216a54d 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -539,6 +539,15 @@ int fscrypt_drop_inode(struct inode *inode) mk = ci->ci_master_key->payload.data[0]; /* + * With proper, non-racy use of FS_IOC_REMOVE_ENCRYPTION_KEY, all inodes + * protected by the key were cleaned by sync_filesystem(). But if + * userspace is still using the files, inodes can be dirtied between + * then and now. We mustn't lose any writes, so skip dirty inodes here. + */ + if (inode->i_state & I_DIRTY_ALL) + return 0; + + /* * Note: since we aren't holding ->mk_secret_sem, the result here can * immediately become outdated. But there's no correctness problem with * unnecessarily evicting. Nor is there a correctness problem with not diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 634b09d18b77..db987b5110a9 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -1090,21 +1090,12 @@ static const struct file_operations fops_regset32 = { * This function creates a file in debugfs with the given name that reports * the names and values of a set of 32-bit registers. If the @mode variable * is so set it can be read from. Writing is not supported. - * - * This function will return a pointer to a dentry if it succeeds. This - * pointer must be passed to the debugfs_remove() function when the file is - * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be - * returned. - * - * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will - * be returned. */ -struct dentry *debugfs_create_regset32(const char *name, umode_t mode, - struct dentry *parent, - struct debugfs_regset32 *regset) +void debugfs_create_regset32(const char *name, umode_t mode, + struct dentry *parent, + struct debugfs_regset32 *regset) { - return debugfs_create_file(name, mode, parent, regset, &fops_regset32); + debugfs_create_file(name, mode, parent, regset, &fops_regset32); } EXPORT_SYMBOL_GPL(debugfs_create_regset32); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index b041b66002db..eee3c92a9ebf 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1854,9 +1854,9 @@ fetch_events: waiter = true; init_waitqueue_entry(&wait, current); - spin_lock_irq(&ep->wq.lock); + write_lock_irq(&ep->lock); __add_wait_queue_exclusive(&ep->wq, &wait); - spin_unlock_irq(&ep->wq.lock); + write_unlock_irq(&ep->lock); } for (;;) { @@ -1904,9 +1904,9 @@ send_events: goto fetch_events; if (waiter) { - spin_lock_irq(&ep->wq.lock); + write_lock_irq(&ep->lock); __remove_wait_queue(&ep->wq, &wait); - spin_unlock_irq(&ep->wq.lock); + write_unlock_irq(&ep->lock); } return res; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ff1b764b0c0e..0c7c4adb664e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2391,7 +2391,7 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct flex_groups **old_groups, **new_groups; - int size, i; + int size, i, j; if (!sbi->s_log_groups_per_flex) return 0; @@ -2412,8 +2412,8 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) sizeof(struct flex_groups)), GFP_KERNEL); if (!new_groups[i]) { - for (i--; i >= sbi->s_flex_groups_allocated; i--) - kvfree(new_groups[i]); + for (j = sbi->s_flex_groups_allocated; j < i; j++) + kvfree(new_groups[j]); kvfree(new_groups); ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups", size); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 594b05ae16c9..71946da84388 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -750,6 +750,13 @@ static struct inode *fat_alloc_inode(struct super_block *sb) return NULL; init_rwsem(&ei->truncate_lock); + /* Zeroing to allow iput() even if partial initialized inode. */ + ei->mmu_private = 0; + ei->i_start = 0; + ei->i_logstart = 0; + ei->i_attrs = 0; + ei->i_pos = 0; + return &ei->vfs_inode; } @@ -1374,16 +1381,6 @@ out: return 0; } -static void fat_dummy_inode_init(struct inode *inode) -{ - /* Initialize this dummy inode to work as no-op. */ - MSDOS_I(inode)->mmu_private = 0; - MSDOS_I(inode)->i_start = 0; - MSDOS_I(inode)->i_logstart = 0; - MSDOS_I(inode)->i_attrs = 0; - MSDOS_I(inode)->i_pos = 0; -} - static int fat_read_root(struct inode *inode) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); @@ -1844,13 +1841,11 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, fat_inode = new_inode(sb); if (!fat_inode) goto out_fail; - fat_dummy_inode_init(fat_inode); sbi->fat_inode = fat_inode; fsinfo_inode = new_inode(sb); if (!fsinfo_inode) goto out_fail; - fat_dummy_inode_init(fsinfo_inode); fsinfo_inode->i_ino = MSDOS_FSINFO_INO; sbi->fsinfo_inode = fsinfo_inode; insert_inode_hash(fsinfo_inode); diff --git a/fs/fcntl.c b/fs/fcntl.c index 9bc167562ee8..2e4c0fa2074b 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -735,8 +735,9 @@ static void send_sigio_to_task(struct task_struct *p, return; switch (signum) { - kernel_siginfo_t si; - default: + default: { + kernel_siginfo_t si; + /* Queue a rt signal with the appropriate fd as its value. We use SI_SIGIO as the source, not SI_KERNEL, since kernel signals always get @@ -769,6 +770,7 @@ static void send_sigio_to_task(struct task_struct *p, si.si_fd = fd; if (!do_send_sig_info(signum, &si, p, type)) break; + } /* fall-through - fall back on the old plain SIGIO signal */ case 0: do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, type); diff --git a/fs/file.c b/fs/file.c index a364e1a9b7e8..c8a4e4c86e55 100644 --- a/fs/file.c +++ b/fs/file.c @@ -540,9 +540,14 @@ static int alloc_fd(unsigned start, unsigned flags) return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags); } +int __get_unused_fd_flags(unsigned flags, unsigned long nofile) +{ + return __alloc_fd(current->files, 0, nofile, flags); +} + int get_unused_fd_flags(unsigned flags) { - return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags); + return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE)); } EXPORT_SYMBOL(get_unused_fd_flags); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 8e02d76fe104..97eec7522bf2 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -276,12 +276,10 @@ static void flush_bg_queue(struct fuse_conn *fc) void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req) { struct fuse_iqueue *fiq = &fc->iq; - bool async; if (test_and_set_bit(FR_FINISHED, &req->flags)) goto put_request; - async = req->args->end; /* * test_and_set_bit() implies smp_mb() between bit * changing and below intr_entry check. Pairs with @@ -324,7 +322,7 @@ void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req) wake_up(&req->waitq); } - if (async) + if (test_bit(FR_ASYNC, &req->flags)) req->args->end(fc, req->args, req->out.h.error); put_request: fuse_put_request(fc, req); @@ -471,6 +469,8 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) req->in.h.opcode = args->opcode; req->in.h.nodeid = args->nodeid; req->args = args; + if (args->end) + __set_bit(FR_ASYNC, &req->flags); } ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index aa75e2305b75..ca344bf71404 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -301,6 +301,7 @@ struct fuse_io_priv { * FR_SENT: request is in userspace, waiting for an answer * FR_FINISHED: request is finished * FR_PRIVATE: request is on private list + * FR_ASYNC: request is asynchronous */ enum fuse_req_flag { FR_ISREPLY, @@ -314,6 +315,7 @@ enum fuse_req_flag { FR_SENT, FR_FINISHED, FR_PRIVATE, + FR_ASYNC, }; /** diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 2716d56ed0a0..8294851a9dd9 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1248,7 +1248,7 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry, if (!(file->f_mode & FMODE_OPENED)) return finish_no_open(file, d); dput(d); - return 0; + return excl && (flags & O_CREAT) ? -EEXIST : 0; } BUG_ON(d != NULL); diff --git a/fs/inode.c b/fs/inode.c index 7d57068b6b7a..93d9252a00ab 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -138,6 +138,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_sb = sb; inode->i_blkbits = sb->s_blocksize_bits; inode->i_flags = 0; + atomic64_set(&inode->i_sequence, 0); atomic_set(&inode->i_count, 1); inode->i_op = &empty_iops; inode->i_fop = &no_open_fops; diff --git a/fs/io-wq.c b/fs/io-wq.c index 0a5ab1a8f69a..5cef075c0b37 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -502,7 +502,7 @@ next: if (worker->mm) work->flags |= IO_WQ_WORK_HAS_MM; - if (wq->get_work && !(work->flags & IO_WQ_WORK_INTERNAL)) { + if (wq->get_work) { put_work = work; wq->get_work(work); } @@ -535,42 +535,23 @@ next: } while (1); } -static inline void io_worker_spin_for_work(struct io_wqe *wqe) -{ - int i = 0; - - while (++i < 1000) { - if (io_wqe_run_queue(wqe)) - break; - if (need_resched()) - break; - cpu_relax(); - } -} - static int io_wqe_worker(void *data) { struct io_worker *worker = data; struct io_wqe *wqe = worker->wqe; struct io_wq *wq = wqe->wq; - bool did_work; io_worker_start(wqe, worker); - did_work = false; while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { set_current_state(TASK_INTERRUPTIBLE); loop: - if (did_work) - io_worker_spin_for_work(wqe); spin_lock_irq(&wqe->lock); if (io_wqe_run_queue(wqe)) { __set_current_state(TASK_RUNNING); io_worker_handle_work(worker); - did_work = true; goto loop; } - did_work = false; /* drops the lock on success, retry */ if (__io_worker_idle(wqe, worker)) { __release(&wqe->lock); @@ -766,6 +747,17 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct, return true; } +static void io_run_cancel(struct io_wq_work *work) +{ + do { + struct io_wq_work *old_work = work; + + work->flags |= IO_WQ_WORK_CANCEL; + work->func(&work); + work = (work == old_work) ? NULL : work; + } while (work); +} + static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { struct io_wqe_acct *acct = io_work_get_acct(wqe, work); @@ -779,8 +771,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) * It's close enough to not be an issue, fork() has the same delay. */ if (unlikely(!io_wq_can_queue(wqe, acct, work))) { - work->flags |= IO_WQ_WORK_CANCEL; - work->func(&work); + io_run_cancel(work); return; } @@ -919,8 +910,7 @@ static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe, spin_unlock_irqrestore(&wqe->lock, flags); if (found) { - work->flags |= IO_WQ_WORK_CANCEL; - work->func(&work); + io_run_cancel(work); return IO_WQ_CANCEL_OK; } @@ -995,8 +985,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, spin_unlock_irqrestore(&wqe->lock, flags); if (found) { - work->flags |= IO_WQ_WORK_CANCEL; - work->func(&work); + io_run_cancel(work); return IO_WQ_CANCEL_OK; } @@ -1068,42 +1057,6 @@ enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid) return ret; } -struct io_wq_flush_data { - struct io_wq_work work; - struct completion done; -}; - -static void io_wq_flush_func(struct io_wq_work **workptr) -{ - struct io_wq_work *work = *workptr; - struct io_wq_flush_data *data; - - data = container_of(work, struct io_wq_flush_data, work); - complete(&data->done); -} - -/* - * Doesn't wait for previously queued work to finish. When this completes, - * it just means that previously queued work was started. - */ -void io_wq_flush(struct io_wq *wq) -{ - struct io_wq_flush_data data; - int node; - - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - - if (!node_online(node)) - continue; - init_completion(&data.done); - INIT_IO_WORK(&data.work, io_wq_flush_func); - data.work.flags |= IO_WQ_WORK_INTERNAL; - io_wqe_enqueue(wqe, &data.work); - wait_for_completion(&data.done); - } -} - struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) { int ret = -ENOMEM, node; diff --git a/fs/io-wq.h b/fs/io-wq.h index ccc7d84af57d..e5e15f2c93ec 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -8,7 +8,6 @@ enum { IO_WQ_WORK_HAS_MM = 2, IO_WQ_WORK_HASHED = 4, IO_WQ_WORK_UNBOUND = 32, - IO_WQ_WORK_INTERNAL = 64, IO_WQ_WORK_CB = 128, IO_WQ_WORK_NO_CANCEL = 256, IO_WQ_WORK_CONCURRENT = 512, @@ -79,16 +78,10 @@ struct io_wq_work { pid_t task_pid; }; -#define INIT_IO_WORK(work, _func) \ - do { \ - (work)->list.next = NULL; \ - (work)->func = _func; \ - (work)->files = NULL; \ - (work)->mm = NULL; \ - (work)->creds = NULL; \ - (work)->fs = NULL; \ - (work)->flags = 0; \ - } while (0) \ +#define INIT_IO_WORK(work, _func) \ + do { \ + *(work) = (struct io_wq_work){ .func = _func }; \ + } while (0) \ typedef void (get_work_fn)(struct io_wq_work *); typedef void (put_work_fn)(struct io_wq_work *); @@ -106,7 +99,6 @@ void io_wq_destroy(struct io_wq *wq); void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val); -void io_wq_flush(struct io_wq *wq); void io_wq_cancel_all(struct io_wq *wq); enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork); diff --git a/fs/io_uring.c b/fs/io_uring.c index de650df9ac53..3affd96a98ba 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -183,17 +183,12 @@ struct fixed_file_table { struct file **files; }; -enum { - FFD_F_ATOMIC, -}; - struct fixed_file_data { struct fixed_file_table *table; struct io_ring_ctx *ctx; struct percpu_ref refs; struct llist_head put_llist; - unsigned long state; struct work_struct ref_work; struct completion done; }; @@ -348,6 +343,7 @@ struct io_accept { struct sockaddr __user *addr; int __user *addr_len; int flags; + unsigned long nofile; }; struct io_sync { @@ -402,6 +398,7 @@ struct io_open { struct filename *filename; struct statx __user *buffer; struct open_how how; + unsigned long nofile; }; struct io_files_update { @@ -1004,6 +1001,7 @@ static void io_kill_timeout(struct io_kiocb *req) if (ret != -1) { atomic_inc(&req->ctx->cq_timeouts); list_del_init(&req->list); + req->flags |= REQ_F_COMP_LOCKED; io_cqring_fill_event(req, 0); io_put_req(req); } @@ -1483,10 +1481,10 @@ static void io_free_req(struct io_kiocb *req) __attribute__((nonnull)) static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr) { - io_req_find_next(req, nxtptr); - - if (refcount_dec_and_test(&req->refs)) + if (refcount_dec_and_test(&req->refs)) { + io_req_find_next(req, nxtptr); __io_free_req(req); + } } static void io_put_req(struct io_kiocb *req) @@ -1821,6 +1819,10 @@ static void io_iopoll_req_issued(struct io_kiocb *req) list_add(&req->list, &ctx->poll_list); else list_add_tail(&req->list, &ctx->poll_list); + + if ((ctx->flags & IORING_SETUP_SQPOLL) && + wq_has_sleeper(&ctx->sqo_wait)) + wake_up(&ctx->sqo_wait); } static void io_file_put(struct io_submit_state *state) @@ -2071,7 +2073,7 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, ssize_t ret; ret = import_single_range(rw, buf, sqe_len, *iovec, iter); *iovec = NULL; - return ret; + return ret < 0 ? ret : sqe_len; } if (req->io) { @@ -2577,6 +2579,7 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return ret; } + req->open.nofile = rlimit(RLIMIT_NOFILE); req->flags |= REQ_F_NEED_CLEANUP; return 0; } @@ -2618,6 +2621,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return ret; } + req->open.nofile = rlimit(RLIMIT_NOFILE); req->flags |= REQ_F_NEED_CLEANUP; return 0; } @@ -2636,7 +2640,7 @@ static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt, if (ret) goto err; - ret = get_unused_fd_flags(req->open.how.flags); + ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile); if (ret < 0) goto err; @@ -3002,6 +3006,11 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + sr->msg_flags |= MSG_CMSG_COMPAT; +#endif + if (!io || req->opcode == IORING_OP_SEND) return 0; /* iovec is already imported */ @@ -3154,6 +3163,11 @@ static int io_recvmsg_prep(struct io_kiocb *req, sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + sr->msg_flags |= MSG_CMSG_COMPAT; +#endif + if (!io || req->opcode == IORING_OP_RECV) return 0; /* iovec is already imported */ @@ -3311,6 +3325,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); accept->flags = READ_ONCE(sqe->accept_flags); + accept->nofile = rlimit(RLIMIT_NOFILE); return 0; #else return -EOPNOTSUPP; @@ -3327,7 +3342,8 @@ static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt, file_flags = force_nonblock ? O_NONBLOCK : 0; ret = __sys_accept4_file(req->file, file_flags, accept->addr, - accept->addr_len, accept->flags); + accept->addr_len, accept->flags, + accept->nofile); if (ret == -EAGAIN && force_nonblock) return -EAGAIN; if (ret == -ERESTARTSYS) @@ -4121,6 +4137,9 @@ static int io_req_defer_prep(struct io_kiocb *req, { ssize_t ret = 0; + if (!sqe) + return 0; + if (io_op_defs[req->opcode].file_table) { ret = io_grab_files(req); if (unlikely(ret)) @@ -4705,11 +4724,21 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_kiocb *linked_timeout; struct io_kiocb *nxt = NULL; + const struct cred *old_creds = NULL; int ret; again: linked_timeout = io_prep_linked_timeout(req); + if (req->work.creds && req->work.creds != current_cred()) { + if (old_creds) + revert_creds(old_creds); + if (old_creds == req->work.creds) + old_creds = NULL; /* restored original creds */ + else + old_creds = override_creds(req->work.creds); + } + ret = io_issue_sqe(req, sqe, &nxt, true); /* @@ -4735,7 +4764,7 @@ punt: err: /* drop submission reference */ - io_put_req(req); + io_put_req_find_next(req, &nxt); if (linked_timeout) { if (!ret) @@ -4759,6 +4788,8 @@ done_req: goto punt; goto again; } + if (old_creds) + revert_creds(old_creds); } static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -4803,7 +4834,6 @@ static inline void io_queue_link_head(struct io_kiocb *req) static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_submit_state *state, struct io_kiocb **link) { - const struct cred *old_creds = NULL; struct io_ring_ctx *ctx = req->ctx; unsigned int sqe_flags; int ret, id; @@ -4818,14 +4848,12 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, id = READ_ONCE(sqe->personality); if (id) { - const struct cred *personality_creds; - - personality_creds = idr_find(&ctx->personality_idr, id); - if (unlikely(!personality_creds)) { + req->work.creds = idr_find(&ctx->personality_idr, id); + if (unlikely(!req->work.creds)) { ret = -EINVAL; goto err_req; } - old_creds = override_creds(personality_creds); + get_cred(req->work.creds); } /* same numerical values with corresponding REQ_F_*, safe to copy */ @@ -4837,8 +4865,6 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, err_req: io_cqring_add_event(req, ret); io_double_put_req(req); - if (old_creds) - revert_creds(old_creds); return false; } @@ -4890,6 +4916,11 @@ err_req: if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { req->flags |= REQ_F_LINK; INIT_LIST_HEAD(&req->link_list); + + if (io_alloc_async_ctx(req)) { + ret = -EAGAIN; + goto err_req; + } ret = io_req_defer_prep(req, sqe); if (ret) req->flags |= REQ_F_FAIL_LINK; @@ -4899,8 +4930,6 @@ err_req: } } - if (old_creds) - revert_creds(old_creds); return true; } @@ -5081,9 +5110,8 @@ static int io_sq_thread(void *data) const struct cred *old_cred; mm_segment_t old_fs; DEFINE_WAIT(wait); - unsigned inflight; unsigned long timeout; - int ret; + int ret = 0; complete(&ctx->completions[1]); @@ -5091,39 +5119,19 @@ static int io_sq_thread(void *data) set_fs(USER_DS); old_cred = override_creds(ctx->creds); - ret = timeout = inflight = 0; + timeout = jiffies + ctx->sq_thread_idle; while (!kthread_should_park()) { unsigned int to_submit; - if (inflight) { + if (!list_empty(&ctx->poll_list)) { unsigned nr_events = 0; - if (ctx->flags & IORING_SETUP_IOPOLL) { - /* - * inflight is the count of the maximum possible - * entries we submitted, but it can be smaller - * if we dropped some of them. If we don't have - * poll entries available, then we know that we - * have nothing left to poll for. Reset the - * inflight count to zero in that case. - */ - mutex_lock(&ctx->uring_lock); - if (!list_empty(&ctx->poll_list)) - io_iopoll_getevents(ctx, &nr_events, 0); - else - inflight = 0; - mutex_unlock(&ctx->uring_lock); - } else { - /* - * Normal IO, just pretend everything completed. - * We don't have to poll completions for that. - */ - nr_events = inflight; - } - - inflight -= nr_events; - if (!inflight) + mutex_lock(&ctx->uring_lock); + if (!list_empty(&ctx->poll_list)) + io_iopoll_getevents(ctx, &nr_events, 0); + else timeout = jiffies + ctx->sq_thread_idle; + mutex_unlock(&ctx->uring_lock); } to_submit = io_sqring_entries(ctx); @@ -5152,7 +5160,7 @@ static int io_sq_thread(void *data) * more IO, we should wait for the application to * reap events and wake us up. */ - if (inflight || + if (!list_empty(&ctx->poll_list) || (!time_after(jiffies, timeout) && ret != -EBUSY && !percpu_ref_is_dying(&ctx->refs))) { cond_resched(); @@ -5162,6 +5170,19 @@ static int io_sq_thread(void *data) prepare_to_wait(&ctx->sqo_wait, &wait, TASK_INTERRUPTIBLE); + /* + * While doing polled IO, before going to sleep, we need + * to check if there are new reqs added to poll_list, it + * is because reqs may have been punted to io worker and + * will be added to poll_list later, hence check the + * poll_list again. + */ + if ((ctx->flags & IORING_SETUP_IOPOLL) && + !list_empty_careful(&ctx->poll_list)) { + finish_wait(&ctx->sqo_wait, &wait); + continue; + } + /* Tell userspace we may need a wakeup call */ ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP; /* make sure to read SQ tail after writing flags */ @@ -5189,8 +5210,7 @@ static int io_sq_thread(void *data) mutex_lock(&ctx->uring_lock); ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true); mutex_unlock(&ctx->uring_lock); - if (ret > 0) - inflight += ret; + timeout = jiffies + ctx->sq_thread_idle; } set_fs(old_fs); @@ -5324,6 +5344,23 @@ static void io_file_ref_kill(struct percpu_ref *ref) complete(&data->done); } +static void io_file_ref_exit_and_free(struct work_struct *work) +{ + struct fixed_file_data *data; + + data = container_of(work, struct fixed_file_data, ref_work); + + /* + * Ensure any percpu-ref atomic switch callback has run, it could have + * been in progress when the files were being unregistered. Once + * that's done, we can safely exit and free the ref and containing + * data structure. + */ + rcu_barrier(); + percpu_ref_exit(&data->refs); + kfree(data); +} + static int io_sqe_files_unregister(struct io_ring_ctx *ctx) { struct fixed_file_data *data = ctx->file_data; @@ -5336,14 +5373,14 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) flush_work(&data->ref_work); wait_for_completion(&data->done); io_ring_file_ref_flush(data); - percpu_ref_exit(&data->refs); __io_sqe_files_unregister(ctx); nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE); for (i = 0; i < nr_tables; i++) kfree(data->table[i].files); kfree(data->table); - kfree(data); + INIT_WORK(&data->ref_work, io_file_ref_exit_and_free); + queue_work(system_wq, &data->ref_work); ctx->file_data = NULL; ctx->nr_user_files = 0; return 0; @@ -5595,7 +5632,6 @@ static void io_ring_file_ref_switch(struct work_struct *work) data = container_of(work, struct fixed_file_data, ref_work); io_ring_file_ref_flush(data); - percpu_ref_get(&data->refs); percpu_ref_switch_to_percpu(&data->refs); } @@ -5771,8 +5807,13 @@ static void io_atomic_switch(struct percpu_ref *ref) { struct fixed_file_data *data; + /* + * Juggle reference to ensure we hit zero, if needed, so we can + * switch back to percpu mode + */ data = container_of(ref, struct fixed_file_data, refs); - clear_bit(FFD_F_ATOMIC, &data->state); + percpu_ref_put(&data->refs); + percpu_ref_get(&data->refs); } static bool io_queue_file_removal(struct fixed_file_data *data, @@ -5795,11 +5836,7 @@ static bool io_queue_file_removal(struct fixed_file_data *data, llist_add(&pfile->llist, &data->put_llist); if (pfile == &pfile_stack) { - if (!test_and_set_bit(FFD_F_ATOMIC, &data->state)) { - percpu_ref_put(&data->refs); - percpu_ref_switch_to_atomic(&data->refs, - io_atomic_switch); - } + percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch); wait_for_completion(&done); flush_work(&data->ref_work); return false; @@ -5873,10 +5910,8 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, up->offset++; } - if (ref_switch && !test_and_set_bit(FFD_F_ATOMIC, &data->state)) { - percpu_ref_put(&data->refs); + if (ref_switch) percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch); - } return done ? done : err; } @@ -6334,6 +6369,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) io_sqe_buffer_unregister(ctx); io_sqe_files_unregister(ctx); io_eventfd_unregister(ctx); + idr_destroy(&ctx->personality_idr); #if defined(CONFIG_UNIX) if (ctx->ring_sock) { @@ -6647,6 +6683,7 @@ out_fput: return submitted ? submitted : ret; } +#ifdef CONFIG_PROC_FS static int io_uring_show_cred(int id, void *p, void *data) { const struct cred *cred = p; @@ -6720,6 +6757,7 @@ static void io_uring_show_fdinfo(struct seq_file *m, struct file *f) percpu_ref_put(&ctx->refs); } } +#endif static const struct file_operations io_uring_fops = { .release = io_uring_release, @@ -6731,7 +6769,9 @@ static const struct file_operations io_uring_fops = { #endif .poll = io_uring_poll, .fasync = io_uring_fasync, +#ifdef CONFIG_PROC_FS .show_fdinfo = io_uring_show_fdinfo, +#endif }; static int io_allocate_scq_urings(struct io_ring_ctx *ctx, diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index d181948c0390..3dccc23cf010 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1150,8 +1150,8 @@ static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh, /* For undo access buffer must have data copied */ if (undo && !jh->b_committed_data) goto out; - if (jh->b_transaction != handle->h_transaction && - jh->b_next_transaction != handle->h_transaction) + if (READ_ONCE(jh->b_transaction) != handle->h_transaction && + READ_ONCE(jh->b_next_transaction) != handle->h_transaction) goto out; /* * There are two reasons for the barrier here: @@ -2569,8 +2569,8 @@ bool __jbd2_journal_refile_buffer(struct journal_head *jh) * our jh reference and thus __jbd2_journal_file_buffer() must not * take a new one. */ - jh->b_transaction = jh->b_next_transaction; - jh->b_next_transaction = NULL; + WRITE_ONCE(jh->b_transaction, jh->b_next_transaction); + WRITE_ONCE(jh->b_next_transaction, NULL); if (buffer_freed(bh)) jlist = BJ_Forget; else if (jh->b_modified) diff --git a/fs/locks.c b/fs/locks.c index 44b6da032842..b8a31c1c4fff 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -725,7 +725,6 @@ static void __locks_delete_block(struct file_lock *waiter) { locks_delete_global_blocked(waiter); list_del_init(&waiter->fl_blocked_member); - waiter->fl_blocker = NULL; } static void __locks_wake_up_blocks(struct file_lock *blocker) @@ -740,6 +739,13 @@ static void __locks_wake_up_blocks(struct file_lock *blocker) waiter->fl_lmops->lm_notify(waiter); else wake_up(&waiter->fl_wait); + + /* + * The setting of fl_blocker to NULL marks the "done" + * point in deleting a block. Paired with acquire at the top + * of locks_delete_block(). + */ + smp_store_release(&waiter->fl_blocker, NULL); } } @@ -754,24 +760,41 @@ int locks_delete_block(struct file_lock *waiter) int status = -ENOENT; /* - * If fl_blocker is NULL, it won't be set again as this thread - * "owns" the lock and is the only one that might try to claim - * the lock. So it is safe to test fl_blocker locklessly. - * Also if fl_blocker is NULL, this waiter is not listed on - * fl_blocked_requests for some lock, so no other request can - * be added to the list of fl_blocked_requests for this - * request. So if fl_blocker is NULL, it is safe to - * locklessly check if fl_blocked_requests is empty. If both - * of these checks succeed, there is no need to take the lock. + * If fl_blocker is NULL, it won't be set again as this thread "owns" + * the lock and is the only one that might try to claim the lock. + * + * We use acquire/release to manage fl_blocker so that we can + * optimize away taking the blocked_lock_lock in many cases. + * + * The smp_load_acquire guarantees two things: + * + * 1/ that fl_blocked_requests can be tested locklessly. If something + * was recently added to that list it must have been in a locked region + * *before* the locked region when fl_blocker was set to NULL. + * + * 2/ that no other thread is accessing 'waiter', so it is safe to free + * it. __locks_wake_up_blocks is careful not to touch waiter after + * fl_blocker is released. + * + * If a lockless check of fl_blocker shows it to be NULL, we know that + * no new locks can be inserted into its fl_blocked_requests list, and + * can avoid doing anything further if the list is empty. */ - if (waiter->fl_blocker == NULL && + if (!smp_load_acquire(&waiter->fl_blocker) && list_empty(&waiter->fl_blocked_requests)) return status; + spin_lock(&blocked_lock_lock); if (waiter->fl_blocker) status = 0; __locks_wake_up_blocks(waiter); __locks_delete_block(waiter); + + /* + * The setting of fl_blocker to NULL marks the "done" point in deleting + * a block. Paired with acquire at the top of this function. + */ + smp_store_release(&waiter->fl_blocker, NULL); spin_unlock(&blocked_lock_lock); return status; } @@ -1364,7 +1387,8 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) error = posix_lock_inode(inode, fl, NULL); if (error != FILE_LOCK_DEFERRED) break; - error = wait_event_interruptible(fl->fl_wait, !fl->fl_blocker); + error = wait_event_interruptible(fl->fl_wait, + list_empty(&fl->fl_blocked_member)); if (error) break; } @@ -1449,7 +1473,8 @@ int locks_mandatory_area(struct inode *inode, struct file *filp, loff_t start, error = posix_lock_inode(inode, &fl, NULL); if (error != FILE_LOCK_DEFERRED) break; - error = wait_event_interruptible(fl.fl_wait, !fl.fl_blocker); + error = wait_event_interruptible(fl.fl_wait, + list_empty(&fl.fl_blocked_member)); if (!error) { /* * If we've been sleeping someone might have @@ -1652,7 +1677,8 @@ restart: locks_dispose_list(&dispose); error = wait_event_interruptible_timeout(new_fl->fl_wait, - !new_fl->fl_blocker, break_time); + list_empty(&new_fl->fl_blocked_member), + break_time); percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); @@ -2136,7 +2162,8 @@ static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl) error = flock_lock_inode(inode, fl); if (error != FILE_LOCK_DEFERRED) break; - error = wait_event_interruptible(fl->fl_wait, !fl->fl_blocker); + error = wait_event_interruptible(fl->fl_wait, + list_empty(&fl->fl_blocked_member)); if (error) break; } @@ -2413,7 +2440,8 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd, error = vfs_lock_file(filp, cmd, fl, NULL); if (error != FILE_LOCK_DEFERRED) break; - error = wait_event_interruptible(fl->fl_wait, !fl->fl_blocker); + error = wait_event_interruptible(fl->fl_wait, + list_empty(&fl->fl_blocked_member)); if (error) break; } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 989c30c98511..f1ff3076e4a4 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -153,6 +153,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL) goto error_0; + clp->cl_minorversion = cl_init->minorversion; clp->cl_nfs_mod = cl_init->nfs_mod; if (!try_module_get(clp->cl_nfs_mod->owner)) goto error_dealloc; diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index e1b938457ab9..e113fcb4bb4c 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -832,6 +832,8 @@ static int nfs_parse_source(struct fs_context *fc, if (len > maxnamlen) goto out_hostname; + kfree(ctx->nfs_server.hostname); + /* N.B. caller will free nfs_server.hostname in all cases */ ctx->nfs_server.hostname = kmemdup_nul(dev_name, len, GFP_KERNEL); if (!ctx->nfs_server.hostname) @@ -1240,6 +1242,13 @@ static int nfs_fs_context_validate(struct fs_context *fc) } ctx->nfs_mod = nfs_mod; } + + /* Ensure the filesystem context has the correct fs_type */ + if (fc->fs_type != ctx->nfs_mod->nfs_fs) { + module_put(fc->fs_type->owner); + __module_get(ctx->nfs_mod->nfs_fs->owner); + fc->fs_type = ctx->nfs_mod->nfs_fs; + } return 0; out_no_device_name: diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 52270bfac120..1abf126c2df4 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -31,6 +31,7 @@ static DEFINE_SPINLOCK(nfs_fscache_keys_lock); struct nfs_server_key { struct { uint16_t nfsversion; /* NFS protocol version */ + uint32_t minorversion; /* NFSv4 minor version */ uint16_t family; /* address family */ __be16 port; /* IP port */ } hdr; @@ -55,6 +56,7 @@ void nfs_fscache_get_client_cookie(struct nfs_client *clp) memset(&key, 0, sizeof(key)); key.hdr.nfsversion = clp->rpc_ops->version; + key.hdr.minorversion = clp->cl_minorversion; key.hdr.family = clp->cl_addr.ss_family; switch (clp->cl_addr.ss_family) { diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index ad6077404947..f3ece8ed3203 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -153,7 +153,7 @@ struct vfsmount *nfs_d_automount(struct path *path) /* Open a new filesystem context, transferring parameters from the * parent superblock, including the network namespace. */ - fc = fs_context_for_submount(&nfs_fs_type, path->dentry); + fc = fs_context_for_submount(path->mnt->mnt_sb->s_type, path->dentry); if (IS_ERR(fc)) return ERR_CAST(fc); diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 0cd767e5c977..0bd77cc1f639 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -216,7 +216,6 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) INIT_LIST_HEAD(&clp->cl_ds_clients); rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; - clp->cl_minorversion = cl_init->minorversion; clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; clp->cl_mig_gen = 1; #if IS_ENABLED(CONFIG_NFS_V4_1) diff --git a/fs/open.c b/fs/open.c index 0788b3715731..b69d6eed67e6 100644 --- a/fs/open.c +++ b/fs/open.c @@ -860,9 +860,6 @@ cleanup_file: * the return value of d_splice_alias(), then the caller needs to perform dput() * on it after finish_open(). * - * On successful return @file is a fully instantiated open file. After this, if - * an error occurs in ->atomic_open(), it needs to clean up with fput(). - * * Returns zero on success or -errno if the open failed. */ int finish_open(struct file *file, struct dentry *dentry, diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index 444e2da4f60e..714c14c47ca5 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -93,6 +93,7 @@ config OVERLAY_FS_XINO_AUTO bool "Overlayfs: auto enable inode number mapping" default n depends on OVERLAY_FS + depends on 64BIT help If this config option is enabled then overlay filesystems will use unused high bits in undelying filesystem inode numbers to map all diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index a5317216de73..87c362f65448 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -244,6 +244,9 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) if (iocb->ki_flags & IOCB_WRITE) { struct inode *inode = file_inode(orig_iocb->ki_filp); + /* Actually acquired in ovl_write_iter() */ + __sb_writers_acquired(file_inode(iocb->ki_filp)->i_sb, + SB_FREEZE_WRITE); file_end_write(iocb->ki_filp); ovl_copyattr(ovl_inode_real(inode), inode); } @@ -346,6 +349,9 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) goto out; file_start_write(real.file); + /* Pacify lockdep, same trick as done in aio_write() */ + __sb_writers_release(file_inode(real.file)->i_sb, + SB_FREEZE_WRITE); aio_req->fd = real; real.flags = 0; aio_req->orig_iocb = iocb; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 3623d28aa4fa..3d3f2b8bdae5 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -318,7 +318,12 @@ static inline unsigned int ovl_xino_bits(struct super_block *sb) return ovl_same_dev(sb) ? OVL_FS(sb)->xino_mode : 0; } -static inline int ovl_inode_lock(struct inode *inode) +static inline void ovl_inode_lock(struct inode *inode) +{ + mutex_lock(&OVL_I(inode)->lock); +} + +static inline int ovl_inode_lock_interruptible(struct inode *inode) { return mutex_lock_interruptible(&OVL_I(inode)->lock); } diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 319fe0d355b0..ac967f1cb6e5 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1411,6 +1411,8 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, if (ofs->config.xino == OVL_XINO_ON) pr_info("\"xino=on\" is useless with all layers on same fs, ignore.\n"); ofs->xino_mode = 0; + } else if (ofs->config.xino == OVL_XINO_OFF) { + ofs->xino_mode = -1; } else if (ofs->config.xino == OVL_XINO_ON && ofs->xino_mode < 0) { /* * This is a roundup of number of bits needed for encoding @@ -1623,8 +1625,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_stack_depth = 0; sb->s_maxbytes = MAX_LFS_FILESIZE; /* Assume underlaying fs uses 32bit inodes unless proven otherwise */ - if (ofs->config.xino != OVL_XINO_OFF) + if (ofs->config.xino != OVL_XINO_OFF) { ofs->xino_mode = BITS_PER_LONG - 32; + if (!ofs->xino_mode) { + pr_warn("xino not supported on 32bit kernel, falling back to xino=off.\n"); + ofs->config.xino = OVL_XINO_OFF; + } + } /* alloc/destroy_inode needed for setting up traps in inode cache */ sb->s_op = &ovl_super_operations; diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index ea005085803f..042f7eb4f7f4 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -509,7 +509,7 @@ int ovl_copy_up_start(struct dentry *dentry, int flags) struct inode *inode = d_inode(dentry); int err; - err = ovl_inode_lock(inode); + err = ovl_inode_lock_interruptible(inode); if (!err && ovl_already_copied_up_locked(dentry, flags)) { err = 1; /* Already copied up */ ovl_inode_unlock(inode); @@ -764,7 +764,7 @@ int ovl_nlink_start(struct dentry *dentry) return err; } - err = ovl_inode_lock(inode); + err = ovl_inode_lock_interruptible(inode); if (err) return err; diff --git a/fs/zonefs/Kconfig b/fs/zonefs/Kconfig index fb87ad372e29..ef2697b78820 100644 --- a/fs/zonefs/Kconfig +++ b/fs/zonefs/Kconfig @@ -2,6 +2,7 @@ config ZONEFS_FS tristate "zonefs filesystem support" depends on BLOCK depends on BLK_DEV_ZONED + select FS_IOMAP help zonefs is a simple file system which exposes zones of a zoned block device (e.g. host-managed or host-aware SMR disk drives) as files. diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 8bc6ef82d693..69aee3dfb660 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -601,13 +601,13 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) ssize_t ret; /* - * For async direct IOs to sequential zone files, ignore IOCB_NOWAIT + * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT * as this can cause write reordering (e.g. the first aio gets EAGAIN * on the inode lock but the second goes through but is now unaligned). */ - if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !is_sync_kiocb(iocb) - && (iocb->ki_flags & IOCB_NOWAIT)) - iocb->ki_flags &= ~IOCB_NOWAIT; + if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !is_sync_kiocb(iocb) && + (iocb->ki_flags & IOCB_NOWAIT)) + return -EOPNOTSUPP; if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock(inode)) |