summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/afs/addr_list.c2
-rw-r--r--fs/afs/internal.h2
-rw-r--r--fs/btrfs/block-group.c4
-rw-r--r--fs/btrfs/inode.c8
-rw-r--r--fs/cifs/cifs_dfs_ref.c2
-rw-r--r--fs/cifs/cifsfs.c2
-rw-r--r--fs/cifs/cifsglob.h7
-rw-r--r--fs/cifs/cifsproto.h5
-rw-r--r--fs/cifs/cifssmb.c3
-rw-r--r--fs/cifs/dir.c1
-rw-r--r--fs/cifs/file.c22
-rw-r--r--fs/cifs/inode.c18
-rw-r--r--fs/cifs/smb1ops.c2
-rw-r--r--fs/cifs/smb2inode.c4
-rw-r--r--fs/cifs/smb2ops.c7
-rw-r--r--fs/cifs/smb2pdu.c1
-rw-r--r--fs/crypto/keysetup.c9
-rw-r--r--fs/debugfs/file.c17
-rw-r--r--fs/eventpoll.c8
-rw-r--r--fs/ext4/super.c6
-rw-r--r--fs/fat/inode.c19
-rw-r--r--fs/fcntl.c6
-rw-r--r--fs/file.c7
-rw-r--r--fs/fuse/dev.c6
-rw-r--r--fs/fuse/fuse_i.h2
-rw-r--r--fs/gfs2/inode.c2
-rw-r--r--fs/inode.c1
-rw-r--r--fs/io-wq.c77
-rw-r--r--fs/io-wq.h16
-rw-r--r--fs/io_uring.c172
-rw-r--r--fs/jbd2/transaction.c8
-rw-r--r--fs/locks.c60
-rw-r--r--fs/nfs/client.c1
-rw-r--r--fs/nfs/fs_context.c9
-rw-r--r--fs/nfs/fscache.c2
-rw-r--r--fs/nfs/namespace.c2
-rw-r--r--fs/nfs/nfs4client.c1
-rw-r--r--fs/open.c3
-rw-r--r--fs/overlayfs/Kconfig1
-rw-r--r--fs/overlayfs/file.c6
-rw-r--r--fs/overlayfs/overlayfs.h7
-rw-r--r--fs/overlayfs/super.c9
-rw-r--r--fs/overlayfs/util.c4
-rw-r--r--fs/zonefs/Kconfig1
-rw-r--r--fs/zonefs/super.c8
45 files changed, 319 insertions, 241 deletions
diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c
index df415c05939e..de1ae0bead3b 100644
--- a/fs/afs/addr_list.c
+++ b/fs/afs/addr_list.c
@@ -19,7 +19,7 @@
void afs_put_addrlist(struct afs_addr_list *alist)
{
if (alist && refcount_dec_and_test(&alist->usage))
- call_rcu(&alist->rcu, (rcu_callback_t)kfree);
+ kfree_rcu(alist, rcu);
}
/*
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 1d81fc4c3058..35f951ac296f 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -81,7 +81,7 @@ enum afs_call_state {
* List of server addresses.
*/
struct afs_addr_list {
- struct rcu_head rcu; /* Must be first */
+ struct rcu_head rcu;
refcount_t usage;
u32 version; /* Version */
unsigned char max_addrs;
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 404e050ce8ee..7f09147872dc 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -856,9 +856,9 @@ static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
found_raid1c34 = true;
up_read(&sinfo->groups_sem);
}
- if (found_raid56)
+ if (!found_raid56)
btrfs_clear_fs_incompat(fs_info, RAID56);
- if (found_raid1c34)
+ if (!found_raid1c34)
btrfs_clear_fs_incompat(fs_info, RAID1C34);
}
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1ccb3f8d528d..d267eb5caa7b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7783,6 +7783,7 @@ static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode,
{
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
+ u16 csum_size;
blk_status_t ret;
/*
@@ -7802,7 +7803,8 @@ static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode,
file_offset -= dip->logical_offset;
file_offset >>= inode->i_sb->s_blocksize_bits;
- io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset);
+ csum_size = btrfs_super_csum_size(btrfs_sb(inode->i_sb)->super_copy);
+ io_bio->csum = orig_io_bio->csum + csum_size * file_offset;
return 0;
}
@@ -9494,6 +9496,10 @@ out_fail:
ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx);
if (ret)
commit_transaction = true;
+ } else if (sync_log) {
+ mutex_lock(&root->log_mutex);
+ list_del(&ctx.list);
+ mutex_unlock(&root->log_mutex);
}
if (commit_transaction) {
ret = btrfs_commit_transaction(trans);
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 606f26d862dc..cc3ada12848d 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -324,6 +324,8 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
if (full_path == NULL)
goto cdda_exit;
+ convert_delimiter(full_path, '\\');
+
cifs_dbg(FYI, "%s: full_path: %s\n", __func__, full_path);
if (!cifs_sb_master_tlink(cifs_sb)) {
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 46ebaf3f0824..fa77fe5258b0 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -530,6 +530,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
if (tcon->seal)
seq_puts(s, ",seal");
+ else if (tcon->ses->server->ignore_signature)
+ seq_puts(s, ",signloosely");
if (tcon->nocase)
seq_puts(s, ",nocase");
if (tcon->local_lease)
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index de82cfa44b1a..0d956360e984 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1281,6 +1281,7 @@ struct cifs_fid {
__u64 volatile_fid; /* volatile file id for smb2 */
__u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for smb2 */
__u8 create_guid[16];
+ __u32 access;
struct cifs_pending_open *pending_open;
unsigned int epoch;
#ifdef CONFIG_CIFS_DEBUG2
@@ -1741,6 +1742,12 @@ static inline bool is_retryable_error(int error)
return false;
}
+
+/* cifs_get_writable_file() flags */
+#define FIND_WR_ANY 0
+#define FIND_WR_FSUID_ONLY 1
+#define FIND_WR_WITH_DELETE 2
+
#define MID_FREE 0
#define MID_REQUEST_ALLOCATED 1
#define MID_REQUEST_SUBMITTED 2
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 89eaaf46d1ca..e5cb681ec138 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -134,11 +134,12 @@ extern bool backup_cred(struct cifs_sb_info *);
extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
unsigned int bytes_written);
-extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
+extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, int);
extern int cifs_get_writable_file(struct cifsInodeInfo *cifs_inode,
- bool fsuid_only,
+ int flags,
struct cifsFileInfo **ret_file);
extern int cifs_get_writable_path(struct cifs_tcon *tcon, const char *name,
+ int flags,
struct cifsFileInfo **ret_file);
extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
extern int cifs_get_readable_path(struct cifs_tcon *tcon, const char *name,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 3c89569e7210..6f6fb3606a5d 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1492,6 +1492,7 @@ openRetry:
*oplock = rsp->OplockLevel;
/* cifs fid stays in le */
oparms->fid->netfid = rsp->Fid;
+ oparms->fid->access = desired_access;
/* Let caller know file was created so we can set the mode. */
/* Do we care about the CreateAction in any other cases? */
@@ -2115,7 +2116,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
wdata2->tailsz = tailsz;
wdata2->bytes = cur_len;
- rc = cifs_get_writable_file(CIFS_I(inode), false,
+ rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY,
&wdata2->cfile);
if (!wdata2->cfile) {
cifs_dbg(VFS, "No writable handle to retry writepages rc=%d\n",
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 0ef099442f20..36e7b2fd2190 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -555,7 +555,6 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
if (server->ops->close)
server->ops->close(xid, tcon, &fid);
cifs_del_pending_open(&open);
- fput(file);
rc = -ENOMEM;
}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index bc9516ab4b34..8f9d849a0012 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1169,7 +1169,8 @@ try_again:
rc = posix_lock_file(file, flock, NULL);
up_write(&cinode->lock_sem);
if (rc == FILE_LOCK_DEFERRED) {
- rc = wait_event_interruptible(flock->fl_wait, !flock->fl_blocker);
+ rc = wait_event_interruptible(flock->fl_wait,
+ list_empty(&flock->fl_blocked_member));
if (!rc)
goto try_again;
locks_delete_block(flock);
@@ -1958,7 +1959,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
/* Return -EBADF if no handle is found and general rc otherwise */
int
-cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only,
+cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, int flags,
struct cifsFileInfo **ret_file)
{
struct cifsFileInfo *open_file, *inv_file = NULL;
@@ -1966,7 +1967,8 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only,
bool any_available = false;
int rc = -EBADF;
unsigned int refind = 0;
-
+ bool fsuid_only = flags & FIND_WR_FSUID_ONLY;
+ bool with_delete = flags & FIND_WR_WITH_DELETE;
*ret_file = NULL;
/*
@@ -1998,6 +2000,8 @@ refind_writable:
continue;
if (fsuid_only && !uid_eq(open_file->uid, current_fsuid()))
continue;
+ if (with_delete && !(open_file->fid.access & DELETE))
+ continue;
if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) {
if (!open_file->invalidHandle) {
/* found a good writable file */
@@ -2045,12 +2049,12 @@ refind_writable:
}
struct cifsFileInfo *
-find_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only)
+find_writable_file(struct cifsInodeInfo *cifs_inode, int flags)
{
struct cifsFileInfo *cfile;
int rc;
- rc = cifs_get_writable_file(cifs_inode, fsuid_only, &cfile);
+ rc = cifs_get_writable_file(cifs_inode, flags, &cfile);
if (rc)
cifs_dbg(FYI, "couldn't find writable handle rc=%d", rc);
@@ -2059,6 +2063,7 @@ find_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only)
int
cifs_get_writable_path(struct cifs_tcon *tcon, const char *name,
+ int flags,
struct cifsFileInfo **ret_file)
{
struct list_head *tmp;
@@ -2085,7 +2090,7 @@ cifs_get_writable_path(struct cifs_tcon *tcon, const char *name,
kfree(full_path);
cinode = CIFS_I(d_inode(cfile->dentry));
spin_unlock(&tcon->open_file_lock);
- return cifs_get_writable_file(cinode, 0, ret_file);
+ return cifs_get_writable_file(cinode, flags, ret_file);
}
spin_unlock(&tcon->open_file_lock);
@@ -2162,7 +2167,8 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
if (mapping->host->i_size - offset < (loff_t)to)
to = (unsigned)(mapping->host->i_size - offset);
- rc = cifs_get_writable_file(CIFS_I(mapping->host), false, &open_file);
+ rc = cifs_get_writable_file(CIFS_I(mapping->host), FIND_WR_ANY,
+ &open_file);
if (!rc) {
bytes_written = cifs_write(open_file, open_file->pid,
write_data, to - from, &offset);
@@ -2355,7 +2361,7 @@ retry:
if (cfile)
cifsFileInfo_put(cfile);
- rc = cifs_get_writable_file(CIFS_I(inode), false, &cfile);
+ rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY, &cfile);
/* in case of an error store it to return later */
if (rc)
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index b5e6635c578e..b16f8d23e97b 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -653,8 +653,8 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
*/
if ((fattr->cf_nlink < 1) && !tcon->unix_ext &&
!info->DeletePending) {
- cifs_dbg(1, "bogus file nlink value %u\n",
- fattr->cf_nlink);
+ cifs_dbg(VFS, "bogus file nlink value %u\n",
+ fattr->cf_nlink);
fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK;
}
}
@@ -2073,6 +2073,7 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry)
struct inode *inode = d_inode(dentry);
struct super_block *sb = dentry->d_sb;
char *full_path = NULL;
+ int count = 0;
if (inode == NULL)
return -ENOENT;
@@ -2094,15 +2095,18 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry)
full_path, inode, inode->i_count.counter,
dentry, cifs_get_time(dentry), jiffies);
+again:
if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext)
rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
else
rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
xid, NULL);
-
+ if (rc == -EAGAIN && count++ < 10)
+ goto again;
out:
kfree(full_path);
free_xid(xid);
+
return rc;
}
@@ -2187,7 +2191,7 @@ int cifs_getattr(const struct path *path, struct kstat *stat,
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID))
stat->gid = current_fsgid();
}
- return rc;
+ return 0;
}
int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start,
@@ -2278,7 +2282,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
* writebehind data than the SMB timeout for the SetPathInfo
* request would allow
*/
- open_file = find_writable_file(cifsInode, true);
+ open_file = find_writable_file(cifsInode, FIND_WR_FSUID_ONLY);
if (open_file) {
tcon = tlink_tcon(open_file->tlink);
server = tcon->ses->server;
@@ -2428,7 +2432,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
args->ctime = NO_CHANGE_64;
args->device = 0;
- open_file = find_writable_file(cifsInode, true);
+ open_file = find_writable_file(cifsInode, FIND_WR_FSUID_ONLY);
if (open_file) {
u16 nfid = open_file->fid.netfid;
u32 npid = open_file->pid;
@@ -2531,7 +2535,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
rc = 0;
if (attrs->ia_valid & ATTR_MTIME) {
- rc = cifs_get_writable_file(cifsInode, false, &wfile);
+ rc = cifs_get_writable_file(cifsInode, FIND_WR_ANY, &wfile);
if (!rc) {
tcon = tlink_tcon(wfile->tlink);
rc = tcon->ses->server->ops->flush(xid, tcon, &wfile->fid);
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index eb994e313c6a..b130efaf8feb 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -766,7 +766,7 @@ smb_set_file_info(struct inode *inode, const char *full_path,
struct cifs_tcon *tcon;
/* if the file is already open for write, just use that fileid */
- open_file = find_writable_file(cinode, true);
+ open_file = find_writable_file(cinode, FIND_WR_FSUID_ONLY);
if (open_file) {
fid.netfid = open_file->fid.netfid;
netpid = open_file->pid;
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 1cf207564ff9..a8c301ae00ed 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -521,7 +521,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name,
cifs_i = CIFS_I(inode);
dosattrs = cifs_i->cifsAttrs | ATTR_READONLY;
data.Attributes = cpu_to_le32(dosattrs);
- cifs_get_writable_path(tcon, name, &cfile);
+ cifs_get_writable_path(tcon, name, FIND_WR_ANY, &cfile);
tmprc = smb2_compound_op(xid, tcon, cifs_sb, name,
FILE_WRITE_ATTRIBUTES, FILE_CREATE,
CREATE_NOT_FILE, ACL_NO_MODE,
@@ -577,7 +577,7 @@ smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon,
{
struct cifsFileInfo *cfile;
- cifs_get_writable_path(tcon, from_name, &cfile);
+ cifs_get_writable_path(tcon, from_name, FIND_WR_WITH_DELETE, &cfile);
return smb2_set_path_attr(xid, tcon, from_name, to_name,
cifs_sb, DELETE, SMB2_OP_RENAME, cfile);
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index e47190cae163..cfe9b800ea8c 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -1364,6 +1364,7 @@ smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock)
cfile->fid.persistent_fid = fid->persistent_fid;
cfile->fid.volatile_fid = fid->volatile_fid;
+ cfile->fid.access = fid->access;
#ifdef CONFIG_CIFS_DEBUG2
cfile->fid.mid = fid->mid;
#endif /* CIFS_DEBUG2 */
@@ -2221,6 +2222,8 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
goto qdf_free;
}
+ atomic_inc(&tcon->num_remote_opens);
+
qd_rsp = (struct smb2_query_directory_rsp *)rsp_iov[1].iov_base;
if (qd_rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) {
trace_smb3_query_dir_done(xid, fid->persistent_fid,
@@ -3327,7 +3330,7 @@ static loff_t smb3_llseek(struct file *file, struct cifs_tcon *tcon, loff_t offs
* some servers (Windows2016) will not reflect recent writes in
* QUERY_ALLOCATED_RANGES until SMB2_flush is called.
*/
- wrcfile = find_writable_file(cifsi, false);
+ wrcfile = find_writable_file(cifsi, FIND_WR_ANY);
if (wrcfile) {
filemap_write_and_wait(inode->i_mapping);
smb2_flush_file(xid, tcon, &wrcfile->fid);
@@ -3416,7 +3419,7 @@ static int smb3_fiemap(struct cifs_tcon *tcon,
if (rc)
goto out;
- if (out_data_len < sizeof(struct file_allocated_range_buffer)) {
+ if (out_data_len && out_data_len < sizeof(struct file_allocated_range_buffer)) {
rc = -EINVAL;
goto out;
}
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 1234f9ccab03..28c0be5e69b7 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -2771,6 +2771,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
atomic_inc(&tcon->num_remote_opens);
oparms->fid->persistent_fid = rsp->PersistentFileId;
oparms->fid->volatile_fid = rsp->VolatileFileId;
+ oparms->fid->access = oparms->desired_access;
#ifdef CONFIG_CIFS_DEBUG2
oparms->fid->mid = le64_to_cpu(rsp->sync_hdr.MessageId);
#endif /* CIFS_DEBUG2 */
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index 65cb09fa6ead..08c9f216a54d 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -539,6 +539,15 @@ int fscrypt_drop_inode(struct inode *inode)
mk = ci->ci_master_key->payload.data[0];
/*
+ * With proper, non-racy use of FS_IOC_REMOVE_ENCRYPTION_KEY, all inodes
+ * protected by the key were cleaned by sync_filesystem(). But if
+ * userspace is still using the files, inodes can be dirtied between
+ * then and now. We mustn't lose any writes, so skip dirty inodes here.
+ */
+ if (inode->i_state & I_DIRTY_ALL)
+ return 0;
+
+ /*
* Note: since we aren't holding ->mk_secret_sem, the result here can
* immediately become outdated. But there's no correctness problem with
* unnecessarily evicting. Nor is there a correctness problem with not
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 634b09d18b77..db987b5110a9 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -1090,21 +1090,12 @@ static const struct file_operations fops_regset32 = {
* This function creates a file in debugfs with the given name that reports
* the names and values of a set of 32-bit registers. If the @mode variable
* is so set it can be read from. Writing is not supported.
- *
- * This function will return a pointer to a dentry if it succeeds. This
- * pointer must be passed to the debugfs_remove() function when the file is
- * to be removed (no automatic cleanup happens if your module is unloaded,
- * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be
- * returned.
- *
- * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will
- * be returned.
*/
-struct dentry *debugfs_create_regset32(const char *name, umode_t mode,
- struct dentry *parent,
- struct debugfs_regset32 *regset)
+void debugfs_create_regset32(const char *name, umode_t mode,
+ struct dentry *parent,
+ struct debugfs_regset32 *regset)
{
- return debugfs_create_file(name, mode, parent, regset, &fops_regset32);
+ debugfs_create_file(name, mode, parent, regset, &fops_regset32);
}
EXPORT_SYMBOL_GPL(debugfs_create_regset32);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index b041b66002db..eee3c92a9ebf 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1854,9 +1854,9 @@ fetch_events:
waiter = true;
init_waitqueue_entry(&wait, current);
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
__add_wait_queue_exclusive(&ep->wq, &wait);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
}
for (;;) {
@@ -1904,9 +1904,9 @@ send_events:
goto fetch_events;
if (waiter) {
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
__remove_wait_queue(&ep->wq, &wait);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
}
return res;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ff1b764b0c0e..0c7c4adb664e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2391,7 +2391,7 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct flex_groups **old_groups, **new_groups;
- int size, i;
+ int size, i, j;
if (!sbi->s_log_groups_per_flex)
return 0;
@@ -2412,8 +2412,8 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
sizeof(struct flex_groups)),
GFP_KERNEL);
if (!new_groups[i]) {
- for (i--; i >= sbi->s_flex_groups_allocated; i--)
- kvfree(new_groups[i]);
+ for (j = sbi->s_flex_groups_allocated; j < i; j++)
+ kvfree(new_groups[j]);
kvfree(new_groups);
ext4_msg(sb, KERN_ERR,
"not enough memory for %d flex groups", size);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 594b05ae16c9..71946da84388 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -750,6 +750,13 @@ static struct inode *fat_alloc_inode(struct super_block *sb)
return NULL;
init_rwsem(&ei->truncate_lock);
+ /* Zeroing to allow iput() even if partial initialized inode. */
+ ei->mmu_private = 0;
+ ei->i_start = 0;
+ ei->i_logstart = 0;
+ ei->i_attrs = 0;
+ ei->i_pos = 0;
+
return &ei->vfs_inode;
}
@@ -1374,16 +1381,6 @@ out:
return 0;
}
-static void fat_dummy_inode_init(struct inode *inode)
-{
- /* Initialize this dummy inode to work as no-op. */
- MSDOS_I(inode)->mmu_private = 0;
- MSDOS_I(inode)->i_start = 0;
- MSDOS_I(inode)->i_logstart = 0;
- MSDOS_I(inode)->i_attrs = 0;
- MSDOS_I(inode)->i_pos = 0;
-}
-
static int fat_read_root(struct inode *inode)
{
struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
@@ -1844,13 +1841,11 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
fat_inode = new_inode(sb);
if (!fat_inode)
goto out_fail;
- fat_dummy_inode_init(fat_inode);
sbi->fat_inode = fat_inode;
fsinfo_inode = new_inode(sb);
if (!fsinfo_inode)
goto out_fail;
- fat_dummy_inode_init(fsinfo_inode);
fsinfo_inode->i_ino = MSDOS_FSINFO_INO;
sbi->fsinfo_inode = fsinfo_inode;
insert_inode_hash(fsinfo_inode);
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 9bc167562ee8..2e4c0fa2074b 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -735,8 +735,9 @@ static void send_sigio_to_task(struct task_struct *p,
return;
switch (signum) {
- kernel_siginfo_t si;
- default:
+ default: {
+ kernel_siginfo_t si;
+
/* Queue a rt signal with the appropriate fd as its
value. We use SI_SIGIO as the source, not
SI_KERNEL, since kernel signals always get
@@ -769,6 +770,7 @@ static void send_sigio_to_task(struct task_struct *p,
si.si_fd = fd;
if (!do_send_sig_info(signum, &si, p, type))
break;
+ }
/* fall-through - fall back on the old plain SIGIO signal */
case 0:
do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, type);
diff --git a/fs/file.c b/fs/file.c
index a364e1a9b7e8..c8a4e4c86e55 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -540,9 +540,14 @@ static int alloc_fd(unsigned start, unsigned flags)
return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags);
}
+int __get_unused_fd_flags(unsigned flags, unsigned long nofile)
+{
+ return __alloc_fd(current->files, 0, nofile, flags);
+}
+
int get_unused_fd_flags(unsigned flags)
{
- return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags);
+ return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE));
}
EXPORT_SYMBOL(get_unused_fd_flags);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 8e02d76fe104..97eec7522bf2 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -276,12 +276,10 @@ static void flush_bg_queue(struct fuse_conn *fc)
void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req)
{
struct fuse_iqueue *fiq = &fc->iq;
- bool async;
if (test_and_set_bit(FR_FINISHED, &req->flags))
goto put_request;
- async = req->args->end;
/*
* test_and_set_bit() implies smp_mb() between bit
* changing and below intr_entry check. Pairs with
@@ -324,7 +322,7 @@ void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req)
wake_up(&req->waitq);
}
- if (async)
+ if (test_bit(FR_ASYNC, &req->flags))
req->args->end(fc, req->args, req->out.h.error);
put_request:
fuse_put_request(fc, req);
@@ -471,6 +469,8 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args)
req->in.h.opcode = args->opcode;
req->in.h.nodeid = args->nodeid;
req->args = args;
+ if (args->end)
+ __set_bit(FR_ASYNC, &req->flags);
}
ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index aa75e2305b75..ca344bf71404 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -301,6 +301,7 @@ struct fuse_io_priv {
* FR_SENT: request is in userspace, waiting for an answer
* FR_FINISHED: request is finished
* FR_PRIVATE: request is on private list
+ * FR_ASYNC: request is asynchronous
*/
enum fuse_req_flag {
FR_ISREPLY,
@@ -314,6 +315,7 @@ enum fuse_req_flag {
FR_SENT,
FR_FINISHED,
FR_PRIVATE,
+ FR_ASYNC,
};
/**
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 2716d56ed0a0..8294851a9dd9 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1248,7 +1248,7 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
if (!(file->f_mode & FMODE_OPENED))
return finish_no_open(file, d);
dput(d);
- return 0;
+ return excl && (flags & O_CREAT) ? -EEXIST : 0;
}
BUG_ON(d != NULL);
diff --git a/fs/inode.c b/fs/inode.c
index 7d57068b6b7a..93d9252a00ab 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -138,6 +138,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_sb = sb;
inode->i_blkbits = sb->s_blocksize_bits;
inode->i_flags = 0;
+ atomic64_set(&inode->i_sequence, 0);
atomic_set(&inode->i_count, 1);
inode->i_op = &empty_iops;
inode->i_fop = &no_open_fops;
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 0a5ab1a8f69a..5cef075c0b37 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -502,7 +502,7 @@ next:
if (worker->mm)
work->flags |= IO_WQ_WORK_HAS_MM;
- if (wq->get_work && !(work->flags & IO_WQ_WORK_INTERNAL)) {
+ if (wq->get_work) {
put_work = work;
wq->get_work(work);
}
@@ -535,42 +535,23 @@ next:
} while (1);
}
-static inline void io_worker_spin_for_work(struct io_wqe *wqe)
-{
- int i = 0;
-
- while (++i < 1000) {
- if (io_wqe_run_queue(wqe))
- break;
- if (need_resched())
- break;
- cpu_relax();
- }
-}
-
static int io_wqe_worker(void *data)
{
struct io_worker *worker = data;
struct io_wqe *wqe = worker->wqe;
struct io_wq *wq = wqe->wq;
- bool did_work;
io_worker_start(wqe, worker);
- did_work = false;
while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
set_current_state(TASK_INTERRUPTIBLE);
loop:
- if (did_work)
- io_worker_spin_for_work(wqe);
spin_lock_irq(&wqe->lock);
if (io_wqe_run_queue(wqe)) {
__set_current_state(TASK_RUNNING);
io_worker_handle_work(worker);
- did_work = true;
goto loop;
}
- did_work = false;
/* drops the lock on success, retry */
if (__io_worker_idle(wqe, worker)) {
__release(&wqe->lock);
@@ -766,6 +747,17 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
return true;
}
+static void io_run_cancel(struct io_wq_work *work)
+{
+ do {
+ struct io_wq_work *old_work = work;
+
+ work->flags |= IO_WQ_WORK_CANCEL;
+ work->func(&work);
+ work = (work == old_work) ? NULL : work;
+ } while (work);
+}
+
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
@@ -779,8 +771,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
* It's close enough to not be an issue, fork() has the same delay.
*/
if (unlikely(!io_wq_can_queue(wqe, acct, work))) {
- work->flags |= IO_WQ_WORK_CANCEL;
- work->func(&work);
+ io_run_cancel(work);
return;
}
@@ -919,8 +910,7 @@ static enum io_wq_cancel io_wqe_cancel_cb_work(struct io_wqe *wqe,
spin_unlock_irqrestore(&wqe->lock, flags);
if (found) {
- work->flags |= IO_WQ_WORK_CANCEL;
- work->func(&work);
+ io_run_cancel(work);
return IO_WQ_CANCEL_OK;
}
@@ -995,8 +985,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe,
spin_unlock_irqrestore(&wqe->lock, flags);
if (found) {
- work->flags |= IO_WQ_WORK_CANCEL;
- work->func(&work);
+ io_run_cancel(work);
return IO_WQ_CANCEL_OK;
}
@@ -1068,42 +1057,6 @@ enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid)
return ret;
}
-struct io_wq_flush_data {
- struct io_wq_work work;
- struct completion done;
-};
-
-static void io_wq_flush_func(struct io_wq_work **workptr)
-{
- struct io_wq_work *work = *workptr;
- struct io_wq_flush_data *data;
-
- data = container_of(work, struct io_wq_flush_data, work);
- complete(&data->done);
-}
-
-/*
- * Doesn't wait for previously queued work to finish. When this completes,
- * it just means that previously queued work was started.
- */
-void io_wq_flush(struct io_wq *wq)
-{
- struct io_wq_flush_data data;
- int node;
-
- for_each_node(node) {
- struct io_wqe *wqe = wq->wqes[node];
-
- if (!node_online(node))
- continue;
- init_completion(&data.done);
- INIT_IO_WORK(&data.work, io_wq_flush_func);
- data.work.flags |= IO_WQ_WORK_INTERNAL;
- io_wqe_enqueue(wqe, &data.work);
- wait_for_completion(&data.done);
- }
-}
-
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
{
int ret = -ENOMEM, node;
diff --git a/fs/io-wq.h b/fs/io-wq.h
index ccc7d84af57d..e5e15f2c93ec 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -8,7 +8,6 @@ enum {
IO_WQ_WORK_HAS_MM = 2,
IO_WQ_WORK_HASHED = 4,
IO_WQ_WORK_UNBOUND = 32,
- IO_WQ_WORK_INTERNAL = 64,
IO_WQ_WORK_CB = 128,
IO_WQ_WORK_NO_CANCEL = 256,
IO_WQ_WORK_CONCURRENT = 512,
@@ -79,16 +78,10 @@ struct io_wq_work {
pid_t task_pid;
};
-#define INIT_IO_WORK(work, _func) \
- do { \
- (work)->list.next = NULL; \
- (work)->func = _func; \
- (work)->files = NULL; \
- (work)->mm = NULL; \
- (work)->creds = NULL; \
- (work)->fs = NULL; \
- (work)->flags = 0; \
- } while (0) \
+#define INIT_IO_WORK(work, _func) \
+ do { \
+ *(work) = (struct io_wq_work){ .func = _func }; \
+ } while (0) \
typedef void (get_work_fn)(struct io_wq_work *);
typedef void (put_work_fn)(struct io_wq_work *);
@@ -106,7 +99,6 @@ void io_wq_destroy(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val);
-void io_wq_flush(struct io_wq *wq);
void io_wq_cancel_all(struct io_wq *wq);
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index de650df9ac53..3affd96a98ba 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -183,17 +183,12 @@ struct fixed_file_table {
struct file **files;
};
-enum {
- FFD_F_ATOMIC,
-};
-
struct fixed_file_data {
struct fixed_file_table *table;
struct io_ring_ctx *ctx;
struct percpu_ref refs;
struct llist_head put_llist;
- unsigned long state;
struct work_struct ref_work;
struct completion done;
};
@@ -348,6 +343,7 @@ struct io_accept {
struct sockaddr __user *addr;
int __user *addr_len;
int flags;
+ unsigned long nofile;
};
struct io_sync {
@@ -402,6 +398,7 @@ struct io_open {
struct filename *filename;
struct statx __user *buffer;
struct open_how how;
+ unsigned long nofile;
};
struct io_files_update {
@@ -1004,6 +1001,7 @@ static void io_kill_timeout(struct io_kiocb *req)
if (ret != -1) {
atomic_inc(&req->ctx->cq_timeouts);
list_del_init(&req->list);
+ req->flags |= REQ_F_COMP_LOCKED;
io_cqring_fill_event(req, 0);
io_put_req(req);
}
@@ -1483,10 +1481,10 @@ static void io_free_req(struct io_kiocb *req)
__attribute__((nonnull))
static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
{
- io_req_find_next(req, nxtptr);
-
- if (refcount_dec_and_test(&req->refs))
+ if (refcount_dec_and_test(&req->refs)) {
+ io_req_find_next(req, nxtptr);
__io_free_req(req);
+ }
}
static void io_put_req(struct io_kiocb *req)
@@ -1821,6 +1819,10 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
list_add(&req->list, &ctx->poll_list);
else
list_add_tail(&req->list, &ctx->poll_list);
+
+ if ((ctx->flags & IORING_SETUP_SQPOLL) &&
+ wq_has_sleeper(&ctx->sqo_wait))
+ wake_up(&ctx->sqo_wait);
}
static void io_file_put(struct io_submit_state *state)
@@ -2071,7 +2073,7 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
ssize_t ret;
ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
*iovec = NULL;
- return ret;
+ return ret < 0 ? ret : sqe_len;
}
if (req->io) {
@@ -2577,6 +2579,7 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return ret;
}
+ req->open.nofile = rlimit(RLIMIT_NOFILE);
req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
@@ -2618,6 +2621,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return ret;
}
+ req->open.nofile = rlimit(RLIMIT_NOFILE);
req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
@@ -2636,7 +2640,7 @@ static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt,
if (ret)
goto err;
- ret = get_unused_fd_flags(req->open.how.flags);
+ ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
if (ret < 0)
goto err;
@@ -3002,6 +3006,11 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
+#ifdef CONFIG_COMPAT
+ if (req->ctx->compat)
+ sr->msg_flags |= MSG_CMSG_COMPAT;
+#endif
+
if (!io || req->opcode == IORING_OP_SEND)
return 0;
/* iovec is already imported */
@@ -3154,6 +3163,11 @@ static int io_recvmsg_prep(struct io_kiocb *req,
sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
+#ifdef CONFIG_COMPAT
+ if (req->ctx->compat)
+ sr->msg_flags |= MSG_CMSG_COMPAT;
+#endif
+
if (!io || req->opcode == IORING_OP_RECV)
return 0;
/* iovec is already imported */
@@ -3311,6 +3325,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
accept->flags = READ_ONCE(sqe->accept_flags);
+ accept->nofile = rlimit(RLIMIT_NOFILE);
return 0;
#else
return -EOPNOTSUPP;
@@ -3327,7 +3342,8 @@ static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
file_flags = force_nonblock ? O_NONBLOCK : 0;
ret = __sys_accept4_file(req->file, file_flags, accept->addr,
- accept->addr_len, accept->flags);
+ accept->addr_len, accept->flags,
+ accept->nofile);
if (ret == -EAGAIN && force_nonblock)
return -EAGAIN;
if (ret == -ERESTARTSYS)
@@ -4121,6 +4137,9 @@ static int io_req_defer_prep(struct io_kiocb *req,
{
ssize_t ret = 0;
+ if (!sqe)
+ return 0;
+
if (io_op_defs[req->opcode].file_table) {
ret = io_grab_files(req);
if (unlikely(ret))
@@ -4705,11 +4724,21 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_kiocb *linked_timeout;
struct io_kiocb *nxt = NULL;
+ const struct cred *old_creds = NULL;
int ret;
again:
linked_timeout = io_prep_linked_timeout(req);
+ if (req->work.creds && req->work.creds != current_cred()) {
+ if (old_creds)
+ revert_creds(old_creds);
+ if (old_creds == req->work.creds)
+ old_creds = NULL; /* restored original creds */
+ else
+ old_creds = override_creds(req->work.creds);
+ }
+
ret = io_issue_sqe(req, sqe, &nxt, true);
/*
@@ -4735,7 +4764,7 @@ punt:
err:
/* drop submission reference */
- io_put_req(req);
+ io_put_req_find_next(req, &nxt);
if (linked_timeout) {
if (!ret)
@@ -4759,6 +4788,8 @@ done_req:
goto punt;
goto again;
}
+ if (old_creds)
+ revert_creds(old_creds);
}
static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -4803,7 +4834,6 @@ static inline void io_queue_link_head(struct io_kiocb *req)
static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
struct io_submit_state *state, struct io_kiocb **link)
{
- const struct cred *old_creds = NULL;
struct io_ring_ctx *ctx = req->ctx;
unsigned int sqe_flags;
int ret, id;
@@ -4818,14 +4848,12 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
id = READ_ONCE(sqe->personality);
if (id) {
- const struct cred *personality_creds;
-
- personality_creds = idr_find(&ctx->personality_idr, id);
- if (unlikely(!personality_creds)) {
+ req->work.creds = idr_find(&ctx->personality_idr, id);
+ if (unlikely(!req->work.creds)) {
ret = -EINVAL;
goto err_req;
}
- old_creds = override_creds(personality_creds);
+ get_cred(req->work.creds);
}
/* same numerical values with corresponding REQ_F_*, safe to copy */
@@ -4837,8 +4865,6 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
err_req:
io_cqring_add_event(req, ret);
io_double_put_req(req);
- if (old_creds)
- revert_creds(old_creds);
return false;
}
@@ -4890,6 +4916,11 @@ err_req:
if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
req->flags |= REQ_F_LINK;
INIT_LIST_HEAD(&req->link_list);
+
+ if (io_alloc_async_ctx(req)) {
+ ret = -EAGAIN;
+ goto err_req;
+ }
ret = io_req_defer_prep(req, sqe);
if (ret)
req->flags |= REQ_F_FAIL_LINK;
@@ -4899,8 +4930,6 @@ err_req:
}
}
- if (old_creds)
- revert_creds(old_creds);
return true;
}
@@ -5081,9 +5110,8 @@ static int io_sq_thread(void *data)
const struct cred *old_cred;
mm_segment_t old_fs;
DEFINE_WAIT(wait);
- unsigned inflight;
unsigned long timeout;
- int ret;
+ int ret = 0;
complete(&ctx->completions[1]);
@@ -5091,39 +5119,19 @@ static int io_sq_thread(void *data)
set_fs(USER_DS);
old_cred = override_creds(ctx->creds);
- ret = timeout = inflight = 0;
+ timeout = jiffies + ctx->sq_thread_idle;
while (!kthread_should_park()) {
unsigned int to_submit;
- if (inflight) {
+ if (!list_empty(&ctx->poll_list)) {
unsigned nr_events = 0;
- if (ctx->flags & IORING_SETUP_IOPOLL) {
- /*
- * inflight is the count of the maximum possible
- * entries we submitted, but it can be smaller
- * if we dropped some of them. If we don't have
- * poll entries available, then we know that we
- * have nothing left to poll for. Reset the
- * inflight count to zero in that case.
- */
- mutex_lock(&ctx->uring_lock);
- if (!list_empty(&ctx->poll_list))
- io_iopoll_getevents(ctx, &nr_events, 0);
- else
- inflight = 0;
- mutex_unlock(&ctx->uring_lock);
- } else {
- /*
- * Normal IO, just pretend everything completed.
- * We don't have to poll completions for that.
- */
- nr_events = inflight;
- }
-
- inflight -= nr_events;
- if (!inflight)
+ mutex_lock(&ctx->uring_lock);
+ if (!list_empty(&ctx->poll_list))
+ io_iopoll_getevents(ctx, &nr_events, 0);
+ else
timeout = jiffies + ctx->sq_thread_idle;
+ mutex_unlock(&ctx->uring_lock);
}
to_submit = io_sqring_entries(ctx);
@@ -5152,7 +5160,7 @@ static int io_sq_thread(void *data)
* more IO, we should wait for the application to
* reap events and wake us up.
*/
- if (inflight ||
+ if (!list_empty(&ctx->poll_list) ||
(!time_after(jiffies, timeout) && ret != -EBUSY &&
!percpu_ref_is_dying(&ctx->refs))) {
cond_resched();
@@ -5162,6 +5170,19 @@ static int io_sq_thread(void *data)
prepare_to_wait(&ctx->sqo_wait, &wait,
TASK_INTERRUPTIBLE);
+ /*
+ * While doing polled IO, before going to sleep, we need
+ * to check if there are new reqs added to poll_list, it
+ * is because reqs may have been punted to io worker and
+ * will be added to poll_list later, hence check the
+ * poll_list again.
+ */
+ if ((ctx->flags & IORING_SETUP_IOPOLL) &&
+ !list_empty_careful(&ctx->poll_list)) {
+ finish_wait(&ctx->sqo_wait, &wait);
+ continue;
+ }
+
/* Tell userspace we may need a wakeup call */
ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
/* make sure to read SQ tail after writing flags */
@@ -5189,8 +5210,7 @@ static int io_sq_thread(void *data)
mutex_lock(&ctx->uring_lock);
ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
mutex_unlock(&ctx->uring_lock);
- if (ret > 0)
- inflight += ret;
+ timeout = jiffies + ctx->sq_thread_idle;
}
set_fs(old_fs);
@@ -5324,6 +5344,23 @@ static void io_file_ref_kill(struct percpu_ref *ref)
complete(&data->done);
}
+static void io_file_ref_exit_and_free(struct work_struct *work)
+{
+ struct fixed_file_data *data;
+
+ data = container_of(work, struct fixed_file_data, ref_work);
+
+ /*
+ * Ensure any percpu-ref atomic switch callback has run, it could have
+ * been in progress when the files were being unregistered. Once
+ * that's done, we can safely exit and free the ref and containing
+ * data structure.
+ */
+ rcu_barrier();
+ percpu_ref_exit(&data->refs);
+ kfree(data);
+}
+
static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
struct fixed_file_data *data = ctx->file_data;
@@ -5336,14 +5373,14 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
flush_work(&data->ref_work);
wait_for_completion(&data->done);
io_ring_file_ref_flush(data);
- percpu_ref_exit(&data->refs);
__io_sqe_files_unregister(ctx);
nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
for (i = 0; i < nr_tables; i++)
kfree(data->table[i].files);
kfree(data->table);
- kfree(data);
+ INIT_WORK(&data->ref_work, io_file_ref_exit_and_free);
+ queue_work(system_wq, &data->ref_work);
ctx->file_data = NULL;
ctx->nr_user_files = 0;
return 0;
@@ -5595,7 +5632,6 @@ static void io_ring_file_ref_switch(struct work_struct *work)
data = container_of(work, struct fixed_file_data, ref_work);
io_ring_file_ref_flush(data);
- percpu_ref_get(&data->refs);
percpu_ref_switch_to_percpu(&data->refs);
}
@@ -5771,8 +5807,13 @@ static void io_atomic_switch(struct percpu_ref *ref)
{
struct fixed_file_data *data;
+ /*
+ * Juggle reference to ensure we hit zero, if needed, so we can
+ * switch back to percpu mode
+ */
data = container_of(ref, struct fixed_file_data, refs);
- clear_bit(FFD_F_ATOMIC, &data->state);
+ percpu_ref_put(&data->refs);
+ percpu_ref_get(&data->refs);
}
static bool io_queue_file_removal(struct fixed_file_data *data,
@@ -5795,11 +5836,7 @@ static bool io_queue_file_removal(struct fixed_file_data *data,
llist_add(&pfile->llist, &data->put_llist);
if (pfile == &pfile_stack) {
- if (!test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
- percpu_ref_put(&data->refs);
- percpu_ref_switch_to_atomic(&data->refs,
- io_atomic_switch);
- }
+ percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
wait_for_completion(&done);
flush_work(&data->ref_work);
return false;
@@ -5873,10 +5910,8 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
up->offset++;
}
- if (ref_switch && !test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
- percpu_ref_put(&data->refs);
+ if (ref_switch)
percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
- }
return done ? done : err;
}
@@ -6334,6 +6369,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_sqe_buffer_unregister(ctx);
io_sqe_files_unregister(ctx);
io_eventfd_unregister(ctx);
+ idr_destroy(&ctx->personality_idr);
#if defined(CONFIG_UNIX)
if (ctx->ring_sock) {
@@ -6647,6 +6683,7 @@ out_fput:
return submitted ? submitted : ret;
}
+#ifdef CONFIG_PROC_FS
static int io_uring_show_cred(int id, void *p, void *data)
{
const struct cred *cred = p;
@@ -6720,6 +6757,7 @@ static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
percpu_ref_put(&ctx->refs);
}
}
+#endif
static const struct file_operations io_uring_fops = {
.release = io_uring_release,
@@ -6731,7 +6769,9 @@ static const struct file_operations io_uring_fops = {
#endif
.poll = io_uring_poll,
.fasync = io_uring_fasync,
+#ifdef CONFIG_PROC_FS
.show_fdinfo = io_uring_show_fdinfo,
+#endif
};
static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index d181948c0390..3dccc23cf010 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1150,8 +1150,8 @@ static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh,
/* For undo access buffer must have data copied */
if (undo && !jh->b_committed_data)
goto out;
- if (jh->b_transaction != handle->h_transaction &&
- jh->b_next_transaction != handle->h_transaction)
+ if (READ_ONCE(jh->b_transaction) != handle->h_transaction &&
+ READ_ONCE(jh->b_next_transaction) != handle->h_transaction)
goto out;
/*
* There are two reasons for the barrier here:
@@ -2569,8 +2569,8 @@ bool __jbd2_journal_refile_buffer(struct journal_head *jh)
* our jh reference and thus __jbd2_journal_file_buffer() must not
* take a new one.
*/
- jh->b_transaction = jh->b_next_transaction;
- jh->b_next_transaction = NULL;
+ WRITE_ONCE(jh->b_transaction, jh->b_next_transaction);
+ WRITE_ONCE(jh->b_next_transaction, NULL);
if (buffer_freed(bh))
jlist = BJ_Forget;
else if (jh->b_modified)
diff --git a/fs/locks.c b/fs/locks.c
index 44b6da032842..b8a31c1c4fff 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -725,7 +725,6 @@ static void __locks_delete_block(struct file_lock *waiter)
{
locks_delete_global_blocked(waiter);
list_del_init(&waiter->fl_blocked_member);
- waiter->fl_blocker = NULL;
}
static void __locks_wake_up_blocks(struct file_lock *blocker)
@@ -740,6 +739,13 @@ static void __locks_wake_up_blocks(struct file_lock *blocker)
waiter->fl_lmops->lm_notify(waiter);
else
wake_up(&waiter->fl_wait);
+
+ /*
+ * The setting of fl_blocker to NULL marks the "done"
+ * point in deleting a block. Paired with acquire at the top
+ * of locks_delete_block().
+ */
+ smp_store_release(&waiter->fl_blocker, NULL);
}
}
@@ -754,24 +760,41 @@ int locks_delete_block(struct file_lock *waiter)
int status = -ENOENT;
/*
- * If fl_blocker is NULL, it won't be set again as this thread
- * "owns" the lock and is the only one that might try to claim
- * the lock. So it is safe to test fl_blocker locklessly.
- * Also if fl_blocker is NULL, this waiter is not listed on
- * fl_blocked_requests for some lock, so no other request can
- * be added to the list of fl_blocked_requests for this
- * request. So if fl_blocker is NULL, it is safe to
- * locklessly check if fl_blocked_requests is empty. If both
- * of these checks succeed, there is no need to take the lock.
+ * If fl_blocker is NULL, it won't be set again as this thread "owns"
+ * the lock and is the only one that might try to claim the lock.
+ *
+ * We use acquire/release to manage fl_blocker so that we can
+ * optimize away taking the blocked_lock_lock in many cases.
+ *
+ * The smp_load_acquire guarantees two things:
+ *
+ * 1/ that fl_blocked_requests can be tested locklessly. If something
+ * was recently added to that list it must have been in a locked region
+ * *before* the locked region when fl_blocker was set to NULL.
+ *
+ * 2/ that no other thread is accessing 'waiter', so it is safe to free
+ * it. __locks_wake_up_blocks is careful not to touch waiter after
+ * fl_blocker is released.
+ *
+ * If a lockless check of fl_blocker shows it to be NULL, we know that
+ * no new locks can be inserted into its fl_blocked_requests list, and
+ * can avoid doing anything further if the list is empty.
*/
- if (waiter->fl_blocker == NULL &&
+ if (!smp_load_acquire(&waiter->fl_blocker) &&
list_empty(&waiter->fl_blocked_requests))
return status;
+
spin_lock(&blocked_lock_lock);
if (waiter->fl_blocker)
status = 0;
__locks_wake_up_blocks(waiter);
__locks_delete_block(waiter);
+
+ /*
+ * The setting of fl_blocker to NULL marks the "done" point in deleting
+ * a block. Paired with acquire at the top of this function.
+ */
+ smp_store_release(&waiter->fl_blocker, NULL);
spin_unlock(&blocked_lock_lock);
return status;
}
@@ -1364,7 +1387,8 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
error = posix_lock_inode(inode, fl, NULL);
if (error != FILE_LOCK_DEFERRED)
break;
- error = wait_event_interruptible(fl->fl_wait, !fl->fl_blocker);
+ error = wait_event_interruptible(fl->fl_wait,
+ list_empty(&fl->fl_blocked_member));
if (error)
break;
}
@@ -1449,7 +1473,8 @@ int locks_mandatory_area(struct inode *inode, struct file *filp, loff_t start,
error = posix_lock_inode(inode, &fl, NULL);
if (error != FILE_LOCK_DEFERRED)
break;
- error = wait_event_interruptible(fl.fl_wait, !fl.fl_blocker);
+ error = wait_event_interruptible(fl.fl_wait,
+ list_empty(&fl.fl_blocked_member));
if (!error) {
/*
* If we've been sleeping someone might have
@@ -1652,7 +1677,8 @@ restart:
locks_dispose_list(&dispose);
error = wait_event_interruptible_timeout(new_fl->fl_wait,
- !new_fl->fl_blocker, break_time);
+ list_empty(&new_fl->fl_blocked_member),
+ break_time);
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
@@ -2136,7 +2162,8 @@ static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
error = flock_lock_inode(inode, fl);
if (error != FILE_LOCK_DEFERRED)
break;
- error = wait_event_interruptible(fl->fl_wait, !fl->fl_blocker);
+ error = wait_event_interruptible(fl->fl_wait,
+ list_empty(&fl->fl_blocked_member));
if (error)
break;
}
@@ -2413,7 +2440,8 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
error = vfs_lock_file(filp, cmd, fl, NULL);
if (error != FILE_LOCK_DEFERRED)
break;
- error = wait_event_interruptible(fl->fl_wait, !fl->fl_blocker);
+ error = wait_event_interruptible(fl->fl_wait,
+ list_empty(&fl->fl_blocked_member));
if (error)
break;
}
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 989c30c98511..f1ff3076e4a4 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -153,6 +153,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL)
goto error_0;
+ clp->cl_minorversion = cl_init->minorversion;
clp->cl_nfs_mod = cl_init->nfs_mod;
if (!try_module_get(clp->cl_nfs_mod->owner))
goto error_dealloc;
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index e1b938457ab9..e113fcb4bb4c 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -832,6 +832,8 @@ static int nfs_parse_source(struct fs_context *fc,
if (len > maxnamlen)
goto out_hostname;
+ kfree(ctx->nfs_server.hostname);
+
/* N.B. caller will free nfs_server.hostname in all cases */
ctx->nfs_server.hostname = kmemdup_nul(dev_name, len, GFP_KERNEL);
if (!ctx->nfs_server.hostname)
@@ -1240,6 +1242,13 @@ static int nfs_fs_context_validate(struct fs_context *fc)
}
ctx->nfs_mod = nfs_mod;
}
+
+ /* Ensure the filesystem context has the correct fs_type */
+ if (fc->fs_type != ctx->nfs_mod->nfs_fs) {
+ module_put(fc->fs_type->owner);
+ __module_get(ctx->nfs_mod->nfs_fs->owner);
+ fc->fs_type = ctx->nfs_mod->nfs_fs;
+ }
return 0;
out_no_device_name:
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 52270bfac120..1abf126c2df4 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -31,6 +31,7 @@ static DEFINE_SPINLOCK(nfs_fscache_keys_lock);
struct nfs_server_key {
struct {
uint16_t nfsversion; /* NFS protocol version */
+ uint32_t minorversion; /* NFSv4 minor version */
uint16_t family; /* address family */
__be16 port; /* IP port */
} hdr;
@@ -55,6 +56,7 @@ void nfs_fscache_get_client_cookie(struct nfs_client *clp)
memset(&key, 0, sizeof(key));
key.hdr.nfsversion = clp->rpc_ops->version;
+ key.hdr.minorversion = clp->cl_minorversion;
key.hdr.family = clp->cl_addr.ss_family;
switch (clp->cl_addr.ss_family) {
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index ad6077404947..f3ece8ed3203 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -153,7 +153,7 @@ struct vfsmount *nfs_d_automount(struct path *path)
/* Open a new filesystem context, transferring parameters from the
* parent superblock, including the network namespace.
*/
- fc = fs_context_for_submount(&nfs_fs_type, path->dentry);
+ fc = fs_context_for_submount(path->mnt->mnt_sb->s_type, path->dentry);
if (IS_ERR(fc))
return ERR_CAST(fc);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 0cd767e5c977..0bd77cc1f639 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -216,7 +216,6 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
INIT_LIST_HEAD(&clp->cl_ds_clients);
rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
- clp->cl_minorversion = cl_init->minorversion;
clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
clp->cl_mig_gen = 1;
#if IS_ENABLED(CONFIG_NFS_V4_1)
diff --git a/fs/open.c b/fs/open.c
index 0788b3715731..b69d6eed67e6 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -860,9 +860,6 @@ cleanup_file:
* the return value of d_splice_alias(), then the caller needs to perform dput()
* on it after finish_open().
*
- * On successful return @file is a fully instantiated open file. After this, if
- * an error occurs in ->atomic_open(), it needs to clean up with fput().
- *
* Returns zero on success or -errno if the open failed.
*/
int finish_open(struct file *file, struct dentry *dentry,
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig
index 444e2da4f60e..714c14c47ca5 100644
--- a/fs/overlayfs/Kconfig
+++ b/fs/overlayfs/Kconfig
@@ -93,6 +93,7 @@ config OVERLAY_FS_XINO_AUTO
bool "Overlayfs: auto enable inode number mapping"
default n
depends on OVERLAY_FS
+ depends on 64BIT
help
If this config option is enabled then overlay filesystems will use
unused high bits in undelying filesystem inode numbers to map all
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index a5317216de73..87c362f65448 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -244,6 +244,9 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
if (iocb->ki_flags & IOCB_WRITE) {
struct inode *inode = file_inode(orig_iocb->ki_filp);
+ /* Actually acquired in ovl_write_iter() */
+ __sb_writers_acquired(file_inode(iocb->ki_filp)->i_sb,
+ SB_FREEZE_WRITE);
file_end_write(iocb->ki_filp);
ovl_copyattr(ovl_inode_real(inode), inode);
}
@@ -346,6 +349,9 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
goto out;
file_start_write(real.file);
+ /* Pacify lockdep, same trick as done in aio_write() */
+ __sb_writers_release(file_inode(real.file)->i_sb,
+ SB_FREEZE_WRITE);
aio_req->fd = real;
real.flags = 0;
aio_req->orig_iocb = iocb;
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 3623d28aa4fa..3d3f2b8bdae5 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -318,7 +318,12 @@ static inline unsigned int ovl_xino_bits(struct super_block *sb)
return ovl_same_dev(sb) ? OVL_FS(sb)->xino_mode : 0;
}
-static inline int ovl_inode_lock(struct inode *inode)
+static inline void ovl_inode_lock(struct inode *inode)
+{
+ mutex_lock(&OVL_I(inode)->lock);
+}
+
+static inline int ovl_inode_lock_interruptible(struct inode *inode)
{
return mutex_lock_interruptible(&OVL_I(inode)->lock);
}
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 319fe0d355b0..ac967f1cb6e5 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1411,6 +1411,8 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs,
if (ofs->config.xino == OVL_XINO_ON)
pr_info("\"xino=on\" is useless with all layers on same fs, ignore.\n");
ofs->xino_mode = 0;
+ } else if (ofs->config.xino == OVL_XINO_OFF) {
+ ofs->xino_mode = -1;
} else if (ofs->config.xino == OVL_XINO_ON && ofs->xino_mode < 0) {
/*
* This is a roundup of number of bits needed for encoding
@@ -1623,8 +1625,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
sb->s_stack_depth = 0;
sb->s_maxbytes = MAX_LFS_FILESIZE;
/* Assume underlaying fs uses 32bit inodes unless proven otherwise */
- if (ofs->config.xino != OVL_XINO_OFF)
+ if (ofs->config.xino != OVL_XINO_OFF) {
ofs->xino_mode = BITS_PER_LONG - 32;
+ if (!ofs->xino_mode) {
+ pr_warn("xino not supported on 32bit kernel, falling back to xino=off.\n");
+ ofs->config.xino = OVL_XINO_OFF;
+ }
+ }
/* alloc/destroy_inode needed for setting up traps in inode cache */
sb->s_op = &ovl_super_operations;
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index ea005085803f..042f7eb4f7f4 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -509,7 +509,7 @@ int ovl_copy_up_start(struct dentry *dentry, int flags)
struct inode *inode = d_inode(dentry);
int err;
- err = ovl_inode_lock(inode);
+ err = ovl_inode_lock_interruptible(inode);
if (!err && ovl_already_copied_up_locked(dentry, flags)) {
err = 1; /* Already copied up */
ovl_inode_unlock(inode);
@@ -764,7 +764,7 @@ int ovl_nlink_start(struct dentry *dentry)
return err;
}
- err = ovl_inode_lock(inode);
+ err = ovl_inode_lock_interruptible(inode);
if (err)
return err;
diff --git a/fs/zonefs/Kconfig b/fs/zonefs/Kconfig
index fb87ad372e29..ef2697b78820 100644
--- a/fs/zonefs/Kconfig
+++ b/fs/zonefs/Kconfig
@@ -2,6 +2,7 @@ config ZONEFS_FS
tristate "zonefs filesystem support"
depends on BLOCK
depends on BLK_DEV_ZONED
+ select FS_IOMAP
help
zonefs is a simple file system which exposes zones of a zoned block
device (e.g. host-managed or host-aware SMR disk drives) as files.
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 8bc6ef82d693..69aee3dfb660 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -601,13 +601,13 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
ssize_t ret;
/*
- * For async direct IOs to sequential zone files, ignore IOCB_NOWAIT
+ * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
* as this can cause write reordering (e.g. the first aio gets EAGAIN
* on the inode lock but the second goes through but is now unaligned).
*/
- if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !is_sync_kiocb(iocb)
- && (iocb->ki_flags & IOCB_NOWAIT))
- iocb->ki_flags &= ~IOCB_NOWAIT;
+ if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !is_sync_kiocb(iocb) &&
+ (iocb->ki_flags & IOCB_NOWAIT))
+ return -EOPNOTSUPP;
if (iocb->ki_flags & IOCB_NOWAIT) {
if (!inode_trylock(inode))