Diffstat (limited to 'fs')
-rw-r--r--  fs/ceph/caps.c            |  41
-rw-r--r--  fs/ceph/debugfs.c         |  13
-rw-r--r--  fs/ceph/mds_client.c      |   8
-rw-r--r--  fs/ceph/mds_client.h      |   9
-rw-r--r--  fs/ceph/mdsmap.c          |  12
-rw-r--r--  fs/ceph/super.c           |  28
-rw-r--r--  fs/ceph/super.h           |  16
-rw-r--r--  fs/cifs/cifsglob.h        |   2
-rw-r--r--  fs/cifs/cifssmb.c         |   3
-rw-r--r--  fs/cifs/smb2inode.c       |   1
-rw-r--r--  fs/cifs/smb2ops.c         |  19
-rw-r--r--  fs/cifs/smb2pdu.c         |   2
-rw-r--r--  fs/cifs/smb2proto.h       |   2
-rw-r--r--  fs/crypto/keyring.c       |   2
-rw-r--r--  fs/io-wq.c                |  34
-rw-r--r--  fs/io-wq.h                |   7
-rw-r--r--  fs/io_uring.c             | 168
-rw-r--r--  fs/overlayfs/copy_up.c    |  53
-rw-r--r--  fs/overlayfs/dir.c        |   2
-rw-r--r--  fs/overlayfs/export.c     |  80
-rw-r--r--  fs/overlayfs/inode.c      |   8
-rw-r--r--  fs/overlayfs/namei.c      |  52
-rw-r--r--  fs/overlayfs/overlayfs.h  |  34
-rw-r--r--  fs/overlayfs/ovl_entry.h  |   2
-rw-r--r--  fs/overlayfs/super.c      |  24
-rw-r--r--  fs/verity/enable.c        |   2
26 files changed, 393 insertions(+), 231 deletions(-)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index f5a38910a82b..9d09bb53c1ab 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1011,18 +1011,13 @@ static int __ceph_is_single_caps(struct ceph_inode_info *ci)
return rb_first(&ci->i_caps) == rb_last(&ci->i_caps);
}
-static int __ceph_is_any_caps(struct ceph_inode_info *ci)
-{
- return !RB_EMPTY_ROOT(&ci->i_caps);
-}
-
int ceph_is_any_caps(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
int ret;
spin_lock(&ci->i_ceph_lock);
- ret = __ceph_is_any_caps(ci);
+ ret = __ceph_is_any_real_caps(ci);
spin_unlock(&ci->i_ceph_lock);
return ret;
@@ -1099,15 +1094,16 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
if (removed)
ceph_put_cap(mdsc, cap);
- /* when reconnect denied, we remove session caps forcibly,
- * i_wr_ref can be non-zero. If there are ongoing write,
- * keep i_snap_realm.
- */
- if (!__ceph_is_any_caps(ci) && ci->i_wr_ref == 0 && ci->i_snap_realm)
- drop_inode_snap_realm(ci);
+ if (!__ceph_is_any_real_caps(ci)) {
+ /* when reconnect is denied, we remove session caps forcibly,
+ * i_wr_ref can be non-zero. If there are ongoing writes,
+ * keep i_snap_realm.
+ */
+ if (ci->i_wr_ref == 0 && ci->i_snap_realm)
+ drop_inode_snap_realm(ci);
- if (!__ceph_is_any_real_caps(ci))
__cap_delay_cancel(mdsc, ci);
+ }
}
struct cap_msg_args {
@@ -2764,7 +2760,19 @@ int ceph_get_caps(struct file *filp, int need, int want,
if (ret == -EAGAIN)
continue;
if (!ret) {
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct cap_wait cw;
DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+ cw.ino = inode->i_ino;
+ cw.tgid = current->tgid;
+ cw.need = need;
+ cw.want = want;
+
+ spin_lock(&mdsc->caps_list_lock);
+ list_add(&cw.list, &mdsc->cap_wait_list);
+ spin_unlock(&mdsc->caps_list_lock);
+
add_wait_queue(&ci->i_cap_wq, &wait);
flags |= NON_BLOCKING;
@@ -2778,6 +2786,11 @@ int ceph_get_caps(struct file *filp, int need, int want,
}
remove_wait_queue(&ci->i_cap_wq, &wait);
+
+ spin_lock(&mdsc->caps_list_lock);
+ list_del(&cw.list);
+ spin_unlock(&mdsc->caps_list_lock);
+
if (ret == -EAGAIN)
continue;
}
@@ -2928,7 +2941,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
ci->i_head_snapc = NULL;
}
/* see comment in __ceph_remove_cap() */
- if (!__ceph_is_any_caps(ci) && ci->i_snap_realm)
+ if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm)
drop_inode_snap_realm(ci);
}
spin_unlock(&ci->i_ceph_lock);
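The ceph_get_caps() change above is a register/wait/unregister pattern: a stack-allocated record is published on a spinlock-protected list for the duration of the sleep, so that an observer (here, debugfs) can enumerate in-flight waiters. A minimal sketch of the same pattern — not part of this patch, with hypothetical names (struct waiter, waiter_list, work_ready()):

    #include <linux/list.h>
    #include <linux/sched.h>
    #include <linux/spinlock.h>
    #include <linux/wait.h>

    static LIST_HEAD(waiter_list);           /* what a debugfs walker would read */
    static DEFINE_SPINLOCK(waiter_lock);     /* protects waiter_list */
    static DECLARE_WAIT_QUEUE_HEAD(wq);

    struct waiter {
            struct list_head list;
            pid_t tgid;
    };

    static int wait_and_report(bool (*work_ready)(void))
    {
            struct waiter w = { .tgid = current->tgid };
            int ret;

            spin_lock(&waiter_lock);
            list_add(&w.list, &waiter_list);   /* now visible to observers */
            spin_unlock(&waiter_lock);

            ret = wait_event_interruptible(wq, work_ready());

            spin_lock(&waiter_lock);
            list_del(&w.list);                 /* unlink before the stack frame dies */
            spin_unlock(&waiter_lock);
            return ret;
    }

Because the record lives on the waiter's stack, unregistering before return is mandatory — which is exactly why the patch brackets the wait with list_add()/list_del() under caps_list_lock.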
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index facb387c2735..c281f32b54f7 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -139,6 +139,7 @@ static int caps_show(struct seq_file *s, void *p)
struct ceph_fs_client *fsc = s->private;
struct ceph_mds_client *mdsc = fsc->mdsc;
int total, avail, used, reserved, min, i;
+ struct cap_wait *cw;
ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min);
seq_printf(s, "total\t\t%d\n"
@@ -166,6 +167,18 @@ static int caps_show(struct seq_file *s, void *p)
}
mutex_unlock(&mdsc->mutex);
+ seq_printf(s, "\n\nWaiters:\n--------\n");
+ seq_printf(s, "tgid ino need want\n");
+ seq_printf(s, "-----------------------------------------------------\n");
+
+ spin_lock(&mdsc->caps_list_lock);
+ list_for_each_entry(cw, &mdsc->cap_wait_list, list) {
+ seq_printf(s, "%-13d0x%-17lx%-17s%-17s\n", cw->tgid, cw->ino,
+ ceph_cap_string(cw->need),
+ ceph_cap_string(cw->want));
+ }
+ spin_unlock(&mdsc->caps_list_lock);
+
return 0;
}
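Given the format strings above, the new section of the caps debugfs file would render roughly as follows (hedged sample — the need/want strings come from ceph_cap_string(), all values are illustrative):

    Waiters:
    --------
    tgid         ino               need             want
    -----------------------------------------------------
    1234         0x10000000001    Fr               Fc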
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 068b029cf073..374db1bd57d1 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2015,7 +2015,7 @@ void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr)
if (!nr)
return;
val = atomic_add_return(nr, &mdsc->cap_reclaim_pending);
- if (!(val % CEPH_CAPS_PER_RELEASE)) {
+ if ((val % CEPH_CAPS_PER_RELEASE) < nr) {
atomic_set(&mdsc->cap_reclaim_pending, 0);
ceph_queue_cap_reclaim_work(mdsc);
}
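The old test !(val % CEPH_CAPS_PER_RELEASE) only fired when the running counter landed exactly on a multiple of CEPH_CAPS_PER_RELEASE; an atomic_add_return() of nr > 1 can step over the boundary without ever equalling it. The new test is true iff the half-open interval (val - nr, val] contains a multiple of the release size. A small user-space sketch of the predicate (illustrative, not part of the patch):

    #include <assert.h>

    /* true iff adding nr made the counter reach or cross a multiple of n */
    static int crossed(unsigned long val, unsigned long nr, unsigned long n)
    {
            return (val % n) < nr;
    }

    int main(void)
    {
            assert(crossed(9, 3, 8));    /* 6 -> 9 steps over 8; 9 % 8 == 1 < 3 */
            assert(!crossed(7, 3, 8));   /* 4 -> 7 crosses nothing; 7 % 8 == 7 */
            assert(crossed(8, 1, 8));    /* landing exactly on 8 still counts */
            return 0;
    }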
@@ -2032,12 +2032,13 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
size_t size = sizeof(struct ceph_mds_reply_dir_entry);
- int order, num_entries;
+ unsigned int num_entries;
+ int order;
spin_lock(&ci->i_ceph_lock);
num_entries = ci->i_files + ci->i_subdirs;
spin_unlock(&ci->i_ceph_lock);
- num_entries = max(num_entries, 1);
+ num_entries = max(num_entries, 1U);
num_entries = min(num_entries, opt->max_readdir);
order = get_order(size * num_entries);
@@ -4168,6 +4169,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
mdsc->last_renew_caps = jiffies;
INIT_LIST_HEAD(&mdsc->cap_delay_list);
+ INIT_LIST_HEAD(&mdsc->cap_wait_list);
spin_lock_init(&mdsc->cap_delay_lock);
INIT_LIST_HEAD(&mdsc->snap_flush_list);
spin_lock_init(&mdsc->snap_flush_lock);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 5cd131b41d84..14c7e8c49970 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -340,6 +340,14 @@ struct ceph_quotarealm_inode {
struct inode *inode;
};
+struct cap_wait {
+ struct list_head list;
+ unsigned long ino;
+ pid_t tgid;
+ int need;
+ int want;
+};
+
/*
* mds client state
*/
@@ -416,6 +424,7 @@ struct ceph_mds_client {
spinlock_t caps_list_lock;
struct list_head caps_list; /* unused (reserved or
unreserved) */
+ struct list_head cap_wait_list;
int caps_total_count; /* total caps allocated */
int caps_use_count; /* in use */
int caps_use_max; /* max used caps */
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index aeec1d6e3769..471bac335fae 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -158,6 +158,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
void *pexport_targets = NULL;
struct ceph_timespec laggy_since;
struct ceph_mds_info *info;
+ bool laggy;
ceph_decode_need(p, end, sizeof(u64) + 1, bad);
global_id = ceph_decode_64(p);
@@ -190,6 +191,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
if (err)
goto corrupt;
ceph_decode_copy(p, &laggy_since, sizeof(laggy_since));
+ laggy = laggy_since.tv_sec != 0 || laggy_since.tv_nsec != 0;
*p += sizeof(u32);
ceph_decode_32_safe(p, end, namelen, bad);
*p += namelen;
@@ -207,10 +209,11 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
*p = info_end;
}
- dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
+ dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s%s\n",
i+1, n, global_id, mds, inc,
ceph_pr_addr(&addr),
- ceph_mds_state_name(state));
+ ceph_mds_state_name(state),
+ laggy ? "(laggy)" : "");
if (mds < 0 || state <= 0)
continue;
@@ -230,8 +233,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
info->global_id = global_id;
info->state = state;
info->addr = addr;
- info->laggy = (laggy_since.tv_sec != 0 ||
- laggy_since.tv_nsec != 0);
+ info->laggy = laggy;
info->num_export_targets = num_export_targets;
if (num_export_targets) {
info->export_targets = kcalloc(num_export_targets,
@@ -355,6 +357,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
m->m_damaged = false;
}
bad_ext:
+ dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n",
+ !!m->m_enabled, !!m->m_damaged, m->m_num_laggy);
*p = end;
dout("mdsmap_decode success epoch %u\n", m->m_epoch);
return m;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 9c9a7c68eea3..29a795f975df 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -172,10 +172,10 @@ static const struct fs_parameter_enum ceph_mount_param_enums[] = {
static const struct fs_parameter_spec ceph_mount_param_specs[] = {
fsparam_flag_no ("acl", Opt_acl),
fsparam_flag_no ("asyncreaddir", Opt_asyncreaddir),
- fsparam_u32 ("caps_max", Opt_caps_max),
+ fsparam_s32 ("caps_max", Opt_caps_max),
fsparam_u32 ("caps_wanted_delay_max", Opt_caps_wanted_delay_max),
fsparam_u32 ("caps_wanted_delay_min", Opt_caps_wanted_delay_min),
- fsparam_s32 ("write_congestion_kb", Opt_congestion_kb),
+ fsparam_u32 ("write_congestion_kb", Opt_congestion_kb),
fsparam_flag_no ("copyfrom", Opt_copyfrom),
fsparam_flag_no ("dcache", Opt_dcache),
fsparam_flag_no ("dirstat", Opt_dirstat),
@@ -187,8 +187,8 @@ static const struct fs_parameter_spec ceph_mount_param_specs[] = {
fsparam_flag_no ("quotadf", Opt_quotadf),
fsparam_u32 ("rasize", Opt_rasize),
fsparam_flag_no ("rbytes", Opt_rbytes),
- fsparam_s32 ("readdir_max_bytes", Opt_readdir_max_bytes),
- fsparam_s32 ("readdir_max_entries", Opt_readdir_max_entries),
+ fsparam_u32 ("readdir_max_bytes", Opt_readdir_max_bytes),
+ fsparam_u32 ("readdir_max_entries", Opt_readdir_max_entries),
fsparam_enum ("recover_session", Opt_recover_session),
fsparam_flag_no ("require_active_mds", Opt_require_active_mds),
fsparam_u32 ("rsize", Opt_rsize),
@@ -328,7 +328,9 @@ static int ceph_parse_mount_param(struct fs_context *fc,
fsopt->caps_wanted_delay_max = result.uint_32;
break;
case Opt_caps_max:
- fsopt->caps_max = result.uint_32;
+ if (result.int_32 < 0)
+ goto out_of_range;
+ fsopt->caps_max = result.int_32;
break;
case Opt_readdir_max_entries:
if (result.uint_32 < 1)
@@ -547,25 +549,25 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_show_option(m, "recover_session", "clean");
if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
- seq_printf(m, ",wsize=%d", fsopt->wsize);
+ seq_printf(m, ",wsize=%u", fsopt->wsize);
if (fsopt->rsize != CEPH_MAX_READ_SIZE)
- seq_printf(m, ",rsize=%d", fsopt->rsize);
+ seq_printf(m, ",rsize=%u", fsopt->rsize);
if (fsopt->rasize != CEPH_RASIZE_DEFAULT)
- seq_printf(m, ",rasize=%d", fsopt->rasize);
+ seq_printf(m, ",rasize=%u", fsopt->rasize);
if (fsopt->congestion_kb != default_congestion_kb())
- seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
+ seq_printf(m, ",write_congestion_kb=%u", fsopt->congestion_kb);
if (fsopt->caps_max)
seq_printf(m, ",caps_max=%d", fsopt->caps_max);
if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
- seq_printf(m, ",caps_wanted_delay_min=%d",
+ seq_printf(m, ",caps_wanted_delay_min=%u",
fsopt->caps_wanted_delay_min);
if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
- seq_printf(m, ",caps_wanted_delay_max=%d",
+ seq_printf(m, ",caps_wanted_delay_max=%u",
fsopt->caps_wanted_delay_max);
if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
- seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
+ seq_printf(m, ",readdir_max_entries=%u", fsopt->max_readdir);
if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
- seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
+ seq_printf(m, ",readdir_max_bytes=%u", fsopt->max_readdir_bytes);
if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
seq_show_option(m, "snapdirname", fsopt->snapdir_name);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index f0f9cb7447ac..3bf1a01cd536 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -73,16 +73,16 @@
#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
struct ceph_mount_options {
- int flags;
+ unsigned int flags;
- int wsize; /* max write size */
- int rsize; /* max read size */
- int rasize; /* max readahead */
- int congestion_kb; /* max writeback in flight */
- int caps_wanted_delay_min, caps_wanted_delay_max;
+ unsigned int wsize; /* max write size */
+ unsigned int rsize; /* max read size */
+ unsigned int rasize; /* max readahead */
+ unsigned int congestion_kb; /* max writeback in flight */
+ unsigned int caps_wanted_delay_min, caps_wanted_delay_max;
int caps_max;
- int max_readdir; /* max readdir result (entires) */
- int max_readdir_bytes; /* max readdir result (bytes) */
+ unsigned int max_readdir; /* max readdir result (entries) */
+ unsigned int max_readdir_bytes; /* max readdir result (bytes) */
/*
* everything above this point can be memcmp'd; everything below
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index fd0262ce5ad5..ce9bac756c2a 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1061,7 +1061,7 @@ cap_unix(struct cifs_ses *ses)
struct cached_fid {
bool is_valid:1; /* Do we have a useable root fid */
bool file_all_info_is_valid:1;
-
+ bool has_lease:1;
struct kref refcount;
struct cifs_fid *fid;
struct mutex fid_mutex;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 4f554f019a98..cc86a67225d1 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -42,6 +42,7 @@
#include "cifsproto.h"
#include "cifs_unicode.h"
#include "cifs_debug.h"
+#include "smb2proto.h"
#include "fscache.h"
#include "smbdirect.h"
#ifdef CONFIG_CIFS_DFS_UPCALL
@@ -112,6 +113,8 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon)
mutex_lock(&tcon->crfid.fid_mutex);
tcon->crfid.is_valid = false;
+ /* cached handle is not valid, so SMB2_CLOSE won't be sent below */
+ close_shroot_lease_locked(&tcon->crfid);
memset(tcon->crfid.fid, 0, sizeof(struct cifs_fid));
mutex_unlock(&tcon->crfid.fid_mutex);
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 18c7a33adceb..5ef5e97a6d13 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -95,6 +95,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
goto finished;
}
+ memset(&oparms, 0, sizeof(struct cifs_open_parms));
oparms.tcon = tcon;
oparms.desired_access = desired_access;
oparms.disposition = create_disposition;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index a5c96bc522cb..6250370c1170 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -616,6 +616,7 @@ smb2_close_cached_fid(struct kref *ref)
cfid->fid->volatile_fid);
cfid->is_valid = false;
cfid->file_all_info_is_valid = false;
+ cfid->has_lease = false;
}
}
@@ -626,13 +627,28 @@ void close_shroot(struct cached_fid *cfid)
mutex_unlock(&cfid->fid_mutex);
}
+void close_shroot_lease_locked(struct cached_fid *cfid)
+{
+ if (cfid->has_lease) {
+ cfid->has_lease = false;
+ kref_put(&cfid->refcount, smb2_close_cached_fid);
+ }
+}
+
+void close_shroot_lease(struct cached_fid *cfid)
+{
+ mutex_lock(&cfid->fid_mutex);
+ close_shroot_lease_locked(cfid);
+ mutex_unlock(&cfid->fid_mutex);
+}
+
void
smb2_cached_lease_break(struct work_struct *work)
{
struct cached_fid *cfid = container_of(work,
struct cached_fid, lease_break);
- close_shroot(cfid);
+ close_shroot_lease(cfid);
}
/*
@@ -773,6 +789,7 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid)
/* BB TBD check to see if oplock level check can be removed below */
if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) {
kref_get(&tcon->crfid.refcount);
+ tcon->crfid.has_lease = true;
smb2_parse_contexts(server, o_rsp,
&oparms.fid->epoch,
oparms.fid->lease_key, &oplock, NULL);
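The has_lease flag added above guards a reference: exactly one kref_put() may ever be issued for the lease, and the flag flips under crfid.fid_mutex, so a racing lease break (smb2_cached_lease_break) and a tree disconnect (SMB2_tdis) cannot double-drop it. A generic sketch of the flag-guarded put — hypothetical names, and assuming the kref was initialized when the lease was taken:

    #include <linux/kref.h>
    #include <linux/mutex.h>

    struct handle {
            struct kref refcount;
            struct mutex lock;
            bool has_lease;           /* one reference is owned by the lease */
    };

    static void handle_release(struct kref *ref)
    {
            /* tear down the cached handle */
    }

    static void drop_lease_ref(struct handle *h)
    {
            mutex_lock(&h->lock);
            if (h->has_lease) {
                    h->has_lease = false;     /* claim the one allowed put */
                    kref_put(&h->refcount, handle_release);
            }
            mutex_unlock(&h->lock);
    }

Callers that already hold the mutex use a _locked variant, mirroring the close_shroot_lease()/close_shroot_lease_locked() split in the patch.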
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 0ab6b1200288..9434f6dd8df3 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1847,7 +1847,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
if ((tcon->need_reconnect) || (tcon->ses->need_reconnect))
return 0;
- close_shroot(&tcon->crfid);
+ close_shroot_lease(&tcon->crfid);
rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, (void **) &req,
&total_len);
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index a18272c987fe..27d29f2eb6c8 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -70,6 +70,8 @@ extern int smb3_handle_read_data(struct TCP_Server_Info *server,
extern int open_shroot(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_fid *pfid);
extern void close_shroot(struct cached_fid *cfid);
+extern void close_shroot_lease(struct cached_fid *cfid);
+extern void close_shroot_lease_locked(struct cached_fid *cfid);
extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst,
struct smb2_file_all_info *src);
extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index 040df1f5e1c8..40cca351273f 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -151,7 +151,7 @@ static struct key *search_fscrypt_keyring(struct key *keyring,
}
#define FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE \
- (CONST_STRLEN("fscrypt-") + FIELD_SIZEOF(struct super_block, s_id))
+ (CONST_STRLEN("fscrypt-") + sizeof_field(struct super_block, s_id))
#define FSCRYPT_MK_DESCRIPTION_SIZE (2 * FSCRYPT_KEY_IDENTIFIER_SIZE + 1)
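FIELD_SIZEOF() and sizeof_field() are the same construct under different names — the kernel consolidated on the latter. Both reduce to taking sizeof of a member through a null pointer, roughly (per include/linux/stddef.h):

    #define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER))

No object is ever dereferenced; sizeof is an unevaluated context.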
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 74b40506c5d9..90c4978781fb 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -49,7 +49,6 @@ struct io_worker {
struct hlist_nulls_node nulls_node;
struct list_head all_list;
struct task_struct *task;
- wait_queue_head_t wait;
struct io_wqe *wqe;
struct io_wq_work *cur_work;
@@ -258,7 +257,7 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
worker = hlist_nulls_entry(n, struct io_worker, nulls_node);
if (io_worker_get(worker)) {
- wake_up(&worker->wait);
+ wake_up_process(worker->task);
io_worker_release(worker);
return true;
}
@@ -492,28 +491,46 @@ next:
} while (1);
}
+static inline void io_worker_spin_for_work(struct io_wqe *wqe)
+{
+ int i = 0;
+
+ while (++i < 1000) {
+ if (io_wqe_run_queue(wqe))
+ break;
+ if (need_resched())
+ break;
+ cpu_relax();
+ }
+}
+
static int io_wqe_worker(void *data)
{
struct io_worker *worker = data;
struct io_wqe *wqe = worker->wqe;
struct io_wq *wq = wqe->wq;
- DEFINE_WAIT(wait);
+ bool did_work;
io_worker_start(wqe, worker);
+ did_work = false;
while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
- prepare_to_wait(&worker->wait, &wait, TASK_INTERRUPTIBLE);
-
+ set_current_state(TASK_INTERRUPTIBLE);
+loop:
+ if (did_work)
+ io_worker_spin_for_work(wqe);
spin_lock_irq(&wqe->lock);
if (io_wqe_run_queue(wqe)) {
__set_current_state(TASK_RUNNING);
io_worker_handle_work(worker);
- continue;
+ did_work = true;
+ goto loop;
}
+ did_work = false;
/* drops the lock on success, retry */
if (__io_worker_idle(wqe, worker)) {
__release(&wqe->lock);
- continue;
+ goto loop;
}
spin_unlock_irq(&wqe->lock);
if (signal_pending(current))
@@ -526,8 +543,6 @@ static int io_wqe_worker(void *data)
break;
}
- finish_wait(&worker->wait, &wait);
-
if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
spin_lock_irq(&wqe->lock);
if (!wq_list_empty(&wqe->work_list))
@@ -589,7 +604,6 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
refcount_set(&worker->ref, 1);
worker->nulls_node.pprev = NULL;
- init_waitqueue_head(&worker->wait);
worker->wqe = wqe;
spin_lock_init(&worker->lock);
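Dropping the per-worker waitqueue in favour of a bare wake_up_process() is safe only because of the set_current_state()/condition-check/schedule() ordering visible in io_wqe_worker() above: a wakeup arriving after the state change puts the task back to TASK_RUNNING, so schedule() returns at once instead of losing the event. The skeleton, reduced to essentials (sketch; work_available() is a hypothetical stand-in for io_wqe_run_queue()):

    static void worker_wait_for_work(void)
    {
            set_current_state(TASK_INTERRUPTIBLE);
            if (!work_available())        /* re-check AFTER changing state */
                    schedule();           /* concurrent wake_up_process() is not lost */
            __set_current_state(TASK_RUNNING);
    }

The loop in io_worker_spin_for_work() is an optimization layered on top: burn a few hundred cpu_relax() iterations hoping new work arrives before paying for a schedule() round trip.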
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 7c333a28e2a7..fb993b2bd0ef 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -35,7 +35,8 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
struct io_wq_work_list *list)
{
if (!list->first) {
- list->first = list->last = node;
+ list->last = node;
+ WRITE_ONCE(list->first, node);
} else {
list->last->next = node;
list->last = node;
@@ -47,7 +48,7 @@ static inline void wq_node_del(struct io_wq_work_list *list,
struct io_wq_work_node *prev)
{
if (node == list->first)
- list->first = node->next;
+ WRITE_ONCE(list->first, node->next);
if (node == list->last)
list->last = prev;
if (prev)
@@ -58,7 +59,7 @@ static inline void wq_node_del(struct io_wq_work_list *list,
#define wq_list_for_each(pos, prv, head) \
for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
-#define wq_list_empty(list) ((list)->first == NULL)
+#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL)
#define INIT_WQ_LIST(list) do { \
(list)->first = NULL; \
(list)->last = NULL; \
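With WRITE_ONCE()/READ_ONCE() on list->first, wq_list_empty() becomes a legitimate lockless peek: the compiler may not tear, fuse, or re-load the pointer, so a reader outside wqe->lock sees either NULL or a valid node. The result is only a hint, though — a caller acting on it must still take the lock and re-check, e.g. (sketch; do_work() is hypothetical):

    if (!wq_list_empty(&wqe->work_list)) {           /* cheap unlocked hint */
            spin_lock_irq(&wqe->lock);
            if (!wq_list_empty(&wqe->work_list))     /* authoritative re-check */
                    do_work(wqe);
            spin_unlock_irq(&wqe->lock);
    }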
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 405be10da73d..9b1833fedc5c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -293,7 +293,7 @@ struct io_poll_iocb {
__poll_t events;
bool done;
bool canceled;
- struct wait_queue_entry *wait;
+ struct wait_queue_entry wait;
};
struct io_timeout_data {
@@ -377,6 +377,7 @@ struct io_kiocb {
#define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */
#define REQ_F_INFLIGHT 16384 /* on inflight list */
#define REQ_F_COMP_LOCKED 32768 /* completion under lock */
+#define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */
u64 user_data;
u32 result;
u32 sequence;
@@ -580,7 +581,9 @@ static inline bool io_prep_async_work(struct io_kiocb *req,
switch (req->sqe->opcode) {
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
- do_hashed = true;
+ /* only regular files should be hashed for writes */
+ if (req->flags & REQ_F_ISREG)
+ do_hashed = true;
/* fall-through */
case IORING_OP_READV:
case IORING_OP_READ_FIXED:
@@ -1292,6 +1295,12 @@ static void kiocb_end_write(struct io_kiocb *req)
file_end_write(req->file);
}
+static inline void req_set_fail_links(struct io_kiocb *req)
+{
+ if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
+ req->flags |= REQ_F_FAIL_LINK;
+}
+
static void io_complete_rw_common(struct kiocb *kiocb, long res)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
@@ -1299,8 +1308,8 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res)
if (kiocb->ki_flags & IOCB_WRITE)
kiocb_end_write(req);
- if ((req->flags & REQ_F_LINK) && res != req->result)
- req->flags |= REQ_F_FAIL_LINK;
+ if (res != req->result)
+ req_set_fail_links(req);
io_cqring_add_event(req, res);
}
@@ -1330,8 +1339,8 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
if (kiocb->ki_flags & IOCB_WRITE)
kiocb_end_write(req);
- if ((req->flags & REQ_F_LINK) && res != req->result)
- req->flags |= REQ_F_FAIL_LINK;
+ if (res != req->result)
+ req_set_fail_links(req);
req->result = res;
if (res != -EAGAIN)
req->flags |= REQ_F_IOPOLL_COMPLETED;
@@ -1422,7 +1431,7 @@ static bool io_file_supports_async(struct file *file)
{
umode_t mode = file_inode(file)->i_mode;
- if (S_ISBLK(mode) || S_ISCHR(mode))
+ if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode))
return true;
if (S_ISREG(mode) && file->f_op != &io_uring_fops)
return true;
@@ -1858,7 +1867,9 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
goto copy_iov;
}
- if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT))
+ /* file path doesn't support NOWAIT for non-direct_IO */
+ if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
+ (req->flags & REQ_F_ISREG))
goto copy_iov;
iov_count = iov_iter_count(&iter);
@@ -1956,8 +1967,8 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
end > 0 ? end : LLONG_MAX,
fsync_flags & IORING_FSYNC_DATASYNC);
- if (ret < 0 && (req->flags & REQ_F_LINK))
- req->flags |= REQ_F_FAIL_LINK;
+ if (ret < 0)
+ req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, nxt);
return 0;
@@ -2003,8 +2014,8 @@ static int io_sync_file_range(struct io_kiocb *req,
ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags);
- if (ret < 0 && (req->flags & REQ_F_LINK))
- req->flags |= REQ_F_FAIL_LINK;
+ if (ret < 0)
+ req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, nxt);
return 0;
@@ -2019,6 +2030,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io)
flags = READ_ONCE(sqe->msg_flags);
msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr);
+ io->msg.iov = io->msg.fast_iov;
return sendmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.iov);
#else
return 0;
@@ -2054,7 +2066,6 @@ static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
} else {
kmsg = &io.msg.msg;
kmsg->msg_name = &addr;
- io.msg.iov = io.msg.fast_iov;
ret = io_sendmsg_prep(req, &io);
if (ret)
goto out;
@@ -2079,8 +2090,8 @@ static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
out:
io_cqring_add_event(req, ret);
- if (ret < 0 && (req->flags & REQ_F_LINK))
- req->flags |= REQ_F_FAIL_LINK;
+ if (ret < 0)
+ req_set_fail_links(req);
io_put_req_find_next(req, nxt);
return 0;
#else
@@ -2097,6 +2108,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io)
flags = READ_ONCE(sqe->msg_flags);
msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr);
+ io->msg.iov = io->msg.fast_iov;
return recvmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.uaddr,
&io->msg.iov);
#else
@@ -2136,7 +2148,6 @@ static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
} else {
kmsg = &io.msg.msg;
kmsg->msg_name = &addr;
- io.msg.iov = io.msg.fast_iov;
ret = io_recvmsg_prep(req, &io);
if (ret)
goto out;
@@ -2161,8 +2172,8 @@ static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
out:
io_cqring_add_event(req, ret);
- if (ret < 0 && (req->flags & REQ_F_LINK))
- req->flags |= REQ_F_FAIL_LINK;
+ if (ret < 0)
+ req_set_fail_links(req);
io_put_req_find_next(req, nxt);
return 0;
#else
@@ -2196,8 +2207,8 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe,
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
- if (ret < 0 && (req->flags & REQ_F_LINK))
- req->flags |= REQ_F_FAIL_LINK;
+ if (ret < 0)
+ req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, nxt);
return 0;
@@ -2263,8 +2274,8 @@ static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (ret == -ERESTARTSYS)
ret = -EINTR;
out:
- if (ret < 0 && (req->flags & REQ_F_LINK))
- req->flags |= REQ_F_FAIL_LINK;
+ if (ret < 0)
+ req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, nxt);
return 0;
@@ -2279,8 +2290,8 @@ static void io_poll_remove_one(struct io_kiocb *req)
spin_lock(&poll->head->lock);
WRITE_ONCE(poll->canceled, true);
- if (!list_empty(&poll->wait->entry)) {
- list_del_init(&poll->wait->entry);
+ if (!list_empty(&poll->wait.entry)) {
+ list_del_init(&poll->wait.entry);
io_queue_async_work(req);
}
spin_unlock(&poll->head->lock);
@@ -2340,8 +2351,8 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe)
spin_unlock_irq(&ctx->completion_lock);
io_cqring_add_event(req, ret);
- if (ret < 0 && (req->flags & REQ_F_LINK))
- req->flags |= REQ_F_FAIL_LINK;
+ if (ret < 0)
+ req_set_fail_links(req);
io_put_req(req);
return 0;
}
@@ -2351,7 +2362,6 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
struct io_ring_ctx *ctx = req->ctx;
req->poll.done = true;
- kfree(req->poll.wait);
if (error)
io_cqring_fill_event(req, error);
else
@@ -2389,7 +2399,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
*/
spin_lock_irq(&ctx->completion_lock);
if (!mask && ret != -ECANCELED) {
- add_wait_queue(poll->head, poll->wait);
+ add_wait_queue(poll->head, &poll->wait);
spin_unlock_irq(&ctx->completion_lock);
return;
}
@@ -2399,8 +2409,8 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
io_cqring_ev_posted(ctx);
- if (ret < 0 && req->flags & REQ_F_LINK)
- req->flags |= REQ_F_FAIL_LINK;
+ if (ret < 0)
+ req_set_fail_links(req);
io_put_req_find_next(req, &nxt);
if (nxt)
*workptr = &nxt->work;
@@ -2419,7 +2429,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (mask && !(mask & poll->events))
return 0;
- list_del_init(&poll->wait->entry);
+ list_del_init(&poll->wait.entry);
/*
* Run completion inline if we can. We're using trylock here because
@@ -2460,7 +2470,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
pt->error = 0;
pt->req->poll.head = head;
- add_wait_queue(head, pt->req->poll.wait);
+ add_wait_queue(head, &pt->req->poll.wait);
}
static void io_poll_req_insert(struct io_kiocb *req)
@@ -2489,10 +2499,6 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (!poll->file)
return -EBADF;
- poll->wait = kmalloc(sizeof(*poll->wait), GFP_KERNEL);
- if (!poll->wait)
- return -ENOMEM;
-
req->io = NULL;
INIT_IO_WORK(&req->work, io_poll_complete_work);
events = READ_ONCE(sqe->poll_events);
@@ -2509,9 +2515,9 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
/* initialized the list so that we can do list_empty checks */
- INIT_LIST_HEAD(&poll->wait->entry);
- init_waitqueue_func_entry(poll->wait, io_poll_wake);
- poll->wait->private = poll;
+ INIT_LIST_HEAD(&poll->wait.entry);
+ init_waitqueue_func_entry(&poll->wait, io_poll_wake);
+ poll->wait.private = poll;
INIT_LIST_HEAD(&req->list);
@@ -2520,14 +2526,14 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
spin_lock_irq(&ctx->completion_lock);
if (likely(poll->head)) {
spin_lock(&poll->head->lock);
- if (unlikely(list_empty(&poll->wait->entry))) {
+ if (unlikely(list_empty(&poll->wait.entry))) {
if (ipt.error)
cancel = true;
ipt.error = 0;
mask = 0;
}
if (mask || ipt.error)
- list_del_init(&poll->wait->entry);
+ list_del_init(&poll->wait.entry);
else if (cancel)
WRITE_ONCE(poll->canceled, true);
else if (!poll->done) /* actually waiting for an event */
@@ -2582,8 +2588,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
- if (req->flags & REQ_F_LINK)
- req->flags |= REQ_F_FAIL_LINK;
+ req_set_fail_links(req);
io_put_req(req);
return HRTIMER_NORESTART;
}
@@ -2608,8 +2613,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
if (ret == -1)
return -EALREADY;
- if (req->flags & REQ_F_LINK)
- req->flags |= REQ_F_FAIL_LINK;
+ req_set_fail_links(req);
io_cqring_fill_event(req, -ECANCELED);
io_put_req(req);
return 0;
@@ -2640,8 +2644,8 @@ static int io_timeout_remove(struct io_kiocb *req,
io_commit_cqring(ctx);
spin_unlock_irq(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
- if (ret < 0 && req->flags & REQ_F_LINK)
- req->flags |= REQ_F_FAIL_LINK;
+ if (ret < 0)
+ req_set_fail_links(req);
io_put_req(req);
return 0;
}
@@ -2822,8 +2826,8 @@ done:
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
- if (ret < 0 && (req->flags & REQ_F_LINK))
- req->flags |= REQ_F_FAIL_LINK;
+ if (ret < 0)
+ req_set_fail_links(req);
io_put_req_find_next(req, nxt);
}
@@ -2991,12 +2995,7 @@ static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
if (req->result == -EAGAIN)
return -EAGAIN;
- /* workqueue context doesn't hold uring_lock, grab it now */
- if (req->in_async)
- mutex_lock(&ctx->uring_lock);
io_iopoll_req_issued(req);
- if (req->in_async)
- mutex_unlock(&ctx->uring_lock);
}
return 0;
@@ -3044,8 +3043,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
io_put_req(req);
if (ret) {
- if (req->flags & REQ_F_LINK)
- req->flags |= REQ_F_FAIL_LINK;
+ req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req(req);
}
@@ -3064,7 +3062,12 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
}
}
-static bool io_op_needs_file(const struct io_uring_sqe *sqe)
+static bool io_req_op_valid(int op)
+{
+ return op >= IORING_OP_NOP && op < IORING_OP_LAST;
+}
+
+static int io_op_needs_file(const struct io_uring_sqe *sqe)
{
int op = READ_ONCE(sqe->opcode);
@@ -3075,9 +3078,11 @@ static bool io_op_needs_file(const struct io_uring_sqe *sqe)
case IORING_OP_TIMEOUT_REMOVE:
case IORING_OP_ASYNC_CANCEL:
case IORING_OP_LINK_TIMEOUT:
- return false;
+ return 0;
default:
- return true;
+ if (io_req_op_valid(op))
+ return 1;
+ return -EINVAL;
}
}
@@ -3094,7 +3099,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
unsigned flags;
- int fd;
+ int fd, ret;
flags = READ_ONCE(req->sqe->flags);
fd = READ_ONCE(req->sqe->fd);
@@ -3102,8 +3107,9 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req)
if (flags & IOSQE_IO_DRAIN)
req->flags |= REQ_F_IO_DRAIN;
- if (!io_op_needs_file(req->sqe))
- return 0;
+ ret = io_op_needs_file(req->sqe);
+ if (ret <= 0)
+ return ret;
if (flags & IOSQE_FIXED_FILE) {
if (unlikely(!ctx->file_table ||
@@ -3179,8 +3185,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
spin_unlock_irqrestore(&ctx->completion_lock, flags);
if (prev) {
- if (prev->flags & REQ_F_LINK)
- prev->flags |= REQ_F_FAIL_LINK;
+ req_set_fail_links(prev);
io_async_find_and_cancel(ctx, req, prev->user_data, NULL,
-ETIME);
io_put_req(prev);
@@ -3231,13 +3236,14 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
static void __io_queue_sqe(struct io_kiocb *req)
{
- struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
+ struct io_kiocb *linked_timeout;
struct io_kiocb *nxt = NULL;
int ret;
+again:
+ linked_timeout = io_prep_linked_timeout(req);
+
ret = io_issue_sqe(req, &nxt, true);
- if (nxt)
- io_queue_async_work(nxt);
/*
* We async punt it if the file wasn't marked NOWAIT, or if the file
@@ -3256,7 +3262,7 @@ static void __io_queue_sqe(struct io_kiocb *req)
* submit reference when the iocb is actually submitted.
*/
io_queue_async_work(req);
- return;
+ goto done_req;
}
err:
@@ -3273,10 +3279,15 @@ err:
/* and drop final reference, if we failed */
if (ret) {
io_cqring_add_event(req, ret);
- if (req->flags & REQ_F_LINK)
- req->flags |= REQ_F_FAIL_LINK;
+ req_set_fail_links(req);
io_put_req(req);
}
+done_req:
+ if (nxt) {
+ req = nxt;
+ nxt = NULL;
+ goto again;
+ }
}
static void io_queue_sqe(struct io_kiocb *req)
@@ -3293,8 +3304,7 @@ static void io_queue_sqe(struct io_kiocb *req)
if (ret) {
if (ret != -EIOCBQUEUED) {
io_cqring_add_event(req, ret);
- if (req->flags & REQ_F_LINK)
- req->flags |= REQ_F_FAIL_LINK;
+ req_set_fail_links(req);
io_double_put_req(req);
}
} else
@@ -3310,8 +3320,8 @@ static inline void io_queue_link_head(struct io_kiocb *req)
io_queue_sqe(req);
}
-
-#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
+#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
+ IOSQE_IO_HARDLINK)
static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
struct io_kiocb **link)
@@ -3349,6 +3359,9 @@ err_req:
if (req->sqe->flags & IOSQE_IO_DRAIN)
(*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN;
+ if (req->sqe->flags & IOSQE_IO_HARDLINK)
+ req->flags |= REQ_F_HARDLINK;
+
io = kmalloc(sizeof(*io), GFP_KERNEL);
if (!io) {
ret = -EAGAIN;
@@ -3358,13 +3371,16 @@ err_req:
ret = io_req_defer_prep(req, io);
if (ret) {
kfree(io);
+ /* fail even hard links since we don't submit */
prev->flags |= REQ_F_FAIL_LINK;
goto err_req;
}
trace_io_uring_link(ctx, req, prev);
list_add_tail(&req->link_list, &prev->link_list);
- } else if (req->sqe->flags & IOSQE_IO_LINK) {
+ } else if (req->sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
req->flags |= REQ_F_LINK;
+ if (req->sqe->flags & IOSQE_IO_HARDLINK)
+ req->flags |= REQ_F_HARDLINK;
INIT_LIST_HEAD(&req->link_list);
*link = req;
@@ -3647,7 +3663,9 @@ static int io_sq_thread(void *data)
}
to_submit = min(to_submit, ctx->sq_entries);
+ mutex_lock(&ctx->uring_lock);
ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
+ mutex_unlock(&ctx->uring_lock);
if (ret > 0)
inflight += ret;
}
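From user space, IOSQE_IO_HARDLINK chains requests like IOSQE_IO_LINK, but the chain survives a link completing with an error or a short result — only the REQ_F_LINK-without-HARDLINK case sets REQ_F_FAIL_LINK in req_set_fail_links() above. A hedged liburing sketch (assumes a liburing/kernel pair new enough to define IOSQE_IO_HARDLINK; fd, buffers, and error handling elided):

    #include <liburing.h>
    #include <sys/uio.h>

    /* queue two reads; the second runs even if the first fails or is short */
    static int submit_hard_chain(struct io_uring *ring, int fd,
                                 struct iovec *iov1, struct iovec *iov2)
    {
            struct io_uring_sqe *sqe;

            sqe = io_uring_get_sqe(ring);
            io_uring_prep_readv(sqe, fd, iov1, 1, 0);
            sqe->flags |= IOSQE_IO_HARDLINK;   /* link, but don't sever on error */

            sqe = io_uring_get_sqe(ring);
            io_uring_prep_readv(sqe, fd, iov2, 1, 4096);

            return io_uring_submit(ring);
    }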
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index b801c6353100..6220642fe113 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -227,13 +227,17 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper)
{
struct ovl_fh *fh;
- int fh_type, fh_len, dwords;
- void *buf;
+ int fh_type, dwords;
int buflen = MAX_HANDLE_SZ;
uuid_t *uuid = &real->d_sb->s_uuid;
+ int err;
- buf = kmalloc(buflen, GFP_KERNEL);
- if (!buf)
+ /* Make sure the real fid stays 32bit aligned */
+ BUILD_BUG_ON(OVL_FH_FID_OFFSET % 4);
+ BUILD_BUG_ON(MAX_HANDLE_SZ + OVL_FH_FID_OFFSET > 255);
+
+ fh = kzalloc(buflen + OVL_FH_FID_OFFSET, GFP_KERNEL);
+ if (!fh)
return ERR_PTR(-ENOMEM);
/*
@@ -242,27 +246,19 @@ struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper)
* the price of reconnecting the dentry.
*/
dwords = buflen >> 2;
- fh_type = exportfs_encode_fh(real, buf, &dwords, 0);
+ fh_type = exportfs_encode_fh(real, (void *)fh->fb.fid, &dwords, 0);
buflen = (dwords << 2);
- fh = ERR_PTR(-EIO);
+ err = -EIO;
if (WARN_ON(fh_type < 0) ||
WARN_ON(buflen > MAX_HANDLE_SZ) ||
WARN_ON(fh_type == FILEID_INVALID))
- goto out;
+ goto out_err;
- BUILD_BUG_ON(MAX_HANDLE_SZ + offsetof(struct ovl_fh, fid) > 255);
- fh_len = offsetof(struct ovl_fh, fid) + buflen;
- fh = kmalloc(fh_len, GFP_KERNEL);
- if (!fh) {
- fh = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- fh->version = OVL_FH_VERSION;
- fh->magic = OVL_FH_MAGIC;
- fh->type = fh_type;
- fh->flags = OVL_FH_FLAG_CPU_ENDIAN;
+ fh->fb.version = OVL_FH_VERSION;
+ fh->fb.magic = OVL_FH_MAGIC;
+ fh->fb.type = fh_type;
+ fh->fb.flags = OVL_FH_FLAG_CPU_ENDIAN;
/*
* When we will want to decode an overlay dentry from this handle
* and all layers are on the same fs, if we get a disconnected real
@@ -270,14 +266,15 @@ struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper)
* it to upperdentry or to lowerstack is by checking this flag.
*/
if (is_upper)
- fh->flags |= OVL_FH_FLAG_PATH_UPPER;
- fh->len = fh_len;
- fh->uuid = *uuid;
- memcpy(fh->fid, buf, buflen);
+ fh->fb.flags |= OVL_FH_FLAG_PATH_UPPER;
+ fh->fb.len = sizeof(fh->fb) + buflen;
+ fh->fb.uuid = *uuid;
-out:
- kfree(buf);
return fh;
+
+out_err:
+ kfree(fh);
+ return ERR_PTR(err);
}
int ovl_set_origin(struct dentry *dentry, struct dentry *lower,
@@ -300,8 +297,8 @@ int ovl_set_origin(struct dentry *dentry, struct dentry *lower,
/*
* Do not fail when upper doesn't support xattrs.
*/
- err = ovl_check_setxattr(dentry, upper, OVL_XATTR_ORIGIN, fh,
- fh ? fh->len : 0, 0);
+ err = ovl_check_setxattr(dentry, upper, OVL_XATTR_ORIGIN, fh->buf,
+ fh ? fh->fb.len : 0, 0);
kfree(fh);
return err;
@@ -317,7 +314,7 @@ static int ovl_set_upper_fh(struct dentry *upper, struct dentry *index)
if (IS_ERR(fh))
return PTR_ERR(fh);
- err = ovl_do_setxattr(index, OVL_XATTR_UPPER, fh, fh->len, 0);
+ err = ovl_do_setxattr(index, OVL_XATTR_UPPER, fh->buf, fh->fb.len, 0);
kfree(fh);
return err;
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 702aa63f6774..29abdb1d3b5c 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -1170,7 +1170,7 @@ static int ovl_rename(struct inode *olddir, struct dentry *old,
if (newdentry == trap)
goto out_dput;
- if (WARN_ON(olddentry->d_inode == newdentry->d_inode))
+ if (olddentry->d_inode == newdentry->d_inode)
goto out_dput;
err = 0;
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 73c9775215b3..70e55588aedc 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -211,10 +211,11 @@ static int ovl_check_encode_origin(struct dentry *dentry)
return 1;
}
-static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen)
+static int ovl_dentry_to_fid(struct dentry *dentry, u32 *fid, int buflen)
{
struct ovl_fh *fh = NULL;
int err, enc_lower;
+ int len;
/*
* Check if we should encode a lower or upper file handle and maybe
@@ -231,11 +232,12 @@ static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen)
return PTR_ERR(fh);
err = -EOVERFLOW;
- if (fh->len > buflen)
+ len = OVL_FH_LEN(fh);
+ if (len > buflen)
goto fail;
- memcpy(buf, (char *)fh, fh->len);
- err = fh->len;
+ memcpy(fid, fh, len);
+ err = len;
out:
kfree(fh);
@@ -243,31 +245,16 @@ out:
fail:
pr_warn_ratelimited("overlayfs: failed to encode file handle (%pd2, err=%i, buflen=%d, len=%d, type=%d)\n",
- dentry, err, buflen, fh ? (int)fh->len : 0,
- fh ? fh->type : 0);
+ dentry, err, buflen, fh ? (int)fh->fb.len : 0,
+ fh ? fh->fb.type : 0);
goto out;
}
-static int ovl_dentry_to_fh(struct dentry *dentry, u32 *fid, int *max_len)
-{
- int res, len = *max_len << 2;
-
- res = ovl_d_to_fh(dentry, (char *)fid, len);
- if (res <= 0)
- return FILEID_INVALID;
-
- len = res;
-
- /* Round up to dwords */
- *max_len = (len + 3) >> 2;
- return OVL_FILEID;
-}
-
static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len,
struct inode *parent)
{
struct dentry *dentry;
- int type;
+ int bytes = *max_len << 2;
/* TODO: encode connectable file handles */
if (parent)
@@ -277,10 +264,14 @@ static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len,
if (WARN_ON(!dentry))
return FILEID_INVALID;
- type = ovl_dentry_to_fh(dentry, fid, max_len);
-
+ bytes = ovl_dentry_to_fid(dentry, fid, bytes);
dput(dentry);
- return type;
+ if (bytes <= 0)
+ return FILEID_INVALID;
+
+ *max_len = bytes >> 2;
+
+ return OVL_FILEID_V1;
}
/*
@@ -777,24 +768,45 @@ out_err:
goto out;
}
+static struct ovl_fh *ovl_fid_to_fh(struct fid *fid, int buflen, int fh_type)
+{
+ struct ovl_fh *fh;
+
+ /* If on-wire inner fid is aligned - nothing to do */
+ if (fh_type == OVL_FILEID_V1)
+ return (struct ovl_fh *)fid;
+
+ if (fh_type != OVL_FILEID_V0)
+ return ERR_PTR(-EINVAL);
+
+ fh = kzalloc(buflen, GFP_KERNEL);
+ if (!fh)
+ return ERR_PTR(-ENOMEM);
+
+ /* Copy unaligned inner fh into aligned buffer */
+ memcpy(&fh->fb, fid, buflen - OVL_FH_WIRE_OFFSET);
+ return fh;
+}
+
static struct dentry *ovl_fh_to_dentry(struct super_block *sb, struct fid *fid,
int fh_len, int fh_type)
{
struct dentry *dentry = NULL;
- struct ovl_fh *fh = (struct ovl_fh *) fid;
+ struct ovl_fh *fh = NULL;
int len = fh_len << 2;
unsigned int flags = 0;
int err;
- err = -EINVAL;
- if (fh_type != OVL_FILEID)
+ fh = ovl_fid_to_fh(fid, len, fh_type);
+ err = PTR_ERR(fh);
+ if (IS_ERR(fh))
goto out_err;
err = ovl_check_fh_len(fh, len);
if (err)
goto out_err;
- flags = fh->flags;
+ flags = fh->fb.flags;
dentry = (flags & OVL_FH_FLAG_PATH_UPPER) ?
ovl_upper_fh_to_d(sb, fh) :
ovl_lower_fh_to_d(sb, fh);
@@ -802,12 +814,18 @@ static struct dentry *ovl_fh_to_dentry(struct super_block *sb, struct fid *fid,
if (IS_ERR(dentry) && err != -ESTALE)
goto out_err;
+out:
+ /* We may have needed to re-align OVL_FILEID_V0 */
+ if (!IS_ERR_OR_NULL(fh) && fh != (void *)fid)
+ kfree(fh);
+
return dentry;
out_err:
pr_warn_ratelimited("overlayfs: failed to decode file handle (len=%d, type=%d, flags=%x, err=%i)\n",
- len, fh_type, flags, err);
- return ERR_PTR(err);
+ fh_len, fh_type, flags, err);
+ dentry = ERR_PTR(err);
+ goto out;
}
static struct dentry *ovl_fh_to_parent(struct super_block *sb, struct fid *fid,
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index bc14781886bf..b045cf1826fc 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -200,8 +200,14 @@ int ovl_getattr(const struct path *path, struct kstat *stat,
if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) ||
(!ovl_verify_lower(dentry->d_sb) &&
(is_dir || lowerstat.nlink == 1))) {
- stat->ino = lowerstat.ino;
lower_layer = ovl_layer_lower(dentry);
+ /*
+ * Cannot use origin st_dev;st_ino because
+ * origin inode content may differ from overlay
+ * inode content.
+ */
+ if (samefs || lower_layer->fsid)
+ stat->ino = lowerstat.ino;
}
/*
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index c269d6033525..76ff66339173 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -84,21 +84,21 @@ static int ovl_acceptable(void *ctx, struct dentry *dentry)
* Return -ENODATA for "origin unknown".
* Return <0 for an invalid file handle.
*/
-int ovl_check_fh_len(struct ovl_fh *fh, int fh_len)
+int ovl_check_fb_len(struct ovl_fb *fb, int fb_len)
{
- if (fh_len < sizeof(struct ovl_fh) || fh_len < fh->len)
+ if (fb_len < sizeof(struct ovl_fb) || fb_len < fb->len)
return -EINVAL;
- if (fh->magic != OVL_FH_MAGIC)
+ if (fb->magic != OVL_FH_MAGIC)
return -EINVAL;
/* Treat larger version and unknown flags as "origin unknown" */
- if (fh->version > OVL_FH_VERSION || fh->flags & ~OVL_FH_FLAG_ALL)
+ if (fb->version > OVL_FH_VERSION || fb->flags & ~OVL_FH_FLAG_ALL)
return -ENODATA;
/* Treat endianness mismatch as "origin unknown" */
- if (!(fh->flags & OVL_FH_FLAG_ANY_ENDIAN) &&
- (fh->flags & OVL_FH_FLAG_BIG_ENDIAN) != OVL_FH_FLAG_CPU_ENDIAN)
+ if (!(fb->flags & OVL_FH_FLAG_ANY_ENDIAN) &&
+ (fb->flags & OVL_FH_FLAG_BIG_ENDIAN) != OVL_FH_FLAG_CPU_ENDIAN)
return -ENODATA;
return 0;
@@ -119,15 +119,15 @@ static struct ovl_fh *ovl_get_fh(struct dentry *dentry, const char *name)
if (res == 0)
return NULL;
- fh = kzalloc(res, GFP_KERNEL);
+ fh = kzalloc(res + OVL_FH_WIRE_OFFSET, GFP_KERNEL);
if (!fh)
return ERR_PTR(-ENOMEM);
- res = vfs_getxattr(dentry, name, fh, res);
+ res = vfs_getxattr(dentry, name, fh->buf, res);
if (res < 0)
goto fail;
- err = ovl_check_fh_len(fh, res);
+ err = ovl_check_fb_len(&fh->fb, res);
if (err < 0) {
if (err == -ENODATA)
goto out;
@@ -158,12 +158,12 @@ struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt,
* Make sure that the stored uuid matches the uuid of the lower
* layer where file handle will be decoded.
*/
- if (!uuid_equal(&fh->uuid, &mnt->mnt_sb->s_uuid))
+ if (!uuid_equal(&fh->fb.uuid, &mnt->mnt_sb->s_uuid))
return NULL;
- bytes = (fh->len - offsetof(struct ovl_fh, fid));
- real = exportfs_decode_fh(mnt, (struct fid *)fh->fid,
- bytes >> 2, (int)fh->type,
+ bytes = (fh->fb.len - offsetof(struct ovl_fb, fid));
+ real = exportfs_decode_fh(mnt, (struct fid *)fh->fb.fid,
+ bytes >> 2, (int)fh->fb.type,
connected ? ovl_acceptable : NULL, mnt);
if (IS_ERR(real)) {
/*
@@ -173,7 +173,7 @@ struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt,
* index entries correctly.
*/
if (real == ERR_PTR(-ESTALE) &&
- !(fh->flags & OVL_FH_FLAG_PATH_UPPER))
+ !(fh->fb.flags & OVL_FH_FLAG_PATH_UPPER))
real = NULL;
return real;
}
@@ -323,6 +323,14 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected,
int i;
for (i = 0; i < ofs->numlower; i++) {
+ /*
+ * If lower fs uuid is not unique among lower fs we cannot match
+ * fh->uuid to layer.
+ */
+ if (ofs->lower_layers[i].fsid &&
+ ofs->lower_layers[i].fs->bad_uuid)
+ continue;
+
origin = ovl_decode_real_fh(fh, ofs->lower_layers[i].mnt,
connected);
if (origin)
@@ -400,7 +408,7 @@ static int ovl_verify_fh(struct dentry *dentry, const char *name,
if (IS_ERR(ofh))
return PTR_ERR(ofh);
- if (fh->len != ofh->len || memcmp(fh, ofh, fh->len))
+ if (fh->fb.len != ofh->fb.len || memcmp(&fh->fb, &ofh->fb, fh->fb.len))
err = -ESTALE;
kfree(ofh);
@@ -431,7 +439,7 @@ int ovl_verify_set_fh(struct dentry *dentry, const char *name,
err = ovl_verify_fh(dentry, name, fh);
if (set && err == -ENODATA)
- err = ovl_do_setxattr(dentry, name, fh, fh->len, 0);
+ err = ovl_do_setxattr(dentry, name, fh->buf, fh->fb.len, 0);
if (err)
goto fail;
@@ -505,20 +513,20 @@ int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index)
goto fail;
err = -EINVAL;
- if (index->d_name.len < sizeof(struct ovl_fh)*2)
+ if (index->d_name.len < sizeof(struct ovl_fb)*2)
goto fail;
err = -ENOMEM;
len = index->d_name.len / 2;
- fh = kzalloc(len, GFP_KERNEL);
+ fh = kzalloc(len + OVL_FH_WIRE_OFFSET, GFP_KERNEL);
if (!fh)
goto fail;
err = -EINVAL;
- if (hex2bin((u8 *)fh, index->d_name.name, len))
+ if (hex2bin(fh->buf, index->d_name.name, len))
goto fail;
- err = ovl_check_fh_len(fh, len);
+ err = ovl_check_fb_len(&fh->fb, len);
if (err)
goto fail;
@@ -597,11 +605,11 @@ static int ovl_get_index_name_fh(struct ovl_fh *fh, struct qstr *name)
{
char *n, *s;
- n = kcalloc(fh->len, 2, GFP_KERNEL);
+ n = kcalloc(fh->fb.len, 2, GFP_KERNEL);
if (!n)
return -ENOMEM;
- s = bin2hex(n, fh, fh->len);
+ s = bin2hex(n, fh->buf, fh->fb.len);
*name = (struct qstr) QSTR_INIT(n, s - n);
return 0;
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 6934bcf030f0..f283b1d69a9e 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -71,20 +71,36 @@ enum ovl_entry_flag {
#error Endianness not defined
#endif
-/* The type returned by overlay exportfs ops when encoding an ovl_fh handle */
-#define OVL_FILEID 0xfb
+/* The type used to be returned by overlay exportfs for misaligned fid */
+#define OVL_FILEID_V0 0xfb
+/* The type returned by overlay exportfs for 32bit aligned fid */
+#define OVL_FILEID_V1 0xf8
-/* On-disk and in-memeory format for redirect by file handle */
-struct ovl_fh {
+/* On-disk format for "origin" file handle */
+struct ovl_fb {
u8 version; /* 0 */
u8 magic; /* 0xfb */
u8 len; /* size of this header + size of fid */
u8 flags; /* OVL_FH_FLAG_* */
u8 type; /* fid_type of fid */
uuid_t uuid; /* uuid of filesystem */
- u8 fid[0]; /* file identifier */
+ u32 fid[0]; /* file identifier should be 32bit aligned in-memory */
} __packed;
+/* In-memory and on-wire format for overlay file handle */
+struct ovl_fh {
+ u8 padding[3]; /* make sure fb.fid is 32bit aligned */
+ union {
+ struct ovl_fb fb;
+ u8 buf[0];
+ };
+} __packed;
+
+#define OVL_FH_WIRE_OFFSET offsetof(struct ovl_fh, fb)
+#define OVL_FH_LEN(fh) (OVL_FH_WIRE_OFFSET + (fh)->fb.len)
+#define OVL_FH_FID_OFFSET (OVL_FH_WIRE_OFFSET + \
+ offsetof(struct ovl_fb, fid))
+
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
int err = vfs_rmdir(dir, dentry);
@@ -302,7 +318,13 @@ static inline void ovl_inode_unlock(struct inode *inode)
/* namei.c */
-int ovl_check_fh_len(struct ovl_fh *fh, int fh_len);
+int ovl_check_fb_len(struct ovl_fb *fb, int fb_len);
+
+static inline int ovl_check_fh_len(struct ovl_fh *fh, int fh_len)
+{
+ return ovl_check_fb_len(&fh->fb, fh_len - OVL_FH_WIRE_OFFSET);
+}
+
struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt,
bool connected);
int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected,
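The three padding bytes exist so that, inside the packed structs, fb.fid lands on a 4-byte boundary: offsetof(struct ovl_fb, fid) is 21 (five u8 fields plus a 16-byte uuid_t), and OVL_FH_WIRE_OFFSET adds 3, giving OVL_FH_FID_OFFSET == 24. That is what the BUILD_BUG_ON() the patch adds to copy_up.c asserts at compile time:

    /* layout arithmetic, assuming __packed as declared above:
     *   offsetof(struct ovl_fb, fid)  == 5 * sizeof(u8) + sizeof(uuid_t) == 21
     *   OVL_FH_FID_OFFSET             == 3 (padding) + 21 == 24 -> 24 % 4 == 0
     */
    BUILD_BUG_ON(OVL_FH_FID_OFFSET % 4);   /* same check the patch adds */

On the wire and on disk only the ovl_fb part (starting at OVL_FH_WIRE_OFFSET) is stored, which is why OVL_FILEID_V0 handles from older kernels must be copied into an aligned buffer before decoding (see ovl_fid_to_fh() in export.c).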
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index a8279280e88d..28348c44ea5b 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -22,6 +22,8 @@ struct ovl_config {
struct ovl_sb {
struct super_block *sb;
dev_t pseudo_dev;
+ /* Unusable (conflicting) uuid */
+ bool bad_uuid;
};
struct ovl_layer {
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index afbcb116a7f1..7621ff176d15 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1255,7 +1255,7 @@ static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid)
{
unsigned int i;
- if (!ofs->config.nfs_export && !(ofs->config.index && ofs->upper_mnt))
+ if (!ofs->config.nfs_export && !ofs->upper_mnt)
return true;
for (i = 0; i < ofs->numlowerfs; i++) {
@@ -1263,9 +1263,13 @@ static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid)
* We use uuid to associate an overlay lower file handle with a
* lower layer, so we can accept lower fs with null uuid as long
* as all lower layers with null uuid are on the same fs.
+ * if we detect multiple lower fs with the same uuid, we
+ * disable lower file handle decoding on all of them.
*/
- if (uuid_equal(&ofs->lower_fs[i].sb->s_uuid, uuid))
+ if (uuid_equal(&ofs->lower_fs[i].sb->s_uuid, uuid)) {
+ ofs->lower_fs[i].bad_uuid = true;
return false;
+ }
}
return true;
}
@@ -1277,6 +1281,7 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path)
unsigned int i;
dev_t dev;
int err;
+ bool bad_uuid = false;
/* fsid 0 is reserved for upper fs even with non upper overlay */
if (ofs->upper_mnt && ofs->upper_mnt->mnt_sb == sb)
@@ -1288,11 +1293,15 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path)
}
if (!ovl_lower_uuid_ok(ofs, &sb->s_uuid)) {
- ofs->config.index = false;
- ofs->config.nfs_export = false;
- pr_warn("overlayfs: %s uuid detected in lower fs '%pd2', falling back to index=off,nfs_export=off.\n",
- uuid_is_null(&sb->s_uuid) ? "null" : "conflicting",
- path->dentry);
+ bad_uuid = true;
+ if (ofs->config.index || ofs->config.nfs_export) {
+ ofs->config.index = false;
+ ofs->config.nfs_export = false;
+ pr_warn("overlayfs: %s uuid detected in lower fs '%pd2', falling back to index=off,nfs_export=off.\n",
+ uuid_is_null(&sb->s_uuid) ? "null" :
+ "conflicting",
+ path->dentry);
+ }
}
err = get_anon_bdev(&dev);
@@ -1303,6 +1312,7 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path)
ofs->lower_fs[ofs->numlowerfs].sb = sb;
ofs->lower_fs[ofs->numlowerfs].pseudo_dev = dev;
+ ofs->lower_fs[ofs->numlowerfs].bad_uuid = bad_uuid;
ofs->numlowerfs++;
return ofs->numlowerfs;
diff --git a/fs/verity/enable.c b/fs/verity/enable.c
index eabc6ac19906..b79e3fd19d11 100644
--- a/fs/verity/enable.c
+++ b/fs/verity/enable.c
@@ -315,7 +315,7 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *uarg)
if (arg.block_size != PAGE_SIZE)
return -EINVAL;
- if (arg.salt_size > FIELD_SIZEOF(struct fsverity_descriptor, salt))
+ if (arg.salt_size > sizeof_field(struct fsverity_descriptor, salt))
return -EMSGSIZE;
if (arg.sig_size > FS_VERITY_MAX_SIGNATURE_SIZE)