diff options
Diffstat (limited to 'fs')
-rw-r--r-- | fs/ceph/caps.c | 41 | ||||
-rw-r--r-- | fs/ceph/debugfs.c | 13 | ||||
-rw-r--r-- | fs/ceph/mds_client.c | 8 | ||||
-rw-r--r-- | fs/ceph/mds_client.h | 9 | ||||
-rw-r--r-- | fs/ceph/mdsmap.c | 12 | ||||
-rw-r--r-- | fs/ceph/super.c | 28 | ||||
-rw-r--r-- | fs/ceph/super.h | 16 | ||||
-rw-r--r-- | fs/cifs/cifsglob.h | 2 | ||||
-rw-r--r-- | fs/cifs/cifssmb.c | 3 | ||||
-rw-r--r-- | fs/cifs/smb2inode.c | 1 | ||||
-rw-r--r-- | fs/cifs/smb2ops.c | 19 | ||||
-rw-r--r-- | fs/cifs/smb2pdu.c | 2 | ||||
-rw-r--r-- | fs/cifs/smb2proto.h | 2 | ||||
-rw-r--r-- | fs/crypto/keyring.c | 2 | ||||
-rw-r--r-- | fs/io-wq.c | 34 | ||||
-rw-r--r-- | fs/io-wq.h | 7 | ||||
-rw-r--r-- | fs/io_uring.c | 168 | ||||
-rw-r--r-- | fs/overlayfs/copy_up.c | 53 | ||||
-rw-r--r-- | fs/overlayfs/dir.c | 2 | ||||
-rw-r--r-- | fs/overlayfs/export.c | 80 | ||||
-rw-r--r-- | fs/overlayfs/inode.c | 8 | ||||
-rw-r--r-- | fs/overlayfs/namei.c | 52 | ||||
-rw-r--r-- | fs/overlayfs/overlayfs.h | 34 | ||||
-rw-r--r-- | fs/overlayfs/ovl_entry.h | 2 | ||||
-rw-r--r-- | fs/overlayfs/super.c | 24 | ||||
-rw-r--r-- | fs/verity/enable.c | 2 |
26 files changed, 393 insertions, 231 deletions
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index f5a38910a82b..9d09bb53c1ab 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1011,18 +1011,13 @@ static int __ceph_is_single_caps(struct ceph_inode_info *ci) return rb_first(&ci->i_caps) == rb_last(&ci->i_caps); } -static int __ceph_is_any_caps(struct ceph_inode_info *ci) -{ - return !RB_EMPTY_ROOT(&ci->i_caps); -} - int ceph_is_any_caps(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); int ret; spin_lock(&ci->i_ceph_lock); - ret = __ceph_is_any_caps(ci); + ret = __ceph_is_any_real_caps(ci); spin_unlock(&ci->i_ceph_lock); return ret; @@ -1099,15 +1094,16 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) if (removed) ceph_put_cap(mdsc, cap); - /* when reconnect denied, we remove session caps forcibly, - * i_wr_ref can be non-zero. If there are ongoing write, - * keep i_snap_realm. - */ - if (!__ceph_is_any_caps(ci) && ci->i_wr_ref == 0 && ci->i_snap_realm) - drop_inode_snap_realm(ci); + if (!__ceph_is_any_real_caps(ci)) { + /* when reconnect denied, we remove session caps forcibly, + * i_wr_ref can be non-zero. If there are ongoing write, + * keep i_snap_realm. + */ + if (ci->i_wr_ref == 0 && ci->i_snap_realm) + drop_inode_snap_realm(ci); - if (!__ceph_is_any_real_caps(ci)) __cap_delay_cancel(mdsc, ci); + } } struct cap_msg_args { @@ -2764,7 +2760,19 @@ int ceph_get_caps(struct file *filp, int need, int want, if (ret == -EAGAIN) continue; if (!ret) { + struct ceph_mds_client *mdsc = fsc->mdsc; + struct cap_wait cw; DEFINE_WAIT_FUNC(wait, woken_wake_function); + + cw.ino = inode->i_ino; + cw.tgid = current->tgid; + cw.need = need; + cw.want = want; + + spin_lock(&mdsc->caps_list_lock); + list_add(&cw.list, &mdsc->cap_wait_list); + spin_unlock(&mdsc->caps_list_lock); + add_wait_queue(&ci->i_cap_wq, &wait); flags |= NON_BLOCKING; @@ -2778,6 +2786,11 @@ int ceph_get_caps(struct file *filp, int need, int want, } remove_wait_queue(&ci->i_cap_wq, &wait); + + spin_lock(&mdsc->caps_list_lock); + list_del(&cw.list); + spin_unlock(&mdsc->caps_list_lock); + if (ret == -EAGAIN) continue; } @@ -2928,7 +2941,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) ci->i_head_snapc = NULL; } /* see comment in __ceph_remove_cap() */ - if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) + if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm) drop_inode_snap_realm(ci); } spin_unlock(&ci->i_ceph_lock); diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index facb387c2735..c281f32b54f7 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -139,6 +139,7 @@ static int caps_show(struct seq_file *s, void *p) struct ceph_fs_client *fsc = s->private; struct ceph_mds_client *mdsc = fsc->mdsc; int total, avail, used, reserved, min, i; + struct cap_wait *cw; ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min); seq_printf(s, "total\t\t%d\n" @@ -166,6 +167,18 @@ static int caps_show(struct seq_file *s, void *p) } mutex_unlock(&mdsc->mutex); + seq_printf(s, "\n\nWaiters:\n--------\n"); + seq_printf(s, "tgid ino need want\n"); + seq_printf(s, "-----------------------------------------------------\n"); + + spin_lock(&mdsc->caps_list_lock); + list_for_each_entry(cw, &mdsc->cap_wait_list, list) { + seq_printf(s, "%-13d0x%-17lx%-17s%-17s\n", cw->tgid, cw->ino, + ceph_cap_string(cw->need), + ceph_cap_string(cw->want)); + } + spin_unlock(&mdsc->caps_list_lock); + return 0; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 068b029cf073..374db1bd57d1 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2015,7 +2015,7 @@ void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr) if (!nr) return; val = atomic_add_return(nr, &mdsc->cap_reclaim_pending); - if (!(val % CEPH_CAPS_PER_RELEASE)) { + if ((val % CEPH_CAPS_PER_RELEASE) < nr) { atomic_set(&mdsc->cap_reclaim_pending, 0); ceph_queue_cap_reclaim_work(mdsc); } @@ -2032,12 +2032,13 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; size_t size = sizeof(struct ceph_mds_reply_dir_entry); - int order, num_entries; + unsigned int num_entries; + int order; spin_lock(&ci->i_ceph_lock); num_entries = ci->i_files + ci->i_subdirs; spin_unlock(&ci->i_ceph_lock); - num_entries = max(num_entries, 1); + num_entries = max(num_entries, 1U); num_entries = min(num_entries, opt->max_readdir); order = get_order(size * num_entries); @@ -4168,6 +4169,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work); mdsc->last_renew_caps = jiffies; INIT_LIST_HEAD(&mdsc->cap_delay_list); + INIT_LIST_HEAD(&mdsc->cap_wait_list); spin_lock_init(&mdsc->cap_delay_lock); INIT_LIST_HEAD(&mdsc->snap_flush_list); spin_lock_init(&mdsc->snap_flush_lock); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 5cd131b41d84..14c7e8c49970 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -340,6 +340,14 @@ struct ceph_quotarealm_inode { struct inode *inode; }; +struct cap_wait { + struct list_head list; + unsigned long ino; + pid_t tgid; + int need; + int want; +}; + /* * mds client state */ @@ -416,6 +424,7 @@ struct ceph_mds_client { spinlock_t caps_list_lock; struct list_head caps_list; /* unused (reserved or unreserved) */ + struct list_head cap_wait_list; int caps_total_count; /* total caps allocated */ int caps_use_count; /* in use */ int caps_use_max; /* max used caps */ diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index aeec1d6e3769..471bac335fae 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -158,6 +158,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) void *pexport_targets = NULL; struct ceph_timespec laggy_since; struct ceph_mds_info *info; + bool laggy; ceph_decode_need(p, end, sizeof(u64) + 1, bad); global_id = ceph_decode_64(p); @@ -190,6 +191,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) if (err) goto corrupt; ceph_decode_copy(p, &laggy_since, sizeof(laggy_since)); + laggy = laggy_since.tv_sec != 0 || laggy_since.tv_nsec != 0; *p += sizeof(u32); ceph_decode_32_safe(p, end, namelen, bad); *p += namelen; @@ -207,10 +209,11 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) *p = info_end; } - dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", + dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s%s\n", i+1, n, global_id, mds, inc, ceph_pr_addr(&addr), - ceph_mds_state_name(state)); + ceph_mds_state_name(state), + laggy ? "(laggy)" : ""); if (mds < 0 || state <= 0) continue; @@ -230,8 +233,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) info->global_id = global_id; info->state = state; info->addr = addr; - info->laggy = (laggy_since.tv_sec != 0 || - laggy_since.tv_nsec != 0); + info->laggy = laggy; info->num_export_targets = num_export_targets; if (num_export_targets) { info->export_targets = kcalloc(num_export_targets, @@ -355,6 +357,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_damaged = false; } bad_ext: + dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n", + !!m->m_enabled, !!m->m_damaged, m->m_num_laggy); *p = end; dout("mdsmap_decode success epoch %u\n", m->m_epoch); return m; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 9c9a7c68eea3..29a795f975df 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -172,10 +172,10 @@ static const struct fs_parameter_enum ceph_mount_param_enums[] = { static const struct fs_parameter_spec ceph_mount_param_specs[] = { fsparam_flag_no ("acl", Opt_acl), fsparam_flag_no ("asyncreaddir", Opt_asyncreaddir), - fsparam_u32 ("caps_max", Opt_caps_max), + fsparam_s32 ("caps_max", Opt_caps_max), fsparam_u32 ("caps_wanted_delay_max", Opt_caps_wanted_delay_max), fsparam_u32 ("caps_wanted_delay_min", Opt_caps_wanted_delay_min), - fsparam_s32 ("write_congestion_kb", Opt_congestion_kb), + fsparam_u32 ("write_congestion_kb", Opt_congestion_kb), fsparam_flag_no ("copyfrom", Opt_copyfrom), fsparam_flag_no ("dcache", Opt_dcache), fsparam_flag_no ("dirstat", Opt_dirstat), @@ -187,8 +187,8 @@ static const struct fs_parameter_spec ceph_mount_param_specs[] = { fsparam_flag_no ("quotadf", Opt_quotadf), fsparam_u32 ("rasize", Opt_rasize), fsparam_flag_no ("rbytes", Opt_rbytes), - fsparam_s32 ("readdir_max_bytes", Opt_readdir_max_bytes), - fsparam_s32 ("readdir_max_entries", Opt_readdir_max_entries), + fsparam_u32 ("readdir_max_bytes", Opt_readdir_max_bytes), + fsparam_u32 ("readdir_max_entries", Opt_readdir_max_entries), fsparam_enum ("recover_session", Opt_recover_session), fsparam_flag_no ("require_active_mds", Opt_require_active_mds), fsparam_u32 ("rsize", Opt_rsize), @@ -328,7 +328,9 @@ static int ceph_parse_mount_param(struct fs_context *fc, fsopt->caps_wanted_delay_max = result.uint_32; break; case Opt_caps_max: - fsopt->caps_max = result.uint_32; + if (result.int_32 < 0) + goto out_of_range; + fsopt->caps_max = result.int_32; break; case Opt_readdir_max_entries: if (result.uint_32 < 1) @@ -547,25 +549,25 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_show_option(m, "recover_session", "clean"); if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) - seq_printf(m, ",wsize=%d", fsopt->wsize); + seq_printf(m, ",wsize=%u", fsopt->wsize); if (fsopt->rsize != CEPH_MAX_READ_SIZE) - seq_printf(m, ",rsize=%d", fsopt->rsize); + seq_printf(m, ",rsize=%u", fsopt->rsize); if (fsopt->rasize != CEPH_RASIZE_DEFAULT) - seq_printf(m, ",rasize=%d", fsopt->rasize); + seq_printf(m, ",rasize=%u", fsopt->rasize); if (fsopt->congestion_kb != default_congestion_kb()) - seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); + seq_printf(m, ",write_congestion_kb=%u", fsopt->congestion_kb); if (fsopt->caps_max) seq_printf(m, ",caps_max=%d", fsopt->caps_max); if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) - seq_printf(m, ",caps_wanted_delay_min=%d", + seq_printf(m, ",caps_wanted_delay_min=%u", fsopt->caps_wanted_delay_min); if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) - seq_printf(m, ",caps_wanted_delay_max=%d", + seq_printf(m, ",caps_wanted_delay_max=%u", fsopt->caps_wanted_delay_max); if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT) - seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir); + seq_printf(m, ",readdir_max_entries=%u", fsopt->max_readdir); if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) - seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes); + seq_printf(m, ",readdir_max_bytes=%u", fsopt->max_readdir_bytes); if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) seq_show_option(m, "snapdirname", fsopt->snapdir_name); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index f0f9cb7447ac..3bf1a01cd536 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -73,16 +73,16 @@ #define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ struct ceph_mount_options { - int flags; + unsigned int flags; - int wsize; /* max write size */ - int rsize; /* max read size */ - int rasize; /* max readahead */ - int congestion_kb; /* max writeback in flight */ - int caps_wanted_delay_min, caps_wanted_delay_max; + unsigned int wsize; /* max write size */ + unsigned int rsize; /* max read size */ + unsigned int rasize; /* max readahead */ + unsigned int congestion_kb; /* max writeback in flight */ + unsigned int caps_wanted_delay_min, caps_wanted_delay_max; int caps_max; - int max_readdir; /* max readdir result (entires) */ - int max_readdir_bytes; /* max readdir result (bytes) */ + unsigned int max_readdir; /* max readdir result (entries) */ + unsigned int max_readdir_bytes; /* max readdir result (bytes) */ /* * everything above this point can be memcmp'd; everything below diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index fd0262ce5ad5..ce9bac756c2a 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1061,7 +1061,7 @@ cap_unix(struct cifs_ses *ses) struct cached_fid { bool is_valid:1; /* Do we have a useable root fid */ bool file_all_info_is_valid:1; - + bool has_lease:1; struct kref refcount; struct cifs_fid *fid; struct mutex fid_mutex; diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 4f554f019a98..cc86a67225d1 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -42,6 +42,7 @@ #include "cifsproto.h" #include "cifs_unicode.h" #include "cifs_debug.h" +#include "smb2proto.h" #include "fscache.h" #include "smbdirect.h" #ifdef CONFIG_CIFS_DFS_UPCALL @@ -112,6 +113,8 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) mutex_lock(&tcon->crfid.fid_mutex); tcon->crfid.is_valid = false; + /* cached handle is not valid, so SMB2_CLOSE won't be sent below */ + close_shroot_lease_locked(&tcon->crfid); memset(tcon->crfid.fid, 0, sizeof(struct cifs_fid)); mutex_unlock(&tcon->crfid.fid_mutex); diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 18c7a33adceb..5ef5e97a6d13 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -95,6 +95,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, goto finished; } + memset(&oparms, 0, sizeof(struct cifs_open_parms)); oparms.tcon = tcon; oparms.desired_access = desired_access; oparms.disposition = create_disposition; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index a5c96bc522cb..6250370c1170 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -616,6 +616,7 @@ smb2_close_cached_fid(struct kref *ref) cfid->fid->volatile_fid); cfid->is_valid = false; cfid->file_all_info_is_valid = false; + cfid->has_lease = false; } } @@ -626,13 +627,28 @@ void close_shroot(struct cached_fid *cfid) mutex_unlock(&cfid->fid_mutex); } +void close_shroot_lease_locked(struct cached_fid *cfid) +{ + if (cfid->has_lease) { + cfid->has_lease = false; + kref_put(&cfid->refcount, smb2_close_cached_fid); + } +} + +void close_shroot_lease(struct cached_fid *cfid) +{ + mutex_lock(&cfid->fid_mutex); + close_shroot_lease_locked(cfid); + mutex_unlock(&cfid->fid_mutex); +} + void smb2_cached_lease_break(struct work_struct *work) { struct cached_fid *cfid = container_of(work, struct cached_fid, lease_break); - close_shroot(cfid); + close_shroot_lease(cfid); } /* @@ -773,6 +789,7 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) /* BB TBD check to see if oplock level check can be removed below */ if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) { kref_get(&tcon->crfid.refcount); + tcon->crfid.has_lease = true; smb2_parse_contexts(server, o_rsp, &oparms.fid->epoch, oparms.fid->lease_key, &oplock, NULL); diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 0ab6b1200288..9434f6dd8df3 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1847,7 +1847,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) if ((tcon->need_reconnect) || (tcon->ses->need_reconnect)) return 0; - close_shroot(&tcon->crfid); + close_shroot_lease(&tcon->crfid); rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, (void **) &req, &total_len); diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index a18272c987fe..27d29f2eb6c8 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -70,6 +70,8 @@ extern int smb3_handle_read_data(struct TCP_Server_Info *server, extern int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid); extern void close_shroot(struct cached_fid *cfid); +extern void close_shroot_lease(struct cached_fid *cfid); +extern void close_shroot_lease_locked(struct cached_fid *cfid); extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src); extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 040df1f5e1c8..40cca351273f 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -151,7 +151,7 @@ static struct key *search_fscrypt_keyring(struct key *keyring, } #define FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE \ - (CONST_STRLEN("fscrypt-") + FIELD_SIZEOF(struct super_block, s_id)) + (CONST_STRLEN("fscrypt-") + sizeof_field(struct super_block, s_id)) #define FSCRYPT_MK_DESCRIPTION_SIZE (2 * FSCRYPT_KEY_IDENTIFIER_SIZE + 1) diff --git a/fs/io-wq.c b/fs/io-wq.c index 74b40506c5d9..90c4978781fb 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -49,7 +49,6 @@ struct io_worker { struct hlist_nulls_node nulls_node; struct list_head all_list; struct task_struct *task; - wait_queue_head_t wait; struct io_wqe *wqe; struct io_wq_work *cur_work; @@ -258,7 +257,7 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe) worker = hlist_nulls_entry(n, struct io_worker, nulls_node); if (io_worker_get(worker)) { - wake_up(&worker->wait); + wake_up_process(worker->task); io_worker_release(worker); return true; } @@ -492,28 +491,46 @@ next: } while (1); } +static inline void io_worker_spin_for_work(struct io_wqe *wqe) +{ + int i = 0; + + while (++i < 1000) { + if (io_wqe_run_queue(wqe)) + break; + if (need_resched()) + break; + cpu_relax(); + } +} + static int io_wqe_worker(void *data) { struct io_worker *worker = data; struct io_wqe *wqe = worker->wqe; struct io_wq *wq = wqe->wq; - DEFINE_WAIT(wait); + bool did_work; io_worker_start(wqe, worker); + did_work = false; while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { - prepare_to_wait(&worker->wait, &wait, TASK_INTERRUPTIBLE); - + set_current_state(TASK_INTERRUPTIBLE); +loop: + if (did_work) + io_worker_spin_for_work(wqe); spin_lock_irq(&wqe->lock); if (io_wqe_run_queue(wqe)) { __set_current_state(TASK_RUNNING); io_worker_handle_work(worker); - continue; + did_work = true; + goto loop; } + did_work = false; /* drops the lock on success, retry */ if (__io_worker_idle(wqe, worker)) { __release(&wqe->lock); - continue; + goto loop; } spin_unlock_irq(&wqe->lock); if (signal_pending(current)) @@ -526,8 +543,6 @@ static int io_wqe_worker(void *data) break; } - finish_wait(&worker->wait, &wait); - if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) { spin_lock_irq(&wqe->lock); if (!wq_list_empty(&wqe->work_list)) @@ -589,7 +604,6 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) refcount_set(&worker->ref, 1); worker->nulls_node.pprev = NULL; - init_waitqueue_head(&worker->wait); worker->wqe = wqe; spin_lock_init(&worker->lock); diff --git a/fs/io-wq.h b/fs/io-wq.h index 7c333a28e2a7..fb993b2bd0ef 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -35,7 +35,8 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node, struct io_wq_work_list *list) { if (!list->first) { - list->first = list->last = node; + list->last = node; + WRITE_ONCE(list->first, node); } else { list->last->next = node; list->last = node; @@ -47,7 +48,7 @@ static inline void wq_node_del(struct io_wq_work_list *list, struct io_wq_work_node *prev) { if (node == list->first) - list->first = node->next; + WRITE_ONCE(list->first, node->next); if (node == list->last) list->last = prev; if (prev) @@ -58,7 +59,7 @@ static inline void wq_node_del(struct io_wq_work_list *list, #define wq_list_for_each(pos, prv, head) \ for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) -#define wq_list_empty(list) ((list)->first == NULL) +#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL) #define INIT_WQ_LIST(list) do { \ (list)->first = NULL; \ (list)->last = NULL; \ diff --git a/fs/io_uring.c b/fs/io_uring.c index 405be10da73d..9b1833fedc5c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -293,7 +293,7 @@ struct io_poll_iocb { __poll_t events; bool done; bool canceled; - struct wait_queue_entry *wait; + struct wait_queue_entry wait; }; struct io_timeout_data { @@ -377,6 +377,7 @@ struct io_kiocb { #define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */ #define REQ_F_INFLIGHT 16384 /* on inflight list */ #define REQ_F_COMP_LOCKED 32768 /* completion under lock */ +#define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */ u64 user_data; u32 result; u32 sequence; @@ -580,7 +581,9 @@ static inline bool io_prep_async_work(struct io_kiocb *req, switch (req->sqe->opcode) { case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: - do_hashed = true; + /* only regular files should be hashed for writes */ + if (req->flags & REQ_F_ISREG) + do_hashed = true; /* fall-through */ case IORING_OP_READV: case IORING_OP_READ_FIXED: @@ -1292,6 +1295,12 @@ static void kiocb_end_write(struct io_kiocb *req) file_end_write(req->file); } +static inline void req_set_fail_links(struct io_kiocb *req) +{ + if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; +} + static void io_complete_rw_common(struct kiocb *kiocb, long res) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); @@ -1299,8 +1308,8 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res) if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); - if ((req->flags & REQ_F_LINK) && res != req->result) - req->flags |= REQ_F_FAIL_LINK; + if (res != req->result) + req_set_fail_links(req); io_cqring_add_event(req, res); } @@ -1330,8 +1339,8 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); - if ((req->flags & REQ_F_LINK) && res != req->result) - req->flags |= REQ_F_FAIL_LINK; + if (res != req->result) + req_set_fail_links(req); req->result = res; if (res != -EAGAIN) req->flags |= REQ_F_IOPOLL_COMPLETED; @@ -1422,7 +1431,7 @@ static bool io_file_supports_async(struct file *file) { umode_t mode = file_inode(file)->i_mode; - if (S_ISBLK(mode) || S_ISCHR(mode)) + if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode)) return true; if (S_ISREG(mode) && file->f_op != &io_uring_fops) return true; @@ -1858,7 +1867,9 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, goto copy_iov; } - if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) + /* file path doesn't support NOWAIT for non-direct_IO */ + if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && + (req->flags & REQ_F_ISREG)) goto copy_iov; iov_count = iov_iter_count(&iter); @@ -1956,8 +1967,8 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, end > 0 ? end : LLONG_MAX, fsync_flags & IORING_FSYNC_DATASYNC); - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_cqring_add_event(req, ret); io_put_req_find_next(req, nxt); return 0; @@ -2003,8 +2014,8 @@ static int io_sync_file_range(struct io_kiocb *req, ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags); - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_cqring_add_event(req, ret); io_put_req_find_next(req, nxt); return 0; @@ -2019,6 +2030,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) flags = READ_ONCE(sqe->msg_flags); msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr); + io->msg.iov = io->msg.fast_iov; return sendmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.iov); #else return 0; @@ -2054,7 +2066,6 @@ static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, } else { kmsg = &io.msg.msg; kmsg->msg_name = &addr; - io.msg.iov = io.msg.fast_iov; ret = io_sendmsg_prep(req, &io); if (ret) goto out; @@ -2079,8 +2090,8 @@ static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, out: io_cqring_add_event(req, ret); - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_put_req_find_next(req, nxt); return 0; #else @@ -2097,6 +2108,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) flags = READ_ONCE(sqe->msg_flags); msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr); + io->msg.iov = io->msg.fast_iov; return recvmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.uaddr, &io->msg.iov); #else @@ -2136,7 +2148,6 @@ static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, } else { kmsg = &io.msg.msg; kmsg->msg_name = &addr; - io.msg.iov = io.msg.fast_iov; ret = io_recvmsg_prep(req, &io); if (ret) goto out; @@ -2161,8 +2172,8 @@ static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, out: io_cqring_add_event(req, ret); - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_put_req_find_next(req, nxt); return 0; #else @@ -2196,8 +2207,8 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe, } if (ret == -ERESTARTSYS) ret = -EINTR; - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_cqring_add_event(req, ret); io_put_req_find_next(req, nxt); return 0; @@ -2263,8 +2274,8 @@ static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret == -ERESTARTSYS) ret = -EINTR; out: - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_cqring_add_event(req, ret); io_put_req_find_next(req, nxt); return 0; @@ -2279,8 +2290,8 @@ static void io_poll_remove_one(struct io_kiocb *req) spin_lock(&poll->head->lock); WRITE_ONCE(poll->canceled, true); - if (!list_empty(&poll->wait->entry)) { - list_del_init(&poll->wait->entry); + if (!list_empty(&poll->wait.entry)) { + list_del_init(&poll->wait.entry); io_queue_async_work(req); } spin_unlock(&poll->head->lock); @@ -2340,8 +2351,8 @@ static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe) spin_unlock_irq(&ctx->completion_lock); io_cqring_add_event(req, ret); - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_put_req(req); return 0; } @@ -2351,7 +2362,6 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) struct io_ring_ctx *ctx = req->ctx; req->poll.done = true; - kfree(req->poll.wait); if (error) io_cqring_fill_event(req, error); else @@ -2389,7 +2399,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr) */ spin_lock_irq(&ctx->completion_lock); if (!mask && ret != -ECANCELED) { - add_wait_queue(poll->head, poll->wait); + add_wait_queue(poll->head, &poll->wait); spin_unlock_irq(&ctx->completion_lock); return; } @@ -2399,8 +2409,8 @@ static void io_poll_complete_work(struct io_wq_work **workptr) io_cqring_ev_posted(ctx); - if (ret < 0 && req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_put_req_find_next(req, &nxt); if (nxt) *workptr = &nxt->work; @@ -2419,7 +2429,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (mask && !(mask & poll->events)) return 0; - list_del_init(&poll->wait->entry); + list_del_init(&poll->wait.entry); /* * Run completion inline if we can. We're using trylock here because @@ -2460,7 +2470,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, pt->error = 0; pt->req->poll.head = head; - add_wait_queue(head, pt->req->poll.wait); + add_wait_queue(head, &pt->req->poll.wait); } static void io_poll_req_insert(struct io_kiocb *req) @@ -2489,10 +2499,6 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (!poll->file) return -EBADF; - poll->wait = kmalloc(sizeof(*poll->wait), GFP_KERNEL); - if (!poll->wait) - return -ENOMEM; - req->io = NULL; INIT_IO_WORK(&req->work, io_poll_complete_work); events = READ_ONCE(sqe->poll_events); @@ -2509,9 +2515,9 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */ /* initialized the list so that we can do list_empty checks */ - INIT_LIST_HEAD(&poll->wait->entry); - init_waitqueue_func_entry(poll->wait, io_poll_wake); - poll->wait->private = poll; + INIT_LIST_HEAD(&poll->wait.entry); + init_waitqueue_func_entry(&poll->wait, io_poll_wake); + poll->wait.private = poll; INIT_LIST_HEAD(&req->list); @@ -2520,14 +2526,14 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, spin_lock_irq(&ctx->completion_lock); if (likely(poll->head)) { spin_lock(&poll->head->lock); - if (unlikely(list_empty(&poll->wait->entry))) { + if (unlikely(list_empty(&poll->wait.entry))) { if (ipt.error) cancel = true; ipt.error = 0; mask = 0; } if (mask || ipt.error) - list_del_init(&poll->wait->entry); + list_del_init(&poll->wait.entry); else if (cancel) WRITE_ONCE(poll->canceled, true); else if (!poll->done) /* actually waiting for an event */ @@ -2582,8 +2588,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); - if (req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + req_set_fail_links(req); io_put_req(req); return HRTIMER_NORESTART; } @@ -2608,8 +2613,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) if (ret == -1) return -EALREADY; - if (req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + req_set_fail_links(req); io_cqring_fill_event(req, -ECANCELED); io_put_req(req); return 0; @@ -2640,8 +2644,8 @@ static int io_timeout_remove(struct io_kiocb *req, io_commit_cqring(ctx); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); - if (ret < 0 && req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_put_req(req); return 0; } @@ -2822,8 +2826,8 @@ done: spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_put_req_find_next(req, nxt); } @@ -2991,12 +2995,7 @@ static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, if (req->result == -EAGAIN) return -EAGAIN; - /* workqueue context doesn't hold uring_lock, grab it now */ - if (req->in_async) - mutex_lock(&ctx->uring_lock); io_iopoll_req_issued(req); - if (req->in_async) - mutex_unlock(&ctx->uring_lock); } return 0; @@ -3044,8 +3043,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr) io_put_req(req); if (ret) { - if (req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + req_set_fail_links(req); io_cqring_add_event(req, ret); io_put_req(req); } @@ -3064,7 +3062,12 @@ static void io_wq_submit_work(struct io_wq_work **workptr) } } -static bool io_op_needs_file(const struct io_uring_sqe *sqe) +static bool io_req_op_valid(int op) +{ + return op >= IORING_OP_NOP && op < IORING_OP_LAST; +} + +static int io_op_needs_file(const struct io_uring_sqe *sqe) { int op = READ_ONCE(sqe->opcode); @@ -3075,9 +3078,11 @@ static bool io_op_needs_file(const struct io_uring_sqe *sqe) case IORING_OP_TIMEOUT_REMOVE: case IORING_OP_ASYNC_CANCEL: case IORING_OP_LINK_TIMEOUT: - return false; + return 0; default: - return true; + if (io_req_op_valid(op)) + return 1; + return -EINVAL; } } @@ -3094,7 +3099,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; unsigned flags; - int fd; + int fd, ret; flags = READ_ONCE(req->sqe->flags); fd = READ_ONCE(req->sqe->fd); @@ -3102,8 +3107,9 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req) if (flags & IOSQE_IO_DRAIN) req->flags |= REQ_F_IO_DRAIN; - if (!io_op_needs_file(req->sqe)) - return 0; + ret = io_op_needs_file(req->sqe); + if (ret <= 0) + return ret; if (flags & IOSQE_FIXED_FILE) { if (unlikely(!ctx->file_table || @@ -3179,8 +3185,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) spin_unlock_irqrestore(&ctx->completion_lock, flags); if (prev) { - if (prev->flags & REQ_F_LINK) - prev->flags |= REQ_F_FAIL_LINK; + req_set_fail_links(prev); io_async_find_and_cancel(ctx, req, prev->user_data, NULL, -ETIME); io_put_req(prev); @@ -3231,13 +3236,14 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) static void __io_queue_sqe(struct io_kiocb *req) { - struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); + struct io_kiocb *linked_timeout; struct io_kiocb *nxt = NULL; int ret; +again: + linked_timeout = io_prep_linked_timeout(req); + ret = io_issue_sqe(req, &nxt, true); - if (nxt) - io_queue_async_work(nxt); /* * We async punt it if the file wasn't marked NOWAIT, or if the file @@ -3256,7 +3262,7 @@ static void __io_queue_sqe(struct io_kiocb *req) * submit reference when the iocb is actually submitted. */ io_queue_async_work(req); - return; + goto done_req; } err: @@ -3273,10 +3279,15 @@ err: /* and drop final reference, if we failed */ if (ret) { io_cqring_add_event(req, ret); - if (req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + req_set_fail_links(req); io_put_req(req); } +done_req: + if (nxt) { + req = nxt; + nxt = NULL; + goto again; + } } static void io_queue_sqe(struct io_kiocb *req) @@ -3293,8 +3304,7 @@ static void io_queue_sqe(struct io_kiocb *req) if (ret) { if (ret != -EIOCBQUEUED) { io_cqring_add_event(req, ret); - if (req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + req_set_fail_links(req); io_double_put_req(req); } } else @@ -3310,8 +3320,8 @@ static inline void io_queue_link_head(struct io_kiocb *req) io_queue_sqe(req); } - -#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK) +#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ + IOSQE_IO_HARDLINK) static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, struct io_kiocb **link) @@ -3349,6 +3359,9 @@ err_req: if (req->sqe->flags & IOSQE_IO_DRAIN) (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN; + if (req->sqe->flags & IOSQE_IO_HARDLINK) + req->flags |= REQ_F_HARDLINK; + io = kmalloc(sizeof(*io), GFP_KERNEL); if (!io) { ret = -EAGAIN; @@ -3358,13 +3371,16 @@ err_req: ret = io_req_defer_prep(req, io); if (ret) { kfree(io); + /* fail even hard links since we don't submit */ prev->flags |= REQ_F_FAIL_LINK; goto err_req; } trace_io_uring_link(ctx, req, prev); list_add_tail(&req->link_list, &prev->link_list); - } else if (req->sqe->flags & IOSQE_IO_LINK) { + } else if (req->sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { req->flags |= REQ_F_LINK; + if (req->sqe->flags & IOSQE_IO_HARDLINK) + req->flags |= REQ_F_HARDLINK; INIT_LIST_HEAD(&req->link_list); *link = req; @@ -3647,7 +3663,9 @@ static int io_sq_thread(void *data) } to_submit = min(to_submit, ctx->sq_entries); + mutex_lock(&ctx->uring_lock); ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true); + mutex_unlock(&ctx->uring_lock); if (ret > 0) inflight += ret; } diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index b801c6353100..6220642fe113 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -227,13 +227,17 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper) { struct ovl_fh *fh; - int fh_type, fh_len, dwords; - void *buf; + int fh_type, dwords; int buflen = MAX_HANDLE_SZ; uuid_t *uuid = &real->d_sb->s_uuid; + int err; - buf = kmalloc(buflen, GFP_KERNEL); - if (!buf) + /* Make sure the real fid stays 32bit aligned */ + BUILD_BUG_ON(OVL_FH_FID_OFFSET % 4); + BUILD_BUG_ON(MAX_HANDLE_SZ + OVL_FH_FID_OFFSET > 255); + + fh = kzalloc(buflen + OVL_FH_FID_OFFSET, GFP_KERNEL); + if (!fh) return ERR_PTR(-ENOMEM); /* @@ -242,27 +246,19 @@ struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper) * the price or reconnecting the dentry. */ dwords = buflen >> 2; - fh_type = exportfs_encode_fh(real, buf, &dwords, 0); + fh_type = exportfs_encode_fh(real, (void *)fh->fb.fid, &dwords, 0); buflen = (dwords << 2); - fh = ERR_PTR(-EIO); + err = -EIO; if (WARN_ON(fh_type < 0) || WARN_ON(buflen > MAX_HANDLE_SZ) || WARN_ON(fh_type == FILEID_INVALID)) - goto out; + goto out_err; - BUILD_BUG_ON(MAX_HANDLE_SZ + offsetof(struct ovl_fh, fid) > 255); - fh_len = offsetof(struct ovl_fh, fid) + buflen; - fh = kmalloc(fh_len, GFP_KERNEL); - if (!fh) { - fh = ERR_PTR(-ENOMEM); - goto out; - } - - fh->version = OVL_FH_VERSION; - fh->magic = OVL_FH_MAGIC; - fh->type = fh_type; - fh->flags = OVL_FH_FLAG_CPU_ENDIAN; + fh->fb.version = OVL_FH_VERSION; + fh->fb.magic = OVL_FH_MAGIC; + fh->fb.type = fh_type; + fh->fb.flags = OVL_FH_FLAG_CPU_ENDIAN; /* * When we will want to decode an overlay dentry from this handle * and all layers are on the same fs, if we get a disconncted real @@ -270,14 +266,15 @@ struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper) * it to upperdentry or to lowerstack is by checking this flag. */ if (is_upper) - fh->flags |= OVL_FH_FLAG_PATH_UPPER; - fh->len = fh_len; - fh->uuid = *uuid; - memcpy(fh->fid, buf, buflen); + fh->fb.flags |= OVL_FH_FLAG_PATH_UPPER; + fh->fb.len = sizeof(fh->fb) + buflen; + fh->fb.uuid = *uuid; -out: - kfree(buf); return fh; + +out_err: + kfree(fh); + return ERR_PTR(err); } int ovl_set_origin(struct dentry *dentry, struct dentry *lower, @@ -300,8 +297,8 @@ int ovl_set_origin(struct dentry *dentry, struct dentry *lower, /* * Do not fail when upper doesn't support xattrs. */ - err = ovl_check_setxattr(dentry, upper, OVL_XATTR_ORIGIN, fh, - fh ? fh->len : 0, 0); + err = ovl_check_setxattr(dentry, upper, OVL_XATTR_ORIGIN, fh->buf, + fh ? fh->fb.len : 0, 0); kfree(fh); return err; @@ -317,7 +314,7 @@ static int ovl_set_upper_fh(struct dentry *upper, struct dentry *index) if (IS_ERR(fh)) return PTR_ERR(fh); - err = ovl_do_setxattr(index, OVL_XATTR_UPPER, fh, fh->len, 0); + err = ovl_do_setxattr(index, OVL_XATTR_UPPER, fh->buf, fh->fb.len, 0); kfree(fh); return err; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 702aa63f6774..29abdb1d3b5c 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -1170,7 +1170,7 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, if (newdentry == trap) goto out_dput; - if (WARN_ON(olddentry->d_inode == newdentry->d_inode)) + if (olddentry->d_inode == newdentry->d_inode) goto out_dput; err = 0; diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 73c9775215b3..70e55588aedc 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -211,10 +211,11 @@ static int ovl_check_encode_origin(struct dentry *dentry) return 1; } -static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen) +static int ovl_dentry_to_fid(struct dentry *dentry, u32 *fid, int buflen) { struct ovl_fh *fh = NULL; int err, enc_lower; + int len; /* * Check if we should encode a lower or upper file handle and maybe @@ -231,11 +232,12 @@ static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen) return PTR_ERR(fh); err = -EOVERFLOW; - if (fh->len > buflen) + len = OVL_FH_LEN(fh); + if (len > buflen) goto fail; - memcpy(buf, (char *)fh, fh->len); - err = fh->len; + memcpy(fid, fh, len); + err = len; out: kfree(fh); @@ -243,31 +245,16 @@ out: fail: pr_warn_ratelimited("overlayfs: failed to encode file handle (%pd2, err=%i, buflen=%d, len=%d, type=%d)\n", - dentry, err, buflen, fh ? (int)fh->len : 0, - fh ? fh->type : 0); + dentry, err, buflen, fh ? (int)fh->fb.len : 0, + fh ? fh->fb.type : 0); goto out; } -static int ovl_dentry_to_fh(struct dentry *dentry, u32 *fid, int *max_len) -{ - int res, len = *max_len << 2; - - res = ovl_d_to_fh(dentry, (char *)fid, len); - if (res <= 0) - return FILEID_INVALID; - - len = res; - - /* Round up to dwords */ - *max_len = (len + 3) >> 2; - return OVL_FILEID; -} - static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len, struct inode *parent) { struct dentry *dentry; - int type; + int bytes = *max_len << 2; /* TODO: encode connectable file handles */ if (parent) @@ -277,10 +264,14 @@ static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len, if (WARN_ON(!dentry)) return FILEID_INVALID; - type = ovl_dentry_to_fh(dentry, fid, max_len); - + bytes = ovl_dentry_to_fid(dentry, fid, bytes); dput(dentry); - return type; + if (bytes <= 0) + return FILEID_INVALID; + + *max_len = bytes >> 2; + + return OVL_FILEID_V1; } /* @@ -777,24 +768,45 @@ out_err: goto out; } +static struct ovl_fh *ovl_fid_to_fh(struct fid *fid, int buflen, int fh_type) +{ + struct ovl_fh *fh; + + /* If on-wire inner fid is aligned - nothing to do */ + if (fh_type == OVL_FILEID_V1) + return (struct ovl_fh *)fid; + + if (fh_type != OVL_FILEID_V0) + return ERR_PTR(-EINVAL); + + fh = kzalloc(buflen, GFP_KERNEL); + if (!fh) + return ERR_PTR(-ENOMEM); + + /* Copy unaligned inner fh into aligned buffer */ + memcpy(&fh->fb, fid, buflen - OVL_FH_WIRE_OFFSET); + return fh; +} + static struct dentry *ovl_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { struct dentry *dentry = NULL; - struct ovl_fh *fh = (struct ovl_fh *) fid; + struct ovl_fh *fh = NULL; int len = fh_len << 2; unsigned int flags = 0; int err; - err = -EINVAL; - if (fh_type != OVL_FILEID) + fh = ovl_fid_to_fh(fid, len, fh_type); + err = PTR_ERR(fh); + if (IS_ERR(fh)) goto out_err; err = ovl_check_fh_len(fh, len); if (err) goto out_err; - flags = fh->flags; + flags = fh->fb.flags; dentry = (flags & OVL_FH_FLAG_PATH_UPPER) ? ovl_upper_fh_to_d(sb, fh) : ovl_lower_fh_to_d(sb, fh); @@ -802,12 +814,18 @@ static struct dentry *ovl_fh_to_dentry(struct super_block *sb, struct fid *fid, if (IS_ERR(dentry) && err != -ESTALE) goto out_err; +out: + /* We may have needed to re-align OVL_FILEID_V0 */ + if (!IS_ERR_OR_NULL(fh) && fh != (void *)fid) + kfree(fh); + return dentry; out_err: pr_warn_ratelimited("overlayfs: failed to decode file handle (len=%d, type=%d, flags=%x, err=%i)\n", - len, fh_type, flags, err); - return ERR_PTR(err); + fh_len, fh_type, flags, err); + dentry = ERR_PTR(err); + goto out; } static struct dentry *ovl_fh_to_parent(struct super_block *sb, struct fid *fid, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index bc14781886bf..b045cf1826fc 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -200,8 +200,14 @@ int ovl_getattr(const struct path *path, struct kstat *stat, if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) || (!ovl_verify_lower(dentry->d_sb) && (is_dir || lowerstat.nlink == 1))) { - stat->ino = lowerstat.ino; lower_layer = ovl_layer_lower(dentry); + /* + * Cannot use origin st_dev;st_ino because + * origin inode content may differ from overlay + * inode content. + */ + if (samefs || lower_layer->fsid) + stat->ino = lowerstat.ino; } /* diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index c269d6033525..76ff66339173 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -84,21 +84,21 @@ static int ovl_acceptable(void *ctx, struct dentry *dentry) * Return -ENODATA for "origin unknown". * Return <0 for an invalid file handle. */ -int ovl_check_fh_len(struct ovl_fh *fh, int fh_len) +int ovl_check_fb_len(struct ovl_fb *fb, int fb_len) { - if (fh_len < sizeof(struct ovl_fh) || fh_len < fh->len) + if (fb_len < sizeof(struct ovl_fb) || fb_len < fb->len) return -EINVAL; - if (fh->magic != OVL_FH_MAGIC) + if (fb->magic != OVL_FH_MAGIC) return -EINVAL; /* Treat larger version and unknown flags as "origin unknown" */ - if (fh->version > OVL_FH_VERSION || fh->flags & ~OVL_FH_FLAG_ALL) + if (fb->version > OVL_FH_VERSION || fb->flags & ~OVL_FH_FLAG_ALL) return -ENODATA; /* Treat endianness mismatch as "origin unknown" */ - if (!(fh->flags & OVL_FH_FLAG_ANY_ENDIAN) && - (fh->flags & OVL_FH_FLAG_BIG_ENDIAN) != OVL_FH_FLAG_CPU_ENDIAN) + if (!(fb->flags & OVL_FH_FLAG_ANY_ENDIAN) && + (fb->flags & OVL_FH_FLAG_BIG_ENDIAN) != OVL_FH_FLAG_CPU_ENDIAN) return -ENODATA; return 0; @@ -119,15 +119,15 @@ static struct ovl_fh *ovl_get_fh(struct dentry *dentry, const char *name) if (res == 0) return NULL; - fh = kzalloc(res, GFP_KERNEL); + fh = kzalloc(res + OVL_FH_WIRE_OFFSET, GFP_KERNEL); if (!fh) return ERR_PTR(-ENOMEM); - res = vfs_getxattr(dentry, name, fh, res); + res = vfs_getxattr(dentry, name, fh->buf, res); if (res < 0) goto fail; - err = ovl_check_fh_len(fh, res); + err = ovl_check_fb_len(&fh->fb, res); if (err < 0) { if (err == -ENODATA) goto out; @@ -158,12 +158,12 @@ struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt, * Make sure that the stored uuid matches the uuid of the lower * layer where file handle will be decoded. */ - if (!uuid_equal(&fh->uuid, &mnt->mnt_sb->s_uuid)) + if (!uuid_equal(&fh->fb.uuid, &mnt->mnt_sb->s_uuid)) return NULL; - bytes = (fh->len - offsetof(struct ovl_fh, fid)); - real = exportfs_decode_fh(mnt, (struct fid *)fh->fid, - bytes >> 2, (int)fh->type, + bytes = (fh->fb.len - offsetof(struct ovl_fb, fid)); + real = exportfs_decode_fh(mnt, (struct fid *)fh->fb.fid, + bytes >> 2, (int)fh->fb.type, connected ? ovl_acceptable : NULL, mnt); if (IS_ERR(real)) { /* @@ -173,7 +173,7 @@ struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt, * index entries correctly. */ if (real == ERR_PTR(-ESTALE) && - !(fh->flags & OVL_FH_FLAG_PATH_UPPER)) + !(fh->fb.flags & OVL_FH_FLAG_PATH_UPPER)) real = NULL; return real; } @@ -323,6 +323,14 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, int i; for (i = 0; i < ofs->numlower; i++) { + /* + * If lower fs uuid is not unique among lower fs we cannot match + * fh->uuid to layer. + */ + if (ofs->lower_layers[i].fsid && + ofs->lower_layers[i].fs->bad_uuid) + continue; + origin = ovl_decode_real_fh(fh, ofs->lower_layers[i].mnt, connected); if (origin) @@ -400,7 +408,7 @@ static int ovl_verify_fh(struct dentry *dentry, const char *name, if (IS_ERR(ofh)) return PTR_ERR(ofh); - if (fh->len != ofh->len || memcmp(fh, ofh, fh->len)) + if (fh->fb.len != ofh->fb.len || memcmp(&fh->fb, &ofh->fb, fh->fb.len)) err = -ESTALE; kfree(ofh); @@ -431,7 +439,7 @@ int ovl_verify_set_fh(struct dentry *dentry, const char *name, err = ovl_verify_fh(dentry, name, fh); if (set && err == -ENODATA) - err = ovl_do_setxattr(dentry, name, fh, fh->len, 0); + err = ovl_do_setxattr(dentry, name, fh->buf, fh->fb.len, 0); if (err) goto fail; @@ -505,20 +513,20 @@ int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index) goto fail; err = -EINVAL; - if (index->d_name.len < sizeof(struct ovl_fh)*2) + if (index->d_name.len < sizeof(struct ovl_fb)*2) goto fail; err = -ENOMEM; len = index->d_name.len / 2; - fh = kzalloc(len, GFP_KERNEL); + fh = kzalloc(len + OVL_FH_WIRE_OFFSET, GFP_KERNEL); if (!fh) goto fail; err = -EINVAL; - if (hex2bin((u8 *)fh, index->d_name.name, len)) + if (hex2bin(fh->buf, index->d_name.name, len)) goto fail; - err = ovl_check_fh_len(fh, len); + err = ovl_check_fb_len(&fh->fb, len); if (err) goto fail; @@ -597,11 +605,11 @@ static int ovl_get_index_name_fh(struct ovl_fh *fh, struct qstr *name) { char *n, *s; - n = kcalloc(fh->len, 2, GFP_KERNEL); + n = kcalloc(fh->fb.len, 2, GFP_KERNEL); if (!n) return -ENOMEM; - s = bin2hex(n, fh, fh->len); + s = bin2hex(n, fh->buf, fh->fb.len); *name = (struct qstr) QSTR_INIT(n, s - n); return 0; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 6934bcf030f0..f283b1d69a9e 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -71,20 +71,36 @@ enum ovl_entry_flag { #error Endianness not defined #endif -/* The type returned by overlay exportfs ops when encoding an ovl_fh handle */ -#define OVL_FILEID 0xfb +/* The type used to be returned by overlay exportfs for misaligned fid */ +#define OVL_FILEID_V0 0xfb +/* The type returned by overlay exportfs for 32bit aligned fid */ +#define OVL_FILEID_V1 0xf8 -/* On-disk and in-memeory format for redirect by file handle */ -struct ovl_fh { +/* On-disk format for "origin" file handle */ +struct ovl_fb { u8 version; /* 0 */ u8 magic; /* 0xfb */ u8 len; /* size of this header + size of fid */ u8 flags; /* OVL_FH_FLAG_* */ u8 type; /* fid_type of fid */ uuid_t uuid; /* uuid of filesystem */ - u8 fid[0]; /* file identifier */ + u32 fid[0]; /* file identifier should be 32bit aligned in-memory */ } __packed; +/* In-memory and on-wire format for overlay file handle */ +struct ovl_fh { + u8 padding[3]; /* make sure fb.fid is 32bit aligned */ + union { + struct ovl_fb fb; + u8 buf[0]; + }; +} __packed; + +#define OVL_FH_WIRE_OFFSET offsetof(struct ovl_fh, fb) +#define OVL_FH_LEN(fh) (OVL_FH_WIRE_OFFSET + (fh)->fb.len) +#define OVL_FH_FID_OFFSET (OVL_FH_WIRE_OFFSET + \ + offsetof(struct ovl_fb, fid)) + static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry) { int err = vfs_rmdir(dir, dentry); @@ -302,7 +318,13 @@ static inline void ovl_inode_unlock(struct inode *inode) /* namei.c */ -int ovl_check_fh_len(struct ovl_fh *fh, int fh_len); +int ovl_check_fb_len(struct ovl_fb *fb, int fb_len); + +static inline int ovl_check_fh_len(struct ovl_fh *fh, int fh_len) +{ + return ovl_check_fb_len(&fh->fb, fh_len - OVL_FH_WIRE_OFFSET); +} + struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt, bool connected); int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index a8279280e88d..28348c44ea5b 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -22,6 +22,8 @@ struct ovl_config { struct ovl_sb { struct super_block *sb; dev_t pseudo_dev; + /* Unusable (conflicting) uuid */ + bool bad_uuid; }; struct ovl_layer { diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index afbcb116a7f1..7621ff176d15 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1255,7 +1255,7 @@ static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid) { unsigned int i; - if (!ofs->config.nfs_export && !(ofs->config.index && ofs->upper_mnt)) + if (!ofs->config.nfs_export && !ofs->upper_mnt) return true; for (i = 0; i < ofs->numlowerfs; i++) { @@ -1263,9 +1263,13 @@ static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid) * We use uuid to associate an overlay lower file handle with a * lower layer, so we can accept lower fs with null uuid as long * as all lower layers with null uuid are on the same fs. + * if we detect multiple lower fs with the same uuid, we + * disable lower file handle decoding on all of them. */ - if (uuid_equal(&ofs->lower_fs[i].sb->s_uuid, uuid)) + if (uuid_equal(&ofs->lower_fs[i].sb->s_uuid, uuid)) { + ofs->lower_fs[i].bad_uuid = true; return false; + } } return true; } @@ -1277,6 +1281,7 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path) unsigned int i; dev_t dev; int err; + bool bad_uuid = false; /* fsid 0 is reserved for upper fs even with non upper overlay */ if (ofs->upper_mnt && ofs->upper_mnt->mnt_sb == sb) @@ -1288,11 +1293,15 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path) } if (!ovl_lower_uuid_ok(ofs, &sb->s_uuid)) { - ofs->config.index = false; - ofs->config.nfs_export = false; - pr_warn("overlayfs: %s uuid detected in lower fs '%pd2', falling back to index=off,nfs_export=off.\n", - uuid_is_null(&sb->s_uuid) ? "null" : "conflicting", - path->dentry); + bad_uuid = true; + if (ofs->config.index || ofs->config.nfs_export) { + ofs->config.index = false; + ofs->config.nfs_export = false; + pr_warn("overlayfs: %s uuid detected in lower fs '%pd2', falling back to index=off,nfs_export=off.\n", + uuid_is_null(&sb->s_uuid) ? "null" : + "conflicting", + path->dentry); + } } err = get_anon_bdev(&dev); @@ -1303,6 +1312,7 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path) ofs->lower_fs[ofs->numlowerfs].sb = sb; ofs->lower_fs[ofs->numlowerfs].pseudo_dev = dev; + ofs->lower_fs[ofs->numlowerfs].bad_uuid = bad_uuid; ofs->numlowerfs++; return ofs->numlowerfs; diff --git a/fs/verity/enable.c b/fs/verity/enable.c index eabc6ac19906..b79e3fd19d11 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -315,7 +315,7 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *uarg) if (arg.block_size != PAGE_SIZE) return -EINVAL; - if (arg.salt_size > FIELD_SIZEOF(struct fsverity_descriptor, salt)) + if (arg.salt_size > sizeof_field(struct fsverity_descriptor, salt)) return -EMSGSIZE; if (arg.sig_size > FS_VERITY_MAX_SIGNATURE_SIZE) |