diff options
Diffstat (limited to 'fs')
113 files changed, 1620 insertions, 805 deletions
diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 005921e3b38d..5b79cdceefa0 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -154,10 +154,17 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, return ERR_PTR(-ENOMEM); } + cell->name = kmalloc(namelen + 1, GFP_KERNEL); + if (!cell->name) { + kfree(cell); + return ERR_PTR(-ENOMEM); + } + cell->net = net; cell->name_len = namelen; for (i = 0; i < namelen; i++) cell->name[i] = tolower(name[i]); + cell->name[i] = 0; atomic_set(&cell->usage, 2); INIT_WORK(&cell->manager, afs_manage_cell); @@ -207,6 +214,7 @@ parse_failed: if (ret == -EINVAL) printk(KERN_ERR "kAFS: bad VL server IP address\n"); error: + kfree(cell->name); kfree(cell); _leave(" = %d", ret); return ERR_PTR(ret); @@ -489,6 +497,7 @@ static void afs_cell_destroy(struct rcu_head *rcu) afs_put_vlserverlist(cell->net, rcu_access_pointer(cell->vl_servers)); afs_put_cell(cell->net, cell->alias_of); key_put(cell->anonymous_key); + kfree(cell->name); kfree(cell); _leave(" [destroyed]"); diff --git a/fs/afs/dir.c b/fs/afs/dir.c index aa1d34141ea3..96757f3abd74 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -648,7 +648,7 @@ static void afs_do_lookup_success(struct afs_operation *op) vp = &op->file[0]; abort_code = vp->scb.status.abort_code; if (abort_code != 0) { - op->abort_code = abort_code; + op->ac.abort_code = abort_code; op->error = afs_abort_to_error(abort_code); } break; @@ -696,10 +696,11 @@ static const struct afs_operation_ops afs_inline_bulk_status_operation = { .success = afs_do_lookup_success, }; -static const struct afs_operation_ops afs_fetch_status_operation = { +static const struct afs_operation_ops afs_lookup_fetch_status_operation = { .issue_afs_rpc = afs_fs_fetch_status, .issue_yfs_rpc = yfs_fs_fetch_status, .success = afs_do_lookup_success, + .aborted = afs_check_for_remote_deletion, }; /* @@ -844,7 +845,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, * to FS.FetchStatus for op->file[1]. */ op->fetch_status.which = 1; - op->ops = &afs_fetch_status_operation; + op->ops = &afs_lookup_fetch_status_operation; afs_begin_vnode_operation(op); afs_wait_for_operation(op); } @@ -1236,6 +1237,17 @@ void afs_d_release(struct dentry *dentry) _enter("%pd", dentry); } +void afs_check_for_remote_deletion(struct afs_operation *op) +{ + struct afs_vnode *vnode = op->file[0].vnode; + + switch (op->ac.abort_code) { + case VNOVNODE: + set_bit(AFS_VNODE_DELETED, &vnode->flags); + afs_break_callback(vnode, afs_cb_break_for_deleted); + } +} + /* * Create a new inode for create/mkdir/symlink */ @@ -1268,7 +1280,7 @@ static void afs_vnode_new_inode(struct afs_operation *op) static void afs_create_success(struct afs_operation *op) { _enter("op=%08x", op->debug_id); - afs_check_for_remote_deletion(op, op->file[0].vnode); + op->ctime = op->file[0].scb.status.mtime_client; afs_vnode_commit_status(op, &op->file[0]); afs_update_dentry_version(op, &op->file[0], op->dentry); afs_vnode_new_inode(op); @@ -1302,6 +1314,7 @@ static const struct afs_operation_ops afs_mkdir_operation = { .issue_afs_rpc = afs_fs_make_dir, .issue_yfs_rpc = yfs_fs_make_dir, .success = afs_create_success, + .aborted = afs_check_for_remote_deletion, .edit_dir = afs_create_edit_dir, .put = afs_create_put, }; @@ -1325,6 +1338,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; + op->file[0].update_ctime = true; op->dentry = dentry; op->create.mode = S_IFDIR | mode; op->create.reason = afs_edit_dir_for_mkdir; @@ -1350,7 +1364,7 @@ static void afs_dir_remove_subdir(struct dentry *dentry) static void afs_rmdir_success(struct afs_operation *op) { _enter("op=%08x", op->debug_id); - afs_check_for_remote_deletion(op, op->file[0].vnode); + op->ctime = op->file[0].scb.status.mtime_client; afs_vnode_commit_status(op, &op->file[0]); afs_update_dentry_version(op, &op->file[0], op->dentry); } @@ -1382,6 +1396,7 @@ static const struct afs_operation_ops afs_rmdir_operation = { .issue_afs_rpc = afs_fs_remove_dir, .issue_yfs_rpc = yfs_fs_remove_dir, .success = afs_rmdir_success, + .aborted = afs_check_for_remote_deletion, .edit_dir = afs_rmdir_edit_dir, .put = afs_rmdir_put, }; @@ -1404,6 +1419,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; + op->file[0].update_ctime = true; op->dentry = dentry; op->ops = &afs_rmdir_operation; @@ -1479,7 +1495,8 @@ static void afs_dir_remove_link(struct afs_operation *op) static void afs_unlink_success(struct afs_operation *op) { _enter("op=%08x", op->debug_id); - afs_check_for_remote_deletion(op, op->file[0].vnode); + op->ctime = op->file[0].scb.status.mtime_client; + afs_check_dir_conflict(op, &op->file[0]); afs_vnode_commit_status(op, &op->file[0]); afs_vnode_commit_status(op, &op->file[1]); afs_update_dentry_version(op, &op->file[0], op->dentry); @@ -1511,6 +1528,7 @@ static const struct afs_operation_ops afs_unlink_operation = { .issue_afs_rpc = afs_fs_remove_file, .issue_yfs_rpc = yfs_fs_remove_file, .success = afs_unlink_success, + .aborted = afs_check_for_remote_deletion, .edit_dir = afs_unlink_edit_dir, .put = afs_unlink_put, }; @@ -1537,6 +1555,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; + op->file[0].update_ctime = true; /* Try to make sure we have a callback promise on the victim. */ ret = afs_validate(vnode, op->key); @@ -1561,9 +1580,25 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) spin_unlock(&dentry->d_lock); op->file[1].vnode = vnode; + op->file[1].update_ctime = true; + op->file[1].op_unlinked = true; op->dentry = dentry; op->ops = &afs_unlink_operation; - return afs_do_sync_operation(op); + afs_begin_vnode_operation(op); + afs_wait_for_operation(op); + + /* If there was a conflict with a third party, check the status of the + * unlinked vnode. + */ + if (op->error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) { + op->file[1].update_ctime = false; + op->fetch_status.which = 1; + op->ops = &afs_fetch_status_operation; + afs_begin_vnode_operation(op); + afs_wait_for_operation(op); + } + + return afs_put_operation(op); error: return afs_put_operation(op); @@ -1573,6 +1608,7 @@ static const struct afs_operation_ops afs_create_operation = { .issue_afs_rpc = afs_fs_create_file, .issue_yfs_rpc = yfs_fs_create_file, .success = afs_create_success, + .aborted = afs_check_for_remote_deletion, .edit_dir = afs_create_edit_dir, .put = afs_create_put, }; @@ -1601,6 +1637,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; + op->file[0].update_ctime = true; op->dentry = dentry; op->create.mode = S_IFREG | mode; @@ -1620,6 +1657,7 @@ static void afs_link_success(struct afs_operation *op) struct afs_vnode_param *vp = &op->file[1]; _enter("op=%08x", op->debug_id); + op->ctime = dvp->scb.status.mtime_client; afs_vnode_commit_status(op, dvp); afs_vnode_commit_status(op, vp); afs_update_dentry_version(op, dvp, op->dentry); @@ -1640,6 +1678,7 @@ static const struct afs_operation_ops afs_link_operation = { .issue_afs_rpc = afs_fs_link, .issue_yfs_rpc = yfs_fs_link, .success = afs_link_success, + .aborted = afs_check_for_remote_deletion, .edit_dir = afs_create_edit_dir, .put = afs_link_put, }; @@ -1672,6 +1711,8 @@ static int afs_link(struct dentry *from, struct inode *dir, afs_op_set_vnode(op, 0, dvnode); afs_op_set_vnode(op, 1, vnode); op->file[0].dv_delta = 1; + op->file[0].update_ctime = true; + op->file[1].update_ctime = true; op->dentry = dentry; op->dentry_2 = from; @@ -1689,6 +1730,7 @@ static const struct afs_operation_ops afs_symlink_operation = { .issue_afs_rpc = afs_fs_symlink, .issue_yfs_rpc = yfs_fs_symlink, .success = afs_create_success, + .aborted = afs_check_for_remote_deletion, .edit_dir = afs_create_edit_dir, .put = afs_create_put, }; @@ -1740,9 +1782,13 @@ static void afs_rename_success(struct afs_operation *op) { _enter("op=%08x", op->debug_id); + op->ctime = op->file[0].scb.status.mtime_client; + afs_check_dir_conflict(op, &op->file[1]); afs_vnode_commit_status(op, &op->file[0]); - if (op->file[1].vnode != op->file[0].vnode) + if (op->file[1].vnode != op->file[0].vnode) { + op->ctime = op->file[1].scb.status.mtime_client; afs_vnode_commit_status(op, &op->file[1]); + } } static void afs_rename_edit_dir(struct afs_operation *op) @@ -1860,6 +1906,8 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, afs_op_set_vnode(op, 1, new_dvnode); /* May be same as orig_dvnode */ op->file[0].dv_delta = 1; op->file[1].dv_delta = 1; + op->file[0].update_ctime = true; + op->file[1].update_ctime = true; op->dentry = old_dentry; op->dentry_2 = new_dentry; diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c index b14e3d9a25e2..04f75a44f243 100644 --- a/fs/afs/dir_silly.c +++ b/fs/afs/dir_silly.c @@ -16,6 +16,7 @@ static void afs_silly_rename_success(struct afs_operation *op) { _enter("op=%08x", op->debug_id); + afs_check_dir_conflict(op, &op->file[0]); afs_vnode_commit_status(op, &op->file[0]); } @@ -69,6 +70,11 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode return PTR_ERR(op); afs_op_set_vnode(op, 0, dvnode); + afs_op_set_vnode(op, 1, dvnode); + op->file[0].dv_delta = 1; + op->file[1].dv_delta = 1; + op->file[0].update_ctime = true; + op->file[1].update_ctime = true; op->dentry = old; op->dentry_2 = new; @@ -129,6 +135,7 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode, switch (ret) { case 0: /* The rename succeeded. */ + set_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags); d_move(dentry, sdentry); break; case -ERESTARTSYS: @@ -148,19 +155,11 @@ out: static void afs_silly_unlink_success(struct afs_operation *op) { - struct afs_vnode *vnode = op->file[1].vnode; - _enter("op=%08x", op->debug_id); - afs_check_for_remote_deletion(op, op->file[0].vnode); + afs_check_dir_conflict(op, &op->file[0]); afs_vnode_commit_status(op, &op->file[0]); afs_vnode_commit_status(op, &op->file[1]); afs_update_dentry_version(op, &op->file[0], op->dentry); - - drop_nlink(&vnode->vfs_inode); - if (vnode->vfs_inode.i_nlink == 0) { - set_bit(AFS_VNODE_DELETED, &vnode->flags); - clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); - } } static void afs_silly_unlink_edit_dir(struct afs_operation *op) @@ -181,6 +180,7 @@ static const struct afs_operation_ops afs_silly_unlink_operation = { .issue_afs_rpc = afs_fs_remove_file, .issue_yfs_rpc = yfs_fs_remove_file, .success = afs_silly_unlink_success, + .aborted = afs_check_for_remote_deletion, .edit_dir = afs_silly_unlink_edit_dir, }; @@ -200,12 +200,30 @@ static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode afs_op_set_vnode(op, 0, dvnode); afs_op_set_vnode(op, 1, vnode); + op->file[0].dv_delta = 1; + op->file[0].update_ctime = true; + op->file[1].op_unlinked = true; + op->file[1].update_ctime = true; op->dentry = dentry; op->ops = &afs_silly_unlink_operation; trace_afs_silly_rename(vnode, true); - return afs_do_sync_operation(op); + afs_begin_vnode_operation(op); + afs_wait_for_operation(op); + + /* If there was a conflict with a third party, check the status of the + * unlinked vnode. + */ + if (op->error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) { + op->file[1].update_ctime = false; + op->fetch_status.which = 1; + op->ops = &afs_fetch_status_operation; + afs_begin_vnode_operation(op); + afs_wait_for_operation(op); + } + + return afs_put_operation(op); } /* diff --git a/fs/afs/file.c b/fs/afs/file.c index 506c47471b42..6f6ed1605cfe 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -225,7 +225,6 @@ static void afs_fetch_data_success(struct afs_operation *op) struct afs_vnode *vnode = op->file[0].vnode; _enter("op=%08x", op->debug_id); - afs_check_for_remote_deletion(op, vnode); afs_vnode_commit_status(op, &op->file[0]); afs_stat_v(vnode, n_fetches); atomic_long_add(op->fetch.req->actual_len, &op->net->n_fetch_bytes); @@ -240,6 +239,7 @@ static const struct afs_operation_ops afs_fetch_data_operation = { .issue_afs_rpc = afs_fs_fetch_data, .issue_yfs_rpc = yfs_fs_fetch_data, .success = afs_fetch_data_success, + .aborted = afs_check_for_remote_deletion, .put = afs_fetch_data_put, }; diff --git a/fs/afs/flock.c b/fs/afs/flock.c index 71eea2a908c7..ffb8575345ca 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -175,10 +175,7 @@ static void afs_kill_lockers_enoent(struct afs_vnode *vnode) static void afs_lock_success(struct afs_operation *op) { - struct afs_vnode *vnode = op->file[0].vnode; - _enter("op=%08x", op->debug_id); - afs_check_for_remote_deletion(op, vnode); afs_vnode_commit_status(op, &op->file[0]); } @@ -186,6 +183,7 @@ static const struct afs_operation_ops afs_set_lock_operation = { .issue_afs_rpc = afs_fs_set_lock, .issue_yfs_rpc = yfs_fs_set_lock, .success = afs_lock_success, + .aborted = afs_check_for_remote_deletion, }; /* diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c index 2d2dff5688a4..24fd163c6323 100644 --- a/fs/afs/fs_operation.c +++ b/fs/afs/fs_operation.c @@ -71,7 +71,7 @@ static bool afs_get_io_locks(struct afs_operation *op) swap(vnode, vnode2); if (mutex_lock_interruptible(&vnode->io_lock) < 0) { - op->error = -EINTR; + op->error = -ERESTARTSYS; op->flags |= AFS_OPERATION_STOP; _leave(" = f [I 0]"); return false; @@ -80,7 +80,7 @@ static bool afs_get_io_locks(struct afs_operation *op) if (vnode2) { if (mutex_lock_interruptible_nested(&vnode2->io_lock, 1) < 0) { - op->error = -EINTR; + op->error = -ERESTARTSYS; op->flags |= AFS_OPERATION_STOP; mutex_unlock(&vnode->io_lock); op->flags &= ~AFS_OPERATION_LOCK_0; @@ -187,9 +187,17 @@ void afs_wait_for_operation(struct afs_operation *op) op->error = afs_wait_for_call_to_complete(op->call, &op->ac); } - if (op->error == 0) { + switch (op->error) { + case 0: _debug("success"); op->ops->success(op); + break; + case -ECONNABORTED: + if (op->ops->aborted) + op->ops->aborted(op); + break; + default: + break; } afs_end_vnode_operation(op); diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c index b34f74b0f319..5d9ef517cf81 100644 --- a/fs/afs/fs_probe.c +++ b/fs/afs/fs_probe.c @@ -314,7 +314,7 @@ void afs_fs_probe_timer(struct timer_list *timer) { struct afs_net *net = container_of(timer, struct afs_net, fs_probe_timer); - if (!queue_work(afs_wq, &net->fs_prober)) + if (!net->live || !queue_work(afs_wq, &net->fs_prober)) afs_dec_servers_outstanding(net); } @@ -458,3 +458,12 @@ dont_wait: return -ETIME; return -EDESTADDRREQ; } + +/* + * Clean up the probing when the namespace is killed off. + */ +void afs_fs_probe_cleanup(struct afs_net *net) +{ + if (del_timer_sync(&net->fs_probe_timer)) + afs_dec_servers_outstanding(net); +} diff --git a/fs/afs/inode.c b/fs/afs/inode.c index cd0a0060950b..1d13d2e882ad 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -165,9 +165,11 @@ static void afs_apply_status(struct afs_operation *op, { struct afs_file_status *status = &vp->scb.status; struct afs_vnode *vnode = vp->vnode; + struct inode *inode = &vnode->vfs_inode; struct timespec64 t; umode_t mode; bool data_changed = false; + bool change_size = vp->set_size; _enter("{%llx:%llu.%u} %s", vp->fid.vid, vp->fid.vnode, vp->fid.unique, @@ -186,25 +188,25 @@ static void afs_apply_status(struct afs_operation *op, } if (status->nlink != vnode->status.nlink) - set_nlink(&vnode->vfs_inode, status->nlink); + set_nlink(inode, status->nlink); if (status->owner != vnode->status.owner) - vnode->vfs_inode.i_uid = make_kuid(&init_user_ns, status->owner); + inode->i_uid = make_kuid(&init_user_ns, status->owner); if (status->group != vnode->status.group) - vnode->vfs_inode.i_gid = make_kgid(&init_user_ns, status->group); + inode->i_gid = make_kgid(&init_user_ns, status->group); if (status->mode != vnode->status.mode) { - mode = vnode->vfs_inode.i_mode; + mode = inode->i_mode; mode &= ~S_IALLUGO; mode |= status->mode; - WRITE_ONCE(vnode->vfs_inode.i_mode, mode); + WRITE_ONCE(inode->i_mode, mode); } t = status->mtime_client; - vnode->vfs_inode.i_ctime = t; - vnode->vfs_inode.i_mtime = t; - vnode->vfs_inode.i_atime = t; + inode->i_mtime = t; + if (vp->update_ctime) + inode->i_ctime = op->ctime; if (vnode->status.data_version != status->data_version) data_changed = true; @@ -226,6 +228,7 @@ static void afs_apply_status(struct afs_operation *op, } else { set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags); } + change_size = true; } else if (vnode->status.type == AFS_FTYPE_DIR) { /* Expected directory change is handled elsewhere so * that we can locally edit the directory and save on a @@ -233,11 +236,22 @@ static void afs_apply_status(struct afs_operation *op, */ if (test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) data_changed = false; + change_size = true; } if (data_changed) { - inode_set_iversion_raw(&vnode->vfs_inode, status->data_version); - afs_set_i_size(vnode, status->size); + inode_set_iversion_raw(inode, status->data_version); + + /* Only update the size if the data version jumped. If the + * file is being modified locally, then we might have our own + * idea of what the size should be that's not the same as + * what's on the server. + */ + if (change_size) { + afs_set_i_size(vnode, status->size); + inode->i_ctime = t; + inode->i_atime = t; + } } } @@ -267,32 +281,39 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v _enter(""); - ASSERTCMP(op->error, ==, 0); - write_seqlock(&vnode->cb_lock); if (vp->scb.have_error) { + /* A YFS server will return this from RemoveFile2 and AFS and + * YFS will return this from InlineBulkStatus. + */ if (vp->scb.status.abort_code == VNOVNODE) { set_bit(AFS_VNODE_DELETED, &vnode->flags); clear_nlink(&vnode->vfs_inode); __afs_break_callback(vnode, afs_cb_break_for_deleted); + op->flags &= ~AFS_OPERATION_DIR_CONFLICT; } - } else { - if (vp->scb.have_status) - afs_apply_status(op, vp); + } else if (vp->scb.have_status) { + afs_apply_status(op, vp); if (vp->scb.have_cb) afs_apply_callback(op, vp); + } else if (vp->op_unlinked && !(op->flags & AFS_OPERATION_DIR_CONFLICT)) { + drop_nlink(&vnode->vfs_inode); + if (vnode->vfs_inode.i_nlink == 0) { + set_bit(AFS_VNODE_DELETED, &vnode->flags); + __afs_break_callback(vnode, afs_cb_break_for_deleted); + } } write_sequnlock(&vnode->cb_lock); - if (op->error == 0 && vp->scb.have_status) + if (vp->scb.have_status) afs_cache_permit(vnode, op->key, vp->cb_break_before, &vp->scb); } static void afs_fetch_status_success(struct afs_operation *op) { - struct afs_vnode_param *vp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[op->fetch_status.which]; struct afs_vnode *vnode = vp->vnode; int ret; @@ -306,10 +327,11 @@ static void afs_fetch_status_success(struct afs_operation *op) } } -static const struct afs_operation_ops afs_fetch_status_operation = { +const struct afs_operation_ops afs_fetch_status_operation = { .issue_afs_rpc = afs_fs_fetch_status, .issue_yfs_rpc = yfs_fs_fetch_status, .success = afs_fetch_status_success, + .aborted = afs_check_for_remote_deletion, }; /* @@ -716,6 +738,9 @@ int afs_getattr(const struct path *path, struct kstat *stat, do { read_seqbegin_or_lock(&vnode->cb_lock, &seq); generic_fillattr(inode, stat); + if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) && + stat->nlink > 0) + stat->nlink -= 1; } while (need_seqretry(&vnode->cb_lock, seq)); done_seqretry(&vnode->cb_lock, seq); @@ -785,7 +810,15 @@ void afs_evict_inode(struct inode *inode) static void afs_setattr_success(struct afs_operation *op) { + struct inode *inode = &op->file[0].vnode->vfs_inode; + afs_vnode_commit_status(op, &op->file[0]); + if (op->setattr.attr->ia_valid & ATTR_SIZE) { + loff_t i_size = inode->i_size, size = op->setattr.attr->ia_size; + if (size > i_size) + pagecache_isize_extended(inode, i_size, size); + truncate_pagecache(inode, size); + } } static const struct afs_operation_ops afs_setattr_operation = { @@ -801,17 +834,31 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr) { struct afs_operation *op; struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); + int ret; _enter("{%llx:%llu},{n=%pd},%x", vnode->fid.vid, vnode->fid.vnode, dentry, attr->ia_valid); if (!(attr->ia_valid & (ATTR_SIZE | ATTR_MODE | ATTR_UID | ATTR_GID | - ATTR_MTIME))) { + ATTR_MTIME | ATTR_MTIME_SET | ATTR_TIMES_SET | + ATTR_TOUCH))) { _leave(" = 0 [unsupported]"); return 0; } + if (attr->ia_valid & ATTR_SIZE) { + if (!S_ISREG(vnode->vfs_inode.i_mode)) + return -EISDIR; + + ret = inode_newsize_ok(&vnode->vfs_inode, attr->ia_size); + if (ret) + return ret; + + if (attr->ia_size == i_size_read(&vnode->vfs_inode)) + attr->ia_valid &= ~ATTR_SIZE; + } + /* flush any dirty data outstanding on a regular file */ if (S_ISREG(vnode->vfs_inode.i_mode)) filemap_write_and_wait(vnode->vfs_inode.i_mapping); @@ -825,8 +872,12 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr) afs_op_set_vnode(op, 0, vnode); op->setattr.attr = attr; - if (attr->ia_valid & ATTR_SIZE) + if (attr->ia_valid & ATTR_SIZE) { op->file[0].dv_delta = 1; + op->file[0].set_size = true; + } + op->ctime = attr->ia_ctime; + op->file[0].update_ctime = 1; op->ops = &afs_setattr_operation; return afs_do_sync_operation(op); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 0c9806ef2a19..792ac711985e 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -388,7 +388,7 @@ struct afs_cell { struct afs_vlserver_list __rcu *vl_servers; u8 name_len; /* Length of name */ - char name[64 + 1]; /* Cell name, case-flattened and NUL-padded */ + char *name; /* Cell name, case-flattened and NUL-padded */ }; /* @@ -634,6 +634,7 @@ struct afs_vnode { #define AFS_VNODE_AUTOCELL 6 /* set if Vnode is an auto mount point */ #define AFS_VNODE_PSEUDODIR 7 /* set if Vnode is a pseudo directory */ #define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */ +#define AFS_VNODE_SILLY_DELETED 9 /* Set if file has been silly-deleted */ struct list_head wb_keys; /* List of keys available for writeback */ struct list_head pending_locks; /* locks waiting to be granted */ @@ -744,8 +745,11 @@ struct afs_vnode_param { afs_dataversion_t dv_before; /* Data version before the call */ unsigned int cb_break_before; /* cb_break + cb_s_break before the call */ u8 dv_delta; /* Expected change in data version */ - bool put_vnode; /* T if we have a ref on the vnode */ - bool need_io_lock; /* T if we need the I/O lock on this */ + bool put_vnode:1; /* T if we have a ref on the vnode */ + bool need_io_lock:1; /* T if we need the I/O lock on this */ + bool update_ctime:1; /* Need to update the ctime */ + bool set_size:1; /* Must update i_size */ + bool op_unlinked:1; /* True if file was unlinked by op */ }; /* @@ -766,9 +770,9 @@ struct afs_operation { struct dentry *dentry; /* Dentry to be altered */ struct dentry *dentry_2; /* Second dentry to be altered */ struct timespec64 mtime; /* Modification time to record */ + struct timespec64 ctime; /* Change time to set */ short nr_files; /* Number of entries in file[], more_files */ short error; - unsigned int abort_code; unsigned int debug_id; unsigned int cb_v_break; /* Volume break counter before op */ @@ -837,6 +841,7 @@ struct afs_operation { #define AFS_OPERATION_LOCK_1 0x0200 /* Set if have io_lock on file[1] */ #define AFS_OPERATION_TRIED_ALL 0x0400 /* Set if we've tried all the fileservers */ #define AFS_OPERATION_RETRY_SERVER 0x0800 /* Set if we should retry the current server */ +#define AFS_OPERATION_DIR_CONFLICT 0x1000 /* Set if we detected a 3rd-party dir change */ }; /* @@ -932,6 +937,7 @@ extern const struct address_space_operations afs_dir_aops; extern const struct dentry_operations afs_fs_dentry_operations; extern void afs_d_release(struct dentry *); +extern void afs_check_for_remote_deletion(struct afs_operation *); /* * dir_edit.c @@ -1059,10 +1065,13 @@ extern int afs_wait_for_fs_probes(struct afs_server_list *, unsigned long); extern void afs_probe_fileserver(struct afs_net *, struct afs_server *); extern void afs_fs_probe_dispatcher(struct work_struct *); extern int afs_wait_for_one_fs_probe(struct afs_server *, bool); +extern void afs_fs_probe_cleanup(struct afs_net *); /* * inode.c */ +extern const struct afs_operation_ops afs_fetch_status_operation; + extern void afs_vnode_commit_status(struct afs_operation *, struct afs_vnode_param *); extern int afs_fetch_status(struct afs_vnode *, struct key *, bool, afs_access_t *); extern int afs_ilookup5_test_by_fid(struct inode *, void *); @@ -1435,7 +1444,6 @@ extern ssize_t afs_listxattr(struct dentry *, char *, size_t); /* * yfsclient.c */ -extern void yfs_fs_fetch_file_status(struct afs_operation *); extern void yfs_fs_fetch_data(struct afs_operation *); extern void yfs_fs_create_file(struct afs_operation *); extern void yfs_fs_make_dir(struct afs_operation *); @@ -1481,15 +1489,6 @@ static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode) return &vnode->vfs_inode; } -static inline void afs_check_for_remote_deletion(struct afs_operation *op, - struct afs_vnode *vnode) -{ - if (op->error == -ENOENT) { - set_bit(AFS_VNODE_DELETED, &vnode->flags); - afs_break_callback(vnode, afs_cb_break_for_deleted); - } -} - /* * Note that a dentry got changed. We need to set d_fsdata to the data version * number derived from the result of the operation. It doesn't matter if @@ -1504,6 +1503,18 @@ static inline void afs_update_dentry_version(struct afs_operation *op, (void *)(unsigned long)dir_vp->scb.status.data_version; } +/* + * Check for a conflicting operation on a directory that we just unlinked from. + * If someone managed to sneak a link or an unlink in on the file we just + * unlinked, we won't be able to trust nlink on an AFS file (but not YFS). + */ +static inline void afs_check_dir_conflict(struct afs_operation *op, + struct afs_vnode_param *dvp) +{ + if (dvp->dv_before + dvp->dv_delta != dvp->scb.status.data_version) + op->flags |= AFS_OPERATION_DIR_CONFLICT; +} + static inline int afs_io_error(struct afs_call *call, enum afs_io_error where) { trace_afs_io_error(call->debug_id, -EIO, where); diff --git a/fs/afs/main.c b/fs/afs/main.c index 9c79c91e8005..31b472f7c734 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -100,6 +100,7 @@ static int __net_init afs_net_init(struct net *net_ns) timer_setup(&net->fs_timer, afs_servers_timer, 0); INIT_WORK(&net->fs_prober, afs_fs_probe_dispatcher); timer_setup(&net->fs_probe_timer, afs_fs_probe_timer, 0); + atomic_set(&net->servers_outstanding, 1); ret = -ENOMEM; sysnames = kzalloc(sizeof(*sysnames), GFP_KERNEL); @@ -130,6 +131,7 @@ static int __net_init afs_net_init(struct net *net_ns) error_open_socket: net->live = false; + afs_fs_probe_cleanup(net); afs_cell_purge(net); afs_purge_servers(net); error_cell_init: @@ -150,6 +152,7 @@ static void __net_exit afs_net_exit(struct net *net_ns) struct afs_net *net = afs_net(net_ns); net->live = false; + afs_fs_probe_cleanup(net); afs_cell_purge(net); afs_purge_servers(net); afs_close_socket(net); diff --git a/fs/afs/misc.c b/fs/afs/misc.c index 52b19e9c1535..5334f1bd2bca 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -83,6 +83,7 @@ int afs_abort_to_error(u32 abort_code) case UAENOLCK: return -ENOLCK; case UAENOTEMPTY: return -ENOTEMPTY; case UAELOOP: return -ELOOP; + case UAEOVERFLOW: return -EOVERFLOW; case UAENOMEDIUM: return -ENOMEDIUM; case UAEDQUOT: return -EDQUOT; diff --git a/fs/afs/server.c b/fs/afs/server.c index 039e3488511c..e82e452e2612 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -605,11 +605,12 @@ void afs_purge_servers(struct afs_net *net) _enter(""); if (del_timer_sync(&net->fs_timer)) - atomic_dec(&net->servers_outstanding); + afs_dec_servers_outstanding(net); afs_queue_server_manager(net); _debug("wait"); + atomic_dec(&net->servers_outstanding); wait_var_event(&net->servers_outstanding, !atomic_read(&net->servers_outstanding)); _leave(""); diff --git a/fs/afs/write.c b/fs/afs/write.c index 768497f82aee..a121c247d95a 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -194,11 +194,11 @@ int afs_write_end(struct file *file, struct address_space *mapping, i_size = i_size_read(&vnode->vfs_inode); if (maybe_i_size > i_size) { - spin_lock(&vnode->wb_lock); + write_seqlock(&vnode->cb_lock); i_size = i_size_read(&vnode->vfs_inode); if (maybe_i_size > i_size) i_size_write(&vnode->vfs_inode, maybe_i_size); - spin_unlock(&vnode->wb_lock); + write_sequnlock(&vnode->cb_lock); } if (!PageUptodate(page)) { @@ -393,6 +393,7 @@ static void afs_store_data_success(struct afs_operation *op) { struct afs_vnode *vnode = op->file[0].vnode; + op->ctime = op->file[0].scb.status.mtime_client; afs_vnode_commit_status(op, &op->file[0]); if (op->error == 0) { afs_pages_written_back(vnode, op->store.first, op->store.last); @@ -448,6 +449,7 @@ static int afs_store_data(struct address_space *mapping, op->store.first_offset = offset; op->store.last_to = to; op->mtime = vnode->vfs_inode.i_mtime; + op->flags |= AFS_OPERATION_UNINTR; op->ops = &afs_store_data_operation; try_next_key: @@ -491,6 +493,7 @@ static int afs_write_back_from_locked_page(struct address_space *mapping, unsigned long count, priv; unsigned n, offset, to, f, t; pgoff_t start, first, last; + loff_t i_size, end; int loop, ret; _enter(",%lx", primary_page->index); @@ -591,7 +594,12 @@ no_more: first = primary_page->index; last = first + count - 1; + end = (loff_t)last * PAGE_SIZE + to; + i_size = i_size_read(&vnode->vfs_inode); + _debug("write back %lx[%u..] to %lx[..%u]", first, offset, last, to); + if (end > i_size) + to = i_size & ~PAGE_MASK; ret = afs_store_data(mapping, first, last, offset, to); switch (ret) { @@ -844,6 +852,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) vmf->page->index, priv); SetPagePrivate(vmf->page); set_page_private(vmf->page, priv); + file_update_time(file); sb_end_pagefault(inode->i_sb); return VM_FAULT_LOCKED; diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 52d5af5fcd44..8c24fdc899e3 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -330,29 +330,6 @@ static void xdr_decode_YFSFetchVolumeStatus(const __be32 **_bp, } /* - * Deliver a reply that's a status, callback and volsync. - */ -static int yfs_deliver_fs_status_cb_and_volsync(struct afs_call *call) -{ - struct afs_operation *op = call->op; - const __be32 *bp; - int ret; - - ret = afs_transfer_reply(call); - if (ret < 0) - return ret; - - /* unmarshall the reply once we've received all of it */ - bp = call->buffer; - xdr_decode_YFSFetchStatus(&bp, call, &op->file[0].scb); - xdr_decode_YFSCallBack(&bp, call, &op->file[0].scb); - xdr_decode_YFSVolSync(&bp, &op->volsync); - - _leave(" = 0 [done]"); - return 0; -} - -/* * Deliver reply data to operations that just return a file status and a volume * sync record. */ @@ -375,48 +352,6 @@ static int yfs_deliver_status_and_volsync(struct afs_call *call) } /* - * YFS.FetchStatus operation type - */ -static const struct afs_call_type yfs_RXYFSFetchStatus_vnode = { - .name = "YFS.FetchStatus(vnode)", - .op = yfs_FS_FetchStatus, - .deliver = yfs_deliver_fs_status_cb_and_volsync, - .destructor = afs_flat_call_destructor, -}; - -/* - * Fetch the status information for a file. - */ -void yfs_fs_fetch_file_status(struct afs_operation *op) -{ - struct afs_vnode_param *vp = &op->file[0]; - struct afs_call *call; - __be32 *bp; - - _enter(",%x,{%llx:%llu},,", - key_serial(op->key), vp->fid.vid, vp->fid.vnode); - - call = afs_alloc_flat_call(op->net, &yfs_RXYFSFetchStatus_vnode, - sizeof(__be32) * 2 + - sizeof(struct yfs_xdr_YFSFid), - sizeof(struct yfs_xdr_YFSFetchStatus) + - sizeof(struct yfs_xdr_YFSCallBack) + - sizeof(struct yfs_xdr_YFSVolSync)); - if (!call) - return afs_op_nomem(op); - - /* marshall the parameters */ - bp = call->request; - bp = xdr_encode_u32(bp, YFSFETCHSTATUS); - bp = xdr_encode_u32(bp, 0); /* RPC flags */ - bp = xdr_encode_YFSFid(bp, &vp->fid); - yfs_check_req(call, bp); - - trace_afs_make_fs_call(call, &vp->fid); - afs_make_op_call(op, call, GFP_NOFS); -} - -/* * Deliver reply data to an YFS.FetchData64. */ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) @@ -1605,12 +1540,36 @@ void yfs_fs_release_lock(struct afs_operation *op) } /* + * Deliver a reply to YFS.FetchStatus + */ +static int yfs_deliver_fs_fetch_status(struct afs_call *call) +{ + struct afs_operation *op = call->op; + struct afs_vnode_param *vp = &op->file[op->fetch_status.which]; + const __be32 *bp; + int ret; + + ret = afs_transfer_reply(call); + if (ret < 0) + return ret; + + /* unmarshall the reply once we've received all of it */ + bp = call->buffer; + xdr_decode_YFSFetchStatus(&bp, call, &vp->scb); + xdr_decode_YFSCallBack(&bp, call, &vp->scb); + xdr_decode_YFSVolSync(&bp, &op->volsync); + + _leave(" = 0 [done]"); + return 0; +} + +/* * YFS.FetchStatus operation type */ static const struct afs_call_type yfs_RXYFSFetchStatus = { .name = "YFS.FetchStatus", .op = yfs_FS_FetchStatus, - .deliver = yfs_deliver_fs_status_cb_and_volsync, + .deliver = yfs_deliver_fs_fetch_status, .destructor = afs_flat_call_destructor, }; @@ -1619,7 +1578,7 @@ static const struct afs_call_type yfs_RXYFSFetchStatus = { */ void yfs_fs_fetch_status(struct afs_operation *op) { - struct afs_vnode_param *vp = &op->file[0]; + struct afs_vnode_param *vp = &op->file[op->fetch_status.which]; struct afs_call *call; __be32 *bp; @@ -67,7 +67,7 @@ struct aio_ring { unsigned header_length; /* size of aio_ring */ - struct io_event io_events[0]; + struct io_event io_events[]; }; /* 128 bytes + ring size */ /* diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c index b04c528b19d3..74c886f7c51c 100644 --- a/fs/autofs/waitq.c +++ b/fs/autofs/waitq.c @@ -53,7 +53,7 @@ static int autofs_write(struct autofs_sb_info *sbi, mutex_lock(&sbi->pipe_mutex); while (bytes) { - wr = __kernel_write(file, data, bytes, &file->f_pos); + wr = kernel_write(file, data, bytes, &file->f_pos); if (wr <= 0) break; data += wr; diff --git a/fs/block_dev.c b/fs/block_dev.c index 47860e589388..0ae656e022fd 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -75,7 +75,7 @@ static void bdev_write_inode(struct block_device *bdev) } /* Kill _all_ buffers and pagecache , dirty or not.. */ -void kill_bdev(struct block_device *bdev) +static void kill_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_inode->i_mapping; @@ -84,8 +84,7 @@ void kill_bdev(struct block_device *bdev) invalidate_bh_lrus(); truncate_inode_pages(mapping, 0); -} -EXPORT_SYMBOL(kill_bdev); +} /* Invalidate clean unused buffers and pagecache. */ void invalidate_bdev(struct block_device *bdev) @@ -1565,10 +1564,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) */ if (!for_part) { ret = devcgroup_inode_permission(bdev->bd_inode, perm); - if (ret != 0) { - bdput(bdev); + if (ret != 0) return ret; - } } restart: @@ -1637,8 +1634,10 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_clear; BUG_ON(for_part); ret = __blkdev_get(whole, mode, 1); - if (ret) + if (ret) { + bdput(whole); goto out_clear; + } bdev->bd_contains = whole; bdev->bd_part = disk_get_part(disk, partno); if (!(disk->flags & GENHD_FL_UP) || @@ -1688,7 +1687,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk_unblock_events(disk); put_disk_and_module(disk); out: - bdput(bdev); return ret; } @@ -1755,6 +1753,9 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) bdput(whole); } + if (res) + bdput(bdev); + return res; } EXPORT_SYMBOL(blkdev_get); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index d888e71e66b6..ea10f7bc99ab 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1461,6 +1461,7 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, if (ret < 0 && ret != -ENOENT) { ulist_free(tmp); ulist_free(*roots); + *roots = NULL; return ret; } node = ulist_next(tmp, &uiter); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 176e8a292fd1..c037ef514b64 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -940,7 +940,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; - goto out_put_group; + goto out; } /* @@ -978,7 +978,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) { btrfs_add_delayed_iput(inode); - goto out_put_group; + goto out; } clear_nlink(inode); /* One for the block groups ref */ @@ -1001,13 +1001,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); if (ret < 0) - goto out_put_group; + goto out; if (ret > 0) btrfs_release_path(path); if (ret == 0) { ret = btrfs_del_item(trans, tree_root, path); if (ret) - goto out_put_group; + goto out; btrfs_release_path(path); } @@ -1016,6 +1016,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, &fs_info->block_group_cache_tree); RB_CLEAR_NODE(&block_group->cache_node); + /* Once for the block groups rbtree */ + btrfs_put_block_group(block_group); + if (fs_info->first_logical_byte == block_group->start) fs_info->first_logical_byte = (u64)-1; spin_unlock(&fs_info->block_group_cache_lock); @@ -1089,6 +1092,25 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&block_group->space_info->lock); + /* + * Remove the free space for the block group from the free space tree + * and the block group's item from the extent tree before marking the + * block group as removed. This is to prevent races with tasks that + * freeze and unfreeze a block group, this task and another task + * allocating a new block group - the unfreeze task ends up removing + * the block group's extent map before the task calling this function + * deletes the block group item from the extent tree, allowing for + * another task to attempt to create another block group with the same + * item key (and failing with -EEXIST and a transaction abort). + */ + ret = remove_block_group_free_space(trans, block_group); + if (ret) + goto out; + + ret = remove_block_group_item(trans, path, block_group); + if (ret < 0) + goto out; + mutex_lock(&fs_info->chunk_mutex); spin_lock(&block_group->lock); block_group->removed = 1; @@ -1123,17 +1145,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, mutex_unlock(&fs_info->chunk_mutex); - ret = remove_block_group_free_space(trans, block_group); - if (ret) - goto out_put_group; - - /* Once for the block groups rbtree */ - btrfs_put_block_group(block_group); - - ret = remove_block_group_item(trans, path, block_group); - if (ret < 0) - goto out; - if (remove_em) { struct extent_map_tree *em_tree; @@ -1145,10 +1156,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, free_extent_map(em); } -out_put_group: +out: /* Once for the lookup reference */ btrfs_put_block_group(block_group); -out: if (remove_rsv) btrfs_delayed_refs_rsv_release(fs_info, 1); btrfs_free_path(path); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 3a7648bff42c..82ab6e5a386d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1196,7 +1196,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, switch (tm->op) { case MOD_LOG_KEY_REMOVE_WHILE_FREEING: BUG_ON(tm->slot < n); - /* Fallthrough */ + fallthrough; case MOD_LOG_KEY_REMOVE_WHILE_MOVING: case MOD_LOG_KEY_REMOVE: btrfs_set_node_key(eb, &tm->key, tm->slot); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 30ce7039bc27..d404cce8ae40 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1009,6 +1009,8 @@ enum { BTRFS_ROOT_DEAD_RELOC_TREE, /* Mark dead root stored on device whose cleanup needs to be resumed */ BTRFS_ROOT_DEAD_TREE, + /* The root has a log tree. Used only for subvolume roots. */ + BTRFS_ROOT_HAS_LOG_TREE, }; /* diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 5615320fa659..741c7e19c32f 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -619,6 +619,7 @@ void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info) list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs, bg_list) { list_del_init(&block_group->bg_list); + btrfs_put_block_group(block_group); btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); } spin_unlock(&fs_info->unused_bgs_lock); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7c6f0bbb54a5..b1a148058773 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2593,10 +2593,12 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) !extent_buffer_uptodate(tree_root->node)) { handle_error = true; - if (IS_ERR(tree_root->node)) + if (IS_ERR(tree_root->node)) { ret = PTR_ERR(tree_root->node); - else if (!extent_buffer_uptodate(tree_root->node)) + tree_root->node = NULL; + } else if (!extent_buffer_uptodate(tree_root->node)) { ret = -EUCLEAN; + } btrfs_warn(fs_info, "failed to read tree root"); continue; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 68c96057ad2d..60278e52c37a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1999,7 +1999,8 @@ static int __process_pages_contig(struct address_space *mapping, if (!PageDirty(pages[i]) || pages[i]->mapping != mapping) { unlock_page(pages[i]); - put_page(pages[i]); + for (; i < ret; i++) + put_page(pages[i]); err = -EAGAIN; goto out; } @@ -5058,25 +5059,28 @@ struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, static void check_buffer_tree_ref(struct extent_buffer *eb) { int refs; - /* the ref bit is tricky. We have to make sure it is set - * if we have the buffer dirty. Otherwise the - * code to free a buffer can end up dropping a dirty - * page + /* + * The TREE_REF bit is first set when the extent_buffer is added + * to the radix tree. It is also reset, if unset, when a new reference + * is created by find_extent_buffer. * - * Once the ref bit is set, it won't go away while the - * buffer is dirty or in writeback, and it also won't - * go away while we have the reference count on the - * eb bumped. + * It is only cleared in two cases: freeing the last non-tree + * reference to the extent_buffer when its STALE bit is set or + * calling releasepage when the tree reference is the only reference. * - * We can't just set the ref bit without bumping the - * ref on the eb because free_extent_buffer might - * see the ref bit and try to clear it. If this happens - * free_extent_buffer might end up dropping our original - * ref by mistake and freeing the page before we are able - * to add one more ref. + * In both cases, care is taken to ensure that the extent_buffer's + * pages are not under io. However, releasepage can be concurrently + * called with creating new references, which is prone to race + * conditions between the calls to check_buffer_tree_ref in those + * codepaths and clearing TREE_REF in try_release_extent_buffer. * - * So bump the ref count first, then set the bit. If someone - * beat us to it, drop the ref we added. + * The actual lifetime of the extent_buffer in the radix tree is + * adequately protected by the refcount, but the TREE_REF bit and + * its corresponding reference are not. To protect against this + * class of races, we call check_buffer_tree_ref from the codepaths + * which trigger io after they set eb->io_pages. Note that once io is + * initiated, TREE_REF can no longer be cleared, so that is the + * moment at which any such race is best fixed. */ refs = atomic_read(&eb->refs); if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) @@ -5527,6 +5531,11 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = 0; atomic_set(&eb->io_pages, num_reads); + /* + * It is possible for releasepage to clear the TREE_REF bit before we + * set io_pages. See check_buffer_tree_ref for a more detailed comment. + */ + check_buffer_tree_ref(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 2c14312b05e8..b0d2c976587e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1533,7 +1533,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, } static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, - size_t *write_bytes) + size_t *write_bytes, bool nowait) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; @@ -1541,27 +1541,43 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, u64 num_bytes; int ret; - if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) + if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock)) return -EAGAIN; lockstart = round_down(pos, fs_info->sectorsize); lockend = round_up(pos + *write_bytes, fs_info->sectorsize) - 1; + num_bytes = lockend - lockstart + 1; - btrfs_lock_and_flush_ordered_range(inode, lockstart, - lockend, NULL); + if (nowait) { + struct btrfs_ordered_extent *ordered; + + if (!try_lock_extent(&inode->io_tree, lockstart, lockend)) + return -EAGAIN; + + ordered = btrfs_lookup_ordered_range(inode, lockstart, + num_bytes); + if (ordered) { + btrfs_put_ordered_extent(ordered); + ret = -EAGAIN; + goto out_unlock; + } + } else { + btrfs_lock_and_flush_ordered_range(inode, lockstart, + lockend, NULL); + } - num_bytes = lockend - lockstart + 1; ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, NULL, NULL, NULL); if (ret <= 0) { ret = 0; - btrfs_drew_write_unlock(&root->snapshot_lock); + if (!nowait) + btrfs_drew_write_unlock(&root->snapshot_lock); } else { *write_bytes = min_t(size_t, *write_bytes , num_bytes - pos + lockstart); } - +out_unlock: unlock_extent(&inode->io_tree, lockstart, lockend); return ret; @@ -1633,7 +1649,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) && check_can_nocow(BTRFS_I(inode), pos, - &write_bytes) > 0) { + &write_bytes, false) > 0) { /* * For nodata cow case, no need to reserve * data space. @@ -1904,13 +1920,25 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, pos = iocb->ki_pos; count = iov_iter_count(from); if (iocb->ki_flags & IOCB_NOWAIT) { + size_t nocow_bytes = count; + /* * We will allocate space in case nodatacow is not set, * so bail */ if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) || - check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) { + check_can_nocow(BTRFS_I(inode), pos, &nocow_bytes, + true) <= 0) { + inode_unlock(inode); + return -EAGAIN; + } + /* + * There are holes in the range or parts of the range that must + * be COWed (shared extents, RO block groups, etc), so just bail + * out. + */ + if (nocow_bytes < count) { inode_unlock(inode); return -EAGAIN; } @@ -3481,6 +3509,7 @@ const struct file_operations btrfs_file_operations = { .read_iter = generic_file_read_iter, .splice_read = generic_file_splice_read, .write_iter = btrfs_file_write_iter, + .splice_write = iter_file_splice_write, .mmap = btrfs_file_mmap, .open = btrfs_file_open, .release = btrfs_release_file, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d04c82c88418..6862cd7e21a9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -985,6 +985,7 @@ static noinline int cow_file_range(struct inode *inode, u64 num_bytes; unsigned long ram_size; u64 cur_alloc_size = 0; + u64 min_alloc_size; u64 blocksize = fs_info->sectorsize; struct btrfs_key ins; struct extent_map *em; @@ -1035,10 +1036,26 @@ static noinline int cow_file_range(struct inode *inode, btrfs_drop_extent_cache(BTRFS_I(inode), start, start + num_bytes - 1, 0); + /* + * Relocation relies on the relocated extents to have exactly the same + * size as the original extents. Normally writeback for relocation data + * extents follows a NOCOW path because relocation preallocates the + * extents. However, due to an operation such as scrub turning a block + * group to RO mode, it may fallback to COW mode, so we must make sure + * an extent allocated during COW has exactly the requested size and can + * not be split into smaller extents, otherwise relocation breaks and + * fails during the stage where it updates the bytenr of file extent + * items. + */ + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + min_alloc_size = num_bytes; + else + min_alloc_size = fs_info->sectorsize; + while (num_bytes > 0) { cur_alloc_size = num_bytes; ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, - fs_info->sectorsize, 0, alloc_hint, + min_alloc_size, 0, alloc_hint, &ins, 1, 1); if (ret < 0) goto out_unlock; @@ -1361,6 +1378,8 @@ static int fallback_to_cow(struct inode *inode, struct page *locked_page, int *page_started, unsigned long *nr_written) { const bool is_space_ino = btrfs_is_free_space_inode(BTRFS_I(inode)); + const bool is_reloc_ino = (BTRFS_I(inode)->root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID); const u64 range_bytes = end + 1 - start; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; u64 range_start = start; @@ -1391,18 +1410,23 @@ static int fallback_to_cow(struct inode *inode, struct page *locked_page, * data space info, which we incremented in the step above. * * If we need to fallback to cow and the inode corresponds to a free - * space cache inode, we must also increment bytes_may_use of the data - * space_info for the same reason. Space caches always get a prealloc + * space cache inode or an inode of the data relocation tree, we must + * also increment bytes_may_use of the data space_info for the same + * reason. Space caches and relocated data extents always get a prealloc * extent for them, however scrub or balance may have set the block - * group that contains that extent to RO mode. + * group that contains that extent to RO mode and therefore force COW + * when starting writeback. */ count = count_range_bits(io_tree, &range_start, end, range_bytes, EXTENT_NORESERVE, 0); - if (count > 0 || is_space_ino) { - const u64 bytes = is_space_ino ? range_bytes : count; + if (count > 0 || is_space_ino || is_reloc_ino) { + u64 bytes = count; struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct btrfs_space_info *sinfo = fs_info->data_sinfo; + if (is_space_ino || is_reloc_ino) + bytes = range_bytes; + spin_lock(&sinfo->lock); btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes); spin_unlock(&sinfo->lock); @@ -1666,12 +1690,8 @@ out_check: ret = fallback_to_cow(inode, locked_page, cow_start, found_key.offset - 1, page_started, nr_written); - if (ret) { - if (nocow) - btrfs_dec_nocow_writers(fs_info, - disk_bytenr); + if (ret) goto error; - } cow_start = (u64)-1; } @@ -1687,9 +1707,6 @@ out_check: ram_bytes, BTRFS_COMPRESS_NONE, BTRFS_ORDERED_PREALLOC); if (IS_ERR(em)) { - if (nocow) - btrfs_dec_nocow_writers(fs_info, - disk_bytenr); ret = PTR_ERR(em); goto error; } @@ -7865,9 +7882,6 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) dio_data.overwrite = 1; inode_unlock(inode); relock = true; - } else if (iocb->ki_flags & IOCB_NOWAIT) { - ret = -EAGAIN; - goto out; } ret = btrfs_delalloc_reserve_space(inode, &data_reserved, offset, count); @@ -8109,20 +8123,17 @@ again: /* * Qgroup reserved space handler * Page here will be either - * 1) Already written to disk - * In this case, its reserved space is released from data rsv map - * and will be freed by delayed_ref handler finally. - * So even we call qgroup_free_data(), it won't decrease reserved - * space. - * 2) Not written to disk - * This means the reserved space should be freed here. However, - * if a truncate invalidates the page (by clearing PageDirty) - * and the page is accounted for while allocating extent - * in btrfs_check_data_free_space() we let delayed_ref to - * free the entire extent. + * 1) Already written to disk or ordered extent already submitted + * Then its QGROUP_RESERVED bit in io_tree is already cleaned. + * Qgroup will be handled by its qgroup_record then. + * btrfs_qgroup_free_data() call will do nothing here. + * + * 2) Not written to disk yet + * Then btrfs_qgroup_free_data() call will clear the QGROUP_RESERVED + * bit of its io_tree, and free the qgroup reserved data space. + * Since the IO will never happen for this page. */ - if (PageDirty(page)) - btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); + btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 168deb8ef68a..e8f7c5f00894 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2692,7 +2692,7 @@ out: btrfs_put_root(root); out_free: btrfs_free_path(path); - kzfree(subvol_info); + kfree(subvol_info); return ret; } diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 7887317033c9..af92525dbb16 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -509,7 +509,7 @@ static int process_leaf(struct btrfs_root *root, switch (key.type) { case BTRFS_EXTENT_ITEM_KEY: *num_bytes = key.offset; - /* fall through */ + fallthrough; case BTRFS_METADATA_ITEM_KEY: *bytenr = key.objectid; ret = process_extent_item(fs_info, path, &key, i, diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 41ee88633769..c7bd3fdd7792 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -879,8 +879,8 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info, return false; } global_rsv->reserved -= ticket->bytes; + remove_ticket(space_info, ticket); ticket->bytes = 0; - list_del_init(&ticket->list); wake_up(&ticket->wait); space_info->tickets_id++; if (global_rsv->reserved < global_rsv->size) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index bc73fd670702..c3826ae883f0 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -523,7 +523,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_compress_force: case Opt_compress_force_type: compress_force = true; - /* Fallthrough */ + fallthrough; case Opt_compress: case Opt_compress_type: saved_compress_type = btrfs_test_opt(info, @@ -622,7 +622,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_set_opt(info->mount_opt, NOSSD); btrfs_clear_and_info(info, SSD, "not using ssd optimizations"); - /* Fallthrough */ + fallthrough; case Opt_nossd_spread: btrfs_clear_and_info(info, SSD_SPREAD, "not using spread ssd allocation scheme"); @@ -793,7 +793,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_recovery: btrfs_warn(info, "'recovery' is deprecated, use 'usebackuproot' instead"); - /* fall through */ + fallthrough; case Opt_usebackuproot: btrfs_info(info, "trying to use backup root at mount time"); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 920cee312f4e..cd5348f352dd 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -169,6 +169,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, if (ret) goto out; + set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); root->log_start_pid = current->pid; } @@ -195,6 +196,9 @@ static int join_running_log_trans(struct btrfs_root *root) { int ret = -ENOENT; + if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state)) + return ret; + mutex_lock(&root->log_mutex); if (root->log_root) { ret = 0; @@ -3303,6 +3307,7 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) if (root->log_root) { free_log_tree(trans, root->log_root); root->log_root = NULL; + clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); } return 0; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0d6e785bcb98..f403fb1e6d37 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7052,6 +7052,14 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) mutex_lock(&fs_info->chunk_mutex); /* + * It is possible for mount and umount to race in such a way that + * we execute this code path, but open_fs_devices failed to clear + * total_rw_bytes. We certainly want it cleared before reading the + * device items, so clear it here. + */ + fs_info->fs_devices->total_rw_bytes = 0; + + /* * Read all device items, and then all the chunk items. All * device items are found before any chunk item (their object id * is smaller than the lowest possible object id for a chunk diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index f067b5934c46..75af2334b2e3 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -408,7 +408,7 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio) return BTRFS_MAP_WRITE; default: WARN_ON_ONCE(1); - /* fall through */ + fallthrough; case REQ_OP_READ: return BTRFS_MAP_READ; } diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index e7726f5f1241..3080cda9e824 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -937,7 +937,7 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page) } data = kmap(page); - ret = __kernel_write(file, data, len, &pos); + ret = kernel_write(file, data, len, &pos); kunmap(page); fput(file); if (ret != len) diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index fc98b97b396a..53588d7517b4 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -399,6 +399,10 @@ skip_rdma: if (ses->sign) seq_puts(m, " signed"); + seq_printf(m, "\n\tUser: %d Cred User: %d", + from_kuid(&init_user_ns, ses->linux_uid), + from_kuid(&init_user_ns, ses->cred_uid)); + if (ses->chan_count > 1) { seq_printf(m, "\n\n\tExtra Channels: %zu\n", ses->chan_count-1); @@ -406,7 +410,7 @@ skip_rdma: cifs_dump_channel(m, j, &ses->chans[j]); } - seq_puts(m, "\n\tShares:"); + seq_puts(m, "\n\n\tShares:"); j = 0; seq_printf(m, "\n\t%d) IPC: ", j); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index c7a311d28d3d..99b3180c613a 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -156,5 +156,5 @@ extern int cifs_truncate_page(struct address_space *mapping, loff_t from); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.27" +#define CIFS_VERSION "2.28" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 5fac34f192af..a61abde09ffe 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -5306,9 +5306,15 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) vol_info->nocase = master_tcon->nocase; vol_info->nohandlecache = master_tcon->nohandlecache; vol_info->local_lease = master_tcon->local_lease; + vol_info->no_lease = master_tcon->no_lease; + vol_info->resilient = master_tcon->use_resilient; + vol_info->persistent = master_tcon->use_persistent; + vol_info->handle_timeout = master_tcon->handle_timeout; vol_info->no_linux_ext = !master_tcon->unix_ext; + vol_info->linux_ext = master_tcon->posix_extensions; vol_info->sectype = master_tcon->ses->sectype; vol_info->sign = master_tcon->ses->sign; + vol_info->seal = master_tcon->seal; rc = cifs_set_vol_auth(vol_info, master_tcon->ses); if (rc) { @@ -5334,10 +5340,6 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) goto out; } - /* if new SMB3.11 POSIX extensions are supported do not remap / and \ */ - if (tcon->posix_extensions) - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS; - if (cap_unix(ses)) reset_cifs_unix_caps(0, tcon, NULL, vol_info); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 4fe757cfc360..be46fab4c96d 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1149,20 +1149,20 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock) /* * Set the byte-range lock (posix style). Returns: - * 1) 0, if we set the lock and don't need to request to the server; - * 2) 1, if we need to request to the server; - * 3) <0, if the error occurs while setting the lock. + * 1) <0, if the error occurs while setting the lock; + * 2) 0, if we set the lock and don't need to request to the server; + * 3) FILE_LOCK_DEFERRED, if we will wait for some other file_lock; + * 4) FILE_LOCK_DEFERRED + 1, if we need to request to the server. */ static int cifs_posix_lock_set(struct file *file, struct file_lock *flock) { struct cifsInodeInfo *cinode = CIFS_I(file_inode(file)); - int rc = 1; + int rc = FILE_LOCK_DEFERRED + 1; if ((flock->fl_flags & FL_POSIX) == 0) return rc; -try_again: cifs_down_write(&cinode->lock_sem); if (!cinode->can_cache_brlcks) { up_write(&cinode->lock_sem); @@ -1171,13 +1171,6 @@ try_again: rc = posix_lock_file(file, flock, NULL); up_write(&cinode->lock_sem); - if (rc == FILE_LOCK_DEFERRED) { - rc = wait_event_interruptible(flock->fl_wait, - list_empty(&flock->fl_blocked_member)); - if (!rc) - goto try_again; - locks_delete_block(flock); - } return rc; } @@ -1652,7 +1645,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, int posix_lock_type; rc = cifs_posix_lock_set(file, flock); - if (!rc || rc < 0) + if (rc <= FILE_LOCK_DEFERRED) return rc; if (type & server->vals->shared_lock_type) @@ -4336,7 +4329,8 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, break; __SetPageLocked(page); - if (add_to_page_cache_locked(page, mapping, page->index, gfp)) { + rc = add_to_page_cache_locked(page, mapping, page->index, gfp); + if (rc) { __ClearPageLocked(page); break; } @@ -4352,6 +4346,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, struct list_head *page_list, unsigned num_pages) { int rc; + int err = 0; struct list_head tmplist; struct cifsFileInfo *open_file = file->private_data; struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); @@ -4396,7 +4391,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, * the order of declining indexes. When we put the pages in * the rdata->pages, then we want them in increasing order. */ - while (!list_empty(page_list)) { + while (!list_empty(page_list) && !err) { unsigned int i, nr_pages, bytes, rsize; loff_t offset; struct page *page, *tpage; @@ -4429,9 +4424,10 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, return 0; } - rc = readpages_get_pages(mapping, page_list, rsize, &tmplist, + nr_pages = 0; + err = readpages_get_pages(mapping, page_list, rsize, &tmplist, &nr_pages, &offset, &bytes); - if (rc) { + if (!nr_pages) { add_credits_and_wake_if(server, credits, 0); break; } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 583f5e4008c2..ce95801e9b66 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -2535,6 +2535,15 @@ set_size_out: if (rc == 0) { cifsInode->server_eof = attrs->ia_size; cifs_setsize(inode, attrs->ia_size); + + /* + * The man page of truncate says if the size changed, + * then the st_ctime and st_mtime fields for the file + * are updated. + */ + attrs->ia_ctime = attrs->ia_mtime = current_time(inode); + attrs->ia_valid |= ATTR_CTIME | ATTR_MTIME; + cifs_truncate_page(inode->i_mapping, inode->i_size); } diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 4a73e63c4d43..dcde44ff6cf9 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -169,6 +169,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) unsigned int xid; struct cifsFileInfo *pSMBFile = filep->private_data; struct cifs_tcon *tcon; + struct tcon_link *tlink; struct cifs_sb_info *cifs_sb; __u64 ExtAttrBits = 0; __u64 caps; @@ -307,13 +308,19 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) break; } cifs_sb = CIFS_SB(inode->i_sb); - tcon = tlink_tcon(cifs_sb_tlink(cifs_sb)); + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + break; + } + tcon = tlink_tcon(tlink); if (tcon && tcon->ses->server->ops->notify) { rc = tcon->ses->server->ops->notify(xid, filep, (void __user *)arg); cifs_dbg(FYI, "ioctl notify rc %d\n", rc); } else rc = -EOPNOTSUPP; + cifs_put_tlink(tlink); break; default: cifs_dbg(FYI, "unsupported ioctl\n"); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 56791a692c8b..e44d049142d0 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -844,28 +844,26 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw) struct bio_vec *bv = NULL; if (iov_iter_is_kvec(iter)) { - memcpy(&ctx->iter, iter, sizeof(struct iov_iter)); + memcpy(&ctx->iter, iter, sizeof(*iter)); ctx->len = count; iov_iter_advance(iter, count); return 0; } - if (max_pages * sizeof(struct bio_vec) <= CIFS_AIO_KMALLOC_LIMIT) - bv = kmalloc_array(max_pages, sizeof(struct bio_vec), - GFP_KERNEL); + if (array_size(max_pages, sizeof(*bv)) <= CIFS_AIO_KMALLOC_LIMIT) + bv = kmalloc_array(max_pages, sizeof(*bv), GFP_KERNEL); if (!bv) { - bv = vmalloc(array_size(max_pages, sizeof(struct bio_vec))); + bv = vmalloc(array_size(max_pages, sizeof(*bv))); if (!bv) return -ENOMEM; } - if (max_pages * sizeof(struct page *) <= CIFS_AIO_KMALLOC_LIMIT) - pages = kmalloc_array(max_pages, sizeof(struct page *), - GFP_KERNEL); + if (array_size(max_pages, sizeof(*pages)) <= CIFS_AIO_KMALLOC_LIMIT) + pages = kmalloc_array(max_pages, sizeof(*pages), GFP_KERNEL); if (!pages) { - pages = vmalloc(array_size(max_pages, sizeof(struct page *))); + pages = vmalloc(array_size(max_pages, sizeof(*pages))); if (!pages) { kvfree(bv); return -ENOMEM; diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 6a39451973f8..157992864ce7 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -354,9 +354,13 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_sync_hdr *shdr) ((struct smb2_ioctl_rsp *)shdr)->OutputCount); break; case SMB2_CHANGE_NOTIFY: + *off = le16_to_cpu( + ((struct smb2_change_notify_rsp *)shdr)->OutputBufferOffset); + *len = le32_to_cpu( + ((struct smb2_change_notify_rsp *)shdr)->OutputBufferLength); + break; default: - /* BB FIXME for unimplemented cases above */ - cifs_dbg(VFS, "no length check for command\n"); + cifs_dbg(VFS, "no length check for command %d\n", le16_to_cpu(shdr->Command)); break; } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 736d86b8a910..32f90dc82c84 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -763,6 +763,7 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, /* close extra handle outside of crit sec */ SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); } + rc = 0; goto oshr_free; } @@ -2147,7 +2148,7 @@ smb3_notify(const unsigned int xid, struct file *pfile, tcon = cifs_sb_master_tcon(cifs_sb); oparms.tcon = tcon; - oparms.desired_access = FILE_READ_ATTRIBUTES; + oparms.desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA; oparms.disposition = FILE_OPEN; oparms.create_options = cifs_create_options(cifs_sb, 0); oparms.fid = &fid; @@ -3187,6 +3188,11 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, trace_smb3_zero_enter(xid, cfile->fid.persistent_fid, tcon->tid, ses->Suid, offset, len); + /* + * We zero the range through ioctl, so we need remove the page caches + * first, otherwise the data may be inconsistent with the server. + */ + truncate_pagecache_range(inode, offset, offset + len - 1); /* if file not oplocked can't be sure whether asking to extend size */ if (!CIFS_CACHE_READ(cifsi)) @@ -3253,6 +3259,12 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, return rc; } + /* + * We implement the punch hole through ioctl, so we need remove the page + * caches first, otherwise the data may be inconsistent with the server. + */ + truncate_pagecache_range(inode, offset, offset + len - 1); + cifs_dbg(FYI, "Offset %lld len %lld\n", offset, len); fsctl_buf.FileOffset = cpu_to_le64(offset); diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index d11e31064679..84433d0653f9 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -523,7 +523,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, const int timeout, const int flags, unsigned int *instance) { - int rc; + long rc; int *credits; int optype; long int t; diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index e9e27a271af0..feaa5e182b7b 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -51,6 +51,7 @@ static ssize_t efivarfs_file_write(struct file *file, } else { inode_lock(inode); i_size_write(inode, datasize + sizeof(attributes)); + inode->i_mtime = current_time(inode); inode_unlock(inode); } @@ -72,10 +73,8 @@ static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf, ssize_t size = 0; int err; - while (!__ratelimit(&file->f_cred->user->ratelimit)) { - if (!msleep_interruptible(50)) - return -EINTR; - } + while (!__ratelimit(&file->f_cred->user->ratelimit)) + msleep(50); err = efivar_entry_size(var, &datasize); diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 12c66f5d92dd..28bb5689333a 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -201,6 +201,9 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_d_op = &efivarfs_d_ops; sb->s_time_gran = 1; + if (!efivar_supports_writes()) + sb->s_flags |= SB_RDONLY; + inode = efivarfs_get_inode(sb, NULL, S_IFDIR | 0755, 0, true); if (!inode) return -ENOMEM; @@ -252,9 +255,6 @@ static struct file_system_type efivarfs_type = { static __init int efivarfs_init(void) { - if (!efi_rt_services_supported(EFI_RT_SUPPORTED_VARIABLE_SERVICES)) - return -ENODEV; - if (!efivars_kobject()) return -ENODEV; diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 7824f5563a55..9b66c28b3ae9 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -144,22 +144,22 @@ static inline void z_erofs_onlinepage_init(struct page *page) static inline void z_erofs_onlinepage_fixup(struct page *page, uintptr_t index, bool down) { - unsigned long *p, o, v, id; -repeat: - p = &page_private(page); - o = READ_ONCE(*p); + union z_erofs_onlinepage_converter u = { .v = &page_private(page) }; + int orig, orig_index, val; - id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; - if (id) { +repeat: + orig = atomic_read(u.o); + orig_index = orig >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; + if (orig_index) { if (!index) return; - DBG_BUGON(id != index); + DBG_BUGON(orig_index != index); } - v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) | - ((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down); - if (cmpxchg(p, o, v) != o) + val = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) | + ((orig & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down); + if (atomic_cmpxchg(u.o, orig, val) != orig) goto repeat; } diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index de43534aa299..119abf0d8dd6 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -309,7 +309,7 @@ const struct file_operations exfat_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate = exfat_iterate, - .fsync = generic_file_fsync, + .fsync = exfat_file_fsync, }; int exfat_alloc_new_dir(struct inode *inode, struct exfat_chain *clu) @@ -425,10 +425,12 @@ static void exfat_init_name_entry(struct exfat_dentry *ep, ep->dentry.name.flags = 0x0; for (i = 0; i < EXFAT_FILE_NAME_LEN; i++) { - ep->dentry.name.unicode_0_14[i] = cpu_to_le16(*uniname); - if (*uniname == 0x0) - break; - uniname++; + if (*uniname != 0x0) { + ep->dentry.name.unicode_0_14[i] = cpu_to_le16(*uniname); + uniname++; + } else { + ep->dentry.name.unicode_0_14[i] = 0x0; + } } } @@ -1110,7 +1112,7 @@ found: ret = exfat_get_next_cluster(sb, &clu.dir); } - if (ret || clu.dir != EXFAT_EOF_CLUSTER) { + if (ret || clu.dir == EXFAT_EOF_CLUSTER) { /* just initialized hint_stat */ hint_stat->clu = p_dir->dir; hint_stat->eidx = 0; diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 595f3117f492..75c7bdbeba6d 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -371,7 +371,7 @@ static inline bool exfat_is_last_sector_in_cluster(struct exfat_sb_info *sbi, static inline sector_t exfat_cluster_to_sector(struct exfat_sb_info *sbi, unsigned int clus) { - return ((clus - EXFAT_RESERVED_CLUSTERS) << sbi->sect_per_clus_bits) + + return ((sector_t)(clus - EXFAT_RESERVED_CLUSTERS) << sbi->sect_per_clus_bits) + sbi->data_start_sector; } @@ -420,6 +420,7 @@ void exfat_truncate(struct inode *inode, loff_t size); int exfat_setattr(struct dentry *dentry, struct iattr *attr); int exfat_getattr(const struct path *path, struct kstat *stat, unsigned int request_mask, unsigned int query_flags); +int exfat_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); /* namei.c */ extern const struct dentry_operations exfat_dentry_ops; diff --git a/fs/exfat/file.c b/fs/exfat/file.c index fce03f318787..a6a063830edc 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -6,6 +6,7 @@ #include <linux/slab.h> #include <linux/cred.h> #include <linux/buffer_head.h> +#include <linux/blkdev.h> #include "exfat_raw.h" #include "exfat_fs.h" @@ -175,7 +176,7 @@ int __exfat_truncate(struct inode *inode, loff_t new_size) ep2->dentry.stream.size = 0; } else { ep2->dentry.stream.valid_size = cpu_to_le64(new_size); - ep2->dentry.stream.size = ep->dentry.stream.valid_size; + ep2->dentry.stream.size = ep2->dentry.stream.valid_size; } if (new_size == 0) { @@ -346,12 +347,28 @@ out: return error; } +int exfat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) +{ + struct inode *inode = filp->f_mapping->host; + int err; + + err = __generic_file_fsync(filp, start, end, datasync); + if (err) + return err; + + err = sync_blockdev(inode->i_sb->s_bdev); + if (err) + return err; + + return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); +} + const struct file_operations exfat_file_operations = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, - .fsync = generic_file_fsync, + .fsync = exfat_file_fsync, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, }; diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 5b0f35329d63..2b9e21094a96 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -975,7 +975,6 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry) goto unlock; } - exfat_set_vol_flags(sb, VOL_DIRTY); exfat_chain_set(&clu_to_free, ei->start_clu, EXFAT_B_TO_CLU_ROUND_UP(i_size_read(inode), sbi), ei->flags); @@ -1002,6 +1001,7 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry) num_entries++; brelse(bh); + exfat_set_vol_flags(sb, VOL_DIRTY); err = exfat_remove_entries(dir, &cdir, entry, 0, num_entries); if (err) { exfat_err(sb, "failed to exfat_remove_entries : err(%d)", err); @@ -1077,10 +1077,14 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir, epold = exfat_get_dentry(sb, p_dir, oldentry + 1, &old_bh, §or_old); + if (!epold) + return -EIO; epnew = exfat_get_dentry(sb, p_dir, newentry + 1, &new_bh, §or_new); - if (!epold || !epnew) + if (!epnew) { + brelse(old_bh); return -EIO; + } memcpy(epnew, epold, DENTRY_SIZE); exfat_update_bh(sb, new_bh, sync); @@ -1161,10 +1165,14 @@ static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir, epmov = exfat_get_dentry(sb, p_olddir, oldentry + 1, &mov_bh, §or_mov); + if (!epmov) + return -EIO; epnew = exfat_get_dentry(sb, p_newdir, newentry + 1, &new_bh, §or_new); - if (!epmov || !epnew) + if (!epnew) { + brelse(mov_bh); return -EIO; + } memcpy(epnew, epmov, DENTRY_SIZE); exfat_update_bh(sb, new_bh, IS_DIRSYNC(inode)); diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c index 57b5a7a4d1f7..a3c927501e67 100644 --- a/fs/exfat/nls.c +++ b/fs/exfat/nls.c @@ -495,7 +495,7 @@ static int exfat_utf8_to_utf16(struct super_block *sb, struct exfat_uni_name *p_uniname, int *p_lossy) { int i, unilen, lossy = NLS_NAME_NO_LOSSY; - unsigned short upname[MAX_NAME_LENGTH + 1]; + __le16 upname[MAX_NAME_LENGTH + 1]; unsigned short *uniname = p_uniname->name; WARN_ON(!len); @@ -519,7 +519,7 @@ static int exfat_utf8_to_utf16(struct super_block *sb, exfat_wstrchr(bad_uni_chars, *uniname)) lossy |= NLS_NAME_LOSSY; - upname[i] = exfat_toupper(sb, *uniname); + upname[i] = cpu_to_le16(exfat_toupper(sb, *uniname)); uniname++; } @@ -597,7 +597,7 @@ static int exfat_nls_to_ucs2(struct super_block *sb, struct exfat_uni_name *p_uniname, int *p_lossy) { int i = 0, unilen = 0, lossy = NLS_NAME_NO_LOSSY; - unsigned short upname[MAX_NAME_LENGTH + 1]; + __le16 upname[MAX_NAME_LENGTH + 1]; unsigned short *uniname = p_uniname->name; struct nls_table *nls = EXFAT_SB(sb)->nls_io; @@ -611,7 +611,7 @@ static int exfat_nls_to_ucs2(struct super_block *sb, exfat_wstrchr(bad_uni_chars, *uniname)) lossy |= NLS_NAME_LOSSY; - upname[unilen] = exfat_toupper(sb, *uniname); + upname[unilen] = cpu_to_le16(exfat_toupper(sb, *uniname)); uniname++; unilen++; } diff --git a/fs/exfat/super.c b/fs/exfat/super.c index e650e65536f8..253a92460d52 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -693,10 +693,20 @@ static void exfat_free(struct fs_context *fc) } } +static int exfat_reconfigure(struct fs_context *fc) +{ + fc->sb_flags |= SB_NODIRATIME; + + /* volume flag will be updated in exfat_sync_fs */ + sync_filesystem(fc->root->d_sb); + return 0; +} + static const struct fs_context_operations exfat_context_ops = { .parse_param = exfat_parse_param, .get_tree = exfat_get_tree, .free = exfat_free, + .reconfigure = exfat_reconfigure, }; static int exfat_init_fs_context(struct fs_context *fc) diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 4ccb3c9189d8..2e42f47a7f98 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -9,7 +9,8 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \ indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \ mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \ - super.o symlink.o sysfs.o xattr.o xattr_trusted.o xattr_user.o + super.o symlink.o sysfs.o xattr.o xattr_hurd.o xattr_trusted.o \ + xattr_user.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index c654205f648d..1d82336b1cd4 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -675,6 +675,7 @@ static int ext4_d_compare(const struct dentry *dentry, unsigned int len, struct qstr qstr = {.name = str, .len = len }; const struct dentry *parent = READ_ONCE(dentry->d_parent); const struct inode *inode = READ_ONCE(parent->d_inode); + char strbuf[DNAME_INLINE_LEN]; if (!inode || !IS_CASEFOLDED(inode) || !EXT4_SB(inode->i_sb)->s_encoding) { @@ -683,6 +684,21 @@ static int ext4_d_compare(const struct dentry *dentry, unsigned int len, return memcmp(str, name->name, len); } + /* + * If the dentry name is stored in-line, then it may be concurrently + * modified by a rename. If this happens, the VFS will eventually retry + * the lookup, so it doesn't matter what ->d_compare() returns. + * However, it's unsafe to call utf8_strncasecmp() with an unstable + * string. Therefore, we have to copy the name into a temporary buffer. + */ + if (len <= DNAME_INLINE_LEN - 1) { + memcpy(strbuf, str, len); + strbuf[len] = 0; + qstr.name = strbuf; + /* prevent compiler from optimizing out the temporary buffer */ + barrier(); + } + return ext4_ci_compare(inode, name, &qstr, false); } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b08841f70b69..42f5060f3cdf 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -426,13 +426,16 @@ struct flex_groups { #define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */ #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ /* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */ + +#define EXT4_DAX_FL 0x02000000 /* Inode is DAX */ + #define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ #define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded directory */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ -#define EXT4_FL_USER_VISIBLE 0x705BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x604BC0FF /* User modifiable flags */ +#define EXT4_FL_USER_VISIBLE 0x725BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE 0x624BC0FF /* User modifiable flags */ /* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */ #define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \ @@ -440,14 +443,16 @@ struct flex_groups { EXT4_APPEND_FL | \ EXT4_NODUMP_FL | \ EXT4_NOATIME_FL | \ - EXT4_PROJINHERIT_FL) + EXT4_PROJINHERIT_FL | \ + EXT4_DAX_FL) /* Flags that should be inherited by new inodes from their parent. */ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\ - EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL) + EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL |\ + EXT4_DAX_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). */ #define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\ @@ -459,6 +464,10 @@ struct flex_groups { /* The only flags that should be swapped */ #define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL) +/* Flags which are mutually exclusive to DAX */ +#define EXT4_DAX_MUT_EXCL (EXT4_VERITY_FL | EXT4_ENCRYPT_FL |\ + EXT4_JOURNAL_DATA_FL) + /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) { @@ -499,6 +508,7 @@ enum { EXT4_INODE_VERITY = 20, /* Verity protected inode */ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ /* 22 was formerly EXT4_INODE_EOFBLOCKS */ + EXT4_INODE_DAX = 25, /* Inode is DAX */ EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */ EXT4_INODE_CASEFOLD = 30, /* Casefolded directory */ @@ -1135,9 +1145,9 @@ struct ext4_inode_info { #define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ #ifdef CONFIG_FS_DAX -#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */ +#define EXT4_MOUNT_DAX_ALWAYS 0x00200 /* Direct Access */ #else -#define EXT4_MOUNT_DAX 0 +#define EXT4_MOUNT_DAX_ALWAYS 0 #endif #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ #define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ @@ -1180,6 +1190,8 @@ struct ext4_inode_info { blocks */ #define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated file systems */ +#define EXT4_MOUNT2_DAX_NEVER 0x00000008 /* Do not allow Direct Access */ +#define EXT4_MOUNT2_DAX_INODE 0x00000010 /* For printing options only */ #define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly specified journal checksum */ @@ -1992,6 +2004,7 @@ static inline bool ext4_has_incompat_features(struct super_block *sb) */ #define EXT4_FLAGS_RESIZING 0 #define EXT4_FLAGS_SHUTDOWN 1 +#define EXT4_FLAGS_BDEV_IS_DAX 2 static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi) { @@ -2705,7 +2718,7 @@ extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); extern int ext4_break_layouts(struct inode *); extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); -extern void ext4_set_inode_flags(struct inode *); +extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7d088ff1e902..221f240eae60 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2844,7 +2844,7 @@ again: * in use to avoid freeing it when removing blocks. */ if (sbi->s_cluster_ratio > 1) { - pblk = ext4_ext_pblock(ex) + end - ee_block + 2; + pblk = ext4_ext_pblock(ex) + end - ee_block + 1; partial.pclu = EXT4_B2C(sbi, pblk); partial.state = nofree; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 54d324e80fe5..df25d38d6539 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1116,7 +1116,7 @@ got: ei->i_block_group = group; ei->i_last_alloc_group = ~0; - ext4_set_inode_flags(inode); + ext4_set_inode_flags(inode, true); if (IS_DIRSYNC(inode)) ext4_handle_sync(handle); if (insert_inode_locked(inode) < 0) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 40ec5c7ef0d3..10dd470876b3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4403,9 +4403,11 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) !ext4_test_inode_state(inode, EXT4_STATE_XATTR)); } -static bool ext4_should_use_dax(struct inode *inode) +static bool ext4_should_enable_dax(struct inode *inode) { - if (!test_opt(inode->i_sb, DAX)) + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + if (test_opt2(inode->i_sb, DAX_NEVER)) return false; if (!S_ISREG(inode->i_mode)) return false; @@ -4417,14 +4419,21 @@ static bool ext4_should_use_dax(struct inode *inode) return false; if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY)) return false; - return true; + if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) + return false; + if (test_opt(inode->i_sb, DAX_ALWAYS)) + return true; + + return ext4_test_inode_flag(inode, EXT4_INODE_DAX); } -void ext4_set_inode_flags(struct inode *inode) +void ext4_set_inode_flags(struct inode *inode, bool init) { unsigned int flags = EXT4_I(inode)->i_flags; unsigned int new_fl = 0; + WARN_ON_ONCE(IS_DAX(inode) && init); + if (flags & EXT4_SYNC_FL) new_fl |= S_SYNC; if (flags & EXT4_APPEND_FL) @@ -4435,8 +4444,13 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; - if (ext4_should_use_dax(inode)) + + /* Because of the way inode_set_flags() works we must preserve S_DAX + * here if already set. */ + new_fl |= (inode->i_flags & S_DAX); + if (init && ext4_should_enable_dax(inode)) new_fl |= S_DAX; + if (flags & EXT4_ENCRYPT_FL) new_fl |= S_ENCRYPTED; if (flags & EXT4_CASEFOLD_FL) @@ -4650,7 +4664,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, * not initialized on a new filesystem. */ } ei->i_flags = le32_to_cpu(raw_inode->i_flags); - ext4_set_inode_flags(inode); + ext4_set_inode_flags(inode, true); inode->i_blocks = ext4_inode_blocks(raw_inode, ei); ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); if (ext4_has_feature_64bit(sb)) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 2162db0c747d..999cf6add39c 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -292,6 +292,38 @@ static int ext4_ioctl_check_immutable(struct inode *inode, __u32 new_projid, return 0; } +static void ext4_dax_dontcache(struct inode *inode, unsigned int flags) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + if (S_ISDIR(inode->i_mode)) + return; + + if (test_opt2(inode->i_sb, DAX_NEVER) || + test_opt(inode->i_sb, DAX_ALWAYS)) + return; + + if ((ei->i_flags ^ flags) & EXT4_DAX_FL) + d_mark_dontcache(inode); +} + +static bool dax_compatible(struct inode *inode, unsigned int oldflags, + unsigned int flags) +{ + if (flags & EXT4_DAX_FL) { + if ((oldflags & EXT4_DAX_MUT_EXCL) || + ext4_test_inode_state(inode, + EXT4_STATE_VERITY_IN_PROGRESS)) { + return false; + } + } + + if ((flags & EXT4_DAX_MUT_EXCL) && (oldflags & EXT4_DAX_FL)) + return false; + + return true; +} + static int ext4_ioctl_setflags(struct inode *inode, unsigned int flags) { @@ -300,7 +332,6 @@ static int ext4_ioctl_setflags(struct inode *inode, int err = -EPERM, migrate = 0; struct ext4_iloc iloc; unsigned int oldflags, mask, i; - unsigned int jflag; struct super_block *sb = inode->i_sb; /* Is it quota file? Do not allow user to mess with it */ @@ -309,9 +340,6 @@ static int ext4_ioctl_setflags(struct inode *inode, oldflags = ei->i_flags; - /* The JOURNAL_DATA flag is modifiable only by root */ - jflag = flags & EXT4_JOURNAL_DATA_FL; - err = vfs_ioc_setflags_prepare(inode, oldflags, flags); if (err) goto flags_out; @@ -320,10 +348,16 @@ static int ext4_ioctl_setflags(struct inode *inode, * The JOURNAL_DATA flag can only be changed by * the relevant capability. */ - if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { + if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { if (!capable(CAP_SYS_RESOURCE)) goto flags_out; } + + if (!dax_compatible(inode, oldflags, flags)) { + err = -EOPNOTSUPP; + goto flags_out; + } + if ((flags ^ oldflags) & EXT4_EXTENTS_FL) migrate = 1; @@ -369,6 +403,8 @@ static int ext4_ioctl_setflags(struct inode *inode, if (err) goto flags_err; + ext4_dax_dontcache(inode, flags); + for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { if (!(mask & EXT4_FL_USER_MODIFIABLE)) continue; @@ -381,7 +417,8 @@ static int ext4_ioctl_setflags(struct inode *inode, ext4_clear_inode_flag(inode, i); } - ext4_set_inode_flags(inode); + ext4_set_inode_flags(inode, false); + inode->i_ctime = current_time(inode); err = ext4_mark_iloc_dirty(handle, inode, &iloc); @@ -390,17 +427,18 @@ flags_err: if (err) goto flags_out; - if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { + if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { /* * Changes to the journaling mode can cause unsafe changes to - * S_DAX if we are using the DAX mount option. + * S_DAX if the inode is DAX */ - if (test_opt(inode->i_sb, DAX)) { + if (IS_DAX(inode)) { err = -EBUSY; goto flags_out; } - err = ext4_change_inode_journal_flag(inode, jflag); + err = ext4_change_inode_journal_flag(inode, + flags & EXT4_JOURNAL_DATA_FL); if (err) goto flags_out; } @@ -527,12 +565,15 @@ static inline __u32 ext4_iflags_to_xflags(unsigned long iflags) xflags |= FS_XFLAG_NOATIME; if (iflags & EXT4_PROJINHERIT_FL) xflags |= FS_XFLAG_PROJINHERIT; + if (iflags & EXT4_DAX_FL) + xflags |= FS_XFLAG_DAX; return xflags; } #define EXT4_SUPPORTED_FS_XFLAGS (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | \ FS_XFLAG_APPEND | FS_XFLAG_NODUMP | \ - FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT) + FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT | \ + FS_XFLAG_DAX) /* Transfer xflags flags to internal */ static inline unsigned long ext4_xflags_to_iflags(__u32 xflags) @@ -551,6 +592,8 @@ static inline unsigned long ext4_xflags_to_iflags(__u32 xflags) iflags |= EXT4_NOATIME_FL; if (xflags & FS_XFLAG_PROJINHERIT) iflags |= EXT4_PROJINHERIT_FL; + if (xflags & FS_XFLAG_DAX) + iflags |= EXT4_DAX_FL; return iflags; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index a9083113a8c0..c0a331e2feb0 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4708,7 +4708,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, } ac->ac_op = EXT4_MB_HISTORY_PREALLOC; - seq = *this_cpu_ptr(&discard_pa_seq); + seq = this_cpu_read(discard_pa_seq); if (!ext4_mb_use_preallocated(ac)) { ac->ac_op = EXT4_MB_HISTORY_ALLOC; ext4_mb_normalize_request(ac, ar); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c668f6b42374..330957ed1f05 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -522,9 +522,6 @@ static void ext4_handle_error(struct super_block *sb) smp_wmb(); sb->s_flags |= SB_RDONLY; } else if (test_opt(sb, ERRORS_PANIC)) { - if (EXT4_SB(sb)->s_journal && - !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR)) - return; panic("EXT4-fs (device %s): panic forced after error\n", sb->s_id); } @@ -725,23 +722,20 @@ void __ext4_abort(struct super_block *sb, const char *function, va_end(args); if (sb_rdonly(sb) == 0) { - ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; + if (EXT4_SB(sb)->s_journal) + jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); + + ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); /* * Make sure updated value of ->s_mount_flags will be visible * before ->s_flags update */ smp_wmb(); sb->s_flags |= SB_RDONLY; - if (EXT4_SB(sb)->s_journal) - jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); } - if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) { - if (EXT4_SB(sb)->s_journal && - !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR)) - return; + if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) panic("EXT4-fs panic from previous error\n"); - } } void __ext4_msg(struct super_block *sb, @@ -1324,6 +1318,9 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode))) return -EINVAL; + if (ext4_test_inode_flag(inode, EXT4_INODE_DAX)) + return -EOPNOTSUPP; + res = ext4_convert_inline_data(inode); if (res) return res; @@ -1349,7 +1346,7 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, * Update inode->i_flags - S_ENCRYPTED will be enabled, * S_DAX may be disabled */ - ext4_set_inode_flags(inode); + ext4_set_inode_flags(inode, false); } return res; } @@ -1376,7 +1373,7 @@ retry: * Update inode->i_flags - S_ENCRYPTED will be enabled, * S_DAX may be disabled */ - ext4_set_inode_flags(inode); + ext4_set_inode_flags(inode, false); res = ext4_mark_inode_dirty(handle, inode); if (res) EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); @@ -1514,7 +1511,8 @@ enum { Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, - Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, Opt_dax, + Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, + Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error, Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_lazytime, Opt_nolazytime, Opt_debug_want_extra_isize, @@ -1581,6 +1579,9 @@ static const match_table_t tokens = { {Opt_nobarrier, "nobarrier"}, {Opt_i_version, "i_version"}, {Opt_dax, "dax"}, + {Opt_dax_always, "dax=always"}, + {Opt_dax_inode, "dax=inode"}, + {Opt_dax_never, "dax=never"}, {Opt_stripe, "stripe=%u"}, {Opt_delalloc, "delalloc"}, {Opt_warn_on_error, "warn_on_error"}, @@ -1729,6 +1730,7 @@ static int clear_qf_name(struct super_block *sb, int qtype) #define MOPT_NO_EXT3 0x0200 #define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3) #define MOPT_STRING 0x0400 +#define MOPT_SKIP 0x0800 static const struct mount_opts { int token; @@ -1778,7 +1780,13 @@ static const struct mount_opts { {Opt_min_batch_time, 0, MOPT_GTE0}, {Opt_inode_readahead_blks, 0, MOPT_GTE0}, {Opt_init_itable, 0, MOPT_GTE0}, - {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET}, + {Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET | MOPT_SKIP}, + {Opt_dax_always, EXT4_MOUNT_DAX_ALWAYS, + MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP}, + {Opt_dax_inode, EXT4_MOUNT2_DAX_INODE, + MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP}, + {Opt_dax_never, EXT4_MOUNT2_DAX_NEVER, + MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP}, {Opt_stripe, 0, MOPT_GTE0}, {Opt_resuid, 0, MOPT_GTE0}, {Opt_resgid, 0, MOPT_GTE0}, @@ -2123,13 +2131,56 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, } sbi->s_jquota_fmt = m->mount_opt; #endif - } else if (token == Opt_dax) { + } else if (token == Opt_dax || token == Opt_dax_always || + token == Opt_dax_inode || token == Opt_dax_never) { #ifdef CONFIG_FS_DAX - ext4_msg(sb, KERN_WARNING, - "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - sbi->s_mount_opt |= m->mount_opt; + switch (token) { + case Opt_dax: + case Opt_dax_always: + if (is_remount && + (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) || + (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) { + fail_dax_change_remount: + ext4_msg(sb, KERN_ERR, "can't change " + "dax mount option while remounting"); + return -1; + } + if (is_remount && + (test_opt(sb, DATA_FLAGS) == + EXT4_MOUNT_JOURNAL_DATA)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dax"); + return -1; + } + ext4_msg(sb, KERN_WARNING, + "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); + sbi->s_mount_opt |= EXT4_MOUNT_DAX_ALWAYS; + sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER; + break; + case Opt_dax_never: + if (is_remount && + (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) || + (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS))) + goto fail_dax_change_remount; + sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER; + sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS; + break; + case Opt_dax_inode: + if (is_remount && + ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) || + (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) || + !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE))) + goto fail_dax_change_remount; + sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS; + sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER; + /* Strictly for printing options */ + sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_INODE; + break; + } #else ext4_msg(sb, KERN_INFO, "dax option not supported"); + sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER; + sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS; return -1; #endif } else if (token == Opt_data_err_abort) { @@ -2293,7 +2344,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, for (m = ext4_mount_opts; m->token != Opt_err; m++) { int want_set = m->flags & MOPT_SET; if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || - (m->flags & MOPT_CLEAR_ERR)) + (m->flags & MOPT_CLEAR_ERR) || m->flags & MOPT_SKIP) continue; if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt))) continue; /* skip if same as the default */ @@ -2353,6 +2404,17 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, fscrypt_show_test_dummy_encryption(seq, sep, sb); + if (test_opt(sb, DAX_ALWAYS)) { + if (IS_EXT2_SB(sb)) + SEQ_OPTS_PUTS("dax"); + else + SEQ_OPTS_PUTS("dax=always"); + } else if (test_opt2(sb, DAX_NEVER)) { + SEQ_OPTS_PUTS("dax=never"); + } else if (test_opt2(sb, DAX_INODE)) { + SEQ_OPTS_PUTS("dax=inode"); + } + ext4_show_quota_options(seq, sb); return 0; } @@ -2383,6 +2445,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, ext4_msg(sb, KERN_ERR, "revision level too high, " "forcing read-only mode"); err = -EROFS; + goto done; } if (read_only) goto done; @@ -4017,7 +4080,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "both data=journal and delalloc"); goto failed_mount; } - if (test_opt(sb, DAX)) { + if (test_opt(sb, DAX_ALWAYS)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and dax"); goto failed_mount; @@ -4127,13 +4190,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } - if (sbi->s_mount_opt & EXT4_MOUNT_DAX) { + if (bdev_dax_supported(sb->s_bdev, blocksize)) + set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); + + if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { if (ext4_has_feature_inline_data(sb)) { ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" " that may contain inline data"); goto failed_mount; } - if (!bdev_dax_supported(sb->s_bdev, blocksize)) { + if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) { ext4_msg(sb, KERN_ERR, "DAX unsupported by block device."); goto failed_mount; @@ -5447,12 +5513,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) err = -EINVAL; goto restore_opts; } - if (test_opt(sb, DAX)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "both data=journal and dax"); - err = -EINVAL; - goto restore_opts; - } } else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) { if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ext4_msg(sb, KERN_ERR, "can't mount with " @@ -5468,12 +5528,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) { - ext4_msg(sb, KERN_WARNING, "warning: refusing change of " - "dax flag with busy inodes while remounting"); - sbi->s_mount_opt ^= EXT4_MOUNT_DAX; - } - if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) ext4_abort(sb, EXT4_ERR_ESHUTDOWN, "Abort forced by user"); diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index dec1244dd062..bbd5e7e0632b 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -113,6 +113,9 @@ static int ext4_begin_enable_verity(struct file *filp) handle_t *handle; int err; + if (IS_DAX(inode) || ext4_test_inode_flag(inode, EXT4_INODE_DAX)) + return -EINVAL; + if (ext4_verity_in_progress(inode)) return -EBUSY; @@ -241,7 +244,7 @@ static int ext4_end_enable_verity(struct file *filp, const void *desc, if (err) goto out_stop; ext4_set_inode_flag(inode, EXT4_INODE_VERITY); - ext4_set_inode_flags(inode); + ext4_set_inode_flags(inode, false); err = ext4_mark_iloc_dirty(handle, inode, &iloc); } out_stop: diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 9b29a40738ac..7d2f6576d954 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -93,6 +93,7 @@ static const struct xattr_handler * const ext4_xattr_handler_map[] = { #ifdef CONFIG_EXT4_FS_SECURITY [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler, #endif + [EXT4_XATTR_INDEX_HURD] = &ext4_xattr_hurd_handler, }; const struct xattr_handler *ext4_xattr_handlers[] = { @@ -105,6 +106,7 @@ const struct xattr_handler *ext4_xattr_handlers[] = { #ifdef CONFIG_EXT4_FS_SECURITY &ext4_xattr_security_handler, #endif + &ext4_xattr_hurd_handler, NULL }; diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index ffe21ac77f78..730b91fa0dd7 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -124,6 +124,7 @@ struct ext4_xattr_inode_array { extern const struct xattr_handler ext4_xattr_user_handler; extern const struct xattr_handler ext4_xattr_trusted_handler; extern const struct xattr_handler ext4_xattr_security_handler; +extern const struct xattr_handler ext4_xattr_hurd_handler; #define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c" diff --git a/fs/ext4/xattr_hurd.c b/fs/ext4/xattr_hurd.c new file mode 100644 index 000000000000..8cfa74a56361 --- /dev/null +++ b/fs/ext4/xattr_hurd.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/ext4/xattr_hurd.c + * Handler for extended gnu attributes for the Hurd. + * + * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org> + * Copyright (C) 2020 by Jan (janneke) Nieuwenhuizen, <janneke@gnu.org> + */ + +#include <linux/init.h> +#include <linux/string.h> +#include "ext4.h" +#include "xattr.h" + +static bool +ext4_xattr_hurd_list(struct dentry *dentry) +{ + return test_opt(dentry->d_sb, XATTR_USER); +} + +static int +ext4_xattr_hurd_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + if (!test_opt(inode->i_sb, XATTR_USER)) + return -EOPNOTSUPP; + + return ext4_xattr_get(inode, EXT4_XATTR_INDEX_HURD, + name, buffer, size); +} + +static int +ext4_xattr_hurd_set(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + if (!test_opt(inode->i_sb, XATTR_USER)) + return -EOPNOTSUPP; + + return ext4_xattr_set(inode, EXT4_XATTR_INDEX_HURD, + name, value, size, flags); +} + +const struct xattr_handler ext4_xattr_hurd_handler = { + .prefix = XATTR_HURD_PREFIX, + .list = ext4_xattr_hurd_list, + .get = ext4_xattr_hurd_get, + .set = ext4_xattr_hurd_set, +}; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e573b0cd2737..83d917f7e542 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -18,6 +18,7 @@ #include <linux/swap.h> #include <linux/falloc.h> #include <linux/uio.h> +#include <linux/fs.h> static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, struct fuse_page_desc **desc) @@ -1586,7 +1587,6 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - rb_erase(&wpa->writepages_entry, &fi->writepages); for (i = 0; i < ap->num_pages; i++) { dec_wb_stat(&bdi->wb, WB_WRITEBACK); dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP); @@ -1637,6 +1637,7 @@ __acquires(fi->lock) out_free: fi->writectr--; + rb_erase(&wpa->writepages_entry, &fi->writepages); fuse_writepage_finish(fc, wpa); spin_unlock(&fi->lock); @@ -1674,7 +1675,8 @@ __acquires(fi->lock) } } -static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa) +static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root, + struct fuse_writepage_args *wpa) { pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT; pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1; @@ -1697,11 +1699,17 @@ static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa) else if (idx_to < curr_index) p = &(*p)->rb_left; else - return (void) WARN_ON(true); + return curr; } rb_link_node(&wpa->writepages_entry, parent, p); rb_insert_color(&wpa->writepages_entry, root); + return NULL; +} + +static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa) +{ + WARN_ON(fuse_insert_writeback(root, wpa)); } static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args, @@ -1714,6 +1722,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args, mapping_set_error(inode->i_mapping, error); spin_lock(&fi->lock); + rb_erase(&wpa->writepages_entry, &fi->writepages); while (wpa->next) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_write_in *inarg = &wpa->ia.write.in; @@ -1952,14 +1961,14 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) } /* - * First recheck under fi->lock if the offending offset is still under - * writeback. If yes, then iterate auxiliary write requests, to see if there's + * Check under fi->lock if the page is under writeback, and insert it onto the + * rb_tree if not. Otherwise iterate auxiliary write requests, to see if there's * one already added for a page at this offset. If there's none, then insert * this new request onto the auxiliary list, otherwise reuse the existing one by - * copying the new page contents over to the old temporary page. + * swapping the new temp page with the old one. */ -static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa, - struct page *page) +static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, + struct page *page) { struct fuse_inode *fi = get_fuse_inode(new_wpa->inode); struct fuse_writepage_args *tmp; @@ -1967,17 +1976,15 @@ static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa, struct fuse_args_pages *new_ap = &new_wpa->ia.ap; WARN_ON(new_ap->num_pages != 0); + new_ap->num_pages = 1; spin_lock(&fi->lock); - rb_erase(&new_wpa->writepages_entry, &fi->writepages); - old_wpa = fuse_find_writeback(fi, page->index, page->index); + old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa); if (!old_wpa) { - tree_insert(&fi->writepages, new_wpa); spin_unlock(&fi->lock); - return false; + return true; } - new_ap->num_pages = 1; for (tmp = old_wpa->next; tmp; tmp = tmp->next) { pgoff_t curr_index; @@ -2006,7 +2013,41 @@ static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa, fuse_writepage_free(new_wpa); } - return true; + return false; +} + +static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, + struct fuse_args_pages *ap, + struct fuse_fill_wb_data *data) +{ + WARN_ON(!ap->num_pages); + + /* + * Being under writeback is unlikely but possible. For example direct + * read to an mmaped fuse file will set the page dirty twice; once when + * the pages are faulted with get_user_pages(), and then after the read + * completed. + */ + if (fuse_page_is_writeback(data->inode, page->index)) + return true; + + /* Reached max pages */ + if (ap->num_pages == fc->max_pages) + return true; + + /* Reached max write bytes */ + if ((ap->num_pages + 1) * PAGE_SIZE > fc->max_write) + return true; + + /* Discontinuity */ + if (data->orig_pages[ap->num_pages - 1]->index + 1 != page->index) + return true; + + /* Need to grow the pages array? If so, did the expansion fail? */ + if (ap->num_pages == data->max_pages && !fuse_pages_realloc(data)) + return true; + + return false; } static int fuse_writepages_fill(struct page *page, @@ -2019,7 +2060,6 @@ static int fuse_writepages_fill(struct page *page, struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); struct page *tmp_page; - bool is_writeback; int err; if (!data->ff) { @@ -2029,25 +2069,9 @@ static int fuse_writepages_fill(struct page *page, goto out_unlock; } - /* - * Being under writeback is unlikely but possible. For example direct - * read to an mmaped fuse file will set the page dirty twice; once when - * the pages are faulted with get_user_pages(), and then after the read - * completed. - */ - is_writeback = fuse_page_is_writeback(inode, page->index); - - if (wpa && ap->num_pages && - (is_writeback || ap->num_pages == fc->max_pages || - (ap->num_pages + 1) * PAGE_SIZE > fc->max_write || - data->orig_pages[ap->num_pages - 1]->index + 1 != page->index)) { + if (wpa && fuse_writepage_need_send(fc, page, ap, data)) { fuse_writepages_send(data); data->wpa = NULL; - } else if (wpa && ap->num_pages == data->max_pages) { - if (!fuse_pages_realloc(data)) { - fuse_writepages_send(data); - data->wpa = NULL; - } } err = -ENOMEM; @@ -2085,12 +2109,6 @@ static int fuse_writepages_fill(struct page *page, ap->args.end = fuse_writepage_end; ap->num_pages = 0; wpa->inode = inode; - - spin_lock(&fi->lock); - tree_insert(&fi->writepages, wpa); - spin_unlock(&fi->lock); - - data->wpa = wpa; } set_page_writeback(page); @@ -2098,26 +2116,25 @@ static int fuse_writepages_fill(struct page *page, ap->pages[ap->num_pages] = tmp_page; ap->descs[ap->num_pages].offset = 0; ap->descs[ap->num_pages].length = PAGE_SIZE; + data->orig_pages[ap->num_pages] = page; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); err = 0; - if (is_writeback && fuse_writepage_in_flight(wpa, page)) { + if (data->wpa) { + /* + * Protected by fi->lock against concurrent access by + * fuse_page_is_writeback(). + */ + spin_lock(&fi->lock); + ap->num_pages++; + spin_unlock(&fi->lock); + } else if (fuse_writepage_add(wpa, page)) { + data->wpa = wpa; + } else { end_page_writeback(page); - data->wpa = NULL; - goto out_unlock; } - data->orig_pages[ap->num_pages] = page; - - /* - * Protected by fi->lock against concurrent access by - * fuse_page_is_writeback(). - */ - spin_lock(&fi->lock); - ap->num_pages++; - spin_unlock(&fi->lock); - out_unlock: unlock_page(page); @@ -2149,10 +2166,8 @@ static int fuse_writepages(struct address_space *mapping, err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data); if (data.wpa) { - /* Ignore errors if we can write at least one page */ WARN_ON(!data.wpa->ia.ap.num_pages); fuse_writepages_send(&data); - err = 0; } if (data.ff) fuse_file_put(data.ff, false, false); @@ -2761,7 +2776,16 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, struct iovec *iov = iov_page; iov->iov_base = (void __user *)arg; - iov->iov_len = _IOC_SIZE(cmd); + + switch (cmd) { + case FS_IOC_GETFLAGS: + case FS_IOC_SETFLAGS: + iov->iov_len = sizeof(int); + break; + default: + iov->iov_len = _IOC_SIZE(cmd); + break; + } if (_IOC_DIR(cmd) & _IOC_WRITE) { in_iov = iov; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 5b4aebf5821f..bba747520e9b 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -121,10 +121,12 @@ static void fuse_evict_inode(struct inode *inode) } } -static int fuse_remount_fs(struct super_block *sb, int *flags, char *data) +static int fuse_reconfigure(struct fs_context *fc) { + struct super_block *sb = fc->root->d_sb; + sync_filesystem(sb); - if (*flags & SB_MANDLOCK) + if (fc->sb_flags & SB_MANDLOCK) return -EINVAL; return 0; @@ -475,6 +477,17 @@ static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param) struct fuse_fs_context *ctx = fc->fs_private; int opt; + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + /* + * Ignore options coming from mount(MS_REMOUNT) for backward + * compatibility. + */ + if (fc->oldapi) + return 0; + + return invalfc(fc, "No changes allowed in reconfigure"); + } + opt = fs_parse(fc, fuse_fs_parameters, param, &result); if (opt < 0) return opt; @@ -817,7 +830,6 @@ static const struct super_operations fuse_super_operations = { .evict_inode = fuse_evict_inode, .write_inode = fuse_write_inode, .drop_inode = generic_delete_inode, - .remount_fs = fuse_remount_fs, .put_super = fuse_put_super, .umount_begin = fuse_umount_begin, .statfs = fuse_statfs, @@ -1296,6 +1308,7 @@ static int fuse_get_tree(struct fs_context *fc) static const struct fs_context_operations fuse_context_ops = { .free = fuse_free_fc, .parse_param = fuse_parse_param, + .reconfigure = fuse_reconfigure, .get_tree = fuse_get_tree, }; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 72c9560f4467..68cd700a2719 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -468,21 +468,10 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) } -/** - * __gfs2_readpage - readpage - * @file: The file to read a page for - * @page: The page to read - * - * This is the core of gfs2's readpage. It's used by the internal file - * reading code as in that case we already hold the glock. Also it's - * called by gfs2_readpage() once the required lock has been granted. - */ - static int __gfs2_readpage(void *file, struct page *page) { struct gfs2_inode *ip = GFS2_I(page->mapping->host); struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); - int error; if (i_blocksize(page->mapping->host) == PAGE_SIZE && @@ -505,36 +494,11 @@ static int __gfs2_readpage(void *file, struct page *page) * gfs2_readpage - read a page of a file * @file: The file to read * @page: The page of the file - * - * This deals with the locking required. We have to unlock and - * relock the page in order to get the locking in the right - * order. */ static int gfs2_readpage(struct file *file, struct page *page) { - struct address_space *mapping = page->mapping; - struct gfs2_inode *ip = GFS2_I(mapping->host); - struct gfs2_holder gh; - int error; - - unlock_page(page); - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); - error = gfs2_glock_nq(&gh); - if (unlikely(error)) - goto out; - error = AOP_TRUNCATED_PAGE; - lock_page(page); - if (page->mapping == mapping && !PageUptodate(page)) - error = __gfs2_readpage(file, page); - else - unlock_page(page); - gfs2_glock_dq(&gh); -out: - gfs2_holder_uninit(&gh); - if (error && error != AOP_TRUNCATED_PAGE) - lock_page(page); - return error; + return __gfs2_readpage(file, page); } /** @@ -598,16 +562,9 @@ static void gfs2_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder gh; - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); - if (gfs2_glock_nq(&gh)) - goto out_uninit; if (!gfs2_is_stuffed(ip)) mpage_readahead(rac, gfs2_block_map); - gfs2_glock_dq(&gh); -out_uninit: - gfs2_holder_uninit(&gh); } /** diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index fe305e4bfd37..bebde537ac8c 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -558,8 +558,29 @@ out_uninit: return block_page_mkwrite_return(ret); } +static vm_fault_t gfs2_fault(struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vmf->vma->vm_file); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + vm_fault_t ret; + int err; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + err = gfs2_glock_nq(&gh); + if (err) { + ret = block_page_mkwrite_return(err); + goto out_uninit; + } + ret = filemap_fault(vmf); + gfs2_glock_dq(&gh); +out_uninit: + gfs2_holder_uninit(&gh); + return ret; +} + static const struct vm_operations_struct gfs2_vm_ops = { - .fault = filemap_fault, + .fault = gfs2_fault, .map_pages = filemap_map_pages, .page_mkwrite = gfs2_page_mkwrite, }; @@ -824,6 +845,9 @@ out_uninit: static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { + struct gfs2_inode *ip; + struct gfs2_holder gh; + size_t written = 0; ssize_t ret; if (iocb->ki_flags & IOCB_DIRECT) { @@ -832,7 +856,31 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) return ret; iocb->ki_flags &= ~IOCB_DIRECT; } - return generic_file_read_iter(iocb, to); + iocb->ki_flags |= IOCB_NOIO; + ret = generic_file_read_iter(iocb, to); + iocb->ki_flags &= ~IOCB_NOIO; + if (ret >= 0) { + if (!iov_iter_count(to)) + return ret; + written = ret; + } else { + if (ret != -EAGAIN) + return ret; + if (iocb->ki_flags & IOCB_NOWAIT) + return ret; + } + ip = GFS2_I(iocb->ki_filp->f_mapping->host); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret) + goto out_uninit; + ret = generic_file_read_iter(iocb, to); + if (ret > 0) + written += ret; + gfs2_glock_dq(&gh); +out_uninit: + gfs2_holder_uninit(&gh); + return written ? written : ret; } /** diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 2299dcc417ea..8545024a1401 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1899,7 +1899,10 @@ bool gfs2_delete_work_queued(const struct gfs2_glock *gl) static void flush_delete_work(struct gfs2_glock *gl) { - flush_delayed_work(&gl->gl_delete); + if (cancel_delayed_work(&gl->gl_delete)) { + queue_delayed_work(gfs2_delete_workqueue, + &gl->gl_delete, 0); + } gfs2_glock_queue_work(gl, 0); } diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index c84887769b5a..de1d5f1d9ff8 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -531,8 +531,7 @@ static int freeze_go_sync(struct gfs2_glock *gl) int error = 0; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - if (gl->gl_state == LM_ST_SHARED && !gfs2_withdrawn(sdp) && - test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { + if (gl->gl_req == LM_ST_EXCLUSIVE && !gfs2_withdrawn(sdp)) { atomic_set(&sdp->sd_freeze_state, SFS_STARTING_FREEZE); error = freeze_super(sdp->sd_vfs); if (error) { @@ -545,8 +544,11 @@ static int freeze_go_sync(struct gfs2_glock *gl) gfs2_assert_withdraw(sdp, 0); } queue_work(gfs2_freeze_wq, &sdp->sd_freeze_work); - gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE | - GFS2_LFC_FREEZE_GO_SYNC); + if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) + gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE | + GFS2_LFC_FREEZE_GO_SYNC); + else /* read-only mounts */ + atomic_set(&sdp->sd_freeze_state, SFS_FROZEN); } return 0; } diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 03ab11fab962..ca2ec02436ec 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -399,7 +399,6 @@ enum { GIF_QD_LOCKED = 1, GIF_ALLOC_FAILED = 2, GIF_SW_PAGED = 3, - GIF_ORDERED = 4, GIF_FREE_VFS_INODE = 5, GIF_GLOP_PENDING = 6, GIF_DEFERRED_DELETE = 7, diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 370c3a4b31ac..6774865f5b5b 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -207,10 +207,11 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, if (no_formal_ino && ip->i_no_formal_ino && no_formal_ino != ip->i_no_formal_ino) { + error = -ESTALE; if (inode->i_state & I_NEW) goto fail; iput(inode); - return ERR_PTR(-ESTALE); + return ERR_PTR(error); } if (inode->i_state & I_NEW) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 3e4734431783..a76e55bc28eb 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -613,6 +613,12 @@ static int ip_cmp(void *priv, struct list_head *a, struct list_head *b) return 0; } +static void __ordered_del_inode(struct gfs2_inode *ip) +{ + if (!list_empty(&ip->i_ordered)) + list_del_init(&ip->i_ordered); +} + static void gfs2_ordered_write(struct gfs2_sbd *sdp) { struct gfs2_inode *ip; @@ -623,8 +629,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp) while (!list_empty(&sdp->sd_log_ordered)) { ip = list_first_entry(&sdp->sd_log_ordered, struct gfs2_inode, i_ordered); if (ip->i_inode.i_mapping->nrpages == 0) { - test_and_clear_bit(GIF_ORDERED, &ip->i_flags); - list_del(&ip->i_ordered); + __ordered_del_inode(ip); continue; } list_move(&ip->i_ordered, &written); @@ -643,8 +648,7 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp) spin_lock(&sdp->sd_ordered_lock); while (!list_empty(&sdp->sd_log_ordered)) { ip = list_first_entry(&sdp->sd_log_ordered, struct gfs2_inode, i_ordered); - list_del(&ip->i_ordered); - WARN_ON(!test_and_clear_bit(GIF_ORDERED, &ip->i_flags)); + __ordered_del_inode(ip); if (ip->i_inode.i_mapping->nrpages == 0) continue; spin_unlock(&sdp->sd_ordered_lock); @@ -659,8 +663,7 @@ void gfs2_ordered_del_inode(struct gfs2_inode *ip) struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); spin_lock(&sdp->sd_ordered_lock); - if (test_and_clear_bit(GIF_ORDERED, &ip->i_flags)) - list_del(&ip->i_ordered); + __ordered_del_inode(ip); spin_unlock(&sdp->sd_ordered_lock); } @@ -1002,6 +1005,16 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) out: if (gfs2_withdrawn(sdp)) { + /** + * If the tr_list is empty, we're withdrawing during a log + * flush that targets a transaction, but the transaction was + * never queued onto any of the ail lists. Here we add it to + * ail1 just so that ail_drain() will find and free it. + */ + spin_lock(&sdp->sd_ail_lock); + if (tr && list_empty(&tr->tr_list)) + list_add(&tr->tr_list, &sdp->sd_ail1_list); + spin_unlock(&sdp->sd_ail_lock); ail_drain(sdp); /* frees all transactions */ tr = NULL; } diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index c1cd6ae17659..8965c751a303 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -53,9 +53,9 @@ static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) if (gfs2_is_jdata(ip) || !gfs2_is_ordered(sdp)) return; - if (!test_bit(GIF_ORDERED, &ip->i_flags)) { + if (list_empty(&ip->i_ordered)) { spin_lock(&sdp->sd_ordered_lock); - if (!test_and_set_bit(GIF_ORDERED, &ip->i_flags)) + if (list_empty(&ip->i_ordered)) list_add(&ip->i_ordered, &sdp->sd_log_ordered); spin_unlock(&sdp->sd_ordered_lock); } diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 733470ca6be9..c7393ee9cf68 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -39,6 +39,7 @@ static void gfs2_init_inode_once(void *foo) atomic_set(&ip->i_sizehint, 0); init_rwsem(&ip->i_rw_mutex); INIT_LIST_HEAD(&ip->i_trunc_list); + INIT_LIST_HEAD(&ip->i_ordered); ip->i_qadata = NULL; gfs2_holder_mark_uninitialized(&ip->i_rgd_gh); memset(&ip->i_res, 0, sizeof(ip->i_res)); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 094f5fe7c009..6d18d2c91add 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1136,7 +1136,18 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) goto fail_per_node; } - if (!sb_rdonly(sb)) { + if (sb_rdonly(sb)) { + struct gfs2_holder freeze_gh; + + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, + LM_FLAG_NOEXP | GL_EXACT, + &freeze_gh); + if (error) { + fs_err(sdp, "can't make FS RO: %d\n", error); + goto fail_per_node; + } + gfs2_glock_dq_uninit(&freeze_gh); + } else { error = gfs2_make_fs_rw(sdp); if (error) { fs_err(sdp, "can't make FS RW: %d\n", error); diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 96c345f49273..390ea79d682c 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -364,8 +364,8 @@ void gfs2_recover_func(struct work_struct *work) /* Acquire a shared hold on the freeze lock */ error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, - LM_FLAG_NOEXP | LM_FLAG_PRIORITY, - &thaw_gh); + LM_FLAG_NOEXP | LM_FLAG_PRIORITY | + GL_EXACT, &thaw_gh); if (error) goto fail_gunlock_ji; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 32d8d26126a1..47d0ae158b69 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -167,7 +167,8 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) if (error) return error; - error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, 0, + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, + LM_FLAG_NOEXP | GL_EXACT, &freeze_gh); if (error) goto fail_threads; @@ -203,7 +204,6 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) return 0; fail: - freeze_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_uninit(&freeze_gh); fail_threads: if (sdp->sd_quotad_process) @@ -430,7 +430,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp) } error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_EXCLUSIVE, - GL_NOCACHE, &sdp->sd_freeze_gh); + LM_FLAG_NOEXP, &sdp->sd_freeze_gh); if (error) goto out; @@ -613,13 +613,15 @@ int gfs2_make_fs_ro(struct gfs2_sbd *sdp) !gfs2_glock_is_locked_by_me(sdp->sd_freeze_gl)) { if (!log_write_allowed) { error = gfs2_glock_nq_init(sdp->sd_freeze_gl, - LM_ST_SHARED, GL_NOCACHE | - LM_FLAG_TRY, &freeze_gh); + LM_ST_SHARED, LM_FLAG_TRY | + LM_FLAG_NOEXP | GL_EXACT, + &freeze_gh); if (error == GLR_TRYFAILED) error = 0; } else { error = gfs2_glock_nq_init(sdp->sd_freeze_gl, - LM_ST_SHARED, GL_NOCACHE, + LM_ST_SHARED, + LM_FLAG_NOEXP | GL_EXACT, &freeze_gh); if (error && !gfs2_withdrawn(sdp)) return error; @@ -761,8 +763,8 @@ void gfs2_freeze_func(struct work_struct *work) struct super_block *sb = sdp->sd_vfs; atomic_inc(&sb->s_active); - error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, 0, - &freeze_gh); + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, + LM_FLAG_NOEXP | GL_EXACT, &freeze_gh); if (error) { fs_info(sdp, "GFS2: couldn't get freeze lock : %d\n", error); gfs2_assert_withdraw(sdp, 0); @@ -774,8 +776,6 @@ void gfs2_freeze_func(struct work_struct *work) error); gfs2_assert_withdraw(sdp, 0); } - if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) - freeze_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_uninit(&freeze_gh); } deactivate_super(sb); diff --git a/fs/io-wq.c b/fs/io-wq.c index 0b65a912b036..47c5f3aeb460 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -903,13 +903,15 @@ void io_wq_cancel_all(struct io_wq *wq) struct io_cb_cancel_data { work_cancel_fn *fn; void *data; + int nr_running; + int nr_pending; + bool cancel_all; }; static bool io_wq_worker_cancel(struct io_worker *worker, void *data) { struct io_cb_cancel_data *match = data; unsigned long flags; - bool ret = false; /* * Hold the lock to avoid ->cur_work going out of scope, caller @@ -920,74 +922,90 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) && match->fn(worker->cur_work, match->data)) { send_sig(SIGINT, worker->task, 1); - ret = true; + match->nr_running++; } spin_unlock_irqrestore(&worker->lock, flags); - return ret; + return match->nr_running && !match->cancel_all; } -static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, - struct io_cb_cancel_data *match) +static void io_wqe_cancel_pending_work(struct io_wqe *wqe, + struct io_cb_cancel_data *match) { struct io_wq_work_node *node, *prev; struct io_wq_work *work; unsigned long flags; - bool found = false; - /* - * First check pending list, if we're lucky we can just remove it - * from there. CANCEL_OK means that the work is returned as-new, - * no completion will be posted for it. - */ +retry: spin_lock_irqsave(&wqe->lock, flags); wq_list_for_each(node, prev, &wqe->work_list) { work = container_of(node, struct io_wq_work, list); + if (!match->fn(work, match->data)) + continue; - if (match->fn(work, match->data)) { - wq_list_del(&wqe->work_list, node, prev); - found = true; - break; - } - } - spin_unlock_irqrestore(&wqe->lock, flags); - - if (found) { + wq_list_del(&wqe->work_list, node, prev); + spin_unlock_irqrestore(&wqe->lock, flags); io_run_cancel(work, wqe); - return IO_WQ_CANCEL_OK; + match->nr_pending++; + if (!match->cancel_all) + return; + + /* not safe to continue after unlock */ + goto retry; } + spin_unlock_irqrestore(&wqe->lock, flags); +} - /* - * Now check if a free (going busy) or busy worker has the work - * currently running. If we find it there, we'll return CANCEL_RUNNING - * as an indication that we attempt to signal cancellation. The - * completion will run normally in this case. - */ +static void io_wqe_cancel_running_work(struct io_wqe *wqe, + struct io_cb_cancel_data *match) +{ rcu_read_lock(); - found = io_wq_for_each_worker(wqe, io_wq_worker_cancel, match); + io_wq_for_each_worker(wqe, io_wq_worker_cancel, match); rcu_read_unlock(); - return found ? IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND; } enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, - void *data) + void *data, bool cancel_all) { struct io_cb_cancel_data match = { - .fn = cancel, - .data = data, + .fn = cancel, + .data = data, + .cancel_all = cancel_all, }; - enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; int node; + /* + * First check pending list, if we're lucky we can just remove it + * from there. CANCEL_OK means that the work is returned as-new, + * no completion will be posted for it. + */ for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; - ret = io_wqe_cancel_work(wqe, &match); - if (ret != IO_WQ_CANCEL_NOTFOUND) - break; + io_wqe_cancel_pending_work(wqe, &match); + if (match.nr_pending && !match.cancel_all) + return IO_WQ_CANCEL_OK; } - return ret; + /* + * Now check if a free (going busy) or busy worker has the work + * currently running. If we find it there, we'll return CANCEL_RUNNING + * as an indication that we attempt to signal cancellation. The + * completion will run normally in this case. + */ + for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; + + io_wqe_cancel_running_work(wqe, &match); + if (match.nr_running && !match.cancel_all) + return IO_WQ_CANCEL_RUNNING; + } + + if (match.nr_running) + return IO_WQ_CANCEL_RUNNING; + if (match.nr_pending) + return IO_WQ_CANCEL_OK; + return IO_WQ_CANCEL_NOTFOUND; } static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data) @@ -997,21 +1015,7 @@ static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data) enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork) { - return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork); -} - -static bool io_wq_pid_match(struct io_wq_work *work, void *data) -{ - pid_t pid = (pid_t) (unsigned long) data; - - return work->task_pid == pid; -} - -enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid) -{ - void *data = (void *) (unsigned long) pid; - - return io_wq_cancel_cb(wq, io_wq_pid_match, data); + return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork, false); } struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) diff --git a/fs/io-wq.h b/fs/io-wq.h index 8e138fa88b9f..071f1a997800 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -90,7 +90,6 @@ struct io_wq_work { const struct cred *creds; struct fs_struct *fs; unsigned flags; - pid_t task_pid; }; static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) @@ -125,12 +124,11 @@ static inline bool io_wq_is_hashed(struct io_wq_work *work) void io_wq_cancel_all(struct io_wq *wq); enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork); -enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid); typedef bool (work_cancel_fn)(struct io_wq_work *, void *); enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, - void *data); + void *data, bool cancel_all); struct task_struct *io_wq_get_task(struct io_wq *wq); diff --git a/fs/io_uring.c b/fs/io_uring.c index 155f3d830ddb..32b0064f806e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -541,6 +541,7 @@ enum { REQ_F_NO_FILE_TABLE_BIT, REQ_F_QUEUE_TIMEOUT_BIT, REQ_F_WORK_INITIALIZED_BIT, + REQ_F_TASK_PINNED_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -598,10 +599,13 @@ enum { REQ_F_QUEUE_TIMEOUT = BIT(REQ_F_QUEUE_TIMEOUT_BIT), /* io_wq_work is initialized */ REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT), + /* req->task is refcounted */ + REQ_F_TASK_PINNED = BIT(REQ_F_TASK_PINNED_BIT), }; struct async_poll { struct io_poll_iocb poll; + struct io_poll_iocb *double_poll; struct io_wq_work work; }; @@ -887,6 +891,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_files_update *ip, unsigned nr_args); static int io_grab_files(struct io_kiocb *req); +static void io_complete_rw_common(struct kiocb *kiocb, long res); static void io_cleanup_req(struct io_kiocb *req); static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, int fd, struct file **out_file, bool fixed); @@ -910,6 +915,21 @@ struct sock *io_uring_get_socket(struct file *file) } EXPORT_SYMBOL(io_uring_get_socket); +static void io_get_req_task(struct io_kiocb *req) +{ + if (req->flags & REQ_F_TASK_PINNED) + return; + get_task_struct(req->task); + req->flags |= REQ_F_TASK_PINNED; +} + +/* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */ +static void __io_put_req_task(struct io_kiocb *req) +{ + if (req->flags & REQ_F_TASK_PINNED) + put_task_struct(req->task); +} + static void io_file_put_work(struct work_struct *work); /* @@ -1045,8 +1065,6 @@ static inline void io_req_work_grab_env(struct io_kiocb *req, } spin_unlock(¤t->fs->lock); } - if (!req->work.task_pid) - req->work.task_pid = task_pid_vnr(current); } static inline void io_req_work_drop_env(struct io_kiocb *req) @@ -1079,6 +1097,8 @@ static inline void io_prep_async_work(struct io_kiocb *req, { const struct io_op_def *def = &io_op_defs[req->opcode]; + io_req_init_async(req); + if (req->flags & REQ_F_ISREG) { if (def->hash_reg_file) io_wq_hash_work(&req->work, file_inode(req->file)); @@ -1256,6 +1276,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) if (cqe) { clear_bit(0, &ctx->sq_check_overflow); clear_bit(0, &ctx->cq_check_overflow); + ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; } spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); @@ -1293,6 +1314,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) if (list_empty(&ctx->cq_overflow_list)) { set_bit(0, &ctx->sq_check_overflow); set_bit(0, &ctx->cq_check_overflow); + ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW; } req->flags |= REQ_F_OVERFLOW; refcount_inc(&req->refs); @@ -1398,9 +1420,7 @@ static void __io_req_aux_free(struct io_kiocb *req) kfree(req->io); if (req->file) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); - if (req->task) - put_task_struct(req->task); - + __io_put_req_task(req); io_req_work_drop_env(req); } @@ -1727,6 +1747,26 @@ static int io_put_kbuf(struct io_kiocb *req) return cflags; } +static void io_iopoll_queue(struct list_head *again) +{ + struct io_kiocb *req; + + do { + req = list_first_entry(again, struct io_kiocb, list); + list_del(&req->list); + + /* shouldn't happen unless io_uring is dying, cancel reqs */ + if (unlikely(!current->mm)) { + io_complete_rw_common(&req->rw.kiocb, -EAGAIN); + io_put_req(req); + continue; + } + + refcount_inc(&req->refs); + io_queue_async_work(req); + } while (!list_empty(again)); +} + /* * Find and free completed poll iocbs */ @@ -1735,12 +1775,21 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, { struct req_batch rb; struct io_kiocb *req; + LIST_HEAD(again); + + /* order with ->result store in io_complete_rw_iopoll() */ + smp_rmb(); rb.to_free = rb.need_iter = 0; while (!list_empty(done)) { int cflags = 0; req = list_first_entry(done, struct io_kiocb, list); + if (READ_ONCE(req->result) == -EAGAIN) { + req->iopoll_completed = 0; + list_move_tail(&req->list, &again); + continue; + } list_del(&req->list); if (req->flags & REQ_F_BUFFER_SELECTED) @@ -1758,18 +1807,9 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, if (ctx->flags & IORING_SETUP_SQPOLL) io_cqring_ev_posted(ctx); io_free_req_many(ctx, &rb); -} - -static void io_iopoll_queue(struct list_head *again) -{ - struct io_kiocb *req; - do { - req = list_first_entry(again, struct io_kiocb, list); - list_del(&req->list); - refcount_inc(&req->refs); - io_queue_async_work(req); - } while (!list_empty(again)); + if (!list_empty(&again)) + io_iopoll_queue(&again); } static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, @@ -1777,7 +1817,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, { struct io_kiocb *req, *tmp; LIST_HEAD(done); - LIST_HEAD(again); bool spin; int ret; @@ -1803,13 +1842,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, if (!list_empty(&done)) break; - if (req->result == -EAGAIN) { - list_move_tail(&req->list, &again); - continue; - } - if (!list_empty(&again)) - break; - ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin); if (ret < 0) break; @@ -1822,9 +1854,6 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, if (!list_empty(&done)) io_iopoll_complete(ctx, nr_events, &done); - if (!list_empty(&again)) - io_iopoll_queue(&again); - return ret; } @@ -1973,11 +2002,13 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); - if (res != req->result) + if (res != -EAGAIN && res != req->result) req_set_fail_links(req); - req->result = res; - if (res != -EAGAIN) - WRITE_ONCE(req->iopoll_completed, 1); + + WRITE_ONCE(req->result, res); + /* order with io_poll_complete() checking ->result */ + smp_wmb(); + WRITE_ONCE(req->iopoll_completed, 1); } /* @@ -2650,8 +2681,8 @@ copy_iov: } } out_free: - kfree(iovec); - req->flags &= ~REQ_F_NEED_CLEANUP; + if (!(req->flags & REQ_F_NEED_CLEANUP)) + kfree(iovec); return ret; } @@ -2773,8 +2804,8 @@ copy_iov: } } out_free: - req->flags &= ~REQ_F_NEED_CLEANUP; - kfree(iovec); + if (!(req->flags & REQ_F_NEED_CLEANUP)) + kfree(iovec); return ret; } @@ -3524,6 +3555,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->flags & REQ_F_NEED_CLEANUP) return 0; + io->msg.msg.msg_name = &io->msg.addr; io->msg.iov = io->msg.fast_iov; ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags, &io->msg.iov); @@ -3705,6 +3737,7 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io) { + io->msg.msg.msg_name = &io->msg.addr; io->msg.iov = io->msg.fast_iov; #ifdef CONFIG_COMPAT @@ -3813,10 +3846,16 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock) ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg, kmsg->uaddr, flags); - if (force_nonblock && ret == -EAGAIN) - return io_setup_async_msg(req, kmsg); + if (force_nonblock && ret == -EAGAIN) { + ret = io_setup_async_msg(req, kmsg); + if (ret != -EAGAIN) + kfree(kbuf); + return ret; + } if (ret == -ERESTARTSYS) ret = -EINTR; + if (kbuf) + kfree(kbuf); } if (kmsg && kmsg->iov != kmsg->fast_iov) @@ -4045,6 +4084,29 @@ struct io_poll_table { int error; }; +static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb) +{ + struct task_struct *tsk = req->task; + struct io_ring_ctx *ctx = req->ctx; + int ret, notify = TWA_RESUME; + + /* + * SQPOLL kernel thread doesn't need notification, just a wakeup. + * If we're not using an eventfd, then TWA_RESUME is always fine, + * as we won't have dependencies between request completions for + * other kernel wait conditions. + */ + if (ctx->flags & IORING_SETUP_SQPOLL) + notify = 0; + else if (ctx->cq_ev_fd) + notify = TWA_SIGNAL; + + ret = task_work_add(tsk, cb, notify); + if (!ret) + wake_up_process(tsk); + return ret; +} + static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, __poll_t mask, task_work_func_t func) { @@ -4068,13 +4130,13 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, * of executing it. We can't safely execute it anyway, as we may not * have the needed state needed for it anyway. */ - ret = task_work_add(tsk, &req->task_work, true); + ret = io_req_task_work_add(req, &req->task_work); if (unlikely(ret)) { WRITE_ONCE(poll->canceled, true); tsk = io_wq_get_task(req->ctx->io_wq); - task_work_add(tsk, &req->task_work, true); + task_work_add(tsk, &req->task_work, 0); + wake_up_process(tsk); } - wake_up_process(tsk); return 1; } @@ -4098,9 +4160,9 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) return false; } -static void io_poll_remove_double(struct io_kiocb *req) +static void io_poll_remove_double(struct io_kiocb *req, void *data) { - struct io_poll_iocb *poll = (struct io_poll_iocb *) req->io; + struct io_poll_iocb *poll = data; lockdep_assert_held(&req->ctx->completion_lock); @@ -4120,7 +4182,7 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) { struct io_ring_ctx *ctx = req->ctx; - io_poll_remove_double(req); + io_poll_remove_double(req, req->io); req->poll.done = true; io_cqring_fill_event(req, error ? error : mangle_poll(mask)); io_commit_cqring(ctx); @@ -4163,21 +4225,21 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { struct io_kiocb *req = wait->private; - struct io_poll_iocb *poll = (struct io_poll_iocb *) req->io; + struct io_poll_iocb *poll = req->apoll->double_poll; __poll_t mask = key_to_poll(key); /* for instances that support it check for an event match first: */ if (mask && !(mask & poll->events)) return 0; - if (req->poll.head) { + if (poll && poll->head) { bool done; - spin_lock(&req->poll.head->lock); - done = list_empty(&req->poll.wait.entry); + spin_lock(&poll->head->lock); + done = list_empty(&poll->wait.entry); if (!done) - list_del_init(&req->poll.wait.entry); - spin_unlock(&req->poll.head->lock); + list_del_init(&poll->wait.entry); + spin_unlock(&poll->head->lock); if (!done) __io_async_wake(req, poll, mask, io_poll_task_func); } @@ -4197,7 +4259,8 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, } static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, - struct wait_queue_head *head) + struct wait_queue_head *head, + struct io_poll_iocb **poll_ptr) { struct io_kiocb *req = pt->req; @@ -4208,7 +4271,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, */ if (unlikely(poll->head)) { /* already have a 2nd entry, fail a third attempt */ - if (req->io) { + if (*poll_ptr) { pt->error = -EINVAL; return; } @@ -4220,7 +4283,7 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, io_init_poll_iocb(poll, req->poll.events, io_poll_double_wake); refcount_inc(&req->refs); poll->wait.private = req; - req->io = (void *) poll; + *poll_ptr = poll; } pt->error = 0; @@ -4232,8 +4295,31 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, struct poll_table_struct *p) { struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); + struct async_poll *apoll = pt->req->apoll; - __io_queue_proc(&pt->req->apoll->poll, pt, head); + __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); +} + +static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) +{ + struct mm_struct *mm = current->mm; + + if (mm) { + kthread_unuse_mm(mm); + mmput(mm); + } +} + +static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + if (io_op_defs[req->opcode].needs_mm && !current->mm) { + if (unlikely(!mmget_not_zero(ctx->sqo_mm))) + return -EFAULT; + kthread_use_mm(ctx->sqo_mm); + } + + return 0; } static void io_async_task_func(struct callback_head *cb) @@ -4261,20 +4347,27 @@ static void io_async_task_func(struct callback_head *cb) } } + io_poll_remove_double(req, apoll->double_poll); spin_unlock_irq(&ctx->completion_lock); /* restore ->work in case we need to retry again */ if (req->flags & REQ_F_WORK_INITIALIZED) memcpy(&req->work, &apoll->work, sizeof(req->work)); + kfree(apoll->double_poll); kfree(apoll); if (!canceled) { __set_current_state(TASK_RUNNING); + if (io_sq_thread_acquire_mm(ctx, req)) { + io_cqring_add_event(req, -EFAULT); + goto end_req; + } mutex_lock(&ctx->uring_lock); __io_queue_sqe(req, NULL); mutex_unlock(&ctx->uring_lock); } else { io_cqring_ev_posted(ctx); +end_req: req_set_fail_links(req); io_double_put_req(req); } @@ -4348,7 +4441,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req) struct async_poll *apoll; struct io_poll_table ipt; __poll_t mask, ret; - bool had_io; if (!req->file || !file_can_poll(req->file)) return false; @@ -4360,14 +4452,13 @@ static bool io_arm_poll_handler(struct io_kiocb *req) apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); if (unlikely(!apoll)) return false; + apoll->double_poll = NULL; req->flags |= REQ_F_POLLED; if (req->flags & REQ_F_WORK_INITIALIZED) memcpy(&apoll->work, &req->work, sizeof(req->work)); - had_io = req->io != NULL; - get_task_struct(current); - req->task = current; + io_get_req_task(req); req->apoll = apoll; INIT_HLIST_NODE(&req->hash_node); @@ -4383,13 +4474,11 @@ static bool io_arm_poll_handler(struct io_kiocb *req) ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, io_async_wake); if (ret) { - ipt.error = 0; - /* only remove double add if we did it here */ - if (!had_io) - io_poll_remove_double(req); + io_poll_remove_double(req, apoll->double_poll); spin_unlock_irq(&ctx->completion_lock); if (req->flags & REQ_F_WORK_INITIALIZED) memcpy(&req->work, &apoll->work, sizeof(req->work)); + kfree(apoll->double_poll); kfree(apoll); return false; } @@ -4420,11 +4509,13 @@ static bool io_poll_remove_one(struct io_kiocb *req) bool do_complete; if (req->opcode == IORING_OP_POLL_ADD) { - io_poll_remove_double(req); + io_poll_remove_double(req, req->io); do_complete = __io_poll_remove_one(req, &req->poll); } else { struct async_poll *apoll = req->apoll; + io_poll_remove_double(req, apoll->double_poll); + /* non-poll requests have submit ref still */ do_complete = __io_poll_remove_one(req, &apoll->poll); if (do_complete) { @@ -4437,6 +4528,7 @@ static bool io_poll_remove_one(struct io_kiocb *req) if (req->flags & REQ_F_WORK_INITIALIZED) memcpy(&req->work, &apoll->work, sizeof(req->work)); + kfree(apoll->double_poll); kfree(apoll); } } @@ -4537,7 +4629,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, { struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); - __io_queue_proc(&pt->req->poll, pt, head); + __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->io); } static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -4555,8 +4647,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; - get_task_struct(current); - req->task = current; + io_get_req_task(req); return 0; } @@ -4646,7 +4737,9 @@ static int io_timeout_remove_prep(struct io_kiocb *req, { if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len) + if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) + return -EINVAL; + if (sqe->ioprio || sqe->buf_index || sqe->len) return -EINVAL; req->timeout.addr = READ_ONCE(sqe->addr); @@ -4772,7 +4865,7 @@ static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr) enum io_wq_cancel cancel_ret; int ret = 0; - cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr); + cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false); switch (cancel_ret) { case IO_WQ_CANCEL_OK: ret = 0; @@ -4824,8 +4917,9 @@ static int io_async_cancel_prep(struct io_kiocb *req, { if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->flags || sqe->ioprio || sqe->off || sqe->len || - sqe->cancel_flags) + if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) + return -EINVAL; + if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags) return -EINVAL; req->cancel.addr = READ_ONCE(sqe->addr); @@ -4843,7 +4937,9 @@ static int io_async_cancel(struct io_kiocb *req) static int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (sqe->flags || sqe->ioprio || sqe->rw_flags) + if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) + return -EINVAL; + if (sqe->ioprio || sqe->rw_flags) return -EINVAL; req->files_update.offset = READ_ONCE(sqe->off); @@ -5308,9 +5404,6 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) { const bool in_async = io_wq_current_is_worker(); - if (req->result == -EAGAIN) - return -EAGAIN; - /* workqueue context doesn't hold uring_lock, grab it now */ if (in_async) mutex_lock(&ctx->uring_lock); @@ -5637,6 +5730,7 @@ fail_req: * Never try inline submit of IOSQE_ASYNC is set, go straight * to async execution. */ + io_req_init_async(req); req->work.flags |= IO_WQ_WORK_CONCURRENT; io_queue_async_work(req); } else { @@ -5817,17 +5911,14 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->flags = 0; /* one is dropped after submission, the other at completion */ refcount_set(&req->refs, 2); - req->task = NULL; + req->task = current; req->result = 0; if (unlikely(req->opcode >= IORING_OP_LAST)) return -EINVAL; - if (io_op_defs[req->opcode].needs_mm && !current->mm) { - if (unlikely(!mmget_not_zero(ctx->sqo_mm))) - return -EFAULT; - kthread_use_mm(ctx->sqo_mm); - } + if (unlikely(io_sq_thread_acquire_mm(ctx, req))) + return -EFAULT; sqe_flags = READ_ONCE(sqe->flags); /* enforce forwards compatibility on users */ @@ -5936,16 +6027,6 @@ fail_req: return submitted; } -static inline void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) -{ - struct mm_struct *mm = current->mm; - - if (mm) { - kthread_unuse_mm(mm); - mmput(mm); - } -} - static int io_sq_thread(void *data) { struct io_ring_ctx *ctx = data; @@ -5979,7 +6060,7 @@ static int io_sq_thread(void *data) * If submit got -EBUSY, flag us as needing the application * to enter the kernel to reap and flush events. */ - if (!to_submit || ret == -EBUSY) { + if (!to_submit || ret == -EBUSY || need_resched()) { /* * Drop cur_mm before scheduling, we can't hold it for * long periods (or over schedule()). Do this before @@ -5995,7 +6076,7 @@ static int io_sq_thread(void *data) * more IO, we should wait for the application to * reap events and wake us up. */ - if (!list_empty(&ctx->poll_list) || + if (!list_empty(&ctx->poll_list) || need_resched() || (!time_after(jiffies, timeout) && ret != -EBUSY && !percpu_ref_is_dying(&ctx->refs))) { if (current->task_works) @@ -6021,9 +6102,9 @@ static int io_sq_thread(void *data) } /* Tell userspace we may need a wakeup call */ + spin_lock_irq(&ctx->completion_lock); ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP; - /* make sure to read SQ tail after writing flags */ - smp_mb(); + spin_unlock_irq(&ctx->completion_lock); to_submit = io_sqring_entries(ctx); if (!to_submit || ret == -EBUSY) { @@ -6041,13 +6122,17 @@ static int io_sq_thread(void *data) schedule(); finish_wait(&ctx->sqo_wait, &wait); + spin_lock_irq(&ctx->completion_lock); ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; + spin_unlock_irq(&ctx->completion_lock); ret = 0; continue; } finish_wait(&ctx->sqo_wait, &wait); + spin_lock_irq(&ctx->completion_lock); ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; + spin_unlock_irq(&ctx->completion_lock); } mutex_lock(&ctx->uring_lock); @@ -6146,15 +6231,23 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, do { prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE); + /* make sure we run task_work before checking for signals */ if (current->task_works) task_work_run(); - if (io_should_wake(&iowq, false)) - break; - schedule(); if (signal_pending(current)) { + if (current->jobctl & JOBCTL_TASK_WORK) { + spin_lock_irq(¤t->sighand->siglock); + current->jobctl &= ~JOBCTL_TASK_WORK; + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + continue; + } ret = -EINTR; break; } + if (io_should_wake(&iowq, false)) + break; + schedule(); } while (1); finish_wait(&ctx->wait, &iowq.wq); @@ -6626,6 +6719,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, for (i = 0; i < nr_tables; i++) kfree(ctx->file_data->table[i].files); + percpu_ref_exit(&ctx->file_data->refs); kfree(ctx->file_data->table); kfree(ctx->file_data); ctx->file_data = NULL; @@ -6778,8 +6872,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, } table->files[index] = file; err = io_sqe_file_register(ctx, file, i); - if (err) + if (err) { + fput(file); break; + } } nr_args--; done++; @@ -7275,9 +7371,6 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) io_mem_free(ctx->sq_sqes); percpu_ref_exit(&ctx->refs); - if (ctx->account_mem) - io_unaccount_mem(ctx->user, - ring_pages(ctx->sq_entries, ctx->cq_entries)); free_uid(ctx->user); put_cred(ctx->creds); kfree(ctx->cancel_hash); @@ -7331,7 +7424,17 @@ static void io_ring_exit_work(struct work_struct *work) if (ctx->rings) io_cqring_overflow_flush(ctx, true); - wait_for_completion(&ctx->ref_comp); + /* + * If we're doing polled IO and end up having requests being + * submitted async (out-of-line), then completions can come in while + * we're waiting for refs to drop. We need to reap these manually, + * as nobody else will be looking for them. + */ + while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)) { + io_iopoll_reap_events(ctx); + if (ctx->rings) + io_cqring_overflow_flush(ctx, true); + } io_ring_ctx_free(ctx); } @@ -7352,6 +7455,16 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) if (ctx->rings) io_cqring_overflow_flush(ctx, true); idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); + + /* + * Do this upfront, so we won't have a grace period where the ring + * is closed but resources aren't reaped yet. This can cause + * spurious failure in setting up a new ring. + */ + if (ctx->account_mem) + io_unaccount_mem(ctx->user, + ring_pages(ctx->sq_entries, ctx->cq_entries)); + INIT_WORK(&ctx->exit_work, io_ring_exit_work); queue_work(system_wq, &ctx->exit_work); } @@ -7365,9 +7478,22 @@ static int io_uring_release(struct inode *inode, struct file *file) return 0; } +static bool io_wq_files_match(struct io_wq_work *work, void *data) +{ + struct files_struct *files = data; + + return work->files == files; +} + static void io_uring_cancel_files(struct io_ring_ctx *ctx, struct files_struct *files) { + if (list_empty_careful(&ctx->inflight_list)) + return; + + /* cancel all at once, should be faster than doing it one by one*/ + io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true); + while (!list_empty_careful(&ctx->inflight_list)) { struct io_kiocb *cancel_req = NULL, *req; DEFINE_WAIT(wait); @@ -7398,6 +7524,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, if (list_empty(&ctx->cq_overflow_list)) { clear_bit(0, &ctx->sq_check_overflow); clear_bit(0, &ctx->cq_check_overflow); + ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; } spin_unlock_irq(&ctx->completion_lock); @@ -7423,6 +7550,14 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, } } +static bool io_cancel_task_cb(struct io_wq_work *work, void *data) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct task_struct *task = data; + + return req->task == task; +} + static int io_uring_flush(struct file *file, void *data) { struct io_ring_ctx *ctx = file->private_data; @@ -7433,7 +7568,7 @@ static int io_uring_flush(struct file *file, void *data) * If the task is going away, cancel work it may have pending */ if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) - io_wq_cancel_pid(ctx->io_wq, task_pid_vnr(current)); + io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, current, true); return 0; } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index a49d0e670ddf..e4944436e733 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1140,6 +1140,7 @@ static journal_t *journal_init_common(struct block_device *bdev, init_waitqueue_head(&journal->j_wait_commit); init_waitqueue_head(&journal->j_wait_updates); init_waitqueue_head(&journal->j_wait_reserved); + mutex_init(&journal->j_abort_mutex); mutex_init(&journal->j_barrier); mutex_init(&journal->j_checkpoint_mutex); spin_lock_init(&journal->j_revoke_lock); @@ -1402,7 +1403,8 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags) printk(KERN_ERR "JBD2: Error %d detected when updating " "journal superblock for %s.\n", ret, journal->j_devname); - jbd2_journal_abort(journal, ret); + if (!is_journal_aborted(journal)) + jbd2_journal_abort(journal, ret); } return ret; @@ -2154,6 +2156,13 @@ void jbd2_journal_abort(journal_t *journal, int errno) transaction_t *transaction; /* + * Lock the aborting procedure until everything is done, this avoid + * races between filesystem's error handling flow (e.g. ext4_abort()), + * ensure panic after the error info is written into journal's + * superblock. + */ + mutex_lock(&journal->j_abort_mutex); + /* * ESHUTDOWN always takes precedence because a file system check * caused by any other journal abort error is not required after * a shutdown triggered. @@ -2167,6 +2176,7 @@ void jbd2_journal_abort(journal_t *journal, int errno) journal->j_errno = errno; jbd2_journal_update_sb_errno(journal); } + mutex_unlock(&journal->j_abort_mutex); return; } @@ -2188,10 +2198,7 @@ void jbd2_journal_abort(journal_t *journal, int errno) * layer could realise that a filesystem check is needed. */ jbd2_journal_update_sb_errno(journal); - - write_lock(&journal->j_state_lock); - journal->j_flags |= JBD2_REC_ERR; - write_unlock(&journal->j_state_lock); + mutex_unlock(&journal->j_abort_mutex); } /** diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h index 0637271f3770..8ff4d1a1e774 100644 --- a/fs/jffs2/nodelist.h +++ b/fs/jffs2/nodelist.h @@ -259,7 +259,7 @@ struct jffs2_full_dirent uint32_t ino; /* == zero for unlink */ unsigned int nhash; unsigned char type; - unsigned char name[0]; + unsigned char name[]; }; /* diff --git a/fs/jffs2/summary.h b/fs/jffs2/summary.h index 60207a2ae952..e4131cb1f1d4 100644 --- a/fs/jffs2/summary.h +++ b/fs/jffs2/summary.h @@ -61,7 +61,7 @@ struct jffs2_sum_dirent_flash jint32_t ino; /* == zero for unlink */ uint8_t nsize; /* dirent name size */ uint8_t type; /* dirent type */ - uint8_t name[0]; /* dirent name */ + uint8_t name[]; /* dirent name */ } __attribute__((packed)); struct jffs2_sum_xattr_flash @@ -117,7 +117,7 @@ struct jffs2_sum_dirent_mem jint32_t ino; /* == zero for unlink */ uint8_t nsize; /* dirent name size */ uint8_t type; /* dirent type */ - uint8_t name[0]; /* dirent name */ + uint8_t name[]; /* dirent name */ } __attribute__((packed)); struct jffs2_sum_xattr_mem diff --git a/fs/namespace.c b/fs/namespace.c index f30ed401cc6d..4a0f600a3328 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2603,6 +2603,7 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags, if (IS_ERR(fc)) return PTR_ERR(fc); + fc->oldapi = true; err = parse_monolithic_mount_data(fc, data); if (!err) { down_write(&sb->s_umount); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 7d399f72ebbb..de03e440b7ee 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -907,9 +907,8 @@ retry: goto out_mds; /* Use a direct mapping of ds_idx to pgio mirror_idx */ - if (WARN_ON_ONCE(pgio->pg_mirror_count != - FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg))) - goto out_mds; + if (pgio->pg_mirror_count != FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg)) + goto out_eagain; for (i = 0; i < pgio->pg_mirror_count; i++) { mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); @@ -931,7 +930,10 @@ retry: (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR)) pgio->pg_maxretrans = io_maxretrans; return; - +out_eagain: + pnfs_generic_pg_cleanup(pgio); + pgio->pg_error = -EAGAIN; + return; out_mds: trace_pnfs_mds_fallback_pg_init_write(pgio->pg_inode, 0, NFS4_MAX_UINT64, IOMODE_RW, @@ -941,6 +943,7 @@ out_mds: pgio->pg_lseg = NULL; pgio->pg_maxretrans = 0; nfs_pageio_reset_write_mds(pgio); + pgio->pg_error = -EAGAIN; } static unsigned int diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index a3ab6e219061..873342308dc0 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -308,6 +308,7 @@ static int try_location(struct fs_context *fc, if (IS_ERR(export_path)) return PTR_ERR(export_path); + kfree(ctx->nfs_server.export_path); ctx->nfs_server.export_path = export_path; source = kmalloc(len + 1 + ctx->nfs_server.export_path_len + 1, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e32717fd1169..2e2dac29a9e9 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -774,6 +774,14 @@ static void nfs4_slot_sequence_acked(struct nfs4_slot *slot, slot->seq_nr_last_acked = seqnr; } +static void nfs4_probe_sequence(struct nfs_client *client, const struct cred *cred, + struct nfs4_slot *slot) +{ + struct rpc_task *task = _nfs41_proc_sequence(client, cred, slot, true); + if (!IS_ERR(task)) + rpc_put_task_async(task); +} + static int nfs41_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res) { @@ -790,6 +798,7 @@ static int nfs41_sequence_process(struct rpc_task *task, goto out; session = slot->table->session; + clp = session->clp; trace_nfs4_sequence_done(session, res); @@ -804,7 +813,6 @@ static int nfs41_sequence_process(struct rpc_task *task, nfs4_slot_sequence_acked(slot, slot->seq_nr); /* Update the slot's sequence and clientid lease timer */ slot->seq_done = 1; - clp = session->clp; do_renew_lease(clp, res->sr_timestamp); /* Check sequence flags */ nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags, @@ -852,10 +860,18 @@ static int nfs41_sequence_process(struct rpc_task *task, /* * Were one or more calls using this slot interrupted? * If the server never received the request, then our - * transmitted slot sequence number may be too high. + * transmitted slot sequence number may be too high. However, + * if the server did receive the request then it might + * accidentally give us a reply with a mismatched operation. + * We can sort this out by sending a lone sequence operation + * to the server on the same slot. */ if ((s32)(slot->seq_nr - slot->seq_nr_last_acked) > 1) { slot->seq_nr--; + if (task->tk_msg.rpc_proc != &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE]) { + nfs4_probe_sequence(clp, task->tk_msg.rpc_cred, slot); + res->sr_slot = NULL; + } goto retry_nowait; } /* diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index bb3d2c32664a..c9056316a0b3 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -507,6 +507,17 @@ find_any_file(struct nfs4_file *f) return ret; } +static struct nfsd_file *find_deleg_file(struct nfs4_file *f) +{ + struct nfsd_file *ret = NULL; + + spin_lock(&f->fi_lock); + if (f->fi_deleg_file) + ret = nfsd_file_get(f->fi_deleg_file); + spin_unlock(&f->fi_lock); + return ret; +} + static atomic_long_t num_delegations; unsigned long max_delegations; @@ -2444,6 +2455,8 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) oo = ols->st_stateowner; nf = st->sc_file; file = find_any_file(nf); + if (!file) + return 0; seq_printf(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); @@ -2481,6 +2494,8 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st) oo = ols->st_stateowner; nf = st->sc_file; file = find_any_file(nf); + if (!file) + return 0; seq_printf(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); @@ -2513,7 +2528,9 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) ds = delegstateid(st); nf = st->sc_file; - file = nf->fi_deleg_file; + file = find_deleg_file(nf); + if (!file) + return 0; seq_printf(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); @@ -2529,6 +2546,7 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) seq_printf(s, ", "); nfs4_show_fname(s, file); seq_printf(s, " }\n"); + nfsd_file_put(file); return 0; } @@ -7912,9 +7930,14 @@ nfs4_state_start_net(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); int ret; - ret = nfs4_state_create_net(net); + ret = get_nfsdfs(net); if (ret) return ret; + ret = nfs4_state_create_net(net); + if (ret) { + mntput(nn->nfsd_mnt); + return ret; + } locks_start_grace(net, &nn->nfsd4_manager); nfsd4_client_tracking_init(net); if (nn->track_reclaim_completes && nn->reclaim_str_hashtbl_size == 0) @@ -7984,6 +8007,7 @@ nfs4_state_shutdown_net(struct net *net) nfsd4_client_tracking_exit(net); nfs4_state_destroy_net(net); + mntput(nn->nfsd_mnt); } void diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index b68e96681522..cd05732f8eaa 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1335,6 +1335,7 @@ void nfsd_client_rmdir(struct dentry *dentry) WARN_ON_ONCE(ret); fsnotify_rmdir(dir, dentry); d_delete(dentry); + dput(dentry); inode_unlock(dir); } @@ -1424,6 +1425,18 @@ static struct file_system_type nfsd_fs_type = { }; MODULE_ALIAS_FS("nfsd"); +int get_nfsdfs(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct vfsmount *mnt; + + mnt = vfs_kern_mount(&nfsd_fs_type, SB_KERNMOUNT, "nfsd", NULL); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + nn->nfsd_mnt = mnt; + return 0; +} + #ifdef CONFIG_PROC_FS static int create_proc_exports_entry(void) { @@ -1451,7 +1464,6 @@ unsigned int nfsd_net_id; static __net_init int nfsd_init_net(struct net *net) { int retval; - struct vfsmount *mnt; struct nfsd_net *nn = net_generic(net, nfsd_net_id); retval = nfsd_export_init(net); @@ -1478,16 +1490,8 @@ static __net_init int nfsd_init_net(struct net *net) init_waitqueue_head(&nn->ntf_wq); seqlock_init(&nn->boot_lock); - mnt = vfs_kern_mount(&nfsd_fs_type, SB_KERNMOUNT, "nfsd", NULL); - if (IS_ERR(mnt)) { - retval = PTR_ERR(mnt); - goto out_mount_err; - } - nn->nfsd_mnt = mnt; return 0; -out_mount_err: - nfsd_reply_cache_shutdown(nn); out_drc_error: nfsd_idmap_shutdown(net); out_idmap_error: @@ -1500,7 +1504,6 @@ static __net_exit void nfsd_exit_net(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); - mntput(nn->nfsd_mnt); nfsd_reply_cache_shutdown(nn); nfsd_idmap_shutdown(net); nfsd_export_shutdown(net); diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 36cdd81b6688..57c832d1b30f 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -90,6 +90,8 @@ void nfsd_destroy(struct net *net); bool i_am_nfsd(void); +int get_nfsdfs(struct net *); + struct nfsdfs_client { struct kref cl_ref; void (*cl_release)(struct kref *kref); @@ -100,6 +102,7 @@ struct dentry *nfsd_client_mkdir(struct nfsd_net *nn, struct nfsdfs_client *ncl, u32 id, const struct tree_descr *); void nfsd_client_rmdir(struct dentry *dentry); + #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) #ifdef CONFIG_NFSD_V2_ACL extern const struct svc_version nfsd_acl_version2; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index c3fbab1753ec..d22a056da477 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1226,6 +1226,9 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, iap->ia_mode = 0; iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type; + if (!IS_POSIXACL(dirp)) + iap->ia_mode &= ~current_umask(); + err = 0; host_err = 0; switch (type) { @@ -1458,6 +1461,9 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out; } + if (!IS_POSIXACL(dirp)) + iap->ia_mode &= ~current_umask(); + host_err = vfs_create(dirp, dchild, iap->ia_mode, true); if (host_err < 0) { fh_drop_write(fhp); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 152a0fc4e905..751bc4dc7466 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -689,6 +689,12 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res, &ocfs2_nfs_sync_lops, osb); } +static void ocfs2_nfs_sync_lock_init(struct ocfs2_super *osb) +{ + ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb); + init_rwsem(&osb->nfs_sync_rwlock); +} + void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb) { struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; @@ -2855,6 +2861,11 @@ int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex) if (ocfs2_is_hard_readonly(osb)) return -EROFS; + if (ex) + down_write(&osb->nfs_sync_rwlock); + else + down_read(&osb->nfs_sync_rwlock); + if (ocfs2_mount_local(osb)) return 0; @@ -2873,6 +2884,10 @@ void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex) if (!ocfs2_mount_local(osb)) ocfs2_cluster_unlock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE); + if (ex) + up_write(&osb->nfs_sync_rwlock); + else + up_read(&osb->nfs_sync_rwlock); } int ocfs2_trim_fs_lock(struct ocfs2_super *osb, @@ -3340,7 +3355,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) local: ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); - ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb); + ocfs2_nfs_sync_lock_init(osb); ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb); osb->cconn = conn; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index ee5d98516212..2dd71d626196 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -395,6 +395,7 @@ struct ocfs2_super struct ocfs2_lock_res osb_super_lockres; struct ocfs2_lock_res osb_rename_lockres; struct ocfs2_lock_res osb_nfs_sync_lockres; + struct rw_semaphore nfs_sync_rwlock; struct ocfs2_lock_res osb_trim_fs_lockres; struct mutex obs_trim_fs_mutex; struct ocfs2_dlm_debug *osb_dlm_debug; diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 0dd8c41bafd4..19137c6d087b 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -290,7 +290,7 @@ #define OCFS2_MAX_SLOTS 255 /* Slot map indicator for an empty slot */ -#define OCFS2_INVALID_SLOT -1 +#define OCFS2_INVALID_SLOT ((u16)-1) #define OCFS2_VOL_UUID_LEN 16 #define OCFS2_MAX_VOL_LABEL_LEN 64 @@ -326,8 +326,8 @@ struct ocfs2_system_inode_info { enum { BAD_BLOCK_SYSTEM_INODE = 0, GLOBAL_INODE_ALLOC_SYSTEM_INODE, +#define OCFS2_FIRST_ONLINE_SYSTEM_INODE GLOBAL_INODE_ALLOC_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE, -#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE HEARTBEAT_SYSTEM_INODE, GLOBAL_BITMAP_SYSTEM_INODE, USER_QUOTA_SYSTEM_INODE, diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 4836becb7578..45745cc3408a 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -2825,9 +2825,12 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) goto bail; } - inode_alloc_inode = - ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, - suballoc_slot); + if (suballoc_slot == (u16)OCFS2_INVALID_SLOT) + inode_alloc_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_INODE_ALLOC_SYSTEM_INODE, suballoc_slot); + else + inode_alloc_inode = ocfs2_get_system_file_inode(osb, + INODE_ALLOC_SYSTEM_INODE, suballoc_slot); if (!inode_alloc_inode) { /* the error code could be inaccurate, but we are not able to * get the correct one. */ diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 79dd052c7dbf..5e0cde85bd6b 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -895,7 +895,7 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, return err; } -int ovl_copy_up_flags(struct dentry *dentry, int flags) +static int ovl_copy_up_flags(struct dentry *dentry, int flags) { int err = 0; const struct cred *old_cred = ovl_override_creds(dentry->d_sb); diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 8f4286450f92..0e696f72cf65 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -476,7 +476,7 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb, if (IS_ERR_OR_NULL(this)) return this; - if (WARN_ON(ovl_dentry_real_at(this, layer->idx) != real)) { + if (ovl_dentry_real_at(this, layer->idx) != real) { dput(this); this = ERR_PTR(-EIO); } diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 01820e654a21..0d940e29d62b 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -33,13 +33,16 @@ static char ovl_whatisit(struct inode *inode, struct inode *realinode) return 'm'; } +/* No atime modificaton nor notify on underlying */ +#define OVL_OPEN_FLAGS (O_NOATIME | FMODE_NONOTIFY) + static struct file *ovl_open_realfile(const struct file *file, struct inode *realinode) { struct inode *inode = file_inode(file); struct file *realfile; const struct cred *old_cred; - int flags = file->f_flags | O_NOATIME | FMODE_NONOTIFY; + int flags = file->f_flags | OVL_OPEN_FLAGS; int acc_mode = ACC_MODE(flags); int err; @@ -72,8 +75,7 @@ static int ovl_change_flags(struct file *file, unsigned int flags) struct inode *inode = file_inode(file); int err; - /* No atime modificaton on underlying */ - flags |= O_NOATIME | FMODE_NONOTIFY; + flags |= OVL_OPEN_FLAGS; /* If some flag changed that cannot be changed then something's amiss */ if (WARN_ON((file->f_flags ^ flags) & ~OVL_SETFL_MASK)) @@ -126,7 +128,7 @@ static int ovl_real_fdget_meta(const struct file *file, struct fd *real, } /* Did the flags change since open? */ - if (unlikely((file->f_flags ^ real->file->f_flags) & ~O_NOATIME)) + if (unlikely((file->f_flags ^ real->file->f_flags) & ~OVL_OPEN_FLAGS)) return ovl_change_flags(real->file, file->f_flags); return 0; diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 3566282a9199..f7d4358db637 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -389,7 +389,7 @@ invalid: } static int ovl_check_origin(struct ovl_fs *ofs, struct dentry *upperdentry, - struct ovl_path **stackp, unsigned int *ctrp) + struct ovl_path **stackp) { struct ovl_fh *fh = ovl_get_fh(upperdentry, OVL_XATTR_ORIGIN); int err; @@ -406,10 +406,6 @@ static int ovl_check_origin(struct ovl_fs *ofs, struct dentry *upperdentry, return err; } - if (WARN_ON(*ctrp)) - return -EIO; - - *ctrp = 1; return 0; } @@ -861,8 +857,6 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, goto out; } if (upperdentry && !d.is_dir) { - unsigned int origin_ctr = 0; - /* * Lookup copy up origin by decoding origin file handle. * We may get a disconnected dentry, which is fine, @@ -873,8 +867,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, * number - it's the same as if we held a reference * to a dentry in lower layer that was moved under us. */ - err = ovl_check_origin(ofs, upperdentry, &origin_path, - &origin_ctr); + err = ovl_check_origin(ofs, upperdentry, &origin_path); if (err) goto out_put_upper; @@ -1073,6 +1066,10 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, upperredirect = NULL; goto out_free_oe; } + err = ovl_check_metacopy_xattr(upperdentry); + if (err < 0) + goto out_free_oe; + uppermetacopy = err; } if (upperdentry || ctr) { diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index b725c7f15ff4..29bc1ec699e7 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -483,7 +483,6 @@ void ovl_aio_request_cache_destroy(void); /* copy_up.c */ int ovl_copy_up(struct dentry *dentry); int ovl_copy_up_with_data(struct dentry *dentry); -int ovl_copy_up_flags(struct dentry *dentry, int flags); int ovl_maybe_copy_up(struct dentry *dentry, int flags); int ovl_copy_xattr(struct dentry *old, struct dentry *new); int ovl_set_attr(struct dentry *upper, struct kstat *stat); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 91476bc422f9..4b38141c2985 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -580,12 +580,19 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) } } - /* Workdir is useless in non-upper mount */ - if (!config->upperdir && config->workdir) { - pr_info("option \"workdir=%s\" is useless in a non-upper mount, ignore\n", - config->workdir); - kfree(config->workdir); - config->workdir = NULL; + /* Workdir/index are useless in non-upper mount */ + if (!config->upperdir) { + if (config->workdir) { + pr_info("option \"workdir=%s\" is useless in a non-upper mount, ignore\n", + config->workdir); + kfree(config->workdir); + config->workdir = NULL; + } + if (config->index && index_opt) { + pr_info("option \"index=on\" is useless in a non-upper mount, ignore\n"); + index_opt = false; + } + config->index = false; } err = ovl_parse_redirect_mode(config, config->redirect_mode); @@ -622,11 +629,13 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) /* Resolve nfs_export -> index dependency */ if (config->nfs_export && !config->index) { - if (nfs_export_opt && index_opt) { + if (!config->upperdir && config->redirect_follow) { + pr_info("NFS export requires \"redirect_dir=nofollow\" on non-upper mount, falling back to nfs_export=off.\n"); + config->nfs_export = false; + } else if (nfs_export_opt && index_opt) { pr_err("conflicting options: nfs_export=on,index=off\n"); return -EINVAL; - } - if (index_opt) { + } else if (index_opt) { /* * There was an explicit index=off that resulted * in this conflict. @@ -1352,8 +1361,15 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, goto out; } + /* index dir will act also as workdir */ + iput(ofs->workdir_trap); + ofs->workdir_trap = NULL; + dput(ofs->workdir); + ofs->workdir = NULL; ofs->indexdir = ovl_workdir_create(ofs, OVL_INDEXDIR_NAME, true); if (ofs->indexdir) { + ofs->workdir = dget(ofs->indexdir); + err = ovl_setup_trap(sb, ofs->indexdir, &ofs->indexdir_trap, "indexdir"); if (err) @@ -1396,6 +1412,18 @@ static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid) if (!ofs->config.nfs_export && !ovl_upper_mnt(ofs)) return true; + /* + * We allow using single lower with null uuid for index and nfs_export + * for example to support those features with single lower squashfs. + * To avoid regressions in setups of overlay with re-formatted lower + * squashfs, do not allow decoding origin with lower null uuid unless + * user opted-in to one of the new features that require following the + * lower inode of non-dir upper. + */ + if (!ofs->config.index && !ofs->config.metacopy && !ofs->config.xino && + uuid_is_null(uuid)) + return false; + for (i = 0; i < ofs->numfs; i++) { /* * We use uuid to associate an overlay lower file handle with a @@ -1493,14 +1521,23 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, if (err < 0) goto out; + /* + * Check if lower root conflicts with this overlay layers before + * checking if it is in-use as upperdir/workdir of "another" + * mount, because we do not bother to check in ovl_is_inuse() if + * the upperdir/workdir is in fact in-use by our + * upperdir/workdir. + */ err = ovl_setup_trap(sb, stack[i].dentry, &trap, "lowerdir"); if (err) goto out; if (ovl_is_inuse(stack[i].dentry)) { err = ovl_report_in_use(ofs, "lowerdir"); - if (err) + if (err) { + iput(trap); goto out; + } } mnt = clone_private_mount(&stack[i]); @@ -1575,10 +1612,6 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, if (!ofs->config.upperdir && numlower == 1) { pr_err("at least 2 lowerdir are needed while upperdir nonexistent\n"); return ERR_PTR(-EINVAL); - } else if (!ofs->config.upperdir && ofs->config.nfs_export && - ofs->config.redirect_follow) { - pr_warn("NFS export requires \"redirect_dir=nofollow\" on non-upper mount, falling back to nfs_export=off.\n"); - ofs->config.nfs_export = false; } stack = kcalloc(numlower, sizeof(struct path), GFP_KERNEL); @@ -1842,21 +1875,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (!ovl_upper_mnt(ofs)) sb->s_flags |= SB_RDONLY; - if (!(ovl_force_readonly(ofs)) && ofs->config.index) { - /* index dir will act also as workdir */ - dput(ofs->workdir); - ofs->workdir = NULL; - iput(ofs->workdir_trap); - ofs->workdir_trap = NULL; - + if (!ovl_force_readonly(ofs) && ofs->config.index) { err = ovl_get_indexdir(sb, ofs, oe, &upperpath); if (err) goto out_free_oe; /* Force r/o mount with no index dir */ - if (ofs->indexdir) - ofs->workdir = dget(ofs->indexdir); - else + if (!ofs->indexdir) sb->s_flags |= SB_RDONLY; } diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c index 9955d75c0585..ad31ec4ad627 100644 --- a/fs/proc/bootconfig.c +++ b/fs/proc/bootconfig.c @@ -26,8 +26,9 @@ static int boot_config_proc_show(struct seq_file *m, void *v) static int __init copy_xbc_key_value_list(char *dst, size_t size) { struct xbc_node *leaf, *vnode; - const char *val; char *key, *end = dst + size; + const char *val; + char q; int ret = 0; key = kzalloc(XBC_KEYLEN_MAX, GFP_KERNEL); @@ -41,16 +42,20 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size) break; dst += ret; vnode = xbc_node_get_child(leaf); - if (vnode && xbc_node_is_array(vnode)) { + if (vnode) { xbc_array_for_each_value(vnode, val) { - ret = snprintf(dst, rest(dst, end), "\"%s\"%s", - val, vnode->next ? ", " : "\n"); + if (strchr(val, '"')) + q = '\''; + else + q = '"'; + ret = snprintf(dst, rest(dst, end), "%c%s%c%s", + q, val, q, vnode->next ? ", " : "\n"); if (ret < 0) goto out; dst += ret; } } else { - ret = snprintf(dst, rest(dst, end), "\"%s\"\n", val); + ret = snprintf(dst, rest(dst, end), "\"\"\n"); if (ret < 0) break; dst += ret; diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 8ba492d44e68..e502414b3556 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -512,7 +512,8 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) * Using bounce buffer to bypass the * hardened user copy kernel text checks. */ - if (probe_kernel_read(buf, (void *) start, tsz)) { + if (copy_from_kernel_nofault(buf, (void *)start, + tsz)) { if (clear_user(buffer, tsz)) { ret = -EFAULT; goto out; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 42c5128c7d1c..6c1166ccdaea 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -566,8 +566,9 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf, goto out; /* don't even try if the size is too large */ - if (count > KMALLOC_MAX_SIZE) - return -ENOMEM; + error = -ENOMEM; + if (count >= KMALLOC_MAX_SIZE) + goto out; if (write) { kbuf = memdup_user_nul(ubuf, count); @@ -576,7 +577,6 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf, goto out; } } else { - error = -ENOMEM; kbuf = kzalloc(count, GFP_KERNEL); if (!kbuf) goto out; diff --git a/fs/read_write.c b/fs/read_write.c index bbfa9b12b15e..4fb797822567 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -419,28 +419,42 @@ static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo return ret; } -ssize_t __vfs_read(struct file *file, char __user *buf, size_t count, - loff_t *pos) +ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { + mm_segment_t old_fs = get_fs(); + ssize_t ret; + + if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ))) + return -EINVAL; + if (!(file->f_mode & FMODE_CAN_READ)) + return -EINVAL; + + if (count > MAX_RW_COUNT) + count = MAX_RW_COUNT; + set_fs(KERNEL_DS); if (file->f_op->read) - return file->f_op->read(file, buf, count, pos); + ret = file->f_op->read(file, (void __user *)buf, count, pos); else if (file->f_op->read_iter) - return new_sync_read(file, buf, count, pos); + ret = new_sync_read(file, (void __user *)buf, count, pos); else - return -EINVAL; + ret = -EINVAL; + set_fs(old_fs); + if (ret > 0) { + fsnotify_access(file); + add_rchar(current, ret); + } + inc_syscr(current); + return ret; } ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { - mm_segment_t old_fs; - ssize_t result; + ssize_t ret; - old_fs = get_fs(); - set_fs(KERNEL_DS); - /* The cast to a user pointer is valid due to the set_fs() */ - result = vfs_read(file, (void __user *)buf, count, pos); - set_fs(old_fs); - return result; + ret = rw_verify_area(READ, file, pos, count); + if (ret) + return ret; + return __kernel_read(file, buf, count, pos); } EXPORT_SYMBOL(kernel_read); @@ -456,17 +470,22 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) return -EFAULT; ret = rw_verify_area(READ, file, pos, count); - if (!ret) { - if (count > MAX_RW_COUNT) - count = MAX_RW_COUNT; - ret = __vfs_read(file, buf, count, pos); - if (ret > 0) { - fsnotify_access(file); - add_rchar(current, ret); - } - inc_syscr(current); - } + if (ret) + return ret; + if (count > MAX_RW_COUNT) + count = MAX_RW_COUNT; + if (file->f_op->read) + ret = file->f_op->read(file, buf, count, pos); + else if (file->f_op->read_iter) + ret = new_sync_read(file, buf, count, pos); + else + ret = -EINVAL; + if (ret > 0) { + fsnotify_access(file); + add_rchar(current, ret); + } + inc_syscr(current); return ret; } @@ -488,23 +507,15 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t return ret; } -static ssize_t __vfs_write(struct file *file, const char __user *p, - size_t count, loff_t *pos) -{ - if (file->f_op->write) - return file->f_op->write(file, p, count, pos); - else if (file->f_op->write_iter) - return new_sync_write(file, p, count, pos); - else - return -EINVAL; -} - +/* caller is responsible for file_start_write/file_end_write */ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { mm_segment_t old_fs; const char __user *p; ssize_t ret; + if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE))) + return -EBADF; if (!(file->f_mode & FMODE_CAN_WRITE)) return -EINVAL; @@ -513,7 +524,12 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t p = (__force const char __user *)buf; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; - ret = __vfs_write(file, p, count, pos); + if (file->f_op->write) + ret = file->f_op->write(file, p, count, pos); + else if (file->f_op->write_iter) + ret = new_sync_write(file, p, count, pos); + else + ret = -EINVAL; set_fs(old_fs); if (ret > 0) { fsnotify_modify(file); @@ -522,21 +538,20 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t inc_syscw(current); return ret; } -EXPORT_SYMBOL(__kernel_write); ssize_t kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) { - mm_segment_t old_fs; - ssize_t res; + ssize_t ret; - old_fs = get_fs(); - set_fs(KERNEL_DS); - /* The cast to a user pointer is valid due to the set_fs() */ - res = vfs_write(file, (__force const char __user *)buf, count, pos); - set_fs(old_fs); + ret = rw_verify_area(WRITE, file, pos, count); + if (ret) + return ret; - return res; + file_start_write(file); + ret = __kernel_write(file, buf, count, pos); + file_end_write(file); + return ret; } EXPORT_SYMBOL(kernel_write); @@ -552,19 +567,23 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ return -EFAULT; ret = rw_verify_area(WRITE, file, pos, count); - if (!ret) { - if (count > MAX_RW_COUNT) - count = MAX_RW_COUNT; - file_start_write(file); - ret = __vfs_write(file, buf, count, pos); - if (ret > 0) { - fsnotify_modify(file); - add_wchar(current, ret); - } - inc_syscw(current); - file_end_write(file); + if (ret) + return ret; + if (count > MAX_RW_COUNT) + count = MAX_RW_COUNT; + file_start_write(file); + if (file->f_op->write) + ret = file->f_op->write(file, buf, count, pos); + else if (file->f_op->write_iter) + ret = new_sync_write(file, buf, count, pos); + else + ret = -EINVAL; + if (ret > 0) { + fsnotify_modify(file); + add_wchar(current, ret); } - + inc_syscw(current); + file_end_write(file); return ret; } diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 64f61330564a..76bb1c846845 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -175,7 +175,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, /* Extract the length of the metadata block */ data = page_address(bvec->bv_page) + bvec->bv_offset; length = data[offset]; - if (offset <= bvec->bv_len - 1) { + if (offset < bvec->bv_len - 1) { length |= data[offset + 1] << 8; } else { if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) { diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index 7187bd1a30ea..8d64edb80ebf 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -262,7 +262,7 @@ struct squashfs_dir_index { __le32 index; __le32 start_block; __le32 size; - unsigned char name[0]; + unsigned char name[]; }; struct squashfs_base_inode { @@ -327,7 +327,7 @@ struct squashfs_symlink_inode { __le32 inode_number; __le32 nlink; __le32 symlink_size; - char symlink[0]; + char symlink[]; }; struct squashfs_reg_inode { @@ -341,7 +341,7 @@ struct squashfs_reg_inode { __le32 fragment; __le32 offset; __le32 file_size; - __le16 block_list[0]; + __le16 block_list[]; }; struct squashfs_lreg_inode { @@ -358,7 +358,7 @@ struct squashfs_lreg_inode { __le32 fragment; __le32 offset; __le32 xattr; - __le16 block_list[0]; + __le16 block_list[]; }; struct squashfs_dir_inode { @@ -389,7 +389,7 @@ struct squashfs_ldir_inode { __le16 i_count; __le16 offset; __le32 xattr; - struct squashfs_dir_index index[0]; + struct squashfs_dir_index index[]; }; union squashfs_inode { @@ -410,7 +410,7 @@ struct squashfs_dir_entry { __le16 inode_number; __le16 type; __le16 size; - char name[0]; + char name[]; }; struct squashfs_dir_header { @@ -428,12 +428,12 @@ struct squashfs_fragment_entry { struct squashfs_xattr_entry { __le16 type; __le16 size; - char data[0]; + char data[]; }; struct squashfs_xattr_val { __le32 vsize; - char value[0]; + char value[]; }; struct squashfs_xattr_id { diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index b43f0e8f43f2..9ed90368ab31 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -671,7 +671,8 @@ xlog_cil_push_work( /* * Wake up any background push waiters now this context is being pushed. */ - wake_up_all(&ctx->push_wait); + if (ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) + wake_up_all(&cil->xc_push_wait); /* * Check if we've anything to push. If there is nothing, then we don't @@ -743,13 +744,12 @@ xlog_cil_push_work( /* * initialise the new context and attach it to the CIL. Then attach - * the current context to the CIL committing lsit so it can be found + * the current context to the CIL committing list so it can be found * during log forces to extract the commit lsn of the sequence that * needs to be forced. */ INIT_LIST_HEAD(&new_ctx->committing); INIT_LIST_HEAD(&new_ctx->busy_extents); - init_waitqueue_head(&new_ctx->push_wait); new_ctx->sequence = ctx->sequence + 1; new_ctx->cil = cil; cil->xc_ctx = new_ctx; @@ -937,7 +937,7 @@ xlog_cil_push_background( if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) { trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket); ASSERT(cil->xc_ctx->space_used < log->l_logsize); - xlog_wait(&cil->xc_ctx->push_wait, &cil->xc_push_lock); + xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock); return; } @@ -1216,12 +1216,12 @@ xlog_cil_init( INIT_LIST_HEAD(&cil->xc_committing); spin_lock_init(&cil->xc_cil_lock); spin_lock_init(&cil->xc_push_lock); + init_waitqueue_head(&cil->xc_push_wait); init_rwsem(&cil->xc_ctx_lock); init_waitqueue_head(&cil->xc_commit_wait); INIT_LIST_HEAD(&ctx->committing); INIT_LIST_HEAD(&ctx->busy_extents); - init_waitqueue_head(&ctx->push_wait); ctx->sequence = 1; ctx->cil = cil; cil->xc_ctx = ctx; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index ec22c7a3867f..75a62870b63a 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -240,7 +240,6 @@ struct xfs_cil_ctx { struct xfs_log_vec *lv_chain; /* logvecs being pushed */ struct list_head iclog_entry; struct list_head committing; /* ctx committing list */ - wait_queue_head_t push_wait; /* background push throttle */ struct work_struct discard_endio_work; }; @@ -274,6 +273,7 @@ struct xfs_cil { wait_queue_head_t xc_commit_wait; xfs_lsn_t xc_current_sequence; struct work_struct xc_push_work; + wait_queue_head_t xc_push_wait; /* background push throttle */ } ____cacheline_aligned_in_smp; /* diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 07bc42d62673..abfb17f88f9a 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -607,14 +607,14 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) int nr_pages; ssize_t ret; - nr_pages = iov_iter_npages(from, BIO_MAX_PAGES); - if (!nr_pages) - return 0; - max = queue_max_zone_append_sectors(bdev_get_queue(bdev)); max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize); iov_iter_truncate(from, max); + nr_pages = iov_iter_npages(from, BIO_MAX_PAGES); + if (!nr_pages) + return 0; + bio = bio_alloc_bioset(GFP_NOFS, nr_pages, &fs_bio_set); if (!bio) return -ENOMEM; @@ -1119,7 +1119,7 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd, char *file_name; struct dentry *dir; unsigned int n = 0; - int ret = -ENOMEM; + int ret; /* If the group is empty, there is nothing to do */ if (!zd->nr_zones[type]) @@ -1135,8 +1135,10 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd, zgroup_name = "seq"; dir = zonefs_create_inode(sb->s_root, zgroup_name, NULL, type); - if (!dir) + if (!dir) { + ret = -ENOMEM; goto free; + } /* * The first zone contains the super block: skip it. @@ -1174,8 +1176,10 @@ static int zonefs_create_zgroup(struct zonefs_zone_data *zd, * Use the file number within its group as file name. */ snprintf(file_name, ZONEFS_NAME_MAX - 1, "%u", n); - if (!zonefs_create_inode(dir, file_name, zone, type)) + if (!zonefs_create_inode(dir, file_name, zone, type)) { + ret = -ENOMEM; goto free; + } n++; } |