diff options
author | Miklos Szeredi <mszeredi@redhat.com> | 2021-09-01 12:39:02 +0200 |
---|---|---|
committer | Miklos Szeredi <mszeredi@redhat.com> | 2021-09-06 13:37:02 +0200 |
commit | 660585b56e63ca034ad506ea53c807c5cdca3196 (patch) | |
tree | a3ebc2af64f6e82c70ef1aaac95343b2ea443023 /fs/fuse/inode.c | |
parent | fuse: flush extending writes (diff) | |
download | linux-660585b56e63ca034ad506ea53c807c5cdca3196.tar.xz linux-660585b56e63ca034ad506ea53c807c5cdca3196.zip |
fuse: wait for writepages in syncfs
In case of fuse the MM subsystem doesn't guarantee that page writeback
completes by the time ->sync_fs() is called. This is because fuse
completes page writeback immediately to prevent DoS of memory reclaim by
the userspace file server.
This means that fuse itself must ensure that writes are synced before
sending the SYNCFS request to the server.
Introduce sync buckets, that hold a counter for the number of outstanding
write requests. On syncfs replace the current bucket with a new one and
wait until the old bucket's counter goes down to zero.
It is possible to have multiple syncfs calls in parallel, in which case
there could be more than one waited-on buckets. Descendant buckets must
not complete until the parent completes. Add a count to the child (new)
bucket until the (parent) old bucket completes.
Use RCU protection to dereference the current bucket and to wake up an
emptied bucket. Use fc->lock to protect against parallel assignments to
the current bucket.
This leaves just the counter to be a possible scalability issue. The
fc->num_waiting counter has a similar issue, so both should be addressed at
the same time.
Reported-by: Amir Goldstein <amir73il@gmail.com>
Fixes: 2d82ab251ef0 ("virtiofs: propagate sync() to file server")
Cc: <stable@vger.kernel.org> # v5.14
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Diffstat (limited to 'fs/fuse/inode.c')
-rw-r--r-- | fs/fuse/inode.c | 60 |
1 files changed, 60 insertions, 0 deletions
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index a3e7fb484938..2187211893ff 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -506,6 +506,57 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) return err; } +static struct fuse_sync_bucket *fuse_sync_bucket_alloc(void) +{ + struct fuse_sync_bucket *bucket; + + bucket = kzalloc(sizeof(*bucket), GFP_KERNEL | __GFP_NOFAIL); + if (bucket) { + init_waitqueue_head(&bucket->waitq); + /* Initial active count */ + atomic_set(&bucket->count, 1); + } + return bucket; +} + +static void fuse_sync_fs_writes(struct fuse_conn *fc) +{ + struct fuse_sync_bucket *bucket, *new_bucket; + int count; + + new_bucket = fuse_sync_bucket_alloc(); + spin_lock(&fc->lock); + bucket = rcu_dereference_protected(fc->curr_bucket, 1); + count = atomic_read(&bucket->count); + WARN_ON(count < 1); + /* No outstanding writes? */ + if (count == 1) { + spin_unlock(&fc->lock); + kfree(new_bucket); + return; + } + + /* + * Completion of new bucket depends on completion of this bucket, so add + * one more count. + */ + atomic_inc(&new_bucket->count); + rcu_assign_pointer(fc->curr_bucket, new_bucket); + spin_unlock(&fc->lock); + /* + * Drop initial active count. At this point if all writes in this and + * ancestor buckets complete, the count will go to zero and this task + * will be woken up. + */ + atomic_dec(&bucket->count); + + wait_event(bucket->waitq, atomic_read(&bucket->count) == 0); + + /* Drop temp count on descendant bucket */ + fuse_sync_bucket_dec(new_bucket); + kfree_rcu(bucket, rcu); +} + static int fuse_sync_fs(struct super_block *sb, int wait) { struct fuse_mount *fm = get_fuse_mount_super(sb); @@ -528,6 +579,8 @@ static int fuse_sync_fs(struct super_block *sb, int wait) if (!fc->sync_fs) return 0; + fuse_sync_fs_writes(fc); + memset(&inarg, 0, sizeof(inarg)); args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); @@ -763,6 +816,7 @@ void fuse_conn_put(struct fuse_conn *fc) { if (refcount_dec_and_test(&fc->count)) { struct fuse_iqueue *fiq = &fc->iq; + struct fuse_sync_bucket *bucket; if (IS_ENABLED(CONFIG_FUSE_DAX)) fuse_dax_conn_free(fc); @@ -770,6 +824,11 @@ void fuse_conn_put(struct fuse_conn *fc) fiq->ops->release(fiq); put_pid_ns(fc->pid_ns); put_user_ns(fc->user_ns); + bucket = rcu_dereference_protected(fc->curr_bucket, 1); + if (bucket) { + WARN_ON(atomic_read(&bucket->count) != 1); + kfree(bucket); + } fc->release(fc); } } @@ -1418,6 +1477,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) if (sb->s_flags & SB_MANDLOCK) goto err; + rcu_assign_pointer(fc->curr_bucket, fuse_sync_bucket_alloc()); fuse_sb_defaults(sb); if (ctx->is_bdev) { |