diff options
Diffstat (limited to 'fs')
431 files changed, 13016 insertions, 9850 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 6c31b8c8112d..2996fb00387f 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -374,6 +374,28 @@ v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) return ret; } +/* + * v9fs_file_splice_read - splice-read from a file + * @in: The 9p file to read from + * @ppos: Where to find/update the file position + * @pipe: The pipe to splice into + * @len: The maximum amount of data to splice + * @flags: SPLICE_F_* flags + */ +static ssize_t v9fs_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct p9_fid *fid = in->private_data; + + p9_debug(P9_DEBUG_VFS, "fid %d count %zu offset %lld\n", + fid->fid, len, *ppos); + + if (fid->mode & P9L_DIRECT) + return copy_splice_read(in, ppos, pipe, len, flags); + return filemap_splice_read(in, ppos, pipe, len, flags); +} + /** * v9fs_file_write_iter - write to a file * @iocb: The operation parameters @@ -569,7 +591,7 @@ const struct file_operations v9fs_file_operations = { .release = v9fs_dir_release, .lock = v9fs_file_lock, .mmap = generic_file_readonly_mmap, - .splice_read = generic_file_splice_read, + .splice_read = v9fs_file_splice_read, .splice_write = iter_file_splice_write, .fsync = v9fs_file_fsync, }; @@ -583,7 +605,7 @@ const struct file_operations v9fs_file_operations_dotl = { .lock = v9fs_file_lock_dotl, .flock = v9fs_file_flock_dotl, .mmap = v9fs_file_mmap, - .splice_read = generic_file_splice_read, + .splice_read = v9fs_file_splice_read, .splice_write = iter_file_splice_write, .fsync = v9fs_file_fsync_dotl, }; diff --git a/fs/Makefile b/fs/Makefile index 5bfdbf0d7037..e513aaee0603 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -17,14 +17,8 @@ obj-y := open.o read_write.o file_table.o super.o \ fs_types.o fs_context.o fs_parser.o fsopen.o init.o \ kernel_read_file.o mnt_idmapping.o remap_range.o -ifeq ($(CONFIG_BLOCK),y) -obj-y += buffer.o mpage.o -else -obj-y += no-block.o -endif - -obj-$(CONFIG_PROC_FS) += proc_namespace.o - +obj-$(CONFIG_BLOCK) += buffer.o mpage.o +obj-$(CONFIG_PROC_FS) += proc_namespace.o obj-$(CONFIG_LEGACY_DIRECT_IO) += direct-io.o obj-y += notify/ obj-$(CONFIG_EPOLL) += eventpoll.o diff --git a/fs/adfs/file.c b/fs/adfs/file.c index 754afb14a6ff..ee80718aaeec 100644 --- a/fs/adfs/file.c +++ b/fs/adfs/file.c @@ -28,7 +28,7 @@ const struct file_operations adfs_file_operations = { .mmap = generic_file_mmap, .fsync = generic_file_fsync, .write_iter = generic_file_write_iter, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, }; const struct inode_operations adfs_file_inode_operations = { diff --git a/fs/affs/file.c b/fs/affs/file.c index 8daeed31e1af..e43f2f007ac1 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -1001,7 +1001,7 @@ const struct file_operations affs_file_operations = { .open = affs_file_open, .release = affs_file_release, .fsync = affs_file_fsync, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, }; const struct inode_operations affs_file_inode_operations = { diff --git a/fs/afs/file.c b/fs/afs/file.c index 719b31374879..d37dd201752b 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -25,6 +25,9 @@ static void afs_invalidate_folio(struct folio *folio, size_t offset, static bool afs_release_folio(struct folio *folio, gfp_t gfp_flags); static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter); +static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags); static void afs_vm_open(struct vm_area_struct *area); static void afs_vm_close(struct vm_area_struct *area); static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); @@ -36,7 +39,7 @@ const struct file_operations afs_file_operations = { .read_iter = afs_file_read_iter, .write_iter = afs_file_write, .mmap = afs_file_mmap, - .splice_read = generic_file_splice_read, + .splice_read = afs_file_splice_read, .splice_write = iter_file_splice_write, .fsync = afs_fsync, .lock = afs_lock, @@ -587,3 +590,18 @@ static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) return generic_file_read_iter(iocb, iter); } + +static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct afs_vnode *vnode = AFS_FS_I(file_inode(in)); + struct afs_file *af = in->private_data; + int ret; + + ret = afs_validate(vnode, af->key); + if (ret < 0) + return ret; + + return filemap_splice_read(in, ppos, pipe, len, flags); +} diff --git a/fs/afs/write.c b/fs/afs/write.c index 8750b99c3f56..e1c45341719b 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -413,17 +413,19 @@ static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t afs_op_set_vnode(op, 0, vnode); op->file[0].dv_delta = 1; op->file[0].modification = true; - op->store.write_iter = iter; op->store.pos = pos; op->store.size = size; - op->store.i_size = max(pos + size, vnode->netfs.remote_i_size); op->store.laundering = laundering; - op->mtime = vnode->netfs.inode.i_mtime; op->flags |= AFS_OPERATION_UNINTR; op->ops = &afs_store_data_operation; try_next_key: afs_begin_vnode_operation(op); + + op->store.write_iter = iter; + op->store.i_size = max(pos + size, vnode->netfs.remote_i_size); + op->mtime = vnode->netfs.inode.i_mtime; + afs_wait_for_operation(op); switch (op->error) { @@ -465,7 +467,7 @@ static void afs_extend_writeback(struct address_space *mapping, bool caching, unsigned int *_len) { - struct pagevec pvec; + struct folio_batch fbatch; struct folio *folio; unsigned long priv; unsigned int psize, filler = 0; @@ -476,7 +478,7 @@ static void afs_extend_writeback(struct address_space *mapping, unsigned int i; XA_STATE(xas, &mapping->i_pages, index); - pagevec_init(&pvec); + folio_batch_init(&fbatch); do { /* Firstly, we gather up a batch of contiguous dirty pages @@ -535,7 +537,7 @@ static void afs_extend_writeback(struct address_space *mapping, stop = false; index += folio_nr_pages(folio); - if (!pagevec_add(&pvec, &folio->page)) + if (!folio_batch_add(&fbatch, folio)) break; if (stop) break; @@ -545,14 +547,14 @@ static void afs_extend_writeback(struct address_space *mapping, xas_pause(&xas); rcu_read_unlock(); - /* Now, if we obtained any pages, we can shift them to being + /* Now, if we obtained any folios, we can shift them to being * writable and mark them for caching. */ - if (!pagevec_count(&pvec)) + if (!folio_batch_count(&fbatch)) break; - for (i = 0; i < pagevec_count(&pvec); i++) { - folio = page_folio(pvec.pages[i]); + for (i = 0; i < folio_batch_count(&fbatch); i++) { + folio = fbatch.folios[i]; trace_afs_folio_dirty(vnode, tracepoint_string("store+"), folio); if (!folio_clear_dirty_for_io(folio)) @@ -565,7 +567,7 @@ static void afs_extend_writeback(struct address_space *mapping, folio_unlock(folio); } - pagevec_release(&pvec); + folio_batch_release(&fbatch); cond_resched(); } while (!stop); @@ -530,7 +530,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) for (i = 0; i < nr_pages; i++) { struct page *page; page = find_or_create_page(file->f_mapping, - i, GFP_HIGHUSER | __GFP_ZERO); + i, GFP_USER | __GFP_ZERO); if (!page) break; pr_debug("pid(%d) page[%d]->count=%d\n", @@ -571,7 +571,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) ctx->user_id = ctx->mmap_base; ctx->nr_events = nr_events; /* trusted copy */ - ring = kmap_atomic(ctx->ring_pages[0]); + ring = page_address(ctx->ring_pages[0]); ring->nr = nr_events; /* user copy */ ring->id = ~0U; ring->head = ring->tail = 0; @@ -579,7 +579,6 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) ring->compat_features = AIO_RING_COMPAT_FEATURES; ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; ring->header_length = sizeof(struct aio_ring); - kunmap_atomic(ring); flush_dcache_page(ctx->ring_pages[0]); return 0; @@ -682,9 +681,8 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) * we are protected from page migration * changes ring_pages by ->ring_lock. */ - ring = kmap_atomic(ctx->ring_pages[0]); + ring = page_address(ctx->ring_pages[0]); ring->id = ctx->id; - kunmap_atomic(ring); return 0; } @@ -1025,9 +1023,8 @@ static void user_refill_reqs_available(struct kioctx *ctx) * against ctx->completed_events below will make sure we do the * safe/right thing. */ - ring = kmap_atomic(ctx->ring_pages[0]); + ring = page_address(ctx->ring_pages[0]); head = ring->head; - kunmap_atomic(ring); refill_reqs_available(ctx, head, ctx->tail); } @@ -1133,12 +1130,11 @@ static void aio_complete(struct aio_kiocb *iocb) if (++tail >= ctx->nr_events) tail = 0; - ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); + ev_page = page_address(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); event = ev_page + pos % AIO_EVENTS_PER_PAGE; *event = iocb->ki_res; - kunmap_atomic(ev_page); flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb, @@ -1152,10 +1148,9 @@ static void aio_complete(struct aio_kiocb *iocb) ctx->tail = tail; - ring = kmap_atomic(ctx->ring_pages[0]); + ring = page_address(ctx->ring_pages[0]); head = ring->head; ring->tail = tail; - kunmap_atomic(ring); flush_dcache_page(ctx->ring_pages[0]); ctx->completed_events++; @@ -1215,10 +1210,9 @@ static long aio_read_events_ring(struct kioctx *ctx, mutex_lock(&ctx->ring_lock); /* Access to ->ring_pages here is protected by ctx->ring_lock. */ - ring = kmap_atomic(ctx->ring_pages[0]); + ring = page_address(ctx->ring_pages[0]); head = ring->head; tail = ring->tail; - kunmap_atomic(ring); /* * Ensure that once we've read the current tail pointer, that @@ -1250,10 +1244,9 @@ static long aio_read_events_ring(struct kioctx *ctx, avail = min(avail, nr - ret); avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos); - ev = kmap(page); + ev = page_address(page); copy_ret = copy_to_user(event + ret, ev + pos, sizeof(*ev) * avail); - kunmap(page); if (unlikely(copy_ret)) { ret = -EFAULT; @@ -1265,9 +1258,8 @@ static long aio_read_events_ring(struct kioctx *ctx, head %= ctx->nr_events; } - ring = kmap_atomic(ctx->ring_pages[0]); + ring = page_address(ctx->ring_pages[0]); ring->head = head; - kunmap_atomic(ring); flush_dcache_page(ctx->ring_pages[0]); pr_debug("%li h%u t%u\n", ret, head, tail); diff --git a/fs/autofs/root.c b/fs/autofs/root.c index 6baf90b08e0e..93046c9dc461 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -600,7 +600,7 @@ static int autofs_dir_symlink(struct mnt_idmap *idmap, p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count++; - dir->i_mtime = current_time(dir); + dir->i_mtime = dir->i_ctime = current_time(dir); return 0; } @@ -633,7 +633,7 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry) d_inode(dentry)->i_size = 0; clear_nlink(d_inode(dentry)); - dir->i_mtime = current_time(dir); + dir->i_mtime = dir->i_ctime = current_time(dir); spin_lock(&sbi->lookup_lock); __autofs_add_expiring(dentry); @@ -749,7 +749,7 @@ static int autofs_dir_mkdir(struct mnt_idmap *idmap, p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count++; inc_nlink(dir); - dir->i_mtime = current_time(dir); + dir->i_mtime = dir->i_ctime = current_time(dir); return 0; } diff --git a/fs/befs/btree.c b/fs/befs/btree.c index 1b7e0f7128d6..53b36aa29978 100644 --- a/fs/befs/btree.c +++ b/fs/befs/btree.c @@ -500,7 +500,7 @@ befs_btree_read(struct super_block *sb, const befs_data_stream *ds, goto error_alloc; } - strlcpy(keybuf, keystart, keylen + 1); + strscpy(keybuf, keystart, keylen + 1); *value = fs64_to_cpu(sb, valarray[cur_key]); *keysize = keylen; diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 32749fcee090..eee9237386e2 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -374,7 +374,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) if (S_ISLNK(inode->i_mode) && !(befs_ino->i_flags & BEFS_LONG_SYMLINK)){ inode->i_size = 0; inode->i_blocks = befs_sb->block_size / VFS_BLOCK_SIZE; - strlcpy(befs_ino->i_data.symlink, raw_inode->data.symlink, + strscpy(befs_ino->i_data.symlink, raw_inode->data.symlink, BEFS_SYMLINK_LEN); } else { int num_blks; diff --git a/fs/bfs/file.c b/fs/bfs/file.c index 57ae5ee6deec..adc2230079c6 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -27,7 +27,7 @@ const struct file_operations bfs_file_operations = { .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, }; static int bfs_move_block(unsigned long from, unsigned long to, diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 1033fbdfdbec..7b3d2d491407 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -320,10 +320,10 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, * Grow the stack manually; some architectures have a limit on how * far ahead a user-space access may be in order to grow the stack. */ - if (mmap_read_lock_killable(mm)) + if (mmap_write_lock_killable(mm)) return -EINTR; - vma = find_extend_vma(mm, bprm->p); - mmap_read_unlock(mm); + vma = find_extend_vma_locked(mm, bprm->p); + mmap_write_unlock(mm); if (!vma) return -EFAULT; @@ -1517,7 +1517,7 @@ static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset) phdr->p_filesz = sz; phdr->p_memsz = 0; phdr->p_flags = 0; - phdr->p_align = 0; + phdr->p_align = 4; } static void fill_note(struct memelfnote *note, const char *name, int type, @@ -1773,7 +1773,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, /* * NT_PRSTATUS is the one special case, because the regset data * goes into the pr_reg field inside the note contents, rather - * than being the whole note contents. We fill the reset in here. + * than being the whole note contents. We fill the regset in here. * We assume that regset 0 is NT_PRSTATUS. */ fill_prstatus(&t->prstatus.common, t->task, signr); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 05a1471d5283..1c6c5832af86 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -743,12 +743,12 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, struct elf32_fdpic_loadmap *loadmap; #ifdef CONFIG_MMU struct elf32_fdpic_loadseg *mseg; + unsigned long load_addr; #endif struct elf32_fdpic_loadseg *seg; struct elf32_phdr *phdr; - unsigned long load_addr, stop; unsigned nloads, tmp; - size_t size; + unsigned long stop; int loop, ret; /* allocate a load map table */ @@ -760,8 +760,7 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, if (nloads == 0) return -ELIBBAD; - size = sizeof(*loadmap) + nloads * sizeof(*seg); - loadmap = kzalloc(size, GFP_KERNEL); + loadmap = kzalloc(struct_size(loadmap, segs, nloads), GFP_KERNEL); if (!loadmap) return -ENOMEM; @@ -770,9 +769,6 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, loadmap->version = ELF32_FDPIC_LOADMAP_VERSION; loadmap->nsegs = nloads; - load_addr = params->load_addr; - seg = loadmap->segs; - /* map the requested LOADs into the memory space */ switch (params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) { case ELF_FDPIC_FLAG_CONSTDISP: @@ -1269,7 +1265,7 @@ static inline void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offs phdr->p_filesz = sz; phdr->p_memsz = 0; phdr->p_flags = 0; - phdr->p_align = 0; + phdr->p_align = 4; return; } diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index aac240430efe..ce083e99ef68 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -71,6 +71,16 @@ bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq) return atomic_read(&wq->pending) > wq->thresh * 2; } +static void btrfs_init_workqueue(struct btrfs_workqueue *wq, + struct btrfs_fs_info *fs_info) +{ + wq->fs_info = fs_info; + atomic_set(&wq->pending, 0); + INIT_LIST_HEAD(&wq->ordered_list); + spin_lock_init(&wq->list_lock); + spin_lock_init(&wq->thres_lock); +} + struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name, unsigned int flags, int limit_active, int thresh) @@ -80,9 +90,9 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, if (!ret) return NULL; - ret->fs_info = fs_info; + btrfs_init_workqueue(ret, fs_info); + ret->limit_active = limit_active; - atomic_set(&ret->pending, 0); if (thresh == 0) thresh = DFT_THRESHOLD; /* For low threshold, disabling threshold is a better choice */ @@ -106,9 +116,33 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, return NULL; } - INIT_LIST_HEAD(&ret->ordered_list); - spin_lock_init(&ret->list_lock); - spin_lock_init(&ret->thres_lock); + trace_btrfs_workqueue_alloc(ret, name); + return ret; +} + +struct btrfs_workqueue *btrfs_alloc_ordered_workqueue( + struct btrfs_fs_info *fs_info, const char *name, + unsigned int flags) +{ + struct btrfs_workqueue *ret; + + ret = kzalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return NULL; + + btrfs_init_workqueue(ret, fs_info); + + /* Ordered workqueues don't allow @max_active adjustments. */ + ret->limit_active = 1; + ret->current_active = 1; + ret->thresh = NO_THRESHOLD; + + ret->normal_wq = alloc_ordered_workqueue("btrfs-%s", flags, name); + if (!ret->normal_wq) { + kfree(ret); + return NULL; + } + trace_btrfs_workqueue_alloc(ret, name); return ret; } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 6e2596ddae10..30f66c5e2e6e 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -31,6 +31,9 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, unsigned int flags, int limit_active, int thresh); +struct btrfs_workqueue *btrfs_alloc_ordered_workqueue( + struct btrfs_fs_info *fs_info, const char *name, + unsigned int flags); void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func, btrfs_func_t ordered_func, btrfs_func_t ordered_free); void btrfs_queue_work(struct btrfs_workqueue *wq, diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index b3ad0f51e616..12b12443efaa 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -27,6 +27,17 @@ struct btrfs_failed_bio { atomic_t repair_count; }; +/* Is this a data path I/O that needs storage layer checksum and repair? */ +static inline bool is_data_bbio(struct btrfs_bio *bbio) +{ + return bbio->inode && is_data_inode(&bbio->inode->vfs_inode); +} + +static bool bbio_has_ordered_extent(struct btrfs_bio *bbio) +{ + return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE; +} + /* * Initialize a btrfs_bio structure. This skips the embedded bio itself as it * is already initialized by the block layer. @@ -61,20 +72,6 @@ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, return bbio; } -static blk_status_t btrfs_bio_extract_ordered_extent(struct btrfs_bio *bbio) -{ - struct btrfs_ordered_extent *ordered; - int ret; - - ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset); - if (WARN_ON_ONCE(!ordered)) - return BLK_STS_IOERR; - ret = btrfs_extract_ordered_extent(bbio, ordered); - btrfs_put_ordered_extent(ordered); - - return errno_to_blk_status(ret); -} - static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, struct btrfs_bio *orig_bbio, u64 map_length, bool use_append) @@ -95,13 +92,41 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, btrfs_bio_init(bbio, fs_info, NULL, orig_bbio); bbio->inode = orig_bbio->inode; bbio->file_offset = orig_bbio->file_offset; - if (!(orig_bbio->bio.bi_opf & REQ_BTRFS_ONE_ORDERED)) - orig_bbio->file_offset += map_length; - + orig_bbio->file_offset += map_length; + if (bbio_has_ordered_extent(bbio)) { + refcount_inc(&orig_bbio->ordered->refs); + bbio->ordered = orig_bbio->ordered; + } atomic_inc(&orig_bbio->pending_ios); return bbio; } +/* Free a bio that was never submitted to the underlying device. */ +static void btrfs_cleanup_bio(struct btrfs_bio *bbio) +{ + if (bbio_has_ordered_extent(bbio)) + btrfs_put_ordered_extent(bbio->ordered); + bio_put(&bbio->bio); +} + +static void __btrfs_bio_end_io(struct btrfs_bio *bbio) +{ + if (bbio_has_ordered_extent(bbio)) { + struct btrfs_ordered_extent *ordered = bbio->ordered; + + bbio->end_io(bbio); + btrfs_put_ordered_extent(ordered); + } else { + bbio->end_io(bbio); + } +} + +void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) +{ + bbio->bio.bi_status = status; + __btrfs_bio_end_io(bbio); +} + static void btrfs_orig_write_end_io(struct bio *bio); static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, @@ -130,12 +155,12 @@ static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio) if (bbio->bio.bi_status) btrfs_bbio_propagate_error(bbio, orig_bbio); - bio_put(&bbio->bio); + btrfs_cleanup_bio(bbio); bbio = orig_bbio; } if (atomic_dec_and_test(&bbio->pending_ios)) - bbio->end_io(bbio); + __btrfs_bio_end_io(bbio); } static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) @@ -327,7 +352,7 @@ static void btrfs_end_bio_work(struct work_struct *work) struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); /* Metadata reads are checked and repaired by the submitter. */ - if (bbio->inode && !(bbio->bio.bi_opf & REQ_META)) + if (is_data_bbio(bbio)) btrfs_check_read_bio(bbio, bbio->bio.bi_private); else btrfs_orig_bbio_end_io(bbio); @@ -348,7 +373,7 @@ static void btrfs_simple_end_io(struct bio *bio) INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); } else { - if (bio_op(bio) == REQ_OP_ZONE_APPEND) + if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) btrfs_record_physical_zoned(bbio); btrfs_orig_bbio_end_io(bbio); } @@ -361,8 +386,7 @@ static void btrfs_raid56_end_io(struct bio *bio) btrfs_bio_counter_dec(bioc->fs_info); bbio->mirror_num = bioc->mirror_num; - if (bio_op(bio) == REQ_OP_READ && bbio->inode && - !(bbio->bio.bi_opf & REQ_META)) + if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) btrfs_check_read_bio(bbio, NULL); else btrfs_orig_bbio_end_io(bbio); @@ -472,13 +496,12 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, struct btrfs_io_stripe *smap, int mirror_num) { - /* Do not leak our private flag into the block layer. */ - bio->bi_opf &= ~REQ_BTRFS_ONE_ORDERED; - if (!bioc) { /* Single mirror read/write fast path. */ btrfs_bio(bio)->mirror_num = mirror_num; bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT; + if (bio_op(bio) != REQ_OP_READ) + btrfs_bio(bio)->orig_physical = smap->physical; bio->bi_private = smap->dev; bio->bi_end_io = btrfs_simple_end_io; btrfs_submit_dev_bio(smap->dev, bio); @@ -574,27 +597,20 @@ static void run_one_async_free(struct btrfs_work *work) static bool should_async_write(struct btrfs_bio *bbio) { - /* - * If the I/O is not issued by fsync and friends, (->sync_writers != 0), - * then try to defer the submission to a workqueue to parallelize the - * checksum calculation. - */ - if (atomic_read(&bbio->inode->sync_writers)) + /* Submit synchronously if the checksum implementation is fast. */ + if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags)) return false; /* - * Submit metadata writes synchronously if the checksum implementation - * is fast, or we are on a zoned device that wants I/O to be submitted - * in order. + * Try to defer the submission to a workqueue to parallelize the + * checksum calculation unless the I/O is issued synchronously. */ - if (bbio->bio.bi_opf & REQ_META) { - struct btrfs_fs_info *fs_info = bbio->fs_info; + if (op_is_sync(bbio->bio.bi_opf)) + return false; - if (btrfs_is_zoned(fs_info)) - return false; - if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) - return false; - } + /* Zoned devices require I/O to be submitted in order. */ + if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(bbio->fs_info)) + return false; return true; } @@ -622,10 +638,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, run_one_async_free); - if (op_is_sync(bbio->bio.bi_opf)) - btrfs_queue_work(fs_info->hipri_workers, &async->work); - else - btrfs_queue_work(fs_info->workers, &async->work); + btrfs_queue_work(fs_info->workers, &async->work); return true; } @@ -635,7 +648,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) struct btrfs_fs_info *fs_info = bbio->fs_info; struct btrfs_bio *orig_bbio = bbio; struct bio *bio = &bbio->bio; - u64 logical = bio->bi_iter.bi_sector << 9; + u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 length = bio->bi_iter.bi_size; u64 map_length = length; bool use_append = btrfs_use_zone_append(bbio); @@ -645,8 +658,8 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) int error; btrfs_bio_counter_inc_blocked(fs_info); - error = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, - &bioc, &smap, &mirror_num, 1); + error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, + &bioc, &smap, &mirror_num, 1); if (error) { ret = errno_to_blk_status(error); goto fail; @@ -665,7 +678,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) * Save the iter for the end_io handler and preload the checksums for * data reads. */ - if (bio_op(bio) == REQ_OP_READ && inode && !(bio->bi_opf & REQ_META)) { + if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) { bbio->saved_iter = bio->bi_iter; ret = btrfs_lookup_bio_sums(bbio); if (ret) @@ -676,9 +689,6 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) if (use_append) { bio->bi_opf &= ~REQ_OP_WRITE; bio->bi_opf |= REQ_OP_ZONE_APPEND; - ret = btrfs_bio_extract_ordered_extent(bbio); - if (ret) - goto fail_put_bio; } /* @@ -695,6 +705,10 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) ret = btrfs_bio_csum(bbio); if (ret) goto fail_put_bio; + } else if (use_append) { + ret = btrfs_alloc_dummy_sum(bbio); + if (ret) + goto fail_put_bio; } } @@ -704,7 +718,7 @@ done: fail_put_bio: if (map_length < length) - bio_put(bio); + btrfs_cleanup_bio(bbio); fail: btrfs_bio_counter_dec(fs_info); btrfs_bio_end_io(orig_bbio, ret); diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index a8eca3a65673..ca79decee060 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -39,8 +39,8 @@ struct btrfs_bio { union { /* - * Data checksumming and original I/O information for internal - * use in the btrfs_submit_bio machinery. + * For data reads: checksumming and original I/O information. + * (for internal use in the btrfs_submit_bio machinery only) */ struct { u8 *csum; @@ -48,7 +48,20 @@ struct btrfs_bio { struct bvec_iter saved_iter; }; - /* For metadata parentness verification. */ + /* + * For data writes: + * - ordered extent covering the bio + * - pointer to the checksums for this bio + * - original physical address from the allocator + * (for zone append only) + */ + struct { + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_sum *sums; + u64 orig_physical; + }; + + /* For metadata reads: parentness verification. */ struct btrfs_tree_parent_check parent_check; }; @@ -84,15 +97,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, struct btrfs_fs_info *fs_info, btrfs_bio_end_io_t end_io, void *private); - -static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) -{ - bbio->bio.bi_status = status; - bbio->end_io(bbio); -} - -/* Bio only refers to one ordered extent. */ -#define REQ_BTRFS_ONE_ORDERED REQ_DRV +void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status); /* Submit using blkcg_punt_bio_submit. */ #define REQ_BTRFS_CGROUP_PUNT REQ_FS_PRIVATE diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index e97af2e510c3..48ae509f2ac2 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -95,14 +95,21 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags) } allowed &= flags; - if (allowed & BTRFS_BLOCK_GROUP_RAID6) + /* Select the highest-redundancy RAID level. */ + if (allowed & BTRFS_BLOCK_GROUP_RAID1C4) + allowed = BTRFS_BLOCK_GROUP_RAID1C4; + else if (allowed & BTRFS_BLOCK_GROUP_RAID6) allowed = BTRFS_BLOCK_GROUP_RAID6; + else if (allowed & BTRFS_BLOCK_GROUP_RAID1C3) + allowed = BTRFS_BLOCK_GROUP_RAID1C3; else if (allowed & BTRFS_BLOCK_GROUP_RAID5) allowed = BTRFS_BLOCK_GROUP_RAID5; else if (allowed & BTRFS_BLOCK_GROUP_RAID10) allowed = BTRFS_BLOCK_GROUP_RAID10; else if (allowed & BTRFS_BLOCK_GROUP_RAID1) allowed = BTRFS_BLOCK_GROUP_RAID1; + else if (allowed & BTRFS_BLOCK_GROUP_DUP) + allowed = BTRFS_BLOCK_GROUP_DUP; else if (allowed & BTRFS_BLOCK_GROUP_RAID0) allowed = BTRFS_BLOCK_GROUP_RAID0; @@ -1633,11 +1640,14 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg) { struct btrfs_fs_info *fs_info = bg->fs_info; + trace_btrfs_add_unused_block_group(bg); spin_lock(&fs_info->unused_bgs_lock); if (list_empty(&bg->bg_list)) { btrfs_get_block_group(bg); - trace_btrfs_add_unused_block_group(bg); list_add_tail(&bg->bg_list, &fs_info->unused_bgs); + } else { + /* Pull out the block group from the reclaim_bgs list. */ + list_move_tail(&bg->bg_list, &fs_info->unused_bgs); } spin_unlock(&fs_info->unused_bgs_lock); } @@ -1791,8 +1801,15 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) } spin_unlock(&bg->lock); - /* Get out fast, in case we're unmounting the filesystem */ - if (btrfs_fs_closing(fs_info)) { + /* + * Get out fast, in case we're read-only or unmounting the + * filesystem. It is OK to drop block groups from the list even + * for the read-only case. As we did sb_start_write(), + * "mount -o remount,ro" won't happen and read-only filesystem + * means it is forced read-only due to a fatal error. So, it + * never gets back to read-write to let us reclaim again. + */ + if (btrfs_need_cleaner_sleep(fs_info)) { up_write(&space_info->groups_sem); goto next; } @@ -1823,11 +1840,27 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) } next: + if (ret) + btrfs_mark_bg_to_reclaim(bg); btrfs_put_block_group(bg); + + mutex_unlock(&fs_info->reclaim_bgs_lock); + /* + * Reclaiming all the block groups in the list can take really + * long. Prioritize cleaning up unused block groups. + */ + btrfs_delete_unused_bgs(fs_info); + /* + * If we are interrupted by a balance, we can just bail out. The + * cleaner thread restart again if necessary. + */ + if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) + goto end; spin_lock(&fs_info->unused_bgs_lock); } spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); +end: btrfs_exclop_finish(fs_info); sb_end_write(fs_info->sb); } @@ -3521,9 +3554,9 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&space_info->lock); - set_extent_dirty(&trans->transaction->pinned_extents, - bytenr, bytenr + num_bytes - 1, - GFP_NOFS | __GFP_NOFAIL); + set_extent_bit(&trans->transaction->pinned_extents, + bytenr, bytenr + num_bytes - 1, + EXTENT_DIRTY, NULL); } spin_lock(&trans->transaction->dirty_bgs_lock); diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index cc0e4b37db2d..f204addc3fe8 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -162,7 +162,14 @@ struct btrfs_block_group { */ struct list_head cluster_list; - /* For delayed block group creation or deletion of empty block groups */ + /* + * Used for several lists: + * + * 1) struct btrfs_fs_info::unused_bgs + * 2) struct btrfs_fs_info::reclaim_bgs + * 3) struct btrfs_transaction::deleted_bgs + * 4) struct btrfs_trans_handle::new_bgs + */ struct list_head bg_list; /* For read-only block groups */ diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index ac18c43fadad..6279d200cf83 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -541,3 +541,22 @@ try_reserve: return ERR_PTR(ret); } + +int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv) +{ + u64 needed_bytes; + int ret; + + /* 1 for slack space, 1 for updating the inode */ + needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) + + btrfs_calc_metadata_size(fs_info, 1); + + spin_lock(&rsv->lock); + if (rsv->reserved < needed_bytes) + ret = -ENOSPC; + else + ret = 0; + spin_unlock(&rsv->lock); + return ret; +} diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 6dc781709aca..b0bd12b8652f 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -82,6 +82,8 @@ void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info); struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize); +int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv); static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u32 blocksize) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ec2ae4406c16..d47a927b3504 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -116,9 +116,6 @@ struct btrfs_inode { unsigned long runtime_flags; - /* Keep track of who's O_SYNC/fsyncing currently */ - atomic_t sync_writers; - /* full 64 bit generation number, struct vfs_inode doesn't have a big * enough field for this. */ @@ -335,7 +332,7 @@ static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode, if (btrfs_is_free_space_inode(inode)) return; trace_btrfs_inode_mod_outstanding_extents(inode->root, btrfs_ino(inode), - mod); + mod, inode->outstanding_extents); } /* @@ -407,30 +404,12 @@ static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode) return true; } -/* - * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two - * separate u32s. These two functions convert between the two representations. - */ -static inline u64 btrfs_inode_combine_flags(u32 flags, u32 ro_flags) -{ - return (flags | ((u64)ro_flags << 32)); -} - -static inline void btrfs_inode_split_flags(u64 inode_item_flags, - u32 *flags, u32 *ro_flags) -{ - *flags = (u32)inode_item_flags; - *ro_flags = (u32)(inode_item_flags >> 32); -} - /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, u32 pgoff, u8 *csum, const u8 * const csum_expected); -int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, - struct btrfs_ordered_extent *ordered); bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, u32 bio_offset, struct bio_vec *bv); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 82e49d985019..3caf339c4bb3 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1459,13 +1459,13 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, struct btrfs_fs_info *fs_info = state->fs_info; int ret; u64 length; - struct btrfs_io_context *multi = NULL; + struct btrfs_io_context *bioc = NULL; + struct btrfs_io_stripe smap, *map; struct btrfs_device *device; length = len; - ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, - bytenr, &length, &multi, mirror_num); - + ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, bytenr, &length, &bioc, + NULL, &mirror_num, 0); if (ret) { block_ctx_out->start = 0; block_ctx_out->dev_bytenr = 0; @@ -1478,21 +1478,26 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, return ret; } - device = multi->stripes[0].dev; + if (bioc) + map = &bioc->stripes[0]; + else + map = &smap; + + device = map->dev; if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) || !device->bdev || !device->name) block_ctx_out->dev = NULL; else block_ctx_out->dev = btrfsic_dev_state_lookup( device->bdev->bd_dev); - block_ctx_out->dev_bytenr = multi->stripes[0].physical; + block_ctx_out->dev_bytenr = map->physical; block_ctx_out->start = bytenr; block_ctx_out->len = len; block_ctx_out->datav = NULL; block_ctx_out->pagev = NULL; block_ctx_out->mem_to_free = NULL; - kfree(multi); + kfree(bioc); if (NULL == block_ctx_out->dev) { ret = -ENXIO; pr_info("btrfsic: error, cannot lookup dev (#1)!\n"); @@ -1565,7 +1570,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, bio = bio_alloc(block_ctx->dev->bdev, num_pages - i, REQ_OP_READ, GFP_NOFS); - bio->bi_iter.bi_sector = dev_bytenr >> 9; + bio->bi_iter.bi_sector = dev_bytenr >> SECTOR_SHIFT; for (j = i; j < num_pages; j++) { ret = bio_add_page(bio, block_ctx->pagev[j], diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 2d0493f0a184..8818ed5c390f 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -37,7 +37,7 @@ #include "file-item.h" #include "super.h" -struct bio_set btrfs_compressed_bioset; +static struct bio_set btrfs_compressed_bioset; static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; @@ -211,8 +211,6 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb) for (i = 0; i < ret; i++) { struct folio *folio = fbatch.folios[i]; - if (errno) - folio_set_error(folio); btrfs_page_clamp_clear_writeback(fs_info, &folio->page, cb->start, cb->len); } @@ -226,13 +224,8 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) struct compressed_bio *cb = container_of(work, struct compressed_bio, write_end_work); - /* - * Ok, we're the last bio for this extent, step one is to call back - * into the FS and do all the end_io operations. - */ - btrfs_writepage_endio_finish_ordered(cb->bbio.inode, NULL, - cb->start, cb->start + cb->len - 1, - cb->bbio.bio.bi_status == BLK_STS_OK); + btrfs_finish_ordered_extent(cb->bbio.ordered, NULL, cb->start, cb->len, + cb->bbio.bio.bi_status == BLK_STS_OK); if (cb->writeback) end_compressed_writeback(cb); @@ -281,32 +274,31 @@ static void btrfs_add_compressed_bio_pages(struct compressed_bio *cb) * This also checksums the file bytes and gets things ready for * the end io hooks. */ -void btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, - unsigned int len, u64 disk_start, - unsigned int compressed_len, - struct page **compressed_pages, - unsigned int nr_pages, - blk_opf_t write_flags, - bool writeback) +void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, + struct page **compressed_pages, + unsigned int nr_pages, + blk_opf_t write_flags, + bool writeback) { + struct btrfs_inode *inode = BTRFS_I(ordered->inode); struct btrfs_fs_info *fs_info = inode->root->fs_info; struct compressed_bio *cb; - ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && - IS_ALIGNED(len, fs_info->sectorsize)); - - write_flags |= REQ_BTRFS_ONE_ORDERED; + ASSERT(IS_ALIGNED(ordered->file_offset, fs_info->sectorsize)); + ASSERT(IS_ALIGNED(ordered->num_bytes, fs_info->sectorsize)); - cb = alloc_compressed_bio(inode, start, REQ_OP_WRITE | write_flags, + cb = alloc_compressed_bio(inode, ordered->file_offset, + REQ_OP_WRITE | write_flags, end_compressed_bio_write); - cb->start = start; - cb->len = len; + cb->start = ordered->file_offset; + cb->len = ordered->num_bytes; cb->compressed_pages = compressed_pages; - cb->compressed_len = compressed_len; + cb->compressed_len = ordered->disk_num_bytes; cb->writeback = writeback; INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); cb->nr_pages = nr_pages; - cb->bbio.bio.bi_iter.bi_sector = disk_start >> SECTOR_SHIFT; + cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT; + cb->bbio.ordered = ordered; btrfs_add_compressed_bio_pages(cb); btrfs_submit_bio(&cb->bbio, 0); @@ -421,7 +413,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, */ if (!em || cur < em->start || (cur + fs_info->sectorsize > extent_map_end(em)) || - (em->block_start >> 9) != orig_bio->bi_iter.bi_sector) { + (em->block_start >> SECTOR_SHIFT) != orig_bio->bi_iter.bi_sector) { free_extent_map(em); unlock_extent(tree, cur, page_end, NULL); unlock_page(page); @@ -472,7 +464,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, * After the compressed pages are read, we copy the bytes into the * bio we were passed and then call the bio end_io calls */ -void btrfs_submit_compressed_read(struct btrfs_bio *bbio, int mirror_num) +void btrfs_submit_compressed_read(struct btrfs_bio *bbio) { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -538,7 +530,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio, int mirror_num) if (memstall) psi_memstall_leave(&pflags); - btrfs_submit_bio(&cb->bbio, mirror_num); + btrfs_submit_bio(&cb->bbio, 0); return; out_free_compressed_pages: diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 19ab2abeddc0..03bb9d143fa7 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -10,6 +10,7 @@ #include "bio.h" struct btrfs_inode; +struct btrfs_ordered_extent; /* * We want to make sure that amount of RAM required to uncompress an extent is @@ -86,14 +87,12 @@ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, int btrfs_decompress_buf2page(const char *buf, u32 buf_len, struct compressed_bio *cb, u32 decompressed); -void btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, - unsigned int len, u64 disk_start, - unsigned int compressed_len, +void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, struct page **compressed_pages, unsigned int nr_pages, blk_opf_t write_flags, bool writeback); -void btrfs_submit_compressed_read(struct btrfs_bio *bbio, int mirror_num); +void btrfs_submit_compressed_read(struct btrfs_bio *bbio); unsigned int btrfs_compress_str2level(unsigned int type, const char *str); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 2ff2961b1183..a4cb4b642987 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -37,8 +37,6 @@ static int push_node_left(struct btrfs_trans_handle *trans, static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); -static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, - int level, int slot); static const struct btrfs_csums { u16 size; @@ -150,13 +148,19 @@ static inline void copy_leaf_items(const struct extent_buffer *dst, nr_items * sizeof(struct btrfs_item)); } +/* This exists for btrfs-progs usages. */ +u16 btrfs_csum_type_size(u16 type) +{ + return btrfs_csums[type].size; +} + int btrfs_super_csum_size(const struct btrfs_super_block *s) { u16 t = btrfs_super_csum_type(s); /* * csum type is validated at mount time */ - return btrfs_csums[t].size; + return btrfs_csum_type_size(t); } const char *btrfs_super_csum_name(u16 csum_type) @@ -417,9 +421,13 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, &refs, &flags); if (ret) return ret; - if (refs == 0) { - ret = -EROFS; - btrfs_handle_fs_error(fs_info, ret, NULL); + if (unlikely(refs == 0)) { + btrfs_crit(fs_info, + "found 0 references for tree block at bytenr %llu level %d root %llu", + buf->start, btrfs_header_level(buf), + btrfs_root_id(root)); + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); return ret; } } else { @@ -464,10 +472,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, return ret; } if (new_flags != 0) { - int level = btrfs_header_level(buf); - - ret = btrfs_set_disk_extent_flags(trans, buf, - new_flags, level); + ret = btrfs_set_disk_extent_flags(trans, buf, new_flags); if (ret) return ret; } @@ -583,9 +588,14 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) parent_start = buf->start; - atomic_inc(&cow->refs); ret = btrfs_tree_mod_log_insert_root(root->node, cow, true); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); + btrfs_abort_transaction(trans, ret); + return ret; + } + atomic_inc(&cow->refs); rcu_assign_pointer(root->node, cow); btrfs_free_tree_block(trans, btrfs_root_id(root), buf, @@ -594,8 +604,14 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, add_root_to_dirty_list(root); } else { WARN_ON(trans->transid != btrfs_header_generation(parent)); - btrfs_tree_mod_log_insert_key(parent, parent_slot, - BTRFS_MOD_LOG_KEY_REPLACE); + ret = btrfs_tree_mod_log_insert_key(parent, parent_slot, + BTRFS_MOD_LOG_KEY_REPLACE); + if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); + btrfs_abort_transaction(trans, ret); + return ret; + } btrfs_set_node_blockptr(parent, parent_slot, cow->start); btrfs_set_node_ptr_generation(parent, parent_slot, @@ -1028,8 +1044,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, child = btrfs_read_node_slot(mid, 0); if (IS_ERR(child)) { ret = PTR_ERR(child); - btrfs_handle_fs_error(fs_info, ret, NULL); - goto enospc; + goto out; } btrfs_tree_lock(child); @@ -1038,11 +1053,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (ret) { btrfs_tree_unlock(child); free_extent_buffer(child); - goto enospc; + goto out; } ret = btrfs_tree_mod_log_insert_root(root->node, child, true); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_tree_unlock(child); + free_extent_buffer(child); + btrfs_abort_transaction(trans, ret); + goto out; + } rcu_assign_pointer(root->node, child); add_root_to_dirty_list(root); @@ -1070,7 +1090,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (IS_ERR(left)) { ret = PTR_ERR(left); left = NULL; - goto enospc; + goto out; } __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); @@ -1079,7 +1099,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NESTING_LEFT_COW); if (wret) { ret = wret; - goto enospc; + goto out; } } @@ -1088,7 +1108,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (IS_ERR(right)) { ret = PTR_ERR(right); right = NULL; - goto enospc; + goto out; } __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); @@ -1097,7 +1117,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NESTING_RIGHT_COW); if (wret) { ret = wret; - goto enospc; + goto out; } } @@ -1119,7 +1139,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(right) == 0) { btrfs_clear_buffer_dirty(trans, right); btrfs_tree_unlock(right); - del_ptr(root, path, level + 1, pslot + 1); + ret = btrfs_del_ptr(trans, root, path, level + 1, pslot + 1); + if (ret < 0) { + free_extent_buffer_stale(right); + right = NULL; + goto out; + } root_sub_used(root, right->len); btrfs_free_tree_block(trans, btrfs_root_id(root), right, 0, 1); @@ -1130,7 +1155,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_node_key(right, &right_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, BTRFS_MOD_LOG_KEY_REPLACE); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out; + } btrfs_set_node_key(parent, &right_key, pslot + 1); btrfs_mark_buffer_dirty(parent); } @@ -1145,15 +1173,19 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, * otherwise we would have pulled some pointers from the * right */ - if (!left) { - ret = -EROFS; - btrfs_handle_fs_error(fs_info, ret, NULL); - goto enospc; + if (unlikely(!left)) { + btrfs_crit(fs_info, +"missing left child when middle child only has 1 item, parent bytenr %llu level %d mid bytenr %llu root %llu", + parent->start, btrfs_header_level(parent), + mid->start, btrfs_root_id(root)); + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + goto out; } wret = balance_node_right(trans, mid, left); if (wret < 0) { ret = wret; - goto enospc; + goto out; } if (wret == 1) { wret = push_node_left(trans, left, mid, 1); @@ -1165,7 +1197,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(mid) == 0) { btrfs_clear_buffer_dirty(trans, mid); btrfs_tree_unlock(mid); - del_ptr(root, path, level + 1, pslot); + ret = btrfs_del_ptr(trans, root, path, level + 1, pslot); + if (ret < 0) { + free_extent_buffer_stale(mid); + mid = NULL; + goto out; + } root_sub_used(root, mid->len); btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); free_extent_buffer_stale(mid); @@ -1176,7 +1213,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_node_key(mid, &mid_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot, BTRFS_MOD_LOG_KEY_REPLACE); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out; + } btrfs_set_node_key(parent, &mid_key, pslot); btrfs_mark_buffer_dirty(parent); } @@ -1202,7 +1242,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (orig_ptr != btrfs_node_blockptr(path->nodes[level], path->slots[level])) BUG(); -enospc: +out: if (right) { btrfs_tree_unlock(right); free_extent_buffer(right); @@ -1278,7 +1318,12 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_node_key(mid, &disk_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot, BTRFS_MOD_LOG_KEY_REPLACE); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_tree_unlock(left); + free_extent_buffer(left); + btrfs_abort_transaction(trans, ret); + return ret; + } btrfs_set_node_key(parent, &disk_key, pslot); btrfs_mark_buffer_dirty(parent); if (btrfs_header_nritems(left) > orig_slot) { @@ -1333,7 +1378,12 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_node_key(right, &disk_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, BTRFS_MOD_LOG_KEY_REPLACE); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + btrfs_abort_transaction(trans, ret); + return ret; + } btrfs_set_node_key(parent, &disk_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -2379,6 +2429,87 @@ done: } /* + * Search the tree again to find a leaf with smaller keys. + * Returns 0 if it found something. + * Returns 1 if there are no smaller keys. + * Returns < 0 on error. + * + * This may release the path, and so you may lose any locks held at the + * time you call it. + */ +static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) +{ + struct btrfs_key key; + struct btrfs_key orig_key; + struct btrfs_disk_key found_key; + int ret; + + btrfs_item_key_to_cpu(path->nodes[0], &key, 0); + orig_key = key; + + if (key.offset > 0) { + key.offset--; + } else if (key.type > 0) { + key.type--; + key.offset = (u64)-1; + } else if (key.objectid > 0) { + key.objectid--; + key.type = (u8)-1; + key.offset = (u64)-1; + } else { + return 1; + } + + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret <= 0) + return ret; + + /* + * Previous key not found. Even if we were at slot 0 of the leaf we had + * before releasing the path and calling btrfs_search_slot(), we now may + * be in a slot pointing to the same original key - this can happen if + * after we released the path, one of more items were moved from a + * sibling leaf into the front of the leaf we had due to an insertion + * (see push_leaf_right()). + * If we hit this case and our slot is > 0 and just decrement the slot + * so that the caller does not process the same key again, which may or + * may not break the caller, depending on its logic. + */ + if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { + btrfs_item_key(path->nodes[0], &found_key, path->slots[0]); + ret = comp_keys(&found_key, &orig_key); + if (ret == 0) { + if (path->slots[0] > 0) { + path->slots[0]--; + return 0; + } + /* + * At slot 0, same key as before, it means orig_key is + * the lowest, leftmost, key in the tree. We're done. + */ + return 1; + } + } + + btrfs_item_key(path->nodes[0], &found_key, 0); + ret = comp_keys(&found_key, &key); + /* + * We might have had an item with the previous key in the tree right + * before we released our path. And after we released our path, that + * item might have been pushed to the first slot (0) of the leaf we + * were holding due to a tree balance. Alternatively, an item with the + * previous key can exist as the only element of a leaf (big fat item). + * Therefore account for these 2 cases, so that our callers (like + * btrfs_previous_item) don't miss an existing item with a key matching + * the previous key we computed above. + */ + if (ret <= 0) + return 0; + return 1; +} + +/* * helper to use instead of search slot if no exact match is needed but * instead the next or previous item should be returned. * When find_higher is true, the next higher item is returned, the next lower @@ -2552,6 +2683,7 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, if (slot > 0) { btrfs_item_key(eb, &disk_key, slot - 1); if (unlikely(comp_keys(&disk_key, new_key) >= 0)) { + btrfs_print_leaf(eb); btrfs_crit(fs_info, "slot %u key (%llu %u %llu) new key (%llu %u %llu)", slot, btrfs_disk_key_objectid(&disk_key), @@ -2559,13 +2691,13 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, btrfs_disk_key_offset(&disk_key), new_key->objectid, new_key->type, new_key->offset); - btrfs_print_leaf(eb); BUG(); } } if (slot < btrfs_header_nritems(eb) - 1) { btrfs_item_key(eb, &disk_key, slot + 1); if (unlikely(comp_keys(&disk_key, new_key) <= 0)) { + btrfs_print_leaf(eb); btrfs_crit(fs_info, "slot %u key (%llu %u %llu) new key (%llu %u %llu)", slot, btrfs_disk_key_objectid(&disk_key), @@ -2573,7 +2705,6 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, btrfs_disk_key_offset(&disk_key), new_key->objectid, new_key->type, new_key->offset); - btrfs_print_leaf(eb); BUG(); } } @@ -2626,7 +2757,7 @@ static bool check_sibling_keys(struct extent_buffer *left, btrfs_item_key_to_cpu(right, &right_first, 0); } - if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) { + if (unlikely(btrfs_comp_cpu_keys(&left_last, &right_first) >= 0)) { btrfs_crit(left->fs_info, "left extent buffer:"); btrfs_print_tree(left, false); btrfs_crit(left->fs_info, "right extent buffer:"); @@ -2703,8 +2834,8 @@ static int push_node_left(struct btrfs_trans_handle *trans, if (push_items < src_nritems) { /* - * Don't call btrfs_tree_mod_log_insert_move() here, key removal - * was already fully logged by btrfs_tree_mod_log_eb_copy() above. + * btrfs_tree_mod_log_eb_copy handles logging the move, so we + * don't need to do an explicit tree mod log operation for it. */ memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0), btrfs_node_key_ptr_offset(src, push_items), @@ -2765,8 +2896,11 @@ static int balance_node_right(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); return ret; } - ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems); - BUG_ON(ret < 0); + + /* + * btrfs_tree_mod_log_eb_copy handles logging the move, so we don't + * need to do an explicit tree mod log operation for it. + */ memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items), btrfs_node_key_ptr_offset(dst, 0), (dst_nritems) * @@ -2840,7 +2974,12 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, old = root->node; ret = btrfs_tree_mod_log_insert_root(root->node, c, false); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1); + btrfs_tree_unlock(c); + free_extent_buffer(c); + return ret; + } rcu_assign_pointer(root->node, c); /* the super has an extra ref to root->node */ @@ -2861,10 +3000,10 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, * slot and level indicate where you want the key to go, and * blocknr is the block the key points to. */ -static void insert_ptr(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_disk_key *key, u64 bytenr, - int slot, int level) +static int insert_ptr(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_disk_key *key, u64 bytenr, + int slot, int level) { struct extent_buffer *lower; int nritems; @@ -2880,7 +3019,10 @@ static void insert_ptr(struct btrfs_trans_handle *trans, if (level) { ret = btrfs_tree_mod_log_insert_move(lower, slot + 1, slot, nritems - slot); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + return ret; + } } memmove_extent_buffer(lower, btrfs_node_key_ptr_offset(lower, slot + 1), @@ -2890,7 +3032,10 @@ static void insert_ptr(struct btrfs_trans_handle *trans, if (level) { ret = btrfs_tree_mod_log_insert_key(lower, slot, BTRFS_MOD_LOG_KEY_ADD); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + return ret; + } } btrfs_set_node_key(lower, key, slot); btrfs_set_node_blockptr(lower, slot, bytenr); @@ -2898,6 +3043,8 @@ static void insert_ptr(struct btrfs_trans_handle *trans, btrfs_set_node_ptr_generation(lower, slot, trans->transid); btrfs_set_header_nritems(lower, nritems + 1); btrfs_mark_buffer_dirty(lower); + + return 0; } /* @@ -2962,6 +3109,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); if (ret) { + btrfs_tree_unlock(split); + free_extent_buffer(split); btrfs_abort_transaction(trans, ret); return ret; } @@ -2975,8 +3124,13 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); btrfs_mark_buffer_dirty(split); - insert_ptr(trans, path, &disk_key, split->start, - path->slots[level + 1] + 1, level + 1); + ret = insert_ptr(trans, path, &disk_key, split->start, + path->slots[level + 1] + 1, level + 1); + if (ret < 0) { + btrfs_tree_unlock(split); + free_extent_buffer(split); + return ret; + } if (path->slots[level] >= mid) { path->slots[level] -= mid; @@ -2996,7 +3150,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, * and nr indicate which items in the leaf to check. This totals up the * space used both by the item structs and the item data */ -static int leaf_space_used(struct extent_buffer *l, int start, int nr) +static int leaf_space_used(const struct extent_buffer *l, int start, int nr) { int data_len; int nritems = btrfs_header_nritems(l); @@ -3016,7 +3170,7 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr) * the start of the leaf data. IOW, how much room * the leaf has left for both items and data */ -noinline int btrfs_leaf_free_space(struct extent_buffer *leaf) +int btrfs_leaf_free_space(const struct extent_buffer *leaf) { struct btrfs_fs_info *fs_info = leaf->fs_info; int nritems = btrfs_header_nritems(leaf); @@ -3453,16 +3607,17 @@ out: * split the path's leaf in two, making sure there is at least data_size * available for the resulting leaf level of the path. */ -static noinline void copy_for_split(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct extent_buffer *l, - struct extent_buffer *right, - int slot, int mid, int nritems) +static noinline int copy_for_split(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct extent_buffer *l, + struct extent_buffer *right, + int slot, int mid, int nritems) { struct btrfs_fs_info *fs_info = trans->fs_info; int data_copy_size; int rt_data_off; int i; + int ret; struct btrfs_disk_key disk_key; struct btrfs_map_token token; @@ -3487,7 +3642,9 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(l, mid); btrfs_item_key(right, &disk_key, 0); - insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1); + ret = insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1); + if (ret < 0) + return ret; btrfs_mark_buffer_dirty(right); btrfs_mark_buffer_dirty(l); @@ -3505,6 +3662,8 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, } BUG_ON(path->slots[0] < 0); + + return 0; } /* @@ -3703,8 +3862,13 @@ again: if (split == 0) { if (mid <= slot) { btrfs_set_header_nritems(right, 0); - insert_ptr(trans, path, &disk_key, - right->start, path->slots[1] + 1, 1); + ret = insert_ptr(trans, path, &disk_key, + right->start, path->slots[1] + 1, 1); + if (ret < 0) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + return ret; + } btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3712,8 +3876,13 @@ again: path->slots[1] += 1; } else { btrfs_set_header_nritems(right, 0); - insert_ptr(trans, path, &disk_key, - right->start, path->slots[1], 1); + ret = insert_ptr(trans, path, &disk_key, + right->start, path->slots[1], 1); + if (ret < 0) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + return ret; + } btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3729,7 +3898,12 @@ again: return ret; } - copy_for_split(trans, path, l, right, slot, mid, nritems); + ret = copy_for_split(trans, path, l, right, slot, mid, nritems); + if (ret < 0) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + return ret; + } if (split == 2) { BUG_ON(num_doubles != 0); @@ -3826,7 +4000,12 @@ static noinline int split_item(struct btrfs_path *path, struct btrfs_disk_key disk_key; leaf = path->nodes[0]; - BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)); + /* + * Shouldn't happen because the caller must have previously called + * setup_leaf_for_split() to make room for the new item in the leaf. + */ + if (WARN_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item))) + return -ENOSPC; orig_slot = path->slots[0]; orig_offset = btrfs_item_offset(leaf, path->slots[0]); @@ -4273,9 +4452,11 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, * * the tree should have been previously balanced so the deletion does not * empty a node. + * + * This is exported for use inside btrfs-progs, don't un-export it. */ -static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, - int level, int slot) +int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot) { struct extent_buffer *parent = path->nodes[level]; u32 nritems; @@ -4286,7 +4467,10 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, if (level) { ret = btrfs_tree_mod_log_insert_move(parent, slot, slot + 1, nritems - slot - 1); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + return ret; + } } memmove_extent_buffer(parent, btrfs_node_key_ptr_offset(parent, slot), @@ -4296,7 +4480,10 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, } else if (level) { ret = btrfs_tree_mod_log_insert_key(parent, slot, BTRFS_MOD_LOG_KEY_REMOVE); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + return ret; + } } nritems--; @@ -4312,6 +4499,7 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, fixup_low_keys(path, &disk_key, level + 1); } btrfs_mark_buffer_dirty(parent); + return 0; } /* @@ -4324,13 +4512,17 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, * The path must have already been setup for deleting the leaf, including * all the proper balancing. path->nodes[1] must be locked. */ -static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *leaf) +static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *leaf) { + int ret; + WARN_ON(btrfs_header_generation(leaf) != trans->transid); - del_ptr(root, path, 1, path->slots[1]); + ret = btrfs_del_ptr(trans, root, path, 1, path->slots[1]); + if (ret < 0) + return ret; /* * btrfs_free_extent is expensive, we want to make sure we @@ -4343,6 +4535,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, atomic_inc(&leaf->refs); btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1); free_extent_buffer_stale(leaf); + return 0; } /* * delete the item at the leaf level in path. If that empties @@ -4392,7 +4585,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_header_level(leaf, 0); } else { btrfs_clear_buffer_dirty(trans, leaf); - btrfs_del_leaf(trans, root, path, leaf); + ret = btrfs_del_leaf(trans, root, path, leaf); + if (ret < 0) + return ret; } } else { int used = leaf_space_used(leaf, 0, nritems); @@ -4416,7 +4611,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, /* push_leaf_left fixes the path. * make sure the path still points to our leaf - * for possible call to del_ptr below + * for possible call to btrfs_del_ptr below */ slot = path->slots[1]; atomic_inc(&leaf->refs); @@ -4453,7 +4648,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (btrfs_header_nritems(leaf) == 0) { path->slots[1] = slot; - btrfs_del_leaf(trans, root, path, leaf); + ret = btrfs_del_leaf(trans, root, path, leaf); + if (ret < 0) + return ret; free_extent_buffer(leaf); ret = 0; } else { @@ -4474,86 +4671,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, } /* - * search the tree again to find a leaf with lesser keys - * returns 0 if it found something or 1 if there are no lesser leaves. - * returns < 0 on io errors. - * - * This may release the path, and so you may lose any locks held at the - * time you call it. - */ -int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) -{ - struct btrfs_key key; - struct btrfs_key orig_key; - struct btrfs_disk_key found_key; - int ret; - - btrfs_item_key_to_cpu(path->nodes[0], &key, 0); - orig_key = key; - - if (key.offset > 0) { - key.offset--; - } else if (key.type > 0) { - key.type--; - key.offset = (u64)-1; - } else if (key.objectid > 0) { - key.objectid--; - key.type = (u8)-1; - key.offset = (u64)-1; - } else { - return 1; - } - - btrfs_release_path(path); - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret <= 0) - return ret; - - /* - * Previous key not found. Even if we were at slot 0 of the leaf we had - * before releasing the path and calling btrfs_search_slot(), we now may - * be in a slot pointing to the same original key - this can happen if - * after we released the path, one of more items were moved from a - * sibling leaf into the front of the leaf we had due to an insertion - * (see push_leaf_right()). - * If we hit this case and our slot is > 0 and just decrement the slot - * so that the caller does not process the same key again, which may or - * may not break the caller, depending on its logic. - */ - if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { - btrfs_item_key(path->nodes[0], &found_key, path->slots[0]); - ret = comp_keys(&found_key, &orig_key); - if (ret == 0) { - if (path->slots[0] > 0) { - path->slots[0]--; - return 0; - } - /* - * At slot 0, same key as before, it means orig_key is - * the lowest, leftmost, key in the tree. We're done. - */ - return 1; - } - } - - btrfs_item_key(path->nodes[0], &found_key, 0); - ret = comp_keys(&found_key, &key); - /* - * We might have had an item with the previous key in the tree right - * before we released our path. And after we released our path, that - * item might have been pushed to the first slot (0) of the leaf we - * were holding due to a tree balance. Alternatively, an item with the - * previous key can exist as the only element of a leaf (big fat item). - * Therefore account for these 2 cases, so that our callers (like - * btrfs_previous_item) don't miss an existing item with a key matching - * the previous key we computed above. - */ - if (ret <= 0) - return 0; - return 1; -} - -/* * A helper function to walk down the tree starting at min_key, and looking * for nodes or leaves that are have a minimum transaction id. * This is used by the btree defrag code, and tree logging diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4c1986cd5bed..f2d2b313bde5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -541,6 +541,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct extent_buffer **cow_ret, u64 new_root_objectid); int btrfs_block_can_be_shared(struct btrfs_root *root, struct extent_buffer *buf); +int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot); void btrfs_extend_item(struct btrfs_path *path, u32 data_size); void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end); int btrfs_split_item(struct btrfs_trans_handle *trans, @@ -633,7 +635,6 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, return btrfs_insert_empty_items(trans, root, path, &batch); } -int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); @@ -686,7 +687,7 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) { return btrfs_next_old_item(root, p, 0); } -int btrfs_leaf_free_space(struct extent_buffer *leaf); +int btrfs_leaf_free_space(const struct extent_buffer *leaf); static inline int is_fstree(u64 rootid) { @@ -702,6 +703,7 @@ static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; } +u16 btrfs_csum_type_size(u16 type); int btrfs_super_csum_size(const struct btrfs_super_block *s); const char *btrfs_super_csum_name(u16 csum_type); const char *btrfs_super_csum_driver(u16 csum_type); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 8065341d831a..f2ff4cbe8656 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -1040,7 +1040,8 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, clear_extent_bit(&inode->io_tree, start, start + len - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, cached_state); - set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state); + set_extent_bit(&inode->io_tree, start, start + len - 1, + EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state); /* Update the page status */ for (i = start_index - first_index; i <= last_index - first_index; i++) { diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 0b32432d7d56..6a13cf00218b 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -407,7 +407,6 @@ static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs, RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) list_del(&ref->add_list); - ref->in_tree = 0; btrfs_put_delayed_ref(ref); atomic_dec(&delayed_refs->num_entries); } @@ -507,6 +506,7 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head( { struct btrfs_delayed_ref_head *head; + lockdep_assert_held(&delayed_refs->lock); again: head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start, true); @@ -531,7 +531,7 @@ again: href_node); } - head->processing = 1; + head->processing = true; WARN_ON(delayed_refs->num_heads_ready == 0); delayed_refs->num_heads_ready--; delayed_refs->run_delayed_start = head->bytenr + @@ -549,31 +549,35 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, RB_CLEAR_NODE(&head->href_node); atomic_dec(&delayed_refs->num_entries); delayed_refs->num_heads--; - if (head->processing == 0) + if (!head->processing) delayed_refs->num_heads_ready--; } /* * Helper to insert the ref_node to the tail or merge with tail. * - * Return 0 for insert. - * Return >0 for merge. + * Return false if the ref was inserted. + * Return true if the ref was merged into an existing one (and therefore can be + * freed by the caller). */ -static int insert_delayed_ref(struct btrfs_delayed_ref_root *root, - struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *ref) +static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root, + struct btrfs_delayed_ref_head *href, + struct btrfs_delayed_ref_node *ref) { struct btrfs_delayed_ref_node *exist; int mod; - int ret = 0; spin_lock(&href->lock); exist = tree_insert(&href->ref_tree, ref); - if (!exist) - goto inserted; + if (!exist) { + if (ref->action == BTRFS_ADD_DELAYED_REF) + list_add_tail(&ref->add_list, &href->ref_add_list); + atomic_inc(&root->num_entries); + spin_unlock(&href->lock); + return false; + } /* Now we are sure we can merge */ - ret = 1; if (exist->action == ref->action) { mod = ref->ref_mod; } else { @@ -600,13 +604,7 @@ static int insert_delayed_ref(struct btrfs_delayed_ref_root *root, if (exist->ref_mod == 0) drop_delayed_ref(root, href, exist); spin_unlock(&href->lock); - return ret; -inserted: - if (ref->action == BTRFS_ADD_DELAYED_REF) - list_add_tail(&ref->add_list, &href->ref_add_list); - atomic_inc(&root->num_entries); - spin_unlock(&href->lock); - return ret; + return true; } /* @@ -699,34 +697,38 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, bool is_system) { int count_mod = 1; - int must_insert_reserved = 0; + bool must_insert_reserved = false; /* If reserved is provided, it must be a data extent. */ BUG_ON(!is_data && reserved); - /* - * The head node stores the sum of all the mods, so dropping a ref - * should drop the sum in the head node by one. - */ - if (action == BTRFS_UPDATE_DELAYED_HEAD) + switch (action) { + case BTRFS_UPDATE_DELAYED_HEAD: count_mod = 0; - else if (action == BTRFS_DROP_DELAYED_REF) + break; + case BTRFS_DROP_DELAYED_REF: + /* + * The head node stores the sum of all the mods, so dropping a ref + * should drop the sum in the head node by one. + */ count_mod = -1; - - /* - * BTRFS_ADD_DELAYED_EXTENT means that we need to update the reserved - * accounting when the extent is finally added, or if a later - * modification deletes the delayed ref without ever inserting the - * extent into the extent allocation tree. ref->must_insert_reserved - * is the flag used to record that accounting mods are required. - * - * Once we record must_insert_reserved, switch the action to - * BTRFS_ADD_DELAYED_REF because other special casing is not required. - */ - if (action == BTRFS_ADD_DELAYED_EXTENT) - must_insert_reserved = 1; - else - must_insert_reserved = 0; + break; + case BTRFS_ADD_DELAYED_EXTENT: + /* + * BTRFS_ADD_DELAYED_EXTENT means that we need to update the + * reserved accounting when the extent is finally added, or if a + * later modification deletes the delayed ref without ever + * inserting the extent into the extent allocation tree. + * ref->must_insert_reserved is the flag used to record that + * accounting mods are required. + * + * Once we record must_insert_reserved, switch the action to + * BTRFS_ADD_DELAYED_REF because other special casing is not + * required. + */ + must_insert_reserved = true; + break; + } refcount_set(&head_ref->refs, 1); head_ref->bytenr = bytenr; @@ -738,7 +740,7 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, head_ref->ref_tree = RB_ROOT_CACHED; INIT_LIST_HEAD(&head_ref->ref_add_list); RB_CLEAR_NODE(&head_ref->href_node); - head_ref->processing = 0; + head_ref->processing = false; head_ref->total_ref_mod = count_mod; spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); @@ -763,11 +765,11 @@ static noinline struct btrfs_delayed_ref_head * add_delayed_ref_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head_ref, struct btrfs_qgroup_extent_record *qrecord, - int action, int *qrecord_inserted_ret) + int action, bool *qrecord_inserted_ret) { struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_root *delayed_refs; - int qrecord_inserted = 0; + bool qrecord_inserted = false; delayed_refs = &trans->transaction->delayed_refs; @@ -777,7 +779,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, delayed_refs, qrecord)) kfree(qrecord); else - qrecord_inserted = 1; + qrecord_inserted = true; } trace_add_delayed_ref_head(trans->fs_info, head_ref, action); @@ -853,8 +855,6 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, ref->num_bytes = num_bytes; ref->ref_mod = 1; ref->action = action; - ref->is_head = 0; - ref->in_tree = 1; ref->seq = seq; ref->type = ref_type; RB_CLEAR_NODE(&ref->ref_node); @@ -875,11 +875,11 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; - int qrecord_inserted; + bool qrecord_inserted; bool is_system; + bool merged; int action = generic_ref->action; int level = generic_ref->tree_ref.level; - int ret; u64 bytenr = generic_ref->bytenr; u64 num_bytes = generic_ref->len; u64 parent = generic_ref->parent; @@ -935,7 +935,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, head_ref = add_delayed_ref_head(trans, head_ref, record, action, &qrecord_inserted); - ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node); + merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); /* @@ -947,7 +947,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, trace_add_delayed_tree_ref(fs_info, &ref->node, ref, action == BTRFS_ADD_DELAYED_EXTENT ? BTRFS_ADD_DELAYED_REF : action); - if (ret > 0) + if (merged) kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); if (qrecord_inserted) @@ -968,9 +968,9 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; - int qrecord_inserted; + bool qrecord_inserted; int action = generic_ref->action; - int ret; + bool merged; u64 bytenr = generic_ref->bytenr; u64 num_bytes = generic_ref->len; u64 parent = generic_ref->parent; @@ -1027,7 +1027,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, head_ref = add_delayed_ref_head(trans, head_ref, record, action, &qrecord_inserted); - ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node); + merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); /* @@ -1039,7 +1039,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref, action == BTRFS_ADD_DELAYED_EXTENT ? BTRFS_ADD_DELAYED_REF : action); - if (ret > 0) + if (merged) kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index b54261fe509b..b8e14b0ba5f1 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -48,9 +48,6 @@ struct btrfs_delayed_ref_node { unsigned int action:8; unsigned int type:8; - /* is this node still in the rbtree? */ - unsigned int is_head:1; - unsigned int in_tree:1; }; struct btrfs_delayed_extent_op { @@ -70,20 +67,26 @@ struct btrfs_delayed_extent_op { struct btrfs_delayed_ref_head { u64 bytenr; u64 num_bytes; - refcount_t refs; + /* + * For insertion into struct btrfs_delayed_ref_root::href_root. + * Keep it in the same cache line as 'bytenr' for more efficient + * searches in the rbtree. + */ + struct rb_node href_node; /* * the mutex is held while running the refs, and it is also * held when checking the sum of reference modifications. */ struct mutex mutex; + refcount_t refs; + + /* Protects 'ref_tree' and 'ref_add_list'. */ spinlock_t lock; struct rb_root_cached ref_tree; /* accumulate add BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */ struct list_head ref_add_list; - struct rb_node href_node; - struct btrfs_delayed_extent_op *extent_op; /* @@ -113,10 +116,10 @@ struct btrfs_delayed_ref_head { * we need to update the in ram accounting to properly reflect * the free has happened. */ - unsigned int must_insert_reserved:1; - unsigned int is_data:1; - unsigned int is_system:1; - unsigned int processing:1; + bool must_insert_reserved; + bool is_data; + bool is_system; + bool processing; }; struct btrfs_delayed_tree_ref { @@ -337,7 +340,7 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) { WARN_ON(refcount_read(&ref->refs) == 0); if (refcount_dec_and_test(&ref->refs)) { - WARN_ON(ref->in_tree); + WARN_ON(!RB_EMPTY_NODE(&ref->ref_node)); switch (ref->type) { case BTRFS_TREE_BLOCK_REF_KEY: case BTRFS_SHARED_BLOCK_REF_KEY: diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 78696d331639..5f10965fd72b 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -41,7 +41,7 @@ * All new writes will be written to both target and source devices, so even * if replace gets canceled, sources device still contains up-to-date data. * - * Location: handle_ops_on_dev_replace() from __btrfs_map_block() + * Location: handle_ops_on_dev_replace() from btrfs_map_block() * Start: btrfs_dev_replace_start() * End: btrfs_dev_replace_finishing() * Content: Latest data/metadata @@ -257,8 +257,8 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return -EINVAL; } - bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, - fs_info->bdev_holder); + bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE, + fs_info->bdev_holder, NULL); if (IS_ERR(bdev)) { btrfs_err(fs_info, "target device %s is invalid!", device_path); return PTR_ERR(bdev); @@ -315,7 +315,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, device->bdev = bdev; set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); - device->mode = FMODE_EXCL; + device->holder = fs_info->bdev_holder; device->dev_stats_valid = 1; set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); device->fs_devices = fs_devices; @@ -334,7 +334,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return 0; error: - blkdev_put(bdev, FMODE_EXCL); + blkdev_put(bdev, fs_info->bdev_holder); return ret; } @@ -795,8 +795,8 @@ static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev, while (!find_first_extent_bit(&srcdev->alloc_state, start, &found_start, &found_end, CHUNK_ALLOCATED, &cached_state)) { - ret = set_extent_bits(&tgtdev->alloc_state, found_start, - found_end, CHUNK_ALLOCATED); + ret = set_extent_bit(&tgtdev->alloc_state, found_start, + found_end, CHUNK_ALLOCATED, NULL); if (ret) break; start = found_end + 1; diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index a6d77fe41e1a..944a7340f6a4 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -73,6 +73,23 @@ static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, return &discard_ctl->discard_list[block_group->discard_index]; } +/* + * Determine if async discard should be running. + * + * @discard_ctl: discard control + * + * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. + */ +static bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl) +{ + struct btrfs_fs_info *fs_info = container_of(discard_ctl, + struct btrfs_fs_info, + discard_ctl); + + return (!(fs_info->sb->s_flags & SB_RDONLY) && + test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags)); +} + static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { @@ -545,23 +562,6 @@ static void btrfs_discard_workfn(struct work_struct *work) } /* - * Determine if async discard should be running. - * - * @discard_ctl: discard control - * - * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. - */ -bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl) -{ - struct btrfs_fs_info *fs_info = container_of(discard_ctl, - struct btrfs_fs_info, - discard_ctl); - - return (!(fs_info->sb->s_flags & SB_RDONLY) && - test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags)); -} - -/* * Recalculate the base delay. * * @discard_ctl: discard control diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h index 57b9202f427f..dddb0f9101ba 100644 --- a/fs/btrfs/discard.h +++ b/fs/btrfs/discard.h @@ -24,7 +24,6 @@ void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group); void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, bool override); -bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl); /* Update operations */ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index dabc79c1af1b..7513388b0567 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -60,15 +60,6 @@ BTRFS_SUPER_FLAG_METADUMP |\ BTRFS_SUPER_FLAG_METADUMP_V2) -static void btrfs_destroy_ordered_extents(struct btrfs_root *root); -static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, - struct btrfs_fs_info *fs_info); -static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root); -static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, - struct extent_io_tree *dirty_pages, - int mark); -static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, - struct extent_io_tree *pinned_extents); static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info); static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info); @@ -110,35 +101,27 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result) * detect blocks that either didn't get written at all or got written * in the wrong place. */ -static int verify_parent_transid(struct extent_io_tree *io_tree, - struct extent_buffer *eb, u64 parent_transid, - int atomic) +int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, int atomic) { - struct extent_state *cached_state = NULL; - int ret; + if (!extent_buffer_uptodate(eb)) + return 0; if (!parent_transid || btrfs_header_generation(eb) == parent_transid) - return 0; + return 1; if (atomic) return -EAGAIN; - lock_extent(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); - if (extent_buffer_uptodate(eb) && - btrfs_header_generation(eb) == parent_transid) { - ret = 0; - goto out; - } - btrfs_err_rl(eb->fs_info, + if (!extent_buffer_uptodate(eb) || + btrfs_header_generation(eb) != parent_transid) { + btrfs_err_rl(eb->fs_info, "parent transid verify failed on logical %llu mirror %u wanted %llu found %llu", eb->start, eb->read_mirror, parent_transid, btrfs_header_generation(eb)); - ret = 1; - clear_extent_buffer_uptodate(eb); -out: - unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, - &cached_state); - return ret; + clear_extent_buffer_uptodate(eb); + return 0; + } + return 1; } static bool btrfs_supported_super_csum(u16 csum_type) @@ -180,64 +163,6 @@ int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, return 0; } -int btrfs_verify_level_key(struct extent_buffer *eb, int level, - struct btrfs_key *first_key, u64 parent_transid) -{ - struct btrfs_fs_info *fs_info = eb->fs_info; - int found_level; - struct btrfs_key found_key; - int ret; - - found_level = btrfs_header_level(eb); - if (found_level != level) { - WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), - KERN_ERR "BTRFS: tree level check failed\n"); - btrfs_err(fs_info, -"tree level mismatch detected, bytenr=%llu level expected=%u has=%u", - eb->start, level, found_level); - return -EIO; - } - - if (!first_key) - return 0; - - /* - * For live tree block (new tree blocks in current transaction), - * we need proper lock context to avoid race, which is impossible here. - * So we only checks tree blocks which is read from disk, whose - * generation <= fs_info->last_trans_committed. - */ - if (btrfs_header_generation(eb) > fs_info->last_trans_committed) - return 0; - - /* We have @first_key, so this @eb must have at least one item */ - if (btrfs_header_nritems(eb) == 0) { - btrfs_err(fs_info, - "invalid tree nritems, bytenr=%llu nritems=0 expect >0", - eb->start); - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); - return -EUCLEAN; - } - - if (found_level) - btrfs_node_key_to_cpu(eb, &found_key, 0); - else - btrfs_item_key_to_cpu(eb, &found_key, 0); - ret = btrfs_comp_cpu_keys(first_key, &found_key); - - if (ret) { - WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), - KERN_ERR "BTRFS: tree first key check failed\n"); - btrfs_err(fs_info, -"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", - eb->start, parent_transid, first_key->objectid, - first_key->type, first_key->offset, - found_key.objectid, found_key.type, - found_key.offset); - } - return ret; -} - static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) { @@ -312,12 +237,34 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb, return ret; } -static int csum_one_extent_buffer(struct extent_buffer *eb) +/* + * Checksum a dirty tree block before IO. + */ +blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) { + struct extent_buffer *eb = bbio->private; struct btrfs_fs_info *fs_info = eb->fs_info; + u64 found_start = btrfs_header_bytenr(eb); u8 result[BTRFS_CSUM_SIZE]; int ret; + /* Btree blocks are always contiguous on disk. */ + if (WARN_ON_ONCE(bbio->file_offset != eb->start)) + return BLK_STS_IOERR; + if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len)) + return BLK_STS_IOERR; + + if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) { + WARN_ON_ONCE(found_start != 0); + return BLK_STS_OK; + } + + if (WARN_ON_ONCE(found_start != eb->start)) + return BLK_STS_IOERR; + if (WARN_ON(!btrfs_page_test_uptodate(fs_info, eb->pages[0], eb->start, + eb->len))) + return BLK_STS_IOERR; + ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, offsetof(struct btrfs_header, fsid), BTRFS_FSID_SIZE) == 0); @@ -326,7 +273,7 @@ static int csum_one_extent_buffer(struct extent_buffer *eb) if (btrfs_header_level(eb)) ret = btrfs_check_node(eb); else - ret = btrfs_check_leaf_full(eb); + ret = btrfs_check_leaf(eb); if (ret < 0) goto error; @@ -344,8 +291,7 @@ static int csum_one_extent_buffer(struct extent_buffer *eb) goto error; } write_extent_buffer(eb, result, 0, fs_info->csum_size); - - return 0; + return BLK_STS_OK; error: btrfs_print_tree(eb, 0); @@ -359,103 +305,10 @@ error: */ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) || btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID); - return ret; -} - -/* Checksum all dirty extent buffers in one bio_vec */ -static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info, - struct bio_vec *bvec) -{ - struct page *page = bvec->bv_page; - u64 bvec_start = page_offset(page) + bvec->bv_offset; - u64 cur; - int ret = 0; - - for (cur = bvec_start; cur < bvec_start + bvec->bv_len; - cur += fs_info->nodesize) { - struct extent_buffer *eb; - bool uptodate; - - eb = find_extent_buffer(fs_info, cur); - uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur, - fs_info->nodesize); - - /* A dirty eb shouldn't disappear from buffer_radix */ - if (WARN_ON(!eb)) - return -EUCLEAN; - - if (WARN_ON(cur != btrfs_header_bytenr(eb))) { - free_extent_buffer(eb); - return -EUCLEAN; - } - if (WARN_ON(!uptodate)) { - free_extent_buffer(eb); - return -EUCLEAN; - } - - ret = csum_one_extent_buffer(eb); - free_extent_buffer(eb); - if (ret < 0) - return ret; - } - return ret; -} - -/* - * Checksum a dirty tree block before IO. This has extra checks to make sure - * we only fill in the checksum field in the first page of a multi-page block. - * For subpage extent buffers we need bvec to also read the offset in the page. - */ -static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec) -{ - struct page *page = bvec->bv_page; - u64 start = page_offset(page); - u64 found_start; - struct extent_buffer *eb; - - if (fs_info->nodesize < PAGE_SIZE) - return csum_dirty_subpage_buffers(fs_info, bvec); - - eb = (struct extent_buffer *)page->private; - if (page != eb->pages[0]) - return 0; - - found_start = btrfs_header_bytenr(eb); - - if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) { - WARN_ON(found_start != 0); - return 0; - } - - /* - * Please do not consolidate these warnings into a single if. - * It is useful to know what went wrong. - */ - if (WARN_ON(found_start != start)) - return -EUCLEAN; - if (WARN_ON(!PageUptodate(page))) - return -EUCLEAN; - - return csum_one_extent_buffer(eb); -} - -blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) -{ - struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; - struct bvec_iter iter; - struct bio_vec bv; - int ret = 0; - - bio_for_each_segment(bv, &bbio->bio, iter) { - ret = csum_dirty_buffer(fs_info, &bv); - if (ret) - break; - } - return errno_to_blk_status(ret); } -static int check_tree_block_fsid(struct extent_buffer *eb) +static bool check_tree_block_fsid(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; @@ -475,18 +328,18 @@ static int check_tree_block_fsid(struct extent_buffer *eb) metadata_uuid = fs_devices->fsid; if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) - return 0; + return false; list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE)) - return 0; + return false; - return 1; + return true; } /* Do basic extent buffer checks at read time */ -static int validate_extent_buffer(struct extent_buffer *eb, - struct btrfs_tree_parent_check *check) +int btrfs_validate_extent_buffer(struct extent_buffer *eb, + struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; u64 found_start; @@ -583,7 +436,7 @@ static int validate_extent_buffer(struct extent_buffer *eb, * that we don't try and read the other copies of this block, just * return -EIO. */ - if (found_level == 0 && btrfs_check_leaf_full(eb)) { + if (found_level == 0 && btrfs_check_leaf(eb)) { set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = -EIO; } @@ -591,9 +444,7 @@ static int validate_extent_buffer(struct extent_buffer *eb, if (found_level > 0 && btrfs_check_node(eb)) ret = -EIO; - if (!ret) - set_extent_buffer_uptodate(eb); - else + if (ret) btrfs_err(fs_info, "read time tree block corruption detected on logical %llu mirror %u", eb->start, eb->read_mirror); @@ -601,105 +452,6 @@ out: return ret; } -static int validate_subpage_buffer(struct page *page, u64 start, u64 end, - int mirror, struct btrfs_tree_parent_check *check) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); - struct extent_buffer *eb; - bool reads_done; - int ret = 0; - - ASSERT(check); - - /* - * We don't allow bio merge for subpage metadata read, so we should - * only get one eb for each endio hook. - */ - ASSERT(end == start + fs_info->nodesize - 1); - ASSERT(PagePrivate(page)); - - eb = find_extent_buffer(fs_info, start); - /* - * When we are reading one tree block, eb must have been inserted into - * the radix tree. If not, something is wrong. - */ - ASSERT(eb); - - reads_done = atomic_dec_and_test(&eb->io_pages); - /* Subpage read must finish in page read */ - ASSERT(reads_done); - - eb->read_mirror = mirror; - if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) { - ret = -EIO; - goto err; - } - ret = validate_extent_buffer(eb, check); - if (ret < 0) - goto err; - - set_extent_buffer_uptodate(eb); - - free_extent_buffer(eb); - return ret; -err: - /* - * end_bio_extent_readpage decrements io_pages in case of error, - * make sure it has something to decrement. - */ - atomic_inc(&eb->io_pages); - clear_extent_buffer_uptodate(eb); - free_extent_buffer(eb); - return ret; -} - -int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, - struct page *page, u64 start, u64 end, - int mirror) -{ - struct extent_buffer *eb; - int ret = 0; - int reads_done; - - ASSERT(page->private); - - if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) - return validate_subpage_buffer(page, start, end, mirror, - &bbio->parent_check); - - eb = (struct extent_buffer *)page->private; - - /* - * The pending IO might have been the only thing that kept this buffer - * in memory. Make sure we have a ref for all this other checks - */ - atomic_inc(&eb->refs); - - reads_done = atomic_dec_and_test(&eb->io_pages); - if (!reads_done) - goto err; - - eb->read_mirror = mirror; - if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) { - ret = -EIO; - goto err; - } - ret = validate_extent_buffer(eb, &bbio->parent_check); -err: - if (ret) { - /* - * our io error hook is going to dec the io pages - * again, we have to make sure it has something - * to decrement - */ - atomic_inc(&eb->io_pages); - clear_extent_buffer_uptodate(eb); - } - free_extent_buffer(eb); - - return ret; -} - #ifdef CONFIG_MIGRATION static int btree_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) @@ -1396,8 +1148,7 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->fs_roots_radix_lock); root = radix_tree_lookup(&fs_info->fs_roots_radix, (unsigned long)root_id); - if (root) - root = btrfs_grab_root(root); + root = btrfs_grab_root(root); spin_unlock(&fs_info->fs_roots_radix_lock); return root; } @@ -1411,31 +1162,28 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info, .offset = 0, }; - if (objectid == BTRFS_ROOT_TREE_OBJECTID) + switch (objectid) { + case BTRFS_ROOT_TREE_OBJECTID: return btrfs_grab_root(fs_info->tree_root); - if (objectid == BTRFS_EXTENT_TREE_OBJECTID) + case BTRFS_EXTENT_TREE_OBJECTID: return btrfs_grab_root(btrfs_global_root(fs_info, &key)); - if (objectid == BTRFS_CHUNK_TREE_OBJECTID) + case BTRFS_CHUNK_TREE_OBJECTID: return btrfs_grab_root(fs_info->chunk_root); - if (objectid == BTRFS_DEV_TREE_OBJECTID) + case BTRFS_DEV_TREE_OBJECTID: return btrfs_grab_root(fs_info->dev_root); - if (objectid == BTRFS_CSUM_TREE_OBJECTID) + case BTRFS_CSUM_TREE_OBJECTID: + return btrfs_grab_root(btrfs_global_root(fs_info, &key)); + case BTRFS_QUOTA_TREE_OBJECTID: + return btrfs_grab_root(fs_info->quota_root); + case BTRFS_UUID_TREE_OBJECTID: + return btrfs_grab_root(fs_info->uuid_root); + case BTRFS_BLOCK_GROUP_TREE_OBJECTID: + return btrfs_grab_root(fs_info->block_group_root); + case BTRFS_FREE_SPACE_TREE_OBJECTID: return btrfs_grab_root(btrfs_global_root(fs_info, &key)); - if (objectid == BTRFS_QUOTA_TREE_OBJECTID) - return btrfs_grab_root(fs_info->quota_root) ? - fs_info->quota_root : ERR_PTR(-ENOENT); - if (objectid == BTRFS_UUID_TREE_OBJECTID) - return btrfs_grab_root(fs_info->uuid_root) ? - fs_info->uuid_root : ERR_PTR(-ENOENT); - if (objectid == BTRFS_BLOCK_GROUP_TREE_OBJECTID) - return btrfs_grab_root(fs_info->block_group_root) ? - fs_info->block_group_root : ERR_PTR(-ENOENT); - if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) { - struct btrfs_root *root = btrfs_global_root(fs_info, &key); - - return btrfs_grab_root(root) ? root : ERR_PTR(-ENOENT); - } - return NULL; + default: + return NULL; + } } int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, @@ -1991,7 +1739,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) { btrfs_destroy_workqueue(fs_info->fixup_workers); btrfs_destroy_workqueue(fs_info->delalloc_workers); - btrfs_destroy_workqueue(fs_info->hipri_workers); btrfs_destroy_workqueue(fs_info->workers); if (fs_info->endio_workers) destroy_workqueue(fs_info->endio_workers); @@ -2183,12 +1930,10 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) { u32 max_active = fs_info->thread_pool_size; unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; + unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE; fs_info->workers = btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16); - fs_info->hipri_workers = - btrfs_alloc_workqueue(fs_info, "worker-high", - flags | WQ_HIGHPRI, max_active, 16); fs_info->delalloc_workers = btrfs_alloc_workqueue(fs_info, "delalloc", @@ -2202,7 +1947,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0); fs_info->fixup_workers = - btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0); + btrfs_alloc_ordered_workqueue(fs_info, "fixup", ordered_flags); fs_info->endio_workers = alloc_workqueue("btrfs-endio", flags, max_active); @@ -2221,11 +1966,12 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) btrfs_alloc_workqueue(fs_info, "delayed-meta", flags, max_active, 0); fs_info->qgroup_rescan_workers = - btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0); + btrfs_alloc_ordered_workqueue(fs_info, "qgroup-rescan", + ordered_flags); fs_info->discard_ctl.discard_workers = - alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1); + alloc_ordered_workqueue("btrfs_discard", WQ_FREEZABLE); - if (!(fs_info->workers && fs_info->hipri_workers && + if (!(fs_info->workers && fs_info->delalloc_workers && fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && fs_info->compressed_write_workers && @@ -2265,6 +2011,9 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) if (!strstr(crypto_shash_driver_name(csum_shash), "generic")) set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); break; + case BTRFS_CSUM_TYPE_XXHASH: + set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); + break; default: break; } @@ -2642,6 +2391,14 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info, ret = -EINVAL; } + if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid, + BTRFS_FSID_SIZE) != 0) { + btrfs_err(fs_info, + "dev_item UUID does not match metadata fsid: %pU != %pU", + fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid); + ret = -EINVAL; + } + /* * Artificial requirement for block-group-tree to force newer features * (free-space-tree, no-holes) so the test matrix is smaller. @@ -2654,14 +2411,6 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info, ret = -EINVAL; } - if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid, - BTRFS_FSID_SIZE) != 0) { - btrfs_err(fs_info, - "dev_item UUID does not match metadata fsid: %pU != %pU", - fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid); - ret = -EINVAL; - } - /* * Hint to catch really bogus numbers, bitflips or so, more exact checks are * done later @@ -4662,28 +4411,10 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_close_devices(fs_info->fs_devices); } -int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, - int atomic) -{ - int ret; - struct inode *btree_inode = buf->pages[0]->mapping->host; - - ret = extent_buffer_uptodate(buf); - if (!ret) - return ret; - - ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf, - parent_transid, atomic); - if (ret == -EAGAIN) - return ret; - return !ret; -} - void btrfs_mark_buffer_dirty(struct extent_buffer *buf) { struct btrfs_fs_info *fs_info = buf->fs_info; u64 transid = btrfs_header_generation(buf); - int was_dirty; #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* @@ -4698,19 +4429,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) if (transid != fs_info->generation) WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n", buf->start, transid, fs_info->generation); - was_dirty = set_extent_buffer_dirty(buf); - if (!was_dirty) - percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, - buf->len, - fs_info->dirty_metadata_batch); + set_extent_buffer_dirty(buf); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY /* - * Since btrfs_mark_buffer_dirty() can be called with item pointer set - * but item data not updated. - * So here we should only check item pointers, not item data. + * btrfs_check_leaf() won't check item data if we don't have WRITTEN + * set, so this will only validate the basic structure of the items. */ - if (btrfs_header_level(buf) == 0 && - btrfs_check_leaf_relaxed(buf)) { + if (btrfs_header_level(buf) == 0 && btrfs_check_leaf(buf)) { btrfs_print_leaf(buf); ASSERT(0); } @@ -4840,13 +4565,12 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } -static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, - struct btrfs_fs_info *fs_info) +static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_fs_info *fs_info) { struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; - int ret = 0; delayed_refs = &trans->delayed_refs; @@ -4854,7 +4578,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, if (atomic_read(&delayed_refs->num_entries) == 0) { spin_unlock(&delayed_refs->lock); btrfs_debug(fs_info, "delayed_refs has NO entry"); - return ret; + return; } while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) { @@ -4871,7 +4595,6 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, while ((n = rb_first_cached(&head->ref_tree)) != NULL) { ref = rb_entry(n, struct btrfs_delayed_ref_node, ref_node); - ref->in_tree = 0; rb_erase_cached(&ref->ref_node, &head->ref_tree); RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) @@ -4916,8 +4639,6 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, btrfs_qgroup_destroy_extent_records(trans); spin_unlock(&delayed_refs->lock); - - return ret; } static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) @@ -5142,8 +4863,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, EXTENT_DIRTY); btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents); - btrfs_free_redirty_list(cur_trans); - cur_trans->state =TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 4d5772330110..b03767f4d7ed 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -31,8 +31,6 @@ struct btrfs_tree_parent_check; void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info); void btrfs_init_fs_info(struct btrfs_fs_info *fs_info); -int btrfs_verify_level_key(struct extent_buffer *eb, int level, - struct btrfs_key *first_key, u64 parent_transid); struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, struct btrfs_tree_parent_check *check); struct extent_buffer *btrfs_find_create_tree_block( @@ -84,9 +82,8 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); -int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, - struct page *page, u64 start, u64 end, - int mirror); +int btrfs_validate_extent_buffer(struct extent_buffer *eb, + struct btrfs_tree_parent_check *check); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 29a225836e28..a2315a4b8b75 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -533,6 +533,16 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, } /* + * Detect if extent bits request NOWAIT semantics and set the gfp mask accordingly, + * unset the EXTENT_NOWAIT bit. + */ +static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask) +{ + *mask = (*bits & EXTENT_NOWAIT ? GFP_NOWAIT : GFP_NOFS); + *bits &= EXTENT_NOWAIT - 1; +} + +/* * Clear some bits on a range in the tree. This may require splitting or * inserting elements in the tree, so the gfp mask is used to indicate which * allocations or sleeping are allowed. @@ -546,7 +556,7 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, */ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_state **cached_state, - gfp_t mask, struct extent_changeset *changeset) + struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *cached; @@ -556,7 +566,9 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int clear = 0; int wake; int delete = (bits & EXTENT_CLEAR_ALL_BITS); + gfp_t mask; + set_gfp_mask_from_bits(&bits, &mask); btrfs_debug_check_extent_io_range(tree, start, end); trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits); @@ -953,7 +965,8 @@ out: /* * Set some bits on a range in the tree. This may require allocations or - * sleeping, so the gfp mask is used to indicate what is allowed. + * sleeping. By default all allocations use GFP_NOFS, use EXTENT_NOWAIT for + * GFP_NOWAIT. * * If any of the exclusive bits are set, this will fail with -EEXIST if some * part of the range already has the desired bits set. The extent_state of the @@ -968,7 +981,7 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, u64 *failed_start, struct extent_state **failed_state, struct extent_state **cached_state, - struct extent_changeset *changeset, gfp_t mask) + struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -978,7 +991,9 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u64 last_start; u64 last_end; u32 exclusive_bits = (bits & EXTENT_LOCKED); + gfp_t mask; + set_gfp_mask_from_bits(&bits, &mask); btrfs_debug_check_extent_io_range(tree, start, end); trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits); @@ -1188,10 +1203,10 @@ out: } int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_state **cached_state, gfp_t mask) + u32 bits, struct extent_state **cached_state) { return __set_extent_bit(tree, start, end, bits, NULL, NULL, - cached_state, NULL, mask); + cached_state, NULL); } /* @@ -1687,8 +1702,7 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ ASSERT(!(bits & EXTENT_LOCKED)); - return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, - changeset, GFP_NOFS); + return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset); } int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, @@ -1700,8 +1714,7 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ ASSERT(!(bits & EXTENT_LOCKED)); - return __clear_extent_bit(tree, start, end, bits, NULL, GFP_NOFS, - changeset); + return __clear_extent_bit(tree, start, end, bits, NULL, changeset); } int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, @@ -1711,7 +1724,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u64 failed_start; err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, - NULL, cached, NULL, GFP_NOFS); + NULL, cached, NULL); if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, @@ -1733,7 +1746,7 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u64 failed_start; err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, - &failed_state, cached_state, NULL, GFP_NOFS); + &failed_state, cached_state, NULL); while (err == -EEXIST) { if (failed_start != start) clear_extent_bit(tree, start, failed_start - 1, @@ -1743,7 +1756,7 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, &failed_state); err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, &failed_state, - cached_state, NULL, GFP_NOFS); + cached_state, NULL); } return err; } diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 21766e49ec02..fbd3b275ab1c 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -43,6 +43,15 @@ enum { * want the extent states to go away. */ ENUM_BIT(EXTENT_CLEAR_ALL_BITS), + + /* + * This must be last. + * + * Bit not representing a state but a request for NOWAIT semantics, + * e.g. when allocating memory, and must be masked out from the other + * bits. + */ + ENUM_BIT(EXTENT_NOWAIT) }; #define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ @@ -127,22 +136,20 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset); int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_state **cached, gfp_t mask, + u32 bits, struct extent_state **cached, struct extent_changeset *changeset); static inline int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_state **cached) { - return __clear_extent_bit(tree, start, end, bits, cached, - GFP_NOFS, NULL); + return __clear_extent_bit(tree, start, end, bits, cached, NULL); } static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached) { - return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached, - GFP_NOFS, NULL); + return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached, NULL); } static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, @@ -154,31 +161,13 @@ static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_state **cached_state, gfp_t mask); - -static inline int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, - u64 end, u32 bits) -{ - return set_extent_bit(tree, start, end, bits, NULL, GFP_NOWAIT); -} - -static inline int set_extent_bits(struct extent_io_tree *tree, u64 start, - u64 end, u32 bits) -{ - return set_extent_bit(tree, start, end, bits, NULL, GFP_NOFS); -} + u32 bits, struct extent_state **cached_state); static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state) { return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, - cached_state, GFP_NOFS, NULL); -} - -static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start, - u64 end, gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, mask); + cached_state, NULL); } static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, @@ -193,29 +182,6 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, u32 clear_bits, struct extent_state **cached_state); -static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start, - u64 end, u32 extra_bits, - struct extent_state **cached_state) -{ - return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | extra_bits, - cached_state, GFP_NOFS); -} - -static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state) -{ - return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_DEFRAG, - cached_state, GFP_NOFS); -} - -static inline int set_extent_new(struct extent_io_tree *tree, u64 start, - u64 end) -{ - return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, GFP_NOFS); -} - int find_first_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, u32 bits, struct extent_state **cached_state); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5cd289de4e92..911908ea5f6f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -73,8 +73,8 @@ int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, u64 start, u64 num_bytes) { u64 end = start + num_bytes - 1; - set_extent_bits(&fs_info->excluded_extents, start, end, - EXTENT_UPTODATE); + set_extent_bit(&fs_info->excluded_extents, start, end, + EXTENT_UPTODATE, NULL); return 0; } @@ -402,7 +402,7 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, } } - btrfs_print_leaf((struct extent_buffer *)eb); + btrfs_print_leaf(eb); btrfs_err(eb->fs_info, "eb %llu iref 0x%lx invalid extent inline ref type %d", eb->start, (unsigned long)iref, type); @@ -1164,15 +1164,10 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, * should not happen at all. */ if (owner < BTRFS_FIRST_FREE_OBJECTID) { + btrfs_print_leaf(path->nodes[0]); btrfs_crit(trans->fs_info, -"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu", - bytenr, num_bytes, root_objectid); - if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) { - WARN_ON(1); - btrfs_crit(trans->fs_info, - "path->slots[0]=%d path->nodes[0]:", path->slots[0]); - btrfs_print_leaf(path->nodes[0]); - } +"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u", + bytenr, num_bytes, root_objectid, path->slots[0]); return -EUCLEAN; } update_inline_extent_backref(path, iref, refs_to_add, extent_op); @@ -1208,11 +1203,11 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, { int j, ret = 0; u64 bytes_left, end; - u64 aligned_start = ALIGN(start, 1 << 9); + u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT); if (WARN_ON(start != aligned_start)) { len -= aligned_start - start; - len = round_down(len, 1 << 9); + len = round_down(len, 1 << SECTOR_SHIFT); start = aligned_start; } @@ -1250,7 +1245,8 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, } if (size) { - ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, + ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, + size >> SECTOR_SHIFT, GFP_NOFS); if (!ret) *discarded_bytes += size; @@ -1267,7 +1263,8 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, } if (bytes_left) { - ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, + ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, + bytes_left >> SECTOR_SHIFT, GFP_NOFS); if (!ret) *discarded_bytes += bytes_left; @@ -1500,7 +1497,7 @@ out: static int run_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, - int insert_reserved) + bool insert_reserved) { int ret = 0; struct btrfs_delayed_data_ref *ref; @@ -1650,7 +1647,7 @@ out: static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, - int insert_reserved) + bool insert_reserved) { int ret = 0; struct btrfs_delayed_tree_ref *ref; @@ -1690,7 +1687,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, static int run_one_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, - int insert_reserved) + bool insert_reserved) { int ret = 0; @@ -1748,7 +1745,7 @@ static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_ref struct btrfs_delayed_ref_head *head) { spin_lock(&delayed_refs->lock); - head->processing = 0; + head->processing = false; delayed_refs->num_heads_ready++; spin_unlock(&delayed_refs->lock); btrfs_delayed_ref_unlock(head); @@ -1900,7 +1897,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_extent_op *extent_op; struct btrfs_delayed_ref_node *ref; - int must_insert_reserved = 0; + bool must_insert_reserved; int ret; delayed_refs = &trans->transaction->delayed_refs; @@ -1916,7 +1913,6 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, return -EAGAIN; } - ref->in_tree = 0; rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree); RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) @@ -1943,7 +1939,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, * spin lock. */ must_insert_reserved = locked_ref->must_insert_reserved; - locked_ref->must_insert_reserved = 0; + locked_ref->must_insert_reserved = false; extent_op = locked_ref->extent_op; locked_ref->extent_op = NULL; @@ -2155,10 +2151,10 @@ out: } int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - struct extent_buffer *eb, u64 flags, - int level) + struct extent_buffer *eb, u64 flags) { struct btrfs_delayed_extent_op *extent_op; + int level = btrfs_header_level(eb); int ret; extent_op = btrfs_alloc_delayed_extent_op(); @@ -2510,8 +2506,8 @@ static int pin_down_extent(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - set_extent_dirty(&trans->transaction->pinned_extents, bytenr, - bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); + set_extent_bit(&trans->transaction->pinned_extents, bytenr, + bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); return 0; } @@ -2838,6 +2834,13 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans, return ret; } +#define abort_and_dump(trans, path, fmt, args...) \ +({ \ + btrfs_abort_transaction(trans, -EUCLEAN); \ + btrfs_print_leaf(path->nodes[0]); \ + btrfs_crit(trans->fs_info, fmt, ##args); \ +}) + /* * Drop one or more refs of @node. * @@ -2978,10 +2981,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (!found_extent) { if (iref) { - btrfs_crit(info, -"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref"); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + abort_and_dump(trans, path, +"invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref", + path->slots[0]); + ret = -EUCLEAN; + goto out; } /* Must be SHARED_* item, remove the backref first */ ret = remove_extent_backref(trans, extent_root, path, @@ -3029,11 +3033,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } if (ret) { - btrfs_err(info, - "umm, got %d back from search, was looking for %llu", - ret, bytenr); if (ret > 0) btrfs_print_leaf(path->nodes[0]); + btrfs_err(info, + "umm, got %d back from search, was looking for %llu, slot %d", + ret, bytenr, path->slots[0]); } if (ret < 0) { btrfs_abort_transaction(trans, ret); @@ -3042,12 +3046,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, extent_slot = path->slots[0]; } } else if (WARN_ON(ret == -ENOENT)) { - btrfs_print_leaf(path->nodes[0]); - btrfs_err(info, - "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", - bytenr, parent, root_objectid, owner_objectid, - owner_offset); - btrfs_abort_transaction(trans, ret); + abort_and_dump(trans, path, +"unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d", + bytenr, parent, root_objectid, owner_objectid, + owner_offset, path->slots[0]); goto out; } else { btrfs_abort_transaction(trans, ret); @@ -3067,14 +3069,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && key.type == BTRFS_EXTENT_ITEM_KEY) { struct btrfs_tree_block_info *bi; + if (item_size < sizeof(*ei) + sizeof(*bi)) { - btrfs_crit(info, -"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %zu", - key.objectid, key.type, key.offset, - owner_objectid, item_size, - sizeof(*ei) + sizeof(*bi)); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + abort_and_dump(trans, path, +"invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu", + key.objectid, key.type, key.offset, + path->slots[0], owner_objectid, item_size, + sizeof(*ei) + sizeof(*bi)); + ret = -EUCLEAN; + goto out; } bi = (struct btrfs_tree_block_info *)(ei + 1); WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); @@ -3082,11 +3085,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, refs = btrfs_extent_refs(leaf, ei); if (refs < refs_to_drop) { - btrfs_crit(info, - "trying to drop %d refs but we only have %llu for bytenr %llu", - refs_to_drop, refs, bytenr); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + abort_and_dump(trans, path, + "trying to drop %d refs but we only have %llu for bytenr %llu slot %u", + refs_to_drop, refs, bytenr, path->slots[0]); + ret = -EUCLEAN; + goto out; } refs -= refs_to_drop; @@ -3099,10 +3102,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, */ if (iref) { if (!found_extent) { - btrfs_crit(info, -"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found"); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + abort_and_dump(trans, path, +"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u", + path->slots[0]); + ret = -EUCLEAN; + goto out; } } else { btrfs_set_extent_refs(leaf, ei, refs); @@ -3121,21 +3125,21 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (found_extent) { if (is_data && refs_to_drop != extent_data_ref_count(path, iref)) { - btrfs_crit(info, - "invalid refs_to_drop, current refs %u refs_to_drop %u", - extent_data_ref_count(path, iref), - refs_to_drop); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + abort_and_dump(trans, path, + "invalid refs_to_drop, current refs %u refs_to_drop %u slot %u", + extent_data_ref_count(path, iref), + refs_to_drop, path->slots[0]); + ret = -EUCLEAN; + goto out; } if (iref) { if (path->slots[0] != extent_slot) { - btrfs_crit(info, -"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref", - key.objectid, key.type, - key.offset); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + abort_and_dump(trans, path, +"invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref", + key.objectid, key.type, + key.offset, path->slots[0]); + ret = -EUCLEAN; + goto out; } } else { /* @@ -3145,10 +3149,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ] */ if (path->slots[0] != extent_slot + 1) { - btrfs_crit(info, - "invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM"); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + abort_and_dump(trans, path, + "invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM", + path->slots[0]); + ret = -EUCLEAN; + goto out; } path->slots[0] = extent_slot; num_to_del = 2; @@ -3170,19 +3175,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); return ret; -err_dump: - /* - * Leaf dump can take up a lot of log buffer, so we only do full leaf - * dump for debug build. - */ - if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) { - btrfs_crit(info, "path->slots[0]=%d extent_slot=%d", - path->slots[0], extent_slot); - btrfs_print_leaf(path->nodes[0]); - } - - btrfs_free_path(path); - return -EUCLEAN; } /* @@ -3219,7 +3211,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, goto out; btrfs_delete_ref_head(delayed_refs, head); - head->processing = 0; + head->processing = false; spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); @@ -4804,7 +4796,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state)) lockdep_owner = BTRFS_FS_TREE_OBJECTID; - /* btrfs_clean_tree_block() accesses generation field. */ + /* btrfs_clear_buffer_dirty() accesses generation field. */ btrfs_set_header_generation(buf, trans->transid); /* @@ -4836,15 +4828,17 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, * EXTENT bit to differentiate dirty pages. */ if (buf->log_index == 0) - set_extent_dirty(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1, GFP_NOFS); + set_extent_bit(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, + EXTENT_DIRTY, NULL); else - set_extent_new(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1); + set_extent_bit(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, + EXTENT_NEW, NULL); } else { buf->log_index = -1; - set_extent_dirty(&trans->transaction->dirty_pages, buf->start, - buf->start + buf->len - 1, GFP_NOFS); + set_extent_bit(&trans->transaction->dirty_pages, buf->start, + buf->start + buf->len - 1, EXTENT_DIRTY, NULL); } /* this returns a buffer locked for blocking */ return buf; @@ -5102,8 +5096,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, BUG_ON(ret); /* -ENOMEM */ ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_set_disk_extent_flags(trans, eb, flag, - btrfs_header_level(eb)); + ret = btrfs_set_disk_extent_flags(trans, eb, flag); BUG_ON(ret); /* -ENOMEM */ wc->flags[level] |= flag; } @@ -5985,9 +5978,8 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) ret = btrfs_issue_discard(device->bdev, start, len, &bytes); if (!ret) - set_extent_bits(&device->alloc_state, start, - start + bytes - 1, - CHUNK_TRIMMED); + set_extent_bit(&device->alloc_state, start, + start + bytes - 1, CHUNK_TRIMMED, NULL); mutex_unlock(&fs_info->chunk_mutex); if (ret) diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 0c958fc1b3b8..429d5c570061 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -141,7 +141,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - struct extent_buffer *eb, u64 flags, int level); + struct extent_buffer *eb, u64 flags); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a1adadd5d25d..a91d5ad27984 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -98,33 +98,16 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) */ struct btrfs_bio_ctrl { struct btrfs_bio *bbio; - int mirror_num; enum btrfs_compression_type compress_type; u32 len_to_oe_boundary; blk_opf_t opf; btrfs_bio_end_io_t end_io_func; struct writeback_control *wbc; - - /* - * This is for metadata read, to provide the extra needed verification - * info. This has to be provided for submit_one_bio(), as - * submit_one_bio() can submit a bio if it ends at stripe boundary. If - * no such parent_check is provided, the metadata can hit false alert at - * endio time. - */ - struct btrfs_tree_parent_check *parent_check; - - /* - * Tell writepage not to lock the state bits for this range, it still - * does the unlocking. - */ - bool extent_locked; }; static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_bio *bbio = bio_ctrl->bbio; - int mirror_num = bio_ctrl->mirror_num; if (!bbio) return; @@ -132,25 +115,11 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) /* Caller should ensure the bio has at least some range added */ ASSERT(bbio->bio.bi_iter.bi_size); - if (!is_data_inode(&bbio->inode->vfs_inode)) { - if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) { - /* - * For metadata read, we should have the parent_check, - * and copy it to bbio for metadata verification. - */ - ASSERT(bio_ctrl->parent_check); - memcpy(&bbio->parent_check, - bio_ctrl->parent_check, - sizeof(struct btrfs_tree_parent_check)); - } - bbio->bio.bi_opf |= REQ_META; - } - if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ && bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) - btrfs_submit_compressed_read(bbio, mirror_num); + btrfs_submit_compressed_read(bbio); else - btrfs_submit_bio(bbio, mirror_num); + btrfs_submit_bio(bbio, 0); /* The bbio is owned by the end_io handler now */ bio_ctrl->bbio = NULL; @@ -248,8 +217,6 @@ static int process_one_page(struct btrfs_fs_info *fs_info, if (page_ops & PAGE_SET_ORDERED) btrfs_page_clamp_set_ordered(fs_info, page, start, len); - if (page_ops & PAGE_SET_ERROR) - btrfs_page_clamp_set_error(fs_info, page, start, len); if (page_ops & PAGE_START_WRITEBACK) { btrfs_page_clamp_clear_dirty(fs_info, page, start, len); btrfs_page_clamp_set_writeback(fs_info, page, start, len); @@ -295,9 +262,6 @@ static int __process_pages_contig(struct address_space *mapping, ASSERT(processed_end && *processed_end == start); } - if ((page_ops & PAGE_SET_ERROR) && start_index <= end_index) - mapping_set_error(mapping, -EIO); - folio_batch_init(&fbatch); while (index <= end_index) { int found_folios; @@ -506,6 +470,15 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, start, end, page_ops, NULL); } +static bool btrfs_verify_page(struct page *page, u64 start) +{ + if (!fsverity_active(page->mapping->host) || + PageUptodate(page) || + start >= i_size_read(page->mapping->host)) + return true; + return fsverity_verify_page(page); +} + static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); @@ -513,20 +486,10 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) ASSERT(page_offset(page) <= start && start + len <= page_offset(page) + PAGE_SIZE); - if (uptodate) { - if (fsverity_active(page->mapping->host) && - !PageError(page) && - !PageUptodate(page) && - start < i_size_read(page->mapping->host) && - !fsverity_verify_page(page)) { - btrfs_page_set_error(fs_info, page, start, len); - } else { - btrfs_page_set_uptodate(fs_info, page, start, len); - } - } else { + if (uptodate && btrfs_verify_page(page, start)) + btrfs_page_set_uptodate(fs_info, page, start, len); + else btrfs_page_clear_uptodate(fs_info, page, start, len); - btrfs_page_set_error(fs_info, page, start, len); - } if (!btrfs_is_subpage(fs_info, page)) unlock_page(page); @@ -554,7 +517,6 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) len = end + 1 - start; btrfs_page_clear_uptodate(fs_info, page, start, len); - btrfs_page_set_error(fs_info, page, start, len); ret = err < 0 ? err : -EIO; mapping_set_error(page->mapping, ret); } @@ -574,8 +536,6 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) struct bio *bio = &bbio->bio; int error = blk_status_to_errno(bio->bi_status); struct bio_vec *bvec; - u64 start; - u64 end; struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); @@ -584,6 +544,8 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u32 sectorsize = fs_info->sectorsize; + u64 start = page_offset(page) + bvec->bv_offset; + u32 len = bvec->bv_len; /* Our read/write should always be sector aligned. */ if (!IS_ALIGNED(bvec->bv_offset, sectorsize)) @@ -595,12 +557,12 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) "incomplete page write with offset %u and length %u", bvec->bv_offset, bvec->bv_len); - start = page_offset(page) + bvec->bv_offset; - end = start + bvec->bv_len - 1; - - end_extent_writepage(page, error, start, end); - - btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len); + btrfs_finish_ordered_extent(bbio->ordered, page, start, len, !error); + if (error) { + btrfs_page_clear_uptodate(fs_info, page, start, len); + mapping_set_error(page->mapping, error); + } + btrfs_page_clear_writeback(fs_info, page, start, len); } bio_put(bio); @@ -686,35 +648,6 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) } /* - * Find extent buffer for a givne bytenr. - * - * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking - * in endio context. - */ -static struct extent_buffer *find_extent_buffer_readpage( - struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) -{ - struct extent_buffer *eb; - - /* - * For regular sectorsize, we can use page->private to grab extent - * buffer - */ - if (fs_info->nodesize >= PAGE_SIZE) { - ASSERT(PagePrivate(page) && page->private); - return (struct extent_buffer *)page->private; - } - - /* For subpage case, we need to lookup buffer radix tree */ - rcu_read_lock(); - eb = radix_tree_lookup(&fs_info->buffer_radix, - bytenr >> fs_info->sectorsize_bits); - rcu_read_unlock(); - ASSERT(eb); - return eb; -} - -/* * after a readpage IO is done, we need to: * clear the uptodate bits on error * set the uptodate bits if things worked @@ -735,7 +668,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) * larger than UINT_MAX, u32 here is enough. */ u32 bio_offset = 0; - int mirror; struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); @@ -775,11 +707,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) end = start + bvec->bv_len - 1; len = bvec->bv_len; - mirror = bbio->mirror_num; - if (uptodate && !is_data_inode(inode) && - btrfs_validate_metadata_buffer(bbio, page, start, end, mirror)) - uptodate = false; - if (likely(uptodate)) { loff_t i_size = i_size_read(inode); pgoff_t end_index = i_size >> PAGE_SHIFT; @@ -800,19 +727,12 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) zero_user_segment(page, zero_start, offset_in_page(end) + 1); } - } else if (!is_data_inode(inode)) { - struct extent_buffer *eb; - - eb = find_extent_buffer_readpage(fs_info, page, start); - set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); - eb->read_mirror = mirror; - atomic_dec(&eb->io_pages); } /* Update page status and unlock. */ end_page_read(page, uptodate, start, len); endio_readpage_release_extent(&processed, BTRFS_I(inode), - start, end, PageUptodate(page)); + start, end, uptodate); ASSERT(bio_offset + len > bio_offset); bio_offset += len; @@ -906,13 +826,8 @@ static void alloc_new_bio(struct btrfs_inode *inode, bio_ctrl->bbio = bbio; bio_ctrl->len_to_oe_boundary = U32_MAX; - /* - * Limit the extent to the ordered boundary for Zone Append. - * Compressed bios aren't submitted directly, so it doesn't apply to - * them. - */ - if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE && - btrfs_use_zone_append(bbio)) { + /* Limit data write bios to the ordered boundary. */ + if (bio_ctrl->wbc) { struct btrfs_ordered_extent *ordered; ordered = btrfs_lookup_ordered_extent(inode, file_offset); @@ -920,11 +835,9 @@ static void alloc_new_bio(struct btrfs_inode *inode, bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, ordered->file_offset + ordered->disk_num_bytes - file_offset); - btrfs_put_ordered_extent(ordered); + bbio->ordered = ordered; } - } - if (bio_ctrl->wbc) { /* * Pick the last added device to support cgroup writeback. For * multi-device file systems this means blk-cgroup policies have @@ -1125,7 +1038,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, ret = set_page_extent_mapped(page); if (ret < 0) { unlock_extent(tree, start, end, NULL); - btrfs_page_set_error(fs_info, page, start, PAGE_SIZE); unlock_page(page); return ret; } @@ -1329,11 +1241,9 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, } ret = btrfs_run_delalloc_range(inode, page, delalloc_start, delalloc_end, &page_started, &nr_written, wbc); - if (ret) { - btrfs_page_set_error(inode->root->fs_info, page, - page_offset(page), PAGE_SIZE); + if (ret) return ret; - } + /* * delalloc_end is already one less than the total length, so * we don't subtract one from PAGE_SIZE @@ -1438,7 +1348,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, struct extent_map *em; int ret = 0; int nr = 0; - bool compressed; ret = btrfs_writepage_cow_fixup(page); if (ret) { @@ -1448,12 +1357,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, return 1; } - /* - * we don't want to touch the inode after unlocking the page, - * so we update the mapping writeback index now - */ - bio_ctrl->wbc->nr_to_write--; - bio_ctrl->end_io_func = end_bio_extent_writepage; while (cur <= end) { u64 disk_bytenr; @@ -1486,7 +1389,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1); if (IS_ERR(em)) { - btrfs_page_set_error(fs_info, page, cur, end - cur + 1); ret = PTR_ERR_OR_ZERO(em); goto out_error; } @@ -1497,10 +1399,14 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, ASSERT(cur < end); ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize)); ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize)); + block_start = em->block_start; - compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); disk_bytenr = em->block_start + extent_offset; + ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); + ASSERT(block_start != EXTENT_MAP_HOLE); + ASSERT(block_start != EXTENT_MAP_INLINE); + /* * Note that em_end from extent_map_end() and dirty_range_end from * find_next_dirty_byte() are all exclusive @@ -1509,22 +1415,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, free_extent_map(em); em = NULL; - /* - * compressed and inline extents are written through other - * paths in the FS - */ - if (compressed || block_start == EXTENT_MAP_HOLE || - block_start == EXTENT_MAP_INLINE) { - if (compressed) - nr++; - else - btrfs_writepage_endio_finish_ordered(inode, - page, cur, cur + iosize - 1, true); - btrfs_page_clear_dirty(fs_info, page, cur, iosize); - cur += iosize; - continue; - } - btrfs_set_range_writeback(inode, cur, cur + iosize - 1); if (!PageWriteback(page)) { btrfs_err(inode->root->fs_info, @@ -1572,7 +1462,6 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl { struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u64 page_start = page_offset(page); const u64 page_end = page_start + PAGE_SIZE - 1; int ret; @@ -1585,9 +1474,6 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl WARN_ON(!PageLocked(page)); - btrfs_page_clear_error(btrfs_sb(inode->i_sb), page, - page_offset(page), PAGE_SIZE); - pg_offset = offset_in_page(i_size); if (page->index > end_index || (page->index == end_index && !pg_offset)) { @@ -1600,77 +1486,30 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl memzero_page(page, pg_offset, PAGE_SIZE - pg_offset); ret = set_page_extent_mapped(page); - if (ret < 0) { - SetPageError(page); + if (ret < 0) goto done; - } - if (!bio_ctrl->extent_locked) { - ret = writepage_delalloc(BTRFS_I(inode), page, bio_ctrl->wbc); - if (ret == 1) - return 0; - if (ret) - goto done; - } + ret = writepage_delalloc(BTRFS_I(inode), page, bio_ctrl->wbc); + if (ret == 1) + return 0; + if (ret) + goto done; ret = __extent_writepage_io(BTRFS_I(inode), page, bio_ctrl, i_size, &nr); if (ret == 1) return 0; + bio_ctrl->wbc->nr_to_write--; + done: if (nr == 0) { /* make sure the mapping tag for page dirty gets cleared */ set_page_writeback(page); end_page_writeback(page); } - /* - * Here we used to have a check for PageError() and then set @ret and - * call end_extent_writepage(). - * - * But in fact setting @ret here will cause different error paths - * between subpage and regular sectorsize. - * - * For regular page size, we never submit current page, but only add - * current page to current bio. - * The bio submission can only happen in next page. - * Thus if we hit the PageError() branch, @ret is already set to - * non-zero value and will not get updated for regular sectorsize. - * - * But for subpage case, it's possible we submit part of current page, - * thus can get PageError() set by submitted bio of the same page, - * while our @ret is still 0. - * - * So here we unify the behavior and don't set @ret. - * Error can still be properly passed to higher layer as page will - * be set error, here we just don't handle the IO failure. - * - * NOTE: This is just a hotfix for subpage. - * The root fix will be properly ending ordered extent when we hit - * an error during writeback. - * - * But that needs a bigger refactoring, as we not only need to grab the - * submitted OE, but also need to know exactly at which bytenr we hit - * the error. - * Currently the full page based __extent_writepage_io() is not - * capable of that. - */ - if (PageError(page)) + if (ret) end_extent_writepage(page, ret, page_start, page_end); - if (bio_ctrl->extent_locked) { - struct writeback_control *wbc = bio_ctrl->wbc; - - /* - * If bio_ctrl->extent_locked, it's from extent_write_locked_range(), - * the page can either be locked by lock_page() or - * process_one_page(). - * Let btrfs_page_unlock_writer() handle both cases. - */ - ASSERT(wbc); - btrfs_page_unlock_writer(fs_info, page, wbc->range_start, - wbc->range_end + 1 - wbc->range_start); - } else { - unlock_page(page); - } + unlock_page(page); ASSERT(ret <= 0); return ret; } @@ -1681,52 +1520,26 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb) TASK_UNINTERRUPTIBLE); } -static void end_extent_buffer_writeback(struct extent_buffer *eb) -{ - clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); - smp_mb__after_atomic(); - wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); -} - /* * Lock extent buffer status and pages for writeback. * - * May try to flush write bio if we can't get the lock. - * - * Return 0 if the extent buffer doesn't need to be submitted. - * (E.g. the extent buffer is not dirty) - * Return >0 is the extent buffer is submitted to bio. - * Return <0 if something went wrong, no page is locked. + * Return %false if the extent buffer doesn't need to be submitted (e.g. the + * extent buffer is not dirty) + * Return %true is the extent buffer is submitted to bio. */ -static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb, - struct btrfs_bio_ctrl *bio_ctrl) +static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *eb, + struct writeback_control *wbc) { struct btrfs_fs_info *fs_info = eb->fs_info; - int i, num_pages; - int flush = 0; - int ret = 0; + bool ret = false; - if (!btrfs_try_tree_write_lock(eb)) { - submit_write_bio(bio_ctrl, 0); - flush = 1; - btrfs_tree_lock(eb); - } - - if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { + btrfs_tree_lock(eb); + while (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { btrfs_tree_unlock(eb); - if (bio_ctrl->wbc->sync_mode != WB_SYNC_ALL) - return 0; - if (!flush) { - submit_write_bio(bio_ctrl, 0); - flush = 1; - } - while (1) { - wait_on_extent_buffer_writeback(eb); - btrfs_tree_lock(eb); - if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) - break; - btrfs_tree_unlock(eb); - } + if (wbc->sync_mode != WB_SYNC_ALL) + return false; + wait_on_extent_buffer_writeback(eb); + btrfs_tree_lock(eb); } /* @@ -1742,45 +1555,19 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, fs_info->dirty_metadata_batch); - ret = 1; + ret = true; } else { spin_unlock(&eb->refs_lock); } - btrfs_tree_unlock(eb); - - /* - * Either we don't need to submit any tree block, or we're submitting - * subpage eb. - * Subpage metadata doesn't use page locking at all, so we can skip - * the page locking. - */ - if (!ret || fs_info->nodesize < PAGE_SIZE) - return ret; - - num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++) { - struct page *p = eb->pages[i]; - - if (!trylock_page(p)) { - if (!flush) { - submit_write_bio(bio_ctrl, 0); - flush = 1; - } - lock_page(p); - } - } - return ret; } -static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) +static void set_btree_ioerr(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - btrfs_page_set_error(fs_info, page, eb->start, eb->len); - if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) - return; + set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); /* * A read may stumble upon this buffer later, make sure that it gets an @@ -1794,7 +1581,7 @@ static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) * return a 0 because we are readonly if we don't modify the err seq for * the superblock. */ - mapping_set_error(page->mapping, -EIO); + mapping_set_error(eb->fs_info->btree_inode->i_mapping, -EIO); /* * If writeback for a btree extent that doesn't belong to a log tree @@ -1869,101 +1656,34 @@ static struct extent_buffer *find_extent_buffer_nolock( return NULL; } -/* - * The endio function for subpage extent buffer write. - * - * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback() - * after all extent buffers in the page has finished their writeback. - */ -static void end_bio_subpage_eb_writepage(struct btrfs_bio *bbio) +static void extent_buffer_write_end_io(struct btrfs_bio *bbio) { - struct bio *bio = &bbio->bio; - struct btrfs_fs_info *fs_info; - struct bio_vec *bvec; + struct extent_buffer *eb = bbio->private; + struct btrfs_fs_info *fs_info = eb->fs_info; + bool uptodate = !bbio->bio.bi_status; struct bvec_iter_all iter_all; + struct bio_vec *bvec; + u32 bio_offset = 0; - fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb); - ASSERT(fs_info->nodesize < PAGE_SIZE); + if (!uptodate) + set_btree_ioerr(eb); - ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) { + bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { + u64 start = eb->start + bio_offset; struct page *page = bvec->bv_page; - u64 bvec_start = page_offset(page) + bvec->bv_offset; - u64 bvec_end = bvec_start + bvec->bv_len - 1; - u64 cur_bytenr = bvec_start; - - ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize)); - - /* Iterate through all extent buffers in the range */ - while (cur_bytenr <= bvec_end) { - struct extent_buffer *eb; - int done; - - /* - * Here we can't use find_extent_buffer(), as it may - * try to lock eb->refs_lock, which is not safe in endio - * context. - */ - eb = find_extent_buffer_nolock(fs_info, cur_bytenr); - ASSERT(eb); - - cur_bytenr = eb->start + eb->len; - - ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)); - done = atomic_dec_and_test(&eb->io_pages); - ASSERT(done); - - if (bio->bi_status || - test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { - ClearPageUptodate(page); - set_btree_ioerr(page, eb); - } + u32 len = bvec->bv_len; - btrfs_subpage_clear_writeback(fs_info, page, eb->start, - eb->len); - end_extent_buffer_writeback(eb); - /* - * free_extent_buffer() will grab spinlock which is not - * safe in endio context. Thus here we manually dec - * the ref. - */ - atomic_dec(&eb->refs); - } + if (!uptodate) + btrfs_page_clear_uptodate(fs_info, page, start, len); + btrfs_page_clear_writeback(fs_info, page, start, len); + bio_offset += len; } - bio_put(bio); -} -static void end_bio_extent_buffer_writepage(struct btrfs_bio *bbio) -{ - struct bio *bio = &bbio->bio; - struct bio_vec *bvec; - struct extent_buffer *eb; - int done; - struct bvec_iter_all iter_all; - - ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *page = bvec->bv_page; - - eb = (struct extent_buffer *)page->private; - BUG_ON(!eb); - done = atomic_dec_and_test(&eb->io_pages); - - if (bio->bi_status || - test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { - ClearPageUptodate(page); - set_btree_ioerr(page, eb); - } - - end_page_writeback(page); - - if (!done) - continue; - - end_extent_buffer_writeback(eb); - } + clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + smp_mb__after_atomic(); + wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); - bio_put(bio); + bio_put(&bbio->bio); } static void prepare_eb_write(struct extent_buffer *eb) @@ -1973,7 +1693,6 @@ static void prepare_eb_write(struct extent_buffer *eb) unsigned long end; clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); - atomic_set(&eb->io_pages, num_extent_pages(eb)); /* Set btree blocks beyond nritems with 0 to avoid stale content */ nritems = btrfs_header_nritems(eb); @@ -1995,63 +1714,49 @@ static void prepare_eb_write(struct extent_buffer *eb) } } -/* - * Unlike the work in write_one_eb(), we rely completely on extent locking. - * Page locking is only utilized at minimum to keep the VMM code happy. - */ -static void write_one_subpage_eb(struct extent_buffer *eb, - struct btrfs_bio_ctrl *bio_ctrl) -{ - struct btrfs_fs_info *fs_info = eb->fs_info; - struct page *page = eb->pages[0]; - bool no_dirty_ebs = false; - - prepare_eb_write(eb); - - /* clear_page_dirty_for_io() in subpage helper needs page locked */ - lock_page(page); - btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len); - - /* Check if this is the last dirty bit to update nr_written */ - no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page, - eb->start, eb->len); - if (no_dirty_ebs) - clear_page_dirty_for_io(page); - - bio_ctrl->end_io_func = end_bio_subpage_eb_writepage; - - submit_extent_page(bio_ctrl, eb->start, page, eb->len, - eb->start - page_offset(page)); - unlock_page(page); - /* - * Submission finished without problem, if no range of the page is - * dirty anymore, we have submitted a page. Update nr_written in wbc. - */ - if (no_dirty_ebs) - bio_ctrl->wbc->nr_to_write--; -} - static noinline_for_stack void write_one_eb(struct extent_buffer *eb, - struct btrfs_bio_ctrl *bio_ctrl) + struct writeback_control *wbc) { - u64 disk_bytenr = eb->start; - int i, num_pages; + struct btrfs_fs_info *fs_info = eb->fs_info; + struct btrfs_bio *bbio; prepare_eb_write(eb); - bio_ctrl->end_io_func = end_bio_extent_buffer_writepage; - - num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++) { - struct page *p = eb->pages[i]; - - clear_page_dirty_for_io(p); - set_page_writeback(p); - submit_extent_page(bio_ctrl, disk_bytenr, p, PAGE_SIZE, 0); - disk_bytenr += PAGE_SIZE; - bio_ctrl->wbc->nr_to_write--; + bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, + REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc), + eb->fs_info, extent_buffer_write_end_io, eb); + bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; + bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev); + wbc_init_bio(wbc, &bbio->bio); + bbio->inode = BTRFS_I(eb->fs_info->btree_inode); + bbio->file_offset = eb->start; + if (fs_info->nodesize < PAGE_SIZE) { + struct page *p = eb->pages[0]; + + lock_page(p); + btrfs_subpage_set_writeback(fs_info, p, eb->start, eb->len); + if (btrfs_subpage_clear_and_test_dirty(fs_info, p, eb->start, + eb->len)) { + clear_page_dirty_for_io(p); + wbc->nr_to_write--; + } + __bio_add_page(&bbio->bio, p, eb->len, eb->start - page_offset(p)); + wbc_account_cgroup_owner(wbc, p, eb->len); unlock_page(p); + } else { + for (int i = 0; i < num_extent_pages(eb); i++) { + struct page *p = eb->pages[i]; + + lock_page(p); + clear_page_dirty_for_io(p); + set_page_writeback(p); + __bio_add_page(&bbio->bio, p, PAGE_SIZE, 0); + wbc_account_cgroup_owner(wbc, p, PAGE_SIZE); + wbc->nr_to_write--; + unlock_page(p); + } } + btrfs_submit_bio(bbio, 0); } /* @@ -2068,14 +1773,13 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, * Return >=0 for the number of submitted extent buffers. * Return <0 for fatal error. */ -static int submit_eb_subpage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl) +static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); int submitted = 0; u64 page_start = page_offset(page); int bit_start = 0; int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; - int ret; /* Lock and write each dirty extent buffers in the range */ while (bit_start < fs_info->subpage_info->bitmap_nr_bits) { @@ -2121,25 +1825,13 @@ static int submit_eb_subpage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl) if (!eb) continue; - ret = lock_extent_buffer_for_io(eb, bio_ctrl); - if (ret == 0) { - free_extent_buffer(eb); - continue; + if (lock_extent_buffer_for_io(eb, wbc)) { + write_one_eb(eb, wbc); + submitted++; } - if (ret < 0) { - free_extent_buffer(eb); - goto cleanup; - } - write_one_subpage_eb(eb, bio_ctrl); free_extent_buffer(eb); - submitted++; } return submitted; - -cleanup: - /* We hit error, end bio for the submitted extent buffers */ - submit_write_bio(bio_ctrl, ret); - return ret; } /* @@ -2162,7 +1854,7 @@ cleanup: * previous call. * Return <0 for fatal error. */ -static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl, +static int submit_eb_page(struct page *page, struct writeback_control *wbc, struct extent_buffer **eb_context) { struct address_space *mapping = page->mapping; @@ -2174,7 +1866,7 @@ static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl, return 0; if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) - return submit_eb_subpage(page, bio_ctrl); + return submit_eb_subpage(page, wbc); spin_lock(&mapping->private_lock); if (!PagePrivate(page)) { @@ -2207,8 +1899,7 @@ static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl, * If for_sync, this hole will be filled with * trasnsaction commit. */ - if (bio_ctrl->wbc->sync_mode == WB_SYNC_ALL && - !bio_ctrl->wbc->for_sync) + if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) ret = -EAGAIN; else ret = 0; @@ -2218,13 +1909,12 @@ static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl, *eb_context = eb; - ret = lock_extent_buffer_for_io(eb, bio_ctrl); - if (ret <= 0) { + if (!lock_extent_buffer_for_io(eb, wbc)) { btrfs_revert_meta_write_pointer(cache, eb); if (cache) btrfs_put_block_group(cache); free_extent_buffer(eb); - return ret; + return 0; } if (cache) { /* @@ -2233,7 +1923,7 @@ static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl, btrfs_schedule_zone_finish_bg(cache, eb); btrfs_put_block_group(cache); } - write_one_eb(eb, bio_ctrl); + write_one_eb(eb, wbc); free_extent_buffer(eb); return 1; } @@ -2242,11 +1932,6 @@ int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc) { struct extent_buffer *eb_context = NULL; - struct btrfs_bio_ctrl bio_ctrl = { - .wbc = wbc, - .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), - .extent_locked = 0, - }; struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; int ret = 0; int done = 0; @@ -2288,7 +1973,7 @@ retry: for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; - ret = submit_eb_page(&folio->page, &bio_ctrl, &eb_context); + ret = submit_eb_page(&folio->page, wbc, &eb_context); if (ret == 0) continue; if (ret < 0) { @@ -2349,8 +2034,6 @@ retry: ret = 0; if (!ret && BTRFS_FS_ERROR(fs_info)) ret = -EROFS; - submit_write_bio(&bio_ctrl, ret); - btrfs_zoned_meta_io_unlock(fs_info); return ret; } @@ -2520,38 +2203,31 @@ retry: * already been ran (aka, ordered extent inserted) and all pages are still * locked. */ -int extent_write_locked_range(struct inode *inode, u64 start, u64 end) +int extent_write_locked_range(struct inode *inode, u64 start, u64 end, + struct writeback_control *wbc) { bool found_error = false; int first_error = 0; int ret = 0; struct address_space *mapping = inode->i_mapping; - struct page *page; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + const u32 sectorsize = fs_info->sectorsize; + loff_t i_size = i_size_read(inode); u64 cur = start; - unsigned long nr_pages; - const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize; - struct writeback_control wbc_writepages = { - .sync_mode = WB_SYNC_ALL, - .range_start = start, - .range_end = end + 1, - .no_cgroup_owner = 1, - }; struct btrfs_bio_ctrl bio_ctrl = { - .wbc = &wbc_writepages, - /* We're called from an async helper function */ - .opf = REQ_OP_WRITE | REQ_BTRFS_CGROUP_PUNT | - wbc_to_write_flags(&wbc_writepages), - .extent_locked = 1, + .wbc = wbc, + .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), }; + if (wbc->no_cgroup_owner) + bio_ctrl.opf |= REQ_BTRFS_CGROUP_PUNT; + ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize)); - nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >> - PAGE_SHIFT; - wbc_writepages.nr_to_write = nr_pages * 2; - wbc_attach_fdatawrite_inode(&wbc_writepages, inode); while (cur <= end) { u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); + struct page *page; + int nr = 0; page = find_get_page(mapping, cur >> PAGE_SHIFT); /* @@ -2562,19 +2238,31 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) ASSERT(PageLocked(page)); ASSERT(PageDirty(page)); clear_page_dirty_for_io(page); - ret = __extent_writepage(page, &bio_ctrl); - ASSERT(ret <= 0); + + ret = __extent_writepage_io(BTRFS_I(inode), page, &bio_ctrl, + i_size, &nr); + if (ret == 1) + goto next_page; + + /* Make sure the mapping tag for page dirty gets cleared. */ + if (nr == 0) { + set_page_writeback(page); + end_page_writeback(page); + } + if (ret) + end_extent_writepage(page, ret, cur, cur_end); + btrfs_page_unlock_writer(fs_info, page, cur, cur_end + 1 - cur); if (ret < 0) { found_error = true; first_error = ret; } +next_page: put_page(page); cur = cur_end + 1; } submit_write_bio(&bio_ctrl, found_error ? ret : 0); - wbc_detach_inode(&wbc_writepages); if (found_error) return first_error; return ret; @@ -2588,7 +2276,6 @@ int extent_writepages(struct address_space *mapping, struct btrfs_bio_ctrl bio_ctrl = { .wbc = wbc, .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), - .extent_locked = 0, }; /* @@ -2679,8 +2366,7 @@ static int try_release_extent_state(struct extent_io_tree *tree, * The delalloc new bit will be cleared by ordered extent * completion. */ - ret = __clear_extent_bit(tree, start, end, clear_bits, NULL, - mask, NULL); + ret = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL); /* if clear_extent_bit failed for enomem reasons, * we can't allow the release to continue. @@ -3421,10 +3107,9 @@ static void __free_extent_buffer(struct extent_buffer *eb) kmem_cache_free(extent_buffer_cache, eb); } -int extent_buffer_under_io(const struct extent_buffer *eb) +static int extent_buffer_under_io(const struct extent_buffer *eb) { - return (atomic_read(&eb->io_pages) || - test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || + return (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); } @@ -3557,11 +3242,9 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, init_rwsem(&eb->lock); btrfs_leak_debug_add_eb(eb); - INIT_LIST_HEAD(&eb->release_list); spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); - atomic_set(&eb->io_pages, 0); ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE); @@ -3678,9 +3361,9 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) * adequately protected by the refcount, but the TREE_REF bit and * its corresponding reference are not. To protect against this * class of races, we call check_buffer_tree_ref from the codepaths - * which trigger io after they set eb->io_pages. Note that once io is - * initiated, TREE_REF can no longer be cleared, so that is the - * moment at which any such race is best fixed. + * which trigger io. Note that once io is initiated, TREE_REF can no + * longer be cleared, so that is the moment at which any such race is + * best fixed. */ refs = atomic_read(&eb->refs); if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) @@ -3939,7 +3622,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len)); eb->pages[i] = p; - if (!PageUptodate(p)) + if (!btrfs_page_test_uptodate(fs_info, p, eb->start, eb->len)) uptodate = 0; /* @@ -4142,13 +3825,12 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, continue; lock_page(page); btree_clear_page_dirty(page); - ClearPageError(page); unlock_page(page); } WARN_ON(atomic_read(&eb->refs) == 0); } -bool set_extent_buffer_dirty(struct extent_buffer *eb) +void set_extent_buffer_dirty(struct extent_buffer *eb) { int i; int num_pages; @@ -4183,13 +3865,14 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb) eb->start, eb->len); if (subpage) unlock_page(eb->pages[0]); + percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes, + eb->len, + eb->fs_info->dirty_metadata_batch); } #ifdef CONFIG_BTRFS_DEBUG for (i = 0; i < num_pages; i++) ASSERT(PageDirty(eb->pages[i])); #endif - - return was_dirty; } void clear_extent_buffer_uptodate(struct extent_buffer *eb) @@ -4242,84 +3925,54 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) } } -static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, - int mirror_num, - struct btrfs_tree_parent_check *check) +static void extent_buffer_read_end_io(struct btrfs_bio *bbio) { + struct extent_buffer *eb = bbio->private; struct btrfs_fs_info *fs_info = eb->fs_info; - struct extent_io_tree *io_tree; - struct page *page = eb->pages[0]; - struct extent_state *cached_state = NULL; - struct btrfs_bio_ctrl bio_ctrl = { - .opf = REQ_OP_READ, - .mirror_num = mirror_num, - .parent_check = check, - }; - int ret; + bool uptodate = !bbio->bio.bi_status; + struct bvec_iter_all iter_all; + struct bio_vec *bvec; + u32 bio_offset = 0; - ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); - ASSERT(PagePrivate(page)); - ASSERT(check); - io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; + eb->read_mirror = bbio->mirror_num; - if (wait == WAIT_NONE) { - if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1, - &cached_state)) - return -EAGAIN; - } else { - ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1, - &cached_state); - if (ret < 0) - return ret; - } + if (uptodate && + btrfs_validate_extent_buffer(eb, &bbio->parent_check) < 0) + uptodate = false; - if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) || - PageUptodate(page) || - btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) { - set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, - &cached_state); - return 0; + if (uptodate) { + set_extent_buffer_uptodate(eb); + } else { + clear_extent_buffer_uptodate(eb); + set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); } - clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); - eb->read_mirror = 0; - atomic_set(&eb->io_pages, 1); - check_buffer_tree_ref(eb); - bio_ctrl.end_io_func = end_bio_extent_readpage; + bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { + u64 start = eb->start + bio_offset; + struct page *page = bvec->bv_page; + u32 len = bvec->bv_len; - btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len); + if (uptodate) + btrfs_page_set_uptodate(fs_info, page, start, len); + else + btrfs_page_clear_uptodate(fs_info, page, start, len); - btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len); - submit_extent_page(&bio_ctrl, eb->start, page, eb->len, - eb->start - page_offset(page)); - submit_one_bio(&bio_ctrl); - if (wait != WAIT_COMPLETE) { - free_extent_state(cached_state); - return 0; + bio_offset += len; } - wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, - EXTENT_LOCKED, &cached_state); - if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) - return -EIO; - return 0; + clear_bit(EXTENT_BUFFER_READING, &eb->bflags); + smp_mb__after_atomic(); + wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING); + free_extent_buffer(eb); + + bio_put(&bbio->bio); } int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, struct btrfs_tree_parent_check *check) { - int i; - struct page *page; - int locked_pages = 0; - int all_uptodate = 1; - int num_pages; - unsigned long num_reads = 0; - struct btrfs_bio_ctrl bio_ctrl = { - .opf = REQ_OP_READ, - .mirror_num = mirror_num, - .parent_check = check, - }; + int num_pages = num_extent_pages(eb), i; + struct btrfs_bio *bbio; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; @@ -4332,87 +3985,39 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))) return -EIO; - if (eb->fs_info->nodesize < PAGE_SIZE) - return read_extent_buffer_subpage(eb, wait, mirror_num, check); - - num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++) { - page = eb->pages[i]; - if (wait == WAIT_NONE) { - /* - * WAIT_NONE is only utilized by readahead. If we can't - * acquire the lock atomically it means either the eb - * is being read out or under modification. - * Either way the eb will be or has been cached, - * readahead can exit safely. - */ - if (!trylock_page(page)) - goto unlock_exit; - } else { - lock_page(page); - } - locked_pages++; - } - /* - * We need to firstly lock all pages to make sure that - * the uptodate bit of our pages won't be affected by - * clear_extent_buffer_uptodate(). - */ - for (i = 0; i < num_pages; i++) { - page = eb->pages[i]; - if (!PageUptodate(page)) { - num_reads++; - all_uptodate = 0; - } - } - - if (all_uptodate) { - set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - goto unlock_exit; - } + /* Someone else is already reading the buffer, just wait for it. */ + if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) + goto done; clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = 0; - atomic_set(&eb->io_pages, num_reads); - /* - * It is possible for release_folio to clear the TREE_REF bit before we - * set io_pages. See check_buffer_tree_ref for a more detailed comment. - */ check_buffer_tree_ref(eb); - bio_ctrl.end_io_func = end_bio_extent_readpage; - for (i = 0; i < num_pages; i++) { - page = eb->pages[i]; - - if (!PageUptodate(page)) { - ClearPageError(page); - submit_extent_page(&bio_ctrl, page_offset(page), page, - PAGE_SIZE, 0); - } else { - unlock_page(page); - } + atomic_inc(&eb->refs); + + bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, + REQ_OP_READ | REQ_META, eb->fs_info, + extent_buffer_read_end_io, eb); + bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; + bbio->inode = BTRFS_I(eb->fs_info->btree_inode); + bbio->file_offset = eb->start; + memcpy(&bbio->parent_check, check, sizeof(*check)); + if (eb->fs_info->nodesize < PAGE_SIZE) { + __bio_add_page(&bbio->bio, eb->pages[0], eb->len, + eb->start - page_offset(eb->pages[0])); + } else { + for (i = 0; i < num_pages; i++) + __bio_add_page(&bbio->bio, eb->pages[i], PAGE_SIZE, 0); } + btrfs_submit_bio(bbio, mirror_num); - submit_one_bio(&bio_ctrl); - - if (wait != WAIT_COMPLETE) - return 0; - - for (i = 0; i < num_pages; i++) { - page = eb->pages[i]; - wait_on_page_locked(page); - if (!PageUptodate(page)) +done: + if (wait == WAIT_COMPLETE) { + wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE); + if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return -EIO; } return 0; - -unlock_exit: - while (locked_pages > 0) { - locked_pages--; - page = eb->pages[locked_pages]; - unlock_page(page); - } - return 0; } static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, @@ -4561,18 +4166,17 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb, * looked up. We don't want to complain in this case, as the page was * valid before, we just didn't write it out. Instead we want to catch * the case where we didn't actually read the block properly, which - * would have !PageUptodate && !PageError, as we clear PageError before - * reading. + * would have !PageUptodate and !EXTENT_BUFFER_WRITE_ERR. */ - if (fs_info->nodesize < PAGE_SIZE) { - bool uptodate, error; + if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) + return; - uptodate = btrfs_subpage_test_uptodate(fs_info, page, - eb->start, eb->len); - error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len); - WARN_ON(!uptodate && !error); + if (fs_info->nodesize < PAGE_SIZE) { + if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, page, + eb->start, eb->len))) + btrfs_subpage_dump_bitmap(fs_info, page, eb->start, eb->len); } else { - WARN_ON(!PageUptodate(page) && !PageError(page)); + WARN_ON(!PageUptodate(page)); } } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4341ad978fb8..c5fae3a7d911 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -29,6 +29,8 @@ enum { /* write IO error */ EXTENT_BUFFER_WRITE_ERR, EXTENT_BUFFER_NO_CHECK, + /* Indicate that extent buffer pages a being read */ + EXTENT_BUFFER_READING, }; /* these are flags for __process_pages_contig */ @@ -38,7 +40,6 @@ enum { ENUM_BIT(PAGE_START_WRITEBACK), ENUM_BIT(PAGE_END_WRITEBACK), ENUM_BIT(PAGE_SET_ORDERED), - ENUM_BIT(PAGE_SET_ERROR), ENUM_BIT(PAGE_LOCK), }; @@ -79,7 +80,6 @@ struct extent_buffer { struct btrfs_fs_info *fs_info; spinlock_t refs_lock; atomic_t refs; - atomic_t io_pages; int read_mirror; struct rcu_head rcu_head; pid_t lock_owner; @@ -89,7 +89,6 @@ struct extent_buffer { struct rw_semaphore lock; struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; - struct list_head release_list; #ifdef CONFIG_BTRFS_DEBUG struct list_head leak_list; #endif @@ -179,7 +178,8 @@ int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); int btrfs_read_folio(struct file *file, struct folio *folio); -int extent_write_locked_range(struct inode *inode, u64 start, u64 end); +int extent_write_locked_range(struct inode *inode, u64 start, u64 end, + struct writeback_control *wbc); int extent_writepages(struct address_space *mapping, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping, @@ -262,10 +262,9 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star void extent_buffer_bitmap_clear(const struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len); -bool set_extent_buffer_dirty(struct extent_buffer *eb); +void set_extent_buffer_dirty(struct extent_buffer *eb); void set_extent_buffer_uptodate(struct extent_buffer *eb); void clear_extent_buffer_uptodate(struct extent_buffer *eb); -int extent_buffer_under_io(const struct extent_buffer *eb); void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 138afa955370..0cdb3e86f29b 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -364,8 +364,9 @@ static void extent_map_device_set_bits(struct extent_map *em, unsigned bits) struct btrfs_io_stripe *stripe = &map->stripes[i]; struct btrfs_device *device = stripe->dev; - set_extent_bits_nowait(&device->alloc_state, stripe->physical, - stripe->physical + stripe_size - 1, bits); + set_extent_bit(&device->alloc_state, stripe->physical, + stripe->physical + stripe_size - 1, + bits | EXTENT_NOWAIT, NULL); } } @@ -380,8 +381,9 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits) struct btrfs_device *device = stripe->dev; __clear_extent_bit(&device->alloc_state, stripe->physical, - stripe->physical + stripe_size - 1, bits, - NULL, GFP_NOWAIT, NULL); + stripe->physical + stripe_size - 1, + bits | EXTENT_NOWAIT, + NULL, NULL); } } @@ -502,10 +504,10 @@ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) RB_CLEAR_NODE(&em->rb_node); } -void replace_extent_mapping(struct extent_map_tree *tree, - struct extent_map *cur, - struct extent_map *new, - int modified) +static void replace_extent_mapping(struct extent_map_tree *tree, + struct extent_map *cur, + struct extent_map *new, + int modified) { lockdep_assert_held_write(&tree->lock); @@ -959,3 +961,95 @@ int btrfs_replace_extent_map_range(struct btrfs_inode *inode, return ret; } + +/* + * Split off the first pre bytes from the extent_map at [start, start + len], + * and set the block_start for it to new_logical. + * + * This function is used when an ordered_extent needs to be split. + */ +int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, + u64 new_logical) +{ + struct extent_map_tree *em_tree = &inode->extent_tree; + struct extent_map *em; + struct extent_map *split_pre = NULL; + struct extent_map *split_mid = NULL; + int ret = 0; + unsigned long flags; + + ASSERT(pre != 0); + ASSERT(pre < len); + + split_pre = alloc_extent_map(); + if (!split_pre) + return -ENOMEM; + split_mid = alloc_extent_map(); + if (!split_mid) { + ret = -ENOMEM; + goto out_free_pre; + } + + lock_extent(&inode->io_tree, start, start + len - 1, NULL); + write_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (!em) { + ret = -EIO; + goto out_unlock; + } + + ASSERT(em->len == len); + ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); + ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE); + ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags)); + ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags)); + ASSERT(!list_empty(&em->list)); + + flags = em->flags; + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + + /* First, replace the em with a new extent_map starting from * em->start */ + split_pre->start = em->start; + split_pre->len = pre; + split_pre->orig_start = split_pre->start; + split_pre->block_start = new_logical; + split_pre->block_len = split_pre->len; + split_pre->orig_block_len = split_pre->block_len; + split_pre->ram_bytes = split_pre->len; + split_pre->flags = flags; + split_pre->compress_type = em->compress_type; + split_pre->generation = em->generation; + + replace_extent_mapping(em_tree, em, split_pre, 1); + + /* + * Now we only have an extent_map at: + * [em->start, em->start + pre] + */ + + /* Insert the middle extent_map. */ + split_mid->start = em->start + pre; + split_mid->len = em->len - pre; + split_mid->orig_start = split_mid->start; + split_mid->block_start = em->block_start + pre; + split_mid->block_len = split_mid->len; + split_mid->orig_block_len = split_mid->block_len; + split_mid->ram_bytes = split_mid->len; + split_mid->flags = flags; + split_mid->compress_type = em->compress_type; + split_mid->generation = em->generation; + add_extent_mapping(em_tree, split_mid, 1); + + /* Once for us */ + free_extent_map(em); + /* Once for the tree */ + free_extent_map(em); + +out_unlock: + write_unlock(&em_tree->lock); + unlock_extent(&inode->io_tree, start, start + len - 1, NULL); + free_extent_map(split_mid); +out_free_pre: + free_extent_map(split_pre); + return ret; +} diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index ad311864272a..35d27c756e08 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -90,10 +90,8 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, int add_extent_mapping(struct extent_map_tree *tree, struct extent_map *em, int modified); void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); -void replace_extent_mapping(struct extent_map_tree *tree, - struct extent_map *cur, - struct extent_map *new, - int modified); +int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, + u64 new_logical); struct extent_map *alloc_extent_map(void); void free_extent_map(struct extent_map *em); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index d1cd0a692f8e..696bf695d8eb 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -94,8 +94,8 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES)) return 0; - return set_extent_bits(&inode->file_extent_tree, start, start + len - 1, - EXTENT_DIRTY); + return set_extent_bit(&inode->file_extent_tree, start, start + len - 1, + EXTENT_DIRTY, NULL); } /* @@ -438,9 +438,9 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) BTRFS_DATA_RELOC_TREE_OBJECTID) { u64 file_offset = bbio->file_offset + bio_offset; - set_extent_bits(&inode->io_tree, file_offset, - file_offset + sectorsize - 1, - EXTENT_NODATASUM); + set_extent_bit(&inode->io_tree, file_offset, + file_offset + sectorsize - 1, + EXTENT_NODATASUM, NULL); } else { btrfs_warn_rl(fs_info, "csum hole found for disk bytenr range [%llu, %llu)", @@ -560,8 +560,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, goto fail; } - sums->bytenr = start; - sums->len = (int)size; + sums->logical = start; + sums->len = size; offset = bytes_to_csum_size(fs_info, start - key.offset); @@ -721,20 +721,17 @@ fail: */ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) { + struct btrfs_ordered_extent *ordered = bbio->ordered; struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct bio *bio = &bbio->bio; - u64 offset = bbio->file_offset; struct btrfs_ordered_sum *sums; - struct btrfs_ordered_extent *ordered = NULL; char *data; struct bvec_iter iter; struct bio_vec bvec; int index; unsigned int blockcount; - unsigned long total_bytes = 0; - unsigned long this_sum_bytes = 0; int i; unsigned nofs_flag; @@ -749,61 +746,17 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) sums->len = bio->bi_iter.bi_size; INIT_LIST_HEAD(&sums->list); - sums->bytenr = bio->bi_iter.bi_sector << 9; + sums->logical = bio->bi_iter.bi_sector << SECTOR_SHIFT; index = 0; shash->tfm = fs_info->csum_shash; bio_for_each_segment(bvec, bio, iter) { - if (!ordered) { - ordered = btrfs_lookup_ordered_extent(inode, offset); - /* - * The bio range is not covered by any ordered extent, - * must be a code logic error. - */ - if (unlikely(!ordered)) { - WARN(1, KERN_WARNING - "no ordered extent for root %llu ino %llu offset %llu\n", - inode->root->root_key.objectid, - btrfs_ino(inode), offset); - kvfree(sums); - return BLK_STS_IOERR; - } - } - blockcount = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len + fs_info->sectorsize - 1); for (i = 0; i < blockcount; i++) { - if (!(bio->bi_opf & REQ_BTRFS_ONE_ORDERED) && - !in_range(offset, ordered->file_offset, - ordered->num_bytes)) { - unsigned long bytes_left; - - sums->len = this_sum_bytes; - this_sum_bytes = 0; - btrfs_add_ordered_sum(ordered, sums); - btrfs_put_ordered_extent(ordered); - - bytes_left = bio->bi_iter.bi_size - total_bytes; - - nofs_flag = memalloc_nofs_save(); - sums = kvzalloc(btrfs_ordered_sum_size(fs_info, - bytes_left), GFP_KERNEL); - memalloc_nofs_restore(nofs_flag); - if (!sums) - return BLK_STS_RESOURCE; - - sums->len = bytes_left; - ordered = btrfs_lookup_ordered_extent(inode, - offset); - ASSERT(ordered); /* Logic error */ - sums->bytenr = (bio->bi_iter.bi_sector << 9) - + total_bytes; - index = 0; - } - data = bvec_kmap_local(&bvec); crypto_shash_digest(shash, data + (i * fs_info->sectorsize), @@ -811,15 +764,28 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) sums->sums + index); kunmap_local(data); index += fs_info->csum_size; - offset += fs_info->sectorsize; - this_sum_bytes += fs_info->sectorsize; - total_bytes += fs_info->sectorsize; } } - this_sum_bytes = 0; + + bbio->sums = sums; btrfs_add_ordered_sum(ordered, sums); - btrfs_put_ordered_extent(ordered); + return 0; +} + +/* + * Nodatasum I/O on zoned file systems still requires an btrfs_ordered_sum to + * record the updated logical address on Zone Append completion. + * Allocate just the structure with an empty sums array here for that case. + */ +blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio) +{ + bbio->sums = kmalloc(sizeof(*bbio->sums), GFP_NOFS); + if (!bbio->sums) + return BLK_STS_RESOURCE; + bbio->sums->len = bbio->bio.bi_iter.bi_size; + bbio->sums->logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; + btrfs_add_ordered_sum(bbio->ordered, bbio->sums); return 0; } @@ -1086,7 +1052,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, again: next_offset = (u64)-1; found_next = 0; - bytenr = sums->bytenr + total_bytes; + bytenr = sums->logical + total_bytes; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; file_key.offset = bytenr; file_key.type = BTRFS_EXTENT_CSUM_KEY; diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 6be8725cd574..4ec669b69008 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -50,6 +50,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio); +blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit, bool nowait); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f649647392e0..fd03e689a6be 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1145,7 +1145,6 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) return -EAGAIN; - current->backing_dev_info = inode_to_bdi(inode); ret = file_remove_privs(file); if (ret) return ret; @@ -1165,10 +1164,8 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, loff_t end_pos = round_up(pos + count, fs_info->sectorsize); ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos); - if (ret) { - current->backing_dev_info = NULL; + if (ret) return ret; - } } return 0; @@ -1651,7 +1648,6 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, struct file *file = iocb->ki_filp; struct btrfs_inode *inode = BTRFS_I(file_inode(file)); ssize_t num_written, num_sync; - const bool sync = iocb_is_dsync(iocb); /* * If the fs flips readonly due to some impossible error, although we @@ -1664,9 +1660,6 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, if (encoded && (iocb->ki_flags & IOCB_NOWAIT)) return -EOPNOTSUPP; - if (sync) - atomic_inc(&inode->sync_writers); - if (encoded) { num_written = btrfs_encoded_write(iocb, from, encoded); num_sync = encoded->len; @@ -1686,10 +1679,6 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, num_written = num_sync; } - if (sync) - atomic_dec(&inode->sync_writers); - - current->backing_dev_info = NULL; return num_written; } @@ -1733,9 +1722,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) * several segments of stripe length (currently 64K). */ blk_start_plug(&plug); - atomic_inc(&BTRFS_I(inode)->sync_writers); ret = btrfs_fdatawrite_range(inode, start, end); - atomic_dec(&BTRFS_I(inode)->sync_writers); blk_finish_plug(&plug); return ret; @@ -3709,7 +3696,8 @@ static int btrfs_file_open(struct inode *inode, struct file *filp) { int ret; - filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC; + filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC | + FMODE_CAN_ODIRECT; ret = fsverity_file_open(inode, filp); if (ret) @@ -3825,7 +3813,7 @@ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) const struct file_operations btrfs_file_operations = { .llseek = btrfs_file_llseek, .read_iter = btrfs_file_read_iter, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, .write_iter = btrfs_file_write_iter, .splice_write = iter_file_splice_write, .mmap = btrfs_file_mmap, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index cf98a3c05480..880800418075 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -292,25 +292,6 @@ out: return ret; } -int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv) -{ - u64 needed_bytes; - int ret; - - /* 1 for slack space, 1 for updating the inode */ - needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) + - btrfs_calc_metadata_size(fs_info, 1); - - spin_lock(&rsv->lock); - if (rsv->reserved < needed_bytes) - ret = -ENOSPC; - else - ret = 0; - spin_unlock(&rsv->lock); - return ret; -} - int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct inode *vfs_inode) @@ -923,27 +904,31 @@ static int copy_free_space_cache(struct btrfs_block_group *block_group, while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) { info = rb_entry(n, struct btrfs_free_space, offset_index); if (!info->bitmap) { + const u64 offset = info->offset; + const u64 bytes = info->bytes; + unlink_free_space(ctl, info, true); - ret = btrfs_add_free_space(block_group, info->offset, - info->bytes); + spin_unlock(&ctl->tree_lock); kmem_cache_free(btrfs_free_space_cachep, info); + ret = btrfs_add_free_space(block_group, offset, bytes); + spin_lock(&ctl->tree_lock); } else { u64 offset = info->offset; u64 bytes = ctl->unit; - while (search_bitmap(ctl, info, &offset, &bytes, - false) == 0) { + ret = search_bitmap(ctl, info, &offset, &bytes, false); + if (ret == 0) { + bitmap_clear_bits(ctl, info, offset, bytes, true); + spin_unlock(&ctl->tree_lock); ret = btrfs_add_free_space(block_group, offset, bytes); - if (ret) - break; - bitmap_clear_bits(ctl, info, offset, bytes, true); - offset = info->offset; - bytes = ctl->unit; + spin_lock(&ctl->tree_lock); + } else { + free_bitmap(ctl, info); + ret = 0; } - free_bitmap(ctl, info); } - cond_resched(); + cond_resched_lock(&ctl->tree_lock); } return ret; } @@ -1037,7 +1022,9 @@ int load_free_space_cache(struct btrfs_block_group *block_group) block_group->bytes_super)); if (matched) { + spin_lock(&tmp_ctl.tree_lock); ret = copy_free_space_cache(block_group, &tmp_ctl); + spin_unlock(&tmp_ctl.tree_lock); /* * ret == 1 means we successfully loaded the free space cache, * so we need to re-set it here. @@ -1596,20 +1583,34 @@ static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, return bitmap_start; } -static int tree_insert_offset(struct rb_root *root, u64 offset, - struct rb_node *node, int bitmap) +static int tree_insert_offset(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_cluster *cluster, + struct btrfs_free_space *new_entry) { - struct rb_node **p = &root->rb_node; + struct rb_root *root; + struct rb_node **p; struct rb_node *parent = NULL; - struct btrfs_free_space *info; + + lockdep_assert_held(&ctl->tree_lock); + + if (cluster) { + lockdep_assert_held(&cluster->lock); + root = &cluster->root; + } else { + root = &ctl->free_space_offset; + } + + p = &root->rb_node; while (*p) { + struct btrfs_free_space *info; + parent = *p; info = rb_entry(parent, struct btrfs_free_space, offset_index); - if (offset < info->offset) { + if (new_entry->offset < info->offset) { p = &(*p)->rb_left; - } else if (offset > info->offset) { + } else if (new_entry->offset > info->offset) { p = &(*p)->rb_right; } else { /* @@ -1625,7 +1626,7 @@ static int tree_insert_offset(struct rb_root *root, u64 offset, * found a bitmap, we want to go left, or before * logically. */ - if (bitmap) { + if (new_entry->bitmap) { if (info->bitmap) { WARN_ON_ONCE(1); return -EEXIST; @@ -1641,8 +1642,8 @@ static int tree_insert_offset(struct rb_root *root, u64 offset, } } - rb_link_node(node, parent, p); - rb_insert_color(node, root); + rb_link_node(&new_entry->offset_index, parent, p); + rb_insert_color(&new_entry->offset_index, root); return 0; } @@ -1705,6 +1706,8 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, struct rb_node *n = ctl->free_space_offset.rb_node; struct btrfs_free_space *entry = NULL, *prev = NULL; + lockdep_assert_held(&ctl->tree_lock); + /* find entry that is closest to the 'offset' */ while (n) { entry = rb_entry(n, struct btrfs_free_space, offset_index); @@ -1814,6 +1817,8 @@ static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, bool update_stat) { + lockdep_assert_held(&ctl->tree_lock); + rb_erase(&info->offset_index, &ctl->free_space_offset); rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); ctl->free_extents--; @@ -1832,9 +1837,10 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, { int ret = 0; + lockdep_assert_held(&ctl->tree_lock); + ASSERT(info->bytes || info->bitmap); - ret = tree_insert_offset(&ctl->free_space_offset, info->offset, - &info->offset_index, (info->bitmap != NULL)); + ret = tree_insert_offset(ctl, NULL, info); if (ret) return ret; @@ -1862,6 +1868,8 @@ static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl, if (RB_EMPTY_NODE(&info->bytes_index)) return; + lockdep_assert_held(&ctl->tree_lock); + rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); } @@ -2447,6 +2455,7 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, u64 offset = info->offset; u64 bytes = info->bytes; const bool is_trimmed = btrfs_free_space_trimmed(info); + struct rb_node *right_prev = NULL; /* * first we want to see if there is free space adjacent to the range we @@ -2454,9 +2463,11 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, * cover the entire range */ right_info = tree_search_offset(ctl, offset + bytes, 0, 0); - if (right_info && rb_prev(&right_info->offset_index)) - left_info = rb_entry(rb_prev(&right_info->offset_index), - struct btrfs_free_space, offset_index); + if (right_info) + right_prev = rb_prev(&right_info->offset_index); + + if (right_prev) + left_info = rb_entry(right_prev, struct btrfs_free_space, offset_index); else if (!right_info) left_info = tree_search_offset(ctl, offset - 1, 0, 0); @@ -2969,9 +2980,10 @@ static void __btrfs_return_cluster_to_free_space( struct btrfs_free_cluster *cluster) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct btrfs_free_space *entry; struct rb_node *node; + lockdep_assert_held(&ctl->tree_lock); + spin_lock(&cluster->lock); if (cluster->block_group != block_group) { spin_unlock(&cluster->lock); @@ -2984,15 +2996,14 @@ static void __btrfs_return_cluster_to_free_space( node = rb_first(&cluster->root); while (node) { - bool bitmap; + struct btrfs_free_space *entry; entry = rb_entry(node, struct btrfs_free_space, offset_index); node = rb_next(&entry->offset_index); rb_erase(&entry->offset_index, &cluster->root); RB_CLEAR_NODE(&entry->offset_index); - bitmap = (entry->bitmap != NULL); - if (!bitmap) { + if (!entry->bitmap) { /* Merging treats extents as if they were new */ if (!btrfs_free_space_trimmed(entry)) { ctl->discardable_extents[BTRFS_STAT_CURR]--; @@ -3010,8 +3021,7 @@ static void __btrfs_return_cluster_to_free_space( entry->bytes; } } - tree_insert_offset(&ctl->free_space_offset, - entry->offset, &entry->offset_index, bitmap); + tree_insert_offset(ctl, NULL, entry); rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes, entry_less); } @@ -3324,6 +3334,8 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, unsigned long total_found = 0; int ret; + lockdep_assert_held(&ctl->tree_lock); + i = offset_to_bit(entry->offset, ctl->unit, max_t(u64, offset, entry->offset)); want_bits = bytes_to_bits(bytes, ctl->unit); @@ -3385,8 +3397,7 @@ again: */ RB_CLEAR_NODE(&entry->bytes_index); - ret = tree_insert_offset(&cluster->root, entry->offset, - &entry->offset_index, 1); + ret = tree_insert_offset(ctl, cluster, entry); ASSERT(!ret); /* -EEXIST; Logic error */ trace_btrfs_setup_cluster(block_group, cluster, @@ -3414,6 +3425,8 @@ setup_cluster_no_bitmap(struct btrfs_block_group *block_group, u64 max_extent; u64 total_size = 0; + lockdep_assert_held(&ctl->tree_lock); + entry = tree_search_offset(ctl, offset, 0, 1); if (!entry) return -ENOSPC; @@ -3476,8 +3489,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group *block_group, rb_erase(&entry->offset_index, &ctl->free_space_offset); rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); - ret = tree_insert_offset(&cluster->root, entry->offset, - &entry->offset_index, 0); + ret = tree_insert_offset(ctl, cluster, entry); total_size += entry->bytes; ASSERT(!ret); /* -EEXIST; Logic error */ } while (node && entry != last); @@ -3671,7 +3683,7 @@ static int do_trimming(struct btrfs_block_group *block_group, __btrfs_add_free_space(block_group, reserved_start, start - reserved_start, reserved_trim_state); - if (start + bytes < reserved_start + reserved_bytes) + if (end < reserved_end) __btrfs_add_free_space(block_group, end, reserved_end - end, reserved_trim_state); __btrfs_add_free_space(block_group, start, bytes, trim_state); diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index a855e0483e03..33b4da3271b1 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -101,8 +101,6 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_block_group *block_group); -int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv); int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct inode *inode); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index b21da1446f2a..045ddce32eca 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1280,7 +1280,10 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) goto abort; btrfs_global_root_delete(free_space_root); + + spin_lock(&fs_info->trans_lock); list_del(&free_space_root->dirty_list); + spin_unlock(&fs_info->trans_lock); btrfs_tree_lock(free_space_root->node); btrfs_clear_buffer_dirty(trans, free_space_root->node); diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 0d98fc5f6f44..203d2a267828 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -543,7 +543,6 @@ struct btrfs_fs_info { * A third pool does submit_bio to avoid deadlocking with the other two. */ struct btrfs_workqueue *workers; - struct btrfs_workqueue *hipri_workers; struct btrfs_workqueue *delalloc_workers; struct btrfs_workqueue *flush_workers; struct workqueue_struct *endio_workers; @@ -577,6 +576,7 @@ struct btrfs_fs_info { s32 dirty_metadata_batch; s32 delalloc_batch; + /* Protected by 'trans_lock'. */ struct list_head dirty_cowonly_roots; struct btrfs_fs_devices *fs_devices; @@ -643,7 +643,6 @@ struct btrfs_fs_info { */ refcount_t scrub_workers_refcnt; struct workqueue_struct *scrub_workers; - struct workqueue_struct *scrub_wr_completion_workers; struct btrfs_subpage_info *subpage_info; struct btrfs_discard_ctl discard_ctl; @@ -854,7 +853,7 @@ static inline u64 btrfs_calc_metadata_size(const struct btrfs_fs_info *fs_info, static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) { - return fs_info->zone_size > 0; + return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && fs_info->zone_size > 0; } /* diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h index b80aeb715701..ede43b6c6559 100644 --- a/fs/btrfs/inode-item.h +++ b/fs/btrfs/inode-item.h @@ -60,6 +60,22 @@ struct btrfs_truncate_control { bool clear_extent_range; }; +/* + * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two + * separate u32s. These two functions convert between the two representations. + */ +static inline u64 btrfs_inode_combine_flags(u32 flags, u32 ro_flags) +{ + return (flags | ((u64)ro_flags << 32)); +} + +static inline void btrfs_inode_split_flags(u64 inode_item_flags, + u32 *flags, u32 *ro_flags) +{ + *flags = (u32)inode_item_flags; + *ro_flags = (u32)(inode_item_flags >> 32); +} + int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_truncate_control *control); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7fcafcc5292c..dbbb67293e34 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -70,6 +70,7 @@ #include "verity.h" #include "super.h" #include "orphan.h" +#include "backref.h" struct btrfs_iget_args { u64 ino; @@ -100,6 +101,18 @@ struct btrfs_rename_ctx { u64 index; }; +/* + * Used by data_reloc_print_warning_inode() to pass needed info for filename + * resolution and output of error message. + */ +struct data_reloc_warn { + struct btrfs_path path; + struct btrfs_fs_info *fs_info; + u64 extent_item_size; + u64 logical; + int mirror_num; +}; + static const struct inode_operations btrfs_dir_inode_operations; static const struct inode_operations btrfs_symlink_inode_operations; static const struct inode_operations btrfs_special_inode_operations; @@ -122,12 +135,198 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 ram_bytes, int compress_type, int type); +static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, + u64 root, void *warn_ctx) +{ + struct data_reloc_warn *warn = warn_ctx; + struct btrfs_fs_info *fs_info = warn->fs_info; + struct extent_buffer *eb; + struct btrfs_inode_item *inode_item; + struct inode_fs_paths *ipath = NULL; + struct btrfs_root *local_root; + struct btrfs_key key; + unsigned int nofs_flag; + u32 nlink; + int ret; + + local_root = btrfs_get_fs_root(fs_info, root, true); + if (IS_ERR(local_root)) { + ret = PTR_ERR(local_root); + goto err; + } + + /* This makes the path point to (inum INODE_ITEM ioff). */ + key.objectid = inum; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, local_root, &key, &warn->path, 0, 0); + if (ret) { + btrfs_put_root(local_root); + btrfs_release_path(&warn->path); + goto err; + } + + eb = warn->path.nodes[0]; + inode_item = btrfs_item_ptr(eb, warn->path.slots[0], struct btrfs_inode_item); + nlink = btrfs_inode_nlink(eb, inode_item); + btrfs_release_path(&warn->path); + + nofs_flag = memalloc_nofs_save(); + ipath = init_ipath(4096, local_root, &warn->path); + memalloc_nofs_restore(nofs_flag); + if (IS_ERR(ipath)) { + btrfs_put_root(local_root); + ret = PTR_ERR(ipath); + ipath = NULL; + /* + * -ENOMEM, not a critical error, just output an generic error + * without filename. + */ + btrfs_warn(fs_info, +"checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu", + warn->logical, warn->mirror_num, root, inum, offset); + return ret; + } + ret = paths_from_inode(inum, ipath); + if (ret < 0) + goto err; + + /* + * We deliberately ignore the bit ipath might have been too small to + * hold all of the paths here + */ + for (int i = 0; i < ipath->fspath->elem_cnt; i++) { + btrfs_warn(fs_info, +"checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)", + warn->logical, warn->mirror_num, root, inum, offset, + fs_info->sectorsize, nlink, + (char *)(unsigned long)ipath->fspath->val[i]); + } + + btrfs_put_root(local_root); + free_ipath(ipath); + return 0; + +err: + btrfs_warn(fs_info, +"checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d", + warn->logical, warn->mirror_num, root, inum, offset, ret); + + free_ipath(ipath); + return ret; +} + +/* + * Do extra user-friendly error output (e.g. lookup all the affected files). + * + * Return true if we succeeded doing the backref lookup. + * Return false if such lookup failed, and has to fallback to the old error message. + */ +static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off, + const u8 *csum, const u8 *csum_expected, + int mirror_num) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_path path = { 0 }; + struct btrfs_key found_key = { 0 }; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + const u32 csum_size = fs_info->csum_size; + u64 logical; + u64 flags; + u32 item_size; + int ret; + + mutex_lock(&fs_info->reloc_mutex); + logical = btrfs_get_reloc_bg_bytenr(fs_info); + mutex_unlock(&fs_info->reloc_mutex); + + if (logical == U64_MAX) { + btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation"); + btrfs_warn_rl(fs_info, +"csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", + inode->root->root_key.objectid, btrfs_ino(inode), file_off, + CSUM_FMT_VALUE(csum_size, csum), + CSUM_FMT_VALUE(csum_size, csum_expected), + mirror_num); + return; + } + + logical += file_off; + btrfs_warn_rl(fs_info, +"csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", + inode->root->root_key.objectid, + btrfs_ino(inode), file_off, logical, + CSUM_FMT_VALUE(csum_size, csum), + CSUM_FMT_VALUE(csum_size, csum_expected), + mirror_num); + + ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags); + if (ret < 0) { + btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d", + logical, ret); + return; + } + eb = path.nodes[0]; + ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item); + item_size = btrfs_item_size(eb, path.slots[0]); + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + unsigned long ptr = 0; + u64 ref_root; + u8 ref_level; + + while (true) { + ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, + item_size, &ref_root, + &ref_level); + if (ret < 0) { + btrfs_warn_rl(fs_info, + "failed to resolve tree backref for logical %llu: %d", + logical, ret); + break; + } + if (ret > 0) + break; + + btrfs_warn_rl(fs_info, +"csum error at logical %llu mirror %u: metadata %s (level %d) in tree %llu", + logical, mirror_num, + (ref_level ? "node" : "leaf"), + ref_level, ref_root); + } + btrfs_release_path(&path); + } else { + struct btrfs_backref_walk_ctx ctx = { 0 }; + struct data_reloc_warn reloc_warn = { 0 }; + + btrfs_release_path(&path); + + ctx.bytenr = found_key.objectid; + ctx.extent_item_pos = logical - found_key.objectid; + ctx.fs_info = fs_info; + + reloc_warn.logical = logical; + reloc_warn.extent_item_size = found_key.offset; + reloc_warn.mirror_num = mirror_num; + reloc_warn.fs_info = fs_info; + + iterate_extent_inodes(&ctx, true, + data_reloc_print_warning_inode, &reloc_warn); + } +} + static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num) { struct btrfs_root *root = inode->root; const u32 csum_size = root->fs_info->csum_size; + /* For data reloc tree, it's better to do a backref lookup instead. */ + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + return print_data_reloc_error(inode, logical_start, csum, + csum_expected, mirror_num); + /* Output without objectid, which is more meaningful */ if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) { btrfs_warn_rl(root->fs_info, @@ -636,6 +835,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) { struct btrfs_inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct address_space *mapping = inode->vfs_inode.i_mapping; u64 blocksize = fs_info->sectorsize; u64 start = async_chunk->start; u64 end = async_chunk->end; @@ -750,7 +950,7 @@ again: /* Compression level is applied here and only here */ ret = btrfs_compress_pages( compress_type | (fs_info->compress_level << 4), - inode->vfs_inode.i_mapping, start, + mapping, start, pages, &nr_pages, &total_in, @@ -793,9 +993,9 @@ cont: unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING; - unsigned long page_error_op; - page_error_op = ret < 0 ? PAGE_SET_ERROR : 0; + if (ret < 0) + mapping_set_error(mapping, -EIO); /* * inline extent creation worked or returned error, @@ -812,7 +1012,6 @@ cont: clear_flags, PAGE_UNLOCK | PAGE_START_WRITEBACK | - page_error_op | PAGE_END_WRITEBACK); /* @@ -934,6 +1133,12 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, unsigned long nr_written = 0; int page_started = 0; int ret; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .range_start = start, + .range_end = end, + .no_cgroup_owner = 1, + }; /* * Call cow_file_range() to run the delalloc range directly, since we @@ -954,8 +1159,6 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, const u64 page_start = page_offset(locked_page); const u64 page_end = page_start + PAGE_SIZE - 1; - btrfs_page_set_error(inode->root->fs_info, locked_page, - page_start, PAGE_SIZE); set_page_writeback(locked_page); end_page_writeback(locked_page); end_extent_writepage(locked_page, ret, page_start, page_end); @@ -965,7 +1168,10 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, } /* All pages will be unlocked, including @locked_page */ - return extent_write_locked_range(&inode->vfs_inode, start, end); + wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); + ret = extent_write_locked_range(&inode->vfs_inode, start, end, &wbc); + wbc_detach_inode(&wbc); + return ret; } static int submit_one_async_extent(struct btrfs_inode *inode, @@ -976,6 +1182,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode, struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_ordered_extent *ordered; struct btrfs_key ins; struct page *locked_page = NULL; struct extent_map *em; @@ -1037,7 +1244,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode, } free_extent_map(em); - ret = btrfs_add_ordered_extent(inode, start, /* file_offset */ + ordered = btrfs_alloc_ordered_extent(inode, start, /* file_offset */ async_extent->ram_size, /* num_bytes */ async_extent->ram_size, /* ram_bytes */ ins.objectid, /* disk_bytenr */ @@ -1045,8 +1252,9 @@ static int submit_one_async_extent(struct btrfs_inode *inode, 0, /* offset */ 1 << BTRFS_ORDERED_COMPRESSED, async_extent->compress_type); - if (ret) { + if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); + ret = PTR_ERR(ordered); goto out_free_reserve; } btrfs_dec_block_group_reservations(fs_info, ins.objectid); @@ -1055,11 +1263,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode, extent_clear_unlock_delalloc(inode, start, end, NULL, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_START_WRITEBACK); - - btrfs_submit_compressed_write(inode, start, /* file_offset */ - async_extent->ram_size, /* num_bytes */ - ins.objectid, /* disk_bytenr */ - ins.offset, /* compressed_len */ + btrfs_submit_compressed_write(ordered, async_extent->pages, /* compressed_pages */ async_extent->nr_pages, async_chunk->write_flags, true); @@ -1074,12 +1278,13 @@ out_free_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_free: + mapping_set_error(inode->vfs_inode.i_mapping, -EIO); extent_clear_unlock_delalloc(inode, start, end, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | - PAGE_END_WRITEBACK | PAGE_SET_ERROR); + PAGE_END_WRITEBACK); free_async_extent_pages(async_extent); goto done; } @@ -1287,6 +1492,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode, min_alloc_size = fs_info->sectorsize; while (num_bytes > 0) { + struct btrfs_ordered_extent *ordered; + cur_alloc_size = num_bytes; ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, min_alloc_size, 0, alloc_hint, @@ -1311,16 +1518,18 @@ static noinline int cow_file_range(struct btrfs_inode *inode, } free_extent_map(em); - ret = btrfs_add_ordered_extent(inode, start, ram_size, ram_size, - ins.objectid, cur_alloc_size, 0, - 1 << BTRFS_ORDERED_REGULAR, - BTRFS_COMPRESS_NONE); - if (ret) + ordered = btrfs_alloc_ordered_extent(inode, start, ram_size, + ram_size, ins.objectid, cur_alloc_size, + 0, 1 << BTRFS_ORDERED_REGULAR, + BTRFS_COMPRESS_NONE); + if (IS_ERR(ordered)) { + ret = PTR_ERR(ordered); goto out_drop_extent_cache; + } if (btrfs_is_data_reloc_root(root)) { - ret = btrfs_reloc_clone_csums(inode, start, - cur_alloc_size); + ret = btrfs_reloc_clone_csums(ordered); + /* * Only drop cache here, and process as normal. * @@ -1337,6 +1546,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, start + ram_size - 1, false); } + btrfs_put_ordered_extent(ordered); btrfs_dec_block_group_reservations(fs_info, ins.objectid); @@ -1494,7 +1704,7 @@ static noinline void async_cow_submit(struct btrfs_work *work) * ->inode could be NULL if async_chunk_start has failed to compress, * in which case we don't have anything to submit, yet we need to * always adjust ->async_delalloc_pages as its paired with the init - * happening in cow_file_range_async + * happening in run_delalloc_compressed */ if (async_chunk->inode) submit_compressed_extents(async_chunk); @@ -1521,58 +1731,36 @@ static noinline void async_cow_free(struct btrfs_work *work) kvfree(async_cow); } -static int cow_file_range_async(struct btrfs_inode *inode, - struct writeback_control *wbc, - struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written) +static bool run_delalloc_compressed(struct btrfs_inode *inode, + struct writeback_control *wbc, + struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc); struct async_cow *ctx; struct async_chunk *async_chunk; unsigned long nr_pages; - u64 cur_end; u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K); int i; - bool should_compress; unsigned nofs_flag; const blk_opf_t write_flags = wbc_to_write_flags(wbc); - unlock_extent(&inode->io_tree, start, end, NULL); - - if (inode->flags & BTRFS_INODE_NOCOMPRESS && - !btrfs_test_opt(fs_info, FORCE_COMPRESS)) { - num_chunks = 1; - should_compress = false; - } else { - should_compress = true; - } - nofs_flag = memalloc_nofs_save(); ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL); memalloc_nofs_restore(nofs_flag); + if (!ctx) + return false; - if (!ctx) { - unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | - EXTENT_DO_ACCOUNTING; - unsigned long page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | - PAGE_END_WRITEBACK | PAGE_SET_ERROR; - - extent_clear_unlock_delalloc(inode, start, end, locked_page, - clear_bits, page_ops); - return -ENOMEM; - } + unlock_extent(&inode->io_tree, start, end, NULL); + set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); async_chunk = ctx->chunks; atomic_set(&ctx->num_chunks, num_chunks); for (i = 0; i < num_chunks; i++) { - if (should_compress) - cur_end = min(end, start + SZ_512K - 1); - else - cur_end = end; + u64 cur_end = min(end, start + SZ_512K - 1); /* * igrab is called higher up in the call chain, take only the @@ -1633,13 +1821,14 @@ static int cow_file_range_async(struct btrfs_inode *inode, start = cur_end + 1; } *page_started = 1; - return 0; + return true; } static noinline int run_delalloc_zoned(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written) + unsigned long *nr_written, + struct writeback_control *wbc) { u64 done_offset = end; int ret; @@ -1671,8 +1860,8 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, account_page_redirty(locked_page); } locked_page_done = true; - extent_write_locked_range(&inode->vfs_inode, start, done_offset); - + extent_write_locked_range(&inode->vfs_inode, start, done_offset, + wbc); start = done_offset + 1; } @@ -1947,6 +2136,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, nocow_args.writeback_path = true; while (1) { + struct btrfs_ordered_extent *ordered; struct btrfs_key found_key; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; @@ -1954,6 +2144,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, u64 ram_bytes; u64 nocow_end; int extent_type; + bool is_prealloc; nocow = false; @@ -2092,8 +2283,8 @@ out_check: } nocow_end = cur_offset + nocow_args.num_bytes - 1; - - if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC; + if (is_prealloc) { u64 orig_start = found_key.offset - nocow_args.extent_offset; struct extent_map *em; @@ -2109,29 +2300,22 @@ out_check: goto error; } free_extent_map(em); - ret = btrfs_add_ordered_extent(inode, - cur_offset, nocow_args.num_bytes, - nocow_args.num_bytes, - nocow_args.disk_bytenr, - nocow_args.num_bytes, 0, - 1 << BTRFS_ORDERED_PREALLOC, - BTRFS_COMPRESS_NONE); - if (ret) { + } + + ordered = btrfs_alloc_ordered_extent(inode, cur_offset, + nocow_args.num_bytes, nocow_args.num_bytes, + nocow_args.disk_bytenr, nocow_args.num_bytes, 0, + is_prealloc + ? (1 << BTRFS_ORDERED_PREALLOC) + : (1 << BTRFS_ORDERED_NOCOW), + BTRFS_COMPRESS_NONE); + if (IS_ERR(ordered)) { + if (is_prealloc) { btrfs_drop_extent_map_range(inode, cur_offset, nocow_end, false); - goto error; } - } else { - ret = btrfs_add_ordered_extent(inode, cur_offset, - nocow_args.num_bytes, - nocow_args.num_bytes, - nocow_args.disk_bytenr, - nocow_args.num_bytes, - 0, - 1 << BTRFS_ORDERED_NOCOW, - BTRFS_COMPRESS_NONE); - if (ret) - goto error; + ret = PTR_ERR(ordered); + goto error; } if (nocow) { @@ -2145,8 +2329,8 @@ out_check: * extent_clear_unlock_delalloc() in error handler * from freeing metadata of created ordered extent. */ - ret = btrfs_reloc_clone_csums(inode, cur_offset, - nocow_args.num_bytes); + ret = btrfs_reloc_clone_csums(ordered); + btrfs_put_ordered_extent(ordered); extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, locked_page, EXTENT_LOCKED | @@ -2214,7 +2398,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page u64 start, u64 end, int *page_started, unsigned long *nr_written, struct writeback_control *wbc) { - int ret; + int ret = 0; const bool zoned = btrfs_is_zoned(inode->root->fs_info); /* @@ -2235,19 +2419,23 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root)); ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, nr_written); - } else if (!btrfs_inode_can_compress(inode) || - !inode_need_compress(inode, start, end)) { - if (zoned) - ret = run_delalloc_zoned(inode, locked_page, start, end, - page_started, nr_written); - else - ret = cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 1, NULL); - } else { - set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); - ret = cow_file_range_async(inode, wbc, locked_page, start, end, - page_started, nr_written); + goto out; } + + if (btrfs_inode_can_compress(inode) && + inode_need_compress(inode, start, end) && + run_delalloc_compressed(inode, wbc, locked_page, start, + end, page_started, nr_written)) + goto out; + + if (zoned) + ret = run_delalloc_zoned(inode, locked_page, start, end, + page_started, nr_written, wbc); + else + ret = cow_file_range(inode, locked_page, start, end, + page_started, nr_written, 1, NULL); + +out: ASSERT(ret <= 0); if (ret) btrfs_cleanup_ordered_extents(inode, locked_page, start, @@ -2515,125 +2703,42 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, } } -/* - * Split off the first pre bytes from the extent_map at [start, start + len] - * - * This function is intended to be used only for extract_ordered_extent(). - */ -static int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre) -{ - struct extent_map_tree *em_tree = &inode->extent_tree; - struct extent_map *em; - struct extent_map *split_pre = NULL; - struct extent_map *split_mid = NULL; - int ret = 0; - unsigned long flags; - - ASSERT(pre != 0); - ASSERT(pre < len); - - split_pre = alloc_extent_map(); - if (!split_pre) - return -ENOMEM; - split_mid = alloc_extent_map(); - if (!split_mid) { - ret = -ENOMEM; - goto out_free_pre; - } - - lock_extent(&inode->io_tree, start, start + len - 1, NULL); - write_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); - if (!em) { - ret = -EIO; - goto out_unlock; - } - - ASSERT(em->len == len); - ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); - ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE); - ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags)); - ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags)); - ASSERT(!list_empty(&em->list)); - - flags = em->flags; - clear_bit(EXTENT_FLAG_PINNED, &em->flags); - - /* First, replace the em with a new extent_map starting from * em->start */ - split_pre->start = em->start; - split_pre->len = pre; - split_pre->orig_start = split_pre->start; - split_pre->block_start = em->block_start; - split_pre->block_len = split_pre->len; - split_pre->orig_block_len = split_pre->block_len; - split_pre->ram_bytes = split_pre->len; - split_pre->flags = flags; - split_pre->compress_type = em->compress_type; - split_pre->generation = em->generation; - - replace_extent_mapping(em_tree, em, split_pre, 1); - - /* - * Now we only have an extent_map at: - * [em->start, em->start + pre] - */ - - /* Insert the middle extent_map. */ - split_mid->start = em->start + pre; - split_mid->len = em->len - pre; - split_mid->orig_start = split_mid->start; - split_mid->block_start = em->block_start + pre; - split_mid->block_len = split_mid->len; - split_mid->orig_block_len = split_mid->block_len; - split_mid->ram_bytes = split_mid->len; - split_mid->flags = flags; - split_mid->compress_type = em->compress_type; - split_mid->generation = em->generation; - add_extent_mapping(em_tree, split_mid, 1); - - /* Once for us */ - free_extent_map(em); - /* Once for the tree */ - free_extent_map(em); - -out_unlock: - write_unlock(&em_tree->lock); - unlock_extent(&inode->io_tree, start, start + len - 1, NULL); - free_extent_map(split_mid); -out_free_pre: - free_extent_map(split_pre); - return ret; -} - -int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, - struct btrfs_ordered_extent *ordered) +static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, + struct btrfs_ordered_extent *ordered) { u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; u64 len = bbio->bio.bi_iter.bi_size; - struct btrfs_inode *inode = bbio->inode; - u64 ordered_len = ordered->num_bytes; - int ret = 0; + struct btrfs_ordered_extent *new; + int ret; /* Must always be called for the beginning of an ordered extent. */ if (WARN_ON_ONCE(start != ordered->disk_bytenr)) return -EINVAL; /* No need to split if the ordered extent covers the entire bio. */ - if (ordered->disk_num_bytes == len) + if (ordered->disk_num_bytes == len) { + refcount_inc(&ordered->refs); + bbio->ordered = ordered; return 0; - - ret = btrfs_split_ordered_extent(ordered, len); - if (ret) - return ret; + } /* * Don't split the extent_map for NOCOW extents, as we're writing into * a pre-existing one. */ - if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) - return 0; + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { + ret = split_extent_map(bbio->inode, bbio->file_offset, + ordered->num_bytes, len, + ordered->disk_bytenr); + if (ret) + return ret; + } - return split_extent_map(inode, bbio->file_offset, ordered_len, len); + new = btrfs_split_ordered_extent(ordered, len); + if (IS_ERR(new)) + return PTR_ERR(new); + bbio->ordered = new; + return 0; } /* @@ -2651,7 +2756,7 @@ static int add_pending_csums(struct btrfs_trans_handle *trans, trans->adding_csums = true; if (!csum_root) csum_root = btrfs_csum_root(trans->fs_info, - sum->bytenr); + sum->logical); ret = btrfs_csum_file_blocks(trans, csum_root, sum); trans->adding_csums = false; if (ret) @@ -2689,8 +2794,7 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, ret = set_extent_bit(&inode->io_tree, search_start, search_start + em_len - 1, - EXTENT_DELALLOC_NEW, cached_state, - GFP_NOFS); + EXTENT_DELALLOC_NEW, cached_state); next: search_start = extent_map_end(em); free_extent_map(em); @@ -2723,8 +2827,8 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, return ret; } - return set_extent_delalloc(&inode->io_tree, start, end, extra_bits, - cached_state); + return set_extent_bit(&inode->io_tree, start, end, + EXTENT_DELALLOC | extra_bits, cached_state); } /* see btrfs_writepage_start_hook for details on why this is required */ @@ -2847,7 +2951,6 @@ out_page: mapping_set_error(page->mapping, ret); end_extent_writepage(page, ret, page_start, page_end); clear_page_dirty_for_io(page); - SetPageError(page); } btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE); unlock_page(page); @@ -3068,7 +3171,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, * an ordered extent if the range of bytes in the file it covers are * fully written. */ -int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) +int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) { struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode); struct btrfs_root *root = inode->root; @@ -3103,15 +3206,9 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - /* A valid ->physical implies a write on a sequential zone. */ - if (ordered_extent->physical != (u64)-1) { - btrfs_rewrite_logical_zoned(ordered_extent); + if (btrfs_is_zoned(fs_info)) btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes); - } else if (btrfs_is_data_reloc_root(inode->root)) { - btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, - ordered_extent->disk_num_bytes); - } if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { truncated = true; @@ -3279,6 +3376,14 @@ out: return ret; } +int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) +{ + if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) && + !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) + btrfs_finish_ordered_zoned(ordered); + return btrfs_finish_one_ordered(ordered); +} + void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, struct page *page, u64 start, u64 end, bool uptodate) @@ -4226,7 +4331,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) } btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), - 0); + false); ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), &fname.disk_name); @@ -4801,7 +4906,7 @@ again: if (only_release_metadata) set_extent_bit(&inode->io_tree, block_start, block_end, - EXTENT_NORESERVE, NULL, GFP_NOFS); + EXTENT_NORESERVE, NULL); out_unlock: if (ret) { @@ -7670,8 +7775,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, pos += submitted; length -= submitted; if (write) - btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, - pos, length, false); + btrfs_finish_ordered_extent(dio_data->ordered, NULL, + pos, length, false); else unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1, NULL); @@ -7701,12 +7806,14 @@ static void btrfs_dio_end_io(struct btrfs_bio *bbio) dip->file_offset, dip->bytes, bio->bi_status); } - if (btrfs_op(bio) == BTRFS_MAP_WRITE) - btrfs_mark_ordered_io_finished(inode, NULL, dip->file_offset, - dip->bytes, !bio->bi_status); - else + if (btrfs_op(bio) == BTRFS_MAP_WRITE) { + btrfs_finish_ordered_extent(bbio->ordered, NULL, + dip->file_offset, dip->bytes, + !bio->bi_status); + } else { unlock_extent(&inode->io_tree, dip->file_offset, dip->file_offset + dip->bytes - 1, NULL); + } bbio->bio.bi_private = bbio->private; iomap_dio_bio_end_io(bio); @@ -7742,7 +7849,8 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered); if (ret) { - btrfs_bio_end_io(bbio, errno_to_blk_status(ret)); + bbio->bio.bi_status = errno_to_blk_status(ret); + btrfs_dio_end_io(bbio); return; } } @@ -8236,7 +8344,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) int ret; struct btrfs_trans_handle *trans; u64 mask = fs_info->sectorsize - 1; - u64 min_size = btrfs_calc_metadata_size(fs_info, 1); + const u64 min_size = btrfs_calc_metadata_size(fs_info, 1); if (!skip_writeback) { ret = btrfs_wait_ordered_range(&inode->vfs_inode, @@ -8293,7 +8401,15 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) /* Migrate the slack space for the truncate to our reserve */ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); - BUG_ON(ret); + /* + * We have reserved 2 metadata units when we started the transaction and + * min_size matches 1 unit, so this should never fail, but if it does, + * it's not critical we just fail truncation. + */ + if (WARN_ON(ret)) { + btrfs_end_transaction(trans); + goto out; + } trans->block_rsv = rsv; @@ -8341,7 +8457,14 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) btrfs_block_rsv_release(fs_info, rsv, -1, NULL); ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); - BUG_ON(ret); /* shouldn't happen */ + /* + * We have reserved 2 metadata units when we started the + * transaction and min_size matches 1 unit, so this should never + * fail, but if it does, it's not critical we just fail truncation. + */ + if (WARN_ON(ret)) + break; + trans->block_rsv = rsv; } @@ -8468,7 +8591,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->io_tree.inode = ei; extent_io_tree_init(fs_info, &ei->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); - atomic_set(&ei->sync_writers, 0); mutex_init(&ei->log_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->delalloc_inodes); @@ -8639,7 +8761,7 @@ static int btrfs_getattr(struct mnt_idmap *idmap, inode_bytes = inode_get_bytes(inode); spin_unlock(&BTRFS_I(inode)->lock); stat->blocks = (ALIGN(inode_bytes, blocksize) + - ALIGN(delalloc_bytes, blocksize)) >> 9; + ALIGN(delalloc_bytes, blocksize)) >> SECTOR_SHIFT; return 0; } @@ -8795,9 +8917,9 @@ static int btrfs_rename_exchange(struct inode *old_dir, if (old_dentry->d_parent != new_dentry->d_parent) { btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), - BTRFS_I(old_inode), 1); + BTRFS_I(old_inode), true); btrfs_record_unlink_dir(trans, BTRFS_I(new_dir), - BTRFS_I(new_inode), 1); + BTRFS_I(new_inode), true); } /* src is a subvolume */ @@ -9063,7 +9185,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), - BTRFS_I(old_inode), 1); + BTRFS_I(old_inode), true); if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); @@ -10170,6 +10292,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, struct extent_io_tree *io_tree = &inode->io_tree; struct extent_changeset *data_reserved = NULL; struct extent_state *cached_state = NULL; + struct btrfs_ordered_extent *ordered; int compression; size_t orig_count; u64 start, end; @@ -10346,14 +10469,15 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, } free_extent_map(em); - ret = btrfs_add_ordered_extent(inode, start, num_bytes, ram_bytes, + ordered = btrfs_alloc_ordered_extent(inode, start, num_bytes, ram_bytes, ins.objectid, ins.offset, encoded->unencoded_offset, (1 << BTRFS_ORDERED_ENCODED) | (1 << BTRFS_ORDERED_COMPRESSED), compression); - if (ret) { + if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); + ret = PTR_ERR(ordered); goto out_free_reserved; } btrfs_dec_block_group_reservations(fs_info, ins.objectid); @@ -10365,8 +10489,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, btrfs_delalloc_release_extents(inode, num_bytes); - btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid, - ins.offset, pages, nr_pages, 0, false); + btrfs_submit_compressed_write(ordered, pages, nr_pages, 0, false); ret = orig_count; goto out; @@ -10903,7 +11026,6 @@ static const struct address_space_operations btrfs_aops = { .read_folio = btrfs_read_folio, .writepages = btrfs_writepages, .readahead = btrfs_readahead, - .direct_IO = noop_direct_IO, .invalidate_folio = btrfs_invalidate_folio, .release_folio = btrfs_release_folio, .migrate_folio = btrfs_migrate_folio, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2fa36f694daa..a895d105464b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -649,6 +649,8 @@ static noinline int create_subvol(struct mnt_idmap *idmap, } trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; + /* Tree log can't currently deal with an inode which is a new root. */ + btrfs_set_log_full_commit(trans); ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit); if (ret) @@ -757,10 +759,7 @@ out: trans->bytes_reserved = 0; btrfs_subvolume_release_metadata(root, &block_rsv); - if (ret) - btrfs_end_transaction(trans); - else - ret = btrfs_commit_transaction(trans); + btrfs_end_transaction(trans); out_new_inode_args: btrfs_new_inode_args_destroy(&new_inode_args); out_inode: @@ -2672,7 +2671,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args_v2 *vol_args; struct block_device *bdev = NULL; - fmode_t mode; + void *holder; int ret; bool cancel = false; @@ -2709,7 +2708,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto err_drop; /* Exclusive operation is now claimed */ - ret = btrfs_rm_device(fs_info, &args, &bdev, &mode); + ret = btrfs_rm_device(fs_info, &args, &bdev, &holder); btrfs_exclop_finish(fs_info); @@ -2724,7 +2723,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) err_drop: mnt_drop_write_file(file); if (bdev) - blkdev_put(bdev, mode); + blkdev_put(bdev, holder); out: btrfs_put_dev_args_from_path(&args); kfree(vol_args); @@ -2738,7 +2737,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args *vol_args; struct block_device *bdev = NULL; - fmode_t mode; + void *holder; int ret; bool cancel = false; @@ -2765,7 +2764,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, cancel); if (ret == 0) { - ret = btrfs_rm_device(fs_info, &args, &bdev, &mode); + ret = btrfs_rm_device(fs_info, &args, &bdev, &holder); if (!ret) btrfs_info(fs_info, "disk deleted %s", vol_args->name); btrfs_exclop_finish(fs_info); @@ -2773,7 +2772,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) mnt_drop_write_file(file); if (bdev) - blkdev_put(bdev, mode); + blkdev_put(bdev, holder); out: btrfs_put_dev_args_from_path(&args); kfree(vol_args); @@ -3113,6 +3112,13 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, struct btrfs_trans_handle *trans; u64 transid; + /* + * Start orphan cleanup here for the given root in case it hasn't been + * started already by other means. Errors are handled in the other + * functions during transaction commit. + */ + btrfs_orphan_cleanup(root); + trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT) @@ -3134,14 +3140,13 @@ out: static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info, void __user *argp) { - u64 transid; + /* By default wait for the current transaction. */ + u64 transid = 0; - if (argp) { + if (argp) if (copy_from_user(&transid, argp, sizeof(transid))) return -EFAULT; - } else { - transid = 0; /* current trans */ - } + return btrfs_wait_for_commit(fs_info, transid); } diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 3a496b0d3d2b..7979449a58d6 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -57,8 +57,8 @@ static struct btrfs_lockdep_keyset { u64 id; /* root objectid */ - /* Longest entry: btrfs-free-space-00 */ - char names[BTRFS_MAX_LEVEL][20]; + /* Longest entry: btrfs-block-group-00 */ + char names[BTRFS_MAX_LEVEL][24]; struct lock_class_key keys[BTRFS_MAX_LEVEL]; } btrfs_lockdep_keysets[] = { { .id = BTRFS_ROOT_TREE_OBJECTID, DEFINE_NAME("root") }, @@ -72,6 +72,7 @@ static struct btrfs_lockdep_keyset { { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, DEFINE_NAME("dreloc") }, { .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") }, { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") }, + { .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") }, { .id = 0, DEFINE_NAME("tree") }, }; diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 3a095b9c6373..d3fcfc628a4f 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -88,9 +88,9 @@ struct list_head *lzo_alloc_workspace(unsigned int level) if (!workspace) return ERR_PTR(-ENOMEM); - workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); - workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL); - workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL); + workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL | __GFP_NOWARN); + workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL | __GFP_NOWARN); + workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL | __GFP_NOWARN); if (!workspace->mem || !workspace->buf || !workspace->cbuf) goto fail; diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index 310a05cf95ef..23fc11af498a 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -252,14 +252,6 @@ void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, } #endif -#ifdef CONFIG_BTRFS_ASSERT -void __cold __noreturn btrfs_assertfail(const char *expr, const char *file, int line) -{ - pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); - BUG(); -} -#endif - void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info) { btrfs_err(fs_info, diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index ac2d1982ba3d..deedc1a168e2 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -4,14 +4,23 @@ #define BTRFS_MESSAGES_H #include <linux/types.h> +#include <linux/printk.h> +#include <linux/bug.h> struct btrfs_fs_info; +/* + * We want to be able to override this in btrfs-progs. + */ +#ifdef __KERNEL__ + static inline __printf(2, 3) __cold void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { } +#endif + #ifdef CONFIG_PRINTK #define btrfs_printk(fs_info, fmt, args...) \ @@ -160,7 +169,11 @@ do { \ } while (0) #ifdef CONFIG_BTRFS_ASSERT -void __cold __noreturn btrfs_assertfail(const char *expr, const char *file, int line); + +#define btrfs_assertfail(expr, file, line) ({ \ + pr_err("assertion failed: %s, in %s:%d\n", (expr), (file), (line)); \ + BUG(); \ +}) #define ASSERT(expr) \ (likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__)) diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 768583a440e1..005751a12911 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -143,4 +143,24 @@ static inline struct rb_node *rb_simple_insert(struct rb_root *root, u64 bytenr, return NULL; } +static inline bool bitmap_test_range_all_set(const unsigned long *addr, + unsigned long start, + unsigned long nbits) +{ + unsigned long found_zero; + + found_zero = find_next_zero_bit(addr, start + nbits, start); + return (found_zero == start + nbits); +} + +static inline bool bitmap_test_range_all_zero(const unsigned long *addr, + unsigned long start, + unsigned long nbits) +{ + unsigned long found_set; + + found_set = find_next_bit(addr, start + nbits, start); + return (found_set == start + nbits); +} + #endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a9778a91511e..a629532283bc 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -146,35 +146,11 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, return ret; } -/* - * Add an ordered extent to the per-inode tree. - * - * @inode: Inode that this extent is for. - * @file_offset: Logical offset in file where the extent starts. - * @num_bytes: Logical length of extent in file. - * @ram_bytes: Full length of unencoded data. - * @disk_bytenr: Offset of extent on disk. - * @disk_num_bytes: Size of extent on disk. - * @offset: Offset into unencoded data where file data starts. - * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*). - * @compress_type: Compression algorithm used for data. - * - * Most of these parameters correspond to &struct btrfs_file_extent_item. The - * tree is given a single reference on the ordered extent that was inserted, and - * the returned pointer is given a second reference. - * - * Return: the new ordered extent or error pointer. - */ -struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( - struct btrfs_inode *inode, u64 file_offset, - u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, - u64 disk_num_bytes, u64 offset, unsigned long flags, - int compress_type) +static struct btrfs_ordered_extent *alloc_ordered_extent( + struct btrfs_inode *inode, u64 file_offset, u64 num_bytes, + u64 ram_bytes, u64 disk_bytenr, u64 disk_num_bytes, + u64 offset, unsigned long flags, int compress_type) { - struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; - struct rb_node *node; struct btrfs_ordered_extent *entry; int ret; @@ -184,7 +160,6 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes); if (ret < 0) return ERR_PTR(ret); - ret = 0; } else { /* * The ordered extent has reserved qgroup space, release now @@ -209,15 +184,7 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( entry->compress_type = compress_type; entry->truncated_len = (u64)-1; entry->qgroup_rsv = ret; - entry->physical = (u64)-1; - - ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0); entry->flags = flags; - - percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes, - fs_info->delalloc_batch); - - /* one ref for the tree */ refcount_set(&entry->refs, 1); init_waitqueue_head(&entry->wait); INIT_LIST_HEAD(&entry->list); @@ -226,15 +193,40 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( INIT_LIST_HEAD(&entry->work_list); init_completion(&entry->completion); + /* + * We don't need the count_max_extents here, we can assume that all of + * that work has been done at higher layers, so this is truly the + * smallest the extent is going to get. + */ + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, 1); + spin_unlock(&inode->lock); + + return entry; +} + +static void insert_ordered_extent(struct btrfs_ordered_extent *entry) +{ + struct btrfs_inode *inode = BTRFS_I(entry->inode); + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct rb_node *node; + trace_btrfs_ordered_extent_add(inode, entry); + percpu_counter_add_batch(&fs_info->ordered_bytes, entry->num_bytes, + fs_info->delalloc_batch); + + /* One ref for the tree. */ + refcount_inc(&entry->refs); + spin_lock_irq(&tree->lock); - node = tree_insert(&tree->tree, file_offset, - &entry->rb_node); + node = tree_insert(&tree->tree, entry->file_offset, &entry->rb_node); if (node) btrfs_panic(fs_info, -EEXIST, "inconsistency in ordered tree at offset %llu", - file_offset); + entry->file_offset); spin_unlock_irq(&tree->lock); spin_lock(&root->ordered_extent_lock); @@ -248,43 +240,43 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( spin_unlock(&fs_info->ordered_root_lock); } spin_unlock(&root->ordered_extent_lock); - - /* - * We don't need the count_max_extents here, we can assume that all of - * that work has been done at higher layers, so this is truly the - * smallest the extent is going to get. - */ - spin_lock(&inode->lock); - btrfs_mod_outstanding_extents(inode, 1); - spin_unlock(&inode->lock); - - /* One ref for the returned entry to match semantics of lookup. */ - refcount_inc(&entry->refs); - - return entry; } /* - * Add a new btrfs_ordered_extent for the range, but drop the reference instead - * of returning it to the caller. + * Add an ordered extent to the per-inode tree. + * + * @inode: Inode that this extent is for. + * @file_offset: Logical offset in file where the extent starts. + * @num_bytes: Logical length of extent in file. + * @ram_bytes: Full length of unencoded data. + * @disk_bytenr: Offset of extent on disk. + * @disk_num_bytes: Size of extent on disk. + * @offset: Offset into unencoded data where file data starts. + * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*). + * @compress_type: Compression algorithm used for data. + * + * Most of these parameters correspond to &struct btrfs_file_extent_item. The + * tree is given a single reference on the ordered extent that was inserted, and + * the returned pointer is given a second reference. + * + * Return: the new ordered extent or error pointer. */ -int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, - u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, - u64 disk_num_bytes, u64 offset, unsigned long flags, - int compress_type) +struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( + struct btrfs_inode *inode, u64 file_offset, + u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, + u64 disk_num_bytes, u64 offset, unsigned long flags, + int compress_type) { - struct btrfs_ordered_extent *ordered; - - ordered = btrfs_alloc_ordered_extent(inode, file_offset, num_bytes, - ram_bytes, disk_bytenr, - disk_num_bytes, offset, flags, - compress_type); + struct btrfs_ordered_extent *entry; - if (IS_ERR(ordered)) - return PTR_ERR(ordered); - btrfs_put_ordered_extent(ordered); + ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0); - return 0; + entry = alloc_ordered_extent(inode, file_offset, num_bytes, ram_bytes, + disk_bytenr, disk_num_bytes, offset, flags, + compress_type); + if (!IS_ERR(entry)) + insert_ordered_extent(entry); + return entry; } /* @@ -311,6 +303,90 @@ static void finish_ordered_fn(struct btrfs_work *work) btrfs_finish_ordered_io(ordered_extent); } +static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, + struct page *page, u64 file_offset, + u64 len, bool uptodate) +{ + struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + lockdep_assert_held(&inode->ordered_tree.lock); + + if (page) { + ASSERT(page->mapping); + ASSERT(page_offset(page) <= file_offset); + ASSERT(file_offset + len <= page_offset(page) + PAGE_SIZE); + + /* + * Ordered (Private2) bit indicates whether we still have + * pending io unfinished for the ordered extent. + * + * If there's no such bit, we need to skip to next range. + */ + if (!btrfs_page_test_ordered(fs_info, page, file_offset, len)) + return false; + btrfs_page_clear_ordered(fs_info, page, file_offset, len); + } + + /* Now we're fine to update the accounting. */ + if (WARN_ON_ONCE(len > ordered->bytes_left)) { + btrfs_crit(fs_info, +"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%llu left=%llu", + inode->root->root_key.objectid, btrfs_ino(inode), + ordered->file_offset, ordered->num_bytes, + len, ordered->bytes_left); + ordered->bytes_left = 0; + } else { + ordered->bytes_left -= len; + } + + if (!uptodate) + set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); + + if (ordered->bytes_left) + return false; + + /* + * All the IO of the ordered extent is finished, we need to queue + * the finish_func to be executed. + */ + set_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags); + cond_wake_up(&ordered->wait); + refcount_inc(&ordered->refs); + trace_btrfs_ordered_extent_mark_finished(inode, ordered); + return true; +} + +static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered) +{ + struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_workqueue *wq = btrfs_is_free_space_inode(inode) ? + fs_info->endio_freespace_worker : fs_info->endio_write_workers; + + btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL); + btrfs_queue_work(wq, &ordered->work); +} + +bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, + struct page *page, u64 file_offset, u64 len, + bool uptodate) +{ + struct btrfs_inode *inode = BTRFS_I(ordered->inode); + unsigned long flags; + bool ret; + + trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate); + + spin_lock_irqsave(&inode->ordered_tree.lock, flags); + ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate); + spin_unlock_irqrestore(&inode->ordered_tree.lock, flags); + + if (ret) + btrfs_queue_ordered_fn(ordered); + return ret; +} + /* * Mark all ordered extents io inside the specified range finished. * @@ -329,22 +405,11 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, u64 num_bytes, bool uptodate) { struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_workqueue *wq; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; unsigned long flags; u64 cur = file_offset; - if (btrfs_is_free_space_inode(inode)) - wq = fs_info->endio_freespace_worker; - else - wq = fs_info->endio_write_workers; - - if (page) - ASSERT(page->mapping && page_offset(page) <= file_offset && - file_offset + num_bytes <= page_offset(page) + PAGE_SIZE); - spin_lock_irqsave(&tree->lock, flags); while (cur < file_offset + num_bytes) { u64 entry_end; @@ -397,50 +462,9 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, ASSERT(end + 1 - cur < U32_MAX); len = end + 1 - cur; - if (page) { - /* - * Ordered (Private2) bit indicates whether we still - * have pending io unfinished for the ordered extent. - * - * If there's no such bit, we need to skip to next range. - */ - if (!btrfs_page_test_ordered(fs_info, page, cur, len)) { - cur += len; - continue; - } - btrfs_page_clear_ordered(fs_info, page, cur, len); - } - - /* Now we're fine to update the accounting */ - if (unlikely(len > entry->bytes_left)) { - WARN_ON(1); - btrfs_crit(fs_info, -"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%u left=%llu", - inode->root->root_key.objectid, - btrfs_ino(inode), - entry->file_offset, - entry->num_bytes, - len, entry->bytes_left); - entry->bytes_left = 0; - } else { - entry->bytes_left -= len; - } - - if (!uptodate) - set_bit(BTRFS_ORDERED_IOERR, &entry->flags); - - /* - * All the IO of the ordered extent is finished, we need to queue - * the finish_func to be executed. - */ - if (entry->bytes_left == 0) { - set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); - cond_wake_up(&entry->wait); - refcount_inc(&entry->refs); - trace_btrfs_ordered_extent_mark_finished(inode, entry); + if (can_finish_ordered_extent(entry, page, cur, len, uptodate)) { spin_unlock_irqrestore(&tree->lock, flags); - btrfs_init_work(&entry->work, finish_ordered_fn, NULL, NULL); - btrfs_queue_work(wq, &entry->work); + btrfs_queue_ordered_fn(entry); spin_lock_irqsave(&tree->lock, flags); } cur += len; @@ -564,7 +588,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, freespace_inode = btrfs_is_free_space_inode(btrfs_inode); btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered); - /* This is paired with btrfs_add_ordered_extent. */ + /* This is paired with btrfs_alloc_ordered_extent. */ spin_lock(&btrfs_inode->lock); btrfs_mod_outstanding_extents(btrfs_inode, -1); spin_unlock(&btrfs_inode->lock); @@ -1117,17 +1141,22 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, } /* Split out a new ordered extent for this first @len bytes of @ordered. */ -int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len) +struct btrfs_ordered_extent *btrfs_split_ordered_extent( + struct btrfs_ordered_extent *ordered, u64 len) { - struct inode *inode = ordered->inode; - struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; u64 file_offset = ordered->file_offset; u64 disk_bytenr = ordered->disk_bytenr; - unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS; + unsigned long flags = ordered->flags; + struct btrfs_ordered_sum *sum, *tmpsum; + struct btrfs_ordered_extent *new; struct rb_node *node; + u64 offset = 0; - trace_btrfs_ordered_extent_split(BTRFS_I(inode), ordered); + trace_btrfs_ordered_extent_split(inode, ordered); ASSERT(!(flags & (1U << BTRFS_ORDERED_COMPRESSED))); @@ -1136,18 +1165,27 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len) * reduce the original extent to a zero length either. */ if (WARN_ON_ONCE(len >= ordered->num_bytes)) - return -EINVAL; - /* We cannot split once ordered extent is past end_bio. */ - if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) - return -EINVAL; + return ERR_PTR(-EINVAL); + /* We cannot split partially completed ordered extents. */ + if (ordered->bytes_left) { + ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS)); + if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) + return ERR_PTR(-EINVAL); + } /* We cannot split a compressed ordered extent. */ if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) - return -EINVAL; - /* Checksum list should be empty. */ - if (WARN_ON_ONCE(!list_empty(&ordered->list))) - return -EINVAL; + return ERR_PTR(-EINVAL); - spin_lock_irq(&tree->lock); + new = alloc_ordered_extent(inode, file_offset, len, len, disk_bytenr, + len, 0, flags, ordered->compress_type); + if (IS_ERR(new)) + return new; + + /* One ref for the tree. */ + refcount_inc(&new->refs); + + spin_lock_irq(&root->ordered_extent_lock); + spin_lock(&tree->lock); /* Remove from tree once */ node = &ordered->rb_node; rb_erase(node, &tree->tree); @@ -1159,26 +1197,48 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len) ordered->disk_bytenr += len; ordered->num_bytes -= len; ordered->disk_num_bytes -= len; - ordered->bytes_left -= len; + + if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) { + ASSERT(ordered->bytes_left == 0); + new->bytes_left = 0; + } else { + ordered->bytes_left -= len; + } + + if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags)) { + if (ordered->truncated_len > len) { + ordered->truncated_len -= len; + } else { + new->truncated_len = ordered->truncated_len; + ordered->truncated_len = 0; + } + } + + list_for_each_entry_safe(sum, tmpsum, &ordered->list, list) { + if (offset == len) + break; + list_move_tail(&sum->list, &new->list); + offset += sum->len; + } /* Re-insert the node */ node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node); if (node) btrfs_panic(fs_info, -EEXIST, "zoned: inconsistency in ordered tree at offset %llu", - ordered->file_offset); + ordered->file_offset); - spin_unlock_irq(&tree->lock); - - /* - * The splitting extent is already counted and will be added again in - * btrfs_add_ordered_extent(). Subtract len to avoid double counting. - */ - percpu_counter_add_batch(&fs_info->ordered_bytes, -len, fs_info->delalloc_batch); + node = tree_insert(&tree->tree, new->file_offset, &new->rb_node); + if (node) + btrfs_panic(fs_info, -EEXIST, + "zoned: inconsistency in ordered tree at offset %llu", + new->file_offset); + spin_unlock(&tree->lock); - return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len, - disk_bytenr, len, 0, flags, - ordered->compress_type); + list_add_tail(&new->root_extent_list, &root->ordered_extents); + root->nr_ordered_extents++; + spin_unlock_irq(&root->ordered_extent_lock); + return new; } int __init ordered_data_init(void) diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index f0f1138d23c3..173bd5c5df26 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -14,13 +14,13 @@ struct btrfs_ordered_inode_tree { }; struct btrfs_ordered_sum { - /* bytenr is the start of this extent on disk */ - u64 bytenr; - /* - * this is the length in bytes covered by the sums array below. + * Logical start address and length for of the blocks covered by + * the sums array. */ - int len; + u64 logical; + u32 len; + struct list_head list; /* last field is a variable length array of csums */ u8 sums[]; @@ -151,12 +151,6 @@ struct btrfs_ordered_extent { struct completion completion; struct btrfs_work flush_work; struct list_head work_list; - - /* - * Used to reverse-map physical address returned from ZONE_APPEND write - * command in a workqueue context - */ - u64 physical; }; static inline void @@ -167,11 +161,15 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) t->last = NULL; } +int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent); int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry); +bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, + struct page *page, u64 file_offset, u64 len, + bool uptodate); void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, struct page *page, u64 file_offset, u64 num_bytes, bool uptodate); @@ -183,10 +181,6 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, u64 disk_num_bytes, u64 offset, unsigned long flags, int compress_type); -int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, - u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, - u64 disk_num_bytes, u64 offset, unsigned long flags, - int compress_type); void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, @@ -212,7 +206,8 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, struct extent_state **cached_state); bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state); -int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len); +struct btrfs_ordered_extent *btrfs_split_ordered_extent( + struct btrfs_ordered_extent *ordered, u64 len); int __init ordered_data_init(void); void __cold ordered_data_exit(void); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 497b9dbd8a13..aa06d9ca911d 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -49,7 +49,7 @@ const char *btrfs_root_name(const struct btrfs_key *key, char *buf) return buf; } -static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk) +static void print_chunk(const struct extent_buffer *eb, struct btrfs_chunk *chunk) { int num_stripes = btrfs_chunk_num_stripes(eb, chunk); int i; @@ -62,7 +62,7 @@ static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk) btrfs_stripe_offset_nr(eb, chunk, i)); } } -static void print_dev_item(struct extent_buffer *eb, +static void print_dev_item(const struct extent_buffer *eb, struct btrfs_dev_item *dev_item) { pr_info("\t\tdev item devid %llu total_bytes %llu bytes used %llu\n", @@ -70,7 +70,7 @@ static void print_dev_item(struct extent_buffer *eb, btrfs_device_total_bytes(eb, dev_item), btrfs_device_bytes_used(eb, dev_item)); } -static void print_extent_data_ref(struct extent_buffer *eb, +static void print_extent_data_ref(const struct extent_buffer *eb, struct btrfs_extent_data_ref *ref) { pr_cont("extent data backref root %llu objectid %llu offset %llu count %u\n", @@ -80,7 +80,7 @@ static void print_extent_data_ref(struct extent_buffer *eb, btrfs_extent_data_ref_count(eb, ref)); } -static void print_extent_item(struct extent_buffer *eb, int slot, int type) +static void print_extent_item(const struct extent_buffer *eb, int slot, int type) { struct btrfs_extent_item *ei; struct btrfs_extent_inline_ref *iref; @@ -169,7 +169,7 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type) WARN_ON(ptr > end); } -static void print_uuid_item(struct extent_buffer *l, unsigned long offset, +static void print_uuid_item(const struct extent_buffer *l, unsigned long offset, u32 item_size) { if (!IS_ALIGNED(item_size, sizeof(u64))) { @@ -191,7 +191,7 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset, * Helper to output refs and locking status of extent buffer. Useful to debug * race condition related problems. */ -static void print_eb_refs_lock(struct extent_buffer *eb) +static void print_eb_refs_lock(const struct extent_buffer *eb) { #ifdef CONFIG_BTRFS_DEBUG btrfs_info(eb->fs_info, "refs %u lock_owner %u current %u", @@ -199,7 +199,7 @@ static void print_eb_refs_lock(struct extent_buffer *eb) #endif } -void btrfs_print_leaf(struct extent_buffer *l) +void btrfs_print_leaf(const struct extent_buffer *l) { struct btrfs_fs_info *fs_info; int i; @@ -355,7 +355,7 @@ void btrfs_print_leaf(struct extent_buffer *l) } } -void btrfs_print_tree(struct extent_buffer *c, bool follow) +void btrfs_print_tree(const struct extent_buffer *c, bool follow) { struct btrfs_fs_info *fs_info; int i; u32 nr; diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index 8c3e9319ec4e..c42bc666d5ee 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -9,8 +9,8 @@ /* Buffer size to contain tree name and possibly additional data (offset) */ #define BTRFS_ROOT_NAME_BUF_LEN 48 -void btrfs_print_leaf(struct extent_buffer *l); -void btrfs_print_tree(struct extent_buffer *c, bool follow); +void btrfs_print_leaf(const struct extent_buffer *l); +void btrfs_print_tree(const struct extent_buffer *c, bool follow); const char *btrfs_root_name(const struct btrfs_key *key, char *buf); #endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index f41da7ac360d..da1f84a0eb29 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1232,12 +1232,23 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) int ret = 0; /* - * We need to have subvol_sem write locked, to prevent races between - * concurrent tasks trying to disable quotas, because we will unlock - * and relock qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes. + * We need to have subvol_sem write locked to prevent races with + * snapshot creation. */ lockdep_assert_held_write(&fs_info->subvol_sem); + /* + * Lock the cleaner mutex to prevent races with concurrent relocation, + * because relocation may be building backrefs for blocks of the quota + * root while we are deleting the root. This is like dropping fs roots + * of deleted snapshots/subvolumes, we need the same protection. + * + * This also prevents races between concurrent tasks trying to disable + * quotas, because we will unlock and relock qgroup_ioctl_lock across + * BTRFS_FS_QUOTA_ENABLED changes. + */ + mutex_lock(&fs_info->cleaner_mutex); + mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) goto out; @@ -1301,7 +1312,9 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) goto out; } + spin_lock(&fs_info->trans_lock); list_del("a_root->dirty_list); + spin_unlock(&fs_info->trans_lock); btrfs_tree_lock(quota_root->node); btrfs_clear_buffer_dirty(trans, quota_root->node); @@ -1317,6 +1330,7 @@ out: btrfs_end_transaction(trans); else if (trans) ret = btrfs_end_transaction(trans); + mutex_unlock(&fs_info->cleaner_mutex); return ret; } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 2fab37f062de..f37b925d587f 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1079,7 +1079,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, /* see if we can add this page onto our existing bio */ if (last) { - u64 last_end = last->bi_iter.bi_sector << 9; + u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT; last_end += last->bi_iter.bi_size; /* @@ -1099,7 +1099,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, bio = bio_alloc(stripe->dev->bdev, max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1), op, GFP_NOFS); - bio->bi_iter.bi_sector = disk_start >> 9; + bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT; bio->bi_private = rbio; __bio_add_page(bio, sector->page, sectorsize, sector->pgoff); @@ -2747,3 +2747,48 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) if (!lock_stripe_add(rbio)) start_async_work(rbio, scrub_rbio_work_locked); } + +/* + * This is for scrub call sites where we already have correct data contents. + * This allows us to avoid reading data stripes again. + * + * Unfortunately here we have to do page copy, other than reusing the pages. + * This is due to the fact rbio has its own page management for its cache. + */ +void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio, + struct page **data_pages, u64 data_logical) +{ + const u64 offset_in_full_stripe = data_logical - + rbio->bioc->full_stripe_logical; + const int page_index = offset_in_full_stripe >> PAGE_SHIFT; + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + const u32 sectors_per_page = PAGE_SIZE / sectorsize; + int ret; + + /* + * If we hit ENOMEM temporarily, but later at + * raid56_parity_submit_scrub_rbio() time it succeeded, we just do + * the extra read, not a big deal. + * + * If we hit ENOMEM later at raid56_parity_submit_scrub_rbio() time, + * the bio would got proper error number set. + */ + ret = alloc_rbio_data_pages(rbio); + if (ret < 0) + return; + + /* data_logical must be at stripe boundary and inside the full stripe. */ + ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN)); + ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT)); + + for (int page_nr = 0; page_nr < (BTRFS_STRIPE_LEN >> PAGE_SHIFT); page_nr++) { + struct page *dst = rbio->stripe_pages[page_nr + page_index]; + struct page *src = data_pages[page_nr]; + + memcpy_page(dst, 0, src, 0, PAGE_SIZE); + for (int sector_nr = sectors_per_page * page_index; + sector_nr < sectors_per_page * (page_index + 1); + sector_nr++) + rbio->stripe_sectors[sector_nr].uptodate = true; + } +} diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 0f7f31c8cb98..0e84c9c9293f 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -193,6 +193,9 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, unsigned long *dbitmap, int stripe_nsectors); void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); +void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio, + struct page **data_pages, u64 data_logical); + int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 59a06499c647..25a3361caedc 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -174,8 +174,8 @@ static void mark_block_processed(struct reloc_control *rc, in_range(node->bytenr, rc->block_group->start, rc->block_group->length)) { blocksize = rc->extent_root->fs_info->nodesize; - set_extent_bits(&rc->processed_blocks, node->bytenr, - node->bytenr + blocksize - 1, EXTENT_DIRTY); + set_extent_bit(&rc->processed_blocks, node->bytenr, + node->bytenr + blocksize - 1, EXTENT_DIRTY, NULL); } node->processed = 1; } @@ -3051,9 +3051,9 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, u64 boundary_end = boundary_start + fs_info->sectorsize - 1; - set_extent_bits(&BTRFS_I(inode)->io_tree, - boundary_start, boundary_end, - EXTENT_BOUNDARY); + set_extent_bit(&BTRFS_I(inode)->io_tree, + boundary_start, boundary_end, + EXTENT_BOUNDARY, NULL); } unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, &cached_state); @@ -4342,29 +4342,25 @@ out: * cloning checksum properly handles the nodatasum extents. * it also saves CPU time to re-calculate the checksum. */ -int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len) +int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered) { + struct btrfs_inode *inode = BTRFS_I(ordered->inode); struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_root *csum_root; - struct btrfs_ordered_sum *sums; - struct btrfs_ordered_extent *ordered; - int ret; - u64 disk_bytenr; - u64 new_bytenr; + u64 disk_bytenr = ordered->file_offset + inode->index_cnt; + struct btrfs_root *csum_root = btrfs_csum_root(fs_info, disk_bytenr); LIST_HEAD(list); + int ret; - ordered = btrfs_lookup_ordered_extent(inode, file_pos); - BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len); - - disk_bytenr = file_pos + inode->index_cnt; - csum_root = btrfs_csum_root(fs_info, disk_bytenr); ret = btrfs_lookup_csums_list(csum_root, disk_bytenr, - disk_bytenr + len - 1, &list, 0, false); + disk_bytenr + ordered->num_bytes - 1, + &list, 0, false); if (ret) - goto out; + return ret; while (!list_empty(&list)) { - sums = list_entry(list.next, struct btrfs_ordered_sum, list); + struct btrfs_ordered_sum *sums = + list_entry(list.next, struct btrfs_ordered_sum, list); + list_del_init(&sums->list); /* @@ -4379,14 +4375,11 @@ int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len) * disk_len vs real len like with real inodes since it's all * disk length. */ - new_bytenr = ordered->disk_bytenr + sums->bytenr - disk_bytenr; - sums->bytenr = new_bytenr; - + sums->logical = ordered->disk_bytenr + sums->logical - disk_bytenr; btrfs_add_ordered_sum(ordered, sums); } -out: - btrfs_put_ordered_extent(ordered); - return ret; + + return 0; } int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, @@ -4523,3 +4516,19 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, ret = clone_backref_node(trans, rc, root, reloc_root); return ret; } + +/* + * Get the current bytenr for the block group which is being relocated. + * + * Return U64_MAX if no running relocation. + */ +u64 btrfs_get_reloc_bg_bytenr(struct btrfs_fs_info *fs_info) +{ + u64 logical = U64_MAX; + + lockdep_assert_held(&fs_info->reloc_mutex); + + if (fs_info->reloc_ctl && fs_info->reloc_ctl->block_group) + logical = fs_info->reloc_ctl->block_group->start; + return logical; +} diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h index 2041a86186de..77d69f6ae967 100644 --- a/fs/btrfs/relocation.h +++ b/fs/btrfs/relocation.h @@ -8,7 +8,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *r int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_recover_relocation(struct btrfs_fs_info *fs_info); -int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len); +int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered); int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *cow); @@ -19,5 +19,6 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info); struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr); int btrfs_should_ignore_reloc_root(struct btrfs_root *root); +u64 btrfs_get_reloc_bg_bytenr(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 16c228344cbb..4cae41bd6de0 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -177,7 +177,6 @@ struct scrub_ctx { struct btrfs_fs_info *fs_info; int first_free; int cur_stripe; - struct list_head csum_list; atomic_t cancel_req; int readonly; int sectors_per_bio; @@ -309,17 +308,6 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) scrub_pause_off(fs_info); } -static void scrub_free_csums(struct scrub_ctx *sctx) -{ - while (!list_empty(&sctx->csum_list)) { - struct btrfs_ordered_sum *sum; - sum = list_first_entry(&sctx->csum_list, - struct btrfs_ordered_sum, list); - list_del(&sum->list); - kfree(sum); - } -} - static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) { int i; @@ -330,7 +318,6 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) release_scrub_stripe(&sctx->stripes[i]); - scrub_free_csums(sctx); kfree(sctx); } @@ -352,7 +339,6 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( refcount_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; sctx->fs_info = fs_info; - INIT_LIST_HEAD(&sctx->csum_list); for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) { int ret; @@ -479,11 +465,8 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * struct extent_buffer *eb; struct btrfs_extent_item *ei; struct scrub_warning swarn; - unsigned long ptr = 0; u64 flags = 0; - u64 ref_root; u32 item_size; - u8 ref_level = 0; int ret; /* Super block error, no need to search extent tree. */ @@ -513,19 +496,28 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * item_size = btrfs_item_size(eb, path->slots[0]); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - do { + unsigned long ptr = 0; + u8 ref_level; + u64 ref_root; + + while (true) { ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, item_size, &ref_root, &ref_level); + if (ret < 0) { + btrfs_warn(fs_info, + "failed to resolve tree backref for logical %llu: %d", + swarn.logical, ret); + break; + } + if (ret > 0) + break; btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", - errstr, swarn.logical, - btrfs_dev_name(dev), - swarn.physical, - ref_level ? "node" : "leaf", - ret < 0 ? -1 : ref_level, - ret < 0 ? -1 : ref_root); - } while (ret != 1); + errstr, swarn.logical, btrfs_dev_name(dev), + swarn.physical, (ref_level ? "node" : "leaf"), + ref_level, ref_root); + } btrfs_release_path(path); } else { struct btrfs_backref_walk_ctx ctx = { 0 }; @@ -546,48 +538,6 @@ out: btrfs_free_path(path); } -static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc) -{ - if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5) - return 2; - else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) - return 3; - else - return (int)bioc->num_stripes; -} - -static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, - u64 full_stripe_logical, - int nstripes, int mirror, - int *stripe_index, - u64 *stripe_offset) -{ - int i; - - if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - const int nr_data_stripes = (map_type & BTRFS_BLOCK_GROUP_RAID5) ? - nstripes - 1 : nstripes - 2; - - /* RAID5/6 */ - for (i = 0; i < nr_data_stripes; i++) { - const u64 data_stripe_start = full_stripe_logical + - (i * BTRFS_STRIPE_LEN); - - if (logical >= data_stripe_start && - logical < data_stripe_start + BTRFS_STRIPE_LEN) - break; - } - - *stripe_index = i; - *stripe_offset = (logical - full_stripe_logical) & - BTRFS_STRIPE_LEN_MASK; - } else { - /* The other RAID type */ - *stripe_index = mirror; - *stripe_offset = 0; - } -} - static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) { int ret = 0; @@ -924,8 +874,9 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, /* For scrub, our mirror_num should always start at 1. */ ASSERT(stripe->mirror_num >= 1); - ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - stripe->logical, &mapped_len, &bioc); + ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, + stripe->logical, &mapped_len, &bioc, + NULL, NULL, 1); /* * If we failed, dev will be NULL, and later detailed reports * will just be skipped. @@ -1957,8 +1908,8 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, bio->bi_end_io = raid56_scrub_wait_endio; btrfs_bio_counter_inc_blocked(fs_info); - ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, full_stripe_start, - &length, &bioc); + ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start, + &length, &bioc, NULL, NULL, 1); if (ret < 0) { btrfs_put_bioc(bioc); btrfs_bio_counter_dec(fs_info); @@ -1972,6 +1923,13 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, btrfs_bio_counter_dec(fs_info); goto out; } + /* Use the recovered stripes as cache to avoid read them from disk again. */ + for (int i = 0; i < data_stripes; i++) { + stripe = &sctx->raid56_data_stripes[i]; + + raid56_parity_cache_data_pages(rbio, stripe->pages, + full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT)); + } raid56_parity_submit_scrub_rbio(rbio); wait_for_completion_io(&io_done); ret = blk_status_to_errno(bio->bi_status); @@ -2740,17 +2698,12 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info) if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt, &fs_info->scrub_lock)) { struct workqueue_struct *scrub_workers = fs_info->scrub_workers; - struct workqueue_struct *scrub_wr_comp = - fs_info->scrub_wr_completion_workers; fs_info->scrub_workers = NULL; - fs_info->scrub_wr_completion_workers = NULL; mutex_unlock(&fs_info->scrub_lock); if (scrub_workers) destroy_workqueue(scrub_workers); - if (scrub_wr_comp) - destroy_workqueue(scrub_wr_comp); } } @@ -2761,7 +2714,6 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, int is_dev_replace) { struct workqueue_struct *scrub_workers = NULL; - struct workqueue_struct *scrub_wr_comp = NULL; unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; int max_active = fs_info->thread_pool_size; int ret = -ENOMEM; @@ -2769,21 +2721,17 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt)) return 0; - scrub_workers = alloc_workqueue("btrfs-scrub", flags, - is_dev_replace ? 1 : max_active); + if (is_dev_replace) + scrub_workers = alloc_ordered_workqueue("btrfs-scrub", flags); + else + scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active); if (!scrub_workers) - goto fail_scrub_workers; - - scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active); - if (!scrub_wr_comp) - goto fail_scrub_wr_completion_workers; + return -ENOMEM; mutex_lock(&fs_info->scrub_lock); if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) { - ASSERT(fs_info->scrub_workers == NULL && - fs_info->scrub_wr_completion_workers == NULL); + ASSERT(fs_info->scrub_workers == NULL); fs_info->scrub_workers = scrub_workers; - fs_info->scrub_wr_completion_workers = scrub_wr_comp; refcount_set(&fs_info->scrub_workers_refcnt, 1); mutex_unlock(&fs_info->scrub_lock); return 0; @@ -2794,10 +2742,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, ret = 0; - destroy_workqueue(scrub_wr_comp); -fail_scrub_wr_completion_workers: destroy_workqueue(scrub_workers); -fail_scrub_workers: return ret; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index af2e153543a5..8bfd44750efe 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1774,9 +1774,21 @@ static int read_symlink(struct btrfs_root *root, ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); type = btrfs_file_extent_type(path->nodes[0], ei); + if (unlikely(type != BTRFS_FILE_EXTENT_INLINE)) { + ret = -EUCLEAN; + btrfs_crit(root->fs_info, +"send: found symlink extent that is not inline, ino %llu root %llu extent type %d", + ino, btrfs_root_id(root), type); + goto out; + } compression = btrfs_file_extent_compression(path->nodes[0], ei); - BUG_ON(type != BTRFS_FILE_EXTENT_INLINE); - BUG_ON(compression); + if (unlikely(compression != BTRFS_COMPRESS_NONE)) { + ret = -EUCLEAN; + btrfs_crit(root->fs_info, +"send: found symlink extent with compression, ino %llu root %llu compression type %d", + ino, btrfs_root_id(root), compression); + goto out; + } off = btrfs_file_extent_inline_start(ei); len = btrfs_file_extent_ram_bytes(path->nodes[0], ei); diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index dd46b978ac2c..1b999c6e4193 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -100,9 +100,6 @@ void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sector subpage_info->uptodate_offset = cur; cur += nr_bits; - subpage_info->error_offset = cur; - cur += nr_bits; - subpage_info->dirty_offset = cur; cur += nr_bits; @@ -367,28 +364,6 @@ void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, unlock_page(page); } -static bool bitmap_test_range_all_set(unsigned long *addr, unsigned int start, - unsigned int nbits) -{ - unsigned int found_zero; - - found_zero = find_next_zero_bit(addr, start + nbits, start); - if (found_zero == start + nbits) - return true; - return false; -} - -static bool bitmap_test_range_all_zero(unsigned long *addr, unsigned int start, - unsigned int nbits) -{ - unsigned int found_set; - - found_set = find_next_bit(addr, start + nbits, start); - if (found_set == start + nbits) - return true; - return false; -} - #define subpage_calc_start_bit(fs_info, page, name, start, len) \ ({ \ unsigned int start_bit; \ @@ -438,35 +413,6 @@ void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info, spin_unlock_irqrestore(&subpage->lock, flags); } -void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) -{ - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, - error, start, len); - unsigned long flags; - - spin_lock_irqsave(&subpage->lock, flags); - bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - SetPageError(page); - spin_unlock_irqrestore(&subpage->lock, flags); -} - -void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) -{ - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, - error, start, len); - unsigned long flags; - - spin_lock_irqsave(&subpage->lock, flags); - bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_zero(fs_info, subpage, error)) - ClearPageError(page); - spin_unlock_irqrestore(&subpage->lock, flags); -} - void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { @@ -628,7 +574,6 @@ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ return ret; \ } IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate); -IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered); @@ -696,7 +641,6 @@ bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ } IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate, PageUptodate); -IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError); IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io, PageDirty); IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback, @@ -767,3 +711,44 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, /* Have writers, use proper subpage helper to end it */ btrfs_page_end_writer_lock(fs_info, page, start, len); } + +#define GET_SUBPAGE_BITMAP(subpage, subpage_info, name, dst) \ + bitmap_cut(dst, subpage->bitmaps, 0, \ + subpage_info->name##_offset, subpage_info->bitmap_nr_bits) + +void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage_info *subpage_info = fs_info->subpage_info; + struct btrfs_subpage *subpage; + unsigned long uptodate_bitmap; + unsigned long error_bitmap; + unsigned long dirty_bitmap; + unsigned long writeback_bitmap; + unsigned long ordered_bitmap; + unsigned long checked_bitmap; + unsigned long flags; + + ASSERT(PagePrivate(page) && page->private); + ASSERT(subpage_info); + subpage = (struct btrfs_subpage *)page->private; + + spin_lock_irqsave(&subpage->lock, flags); + GET_SUBPAGE_BITMAP(subpage, subpage_info, uptodate, &uptodate_bitmap); + GET_SUBPAGE_BITMAP(subpage, subpage_info, dirty, &dirty_bitmap); + GET_SUBPAGE_BITMAP(subpage, subpage_info, writeback, &writeback_bitmap); + GET_SUBPAGE_BITMAP(subpage, subpage_info, ordered, &ordered_bitmap); + GET_SUBPAGE_BITMAP(subpage, subpage_info, checked, &checked_bitmap); + spin_unlock_irqrestore(&subpage->lock, flags); + + dump_page(page, "btrfs subpage dump"); + btrfs_warn(fs_info, +"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl error=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl", + start, len, page_offset(page), + subpage_info->bitmap_nr_bits, &uptodate_bitmap, + subpage_info->bitmap_nr_bits, &error_bitmap, + subpage_info->bitmap_nr_bits, &dirty_bitmap, + subpage_info->bitmap_nr_bits, &writeback_bitmap, + subpage_info->bitmap_nr_bits, &ordered_bitmap, + subpage_info->bitmap_nr_bits, &checked_bitmap); +} diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 0e80ad336904..5cbf67ccbdeb 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -8,17 +8,17 @@ /* * Extra info for subpapge bitmap. * - * For subpage we pack all uptodate/error/dirty/writeback/ordered bitmaps into + * For subpage we pack all uptodate/dirty/writeback/ordered bitmaps into * one larger bitmap. * * This structure records how they are organized in the bitmap: * - * /- uptodate_offset /- error_offset /- dirty_offset + * /- uptodate_offset /- dirty_offset /- ordered_offset * | | | * v v v - * |u|u|u|u|........|u|u|e|e|.......|e|e| ... |o|o| + * |u|u|u|u|........|u|u|d|d|.......|d|d|o|o|.......|o|o| * |<- bitmap_nr_bits ->| - * |<--------------- total_nr_bits ---------------->| + * |<----------------- total_nr_bits ------------------>| */ struct btrfs_subpage_info { /* Number of bits for each bitmap */ @@ -32,7 +32,6 @@ struct btrfs_subpage_info { * @bitmap_size, which is calculated from PAGE_SIZE / sectorsize. */ unsigned int uptodate_offset; - unsigned int error_offset; unsigned int dirty_offset; unsigned int writeback_offset; unsigned int ordered_offset; @@ -141,7 +140,6 @@ bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len); DECLARE_BTRFS_SUBPAGE_OPS(uptodate); -DECLARE_BTRFS_SUBPAGE_OPS(error); DECLARE_BTRFS_SUBPAGE_OPS(dirty); DECLARE_BTRFS_SUBPAGE_OPS(writeback); DECLARE_BTRFS_SUBPAGE_OPS(ordered); @@ -154,5 +152,7 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct page *page); void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len); +void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len); #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index efeb1a9d040a..f1dd172d8d5b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -849,8 +849,7 @@ out: * All other options will be parsed on much later in the mount process and * only when we need to allocate a new super block. */ -static int btrfs_parse_device_options(const char *options, fmode_t flags, - void *holder) +static int btrfs_parse_device_options(const char *options, blk_mode_t flags) { substring_t args[MAX_OPT_ARGS]; char *device_name, *opts, *orig, *p; @@ -884,8 +883,7 @@ static int btrfs_parse_device_options(const char *options, fmode_t flags, error = -ENOMEM; goto out; } - device = btrfs_scan_one_device(device_name, flags, - holder); + device = btrfs_scan_one_device(device_name, flags); kfree(device_name); if (IS_ERR(device)) { error = PTR_ERR(device); @@ -1442,12 +1440,9 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, struct btrfs_fs_devices *fs_devices = NULL; struct btrfs_fs_info *fs_info = NULL; void *new_sec_opts = NULL; - fmode_t mode = FMODE_READ; + blk_mode_t mode = sb_open_mode(flags); int error = 0; - if (!(flags & SB_RDONLY)) - mode |= FMODE_WRITE; - if (data) { error = security_sb_eat_lsm_opts(data, &new_sec_opts); if (error) @@ -1477,13 +1472,13 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, } mutex_lock(&uuid_mutex); - error = btrfs_parse_device_options(data, mode, fs_type); + error = btrfs_parse_device_options(data, mode); if (error) { mutex_unlock(&uuid_mutex); goto error_fs_info; } - device = btrfs_scan_one_device(device_name, mode, fs_type); + device = btrfs_scan_one_device(device_name, mode); if (IS_ERR(device)) { mutex_unlock(&uuid_mutex); error = PTR_ERR(device); @@ -1631,7 +1626,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, old_pool_size, new_pool_size); btrfs_workqueue_set_max(fs_info->workers, new_pool_size); - btrfs_workqueue_set_max(fs_info->hipri_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size); workqueue_set_max_active(fs_info->endio_workers, new_pool_size); @@ -2196,8 +2190,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case BTRFS_IOC_SCAN_DEV: mutex_lock(&uuid_mutex); - device = btrfs_scan_one_device(vol->name, FMODE_READ, - &btrfs_root_fs_type); + device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ); ret = PTR_ERR_OR_ZERO(device); mutex_unlock(&uuid_mutex); break; @@ -2211,8 +2204,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, break; case BTRFS_IOC_DEVICES_READY: mutex_lock(&uuid_mutex); - device = btrfs_scan_one_device(vol->name, FMODE_READ, - &btrfs_root_fs_type); + device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ); if (IS_ERR(device)) { mutex_unlock(&uuid_mutex); ret = PTR_ERR(device); diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index dfc5c7fa6038..f6bc6d738555 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -159,7 +159,7 @@ static int test_find_delalloc(u32 sectorsize) * |--- delalloc ---| * |--- search ---| */ - set_extent_delalloc(tmp, 0, sectorsize - 1, 0, NULL); + set_extent_bit(tmp, 0, sectorsize - 1, EXTENT_DELALLOC, NULL); start = 0; end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, locked_page, &start, @@ -190,7 +190,7 @@ static int test_find_delalloc(u32 sectorsize) test_err("couldn't find the locked page"); goto out_bits; } - set_extent_delalloc(tmp, sectorsize, max_bytes - 1, 0, NULL); + set_extent_bit(tmp, sectorsize, max_bytes - 1, EXTENT_DELALLOC, NULL); start = test_start; end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, locked_page, &start, @@ -245,7 +245,7 @@ static int test_find_delalloc(u32 sectorsize) * * We are re-using our test_start from above since it works out well. */ - set_extent_delalloc(tmp, max_bytes, total_dirty - 1, 0, NULL); + set_extent_bit(tmp, max_bytes, total_dirty - 1, EXTENT_DELALLOC, NULL); start = test_start; end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, locked_page, &start, @@ -503,8 +503,8 @@ static int test_find_first_clear_extent_bit(void) * Set 1M-4M alloc/discard and 32M-64M thus leaving a hole between * 4M-32M */ - set_extent_bits(&tree, SZ_1M, SZ_4M - 1, - CHUNK_TRIMMED | CHUNK_ALLOCATED); + set_extent_bit(&tree, SZ_1M, SZ_4M - 1, + CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL); find_first_clear_extent_bit(&tree, SZ_512K, &start, &end, CHUNK_TRIMMED | CHUNK_ALLOCATED); @@ -516,8 +516,8 @@ static int test_find_first_clear_extent_bit(void) } /* Now add 32M-64M so that we have a hole between 4M-32M */ - set_extent_bits(&tree, SZ_32M, SZ_64M - 1, - CHUNK_TRIMMED | CHUNK_ALLOCATED); + set_extent_bit(&tree, SZ_32M, SZ_64M - 1, + CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL); /* * Request first hole starting at 12M, we should get 4M-32M @@ -548,7 +548,7 @@ static int test_find_first_clear_extent_bit(void) * Set 64M-72M with CHUNK_ALLOC flag, then search for CHUNK_TRIMMED flag * being unset in this range, we should get the entry in range 64M-72M */ - set_extent_bits(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED); + set_extent_bit(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED, NULL); find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end, CHUNK_TRIMMED); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 8b6a99b8d7f6..cf306351b148 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -374,8 +374,6 @@ loop: spin_lock_init(&cur_trans->dirty_bgs_lock); INIT_LIST_HEAD(&cur_trans->deleted_bgs); spin_lock_init(&cur_trans->dropped_roots_lock); - INIT_LIST_HEAD(&cur_trans->releasing_ebs); - spin_lock_init(&cur_trans->releasing_ebs_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(fs_info, &cur_trans->dirty_pages, IO_TREE_TRANS_DIRTY_PAGES); @@ -1056,7 +1054,6 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, u64 start = 0; u64 end; - atomic_inc(&BTRFS_I(fs_info->btree_inode)->sync_writers); while (!find_first_extent_bit(dirty_pages, start, &start, &end, mark, &cached_state)) { bool wait_writeback = false; @@ -1092,7 +1089,6 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, cond_resched(); start = end + 1; } - atomic_dec(&BTRFS_I(fs_info->btree_inode)->sync_writers); return werr; } @@ -1688,7 +1684,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * insert the directory item */ ret = btrfs_set_inode_index(BTRFS_I(parent_inode), &index); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } /* check if there is a file/dir which has the same name. */ dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, @@ -2484,13 +2483,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) goto scrub_continue; } - /* - * At this point, we should have written all the tree blocks allocated - * in this transaction. So it's now safe to free the redirtyied extent - * buffers. - */ - btrfs_free_redirty_list(cur_trans); - ret = write_all_supers(fs_info, 0); /* * the super is written, we can safely allow the tree-loggers diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index fa728ab80826..8e9fa23bd7fe 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -94,9 +94,6 @@ struct btrfs_transaction { */ atomic_t pending_ordered; wait_queue_head_t pending_wait; - - spinlock_t releasing_ebs_lock; - struct list_head releasing_ebs; }; enum { diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 2138e9fc0564..038dfa8f1788 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -25,10 +25,10 @@ #include "compression.h" #include "volumes.h" #include "misc.h" -#include "btrfs_inode.h" #include "fs.h" #include "accessors.h" #include "file-item.h" +#include "inode-item.h" /* * Error message should follow the following format: @@ -1620,9 +1620,10 @@ static int check_inode_ref(struct extent_buffer *leaf, /* * Common point to switch the item-specific validation. */ -static int check_leaf_item(struct extent_buffer *leaf, - struct btrfs_key *key, int slot, - struct btrfs_key *prev_key) +static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf, + struct btrfs_key *key, + int slot, + struct btrfs_key *prev_key) { int ret = 0; struct btrfs_chunk *chunk; @@ -1671,10 +1672,13 @@ static int check_leaf_item(struct extent_buffer *leaf, ret = check_extent_data_ref(leaf, key, slot); break; } - return ret; + + if (ret) + return BTRFS_TREE_BLOCK_INVALID_ITEM; + return BTRFS_TREE_BLOCK_CLEAN; } -static int check_leaf(struct extent_buffer *leaf, bool check_item_data) +enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf) { struct btrfs_fs_info *fs_info = leaf->fs_info; /* No valid key type is 0, so all key should be larger than this key */ @@ -1687,7 +1691,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) generic_err(leaf, 0, "invalid level for leaf, have %d expect 0", btrfs_header_level(leaf)); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_INVALID_LEVEL; } /* @@ -1710,32 +1714,32 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) generic_err(leaf, 0, "invalid root, root %llu must never be empty", owner); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_INVALID_NRITEMS; } /* Unknown tree */ if (unlikely(owner == 0)) { generic_err(leaf, 0, "invalid owner, root 0 is not defined"); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_INVALID_OWNER; } /* EXTENT_TREE_V2 can have empty extent trees. */ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) - return 0; + return BTRFS_TREE_BLOCK_CLEAN; if (unlikely(owner == BTRFS_EXTENT_TREE_OBJECTID)) { generic_err(leaf, 0, "invalid root, root %llu must never be empty", owner); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_INVALID_NRITEMS; } - return 0; + return BTRFS_TREE_BLOCK_CLEAN; } if (unlikely(nritems == 0)) - return 0; + return BTRFS_TREE_BLOCK_CLEAN; /* * Check the following things to make sure this is a good leaf, and @@ -1751,7 +1755,6 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) for (slot = 0; slot < nritems; slot++) { u32 item_end_expected; u64 item_data_end; - int ret; btrfs_item_key_to_cpu(leaf, &key, slot); @@ -1762,7 +1765,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) prev_key.objectid, prev_key.type, prev_key.offset, key.objectid, key.type, key.offset); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_BAD_KEY_ORDER; } item_data_end = (u64)btrfs_item_offset(leaf, slot) + @@ -1781,7 +1784,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) generic_err(leaf, slot, "unexpected item end, have %llu expect %u", item_data_end, item_end_expected); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_INVALID_OFFSETS; } /* @@ -1793,7 +1796,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) generic_err(leaf, slot, "slot end outside of leaf, have %llu expect range [0, %u]", item_data_end, BTRFS_LEAF_DATA_SIZE(fs_info)); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_INVALID_OFFSETS; } /* Also check if the item pointer overlaps with btrfs item. */ @@ -1804,16 +1807,22 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) btrfs_item_nr_offset(leaf, slot) + sizeof(struct btrfs_item), btrfs_item_ptr_offset(leaf, slot)); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_INVALID_OFFSETS; } - if (check_item_data) { + /* + * We only want to do this if WRITTEN is set, otherwise the leaf + * may be in some intermediate state and won't appear valid. + */ + if (btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_WRITTEN)) { + enum btrfs_tree_block_status ret; + /* * Check if the item size and content meet other * criteria */ ret = check_leaf_item(leaf, &key, slot, &prev_key); - if (unlikely(ret < 0)) + if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN)) return ret; } @@ -1822,21 +1831,21 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) prev_key.offset = key.offset; } - return 0; + return BTRFS_TREE_BLOCK_CLEAN; } -int btrfs_check_leaf_full(struct extent_buffer *leaf) +int btrfs_check_leaf(struct extent_buffer *leaf) { - return check_leaf(leaf, true); -} -ALLOW_ERROR_INJECTION(btrfs_check_leaf_full, ERRNO); + enum btrfs_tree_block_status ret; -int btrfs_check_leaf_relaxed(struct extent_buffer *leaf) -{ - return check_leaf(leaf, false); + ret = __btrfs_check_leaf(leaf); + if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN)) + return -EUCLEAN; + return 0; } +ALLOW_ERROR_INJECTION(btrfs_check_leaf, ERRNO); -int btrfs_check_node(struct extent_buffer *node) +enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node) { struct btrfs_fs_info *fs_info = node->fs_info; unsigned long nr = btrfs_header_nritems(node); @@ -1844,13 +1853,12 @@ int btrfs_check_node(struct extent_buffer *node) int slot; int level = btrfs_header_level(node); u64 bytenr; - int ret = 0; if (unlikely(level <= 0 || level >= BTRFS_MAX_LEVEL)) { generic_err(node, 0, "invalid level for node, have %d expect [1, %d]", level, BTRFS_MAX_LEVEL - 1); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_INVALID_LEVEL; } if (unlikely(nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info))) { btrfs_crit(fs_info, @@ -1858,7 +1866,7 @@ int btrfs_check_node(struct extent_buffer *node) btrfs_header_owner(node), node->start, nr == 0 ? "small" : "large", nr, BTRFS_NODEPTRS_PER_BLOCK(fs_info)); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_INVALID_NRITEMS; } for (slot = 0; slot < nr - 1; slot++) { @@ -1869,15 +1877,13 @@ int btrfs_check_node(struct extent_buffer *node) if (unlikely(!bytenr)) { generic_err(node, slot, "invalid NULL node pointer"); - ret = -EUCLEAN; - goto out; + return BTRFS_TREE_BLOCK_INVALID_BLOCKPTR; } if (unlikely(!IS_ALIGNED(bytenr, fs_info->sectorsize))) { generic_err(node, slot, "unaligned pointer, have %llu should be aligned to %u", bytenr, fs_info->sectorsize); - ret = -EUCLEAN; - goto out; + return BTRFS_TREE_BLOCK_INVALID_BLOCKPTR; } if (unlikely(btrfs_comp_cpu_keys(&key, &next_key) >= 0)) { @@ -1886,12 +1892,20 @@ int btrfs_check_node(struct extent_buffer *node) key.objectid, key.type, key.offset, next_key.objectid, next_key.type, next_key.offset); - ret = -EUCLEAN; - goto out; + return BTRFS_TREE_BLOCK_BAD_KEY_ORDER; } } -out: - return ret; + return BTRFS_TREE_BLOCK_CLEAN; +} + +int btrfs_check_node(struct extent_buffer *node) +{ + enum btrfs_tree_block_status ret; + + ret = __btrfs_check_node(node); + if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN)) + return -EUCLEAN; + return 0; } ALLOW_ERROR_INJECTION(btrfs_check_node, ERRNO); @@ -1949,3 +1963,61 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner) } return 0; } + +int btrfs_verify_level_key(struct extent_buffer *eb, int level, + struct btrfs_key *first_key, u64 parent_transid) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + int found_level; + struct btrfs_key found_key; + int ret; + + found_level = btrfs_header_level(eb); + if (found_level != level) { + WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), + KERN_ERR "BTRFS: tree level check failed\n"); + btrfs_err(fs_info, +"tree level mismatch detected, bytenr=%llu level expected=%u has=%u", + eb->start, level, found_level); + return -EIO; + } + + if (!first_key) + return 0; + + /* + * For live tree block (new tree blocks in current transaction), + * we need proper lock context to avoid race, which is impossible here. + * So we only checks tree blocks which is read from disk, whose + * generation <= fs_info->last_trans_committed. + */ + if (btrfs_header_generation(eb) > fs_info->last_trans_committed) + return 0; + + /* We have @first_key, so this @eb must have at least one item */ + if (btrfs_header_nritems(eb) == 0) { + btrfs_err(fs_info, + "invalid tree nritems, bytenr=%llu nritems=0 expect >0", + eb->start); + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + return -EUCLEAN; + } + + if (found_level) + btrfs_node_key_to_cpu(eb, &found_key, 0); + else + btrfs_item_key_to_cpu(eb, &found_key, 0); + ret = btrfs_comp_cpu_keys(first_key, &found_key); + + if (ret) { + WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), + KERN_ERR "BTRFS: tree first key check failed\n"); + btrfs_err(fs_info, +"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", + eb->start, parent_transid, first_key->objectid, + first_key->type, first_key->offset, + found_key.objectid, found_key.type, + found_key.offset); + } + return ret; +} diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index bfb5efa4e01f..3c2a02a72f64 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -40,22 +40,33 @@ struct btrfs_tree_parent_check { u8 level; }; -/* - * Comprehensive leaf checker. - * Will check not only the item pointers, but also every possible member - * in item data. - */ -int btrfs_check_leaf_full(struct extent_buffer *leaf); +enum btrfs_tree_block_status { + BTRFS_TREE_BLOCK_CLEAN, + BTRFS_TREE_BLOCK_INVALID_NRITEMS, + BTRFS_TREE_BLOCK_INVALID_PARENT_KEY, + BTRFS_TREE_BLOCK_BAD_KEY_ORDER, + BTRFS_TREE_BLOCK_INVALID_LEVEL, + BTRFS_TREE_BLOCK_INVALID_FREE_SPACE, + BTRFS_TREE_BLOCK_INVALID_OFFSETS, + BTRFS_TREE_BLOCK_INVALID_BLOCKPTR, + BTRFS_TREE_BLOCK_INVALID_ITEM, + BTRFS_TREE_BLOCK_INVALID_OWNER, +}; /* - * Less strict leaf checker. - * Will only check item pointers, not reading item data. + * Exported simply for btrfs-progs which wants to have the + * btrfs_tree_block_status return codes. */ -int btrfs_check_leaf_relaxed(struct extent_buffer *leaf); +enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf); +enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node); + +int btrfs_check_leaf(struct extent_buffer *leaf); int btrfs_check_node(struct extent_buffer *node); int btrfs_check_chunk_valid(struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 logical); int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner); +int btrfs_verify_level_key(struct extent_buffer *eb, int level, + struct btrfs_key *first_key, u64 parent_transid); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d2755d5e338b..365a1cc0a3c3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -859,10 +859,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum, list); csum_root = btrfs_csum_root(fs_info, - sums->bytenr); + sums->logical); if (!ret) ret = btrfs_del_csums(trans, csum_root, - sums->bytenr, + sums->logical, sums->len); if (!ret) ret = btrfs_csum_file_blocks(trans, @@ -3252,7 +3252,7 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, * Returns 1 if the inode was logged before in the transaction, 0 if it was not, * and < 0 on error. */ -static int inode_logged(struct btrfs_trans_handle *trans, +static int inode_logged(const struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path_in) { @@ -4056,14 +4056,14 @@ static int drop_inode_items(struct btrfs_trans_handle *trans, while (1) { ret = btrfs_search_slot(trans, log, &key, path, -1, 1); - BUG_ON(ret == 0); /* Logic error */ - if (ret < 0) - break; - - if (path->slots[0] == 0) + if (ret < 0) { break; + } else if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } - path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); @@ -4221,7 +4221,7 @@ static int log_csums(struct btrfs_trans_handle *trans, struct btrfs_root *log_root, struct btrfs_ordered_sum *sums) { - const u64 lock_end = sums->bytenr + sums->len - 1; + const u64 lock_end = sums->logical + sums->len - 1; struct extent_state *cached_state = NULL; int ret; @@ -4239,7 +4239,7 @@ static int log_csums(struct btrfs_trans_handle *trans, * file which happens to refer to the same extent as well. Such races * can leave checksum items in the log with overlapping ranges. */ - ret = lock_extent(&log_root->log_csum_range, sums->bytenr, lock_end, + ret = lock_extent(&log_root->log_csum_range, sums->logical, lock_end, &cached_state); if (ret) return ret; @@ -4252,11 +4252,11 @@ static int log_csums(struct btrfs_trans_handle *trans, * some checksums missing in the fs/subvolume tree. So just delete (or * trim and adjust) any existing csum items in the log for this range. */ - ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len); + ret = btrfs_del_csums(trans, log_root, sums->logical, sums->len); if (!ret) ret = btrfs_csum_file_blocks(trans, log_root, sums); - unlock_extent(&log_root->log_csum_range, sums->bytenr, lock_end, + unlock_extent(&log_root->log_csum_range, sums->logical, lock_end, &cached_state); return ret; @@ -5303,7 +5303,7 @@ out: * multiple times when multiple tasks have joined the same log transaction. */ static bool need_log_inode(const struct btrfs_trans_handle *trans, - const struct btrfs_inode *inode) + struct btrfs_inode *inode) { /* * If a directory was not modified, no dentries added or removed, we can @@ -5321,7 +5321,7 @@ static bool need_log_inode(const struct btrfs_trans_handle *trans, * logged_trans will be 0, in which case we have to fully log it since * logged_trans is a transient field, not persisted. */ - if (inode->logged_trans == trans->transid && + if (inode_logged(trans, inode, NULL) == 1 && !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) return false; @@ -7309,7 +7309,7 @@ error: */ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - int for_rename) + bool for_rename) { /* * when we're logging a file, if it hasn't been renamed @@ -7325,18 +7325,25 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, inode->last_unlink_trans = trans->transid; mutex_unlock(&inode->log_mutex); + if (!for_rename) + return; + /* - * if this directory was already logged any new - * names for this file/dir will get recorded + * If this directory was already logged, any new names will be logged + * with btrfs_log_new_name() and old names will be deleted from the log + * tree with btrfs_del_dir_entries_in_log() or with + * btrfs_del_inode_ref_in_log(). */ - if (dir->logged_trans == trans->transid) + if (inode_logged(trans, dir, NULL) == 1) return; /* - * if the inode we're about to unlink was logged, - * the log will be properly updated for any new names + * If the inode we're about to unlink was logged before, the log will be + * properly updated with the new name with btrfs_log_new_name() and the + * old name removed with btrfs_del_dir_entries_in_log() or with + * btrfs_del_inode_ref_in_log(). */ - if (inode->logged_trans == trans->transid) + if (inode_logged(trans, inode, NULL) == 1) return; /* @@ -7346,13 +7353,6 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, * properly. So, we have to be conservative and force commits * so the new name gets discovered. */ - if (for_rename) - goto record; - - /* we can safely do the unlink without any special recording */ - return; - -record: mutex_lock(&dir->log_mutex); dir->last_unlink_trans = trans->transid; mutex_unlock(&dir->log_mutex); diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index bdeb5216718f..a550a8a375cd 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -100,7 +100,7 @@ void btrfs_end_log_trans(struct btrfs_root *root); void btrfs_pin_log_trans(struct btrfs_root *root); void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - int for_rename); + bool for_rename); void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, struct btrfs_inode *dir); void btrfs_log_new_name(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index a555baa0143a..3df6153d5d5a 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -226,21 +226,32 @@ int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, enum btrfs_mod_log_op op) { struct tree_mod_elem *tm; - int ret; + int ret = 0; if (!tree_mod_need_log(eb->fs_info, eb)) return 0; tm = alloc_tree_mod_elem(eb, slot, op); if (!tm) - return -ENOMEM; + ret = -ENOMEM; if (tree_mod_dont_log(eb->fs_info, eb)) { kfree(tm); + /* + * Don't error if we failed to allocate memory because we don't + * need to log. + */ return 0; + } else if (ret != 0) { + /* + * We previously failed to allocate memory and we need to log, + * so we have to fail. + */ + goto out_unlock; } ret = tree_mod_log_insert(eb->fs_info, tm); +out_unlock: write_unlock(&eb->fs_info->tree_mod_log_lock); if (ret) kfree(tm); @@ -248,6 +259,26 @@ int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, return ret; } +static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb, + int dst_slot, int src_slot, + int nr_items) +{ + struct tree_mod_elem *tm; + + tm = kzalloc(sizeof(*tm), GFP_NOFS); + if (!tm) + return ERR_PTR(-ENOMEM); + + tm->logical = eb->start; + tm->slot = src_slot; + tm->move.dst_slot = dst_slot; + tm->move.nr_items = nr_items; + tm->op = BTRFS_MOD_LOG_MOVE_KEYS; + RB_CLEAR_NODE(&tm->node); + + return tm; +} + int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, int dst_slot, int src_slot, int nr_items) @@ -262,35 +293,46 @@ int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, return 0; tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS); - if (!tm_list) - return -ENOMEM; - - tm = kzalloc(sizeof(*tm), GFP_NOFS); - if (!tm) { + if (!tm_list) { ret = -ENOMEM; - goto free_tms; + goto lock; } - tm->logical = eb->start; - tm->slot = src_slot; - tm->move.dst_slot = dst_slot; - tm->move.nr_items = nr_items; - tm->op = BTRFS_MOD_LOG_MOVE_KEYS; + tm = tree_mod_log_alloc_move(eb, dst_slot, src_slot, nr_items); + if (IS_ERR(tm)) { + ret = PTR_ERR(tm); + tm = NULL; + goto lock; + } for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING); if (!tm_list[i]) { ret = -ENOMEM; - goto free_tms; + goto lock; } } - if (tree_mod_dont_log(eb->fs_info, eb)) +lock: + if (tree_mod_dont_log(eb->fs_info, eb)) { + /* + * Don't error if we failed to allocate memory because we don't + * need to log. + */ + ret = 0; goto free_tms; + } locked = true; /* + * We previously failed to allocate memory and we need to log, so we + * have to fail. + */ + if (ret != 0) + goto free_tms; + + /* * When we override something during the move, we log these removals. * This can only happen when we move towards the beginning of the * buffer, i.e. dst_slot < src_slot. @@ -310,10 +352,12 @@ int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, return 0; free_tms: - for (i = 0; i < nr_items; i++) { - if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) - rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log); - kfree(tm_list[i]); + if (tm_list) { + for (i = 0; i < nr_items; i++) { + if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log); + kfree(tm_list[i]); + } } if (locked) write_unlock(&eb->fs_info->tree_mod_log_lock); @@ -363,14 +407,14 @@ int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, GFP_NOFS); if (!tm_list) { ret = -ENOMEM; - goto free_tms; + goto lock; } for (i = 0; i < nritems; i++) { tm_list[i] = alloc_tree_mod_elem(old_root, i, BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING); if (!tm_list[i]) { ret = -ENOMEM; - goto free_tms; + goto lock; } } } @@ -378,7 +422,7 @@ int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, tm = kzalloc(sizeof(*tm), GFP_NOFS); if (!tm) { ret = -ENOMEM; - goto free_tms; + goto lock; } tm->logical = new_root->start; @@ -387,14 +431,28 @@ int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, tm->generation = btrfs_header_generation(old_root); tm->op = BTRFS_MOD_LOG_ROOT_REPLACE; - if (tree_mod_dont_log(fs_info, NULL)) +lock: + if (tree_mod_dont_log(fs_info, NULL)) { + /* + * Don't error if we failed to allocate memory because we don't + * need to log. + */ + ret = 0; goto free_tms; + } else if (ret != 0) { + /* + * We previously failed to allocate memory and we need to log, + * so we have to fail. + */ + goto out_unlock; + } if (tm_list) ret = tree_mod_log_free_eb(fs_info, tm_list, nritems); if (!ret) ret = tree_mod_log_insert(fs_info, tm); +out_unlock: write_unlock(&fs_info->tree_mod_log_lock); if (ret) goto free_tms; @@ -486,9 +544,14 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, struct btrfs_fs_info *fs_info = dst->fs_info; int ret = 0; struct tree_mod_elem **tm_list = NULL; - struct tree_mod_elem **tm_list_add, **tm_list_rem; + struct tree_mod_elem **tm_list_add = NULL; + struct tree_mod_elem **tm_list_rem = NULL; int i; bool locked = false; + struct tree_mod_elem *dst_move_tm = NULL; + struct tree_mod_elem *src_move_tm = NULL; + u32 dst_move_nr_items = btrfs_header_nritems(dst) - dst_offset; + u32 src_move_nr_items = btrfs_header_nritems(src) - (src_offset + nr_items); if (!tree_mod_need_log(fs_info, NULL)) return 0; @@ -498,8 +561,30 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *), GFP_NOFS); - if (!tm_list) - return -ENOMEM; + if (!tm_list) { + ret = -ENOMEM; + goto lock; + } + + if (dst_move_nr_items) { + dst_move_tm = tree_mod_log_alloc_move(dst, dst_offset + nr_items, + dst_offset, dst_move_nr_items); + if (IS_ERR(dst_move_tm)) { + ret = PTR_ERR(dst_move_tm); + dst_move_tm = NULL; + goto lock; + } + } + if (src_move_nr_items) { + src_move_tm = tree_mod_log_alloc_move(src, src_offset, + src_offset + nr_items, + src_move_nr_items); + if (IS_ERR(src_move_tm)) { + ret = PTR_ERR(src_move_tm); + src_move_tm = NULL; + goto lock; + } + } tm_list_add = tm_list; tm_list_rem = tm_list + nr_items; @@ -508,21 +593,40 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, BTRFS_MOD_LOG_KEY_REMOVE); if (!tm_list_rem[i]) { ret = -ENOMEM; - goto free_tms; + goto lock; } tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset, BTRFS_MOD_LOG_KEY_ADD); if (!tm_list_add[i]) { ret = -ENOMEM; - goto free_tms; + goto lock; } } - if (tree_mod_dont_log(fs_info, NULL)) +lock: + if (tree_mod_dont_log(fs_info, NULL)) { + /* + * Don't error if we failed to allocate memory because we don't + * need to log. + */ + ret = 0; goto free_tms; + } locked = true; + /* + * We previously failed to allocate memory and we need to log, so we + * have to fail. + */ + if (ret != 0) + goto free_tms; + + if (dst_move_tm) { + ret = tree_mod_log_insert(fs_info, dst_move_tm); + if (ret) + goto free_tms; + } for (i = 0; i < nr_items; i++) { ret = tree_mod_log_insert(fs_info, tm_list_rem[i]); if (ret) @@ -531,6 +635,11 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, if (ret) goto free_tms; } + if (src_move_tm) { + ret = tree_mod_log_insert(fs_info, src_move_tm); + if (ret) + goto free_tms; + } write_unlock(&fs_info->tree_mod_log_lock); kfree(tm_list); @@ -538,10 +647,18 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, return 0; free_tms: - for (i = 0; i < nr_items * 2; i++) { - if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) - rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); - kfree(tm_list[i]); + if (dst_move_tm && !RB_EMPTY_NODE(&dst_move_tm->node)) + rb_erase(&dst_move_tm->node, &fs_info->tree_mod_log); + kfree(dst_move_tm); + if (src_move_tm && !RB_EMPTY_NODE(&src_move_tm->node)) + rb_erase(&src_move_tm->node, &fs_info->tree_mod_log); + kfree(src_move_tm); + if (tm_list) { + for (i = 0; i < nr_items * 2; i++) { + if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); + kfree(tm_list[i]); + } } if (locked) write_unlock(&fs_info->tree_mod_log_lock); @@ -562,22 +679,38 @@ int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb) nritems = btrfs_header_nritems(eb); tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS); - if (!tm_list) - return -ENOMEM; + if (!tm_list) { + ret = -ENOMEM; + goto lock; + } for (i = 0; i < nritems; i++) { tm_list[i] = alloc_tree_mod_elem(eb, i, BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING); if (!tm_list[i]) { ret = -ENOMEM; - goto free_tms; + goto lock; } } - if (tree_mod_dont_log(eb->fs_info, eb)) +lock: + if (tree_mod_dont_log(eb->fs_info, eb)) { + /* + * Don't error if we failed to allocate memory because we don't + * need to log. + */ + ret = 0; goto free_tms; + } else if (ret != 0) { + /* + * We previously failed to allocate memory and we need to log, + * so we have to fail. + */ + goto out_unlock; + } ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems); +out_unlock: write_unlock(&eb->fs_info->tree_mod_log_lock); if (ret) goto free_tms; @@ -586,9 +719,11 @@ int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb) return 0; free_tms: - for (i = 0; i < nritems; i++) - kfree(tm_list[i]); - kfree(tm_list); + if (tm_list) { + for (i = 0; i < nritems; i++) + kfree(tm_list[i]); + kfree(tm_list); + } return ret; } @@ -664,10 +799,27 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, unsigned long o_dst; unsigned long o_src; unsigned long p_size = sizeof(struct btrfs_key_ptr); + /* + * max_slot tracks the maximum valid slot of the rewind eb at every + * step of the rewind. This is in contrast with 'n' which eventually + * matches the number of items, but can be wrong during moves or if + * removes overlap on already valid slots (which is probably separately + * a bug). We do this to validate the offsets of memmoves for rewinding + * moves and detect invalid memmoves. + * + * Since a rewind eb can start empty, max_slot is a signed integer with + * a special meaning for -1, which is that no slot is valid to move out + * of. Any other negative value is invalid. + */ + int max_slot; + int move_src_end_slot; + int move_dst_end_slot; n = btrfs_header_nritems(eb); + max_slot = n - 1; read_lock(&fs_info->tree_mod_log_lock); while (tm && tm->seq >= time_seq) { + ASSERT(max_slot >= -1); /* * All the operations are recorded with the operator used for * the modification. As we're going backwards, we do the @@ -684,6 +836,8 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, btrfs_set_node_ptr_generation(eb, tm->slot, tm->generation); n++; + if (tm->slot > max_slot) + max_slot = tm->slot; break; case BTRFS_MOD_LOG_KEY_REPLACE: BUG_ON(tm->slot >= n); @@ -693,14 +847,37 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, tm->generation); break; case BTRFS_MOD_LOG_KEY_ADD: + /* + * It is possible we could have already removed keys + * behind the known max slot, so this will be an + * overestimate. In practice, the copy operation + * inserts them in increasing order, and overestimating + * just means we miss some warnings, so it's OK. It + * isn't worth carefully tracking the full array of + * valid slots to check against when moving. + */ + if (tm->slot == max_slot) + max_slot--; /* if a move operation is needed it's in the log */ n--; break; case BTRFS_MOD_LOG_MOVE_KEYS: + ASSERT(tm->move.nr_items > 0); + move_src_end_slot = tm->move.dst_slot + tm->move.nr_items - 1; + move_dst_end_slot = tm->slot + tm->move.nr_items - 1; o_dst = btrfs_node_key_ptr_offset(eb, tm->slot); o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot); + if (WARN_ON(move_src_end_slot > max_slot || + tm->move.nr_items <= 0)) { + btrfs_warn(fs_info, +"move from invalid tree mod log slot eb %llu slot %d dst_slot %d nr_items %d seq %llu n %u max_slot %d", + eb->start, tm->slot, + tm->move.dst_slot, tm->move.nr_items, + tm->seq, n, max_slot); + } memmove_extent_buffer(eb, o_dst, o_src, tm->move.nr_items * p_size); + max_slot = move_dst_end_slot; break; case BTRFS_MOD_LOG_ROOT_REPLACE: /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 72a838c97534..73f9ea7672db 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -370,6 +370,8 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, { struct btrfs_fs_devices *fs_devs; + ASSERT(fsid || !metadata_fsid); + fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL); if (!fs_devs) return ERR_PTR(-ENOMEM); @@ -380,18 +382,17 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, INIT_LIST_HEAD(&fs_devs->alloc_list); INIT_LIST_HEAD(&fs_devs->fs_list); INIT_LIST_HEAD(&fs_devs->seed_list); - if (fsid) - memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); - if (metadata_fsid) - memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE); - else if (fsid) - memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE); + if (fsid) { + memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); + memcpy(fs_devs->metadata_uuid, + metadata_fsid ?: fsid, BTRFS_FSID_SIZE); + } return fs_devs; } -void btrfs_free_device(struct btrfs_device *device) +static void btrfs_free_device(struct btrfs_device *device) { WARN_ON(!list_empty(&device->post_commit_list)); rcu_string_free(device->name); @@ -426,6 +427,21 @@ void __exit btrfs_cleanup_fs_uuids(void) } } +static bool match_fsid_fs_devices(const struct btrfs_fs_devices *fs_devices, + const u8 *fsid, const u8 *metadata_fsid) +{ + if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) != 0) + return false; + + if (!metadata_fsid) + return true; + + if (memcmp(metadata_fsid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE) != 0) + return false; + + return true; +} + static noinline struct btrfs_fs_devices *find_fsid( const u8 *fsid, const u8 *metadata_fsid) { @@ -435,19 +451,25 @@ static noinline struct btrfs_fs_devices *find_fsid( /* Handle non-split brain cases */ list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (metadata_fsid) { - if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0 - && memcmp(metadata_fsid, fs_devices->metadata_uuid, - BTRFS_FSID_SIZE) == 0) - return fs_devices; - } else { - if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) - return fs_devices; - } + if (match_fsid_fs_devices(fs_devices, fsid, metadata_fsid)) + return fs_devices; } return NULL; } +/* + * First check if the metadata_uuid is different from the fsid in the given + * fs_devices. Then check if the given fsid is the same as the metadata_uuid + * in the fs_devices. If it is, return true; otherwise, return false. + */ +static inline bool check_fsid_changed(const struct btrfs_fs_devices *fs_devices, + const u8 *fsid) +{ + return memcmp(fs_devices->fsid, fs_devices->metadata_uuid, + BTRFS_FSID_SIZE) != 0 && + memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE) == 0; +} + static struct btrfs_fs_devices *find_fsid_with_metadata_uuid( struct btrfs_super_block *disk_super) { @@ -461,14 +483,14 @@ static struct btrfs_fs_devices *find_fsid_with_metadata_uuid( * at all and the CHANGING_FSID_V2 flag set. */ list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (fs_devices->fsid_change && - memcmp(disk_super->metadata_uuid, fs_devices->fsid, - BTRFS_FSID_SIZE) == 0 && - memcmp(fs_devices->fsid, fs_devices->metadata_uuid, - BTRFS_FSID_SIZE) == 0) { + if (!fs_devices->fsid_change) + continue; + + if (match_fsid_fs_devices(fs_devices, disk_super->metadata_uuid, + fs_devices->fsid)) return fs_devices; - } } + /* * Handle scanned device having completed its fsid change but * belonging to a fs_devices that was created by a device that @@ -476,13 +498,11 @@ static struct btrfs_fs_devices *find_fsid_with_metadata_uuid( * CHANGING_FSID_V2 flag set. */ list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (fs_devices->fsid_change && - memcmp(fs_devices->metadata_uuid, - fs_devices->fsid, BTRFS_FSID_SIZE) != 0 && - memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid, - BTRFS_FSID_SIZE) == 0) { + if (!fs_devices->fsid_change) + continue; + + if (check_fsid_changed(fs_devices, disk_super->metadata_uuid)) return fs_devices; - } } return find_fsid(disk_super->fsid, disk_super->metadata_uuid); @@ -490,13 +510,13 @@ static struct btrfs_fs_devices *find_fsid_with_metadata_uuid( static int -btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, +btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, int flush, struct block_device **bdev, struct btrfs_super_block **disk_super) { int ret; - *bdev = blkdev_get_by_path(device_path, flags, holder); + *bdev = blkdev_get_by_path(device_path, flags, holder, NULL); if (IS_ERR(*bdev)) { ret = PTR_ERR(*bdev); @@ -507,14 +527,14 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, sync_blockdev(*bdev); ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE); if (ret) { - blkdev_put(*bdev, flags); + blkdev_put(*bdev, holder); goto error; } invalidate_bdev(*bdev); *disk_super = btrfs_read_dev_super(*bdev); if (IS_ERR(*disk_super)) { ret = PTR_ERR(*disk_super); - blkdev_put(*bdev, flags); + blkdev_put(*bdev, holder); goto error; } @@ -590,7 +610,7 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device * fs_devices->device_list_mutex here. */ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *device, fmode_t flags, + struct btrfs_device *device, blk_mode_t flags, void *holder) { struct block_device *bdev; @@ -642,7 +662,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, device->bdev = bdev; clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); - device->mode = flags; + device->holder = holder; fs_devices->open_devices++; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && @@ -656,7 +676,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, error_free_page: btrfs_release_disk_super(disk_super); - blkdev_put(bdev, flags); + blkdev_put(bdev, holder); return -EINVAL; } @@ -673,18 +693,16 @@ static struct btrfs_fs_devices *find_fsid_inprogress( struct btrfs_fs_devices *fs_devices; list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, - BTRFS_FSID_SIZE) != 0 && - memcmp(fs_devices->metadata_uuid, disk_super->fsid, - BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) { + if (fs_devices->fsid_change) + continue; + + if (check_fsid_changed(fs_devices, disk_super->fsid)) return fs_devices; - } } return find_fsid(disk_super->fsid, NULL); } - static struct btrfs_fs_devices *find_fsid_changed( struct btrfs_super_block *disk_super) { @@ -701,10 +719,7 @@ static struct btrfs_fs_devices *find_fsid_changed( */ list_for_each_entry(fs_devices, &fs_uuids, fs_list) { /* Changed UUIDs */ - if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, - BTRFS_FSID_SIZE) != 0 && - memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid, - BTRFS_FSID_SIZE) == 0 && + if (check_fsid_changed(fs_devices, disk_super->metadata_uuid) && memcmp(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE) != 0) return fs_devices; @@ -735,11 +750,10 @@ static struct btrfs_fs_devices *find_fsid_reverted_metadata( * fs_devices equal to the FSID of the disk. */ list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid, - BTRFS_FSID_SIZE) != 0 && - memcmp(fs_devices->metadata_uuid, disk_super->fsid, - BTRFS_FSID_SIZE) == 0 && - fs_devices->fsid_change) + if (!fs_devices->fsid_change) + continue; + + if (check_fsid_changed(fs_devices, disk_super->fsid)) return fs_devices; } @@ -790,12 +804,8 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (!fs_devices) { - if (has_metadata_uuid) - fs_devices = alloc_fs_devices(disk_super->fsid, - disk_super->metadata_uuid); - else - fs_devices = alloc_fs_devices(disk_super->fsid, NULL); - + fs_devices = alloc_fs_devices(disk_super->fsid, + has_metadata_uuid ? disk_super->metadata_uuid : NULL); if (IS_ERR(fs_devices)) return ERR_CAST(fs_devices); @@ -1057,7 +1067,7 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, continue; if (device->bdev) { - blkdev_put(device->bdev, device->mode); + blkdev_put(device->bdev, device->holder); device->bdev = NULL; fs_devices->open_devices--; } @@ -1103,7 +1113,7 @@ static void btrfs_close_bdev(struct btrfs_device *device) invalidate_bdev(device->bdev); } - blkdev_put(device->bdev, device->mode); + blkdev_put(device->bdev, device->holder); } static void btrfs_close_one_device(struct btrfs_device *device) @@ -1207,14 +1217,12 @@ void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) } static int open_fs_devices(struct btrfs_fs_devices *fs_devices, - fmode_t flags, void *holder) + blk_mode_t flags, void *holder) { struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; struct btrfs_device *tmp_device; - flags |= FMODE_EXCL; - list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, dev_list) { int ret; @@ -1257,7 +1265,7 @@ static int devid_cmp(void *priv, const struct list_head *a, } int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, - fmode_t flags, void *holder) + blk_mode_t flags, void *holder) { int ret; @@ -1348,8 +1356,7 @@ int btrfs_forget_devices(dev_t devt) * and we are not allowed to call set_blocksize during the scan. The superblock * is read via pagecache */ -struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, - void *holder) +struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags) { struct btrfs_super_block *disk_super; bool new_device_added = false; @@ -1368,16 +1375,16 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, */ /* - * Avoid using flag |= FMODE_EXCL here, as the systemd-udev may - * initiate the device scan which may race with the user's mount - * or mkfs command, resulting in failure. - * Since the device scan is solely for reading purposes, there is - * no need for FMODE_EXCL. Additionally, the devices are read again + * Avoid an exclusive open here, as the systemd-udev may initiate the + * device scan which may race with the user's mount or mkfs command, + * resulting in failure. + * Since the device scan is solely for reading purposes, there is no + * need for an exclusive open. Additionally, the devices are read again * during the mount process. It is ok to get some inconsistent * values temporarily, as the device paths of the fsid are the only * required information for assembling the volume. */ - bdev = blkdev_get_by_path(path, flags, holder); + bdev = blkdev_get_by_path(path, flags, NULL, NULL); if (IS_ERR(bdev)) return ERR_CAST(bdev); @@ -1401,7 +1408,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, btrfs_release_disk_super(disk_super); error_bdev_put: - blkdev_put(bdev, flags); + blkdev_put(bdev, NULL); return device; } @@ -1918,7 +1925,7 @@ static void update_dev_time(const char *device_path) return; now = current_time(d_inode(path.dentry)); - inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME); + inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME | S_VERSION); path_put(&path); } @@ -2088,7 +2095,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, - struct block_device **bdev, fmode_t *mode) + struct block_device **bdev, void **holder) { struct btrfs_trans_handle *trans; struct btrfs_device *device; @@ -2227,7 +2234,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, } *bdev = device->bdev; - *mode = device->mode; + *holder = device->holder; synchronize_rcu(); btrfs_free_device(device); @@ -2381,7 +2388,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, return -ENOMEM; } - ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0, + ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0, &bdev, &disk_super); if (ret) { btrfs_put_dev_args_from_path(args); @@ -2395,7 +2402,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, else memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE); btrfs_release_disk_super(disk_super); - blkdev_put(bdev, FMODE_READ); + blkdev_put(bdev, NULL); return 0; } @@ -2628,8 +2635,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (sb_rdonly(sb) && !fs_devices->seeding) return -EROFS; - bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, - fs_info->bdev_holder); + bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE, + fs_info->bdev_holder, NULL); if (IS_ERR(bdev)) return PTR_ERR(bdev); @@ -2691,7 +2698,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path device->commit_total_bytes = device->total_bytes; set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); - device->mode = FMODE_EXCL; + device->holder = fs_info->bdev_holder; device->dev_stats_valid = 1; set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); @@ -2849,7 +2856,7 @@ error_free_zone: error_free_device: btrfs_free_device(device); error: - blkdev_put(bdev, FMODE_EXCL); + blkdev_put(bdev, fs_info->bdev_holder); if (locked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); @@ -6163,17 +6170,10 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, bioc->replace_nr_stripes = nr_extra_stripes; } -static bool need_full_stripe(enum btrfs_map_op op) -{ - return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); -} - static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, u64 offset, u32 *stripe_nr, u64 *stripe_offset, u64 *full_stripe_start) { - ASSERT(op != BTRFS_MAP_DISCARD); - /* * Stripe_nr is the stripe where this block falls. stripe_offset is * the offset of this block in its stripe. @@ -6226,11 +6226,11 @@ static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup * stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr); } -int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 *length, - struct btrfs_io_context **bioc_ret, - struct btrfs_io_stripe *smap, int *mirror_num_ret, - int need_raid_map) +int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + u64 logical, u64 *length, + struct btrfs_io_context **bioc_ret, + struct btrfs_io_stripe *smap, int *mirror_num_ret, + int need_raid_map) { struct extent_map *em; struct map_lookup *map; @@ -6253,7 +6253,6 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 max_len; ASSERT(bioc_ret); - ASSERT(op != BTRFS_MAP_DISCARD); num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize); if (mirror_num > num_copies) @@ -6285,21 +6284,21 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, if (map->type & BTRFS_BLOCK_GROUP_RAID0) { stripe_index = stripe_nr % map->num_stripes; stripe_nr /= map->num_stripes; - if (!need_full_stripe(op)) + if (op == BTRFS_MAP_READ) mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { - if (need_full_stripe(op)) + if (op != BTRFS_MAP_READ) { num_stripes = map->num_stripes; - else if (mirror_num) + } else if (mirror_num) { stripe_index = mirror_num - 1; - else { + } else { stripe_index = find_live_mirror(fs_info, map, 0, dev_replace_is_ongoing); mirror_num = stripe_index + 1; } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (need_full_stripe(op)) { + if (op != BTRFS_MAP_READ) { num_stripes = map->num_stripes; } else if (mirror_num) { stripe_index = mirror_num - 1; @@ -6313,7 +6312,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, stripe_index = (stripe_nr % factor) * map->sub_stripes; stripe_nr /= factor; - if (need_full_stripe(op)) + if (op != BTRFS_MAP_READ) num_stripes = map->sub_stripes; else if (mirror_num) stripe_index += mirror_num - 1; @@ -6326,7 +6325,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, } } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { + if (need_raid_map && (op != BTRFS_MAP_READ || mirror_num > 1)) { /* * Push stripe_nr back to the start of the full stripe * For those cases needing a full stripe, @stripe_nr @@ -6362,7 +6361,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, /* We distribute the parity blocks across stripes */ stripe_index = (stripe_nr + stripe_index) % map->num_stripes; - if (!need_full_stripe(op) && mirror_num <= 1) + if (op == BTRFS_MAP_READ && mirror_num <= 1) mirror_num = 1; } } else { @@ -6402,7 +6401,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, */ if (smap && num_alloc_stripes == 1 && !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) && - (!need_full_stripe(op) || !dev_replace_is_ongoing || + (op == BTRFS_MAP_READ || !dev_replace_is_ongoing || !dev_replace->tgtdev)) { set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr); *mirror_num_ret = mirror_num; @@ -6426,7 +6425,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * It's still mostly the same as other profiles, just with extra rotation. */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && - (need_full_stripe(op) || mirror_num > 1)) { + (op != BTRFS_MAP_READ || mirror_num > 1)) { /* * For RAID56 @stripe_nr is already the number of full stripes * before us, which is also the rotation value (needs to modulo @@ -6453,11 +6452,11 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, } } - if (need_full_stripe(op)) + if (op != BTRFS_MAP_READ) max_errors = btrfs_chunk_max_errors(map); if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && - need_full_stripe(op)) { + op != BTRFS_MAP_READ) { handle_ops_on_dev_replace(op, bioc, dev_replace, logical, &num_stripes, &max_errors); } @@ -6477,23 +6476,6 @@ out: return ret; } -int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 *length, - struct btrfs_io_context **bioc_ret, int mirror_num) -{ - return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, - NULL, &mirror_num, 0); -} - -/* For Scrub/replace */ -int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 *length, - struct btrfs_io_context **bioc_ret) -{ - return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, - NULL, NULL, 1); -} - static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, const struct btrfs_fs_devices *fs_devices) { @@ -6913,7 +6895,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, if (IS_ERR(fs_devices)) return fs_devices; - ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder); + ret = open_fs_devices(fs_devices, BLK_OPEN_READ, fs_info->bdev_holder); if (ret) { free_fs_devices(fs_devices); return ERR_PTR(ret); @@ -8070,8 +8052,8 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, ASSERT(mirror_num > 0); - ret = __btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, - &bioc, smap, &mirror_ret, true); + ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, + &bioc, smap, &mirror_ret, true); if (ret < 0) return ret; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 64066d48dce1..b8c51f16ba86 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -94,8 +94,8 @@ struct btrfs_device { struct btrfs_zoned_device_info *zone_info; - /* the mode sent to blkdev_get */ - fmode_t mode; + /* block device holder for blkdev_get/put */ + void *holder; /* * Device's major-minor number. Must be set even if the device is not @@ -280,8 +280,19 @@ enum btrfs_read_policy { struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + + /* + * UUID written into the btree blocks: + * + * - If metadata_uuid != fsid then super block must have + * BTRFS_FEATURE_INCOMPAT_METADATA_UUID flag set. + * + * - Following shall be true at all times: + * - metadata_uuid == btrfs_header::fsid + * - metadata_uuid == btrfs_dev_item::fsid + */ u8 metadata_uuid[BTRFS_FSID_SIZE]; - bool fsid_change; + struct list_head fs_list; /* @@ -319,34 +330,32 @@ struct btrfs_fs_devices { */ struct btrfs_device *latest_dev; - /* all of the devices in the FS, protected by a mutex - * so we can safely walk it to write out the supers without - * worrying about add/remove by the multi-device code. - * Scrubbing super can kick off supers writing by holding - * this mutex lock. + /* + * All of the devices in the filesystem, protected by a mutex so we can + * safely walk it to write out the super blocks without worrying about + * adding/removing by the multi-device code. Scrubbing super block can + * kick off supers writing by holding this mutex lock. */ struct mutex device_list_mutex; /* List of all devices, protected by device_list_mutex */ struct list_head devices; - /* - * Devices which can satisfy space allocation. Protected by - * chunk_mutex - */ + /* Devices which can satisfy space allocation. Protected by * chunk_mutex. */ struct list_head alloc_list; struct list_head seed_list; - bool seeding; + /* Count fs-devices opened. */ int opened; - /* set when we find or add a device that doesn't have the - * nonrot flag set - */ + /* Set when we find or add a device that doesn't have the nonrot flag set. */ bool rotating; - /* Devices support TRIM/discard commands */ + /* Devices support TRIM/discard commands. */ bool discardable; + bool fsid_change; + /* The filesystem is a seed filesystem. */ + bool seeding; struct btrfs_fs_info *fs_info; /* sysfs kobjects */ @@ -357,7 +366,7 @@ struct btrfs_fs_devices { enum btrfs_chunk_allocation_policy chunk_alloc_policy; - /* Policy used to read the mirrored stripes */ + /* Policy used to read the mirrored stripes. */ enum btrfs_read_policy read_policy; }; @@ -547,15 +556,12 @@ struct btrfs_dev_lookup_args { enum btrfs_map_op { BTRFS_MAP_READ, BTRFS_MAP_WRITE, - BTRFS_MAP_DISCARD, BTRFS_MAP_GET_READ_MIRRORS, }; static inline enum btrfs_map_op btrfs_op(struct bio *bio) { switch (bio_op(bio)) { - case REQ_OP_DISCARD: - return BTRFS_MAP_DISCARD; case REQ_OP_WRITE: case REQ_OP_ZONE_APPEND: return BTRFS_MAP_WRITE; @@ -589,15 +595,9 @@ void btrfs_get_bioc(struct btrfs_io_context *bioc); void btrfs_put_bioc(struct btrfs_io_context *bioc); int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_io_context **bioc_ret, int mirror_num); -int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 *length, - struct btrfs_io_context **bioc_ret); -int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 *length, - struct btrfs_io_context **bioc_ret, - struct btrfs_io_stripe *smap, int *mirror_num_ret, - int need_raid_map); + struct btrfs_io_context **bioc_ret, + struct btrfs_io_stripe *smap, int *mirror_num_ret, + int need_raid_map); int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, struct btrfs_io_stripe *smap, u64 logical, u32 length, int mirror_num); @@ -610,9 +610,8 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, u64 type); void btrfs_mapping_tree_free(struct extent_map_tree *tree); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, - fmode_t flags, void *holder); -struct btrfs_device *btrfs_scan_one_device(const char *path, - fmode_t flags, void *holder); + blk_mode_t flags, void *holder); +struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags); int btrfs_forget_devices(dev_t devt); void btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices); @@ -628,10 +627,9 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u64 *devid, const u8 *uuid, const char *path); void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args); -void btrfs_free_device(struct btrfs_device *device); int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, - struct block_device **bdev, fmode_t *mode); + struct block_device **bdev, void **holder); void __exit btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 8acb05e176c5..6c231a116a29 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -63,7 +63,7 @@ struct list_head *zlib_alloc_workspace(unsigned int level) workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), zlib_inflate_workspacesize()); - workspace->strm.workspace = kvzalloc(workspacesize, GFP_KERNEL); + workspace->strm.workspace = kvzalloc(workspacesize, GFP_KERNEL | __GFP_NOWARN); workspace->level = level; workspace->buf = NULL; /* diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 39828af4a4e8..85b8b332add9 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -15,6 +15,7 @@ #include "transaction.h" #include "dev-replace.h" #include "space-info.h" +#include "super.h" #include "fs.h" #include "accessors.h" #include "bio.h" @@ -1057,7 +1058,7 @@ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, /* Check if zones in the region are all empty */ if (btrfs_dev_is_sequential(device, pos) && - find_next_zero_bit(zinfo->empty_zones, end, begin) != end) { + !bitmap_test_range_all_set(zinfo->empty_zones, begin, nzones)) { pos += zinfo->zone_size; continue; } @@ -1156,23 +1157,23 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size) struct btrfs_zoned_device_info *zinfo = device->zone_info; const u8 shift = zinfo->zone_size_shift; unsigned long begin = start >> shift; - unsigned long end = (start + size) >> shift; + unsigned long nbits = size >> shift; u64 pos; int ret; ASSERT(IS_ALIGNED(start, zinfo->zone_size)); ASSERT(IS_ALIGNED(size, zinfo->zone_size)); - if (end > zinfo->nr_zones) + if (begin + nbits > zinfo->nr_zones) return -ERANGE; /* All the zones are conventional */ - if (find_next_bit(zinfo->seq_zones, end, begin) == end) + if (bitmap_test_range_all_zero(zinfo->seq_zones, begin, nbits)) return 0; /* All the zones are sequential and empty */ - if (find_next_zero_bit(zinfo->seq_zones, end, begin) == end && - find_next_zero_bit(zinfo->empty_zones, end, begin) == end) + if (bitmap_test_range_all_set(zinfo->seq_zones, begin, nbits) && + bitmap_test_range_all_set(zinfo->empty_zones, begin, nbits)) return 0; for (pos = start; pos < start + size; pos += zinfo->zone_size) { @@ -1602,37 +1603,17 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb) { - struct btrfs_fs_info *fs_info = eb->fs_info; - - if (!btrfs_is_zoned(fs_info) || - btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN) || - !list_empty(&eb->release_list)) + if (!btrfs_is_zoned(eb->fs_info) || + btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN)) return; + ASSERT(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + memzero_extent_buffer(eb, 0, eb->len); set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags); set_extent_buffer_dirty(eb); - set_extent_bits_nowait(&trans->dirty_pages, eb->start, - eb->start + eb->len - 1, EXTENT_DIRTY); - - spin_lock(&trans->releasing_ebs_lock); - list_add_tail(&eb->release_list, &trans->releasing_ebs); - spin_unlock(&trans->releasing_ebs_lock); - atomic_inc(&eb->refs); -} - -void btrfs_free_redirty_list(struct btrfs_transaction *trans) -{ - spin_lock(&trans->releasing_ebs_lock); - while (!list_empty(&trans->releasing_ebs)) { - struct extent_buffer *eb; - - eb = list_first_entry(&trans->releasing_ebs, - struct extent_buffer, release_list); - list_del_init(&eb->release_list); - free_extent_buffer(eb); - } - spin_unlock(&trans->releasing_ebs_lock); + set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1, + EXTENT_DIRTY | EXTENT_NOWAIT, NULL); } bool btrfs_use_zone_append(struct btrfs_bio *bbio) @@ -1677,63 +1658,89 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio) void btrfs_record_physical_zoned(struct btrfs_bio *bbio) { const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; - struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_sum *sum = bbio->sums; - ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset); - if (WARN_ON(!ordered)) - return; - - ordered->physical = physical; - btrfs_put_ordered_extent(ordered); + if (physical < bbio->orig_physical) + sum->logical -= bbio->orig_physical - physical; + else + sum->logical += physical - bbio->orig_physical; } -void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered) +static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered, + u64 logical) { - struct btrfs_inode *inode = BTRFS_I(ordered->inode); - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct extent_map_tree *em_tree; + struct extent_map_tree *em_tree = &BTRFS_I(ordered->inode)->extent_tree; struct extent_map *em; - struct btrfs_ordered_sum *sum; - u64 orig_logical = ordered->disk_bytenr; - struct map_lookup *map; - u64 physical = ordered->physical; - u64 chunk_start_phys; - u64 logical; - - em = btrfs_get_chunk_map(fs_info, orig_logical, 1); - if (IS_ERR(em)) - return; - map = em->map_lookup; - chunk_start_phys = map->stripes[0].physical; - - if (WARN_ON_ONCE(map->num_stripes > 1) || - WARN_ON_ONCE((map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) || - WARN_ON_ONCE(physical < chunk_start_phys) || - WARN_ON_ONCE(physical > chunk_start_phys + em->orig_block_len)) { - free_extent_map(em); - return; - } - logical = em->start + (physical - map->stripes[0].physical); - free_extent_map(em); - - if (orig_logical == logical) - return; ordered->disk_bytenr = logical; - em_tree = &inode->extent_tree; write_lock(&em_tree->lock); em = search_extent_mapping(em_tree, ordered->file_offset, ordered->num_bytes); em->block_start = logical; free_extent_map(em); write_unlock(&em_tree->lock); +} - list_for_each_entry(sum, &ordered->list, list) { - if (logical < orig_logical) - sum->bytenr -= orig_logical - logical; - else - sum->bytenr += logical - orig_logical; +static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered, + u64 logical, u64 len) +{ + struct btrfs_ordered_extent *new; + + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) && + split_extent_map(BTRFS_I(ordered->inode), ordered->file_offset, + ordered->num_bytes, len, logical)) + return false; + + new = btrfs_split_ordered_extent(ordered, len); + if (IS_ERR(new)) + return false; + new->disk_bytenr = logical; + btrfs_finish_one_ordered(new); + return true; +} + +void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered) +{ + struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_ordered_sum *sum = + list_first_entry(&ordered->list, typeof(*sum), list); + u64 logical = sum->logical; + u64 len = sum->len; + + while (len < ordered->disk_num_bytes) { + sum = list_next_entry(sum, list); + if (sum->logical == logical + len) { + len += sum->len; + continue; + } + if (!btrfs_zoned_split_ordered(ordered, logical, len)) { + set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); + btrfs_err(fs_info, "failed to split ordered extent"); + goto out; + } + logical = sum->logical; + len = sum->len; + } + + if (ordered->disk_bytenr != logical) + btrfs_rewrite_logical_zoned(ordered, logical); + +out: + /* + * If we end up here for nodatasum I/O, the btrfs_ordered_sum structures + * were allocated by btrfs_alloc_dummy_sum only to record the logical + * addresses and don't contain actual checksums. We thus must free them + * here so that we don't attempt to log the csums later. + */ + if ((inode->flags & BTRFS_INODE_NODATASUM) || + test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) { + while ((sum = list_first_entry_or_null(&ordered->list, + typeof(*sum), list))) { + list_del(&sum->list); + kfree(sum); + } } } @@ -1792,8 +1799,8 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, int nmirrors; int i, ret; - ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, - &mapped_length, &bioc); + ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, + &mapped_length, &bioc, NULL, NULL, 1); if (ret || !bioc || mapped_length < PAGE_SIZE) { ret = -EIO; goto out_put_bioc; diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index c0570d35fea2..27322b926038 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -30,6 +30,8 @@ struct btrfs_zoned_device_info { struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX]; }; +void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered); + #ifdef CONFIG_BLK_DEV_ZONED int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone); @@ -54,10 +56,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new); void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb); -void btrfs_free_redirty_list(struct btrfs_transaction *trans); bool btrfs_use_zone_append(struct btrfs_bio *bbio); void btrfs_record_physical_zoned(struct btrfs_bio *bbio); -void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered); bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, struct btrfs_block_group **cache_ret); @@ -179,7 +179,6 @@ static inline void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) { } static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb) { } -static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans) { } static inline bool btrfs_use_zone_append(struct btrfs_bio *bbio) { @@ -190,9 +189,6 @@ static inline void btrfs_record_physical_zoned(struct btrfs_bio *bbio) { } -static inline void btrfs_rewrite_logical_zoned( - struct btrfs_ordered_extent *ordered) { } - static inline bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, struct btrfs_block_group **cache_ret) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index f798da267590..e7ac4ec809a4 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -356,7 +356,7 @@ struct list_head *zstd_alloc_workspace(unsigned int level) workspace->level = level; workspace->req_level = level; workspace->last_used = jiffies; - workspace->mem = kvmalloc(workspace->size, GFP_KERNEL); + workspace->mem = kvmalloc(workspace->size, GFP_KERNEL | __GFP_NOWARN); workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!workspace->mem || !workspace->buf) goto fail; diff --git a/fs/buffer.c b/fs/buffer.c index a7fc561758b1..bd091329026c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -111,7 +111,6 @@ void buffer_check_dirty_writeback(struct folio *folio, bh = bh->b_this_page; } while (bh != head); } -EXPORT_SYMBOL(buffer_check_dirty_writeback); /* * Block until a buffer comes unlocked. This doesn't stop it @@ -195,19 +194,19 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) pgoff_t index; struct buffer_head *bh; struct buffer_head *head; - struct page *page; + struct folio *folio; int all_mapped = 1; static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1); index = block >> (PAGE_SHIFT - bd_inode->i_blkbits); - page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED); - if (!page) + folio = __filemap_get_folio(bd_mapping, index, FGP_ACCESSED, 0); + if (IS_ERR(folio)) goto out; spin_lock(&bd_mapping->private_lock); - if (!page_has_buffers(page)) + head = folio_buffers(folio); + if (!head) goto out_unlock; - head = page_buffers(page); bh = head; do { if (!buffer_mapped(bh)) @@ -237,7 +236,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) } out_unlock: spin_unlock(&bd_mapping->private_lock); - put_page(page); + folio_put(folio); out: return ret; } @@ -592,6 +591,76 @@ int sync_mapping_buffers(struct address_space *mapping) } EXPORT_SYMBOL(sync_mapping_buffers); +/** + * generic_buffers_fsync_noflush - generic buffer fsync implementation + * for simple filesystems with no inode lock + * + * @file: file to synchronize + * @start: start offset in bytes + * @end: end offset in bytes (inclusive) + * @datasync: only synchronize essential metadata if true + * + * This is a generic implementation of the fsync method for simple + * filesystems which track all non-inode metadata in the buffers list + * hanging off the address_space structure. + */ +int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end, + bool datasync) +{ + struct inode *inode = file->f_mapping->host; + int err; + int ret; + + err = file_write_and_wait_range(file, start, end); + if (err) + return err; + + ret = sync_mapping_buffers(inode->i_mapping); + if (!(inode->i_state & I_DIRTY_ALL)) + goto out; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + goto out; + + err = sync_inode_metadata(inode, 1); + if (ret == 0) + ret = err; + +out: + /* check and advance again to catch errors after syncing out buffers */ + err = file_check_and_advance_wb_err(file); + if (ret == 0) + ret = err; + return ret; +} +EXPORT_SYMBOL(generic_buffers_fsync_noflush); + +/** + * generic_buffers_fsync - generic buffer fsync implementation + * for simple filesystems with no inode lock + * + * @file: file to synchronize + * @start: start offset in bytes + * @end: end offset in bytes (inclusive) + * @datasync: only synchronize essential metadata if true + * + * This is a generic implementation of the fsync method for simple + * filesystems which track all non-inode metadata in the buffers list + * hanging off the address_space structure. This also makes sure that + * a device cache flush operation is called at the end. + */ +int generic_buffers_fsync(struct file *file, loff_t start, loff_t end, + bool datasync) +{ + struct inode *inode = file->f_mapping->host; + int ret; + + ret = generic_buffers_fsync_noflush(file, start, end, datasync); + if (!ret) + ret = blkdev_issue_flush(inode->i_sb->s_bdev); + return ret; +} +EXPORT_SYMBOL(generic_buffers_fsync); + /* * Called when we've recently written block `bblock', and it is known that * `bblock' was for a buffer_boundary() buffer. This means that the block at @@ -907,8 +976,8 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, } EXPORT_SYMBOL_GPL(alloc_page_buffers); -static inline void -link_dev_buffers(struct page *page, struct buffer_head *head) +static inline void link_dev_buffers(struct folio *folio, + struct buffer_head *head) { struct buffer_head *bh, *tail; @@ -918,7 +987,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head) bh = bh->b_this_page; } while (bh); tail->b_this_page = head; - attach_page_private(page, head); + folio_attach_private(folio, head); } static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size) @@ -934,15 +1003,14 @@ static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size) } /* - * Initialise the state of a blockdev page's buffers. + * Initialise the state of a blockdev folio's buffers. */ -static sector_t -init_page_buffers(struct page *page, struct block_device *bdev, - sector_t block, int size) +static sector_t folio_init_buffers(struct folio *folio, + struct block_device *bdev, sector_t block, int size) { - struct buffer_head *head = page_buffers(page); + struct buffer_head *head = folio_buffers(folio); struct buffer_head *bh = head; - int uptodate = PageUptodate(page); + bool uptodate = folio_test_uptodate(folio); sector_t end_block = blkdev_max_block(bdev, size); do { @@ -976,7 +1044,7 @@ grow_dev_page(struct block_device *bdev, sector_t block, pgoff_t index, int size, int sizebits, gfp_t gfp) { struct inode *inode = bdev->bd_inode; - struct page *page; + struct folio *folio; struct buffer_head *bh; sector_t end_block; int ret = 0; @@ -992,42 +1060,37 @@ grow_dev_page(struct block_device *bdev, sector_t block, */ gfp_mask |= __GFP_NOFAIL; - page = find_or_create_page(inode->i_mapping, index, gfp_mask); + folio = __filemap_get_folio(inode->i_mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp_mask); - BUG_ON(!PageLocked(page)); - - if (page_has_buffers(page)) { - bh = page_buffers(page); + bh = folio_buffers(folio); + if (bh) { if (bh->b_size == size) { - end_block = init_page_buffers(page, bdev, - (sector_t)index << sizebits, - size); + end_block = folio_init_buffers(folio, bdev, + (sector_t)index << sizebits, size); goto done; } - if (!try_to_free_buffers(page_folio(page))) + if (!try_to_free_buffers(folio)) goto failed; } - /* - * Allocate some buffers for this page - */ - bh = alloc_page_buffers(page, size, true); + bh = folio_alloc_buffers(folio, size, true); /* - * Link the page to the buffers and initialise them. Take the + * Link the folio to the buffers and initialise them. Take the * lock to be atomic wrt __find_get_block(), which does not - * run under the page lock. + * run under the folio lock. */ spin_lock(&inode->i_mapping->private_lock); - link_dev_buffers(page, bh); - end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits, - size); + link_dev_buffers(folio, bh); + end_block = folio_init_buffers(folio, bdev, + (sector_t)index << sizebits, size); spin_unlock(&inode->i_mapping->private_lock); done: ret = (block < end_block) ? 1 : -ENXIO; failed: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return ret; } @@ -1764,7 +1827,7 @@ static struct buffer_head *folio_create_buffers(struct folio *folio, * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this * causes the writes to be flagged as synchronous writes. */ -int __block_write_full_page(struct inode *inode, struct page *page, +int __block_write_full_folio(struct inode *inode, struct folio *folio, get_block_t *get_block, struct writeback_control *wbc, bh_end_io_t *handler) { @@ -1776,14 +1839,14 @@ int __block_write_full_page(struct inode *inode, struct page *page, int nr_underway = 0; blk_opf_t write_flags = wbc_to_write_flags(wbc); - head = folio_create_buffers(page_folio(page), inode, + head = folio_create_buffers(folio, inode, (1 << BH_Dirty) | (1 << BH_Uptodate)); /* * Be very careful. We have no exclusion from block_dirty_folio * here, and the (potentially unmapped) buffers may become dirty at * any time. If a buffer becomes dirty here after we've inspected it - * then we just miss that fact, and the page stays dirty. + * then we just miss that fact, and the folio stays dirty. * * Buffers outside i_size may be dirtied by block_dirty_folio; * handle that here by just cleaning them. @@ -1793,7 +1856,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, blocksize = bh->b_size; bbits = block_size_bits(blocksize); - block = (sector_t)page->index << (PAGE_SHIFT - bbits); + block = (sector_t)folio->index << (PAGE_SHIFT - bbits); last_block = (i_size_read(inode) - 1) >> bbits; /* @@ -1804,7 +1867,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, if (block > last_block) { /* * mapped buffers outside i_size will occur, because - * this page can be outside i_size when there is a + * this folio can be outside i_size when there is a * truncate in progress. */ /* @@ -1834,7 +1897,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, continue; /* * If it's a fully non-blocking write attempt and we cannot - * lock the buffer then redirty the page. Note that this can + * lock the buffer then redirty the folio. Note that this can * potentially cause a busy-wait loop from writeback threads * and kswapd activity, but those code paths have their own * higher-level throttling. @@ -1842,7 +1905,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, if (wbc->sync_mode != WB_SYNC_NONE) { lock_buffer(bh); } else if (!trylock_buffer(bh)) { - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); continue; } if (test_clear_buffer_dirty(bh)) { @@ -1853,11 +1916,11 @@ int __block_write_full_page(struct inode *inode, struct page *page, } while ((bh = bh->b_this_page) != head); /* - * The page and its buffers are protected by PageWriteback(), so we can - * drop the bh refcounts early. + * The folio and its buffers are protected by the writeback flag, + * so we can drop the bh refcounts early. */ - BUG_ON(PageWriteback(page)); - set_page_writeback(page); + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); do { struct buffer_head *next = bh->b_this_page; @@ -1867,20 +1930,20 @@ int __block_write_full_page(struct inode *inode, struct page *page, } bh = next; } while (bh != head); - unlock_page(page); + folio_unlock(folio); err = 0; done: if (nr_underway == 0) { /* - * The page was marked dirty, but the buffers were + * The folio was marked dirty, but the buffers were * clean. Someone wrote them back by hand with * write_dirty_buffer/submit_bh. A rare case. */ - end_page_writeback(page); + folio_end_writeback(folio); /* - * The page and buffer_heads can be released at any time from + * The folio and buffer_heads can be released at any time from * here on. */ } @@ -1891,7 +1954,7 @@ recover: * ENOSPC, or some other error. We may already have added some * blocks to the file, so we need to write these out to avoid * exposing stale data. - * The page is currently locked and not marked for writeback + * The folio is currently locked and not marked for writeback */ bh = head; /* Recovery: lock and submit the mapped buffers */ @@ -1903,15 +1966,15 @@ recover: } else { /* * The buffer may have been set dirty during - * attachment to a dirty page. + * attachment to a dirty folio. */ clear_buffer_dirty(bh); } } while ((bh = bh->b_this_page) != head); - SetPageError(page); - BUG_ON(PageWriteback(page)); - mapping_set_error(page->mapping, err); - set_page_writeback(page); + folio_set_error(folio); + BUG_ON(folio_test_writeback(folio)); + mapping_set_error(folio->mapping, err); + folio_start_writeback(folio); do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { @@ -1921,39 +1984,40 @@ recover: } bh = next; } while (bh != head); - unlock_page(page); + folio_unlock(folio); goto done; } -EXPORT_SYMBOL(__block_write_full_page); +EXPORT_SYMBOL(__block_write_full_folio); /* - * If a page has any new buffers, zero them out here, and mark them uptodate + * If a folio has any new buffers, zero them out here, and mark them uptodate * and dirty so they'll be written out (in order to prevent uninitialised * block data from leaking). And clear the new bit. */ -void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) +void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to) { - unsigned int block_start, block_end; + size_t block_start, block_end; struct buffer_head *head, *bh; - BUG_ON(!PageLocked(page)); - if (!page_has_buffers(page)) + BUG_ON(!folio_test_locked(folio)); + head = folio_buffers(folio); + if (!head) return; - bh = head = page_buffers(page); + bh = head; block_start = 0; do { block_end = block_start + bh->b_size; if (buffer_new(bh)) { if (block_end > from && block_start < to) { - if (!PageUptodate(page)) { - unsigned start, size; + if (!folio_test_uptodate(folio)) { + size_t start, xend; start = max(from, block_start); - size = min(to, block_end) - start; + xend = min(to, block_end); - zero_user(page, start, size); + folio_zero_segment(folio, start, xend); set_buffer_uptodate(bh); } @@ -1966,7 +2030,7 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) bh = bh->b_this_page; } while (bh != head); } -EXPORT_SYMBOL(page_zero_new_buffers); +EXPORT_SYMBOL(folio_zero_new_buffers); static void iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, @@ -2104,7 +2168,7 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, err = -EIO; } if (unlikely(err)) - page_zero_new_buffers(&folio->page, from, to); + folio_zero_new_buffers(folio, from, to); return err; } @@ -2116,15 +2180,15 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len, } EXPORT_SYMBOL(__block_write_begin); -static int __block_commit_write(struct inode *inode, struct page *page, - unsigned from, unsigned to) +static int __block_commit_write(struct inode *inode, struct folio *folio, + size_t from, size_t to) { - unsigned block_start, block_end; - int partial = 0; + size_t block_start, block_end; + bool partial = false; unsigned blocksize; struct buffer_head *bh, *head; - bh = head = page_buffers(page); + bh = head = folio_buffers(folio); blocksize = bh->b_size; block_start = 0; @@ -2132,7 +2196,7 @@ static int __block_commit_write(struct inode *inode, struct page *page, block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (!buffer_uptodate(bh)) - partial = 1; + partial = true; } else { set_buffer_uptodate(bh); mark_buffer_dirty(bh); @@ -2147,11 +2211,11 @@ static int __block_commit_write(struct inode *inode, struct page *page, /* * If this is a partial write which happened to make all buffers * uptodate then we can optimize away a bogus read_folio() for - * the next read(). Here we 'discover' whether the page went + * the next read(). Here we 'discover' whether the folio went * uptodate as a result of this (potentially partial) write. */ if (!partial) - SetPageUptodate(page); + folio_mark_uptodate(folio); return 0; } @@ -2188,10 +2252,9 @@ int block_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { + struct folio *folio = page_folio(page); struct inode *inode = mapping->host; - unsigned start; - - start = pos & (PAGE_SIZE - 1); + size_t start = pos - folio_pos(folio); if (unlikely(copied < len)) { /* @@ -2203,18 +2266,18 @@ int block_write_end(struct file *file, struct address_space *mapping, * read_folio might come in and destroy our partial write. * * Do the simplest thing, and just treat any short write to a - * non uptodate page as a zero-length write, and force the + * non uptodate folio as a zero-length write, and force the * caller to redo the whole thing. */ - if (!PageUptodate(page)) + if (!folio_test_uptodate(folio)) copied = 0; - page_zero_new_buffers(page, start+copied, start+len); + folio_zero_new_buffers(folio, start+copied, start+len); } - flush_dcache_page(page); + flush_dcache_folio(folio); /* This could be a short (even 0-length) commit */ - __block_commit_write(inode, page, start, start+copied); + __block_commit_write(inode, folio, start, start + copied); return copied; } @@ -2537,8 +2600,9 @@ EXPORT_SYMBOL(cont_write_begin); int block_commit_write(struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; - __block_commit_write(inode,page,from,to); + struct folio *folio = page_folio(page); + struct inode *inode = folio->mapping->host; + __block_commit_write(inode, folio, from, to); return 0; } EXPORT_SYMBOL(block_commit_write); @@ -2564,38 +2628,37 @@ EXPORT_SYMBOL(block_commit_write); int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block) { - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); struct inode *inode = file_inode(vma->vm_file); unsigned long end; loff_t size; int ret; - lock_page(page); + folio_lock(folio); size = i_size_read(inode); - if ((page->mapping != inode->i_mapping) || - (page_offset(page) > size)) { + if ((folio->mapping != inode->i_mapping) || + (folio_pos(folio) >= size)) { /* We overload EFAULT to mean page got truncated */ ret = -EFAULT; goto out_unlock; } - /* page is wholly or partially inside EOF */ - if (((page->index + 1) << PAGE_SHIFT) > size) - end = size & ~PAGE_MASK; - else - end = PAGE_SIZE; + end = folio_size(folio); + /* folio is wholly or partially inside EOF */ + if (folio_pos(folio) + end > size) + end = size - folio_pos(folio); - ret = __block_write_begin(page, 0, end, get_block); + ret = __block_write_begin_int(folio, 0, end, get_block, NULL); if (!ret) - ret = block_commit_write(page, 0, end); + ret = __block_commit_write(inode, folio, 0, end); if (unlikely(ret < 0)) goto out_unlock; - set_page_dirty(page); - wait_for_stable_page(page); + folio_mark_dirty(folio); + folio_wait_stable(folio); return 0; out_unlock: - unlock_page(page); + folio_unlock(folio); return ret; } EXPORT_SYMBOL(block_page_mkwrite); @@ -2604,17 +2667,16 @@ int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block) { pgoff_t index = from >> PAGE_SHIFT; - unsigned offset = from & (PAGE_SIZE-1); unsigned blocksize; sector_t iblock; - unsigned length, pos; + size_t offset, length, pos; struct inode *inode = mapping->host; - struct page *page; + struct folio *folio; struct buffer_head *bh; int err = 0; blocksize = i_blocksize(inode); - length = offset & (blocksize - 1); + length = from & (blocksize - 1); /* Block boundary? Nothing to do */ if (!length) @@ -2623,15 +2685,18 @@ int block_truncate_page(struct address_space *mapping, length = blocksize - length; iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits); - page = grab_cache_page(mapping, index); - if (!page) - return -ENOMEM; + folio = filemap_grab_folio(mapping, index); + if (IS_ERR(folio)) + return PTR_ERR(folio); - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); + bh = folio_buffers(folio); + if (!bh) { + folio_create_empty_buffers(folio, blocksize, 0); + bh = folio_buffers(folio); + } /* Find the buffer that contains "offset" */ - bh = page_buffers(page); + offset = offset_in_folio(folio, from); pos = blocksize; while (offset >= pos) { bh = bh->b_this_page; @@ -2650,7 +2715,7 @@ int block_truncate_page(struct address_space *mapping, } /* Ok, it's mapped. Make sure it's up-to-date */ - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { @@ -2660,12 +2725,12 @@ int block_truncate_page(struct address_space *mapping, goto unlock; } - zero_user(page, offset, length); + folio_zero_range(folio, offset, length); mark_buffer_dirty(bh); unlock: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return err; } @@ -2677,33 +2742,32 @@ EXPORT_SYMBOL(block_truncate_page); int block_write_full_page(struct page *page, get_block_t *get_block, struct writeback_control *wbc) { - struct inode * const inode = page->mapping->host; + struct folio *folio = page_folio(page); + struct inode * const inode = folio->mapping->host; loff_t i_size = i_size_read(inode); - const pgoff_t end_index = i_size >> PAGE_SHIFT; - unsigned offset; - /* Is the page fully inside i_size? */ - if (page->index < end_index) - return __block_write_full_page(inode, page, get_block, wbc, + /* Is the folio fully inside i_size? */ + if (folio_pos(folio) + folio_size(folio) <= i_size) + return __block_write_full_folio(inode, folio, get_block, wbc, end_buffer_async_write); - /* Is the page fully outside i_size? (truncate in progress) */ - offset = i_size & (PAGE_SIZE-1); - if (page->index >= end_index+1 || !offset) { - unlock_page(page); + /* Is the folio fully outside i_size? (truncate in progress) */ + if (folio_pos(folio) >= i_size) { + folio_unlock(folio); return 0; /* don't care */ } /* - * The page straddles i_size. It must be zeroed out on each and every + * The folio straddles i_size. It must be zeroed out on each and every * writepage invocation because it may be mmapped. "A file is mapped * in multiples of the page size. For a file that is not a multiple of - * the page size, the remaining memory is zeroed when mapped, and + * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." */ - zero_user_segment(page, offset, PAGE_SIZE); - return __block_write_full_page(inode, page, get_block, wbc, - end_buffer_async_write); + folio_zero_segment(folio, offset_in_folio(folio, i_size), + folio_size(folio)); + return __block_write_full_folio(inode, folio, get_block, wbc, + end_buffer_async_write); } EXPORT_SYMBOL(block_write_full_page); @@ -2760,8 +2824,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); - BUG_ON(bio->bi_iter.bi_size != bh->b_size); + __bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 82219a8f6084..d9d22d0ec38a 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -451,9 +451,10 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) ret = cachefiles_inject_write_error(); if (ret == 0) { - file = vfs_tmpfile_open(&nop_mnt_idmap, &parentpath, S_IFREG, - O_RDWR | O_LARGEFILE | O_DIRECT, - cache->cache_cred); + file = kernel_tmpfile_open(&nop_mnt_idmap, &parentpath, + S_IFREG | 0600, + O_RDWR | O_LARGEFILE | O_DIRECT, + cache->cache_cred); ret = PTR_ERR_OR_ZERO(file); } if (ret) { @@ -560,8 +561,8 @@ static bool cachefiles_open_file(struct cachefiles_object *object, */ path.mnt = cache->mnt; path.dentry = dentry; - file = open_with_fake_path(&path, O_RDWR | O_LARGEFILE | O_DIRECT, - d_backing_inode(dentry), cache->cache_cred); + file = kernel_file_open(&path, O_RDWR | O_LARGEFILE | O_DIRECT, + d_backing_inode(dentry), cache->cache_cred); if (IS_ERR(file)) { trace_cachefiles_vfs_error(object, d_backing_inode(dentry), PTR_ERR(file), diff --git a/fs/ceph/file.c b/fs/ceph/file.c index f4d8bf7dec88..b1925232dc08 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1746,6 +1746,69 @@ again: } /* + * Wrap filemap_splice_read with checks for cap bits on the inode. + * Atomically grab references, so that those bits are not released + * back to the MDS mid-read. + */ +static ssize_t ceph_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct ceph_file_info *fi = in->private_data; + struct inode *inode = file_inode(in); + struct ceph_inode_info *ci = ceph_inode(inode); + ssize_t ret; + int want = 0, got = 0; + CEPH_DEFINE_RW_CONTEXT(rw_ctx, 0); + + dout("splice_read %p %llx.%llx %llu~%zu trying to get caps on %p\n", + inode, ceph_vinop(inode), *ppos, len, inode); + + if (ceph_inode_is_shutdown(inode)) + return -ESTALE; + + if (ceph_has_inline_data(ci) || + (fi->flags & CEPH_F_SYNC)) + return copy_splice_read(in, ppos, pipe, len, flags); + + ceph_start_io_read(inode); + + want = CEPH_CAP_FILE_CACHE; + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want |= CEPH_CAP_FILE_LAZYIO; + + ret = ceph_get_caps(in, CEPH_CAP_FILE_RD, want, -1, &got); + if (ret < 0) + goto out_end; + + if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) == 0) { + dout("splice_read/sync %p %llx.%llx %llu~%zu got cap refs on %s\n", + inode, ceph_vinop(inode), *ppos, len, + ceph_cap_string(got)); + + ceph_put_cap_refs(ci, got); + ceph_end_io_read(inode); + return copy_splice_read(in, ppos, pipe, len, flags); + } + + dout("splice_read %p %llx.%llx %llu~%zu got cap refs on %s\n", + inode, ceph_vinop(inode), *ppos, len, ceph_cap_string(got)); + + rw_ctx.caps = got; + ceph_add_rw_context(fi, &rw_ctx); + ret = filemap_splice_read(in, ppos, pipe, len, flags); + ceph_del_rw_context(fi, &rw_ctx); + + dout("splice_read %p %llx.%llx dropping cap refs on %s = %zd\n", + inode, ceph_vinop(inode), ceph_cap_string(got), ret); + + ceph_put_cap_refs(ci, got); +out_end: + ceph_end_io_read(inode); + return ret; +} + +/* * Take cap references to avoid releasing caps to MDS mid-write. * * If we are synchronous, and write with an old snap context, the OSD @@ -1791,9 +1854,6 @@ retry_snap: else ceph_start_io_write(inode); - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(inode); - if (iocb->ki_flags & IOCB_APPEND) { err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); if (err < 0) @@ -1894,8 +1954,6 @@ retry_snap: * can not run at the same time */ written = generic_perform_write(iocb, from); - if (likely(written >= 0)) - iocb->ki_pos = pos + written; ceph_end_io_write(inode); } @@ -1940,7 +1998,6 @@ out: ceph_end_io_write(inode); out_unlocked: ceph_free_cap_flush(prealloc_cf); - current->backing_dev_info = NULL; return written ? written : err; } @@ -2593,7 +2650,7 @@ const struct file_operations ceph_file_fops = { .lock = ceph_lock, .setlease = simple_nosetlease, .flock = ceph_flock, - .splice_read = generic_file_splice_read, + .splice_read = ceph_splice_read, .splice_write = iter_file_splice_write, .unlocked_ioctl = ceph_ioctl, .compat_ioctl = compat_ptr_ioctl, diff --git a/fs/char_dev.c b/fs/char_dev.c index 13deb45f1ec6..950b6919fb87 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -150,7 +150,7 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, cd->major = major; cd->baseminor = baseminor; cd->minorct = minorct; - strlcpy(cd->name, name, sizeof(cd->name)); + strscpy(cd->name, name, sizeof(cd->name)); if (!prev) { cd->next = curr; diff --git a/fs/coda/file.c b/fs/coda/file.c index 3f3c81e6b1ab..12b26bd13564 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -23,6 +23,7 @@ #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/uio.h> +#include <linux/splice.h> #include <linux/coda.h> #include "coda_psdev.h" @@ -94,6 +95,32 @@ finish_write: return ret; } +static ssize_t +coda_file_splice_read(struct file *coda_file, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct inode *coda_inode = file_inode(coda_file); + struct coda_file_info *cfi = coda_ftoc(coda_file); + struct file *in = cfi->cfi_container; + loff_t ki_pos = *ppos; + ssize_t ret; + + ret = venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode), + &cfi->cfi_access_intent, + len, ki_pos, CODA_ACCESS_TYPE_READ); + if (ret) + goto finish_read; + + ret = vfs_splice_read(in, ppos, pipe, len, flags); + +finish_read: + venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode), + &cfi->cfi_access_intent, + len, ki_pos, CODA_ACCESS_TYPE_READ_FINISH); + return ret; +} + static void coda_vm_open(struct vm_area_struct *vma) { @@ -302,5 +329,5 @@ const struct file_operations coda_file_operations = { .open = coda_open, .release = coda_release, .fsync = coda_fsync, - .splice_read = generic_file_splice_read, + .splice_read = coda_file_splice_read, }; diff --git a/fs/coredump.c b/fs/coredump.c index 88740c51b942..9d235fa14ab9 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -648,7 +648,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) } else { struct mnt_idmap *idmap; struct inode *inode; - int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW | + int open_flags = O_CREAT | O_WRONLY | O_NOFOLLOW | O_LARGEFILE | O_EXCL; if (cprm.limit < binfmt->min_coredump) diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 006ef68d7ff6..27c6597aa1be 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -473,7 +473,7 @@ static unsigned int cramfs_physmem_mmap_capabilities(struct file *file) static const struct file_operations cramfs_physmem_fops = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, .mmap = cramfs_physmem_mmap, #ifndef CONFIG_MMU .get_unmapped_area = cramfs_physmem_get_unmapped_area, diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 7ab5a7b7eef8..2d63da48635a 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -171,7 +171,7 @@ fscrypt_policy_flags(const union fscrypt_policy *policy) */ struct fscrypt_symlink_data { __le16 len; - char encrypted_path[1]; + char encrypted_path[]; } __packed; /** diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 9e786ae66a13..6238dbcadcad 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -255,10 +255,10 @@ int fscrypt_prepare_symlink(struct inode *dir, const char *target, * for now since filesystems will assume it is there and subtract it. */ if (!__fscrypt_fname_encrypted_size(policy, len, - max_len - sizeof(struct fscrypt_symlink_data), + max_len - sizeof(struct fscrypt_symlink_data) - 1, &disk_link->len)) return -ENAMETOOLONG; - disk_link->len += sizeof(struct fscrypt_symlink_data); + disk_link->len += sizeof(struct fscrypt_symlink_data) + 1; disk_link->name = NULL; return 0; @@ -289,7 +289,7 @@ int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, if (!sd) return -ENOMEM; } - ciphertext_len = disk_link->len - sizeof(*sd); + ciphertext_len = disk_link->len - sizeof(*sd) - 1; sd->len = cpu_to_le16(ciphertext_len); err = fscrypt_fname_encrypt(inode, &iname, sd->encrypted_path, @@ -367,7 +367,7 @@ const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, * the ciphertext length, even though this is redundant with i_size. */ - if (max_size < sizeof(*sd)) + if (max_size < sizeof(*sd) + 1) return ERR_PTR(-EUCLEAN); sd = caddr; cstr.name = (unsigned char *)sd->encrypted_path; @@ -376,7 +376,7 @@ const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, if (cstr.len == 0) return ERR_PTR(-EUCLEAN); - if (cstr.len + sizeof(*sd) - 1 > max_size) + if (cstr.len + sizeof(*sd) > max_size) return ERR_PTR(-EUCLEAN); err = fscrypt_fname_alloc_buffer(cstr.len, &pstr); diff --git a/fs/d_path.c b/fs/d_path.c index 56a6ee4c6331..5f4da5c8d5db 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -7,6 +7,7 @@ #include <linux/slab.h> #include <linux/prefetch.h> #include "mount.h" +#include "internal.h" struct prepend_buffer { char *buf; @@ -1148,7 +1148,7 @@ static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size, if (!zero_edge) { ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL); if (ret) - return ret; + return dax_mem2blk_err(ret); } if (copy_all) { @@ -1310,7 +1310,7 @@ static s64 dax_unshare_iter(struct iomap_iter *iter) out_unlock: dax_read_unlock(id); - return ret; + return dax_mem2blk_err(ret); } int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, @@ -1342,7 +1342,8 @@ static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size) ret = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL); if (ret < 0) - return ret; + return dax_mem2blk_err(ret); + memset(kaddr + offset, 0, size); if (iomap->flags & IOMAP_F_SHARED) ret = dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap, @@ -1498,7 +1499,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), DAX_ACCESS, &kaddr, NULL); - if (map_len == -EIO && iov_iter_rw(iter) == WRITE) { + if (map_len == -EHWPOISON && iov_iter_rw(iter) == WRITE) { map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), DAX_RECOVERY_WRITE, &kaddr, NULL); @@ -1506,7 +1507,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, recovery = true; } if (map_len < 0) { - ret = map_len; + ret = dax_mem2blk_err(map_len); break; } @@ -1830,7 +1831,6 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, vm_fault_t ret = VM_FAULT_FALLBACK; pgoff_t max_pgoff; void *entry; - int error; if (vmf->flags & FAULT_FLAG_WRITE) iter.flags |= IOMAP_WRITE; @@ -1877,7 +1877,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, } iter.pos = (loff_t)xas.xa_index << PAGE_SHIFT; - while ((error = iomap_iter(&iter, ops)) > 0) { + while (iomap_iter(&iter, ops) > 0) { if (iomap_length(&iter) < PMD_SIZE) continue; /* actually breaks out of the loop */ diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 1f971c880dde..b7711888dd17 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -940,15 +940,6 @@ static const struct file_operations fops_str_wo = { * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. - * - * This function will return a pointer to a dentry if it succeeds. This - * pointer must be passed to the debugfs_remove() function when the file is - * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be - * returned. - * - * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will - * be returned. */ void debugfs_create_str(const char *name, umode_t mode, struct dentry *parent, char **value) diff --git a/fs/direct-io.c b/fs/direct-io.c index 0b380bb8a81e..7bc494ee56b9 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -42,8 +42,8 @@ #include "internal.h" /* - * How many user pages to map in one call to get_user_pages(). This determines - * the size of a structure in the slab cache + * How many user pages to map in one call to iov_iter_extract_pages(). This + * determines the size of a structure in the slab cache */ #define DIO_PAGES 64 @@ -121,12 +121,13 @@ struct dio { struct inode *inode; loff_t i_size; /* i_size when submitted */ dio_iodone_t *end_io; /* IO completion function */ + bool is_pinned; /* T if we have pins on the pages */ void *private; /* copy from map_bh.b_private */ /* BIO completion state */ spinlock_t bio_lock; /* protects BIO fields below */ - int page_errors; /* errno from get_user_pages() */ + int page_errors; /* err from iov_iter_extract_pages() */ int is_async; /* is IO async ? */ bool defer_completion; /* defer AIO completion to workqueue? */ bool should_dirty; /* if pages should be dirtied */ @@ -165,14 +166,14 @@ static inline unsigned dio_pages_present(struct dio_submit *sdio) */ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) { + struct page **pages = dio->pages; const enum req_op dio_op = dio->opf & REQ_OP_MASK; ssize_t ret; - ret = iov_iter_get_pages2(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES, - &sdio->from); + ret = iov_iter_extract_pages(sdio->iter, &pages, LONG_MAX, + DIO_PAGES, 0, &sdio->from); if (ret < 0 && sdio->blocks_available && dio_op == REQ_OP_WRITE) { - struct page *page = ZERO_PAGE(0); /* * A memory fault, but the filesystem has some outstanding * mapped blocks. We need to use those blocks up to avoid @@ -180,8 +181,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) */ if (dio->page_errors == 0) dio->page_errors = ret; - get_page(page); - dio->pages[0] = page; + dio->pages[0] = ZERO_PAGE(0); sdio->head = 0; sdio->tail = 1; sdio->from = 0; @@ -201,9 +201,9 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) /* * Get another userspace page. Returns an ERR_PTR on error. Pages are - * buffered inside the dio so that we can call get_user_pages() against a - * decent number of pages, less frequently. To provide nicer use of the - * L1 cache. + * buffered inside the dio so that we can call iov_iter_extract_pages() + * against a decent number of pages, less frequently. To provide nicer use of + * the L1 cache. */ static inline struct page *dio_get_page(struct dio *dio, struct dio_submit *sdio) @@ -219,6 +219,18 @@ static inline struct page *dio_get_page(struct dio *dio, return dio->pages[sdio->head]; } +static void dio_pin_page(struct dio *dio, struct page *page) +{ + if (dio->is_pinned) + folio_add_pin(page_folio(page)); +} + +static void dio_unpin_page(struct dio *dio, struct page *page) +{ + if (dio->is_pinned) + unpin_user_page(page); +} + /* * dio_complete() - called when all DIO BIO I/O has been completed * @@ -285,14 +297,8 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags) * zeros from unwritten extents. */ if (flags & DIO_COMPLETE_INVALIDATE && - ret > 0 && dio_op == REQ_OP_WRITE && - dio->inode->i_mapping->nrpages) { - err = invalidate_inode_pages2_range(dio->inode->i_mapping, - offset >> PAGE_SHIFT, - (offset + ret - 1) >> PAGE_SHIFT); - if (err) - dio_warn_stale_pagecache(dio->iocb->ki_filp); - } + ret > 0 && dio_op == REQ_OP_WRITE) + kiocb_invalidate_post_direct_write(dio->iocb, ret); inode_dio_end(dio->inode); @@ -402,6 +408,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, bio->bi_end_io = dio_bio_end_aio; else bio->bi_end_io = dio_bio_end_io; + if (dio->is_pinned) + bio_set_flag(bio, BIO_PAGE_PINNED); sdio->bio = bio; sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; } @@ -442,8 +450,10 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) */ static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio) { - while (sdio->head < sdio->tail) - put_page(dio->pages[sdio->head++]); + if (dio->is_pinned) + unpin_user_pages(dio->pages + sdio->head, + sdio->tail - sdio->head); + sdio->head = sdio->tail; } /* @@ -674,7 +684,7 @@ out: * * Return zero on success. Non-zero means the caller needs to start a new BIO. */ -static inline int dio_bio_add_page(struct dio_submit *sdio) +static inline int dio_bio_add_page(struct dio *dio, struct dio_submit *sdio) { int ret; @@ -686,7 +696,7 @@ static inline int dio_bio_add_page(struct dio_submit *sdio) */ if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE) sdio->pages_in_io--; - get_page(sdio->cur_page); + dio_pin_page(dio, sdio->cur_page); sdio->final_block_in_bio = sdio->cur_page_block + (sdio->cur_page_len >> sdio->blkbits); ret = 0; @@ -741,11 +751,11 @@ static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio, goto out; } - if (dio_bio_add_page(sdio) != 0) { + if (dio_bio_add_page(dio, sdio) != 0) { dio_bio_submit(dio, sdio); ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh); if (ret == 0) { - ret = dio_bio_add_page(sdio); + ret = dio_bio_add_page(dio, sdio); BUG_ON(ret != 0); } } @@ -802,13 +812,13 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, */ if (sdio->cur_page) { ret = dio_send_cur_page(dio, sdio, map_bh); - put_page(sdio->cur_page); + dio_unpin_page(dio, sdio->cur_page); sdio->cur_page = NULL; if (ret) return ret; } - get_page(page); /* It is in dio */ + dio_pin_page(dio, page); /* It is in dio */ sdio->cur_page = page; sdio->cur_page_offset = offset; sdio->cur_page_len = len; @@ -823,7 +833,7 @@ out: ret = dio_send_cur_page(dio, sdio, map_bh); if (sdio->bio) dio_bio_submit(dio, sdio); - put_page(sdio->cur_page); + dio_unpin_page(dio, sdio->cur_page); sdio->cur_page = NULL; } return ret; @@ -924,7 +934,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, ret = get_more_blocks(dio, sdio, map_bh); if (ret) { - put_page(page); + dio_unpin_page(dio, page); goto out; } if (!buffer_mapped(map_bh)) @@ -969,7 +979,7 @@ do_holes: /* AKPM: eargh, -ENOTBLK is a hack */ if (dio_op == REQ_OP_WRITE) { - put_page(page); + dio_unpin_page(dio, page); return -ENOTBLK; } @@ -982,7 +992,7 @@ do_holes: if (sdio->block_in_file >= i_size_aligned >> blkbits) { /* We hit eof */ - put_page(page); + dio_unpin_page(dio, page); goto out; } zero_user(page, from, 1 << blkbits); @@ -1022,7 +1032,7 @@ do_holes: sdio->next_block_for_io, map_bh); if (ret) { - put_page(page); + dio_unpin_page(dio, page); goto out; } sdio->next_block_for_io += this_chunk_blocks; @@ -1037,8 +1047,8 @@ next_block: break; } - /* Drop the ref which was taken in get_user_pages() */ - put_page(page); + /* Drop the pin which was taken in get_user_pages() */ + dio_unpin_page(dio, page); } out: return ret; @@ -1133,6 +1143,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, /* will be released by direct_io_worker */ inode_lock(inode); } + dio->is_pinned = iov_iter_extract_will_pin(iter); /* Once we sampled i_size check for reads beyond EOF */ dio->i_size = i_size_read(inode); @@ -1257,7 +1268,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, ret2 = dio_send_cur_page(dio, &sdio, &map_bh); if (retval == 0) retval = ret2; - put_page(sdio.cur_page); + dio_unpin_page(dio, sdio.cur_page); sdio.cur_page = NULL; } if (sdio.bio) diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 700ff2e0515a..1f2f70a1b824 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -36,23 +36,6 @@ void dlm_callback_set_last_ptr(struct dlm_callback **from, *from = to; } -void dlm_purge_lkb_callbacks(struct dlm_lkb *lkb) -{ - struct dlm_callback *cb, *safe; - - list_for_each_entry_safe(cb, safe, &lkb->lkb_callbacks, list) { - list_del(&cb->list); - kref_put(&cb->ref, dlm_release_callback); - } - - clear_bit(DLM_IFL_CB_PENDING_BIT, &lkb->lkb_iflags); - - /* invalidate */ - dlm_callback_set_last_ptr(&lkb->lkb_last_cast, NULL); - dlm_callback_set_last_ptr(&lkb->lkb_last_cb, NULL); - lkb->lkb_last_bast_mode = -1; -} - int dlm_enqueue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, uint32_t sbflags) { @@ -181,10 +164,12 @@ void dlm_callback_work(struct work_struct *work) spin_lock(&lkb->lkb_cb_lock); rv = dlm_dequeue_lkb_callback(lkb, &cb); - spin_unlock(&lkb->lkb_cb_lock); - - if (WARN_ON_ONCE(rv == DLM_DEQUEUE_CALLBACK_EMPTY)) + if (WARN_ON_ONCE(rv == DLM_DEQUEUE_CALLBACK_EMPTY)) { + clear_bit(DLM_IFL_CB_PENDING_BIT, &lkb->lkb_iflags); + spin_unlock(&lkb->lkb_cb_lock); goto out; + } + spin_unlock(&lkb->lkb_cb_lock); for (;;) { castfn = lkb->lkb_astfn; diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h index 880b11882495..ce007892dc2d 100644 --- a/fs/dlm/ast.h +++ b/fs/dlm/ast.h @@ -26,7 +26,6 @@ void dlm_callback_set_last_ptr(struct dlm_callback **from, struct dlm_callback *to); void dlm_release_callback(struct kref *ref); -void dlm_purge_lkb_callbacks(struct dlm_lkb *lkb); void dlm_callback_work(struct work_struct *work); int dlm_callback_start(struct dlm_ls *ls); void dlm_callback_stop(struct dlm_ls *ls); diff --git a/fs/dlm/config.c b/fs/dlm/config.c index d31319d08581..2beceff024e3 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -116,9 +116,9 @@ static ssize_t cluster_cluster_name_store(struct config_item *item, { struct dlm_cluster *cl = config_item_to_cluster(item); - strlcpy(dlm_config.ci_cluster_name, buf, + strscpy(dlm_config.ci_cluster_name, buf, sizeof(dlm_config.ci_cluster_name)); - strlcpy(cl->cl_cluster_name, buf, sizeof(cl->cl_cluster_name)); + strscpy(cl->cl_cluster_name, buf, sizeof(cl->cl_cluster_name)); return len; } diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 986a9d7b1f33..c8156770205e 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -246,7 +246,7 @@ struct dlm_lkb { int8_t lkb_highbast; /* highest mode bast sent for */ int8_t lkb_wait_type; /* type of reply waiting for */ - int8_t lkb_wait_count; + atomic_t lkb_wait_count; int lkb_wait_nodeid; /* for debugging */ struct list_head lkb_statequeue; /* rsb g/c/w list */ diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index debf8a55ad7d..f511a9d7d416 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1407,6 +1407,7 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; int error = 0; + int wc; mutex_lock(&ls->ls_waiters_mutex); @@ -1428,20 +1429,17 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid) error = -EBUSY; goto out; } - lkb->lkb_wait_count++; + wc = atomic_inc_return(&lkb->lkb_wait_count); hold_lkb(lkb); log_debug(ls, "addwait %x cur %d overlap %d count %d f %x", - lkb->lkb_id, lkb->lkb_wait_type, mstype, - lkb->lkb_wait_count, dlm_iflags_val(lkb)); + lkb->lkb_id, lkb->lkb_wait_type, mstype, wc, + dlm_iflags_val(lkb)); goto out; } - DLM_ASSERT(!lkb->lkb_wait_count, - dlm_print_lkb(lkb); - printk("wait_count %d\n", lkb->lkb_wait_count);); - - lkb->lkb_wait_count++; + wc = atomic_fetch_inc(&lkb->lkb_wait_count); + DLM_ASSERT(!wc, dlm_print_lkb(lkb); printk("wait_count %d\n", wc);); lkb->lkb_wait_type = mstype; lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */ hold_lkb(lkb); @@ -1504,7 +1502,7 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype, log_debug(ls, "remwait %x convert_reply zap overlap_cancel", lkb->lkb_id); lkb->lkb_wait_type = 0; - lkb->lkb_wait_count--; + atomic_dec(&lkb->lkb_wait_count); unhold_lkb(lkb); goto out_del; } @@ -1531,16 +1529,15 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype, if (overlap_done && lkb->lkb_wait_type) { log_error(ls, "remwait error %x reply %d wait_type %d overlap", lkb->lkb_id, mstype, lkb->lkb_wait_type); - lkb->lkb_wait_count--; + atomic_dec(&lkb->lkb_wait_count); unhold_lkb(lkb); lkb->lkb_wait_type = 0; } - DLM_ASSERT(lkb->lkb_wait_count, dlm_print_lkb(lkb);); + DLM_ASSERT(atomic_read(&lkb->lkb_wait_count), dlm_print_lkb(lkb);); clear_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags); - lkb->lkb_wait_count--; - if (!lkb->lkb_wait_count) + if (atomic_dec_and_test(&lkb->lkb_wait_count)) list_del_init(&lkb->lkb_wait_reply); unhold_lkb(lkb); return 0; @@ -2669,7 +2666,7 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, goto out; /* lock not allowed if there's any op in progress */ - if (lkb->lkb_wait_type || lkb->lkb_wait_count) + if (lkb->lkb_wait_type || atomic_read(&lkb->lkb_wait_count)) goto out; if (is_overlap(lkb)) @@ -2731,7 +2728,7 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) /* normal unlock not allowed if there's any op in progress */ if (!(args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) && - (lkb->lkb_wait_type || lkb->lkb_wait_count)) + (lkb->lkb_wait_type || atomic_read(&lkb->lkb_wait_count))) goto out; /* an lkb may be waiting for an rsb lookup to complete where the @@ -4616,7 +4613,7 @@ static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms, { int error = 0, noent = 0; - if (!dlm_is_member(ls, le32_to_cpu(ms->m_header.h_nodeid))) { + if (WARN_ON_ONCE(!dlm_is_member(ls, le32_to_cpu(ms->m_header.h_nodeid)))) { log_limit(ls, "receive %d from non-member %d %x %x %d", le32_to_cpu(ms->m_type), le32_to_cpu(ms->m_header.h_nodeid), @@ -4754,7 +4751,7 @@ static void dlm_receive_message(struct dlm_ls *ls, struct dlm_message *ms, /* If we were a member of this lockspace, left, and rejoined, other nodes may still be sending us messages from the lockspace generation before we left. */ - if (!ls->ls_generation) { + if (WARN_ON_ONCE(!ls->ls_generation)) { log_limit(ls, "receive %d from %d ignore old gen", le32_to_cpu(ms->m_type), nodeid); return; @@ -5066,10 +5063,9 @@ int dlm_recover_waiters_post(struct dlm_ls *ls) /* drop all wait_count references we still * hold a reference for this iteration. */ - while (lkb->lkb_wait_count) { - lkb->lkb_wait_count--; + while (!atomic_dec_and_test(&lkb->lkb_wait_count)) unhold_lkb(lkb); - } + mutex_lock(&ls->ls_waiters_mutex); list_del_init(&lkb->lkb_wait_reply); mutex_unlock(&ls->ls_waiters_mutex); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 67261b7b1f0e..0455dddb0797 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -935,15 +935,3 @@ void dlm_stop_lockspaces(void) log_print("dlm user daemon left %d lockspaces", count); } -void dlm_stop_lockspaces_check(void) -{ - struct dlm_ls *ls; - - spin_lock(&lslist_lock); - list_for_each_entry(ls, &lslist, ls_list) { - if (WARN_ON(!rwsem_is_locked(&ls->ls_in_recovery) || - !dlm_locking_stopped(ls))) - break; - } - spin_unlock(&lslist_lock); -} diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h index 03f4a4a3a871..47ebd4411926 100644 --- a/fs/dlm/lockspace.h +++ b/fs/dlm/lockspace.h @@ -27,7 +27,6 @@ struct dlm_ls *dlm_find_lockspace_local(void *id); struct dlm_ls *dlm_find_lockspace_device(int minor); void dlm_put_lockspace(struct dlm_ls *ls); void dlm_stop_lockspaces(void); -void dlm_stop_lockspaces_check(void); int dlm_new_user_lockspace(const char *name, const char *cluster, uint32_t flags, int lvblen, const struct dlm_lockspace_ops *ops, diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 3d3802c47b8b..9f14ea9f6322 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -546,9 +546,6 @@ int dlm_lowcomms_connect_node(int nodeid) struct connection *con; int idx; - if (nodeid == dlm_our_nodeid()) - return 0; - idx = srcu_read_lock(&connections_srcu); con = nodeid2con(nodeid, 0); if (WARN_ON_ONCE(!con)) { @@ -735,19 +732,15 @@ static void stop_connection_io(struct connection *con) if (con->othercon) stop_connection_io(con->othercon); + spin_lock_bh(&con->writequeue_lock); + set_bit(CF_IO_STOP, &con->flags); + spin_unlock_bh(&con->writequeue_lock); + down_write(&con->sock_lock); if (con->sock) { lock_sock(con->sock->sk); restore_callbacks(con->sock->sk); - - spin_lock_bh(&con->writequeue_lock); - set_bit(CF_IO_STOP, &con->flags); - spin_unlock_bh(&con->writequeue_lock); release_sock(con->sock->sk); - } else { - spin_lock_bh(&con->writequeue_lock); - set_bit(CF_IO_STOP, &con->flags); - spin_unlock_bh(&con->writequeue_lock); } up_write(&con->sock_lock); @@ -867,30 +860,8 @@ struct dlm_processed_nodes { struct list_head list; }; -static void add_processed_node(int nodeid, struct list_head *processed_nodes) -{ - struct dlm_processed_nodes *n; - - list_for_each_entry(n, processed_nodes, list) { - /* we already remembered this node */ - if (n->nodeid == nodeid) - return; - } - - /* if it's fails in worst case we simple don't send an ack back. - * We try it next time. - */ - n = kmalloc(sizeof(*n), GFP_NOFS); - if (!n) - return; - - n->nodeid = nodeid; - list_add(&n->list, processed_nodes); -} - static void process_dlm_messages(struct work_struct *work) { - struct dlm_processed_nodes *n, *n_tmp; struct processqueue_entry *pentry; LIST_HEAD(processed_nodes); @@ -898,6 +869,7 @@ static void process_dlm_messages(struct work_struct *work) pentry = list_first_entry_or_null(&processqueue, struct processqueue_entry, list); if (WARN_ON_ONCE(!pentry)) { + process_dlm_messages_pending = false; spin_unlock(&processqueue_lock); return; } @@ -908,7 +880,6 @@ static void process_dlm_messages(struct work_struct *work) for (;;) { dlm_process_incoming_buffer(pentry->nodeid, pentry->buf, pentry->buflen); - add_processed_node(pentry->nodeid, &processed_nodes); free_processqueue_entry(pentry); spin_lock(&processqueue_lock); @@ -923,13 +894,6 @@ static void process_dlm_messages(struct work_struct *work) list_del(&pentry->list); spin_unlock(&processqueue_lock); } - - /* send ack back after we processed couple of messages */ - list_for_each_entry_safe(n, n_tmp, &processed_nodes, list) { - list_del(&n->list); - dlm_midcomms_receive_done(n->nodeid); - kfree(n); - } } /* Data received from remote end */ @@ -1395,8 +1359,11 @@ int dlm_lowcomms_resend_msg(struct dlm_msg *msg) /* Send a message */ static int send_to_sock(struct connection *con) { - const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; struct writequeue_entry *e; + struct bio_vec bvec; + struct msghdr msg = { + .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT | MSG_NOSIGNAL, + }; int len, offset, ret; spin_lock_bh(&con->writequeue_lock); @@ -1412,8 +1379,9 @@ static int send_to_sock(struct connection *con) WARN_ON_ONCE(len == 0 && e->users == 0); spin_unlock_bh(&con->writequeue_lock); - ret = kernel_sendpage(con->sock, e->page, offset, len, - msg_flags); + bvec_set_page(&bvec, e->page, len, offset); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len); + ret = sock_sendmsg(con->sock, &msg); trace_dlm_send(con->nodeid, ret); if (ret == -EAGAIN || ret == 0) { lock_sock(con->sock->sk); @@ -1496,8 +1464,7 @@ int dlm_lowcomms_close(int nodeid) call_srcu(&connections_srcu, &con->rcu, connection_release); if (con->othercon) { clean_one_writequeue(con->othercon); - if (con->othercon) - call_srcu(&connections_srcu, &con->othercon->rcu, connection_release); + call_srcu(&connections_srcu, &con->othercon->rcu, connection_release); } srcu_read_unlock(&connections_srcu, idx); diff --git a/fs/dlm/main.c b/fs/dlm/main.c index 93755f83a30d..6ca28299c9db 100644 --- a/fs/dlm/main.c +++ b/fs/dlm/main.c @@ -73,10 +73,10 @@ static void __exit exit_dlm(void) dlm_plock_exit(); dlm_user_exit(); dlm_config_exit(); - dlm_memory_exit(); dlm_lockspace_exit(); dlm_midcomms_exit(); dlm_unregister_debugfs(); + dlm_memory_exit(); } module_init(init_dlm); diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 923c01a8a0aa..77d202e4a02a 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -307,6 +307,21 @@ static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new) } } +static int add_remote_member(int nodeid) +{ + int error; + + if (nodeid == dlm_our_nodeid()) + return 0; + + error = dlm_lowcomms_connect_node(nodeid); + if (error < 0) + return error; + + dlm_midcomms_add_member(nodeid); + return 0; +} + static int dlm_add_member(struct dlm_ls *ls, struct dlm_config_node *node) { struct dlm_member *memb; @@ -316,16 +331,16 @@ static int dlm_add_member(struct dlm_ls *ls, struct dlm_config_node *node) if (!memb) return -ENOMEM; - error = dlm_lowcomms_connect_node(node->nodeid); + memb->nodeid = node->nodeid; + memb->weight = node->weight; + memb->comm_seq = node->comm_seq; + + error = add_remote_member(node->nodeid); if (error < 0) { kfree(memb); return error; } - memb->nodeid = node->nodeid; - memb->weight = node->weight; - memb->comm_seq = node->comm_seq; - dlm_midcomms_add_member(node->nodeid); add_ordered_member(ls, memb); ls->ls_num_nodes++; return 0; @@ -370,11 +385,19 @@ static void clear_memb_list(struct list_head *head, } } -static void clear_members_cb(int nodeid) +static void remove_remote_member(int nodeid) { + if (nodeid == dlm_our_nodeid()) + return; + dlm_midcomms_remove_member(nodeid); } +static void clear_members_cb(int nodeid) +{ + remove_remote_member(nodeid); +} + void dlm_clear_members(struct dlm_ls *ls) { clear_memb_list(&ls->ls_nodes, clear_members_cb); @@ -562,7 +585,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) neg++; list_move(&memb->list, &ls->ls_nodes_gone); - dlm_midcomms_remove_member(memb->nodeid); + remove_remote_member(memb->nodeid); ls->ls_num_nodes--; dlm_lsop_recover_slot(ls, memb); } diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index c02c43e4980a..e1a0df67b566 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -136,7 +136,6 @@ #include <net/tcp.h> #include "dlm_internal.h" -#include "lockspace.h" #include "lowcomms.h" #include "config.h" #include "memory.h" @@ -149,12 +148,14 @@ /* 5 seconds wait to sync ending of dlm */ #define DLM_SHUTDOWN_TIMEOUT msecs_to_jiffies(5000) #define DLM_VERSION_NOT_SET 0 +#define DLM_SEND_ACK_BACK_MSG_THRESHOLD 32 +#define DLM_RECV_ACK_BACK_MSG_THRESHOLD (DLM_SEND_ACK_BACK_MSG_THRESHOLD * 8) struct midcomms_node { int nodeid; uint32_t version; - uint32_t seq_send; - uint32_t seq_next; + atomic_t seq_send; + atomic_t seq_next; /* These queues are unbound because we cannot drop any message in dlm. * We could send a fence signal for a specific node to the cluster * manager if queues hits some maximum value, however this handling @@ -166,7 +167,7 @@ struct midcomms_node { #define DLM_NODE_FLAG_CLOSE 1 #define DLM_NODE_FLAG_STOP_TX 2 #define DLM_NODE_FLAG_STOP_RX 3 -#define DLM_NODE_ULP_DELIVERED 4 + atomic_t ulp_delivered; unsigned long flags; wait_queue_head_t shutdown_wait; @@ -318,8 +319,9 @@ static void midcomms_node_reset(struct midcomms_node *node) { pr_debug("reset node %d\n", node->nodeid); - node->seq_next = DLM_SEQ_INIT; - node->seq_send = DLM_SEQ_INIT; + atomic_set(&node->seq_next, DLM_SEQ_INIT); + atomic_set(&node->seq_send, DLM_SEQ_INIT); + atomic_set(&node->ulp_delivered, 0); node->version = DLM_VERSION_NOT_SET; node->flags = 0; @@ -394,6 +396,28 @@ static int dlm_send_ack(int nodeid, uint32_t seq) return 0; } +static void dlm_send_ack_threshold(struct midcomms_node *node, + uint32_t threshold) +{ + uint32_t oval, nval; + bool send_ack; + + /* let only send one user trigger threshold to send ack back */ + do { + oval = atomic_read(&node->ulp_delivered); + send_ack = (oval > threshold); + /* abort if threshold is not reached */ + if (!send_ack) + break; + + nval = 0; + /* try to reset ulp_delivered counter */ + } while (atomic_cmpxchg(&node->ulp_delivered, oval, nval) != oval); + + if (send_ack) + dlm_send_ack(node->nodeid, atomic_read(&node->seq_next)); +} + static int dlm_send_fin(struct midcomms_node *node, void (*ack_rcv)(struct midcomms_node *node)) { @@ -493,9 +517,19 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, struct midcomms_node *node, uint32_t seq) { - if (seq == node->seq_next) { - node->seq_next++; + bool is_expected_seq; + uint32_t oval, nval; + do { + oval = atomic_read(&node->seq_next); + is_expected_seq = (oval == seq); + if (!is_expected_seq) + break; + + nval = oval + 1; + } while (atomic_cmpxchg(&node->seq_next, oval, nval) != oval); + + if (is_expected_seq) { switch (p->header.h_cmd) { case DLM_FIN: spin_lock(&node->state_lock); @@ -504,7 +538,7 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, switch (node->state) { case DLM_ESTABLISHED: - dlm_send_ack(node->nodeid, node->seq_next); + dlm_send_ack(node->nodeid, nval); /* passive shutdown DLM_LAST_ACK case 1 * additional we check if the node is used by @@ -523,14 +557,14 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, } break; case DLM_FIN_WAIT1: - dlm_send_ack(node->nodeid, node->seq_next); + dlm_send_ack(node->nodeid, nval); node->state = DLM_CLOSING; set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); pr_debug("switch node %d to state %s\n", node->nodeid, dlm_state_str(node->state)); break; case DLM_FIN_WAIT2: - dlm_send_ack(node->nodeid, node->seq_next); + dlm_send_ack(node->nodeid, nval); midcomms_node_reset(node); pr_debug("switch node %d to state %s\n", node->nodeid, dlm_state_str(node->state)); @@ -551,18 +585,20 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); dlm_receive_buffer_3_2_trace(seq, p); dlm_receive_buffer(p, node->nodeid); - set_bit(DLM_NODE_ULP_DELIVERED, &node->flags); + atomic_inc(&node->ulp_delivered); + /* unlikely case to send ack back when we don't transmit */ + dlm_send_ack_threshold(node, DLM_RECV_ACK_BACK_MSG_THRESHOLD); break; } } else { /* retry to ack message which we already have by sending back * current node->seq_next number as ack. */ - if (seq < node->seq_next) - dlm_send_ack(node->nodeid, node->seq_next); + if (seq < oval) + dlm_send_ack(node->nodeid, oval); log_print_ratelimited("ignore dlm msg because seq mismatch, seq: %u, expected: %u, nodeid: %d", - seq, node->seq_next, node->nodeid); + seq, oval, node->nodeid); } } @@ -960,49 +996,6 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) return ret; } -void dlm_midcomms_receive_done(int nodeid) -{ - struct midcomms_node *node; - int idx; - - idx = srcu_read_lock(&nodes_srcu); - node = nodeid2node(nodeid, 0); - if (!node) { - srcu_read_unlock(&nodes_srcu, idx); - return; - } - - /* old protocol, we do nothing */ - switch (node->version) { - case DLM_VERSION_3_2: - break; - default: - srcu_read_unlock(&nodes_srcu, idx); - return; - } - - /* do nothing if we didn't delivered stateful to ulp */ - if (!test_and_clear_bit(DLM_NODE_ULP_DELIVERED, - &node->flags)) { - srcu_read_unlock(&nodes_srcu, idx); - return; - } - - spin_lock(&node->state_lock); - /* we only ack if state is ESTABLISHED */ - switch (node->state) { - case DLM_ESTABLISHED: - spin_unlock(&node->state_lock); - dlm_send_ack(node->nodeid, node->seq_next); - break; - default: - spin_unlock(&node->state_lock); - /* do nothing FIN has it's own ack send */ - break; - } - srcu_read_unlock(&nodes_srcu, idx); -} - void dlm_midcomms_unack_msg_resend(int nodeid) { struct midcomms_node *node; @@ -1059,7 +1052,7 @@ static void midcomms_new_msg_cb(void *data) list_add_tail_rcu(&mh->list, &mh->node->send_queue); spin_unlock_bh(&mh->node->send_queue_lock); - mh->seq = mh->node->seq_send++; + mh->seq = atomic_fetch_inc(&mh->node->seq_send); } static struct dlm_msg *dlm_midcomms_get_msg_3_2(struct dlm_mhandle *mh, int nodeid, @@ -1133,6 +1126,8 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, goto err; } + /* send ack back if necessary */ + dlm_send_ack_threshold(node, DLM_SEND_ACK_BACK_MSG_THRESHOLD); break; default: dlm_free_mhandle(mh); @@ -1281,9 +1276,6 @@ void dlm_midcomms_add_member(int nodeid) struct midcomms_node *node; int idx; - if (nodeid == dlm_our_nodeid()) - return; - idx = srcu_read_lock(&nodes_srcu); node = nodeid2node(nodeid, GFP_NOFS); if (!node) { @@ -1329,9 +1321,6 @@ void dlm_midcomms_remove_member(int nodeid) struct midcomms_node *node; int idx; - if (nodeid == dlm_our_nodeid()) - return; - idx = srcu_read_lock(&nodes_srcu); node = nodeid2node(nodeid, 0); if (!node) { @@ -1488,11 +1477,6 @@ int dlm_midcomms_close(int nodeid) struct midcomms_node *node; int idx, ret; - if (nodeid == dlm_our_nodeid()) - return 0; - - dlm_stop_lockspaces_check(); - idx = srcu_read_lock(&nodes_srcu); /* Abort pending close/remove operation */ node = nodeid2node(nodeid, 0); @@ -1542,7 +1526,7 @@ static void midcomms_new_rawmsg_cb(void *data) switch (h->h_cmd) { case DLM_OPTS: if (!h->u.h_seq) - h->u.h_seq = cpu_to_le32(rd->node->seq_send++); + h->u.h_seq = cpu_to_le32(atomic_fetch_inc(&rd->node->seq_send)); break; default: break; diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index ed4357e62f35..70a4752ed913 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -30,8 +30,6 @@ struct plock_async_data { struct plock_op { struct list_head list; int done; - /* if lock op got interrupted while waiting dlm_controld reply */ - bool sigint; struct dlm_plock_info info; /* if set indicates async handling */ struct plock_async_data *data; @@ -157,23 +155,29 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, send_op(op); - rv = wait_event_interruptible(recv_wq, (op->done != 0)); - if (rv == -ERESTARTSYS) { - spin_lock(&ops_lock); - /* recheck under ops_lock if we got a done != 0, - * if so this interrupt case should be ignored - */ - if (op->done != 0) { + if (op->info.wait) { + rv = wait_event_killable(recv_wq, (op->done != 0)); + if (rv == -ERESTARTSYS) { + spin_lock(&ops_lock); + /* recheck under ops_lock if we got a done != 0, + * if so this interrupt case should be ignored + */ + if (op->done != 0) { + spin_unlock(&ops_lock); + goto do_lock_wait; + } + list_del(&op->list); spin_unlock(&ops_lock); - goto do_lock_wait; - } - op->sigint = true; - spin_unlock(&ops_lock); - log_debug(ls, "%s: wait interrupted %x %llx pid %d", - __func__, ls->ls_global_id, - (unsigned long long)number, op->info.pid); - goto out; + log_debug(ls, "%s: wait interrupted %x %llx pid %d", + __func__, ls->ls_global_id, + (unsigned long long)number, op->info.pid); + do_unlock_close(&op->info); + dlm_release_plock_op(op); + goto out; + } + } else { + wait_event(recv_wq, (op->done != 0)); } do_lock_wait: @@ -360,7 +364,9 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file, locks_init_lock(fl); fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK; fl->fl_flags = FL_POSIX; - fl->fl_pid = -op->info.pid; + fl->fl_pid = op->info.pid; + if (op->info.nodeid != dlm_our_nodeid()) + fl->fl_pid = -fl->fl_pid; fl->fl_start = op->info.start; fl->fl_end = op->info.end; rv = 0; @@ -389,7 +395,7 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count, if (op->info.flags & DLM_PLOCK_FL_CLOSE) list_del(&op->list); else - list_move(&op->list, &recv_list); + list_move_tail(&op->list, &recv_list); memcpy(&info, &op->info, sizeof(info)); } spin_unlock(&ops_lock); @@ -427,34 +433,53 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count, if (check_version(&info)) return -EINVAL; + /* + * The results for waiting ops (SETLKW) can be returned in any + * order, so match all fields to find the op. The results for + * non-waiting ops are returned in the order that they were sent + * to userspace, so match the result with the first non-waiting op. + */ spin_lock(&ops_lock); - list_for_each_entry(iter, &recv_list, list) { - if (iter->info.fsid == info.fsid && - iter->info.number == info.number && - iter->info.owner == info.owner) { - if (iter->sigint) { - list_del(&iter->list); - spin_unlock(&ops_lock); - - pr_debug("%s: sigint cleanup %x %llx pid %d", - __func__, iter->info.fsid, - (unsigned long long)iter->info.number, - iter->info.pid); - do_unlock_close(&iter->info); - memcpy(&iter->info, &info, sizeof(info)); - dlm_release_plock_op(iter); - return count; + if (info.wait) { + list_for_each_entry(iter, &recv_list, list) { + if (iter->info.fsid == info.fsid && + iter->info.number == info.number && + iter->info.owner == info.owner && + iter->info.pid == info.pid && + iter->info.start == info.start && + iter->info.end == info.end && + iter->info.ex == info.ex && + iter->info.wait) { + op = iter; + break; + } + } + } else { + list_for_each_entry(iter, &recv_list, list) { + if (!iter->info.wait) { + op = iter; + break; } - list_del_init(&iter->list); - memcpy(&iter->info, &info, sizeof(info)); - if (iter->data) - do_callback = 1; - else - iter->done = 1; - op = iter; - break; } } + + if (op) { + /* Sanity check that op and info match. */ + if (info.wait) + WARN_ON(op->info.optype != DLM_PLOCK_OP_LOCK); + else + WARN_ON(op->info.fsid != info.fsid || + op->info.number != info.number || + op->info.owner != info.owner || + op->info.optype != info.optype); + + list_del_init(&op->list); + memcpy(&op->info, &info, sizeof(info)); + if (op->data) + do_callback = 1; + else + op->done = 1; + } spin_unlock(&ops_lock); if (op) { @@ -463,8 +488,8 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count, else wake_up(&recv_wq); } else - log_print("%s: no op %x %llx", __func__, - info.fsid, (unsigned long long)info.number); + pr_debug("%s: no op %x %llx", __func__, + info.fsid, (unsigned long long)info.number); return count; } diff --git a/fs/dlm/user.c b/fs/dlm/user.c index d9c09fc0aba1..695e691b38b3 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -145,6 +145,24 @@ static void compat_output(struct dlm_lock_result *res, } #endif +/* should held proc->asts_spin lock */ +void dlm_purge_lkb_callbacks(struct dlm_lkb *lkb) +{ + struct dlm_callback *cb, *safe; + + list_for_each_entry_safe(cb, safe, &lkb->lkb_callbacks, list) { + list_del(&cb->list); + kref_put(&cb->ref, dlm_release_callback); + } + + clear_bit(DLM_IFL_CB_PENDING_BIT, &lkb->lkb_iflags); + + /* invalidate */ + dlm_callback_set_last_ptr(&lkb->lkb_last_cast, NULL); + dlm_callback_set_last_ptr(&lkb->lkb_last_cb, NULL); + lkb->lkb_last_bast_mode = -1; +} + /* Figure out if this lock is at the end of its life and no longer available for the application to use. The lkb still exists until the final ast is read. A lock becomes EOL in three situations: diff --git a/fs/dlm/user.h b/fs/dlm/user.h index 33059452d79e..2caf8e6e24d5 100644 --- a/fs/dlm/user.h +++ b/fs/dlm/user.h @@ -6,6 +6,7 @@ #ifndef __USER_DOT_H__ #define __USER_DOT_H__ +void dlm_purge_lkb_callbacks(struct dlm_lkb *lkb); void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, uint32_t sbflags); int dlm_user_init(void); diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 268b74499c28..ce0a3c5ed0ca 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -44,6 +44,31 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb, return rc; } +/* + * ecryptfs_splice_read_update_atime + * + * filemap_splice_read updates the atime of upper layer inode. But, it + * doesn't give us a chance to update the atime of the lower layer inode. This + * function is a wrapper to generic_file_read. It updates the atime of the + * lower level inode if generic_file_read returns without any errors. This is + * to be used only for file reads. The function to be used for directory reads + * is ecryptfs_read. + */ +static ssize_t ecryptfs_splice_read_update_atime(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + ssize_t rc; + const struct path *path; + + rc = filemap_splice_read(in, ppos, pipe, len, flags); + if (rc >= 0) { + path = ecryptfs_dentry_to_lower_path(in->f_path.dentry); + touch_atime(path); + } + return rc; +} + struct ecryptfs_getdents_callback { struct dir_context ctx; struct dir_context *caller; @@ -414,5 +439,5 @@ const struct file_operations ecryptfs_main_fops = { .release = ecryptfs_release, .fsync = ecryptfs_fsync, .fasync = ecryptfs_fasync, - .splice_read = generic_file_splice_read, + .splice_read = ecryptfs_splice_read_update_atime, }; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 482d612b716b..e028fafa04f3 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -13,6 +13,7 @@ #include <linux/ucs2_string.h> #include <linux/slab.h> #include <linux/magic.h> +#include <linux/statfs.h> #include "internal.h" @@ -23,8 +24,44 @@ static void efivarfs_evict_inode(struct inode *inode) clear_inode(inode); } +static int efivarfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + const u32 attr = EFI_VARIABLE_NON_VOLATILE | + EFI_VARIABLE_BOOTSERVICE_ACCESS | + EFI_VARIABLE_RUNTIME_ACCESS; + u64 storage_space, remaining_space, max_variable_size; + efi_status_t status; + + status = efivar_query_variable_info(attr, &storage_space, &remaining_space, + &max_variable_size); + if (status != EFI_SUCCESS) + return efi_status_to_err(status); + + /* + * This is not a normal filesystem, so no point in pretending it has a block + * size; we declare f_bsize to 1, so that we can then report the exact value + * sent by EFI QueryVariableInfo in f_blocks and f_bfree + */ + buf->f_bsize = 1; + buf->f_namelen = NAME_MAX; + buf->f_blocks = storage_space; + buf->f_bfree = remaining_space; + buf->f_type = dentry->d_sb->s_magic; + + /* + * In f_bavail we declare the free space that the kernel will allow writing + * when the storage_paranoia x86 quirk is active. To use more, users + * should boot the kernel with efi_no_storage_paranoia. + */ + if (remaining_space > efivar_reserved_space()) + buf->f_bavail = remaining_space - efivar_reserved_space(); + else + buf->f_bavail = 0; + + return 0; +} static const struct super_operations efivarfs_ops = { - .statfs = simple_statfs, + .statfs = efivarfs_statfs, .drop_inode = generic_delete_inode, .evict_inode = efivarfs_evict_inode, }; diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 26fa170090b8..b1b846504027 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -89,8 +89,7 @@ static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, unsigned int padbufsize); -int z_erofs_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool); +extern const struct z_erofs_decompressor erofs_decompressors[]; /* prototypes for specific algorithms */ int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 6fe9a779fa91..db5e4b7636ec 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -448,5 +448,5 @@ const struct file_operations erofs_file_fops = { .llseek = generic_file_llseek, .read_iter = erofs_file_read_iter, .mmap = erofs_file_mmap, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, }; diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 7021e2cf6146..2a29943fa5cc 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -363,7 +363,7 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, return 0; } -static struct z_erofs_decompressor decompressors[] = { +const struct z_erofs_decompressor erofs_decompressors[] = { [Z_EROFS_COMPRESSION_SHIFTED] = { .decompress = z_erofs_transform_plain, .name = "shifted" @@ -383,9 +383,3 @@ static struct z_erofs_decompressor decompressors[] = { }, #endif }; - -int z_erofs_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool) -{ - return decompressors[rq->alg].decompress(rq, pagepool); -} diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 1e39c03357d1..36e32fa542f0 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -208,46 +208,12 @@ enum { EROFS_ZIP_CACHE_READAROUND }; -#define EROFS_LOCKED_MAGIC (INT_MIN | 0xE0F510CCL) - /* basic unit of the workstation of a super_block */ struct erofs_workgroup { - /* the workgroup index in the workstation */ pgoff_t index; - - /* overall workgroup reference count */ - atomic_t refcount; + struct lockref lockref; }; -static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp, - int val) -{ - preempt_disable(); - if (val != atomic_cmpxchg(&grp->refcount, val, EROFS_LOCKED_MAGIC)) { - preempt_enable(); - return false; - } - return true; -} - -static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp, - int orig_val) -{ - /* - * other observers should notice all modifications - * in the freezing period. - */ - smp_mb(); - atomic_set(&grp->refcount, orig_val); - preempt_enable(); -} - -static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) -{ - return atomic_cond_read_relaxed(&grp->refcount, - VAL != EROFS_LOCKED_MAGIC); -} - enum erofs_kmap_type { EROFS_NO_KMAP, /* don't map the buffer */ EROFS_KMAP, /* use kmap_local_page() to map the buffer */ @@ -486,7 +452,7 @@ static inline void erofs_pagepool_add(struct page **pagepool, struct page *page) void erofs_release_pages(struct page **pagepool); #ifdef CONFIG_EROFS_FS_ZIP -int erofs_workgroup_put(struct erofs_workgroup *grp); +void erofs_workgroup_put(struct erofs_workgroup *grp); struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, pgoff_t index); struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, @@ -500,7 +466,6 @@ int __init z_erofs_init_zip_subsystem(void); void z_erofs_exit_zip_subsystem(void); int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, struct erofs_workgroup *egrp); -int erofs_try_to_free_cached_page(struct page *page); int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, struct z_erofs_lz4_cfgs *lz4, int len); @@ -511,6 +476,7 @@ void erofs_put_pcpubuf(void *ptr); int erofs_pcpubuf_growsize(unsigned int nrpages); void __init erofs_pcpubuf_init(void); void erofs_pcpubuf_exit(void); +int erofs_init_managed_cache(struct super_block *sb); #else static inline void erofs_shrinker_register(struct super_block *sb) {} static inline void erofs_shrinker_unregister(struct super_block *sb) {} @@ -530,6 +496,7 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb, } static inline void erofs_pcpubuf_init(void) {} static inline void erofs_pcpubuf_exit(void) {} +static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; } #endif /* !CONFIG_EROFS_FS_ZIP */ #ifdef CONFIG_EROFS_FS_ZIP_LZMA diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 811ab66d805e..9d6a3c6158bd 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -19,6 +19,7 @@ #include <trace/events/erofs.h> static struct kmem_cache *erofs_inode_cachep __read_mostly; +struct file_system_type erofs_fs_type; void _erofs_err(struct super_block *sb, const char *function, const char *fmt, ...) @@ -253,8 +254,8 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, return PTR_ERR(fscache); dif->fscache = fscache; } else if (!sbi->devs->flatdev) { - bdev = blkdev_get_by_path(dif->path, FMODE_READ | FMODE_EXCL, - sb->s_type); + bdev = blkdev_get_by_path(dif->path, BLK_OPEN_READ, sb->s_type, + NULL); if (IS_ERR(bdev)) return PTR_ERR(bdev); dif->bdev = bdev; @@ -599,68 +600,6 @@ static int erofs_fc_parse_param(struct fs_context *fc, return 0; } -#ifdef CONFIG_EROFS_FS_ZIP -static const struct address_space_operations managed_cache_aops; - -static bool erofs_managed_cache_release_folio(struct folio *folio, gfp_t gfp) -{ - bool ret = true; - struct address_space *const mapping = folio->mapping; - - DBG_BUGON(!folio_test_locked(folio)); - DBG_BUGON(mapping->a_ops != &managed_cache_aops); - - if (folio_test_private(folio)) - ret = erofs_try_to_free_cached_page(&folio->page); - - return ret; -} - -/* - * It will be called only on inode eviction. In case that there are still some - * decompression requests in progress, wait with rescheduling for a bit here. - * We could introduce an extra locking instead but it seems unnecessary. - */ -static void erofs_managed_cache_invalidate_folio(struct folio *folio, - size_t offset, size_t length) -{ - const size_t stop = length + offset; - - DBG_BUGON(!folio_test_locked(folio)); - - /* Check for potential overflow in debug mode */ - DBG_BUGON(stop > folio_size(folio) || stop < length); - - if (offset == 0 && stop == folio_size(folio)) - while (!erofs_managed_cache_release_folio(folio, GFP_NOFS)) - cond_resched(); -} - -static const struct address_space_operations managed_cache_aops = { - .release_folio = erofs_managed_cache_release_folio, - .invalidate_folio = erofs_managed_cache_invalidate_folio, -}; - -static int erofs_init_managed_cache(struct super_block *sb) -{ - struct erofs_sb_info *const sbi = EROFS_SB(sb); - struct inode *const inode = new_inode(sb); - - if (!inode) - return -ENOMEM; - - set_nlink(inode, 1); - inode->i_size = OFFSET_MAX; - - inode->i_mapping->a_ops = &managed_cache_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); - sbi->managed_cache = inode; - return 0; -} -#else -static int erofs_init_managed_cache(struct super_block *sb) { return 0; } -#endif - static struct inode *erofs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { @@ -877,7 +816,7 @@ static int erofs_release_device_info(int id, void *ptr, void *data) fs_put_dax(dif->dax_dev, NULL); if (dif->bdev) - blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL); + blkdev_put(dif->bdev, &erofs_fs_type); erofs_fscache_unregister_cookie(dif->fscache); dif->fscache = NULL; kfree(dif->path); @@ -1016,10 +955,8 @@ static int __init erofs_module_init(void) sizeof(struct erofs_inode), 0, SLAB_RECLAIM_ACCOUNT, erofs_inode_init_once); - if (!erofs_inode_cachep) { - err = -ENOMEM; - goto icache_err; - } + if (!erofs_inode_cachep) + return -ENOMEM; err = erofs_init_shrinker(); if (err) @@ -1054,7 +991,6 @@ lzma_err: erofs_exit_shrinker(); shrinker_err: kmem_cache_destroy(erofs_inode_cachep); -icache_err: return err; } diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index 46627cb69abe..cc6fb9e98899 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -4,7 +4,6 @@ * https://www.huawei.com/ */ #include "internal.h" -#include <linux/pagevec.h> struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp) { @@ -33,22 +32,21 @@ void erofs_release_pages(struct page **pagepool) /* global shrink count (for all mounted EROFS instances) */ static atomic_long_t erofs_global_shrink_cnt; -static int erofs_workgroup_get(struct erofs_workgroup *grp) +static bool erofs_workgroup_get(struct erofs_workgroup *grp) { - int o; + if (lockref_get_not_zero(&grp->lockref)) + return true; -repeat: - o = erofs_wait_on_workgroup_freezed(grp); - if (o <= 0) - return -1; - - if (atomic_cmpxchg(&grp->refcount, o, o + 1) != o) - goto repeat; + spin_lock(&grp->lockref.lock); + if (__lockref_is_dead(&grp->lockref)) { + spin_unlock(&grp->lockref.lock); + return false; + } - /* decrease refcount paired by erofs_workgroup_put */ - if (o == 1) + if (!grp->lockref.count++) atomic_long_dec(&erofs_global_shrink_cnt); - return 0; + spin_unlock(&grp->lockref.lock); + return true; } struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, @@ -61,7 +59,7 @@ repeat: rcu_read_lock(); grp = xa_load(&sbi->managed_pslots, index); if (grp) { - if (erofs_workgroup_get(grp)) { + if (!erofs_workgroup_get(grp)) { /* prefer to relax rcu read side */ rcu_read_unlock(); goto repeat; @@ -80,11 +78,10 @@ struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, struct erofs_workgroup *pre; /* - * Bump up a reference count before making this visible - * to others for the XArray in order to avoid potential - * UAF without serialized by xa_lock. + * Bump up before making this visible to others for the XArray in order + * to avoid potential UAF without serialized by xa_lock. */ - atomic_inc(&grp->refcount); + lockref_get(&grp->lockref); repeat: xa_lock(&sbi->managed_pslots); @@ -93,13 +90,13 @@ repeat: if (pre) { if (xa_is_err(pre)) { pre = ERR_PTR(xa_err(pre)); - } else if (erofs_workgroup_get(pre)) { + } else if (!erofs_workgroup_get(pre)) { /* try to legitimize the current in-tree one */ xa_unlock(&sbi->managed_pslots); cond_resched(); goto repeat; } - atomic_dec(&grp->refcount); + lockref_put_return(&grp->lockref); grp = pre; } xa_unlock(&sbi->managed_pslots); @@ -112,38 +109,34 @@ static void __erofs_workgroup_free(struct erofs_workgroup *grp) erofs_workgroup_free_rcu(grp); } -int erofs_workgroup_put(struct erofs_workgroup *grp) +void erofs_workgroup_put(struct erofs_workgroup *grp) { - int count = atomic_dec_return(&grp->refcount); + if (lockref_put_or_lock(&grp->lockref)) + return; - if (count == 1) + DBG_BUGON(__lockref_is_dead(&grp->lockref)); + if (grp->lockref.count == 1) atomic_long_inc(&erofs_global_shrink_cnt); - else if (!count) - __erofs_workgroup_free(grp); - return count; + --grp->lockref.count; + spin_unlock(&grp->lockref.lock); } static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, struct erofs_workgroup *grp) { - /* - * If managed cache is on, refcount of workgroups - * themselves could be < 0 (freezed). In other words, - * there is no guarantee that all refcounts > 0. - */ - if (!erofs_workgroup_try_to_freeze(grp, 1)) - return false; + int free = false; + + spin_lock(&grp->lockref.lock); + if (grp->lockref.count) + goto out; /* - * Note that all cached pages should be unattached - * before deleted from the XArray. Otherwise some - * cached pages could be still attached to the orphan - * old workgroup when the new one is available in the tree. + * Note that all cached pages should be detached before deleted from + * the XArray. Otherwise some cached pages could be still attached to + * the orphan old workgroup when the new one is available in the tree. */ - if (erofs_try_to_free_all_cached_pages(sbi, grp)) { - erofs_workgroup_unfreeze(grp, 1); - return false; - } + if (erofs_try_to_free_all_cached_pages(sbi, grp)) + goto out; /* * It's impossible to fail after the workgroup is freezed, @@ -152,10 +145,13 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, */ DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp); - /* last refcount should be connected with its managed pslot. */ - erofs_workgroup_unfreeze(grp, 0); - __erofs_workgroup_free(grp); - return true; + lockref_mark_dead(&grp->lockref); + free = true; +out: + spin_unlock(&grp->lockref.lock); + if (free) + __erofs_workgroup_free(grp); + return free; } static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi, diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index bbfe7ce170d2..40178b6e0688 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -7,32 +7,27 @@ #include <linux/security.h> #include "xattr.h" -static inline erofs_blk_t erofs_xattr_blkaddr(struct super_block *sb, - unsigned int xattr_id) -{ - return EROFS_SB(sb)->xattr_blkaddr + - erofs_blknr(sb, xattr_id * sizeof(__u32)); -} - -static inline unsigned int erofs_xattr_blkoff(struct super_block *sb, - unsigned int xattr_id) -{ - return erofs_blkoff(sb, xattr_id * sizeof(__u32)); -} - -struct xattr_iter { +struct erofs_xattr_iter { struct super_block *sb; struct erofs_buf buf; + erofs_off_t pos; void *kaddr; - erofs_blk_t blkaddr; - unsigned int ofs; + char *buffer; + int buffer_size, buffer_ofs; + + /* getxattr */ + int index, infix_len; + struct qstr name; + + /* listxattr */ + struct dentry *dentry; }; static int erofs_init_inode_xattrs(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); - struct xattr_iter it; + struct erofs_xattr_iter it; unsigned int i; struct erofs_xattr_ibody_header *ih; struct super_block *sb = inode->i_sb; @@ -81,17 +76,17 @@ static int erofs_init_inode_xattrs(struct inode *inode) } it.buf = __EROFS_BUF_INITIALIZER; - it.blkaddr = erofs_blknr(sb, erofs_iloc(inode) + vi->inode_isize); - it.ofs = erofs_blkoff(sb, erofs_iloc(inode) + vi->inode_isize); + erofs_init_metabuf(&it.buf, sb); + it.pos = erofs_iloc(inode) + vi->inode_isize; /* read in shared xattr array (non-atomic, see kmalloc below) */ - it.kaddr = erofs_read_metabuf(&it.buf, sb, it.blkaddr, EROFS_KMAP); + it.kaddr = erofs_bread(&it.buf, erofs_blknr(sb, it.pos), EROFS_KMAP); if (IS_ERR(it.kaddr)) { ret = PTR_ERR(it.kaddr); goto out_unlock; } - ih = (struct erofs_xattr_ibody_header *)(it.kaddr + it.ofs); + ih = it.kaddr + erofs_blkoff(sb, it.pos); vi->xattr_shared_count = ih->h_shared_count; vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count, sizeof(uint), GFP_KERNEL); @@ -102,26 +97,20 @@ static int erofs_init_inode_xattrs(struct inode *inode) } /* let's skip ibody header */ - it.ofs += sizeof(struct erofs_xattr_ibody_header); + it.pos += sizeof(struct erofs_xattr_ibody_header); for (i = 0; i < vi->xattr_shared_count; ++i) { - if (it.ofs >= sb->s_blocksize) { - /* cannot be unaligned */ - DBG_BUGON(it.ofs != sb->s_blocksize); - - it.kaddr = erofs_read_metabuf(&it.buf, sb, ++it.blkaddr, - EROFS_KMAP); - if (IS_ERR(it.kaddr)) { - kfree(vi->xattr_shared_xattrs); - vi->xattr_shared_xattrs = NULL; - ret = PTR_ERR(it.kaddr); - goto out_unlock; - } - it.ofs = 0; + it.kaddr = erofs_bread(&it.buf, erofs_blknr(sb, it.pos), + EROFS_KMAP); + if (IS_ERR(it.kaddr)) { + kfree(vi->xattr_shared_xattrs); + vi->xattr_shared_xattrs = NULL; + ret = PTR_ERR(it.kaddr); + goto out_unlock; } - vi->xattr_shared_xattrs[i] = - le32_to_cpu(*(__le32 *)(it.kaddr + it.ofs)); - it.ofs += sizeof(__le32); + vi->xattr_shared_xattrs[i] = le32_to_cpu(*(__le32 *) + (it.kaddr + erofs_blkoff(sb, it.pos))); + it.pos += sizeof(__le32); } erofs_put_metabuf(&it.buf); @@ -134,287 +123,6 @@ out_unlock: return ret; } -/* - * the general idea for these return values is - * if 0 is returned, go on processing the current xattr; - * 1 (> 0) is returned, skip this round to process the next xattr; - * -err (< 0) is returned, an error (maybe ENOXATTR) occurred - * and need to be handled - */ -struct xattr_iter_handlers { - int (*entry)(struct xattr_iter *_it, struct erofs_xattr_entry *entry); - int (*name)(struct xattr_iter *_it, unsigned int processed, char *buf, - unsigned int len); - int (*alloc_buffer)(struct xattr_iter *_it, unsigned int value_sz); - void (*value)(struct xattr_iter *_it, unsigned int processed, char *buf, - unsigned int len); -}; - -static inline int xattr_iter_fixup(struct xattr_iter *it) -{ - if (it->ofs < it->sb->s_blocksize) - return 0; - - it->blkaddr += erofs_blknr(it->sb, it->ofs); - it->kaddr = erofs_read_metabuf(&it->buf, it->sb, it->blkaddr, - EROFS_KMAP); - if (IS_ERR(it->kaddr)) - return PTR_ERR(it->kaddr); - it->ofs = erofs_blkoff(it->sb, it->ofs); - return 0; -} - -static int inline_xattr_iter_begin(struct xattr_iter *it, - struct inode *inode) -{ - struct erofs_inode *const vi = EROFS_I(inode); - unsigned int xattr_header_sz, inline_xattr_ofs; - - xattr_header_sz = sizeof(struct erofs_xattr_ibody_header) + - sizeof(u32) * vi->xattr_shared_count; - if (xattr_header_sz >= vi->xattr_isize) { - DBG_BUGON(xattr_header_sz > vi->xattr_isize); - return -ENOATTR; - } - - inline_xattr_ofs = vi->inode_isize + xattr_header_sz; - - it->blkaddr = erofs_blknr(it->sb, erofs_iloc(inode) + inline_xattr_ofs); - it->ofs = erofs_blkoff(it->sb, erofs_iloc(inode) + inline_xattr_ofs); - it->kaddr = erofs_read_metabuf(&it->buf, inode->i_sb, it->blkaddr, - EROFS_KMAP); - if (IS_ERR(it->kaddr)) - return PTR_ERR(it->kaddr); - return vi->xattr_isize - xattr_header_sz; -} - -/* - * Regardless of success or failure, `xattr_foreach' will end up with - * `ofs' pointing to the next xattr item rather than an arbitrary position. - */ -static int xattr_foreach(struct xattr_iter *it, - const struct xattr_iter_handlers *op, - unsigned int *tlimit) -{ - struct erofs_xattr_entry entry; - unsigned int value_sz, processed, slice; - int err; - - /* 0. fixup blkaddr, ofs, ipage */ - err = xattr_iter_fixup(it); - if (err) - return err; - - /* - * 1. read xattr entry to the memory, - * since we do EROFS_XATTR_ALIGN - * therefore entry should be in the page - */ - entry = *(struct erofs_xattr_entry *)(it->kaddr + it->ofs); - if (tlimit) { - unsigned int entry_sz = erofs_xattr_entry_size(&entry); - - /* xattr on-disk corruption: xattr entry beyond xattr_isize */ - if (*tlimit < entry_sz) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - *tlimit -= entry_sz; - } - - it->ofs += sizeof(struct erofs_xattr_entry); - value_sz = le16_to_cpu(entry.e_value_size); - - /* handle entry */ - err = op->entry(it, &entry); - if (err) { - it->ofs += entry.e_name_len + value_sz; - goto out; - } - - /* 2. handle xattr name (ofs will finally be at the end of name) */ - processed = 0; - - while (processed < entry.e_name_len) { - if (it->ofs >= it->sb->s_blocksize) { - DBG_BUGON(it->ofs > it->sb->s_blocksize); - - err = xattr_iter_fixup(it); - if (err) - goto out; - it->ofs = 0; - } - - slice = min_t(unsigned int, it->sb->s_blocksize - it->ofs, - entry.e_name_len - processed); - - /* handle name */ - err = op->name(it, processed, it->kaddr + it->ofs, slice); - if (err) { - it->ofs += entry.e_name_len - processed + value_sz; - goto out; - } - - it->ofs += slice; - processed += slice; - } - - /* 3. handle xattr value */ - processed = 0; - - if (op->alloc_buffer) { - err = op->alloc_buffer(it, value_sz); - if (err) { - it->ofs += value_sz; - goto out; - } - } - - while (processed < value_sz) { - if (it->ofs >= it->sb->s_blocksize) { - DBG_BUGON(it->ofs > it->sb->s_blocksize); - - err = xattr_iter_fixup(it); - if (err) - goto out; - it->ofs = 0; - } - - slice = min_t(unsigned int, it->sb->s_blocksize - it->ofs, - value_sz - processed); - op->value(it, processed, it->kaddr + it->ofs, slice); - it->ofs += slice; - processed += slice; - } - -out: - /* xattrs should be 4-byte aligned (on-disk constraint) */ - it->ofs = EROFS_XATTR_ALIGN(it->ofs); - return err < 0 ? err : 0; -} - -struct getxattr_iter { - struct xattr_iter it; - - char *buffer; - int buffer_size, index, infix_len; - struct qstr name; -}; - -static int erofs_xattr_long_entrymatch(struct getxattr_iter *it, - struct erofs_xattr_entry *entry) -{ - struct erofs_sb_info *sbi = EROFS_SB(it->it.sb); - struct erofs_xattr_prefix_item *pf = sbi->xattr_prefixes + - (entry->e_name_index & EROFS_XATTR_LONG_PREFIX_MASK); - - if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count) - return -ENOATTR; - - if (it->index != pf->prefix->base_index || - it->name.len != entry->e_name_len + pf->infix_len) - return -ENOATTR; - - if (memcmp(it->name.name, pf->prefix->infix, pf->infix_len)) - return -ENOATTR; - - it->infix_len = pf->infix_len; - return 0; -} - -static int xattr_entrymatch(struct xattr_iter *_it, - struct erofs_xattr_entry *entry) -{ - struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); - - /* should also match the infix for long name prefixes */ - if (entry->e_name_index & EROFS_XATTR_LONG_PREFIX) - return erofs_xattr_long_entrymatch(it, entry); - - if (it->index != entry->e_name_index || - it->name.len != entry->e_name_len) - return -ENOATTR; - it->infix_len = 0; - return 0; -} - -static int xattr_namematch(struct xattr_iter *_it, - unsigned int processed, char *buf, unsigned int len) -{ - struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); - - if (memcmp(buf, it->name.name + it->infix_len + processed, len)) - return -ENOATTR; - return 0; -} - -static int xattr_checkbuffer(struct xattr_iter *_it, - unsigned int value_sz) -{ - struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); - int err = it->buffer_size < value_sz ? -ERANGE : 0; - - it->buffer_size = value_sz; - return !it->buffer ? 1 : err; -} - -static void xattr_copyvalue(struct xattr_iter *_it, - unsigned int processed, - char *buf, unsigned int len) -{ - struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); - - memcpy(it->buffer + processed, buf, len); -} - -static const struct xattr_iter_handlers find_xattr_handlers = { - .entry = xattr_entrymatch, - .name = xattr_namematch, - .alloc_buffer = xattr_checkbuffer, - .value = xattr_copyvalue -}; - -static int inline_getxattr(struct inode *inode, struct getxattr_iter *it) -{ - int ret; - unsigned int remaining; - - ret = inline_xattr_iter_begin(&it->it, inode); - if (ret < 0) - return ret; - - remaining = ret; - while (remaining) { - ret = xattr_foreach(&it->it, &find_xattr_handlers, &remaining); - if (ret != -ENOATTR) - break; - } - return ret ? ret : it->buffer_size; -} - -static int shared_getxattr(struct inode *inode, struct getxattr_iter *it) -{ - struct erofs_inode *const vi = EROFS_I(inode); - struct super_block *const sb = it->it.sb; - unsigned int i, xsid; - int ret = -ENOATTR; - - for (i = 0; i < vi->xattr_shared_count; ++i) { - xsid = vi->xattr_shared_xattrs[i]; - it->it.blkaddr = erofs_xattr_blkaddr(sb, xsid); - it->it.ofs = erofs_xattr_blkoff(sb, xsid); - it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, - it->it.blkaddr, EROFS_KMAP); - if (IS_ERR(it->it.kaddr)) - return PTR_ERR(it->it.kaddr); - - ret = xattr_foreach(&it->it, &find_xattr_handlers, NULL); - if (ret != -ENOATTR) - break; - } - return ret ? ret : it->buffer_size; -} - static bool erofs_xattr_user_list(struct dentry *dentry) { return test_opt(&EROFS_SB(dentry->d_sb)->opt, XATTR_USER); @@ -425,39 +133,6 @@ static bool erofs_xattr_trusted_list(struct dentry *dentry) return capable(CAP_SYS_ADMIN); } -int erofs_getxattr(struct inode *inode, int index, - const char *name, - void *buffer, size_t buffer_size) -{ - int ret; - struct getxattr_iter it; - - if (!name) - return -EINVAL; - - ret = erofs_init_inode_xattrs(inode); - if (ret) - return ret; - - it.index = index; - it.name.len = strlen(name); - if (it.name.len > EROFS_NAME_LEN) - return -ERANGE; - - it.it.buf = __EROFS_BUF_INITIALIZER; - it.name.name = name; - - it.buffer = buffer; - it.buffer_size = buffer_size; - - it.it.sb = inode->i_sb; - ret = inline_getxattr(inode, &it); - if (ret == -ENOATTR) - ret = shared_getxattr(inode, &it); - erofs_put_metabuf(&it.it.buf); - return ret; -} - static int erofs_xattr_generic_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *buffer, size_t size) @@ -500,30 +175,49 @@ const struct xattr_handler *erofs_xattr_handlers[] = { NULL, }; -struct listxattr_iter { - struct xattr_iter it; - - struct dentry *dentry; - char *buffer; - int buffer_size, buffer_ofs; -}; +static int erofs_xattr_copy_to_buffer(struct erofs_xattr_iter *it, + unsigned int len) +{ + unsigned int slice, processed; + struct super_block *sb = it->sb; + void *src; + + for (processed = 0; processed < len; processed += slice) { + it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos), + EROFS_KMAP); + if (IS_ERR(it->kaddr)) + return PTR_ERR(it->kaddr); + + src = it->kaddr + erofs_blkoff(sb, it->pos); + slice = min_t(unsigned int, sb->s_blocksize - + erofs_blkoff(sb, it->pos), len - processed); + memcpy(it->buffer + it->buffer_ofs, src, slice); + it->buffer_ofs += slice; + it->pos += slice; + } + return 0; +} -static int xattr_entrylist(struct xattr_iter *_it, - struct erofs_xattr_entry *entry) +static int erofs_listxattr_foreach(struct erofs_xattr_iter *it) { - struct listxattr_iter *it = - container_of(_it, struct listxattr_iter, it); - unsigned int base_index = entry->e_name_index; - unsigned int prefix_len, infix_len = 0; + struct erofs_xattr_entry entry; + unsigned int base_index, name_total, prefix_len, infix_len = 0; const char *prefix, *infix = NULL; + int err; + + /* 1. handle xattr entry */ + entry = *(struct erofs_xattr_entry *) + (it->kaddr + erofs_blkoff(it->sb, it->pos)); + it->pos += sizeof(struct erofs_xattr_entry); - if (entry->e_name_index & EROFS_XATTR_LONG_PREFIX) { - struct erofs_sb_info *sbi = EROFS_SB(_it->sb); + base_index = entry.e_name_index; + if (entry.e_name_index & EROFS_XATTR_LONG_PREFIX) { + struct erofs_sb_info *sbi = EROFS_SB(it->sb); struct erofs_xattr_prefix_item *pf = sbi->xattr_prefixes + - (entry->e_name_index & EROFS_XATTR_LONG_PREFIX_MASK); + (entry.e_name_index & EROFS_XATTR_LONG_PREFIX_MASK); if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count) - return 1; + return 0; infix = pf->prefix->infix; infix_len = pf->infix_len; base_index = pf->prefix->base_index; @@ -531,120 +225,228 @@ static int xattr_entrylist(struct xattr_iter *_it, prefix = erofs_xattr_prefix(base_index, it->dentry); if (!prefix) - return 1; + return 0; prefix_len = strlen(prefix); + name_total = prefix_len + infix_len + entry.e_name_len + 1; if (!it->buffer) { - it->buffer_ofs += prefix_len + infix_len + - entry->e_name_len + 1; - return 1; + it->buffer_ofs += name_total; + return 0; } - if (it->buffer_ofs + prefix_len + infix_len + - + entry->e_name_len + 1 > it->buffer_size) + if (it->buffer_ofs + name_total > it->buffer_size) return -ERANGE; memcpy(it->buffer + it->buffer_ofs, prefix, prefix_len); memcpy(it->buffer + it->buffer_ofs + prefix_len, infix, infix_len); it->buffer_ofs += prefix_len + infix_len; - return 0; -} -static int xattr_namelist(struct xattr_iter *_it, - unsigned int processed, char *buf, unsigned int len) -{ - struct listxattr_iter *it = - container_of(_it, struct listxattr_iter, it); + /* 2. handle xattr name */ + err = erofs_xattr_copy_to_buffer(it, entry.e_name_len); + if (err) + return err; - memcpy(it->buffer + it->buffer_ofs, buf, len); - it->buffer_ofs += len; + it->buffer[it->buffer_ofs++] = '\0'; return 0; } -static int xattr_skipvalue(struct xattr_iter *_it, - unsigned int value_sz) +static int erofs_getxattr_foreach(struct erofs_xattr_iter *it) { - struct listxattr_iter *it = - container_of(_it, struct listxattr_iter, it); + struct super_block *sb = it->sb; + struct erofs_xattr_entry entry; + unsigned int slice, processed, value_sz; - it->buffer[it->buffer_ofs++] = '\0'; - return 1; -} + /* 1. handle xattr entry */ + entry = *(struct erofs_xattr_entry *) + (it->kaddr + erofs_blkoff(sb, it->pos)); + it->pos += sizeof(struct erofs_xattr_entry); + value_sz = le16_to_cpu(entry.e_value_size); -static const struct xattr_iter_handlers list_xattr_handlers = { - .entry = xattr_entrylist, - .name = xattr_namelist, - .alloc_buffer = xattr_skipvalue, - .value = NULL -}; + /* should also match the infix for long name prefixes */ + if (entry.e_name_index & EROFS_XATTR_LONG_PREFIX) { + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_xattr_prefix_item *pf = sbi->xattr_prefixes + + (entry.e_name_index & EROFS_XATTR_LONG_PREFIX_MASK); + + if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count) + return -ENOATTR; + + if (it->index != pf->prefix->base_index || + it->name.len != entry.e_name_len + pf->infix_len) + return -ENOATTR; + + if (memcmp(it->name.name, pf->prefix->infix, pf->infix_len)) + return -ENOATTR; + + it->infix_len = pf->infix_len; + } else { + if (it->index != entry.e_name_index || + it->name.len != entry.e_name_len) + return -ENOATTR; -static int inline_listxattr(struct listxattr_iter *it) + it->infix_len = 0; + } + + /* 2. handle xattr name */ + for (processed = 0; processed < entry.e_name_len; processed += slice) { + it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos), + EROFS_KMAP); + if (IS_ERR(it->kaddr)) + return PTR_ERR(it->kaddr); + + slice = min_t(unsigned int, + sb->s_blocksize - erofs_blkoff(sb, it->pos), + entry.e_name_len - processed); + if (memcmp(it->name.name + it->infix_len + processed, + it->kaddr + erofs_blkoff(sb, it->pos), slice)) + return -ENOATTR; + it->pos += slice; + } + + /* 3. handle xattr value */ + if (!it->buffer) { + it->buffer_ofs = value_sz; + return 0; + } + + if (it->buffer_size < value_sz) + return -ERANGE; + + return erofs_xattr_copy_to_buffer(it, value_sz); +} + +static int erofs_xattr_iter_inline(struct erofs_xattr_iter *it, + struct inode *inode, bool getxattr) { + struct erofs_inode *const vi = EROFS_I(inode); + unsigned int xattr_header_sz, remaining, entry_sz; + erofs_off_t next_pos; int ret; - unsigned int remaining; - ret = inline_xattr_iter_begin(&it->it, d_inode(it->dentry)); - if (ret < 0) - return ret; + xattr_header_sz = sizeof(struct erofs_xattr_ibody_header) + + sizeof(u32) * vi->xattr_shared_count; + if (xattr_header_sz >= vi->xattr_isize) { + DBG_BUGON(xattr_header_sz > vi->xattr_isize); + return -ENOATTR; + } + + remaining = vi->xattr_isize - xattr_header_sz; + it->pos = erofs_iloc(inode) + vi->inode_isize + xattr_header_sz; - remaining = ret; while (remaining) { - ret = xattr_foreach(&it->it, &list_xattr_handlers, &remaining); - if (ret) + it->kaddr = erofs_bread(&it->buf, erofs_blknr(it->sb, it->pos), + EROFS_KMAP); + if (IS_ERR(it->kaddr)) + return PTR_ERR(it->kaddr); + + entry_sz = erofs_xattr_entry_size(it->kaddr + + erofs_blkoff(it->sb, it->pos)); + /* xattr on-disk corruption: xattr entry beyond xattr_isize */ + if (remaining < entry_sz) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + remaining -= entry_sz; + next_pos = it->pos + entry_sz; + + if (getxattr) + ret = erofs_getxattr_foreach(it); + else + ret = erofs_listxattr_foreach(it); + if ((getxattr && ret != -ENOATTR) || (!getxattr && ret)) break; + + it->pos = next_pos; } - return ret ? ret : it->buffer_ofs; + return ret; } -static int shared_listxattr(struct listxattr_iter *it) +static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it, + struct inode *inode, bool getxattr) { - struct inode *const inode = d_inode(it->dentry); struct erofs_inode *const vi = EROFS_I(inode); - struct super_block *const sb = it->it.sb; - unsigned int i, xsid; - int ret = 0; + struct super_block *const sb = it->sb; + struct erofs_sb_info *sbi = EROFS_SB(sb); + unsigned int i; + int ret = -ENOATTR; for (i = 0; i < vi->xattr_shared_count; ++i) { - xsid = vi->xattr_shared_xattrs[i]; - it->it.blkaddr = erofs_xattr_blkaddr(sb, xsid); - it->it.ofs = erofs_xattr_blkoff(sb, xsid); - it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, - it->it.blkaddr, EROFS_KMAP); - if (IS_ERR(it->it.kaddr)) - return PTR_ERR(it->it.kaddr); - - ret = xattr_foreach(&it->it, &list_xattr_handlers, NULL); - if (ret) + it->pos = erofs_pos(sb, sbi->xattr_blkaddr) + + vi->xattr_shared_xattrs[i] * sizeof(__le32); + it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos), + EROFS_KMAP); + if (IS_ERR(it->kaddr)) + return PTR_ERR(it->kaddr); + + if (getxattr) + ret = erofs_getxattr_foreach(it); + else + ret = erofs_listxattr_foreach(it); + if ((getxattr && ret != -ENOATTR) || (!getxattr && ret)) break; } - return ret ? ret : it->buffer_ofs; + return ret; +} + +int erofs_getxattr(struct inode *inode, int index, const char *name, + void *buffer, size_t buffer_size) +{ + int ret; + struct erofs_xattr_iter it; + + if (!name) + return -EINVAL; + + ret = erofs_init_inode_xattrs(inode); + if (ret) + return ret; + + it.index = index; + it.name = (struct qstr)QSTR_INIT(name, strlen(name)); + if (it.name.len > EROFS_NAME_LEN) + return -ERANGE; + + it.sb = inode->i_sb; + it.buf = __EROFS_BUF_INITIALIZER; + erofs_init_metabuf(&it.buf, it.sb); + it.buffer = buffer; + it.buffer_size = buffer_size; + it.buffer_ofs = 0; + + ret = erofs_xattr_iter_inline(&it, inode, true); + if (ret == -ENOATTR) + ret = erofs_xattr_iter_shared(&it, inode, true); + erofs_put_metabuf(&it.buf); + return ret ? ret : it.buffer_ofs; } -ssize_t erofs_listxattr(struct dentry *dentry, - char *buffer, size_t buffer_size) +ssize_t erofs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { int ret; - struct listxattr_iter it; + struct erofs_xattr_iter it; + struct inode *inode = d_inode(dentry); - ret = erofs_init_inode_xattrs(d_inode(dentry)); + ret = erofs_init_inode_xattrs(inode); if (ret == -ENOATTR) return 0; if (ret) return ret; - it.it.buf = __EROFS_BUF_INITIALIZER; + it.sb = dentry->d_sb; + it.buf = __EROFS_BUF_INITIALIZER; + erofs_init_metabuf(&it.buf, it.sb); it.dentry = dentry; it.buffer = buffer; it.buffer_size = buffer_size; it.buffer_ofs = 0; - it.it.sb = dentry->d_sb; - - ret = inline_listxattr(&it); - if (ret >= 0 || ret == -ENOATTR) - ret = shared_listxattr(&it); - erofs_put_metabuf(&it.it.buf); - return ret; + ret = erofs_xattr_iter_inline(&it, inode, false); + if (!ret || ret == -ENOATTR) + ret = erofs_xattr_iter_shared(&it, inode, false); + if (ret == -ENOATTR) + ret = 0; + erofs_put_metabuf(&it.buf); + return ret ? ret : it.buffer_ofs; } void erofs_xattr_prefixes_cleanup(struct super_block *sb) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 160b3da43aec..5f1890e309c6 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -5,7 +5,6 @@ * Copyright (C) 2022 Alibaba Cloud */ #include "compress.h" -#include <linux/prefetch.h> #include <linux/psi.h> #include <linux/cpuhotplug.h> #include <trace/events/erofs.h> @@ -92,13 +91,8 @@ struct z_erofs_pcluster { struct z_erofs_bvec compressed_bvecs[]; }; -/* let's avoid the valid 32-bit kernel addresses */ - -/* the chained workgroup has't submitted io (still open) */ -#define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE) -/* the chained workgroup has already submitted io */ -#define Z_EROFS_PCLUSTER_TAIL_CLOSED ((void *)0x5F0EDEAD) - +/* the end of a chain of pclusters */ +#define Z_EROFS_PCLUSTER_TAIL ((void *) 0x700 + POISON_POINTER_DELTA) #define Z_EROFS_PCLUSTER_NIL (NULL) struct z_erofs_decompressqueue { @@ -241,14 +235,20 @@ static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter, static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, struct z_erofs_bvec *bvec, - struct page **candidate_bvpage) + struct page **candidate_bvpage, + struct page **pagepool) { - if (iter->cur == iter->nr) { - if (!*candidate_bvpage) - return -EAGAIN; - + if (iter->cur >= iter->nr) { + struct page *nextpage = *candidate_bvpage; + + if (!nextpage) { + nextpage = erofs_allocpage(pagepool, GFP_NOFS); + if (!nextpage) + return -ENOMEM; + set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE); + } DBG_BUGON(iter->bvset->nextpage); - iter->bvset->nextpage = *candidate_bvpage; + iter->bvset->nextpage = nextpage; z_erofs_bvset_flip(iter); iter->bvset->nextpage = NULL; @@ -500,20 +500,6 @@ out_error_pcluster_pool: enum z_erofs_pclustermode { Z_EROFS_PCLUSTER_INFLIGHT, /* - * The current pclusters was the tail of an exist chain, in addition - * that the previous processed chained pclusters are all decided to - * be hooked up to it. - * A new chain will be created for the remaining pclusters which are - * not processed yet, so different from Z_EROFS_PCLUSTER_FOLLOWED, - * the next pcluster cannot reuse the whole page safely for inplace I/O - * in the following scenario: - * ________________________________________________________________ - * | tail (partial) page | head (partial) page | - * | (belongs to the next pcl) | (belongs to the current pcl) | - * |_______PCLUSTER_FOLLOWED______|________PCLUSTER_HOOKED__________| - */ - Z_EROFS_PCLUSTER_HOOKED, - /* * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it * could be dispatched into bypass queue later due to uptodated managed * pages. All related online pages cannot be reused for inplace I/O (or @@ -530,8 +516,8 @@ enum z_erofs_pclustermode { * ________________________________________________________________ * | tail (partial) page | head (partial) page | * | (of the current cl) | (of the previous collection) | - * | PCLUSTER_FOLLOWED or | | - * |_____PCLUSTER_HOOKED__|___________PCLUSTER_FOLLOWED____________| + * | | | + * |__PCLUSTER_FOLLOWED___|___________PCLUSTER_FOLLOWED____________| * * [ (*) the above page can be used as inplace I/O. ] */ @@ -543,12 +529,12 @@ struct z_erofs_decompress_frontend { struct erofs_map_blocks map; struct z_erofs_bvec_iter biter; + struct page *pagepool; struct page *candidate_bvpage; - struct z_erofs_pcluster *pcl, *tailpcl; + struct z_erofs_pcluster *pcl; z_erofs_next_pcluster_t owned_head; enum z_erofs_pclustermode mode; - bool readahead; /* used for applying cache strategy on the fly */ bool backmost; erofs_off_t headoffset; @@ -578,8 +564,7 @@ static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) return false; } -static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, - struct page **pagepool) +static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) { struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode)); struct z_erofs_pcluster *pcl = fe->pcl; @@ -620,7 +605,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, * succeeds or fallback to in-place I/O instead * to avoid any direct reclaim. */ - newpage = erofs_allocpage(pagepool, gfp); + newpage = erofs_allocpage(&fe->pagepool, gfp); if (!newpage) continue; set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); @@ -633,7 +618,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, if (page) put_page(page); else if (newpage) - erofs_pagepool_add(pagepool, newpage); + erofs_pagepool_add(&fe->pagepool, newpage); } /* @@ -654,7 +639,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); /* - * refcount of workgroup is now freezed as 1, + * refcount of workgroup is now freezed as 0, * therefore no need to worry about available decompression users. */ for (i = 0; i < pcl->pclusterpages; ++i) { @@ -678,29 +663,73 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, return 0; } -int erofs_try_to_free_cached_page(struct page *page) +static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) { - struct z_erofs_pcluster *const pcl = (void *)page_private(page); - int ret, i; + struct z_erofs_pcluster *pcl = folio_get_private(folio); + bool ret; + int i; - if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1)) - return 0; + if (!folio_test_private(folio)) + return true; + + ret = false; + spin_lock(&pcl->obj.lockref.lock); + if (pcl->obj.lockref.count > 0) + goto out; - ret = 0; DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); for (i = 0; i < pcl->pclusterpages; ++i) { - if (pcl->compressed_bvecs[i].page == page) { + if (pcl->compressed_bvecs[i].page == &folio->page) { WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); - ret = 1; + ret = true; break; } } - erofs_workgroup_unfreeze(&pcl->obj, 1); if (ret) - detach_page_private(page); + folio_detach_private(folio); +out: + spin_unlock(&pcl->obj.lockref.lock); return ret; } +/* + * It will be called only on inode eviction. In case that there are still some + * decompression requests in progress, wait with rescheduling for a bit here. + * An extra lock could be introduced instead but it seems unnecessary. + */ +static void z_erofs_cache_invalidate_folio(struct folio *folio, + size_t offset, size_t length) +{ + const size_t stop = length + offset; + + /* Check for potential overflow in debug mode */ + DBG_BUGON(stop > folio_size(folio) || stop < length); + + if (offset == 0 && stop == folio_size(folio)) + while (!z_erofs_cache_release_folio(folio, GFP_NOFS)) + cond_resched(); +} + +static const struct address_space_operations z_erofs_cache_aops = { + .release_folio = z_erofs_cache_release_folio, + .invalidate_folio = z_erofs_cache_invalidate_folio, +}; + +int erofs_init_managed_cache(struct super_block *sb) +{ + struct inode *const inode = new_inode(sb); + + if (!inode) + return -ENOMEM; + + set_nlink(inode, 1); + inode->i_size = OFFSET_MAX; + inode->i_mapping->a_ops = &z_erofs_cache_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + EROFS_SB(sb)->managed_cache = inode; + return 0; +} + static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, struct z_erofs_bvec *bvec) { @@ -731,7 +760,8 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, !fe->candidate_bvpage) fe->candidate_bvpage = bvec->page; } - ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage); + ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage, + &fe->pagepool); fe->pcl->vcnt += (ret >= 0); return ret; } @@ -750,19 +780,7 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) return; } - /* - * type 2, link to the end of an existing open chain, be careful - * that its submission is controlled by the original attached chain. - */ - if (*owned_head != &pcl->next && pcl != f->tailpcl && - cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, - *owned_head) == Z_EROFS_PCLUSTER_TAIL) { - *owned_head = Z_EROFS_PCLUSTER_TAIL; - f->mode = Z_EROFS_PCLUSTER_HOOKED; - f->tailpcl = NULL; - return; - } - /* type 3, it belongs to a chain, but it isn't the end of the chain */ + /* type 2, it belongs to an ongoing chain */ f->mode = Z_EROFS_PCLUSTER_INFLIGHT; } @@ -786,7 +804,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) if (IS_ERR(pcl)) return PTR_ERR(pcl); - atomic_set(&pcl->obj.refcount, 1); + spin_lock_init(&pcl->obj.lockref.lock); pcl->algorithmformat = map->m_algorithmformat; pcl->length = 0; pcl->partial = true; @@ -823,9 +841,6 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) goto err_out; } } - /* used to check tail merging loop due to corrupted images */ - if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) - fe->tailpcl = pcl; fe->owned_head = &pcl->next; fe->pcl = pcl; return 0; @@ -846,7 +861,6 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */ DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); - DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); if (!(map->m_flags & EROFS_MAP_META)) { grp = erofs_find_workgroup(fe->inode->i_sb, @@ -865,10 +879,6 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) if (ret == -EEXIST) { mutex_lock(&fe->pcl->lock); - /* used to check tail merging loop due to corrupted images */ - if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) - fe->tailpcl = fe->pcl; - z_erofs_try_to_claim_pcluster(fe); } else if (ret) { return ret; @@ -908,10 +918,8 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) z_erofs_bvec_iter_end(&fe->biter); mutex_unlock(&pcl->lock); - if (fe->candidate_bvpage) { - DBG_BUGON(z_erofs_is_shortlived_page(fe->candidate_bvpage)); + if (fe->candidate_bvpage) fe->candidate_bvpage = NULL; - } /* * if all pending pages are added, don't hold its reference @@ -958,7 +966,7 @@ static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, } static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, - struct page *page, struct page **pagepool) + struct page *page) { struct inode *const inode = fe->inode; struct erofs_map_blocks *const map = &fe->map; @@ -1016,7 +1024,7 @@ repeat: fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } else { /* bind cache first when cached decompression is preferred */ - z_erofs_bind_cache(fe, pagepool); + z_erofs_bind_cache(fe); } hitted: /* @@ -1025,8 +1033,7 @@ hitted: * those chains are handled asynchronously thus the page cannot be used * for inplace I/O or bvpage (should be processed in a strict order.) */ - tight &= (fe->mode >= Z_EROFS_PCLUSTER_HOOKED && - fe->mode != Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); + tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); cur = end - min_t(unsigned int, offset + end - map->m_la, end); if (!(map->m_flags & EROFS_MAP_MAPPED)) { @@ -1056,24 +1063,13 @@ hitted: if (cur) tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); -retry: err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) { .page = page, .offset = offset - map->m_la, .end = end, }), exclusive); - /* should allocate an additional short-lived page for bvset */ - if (err == -EAGAIN && !fe->candidate_bvpage) { - fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL); - set_page_private(fe->candidate_bvpage, - Z_EROFS_SHORTLIVED_PAGE); - goto retry; - } - - if (err) { - DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage); + if (err) goto out; - } z_erofs_onlinepage_split(page); /* bump up the number of spiltted parts of a page */ @@ -1104,7 +1100,7 @@ out: return err; } -static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi, +static bool z_erofs_is_sync_decompress(struct erofs_sb_info *sbi, unsigned int readahead_pages) { /* auto: enable for read_folio, disable for readahead */ @@ -1283,6 +1279,8 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, struct erofs_sb_info *const sbi = EROFS_SB(be->sb); struct z_erofs_pcluster *pcl = be->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); + const struct z_erofs_decompressor *decompressor = + &erofs_decompressors[pcl->algorithmformat]; unsigned int i, inputsize; int err2; struct page *page; @@ -1326,7 +1324,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, else inputsize = pclusterpages * PAGE_SIZE; - err = z_erofs_decompress(&(struct z_erofs_decompress_req) { + err = decompressor->decompress(&(struct z_erofs_decompress_req) { .sb = be->sb, .in = be->compressed_pages, .out = be->decompressed_pages, @@ -1404,10 +1402,7 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, }; z_erofs_next_pcluster_t owned = io->head; - while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) { - /* impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */ - DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL); - /* impossible that 'owned' equals Z_EROFS_PCLUSTER_NIL */ + while (owned != Z_EROFS_PCLUSTER_TAIL) { DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL); be.pcl = container_of(owned, struct z_erofs_pcluster, next); @@ -1424,7 +1419,7 @@ static void z_erofs_decompressqueue_work(struct work_struct *work) container_of(work, struct z_erofs_decompressqueue, u.work); struct page *pagepool = NULL; - DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED); + DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL); z_erofs_decompress_queue(bgq, &pagepool); erofs_release_pages(&pagepool); kvfree(bgq); @@ -1452,7 +1447,7 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, if (atomic_add_return(bios, &io->pending_bios)) return; /* Use (kthread_)work and sync decompression for atomic contexts only */ - if (in_atomic() || irqs_disabled()) { + if (!in_task() || irqs_disabled() || rcu_read_lock_any_held()) { #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD struct kthread_worker *worker; @@ -1612,7 +1607,7 @@ fg_out: q->sync = true; } q->sb = sb; - q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED; + q->head = Z_EROFS_PCLUSTER_TAIL; return q; } @@ -1630,11 +1625,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT]; z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS]; - DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); - if (owned_head == Z_EROFS_PCLUSTER_TAIL) - owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED; - - WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED); + WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL); WRITE_ONCE(*submit_qtail, owned_head); WRITE_ONCE(*bypass_qtail, &pcl->next); @@ -1668,9 +1659,8 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) } static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, - struct page **pagepool, struct z_erofs_decompressqueue *fgq, - bool *force_fg) + bool *force_fg, bool readahead) { struct super_block *sb = f->inode->i_sb; struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); @@ -1705,15 +1695,10 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, unsigned int i = 0; bool bypass = true; - /* no possible 'owned_head' equals the following */ - DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL); - pcl = container_of(owned_head, struct z_erofs_pcluster, next); + owned_head = READ_ONCE(pcl->next); - /* close the main owned chain at first */ - owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, - Z_EROFS_PCLUSTER_TAIL_CLOSED); if (z_erofs_is_inline_pcluster(pcl)) { move_to_bypass_jobqueue(pcl, qtail, owned_head); continue; @@ -1731,8 +1716,8 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, do { struct page *page; - page = pickup_page_for_submission(pcl, i++, pagepool, - mc); + page = pickup_page_for_submission(pcl, i++, + &f->pagepool, mc); if (!page) continue; @@ -1761,7 +1746,7 @@ submit_bio_retry: bio->bi_iter.bi_sector = (sector_t)cur << (sb->s_blocksize_bits - 9); bio->bi_private = q[JQ_SUBMIT]; - if (f->readahead) + if (readahead) bio->bi_opf |= REQ_RAHEAD; ++nr_bios; } @@ -1797,16 +1782,16 @@ submit_bio_retry: } static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, - struct page **pagepool, bool force_fg) + bool force_fg, bool ra) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) return; - z_erofs_submit_queue(f, pagepool, io, &force_fg); + z_erofs_submit_queue(f, io, &force_fg, ra); /* handle bypass queue (no i/o pclusters) immediately */ - z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); + z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool); if (!force_fg) return; @@ -1815,7 +1800,7 @@ static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, wait_for_completion_io(&io[JQ_SUBMIT].u.done); /* handle synchronous decompress queue in the caller context */ - z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool); + z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool); } /* @@ -1823,29 +1808,28 @@ static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, * approximate readmore strategies as a start. */ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, - struct readahead_control *rac, - erofs_off_t end, - struct page **pagepool, - bool backmost) + struct readahead_control *rac, bool backmost) { struct inode *inode = f->inode; struct erofs_map_blocks *map = &f->map; - erofs_off_t cur; + erofs_off_t cur, end, headoffset = f->headoffset; int err; if (backmost) { + if (rac) + end = headoffset + readahead_length(rac) - 1; + else + end = headoffset + PAGE_SIZE - 1; map->m_la = end; err = z_erofs_map_blocks_iter(inode, map, EROFS_GET_BLOCKS_READMORE); if (err) return; - /* expend ra for the trailing edge if readahead */ + /* expand ra for the trailing edge if readahead */ if (rac) { - loff_t newstart = readahead_pos(rac); - cur = round_up(map->m_la + map->m_llen, PAGE_SIZE); - readahead_expand(rac, newstart, cur - newstart); + readahead_expand(rac, headoffset, cur - headoffset); return; } end = round_up(end, PAGE_SIZE); @@ -1866,7 +1850,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, if (PageUptodate(page)) { unlock_page(page); } else { - err = z_erofs_do_read_page(f, page, pagepool); + err = z_erofs_do_read_page(f, page); if (err) erofs_err(inode->i_sb, "readmore error at page %lu @ nid %llu", @@ -1887,28 +1871,24 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) struct inode *const inode = page->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - struct page *pagepool = NULL; int err; trace_erofs_readpage(page, false); f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; - z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1, - &pagepool, true); - err = z_erofs_do_read_page(&f, page, &pagepool); - z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false); - + z_erofs_pcluster_readmore(&f, NULL, true); + err = z_erofs_do_read_page(&f, page); + z_erofs_pcluster_readmore(&f, NULL, false); (void)z_erofs_collector_end(&f); /* if some compressed cluster ready, need submit them anyway */ - z_erofs_runqueue(&f, &pagepool, - z_erofs_get_sync_decompress_policy(sbi, 0)); + z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false); if (err) erofs_err(inode->i_sb, "failed to read, err [%d]", err); erofs_put_metabuf(&f.map.buf); - erofs_release_pages(&pagepool); + erofs_release_pages(&f.pagepool); return err; } @@ -1917,14 +1897,12 @@ static void z_erofs_readahead(struct readahead_control *rac) struct inode *const inode = rac->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - struct page *pagepool = NULL, *head = NULL, *page; + struct page *head = NULL, *page; unsigned int nr_pages; - f.readahead = true; f.headoffset = readahead_pos(rac); - z_erofs_pcluster_readmore(&f, rac, f.headoffset + - readahead_length(rac) - 1, &pagepool, true); + z_erofs_pcluster_readmore(&f, rac, true); nr_pages = readahead_count(rac); trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); @@ -1940,20 +1918,19 @@ static void z_erofs_readahead(struct readahead_control *rac) /* traversal in reverse order */ head = (void *)page_private(page); - err = z_erofs_do_read_page(&f, page, &pagepool); + err = z_erofs_do_read_page(&f, page); if (err) erofs_err(inode->i_sb, "readahead error at page %lu @ nid %llu", page->index, EROFS_I(inode)->nid); put_page(page); } - z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false); + z_erofs_pcluster_readmore(&f, rac, false); (void)z_erofs_collector_end(&f); - z_erofs_runqueue(&f, &pagepool, - z_erofs_get_sync_decompress_policy(sbi, nr_pages)); + z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_pages), true); erofs_put_metabuf(&f.map.buf); - erofs_release_pages(&pagepool); + erofs_release_pages(&f.pagepool); } const struct address_space_operations z_erofs_aops = { diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index d37c5c89c728..1909ddafd9c7 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -22,8 +22,8 @@ struct z_erofs_maprecorder { bool partialref; }; -static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, - unsigned long lcn) +static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, + unsigned long lcn) { struct inode *const inode = m->inode; struct erofs_inode *const vi = EROFS_I(inode); @@ -129,7 +129,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, u8 *in, type; bool big_pcluster; - if (1 << amortizedshift == 4) + if (1 << amortizedshift == 4 && lclusterbits <= 14) vcnt = 2; else if (1 << amortizedshift == 2 && lclusterbits == 12) vcnt = 16; @@ -226,12 +226,11 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, return 0; } -static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, - unsigned long lcn, bool lookahead) +static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, + unsigned long lcn, bool lookahead) { struct inode *const inode = m->inode; struct erofs_inode *const vi = EROFS_I(inode); - const unsigned int lclusterbits = vi->z_logical_clusterbits; const erofs_off_t ebase = sizeof(struct z_erofs_map_header) + ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); unsigned int totalidx = erofs_iblks(inode); @@ -239,9 +238,6 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, unsigned int amortizedshift; erofs_off_t pos; - if (lclusterbits != 12) - return -EOPNOTSUPP; - if (lcn >= totalidx) return -EINVAL; @@ -281,23 +277,23 @@ out: return unpack_compacted_index(m, amortizedshift, pos, lookahead); } -static int z_erofs_load_cluster_from_disk(struct z_erofs_maprecorder *m, - unsigned int lcn, bool lookahead) +static int z_erofs_load_lcluster_from_disk(struct z_erofs_maprecorder *m, + unsigned int lcn, bool lookahead) { - const unsigned int datamode = EROFS_I(m->inode)->datalayout; - - if (datamode == EROFS_INODE_COMPRESSED_FULL) - return legacy_load_cluster_from_disk(m, lcn); - - if (datamode == EROFS_INODE_COMPRESSED_COMPACT) - return compacted_load_cluster_from_disk(m, lcn, lookahead); - - return -EINVAL; + switch (EROFS_I(m->inode)->datalayout) { + case EROFS_INODE_COMPRESSED_FULL: + return z_erofs_load_full_lcluster(m, lcn); + case EROFS_INODE_COMPRESSED_COMPACT: + return z_erofs_load_compact_lcluster(m, lcn, lookahead); + default: + return -EINVAL; + } } static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, unsigned int lookback_distance) { + struct super_block *sb = m->inode->i_sb; struct erofs_inode *const vi = EROFS_I(m->inode); const unsigned int lclusterbits = vi->z_logical_clusterbits; @@ -305,21 +301,15 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, unsigned long lcn = m->lcn - lookback_distance; int err; - /* load extent head logical cluster if needed */ - err = z_erofs_load_cluster_from_disk(m, lcn, false); + err = z_erofs_load_lcluster_from_disk(m, lcn, false); if (err) return err; switch (m->type) { case Z_EROFS_LCLUSTER_TYPE_NONHEAD: - if (!m->delta[0]) { - erofs_err(m->inode->i_sb, - "invalid lookback distance 0 @ nid %llu", - vi->nid); - DBG_BUGON(1); - return -EFSCORRUPTED; - } lookback_distance = m->delta[0]; + if (!lookback_distance) + goto err_bogus; continue; case Z_EROFS_LCLUSTER_TYPE_PLAIN: case Z_EROFS_LCLUSTER_TYPE_HEAD1: @@ -328,16 +318,15 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, m->map->m_la = (lcn << lclusterbits) | m->clusterofs; return 0; default: - erofs_err(m->inode->i_sb, - "unknown type %u @ lcn %lu of nid %llu", + erofs_err(sb, "unknown type %u @ lcn %lu of nid %llu", m->type, lcn, vi->nid); DBG_BUGON(1); return -EOPNOTSUPP; } } - - erofs_err(m->inode->i_sb, "bogus lookback distance @ nid %llu", - vi->nid); +err_bogus: + erofs_err(sb, "bogus lookback distance %u @ lcn %lu of nid %llu", + lookback_distance, m->lcn, vi->nid); DBG_BUGON(1); return -EFSCORRUPTED; } @@ -369,7 +358,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, if (m->compressedblks) goto out; - err = z_erofs_load_cluster_from_disk(m, lcn, false); + err = z_erofs_load_lcluster_from_disk(m, lcn, false); if (err) return err; @@ -401,9 +390,8 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, break; fallthrough; default: - erofs_err(m->inode->i_sb, - "cannot found CBLKCNT @ lcn %lu of nid %llu", - lcn, vi->nid); + erofs_err(sb, "cannot found CBLKCNT @ lcn %lu of nid %llu", lcn, + vi->nid); DBG_BUGON(1); return -EFSCORRUPTED; } @@ -411,9 +399,7 @@ out: map->m_plen = erofs_pos(sb, m->compressedblks); return 0; err_bonus_cblkcnt: - erofs_err(m->inode->i_sb, - "bogus CBLKCNT @ lcn %lu of nid %llu", - lcn, vi->nid); + erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid); DBG_BUGON(1); return -EFSCORRUPTED; } @@ -434,7 +420,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) return 0; } - err = z_erofs_load_cluster_from_disk(m, lcn, true); + err = z_erofs_load_lcluster_from_disk(m, lcn, true); if (err) return err; @@ -481,7 +467,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, initial_lcn = ofs >> lclusterbits; endoff = ofs & ((1 << lclusterbits) - 1); - err = z_erofs_load_cluster_from_disk(&m, initial_lcn, false); + err = z_erofs_load_lcluster_from_disk(&m, initial_lcn, false); if (err) goto unmap_out; @@ -539,8 +525,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, if (flags & EROFS_GET_BLOCKS_FINDTAIL) { vi->z_tailextent_headlcn = m.lcn; /* for non-compact indexes, fragmentoff is 64 bits */ - if (fragment && - vi->datalayout == EROFS_INODE_COMPRESSED_FULL) + if (fragment && vi->datalayout == EROFS_INODE_COMPRESSED_FULL) vi->z_fragmentoff |= (u64)m.pblk << 32; } if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) { diff --git a/fs/eventfd.c b/fs/eventfd.c index 95850a13ce8d..8aa36cd37351 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -33,17 +33,17 @@ struct eventfd_ctx { /* * Every time that a write(2) is performed on an eventfd, the * value of the __u64 being written is added to "count" and a - * wakeup is performed on "wqh". A read(2) will return the "count" - * value to userspace, and will reset "count" to zero. The kernel - * side eventfd_signal() also, adds to the "count" counter and - * issue a wakeup. + * wakeup is performed on "wqh". If EFD_SEMAPHORE flag was not + * specified, a read(2) will return the "count" value to userspace, + * and will reset "count" to zero. The kernel side eventfd_signal() + * also, adds to the "count" counter and issue a wakeup. */ __u64 count; unsigned int flags; int id; }; -__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask) +__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask) { unsigned long flags; @@ -301,6 +301,8 @@ static void eventfd_show_fdinfo(struct seq_file *m, struct file *f) (unsigned long long)ctx->count); spin_unlock_irq(&ctx->wqh.lock); seq_printf(m, "eventfd-id: %d\n", ctx->id); + seq_printf(m, "eventfd-semaphore: %d\n", + !!(ctx->flags & EFD_SEMAPHORE)); } #endif diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 266d45c7685b..4b1b3362f697 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -536,7 +536,7 @@ static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, #else static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, - unsigned pollflags) + __poll_t pollflags) { wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags); } diff --git a/fs/exec.c b/fs/exec.c index a466e797c8e2..1a827d55ba94 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -152,8 +152,6 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) path_noexec(&file->f_path))) goto exit; - fsnotify_open(file); - error = -ENOEXEC; read_lock(&binfmt_lock); @@ -200,33 +198,39 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; + struct vm_area_struct *vma = bprm->vma; + struct mm_struct *mm = bprm->mm; int ret; - unsigned int gup_flags = 0; -#ifdef CONFIG_STACK_GROWSUP - if (write) { - ret = expand_downwards(bprm->vma, pos); - if (ret < 0) + /* + * Avoid relying on expanding the stack down in GUP (which + * does not work for STACK_GROWSUP anyway), and just do it + * by hand ahead of time. + */ + if (write && pos < vma->vm_start) { + mmap_write_lock(mm); + ret = expand_downwards(vma, pos); + if (unlikely(ret < 0)) { + mmap_write_unlock(mm); return NULL; - } -#endif - - if (write) - gup_flags |= FOLL_WRITE; + } + mmap_write_downgrade(mm); + } else + mmap_read_lock(mm); /* * We are doing an exec(). 'current' is the process - * doing the exec and bprm->mm is the new process's mm. + * doing the exec and 'mm' is the new process's mm. */ - mmap_read_lock(bprm->mm); - ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags, - &page, NULL, NULL); - mmap_read_unlock(bprm->mm); + ret = get_user_pages_remote(mm, pos, 1, + write ? FOLL_WRITE : 0, + &page, NULL); + mmap_read_unlock(mm); if (ret <= 0) return NULL; if (write) - acct_arg_size(bprm, vma_pages(bprm->vma)); + acct_arg_size(bprm, vma_pages(vma)); return page; } @@ -853,7 +857,7 @@ int setup_arg_pages(struct linux_binprm *bprm, stack_base = vma->vm_end - stack_expand; #endif current->mm->start_stack = bprm->p; - ret = expand_stack(vma, stack_base); + ret = expand_stack_locked(vma, stack_base); if (ret) ret = -EFAULT; @@ -934,9 +938,6 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags) if (err) goto exit; - if (name->name[0] != '\0') - fsnotify_open(file); - out: return file; diff --git a/fs/exfat/file.c b/fs/exfat/file.c index e99183a74611..3cbd270e0cba 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -389,7 +389,7 @@ const struct file_operations exfat_file_operations = { #endif .mmap = generic_file_mmap, .fsync = exfat_file_fsync, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, }; diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index ab88d33d106c..40e624cf7e92 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -381,11 +381,27 @@ static int export_encode_fh(struct inode *inode, struct fid *fid, return type; } +/** + * exportfs_encode_inode_fh - encode a file handle from inode + * @inode: the object to encode + * @fid: where to store the file handle fragment + * @max_len: maximum length to store there + * @flags: properties of the requested file handle + * + * Returns an enum fid_type or a negative errno. + */ int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid, - int *max_len, struct inode *parent) + int *max_len, struct inode *parent, int flags) { const struct export_operations *nop = inode->i_sb->s_export_op; + /* + * If a decodeable file handle was requested, we need to make sure that + * filesystem can decode file handles. + */ + if (nop && !(flags & EXPORT_FH_FID) && !nop->fh_to_dentry) + return -EOPNOTSUPP; + if (nop && nop->encode_fh) return nop->encode_fh(inode, fid->raw, max_len, parent); @@ -393,14 +409,23 @@ int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid, } EXPORT_SYMBOL_GPL(exportfs_encode_inode_fh); +/** + * exportfs_encode_fh - encode a file handle from dentry + * @dentry: the object to encode + * @fid: where to store the file handle fragment + * @max_len: maximum length to store there + * @flags: properties of the requested file handle + * + * Returns an enum fid_type or a negative errno. + */ int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len, - int connectable) + int flags) { int error; struct dentry *p = NULL; struct inode *inode = dentry->d_inode, *parent = NULL; - if (connectable && !S_ISDIR(inode->i_mode)) { + if ((flags & EXPORT_FH_CONNECTABLE) && !S_ISDIR(inode->i_mode)) { p = dget_parent(dentry); /* * note that while p might've ceased to be our parent already, @@ -409,7 +434,7 @@ int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len, parent = p->d_inode; } - error = exportfs_encode_inode_fh(inode, fid, max_len, parent); + error = exportfs_encode_inode_fh(inode, fid, max_len, parent, flags); dput(p); return error; diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile index 311479d864a7..8860948ef9ca 100644 --- a/fs/ext2/Makefile +++ b/fs/ext2/Makefile @@ -6,7 +6,10 @@ obj-$(CONFIG_EXT2_FS) += ext2.o ext2-y := balloc.o dir.o file.o ialloc.o inode.o \ - ioctl.o namei.o super.o symlink.o + ioctl.o namei.o super.o symlink.o trace.o + +# For tracepoints to include our trace.h from tracepoint infrastructure +CFLAGS_trace.o := -I$(src) ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext2-$(CONFIG_EXT2_FS_POSIX_ACL) += acl.o diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 4a6955a0a116..42db804794bd 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -186,23 +186,25 @@ fail: * NOTE: ext2_find_entry() and ext2_dotdot() act as a call to ext2_get_page() * and should be treated as a call to ext2_get_page() for nesting purposes. */ -static struct page * ext2_get_page(struct inode *dir, unsigned long n, - int quiet, void **page_addr) +static void *ext2_get_page(struct inode *dir, unsigned long n, + int quiet, struct page **page) { struct address_space *mapping = dir->i_mapping; struct folio *folio = read_mapping_folio(mapping, n, NULL); + void *page_addr; if (IS_ERR(folio)) - return &folio->page; - *page_addr = kmap_local_folio(folio, n & (folio_nr_pages(folio) - 1)); + return ERR_CAST(folio); + page_addr = kmap_local_folio(folio, n & (folio_nr_pages(folio) - 1)); if (unlikely(!folio_test_checked(folio))) { - if (!ext2_check_page(&folio->page, quiet, *page_addr)) + if (!ext2_check_page(&folio->page, quiet, page_addr)) goto fail; } - return &folio->page; + *page = &folio->page; + return page_addr; fail: - ext2_put_page(&folio->page, *page_addr); + ext2_put_page(&folio->page, page_addr); return ERR_PTR(-EIO); } @@ -240,7 +242,7 @@ ext2_validate_entry(char *base, unsigned offset, unsigned mask) break; p = ext2_next_entry(p); } - return (char *)p - base; + return offset_in_page(p); } static inline void ext2_set_de_type(ext2_dirent *de, struct inode *inode) @@ -271,16 +273,17 @@ ext2_readdir(struct file *file, struct dir_context *ctx) EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE); for ( ; n < npages; n++, offset = 0) { - char *kaddr, *limit; ext2_dirent *de; - struct page *page = ext2_get_page(inode, n, 0, (void **)&kaddr); + struct page *page; + char *kaddr = ext2_get_page(inode, n, 0, &page); + char *limit; - if (IS_ERR(page)) { + if (IS_ERR(kaddr)) { ext2_error(sb, __func__, "bad page in #%lu", inode->i_ino); ctx->pos += PAGE_SIZE - offset; - return PTR_ERR(page); + return PTR_ERR(kaddr); } if (unlikely(need_revalidate)) { if (offset) { @@ -296,7 +299,7 @@ ext2_readdir(struct file *file, struct dir_context *ctx) if (de->rec_len == 0) { ext2_error(sb, __func__, "zero-length directory entry"); - ext2_put_page(page, kaddr); + ext2_put_page(page, de); return -EIO; } if (de->inode) { @@ -308,7 +311,7 @@ ext2_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit(ctx, de->name, de->name_len, le32_to_cpu(de->inode), d_type)) { - ext2_put_page(page, kaddr); + ext2_put_page(page, de); return 0; } } @@ -336,8 +339,7 @@ ext2_readdir(struct file *file, struct dir_context *ctx) * should be treated as a call to ext2_get_page() for nesting purposes. */ struct ext2_dir_entry_2 *ext2_find_entry (struct inode *dir, - const struct qstr *child, struct page **res_page, - void **res_page_addr) + const struct qstr *child, struct page **res_page) { const char *name = child->name; int namelen = child->len; @@ -347,40 +349,36 @@ struct ext2_dir_entry_2 *ext2_find_entry (struct inode *dir, struct page *page = NULL; struct ext2_inode_info *ei = EXT2_I(dir); ext2_dirent * de; - void *page_addr; if (npages == 0) goto out; /* OFFSET_CACHE */ *res_page = NULL; - *res_page_addr = NULL; start = ei->i_dir_start_lookup; if (start >= npages) start = 0; n = start; do { - char *kaddr; - page = ext2_get_page(dir, n, 0, &page_addr); - if (IS_ERR(page)) - return ERR_CAST(page); + char *kaddr = ext2_get_page(dir, n, 0, &page); + if (IS_ERR(kaddr)) + return ERR_CAST(kaddr); - kaddr = page_addr; de = (ext2_dirent *) kaddr; kaddr += ext2_last_byte(dir, n) - reclen; while ((char *) de <= kaddr) { if (de->rec_len == 0) { ext2_error(dir->i_sb, __func__, "zero-length directory entry"); - ext2_put_page(page, page_addr); + ext2_put_page(page, de); goto out; } if (ext2_match(namelen, name, de)) goto found; de = ext2_next_entry(de); } - ext2_put_page(page, page_addr); + ext2_put_page(page, kaddr); if (++n >= npages) n = 0; @@ -398,7 +396,6 @@ out: found: *res_page = page; - *res_page_addr = page_addr; ei->i_dir_start_lookup = n; return de; } @@ -415,33 +412,26 @@ found: * ext2_find_entry() and ext2_dotdot() act as a call to ext2_get_page() and * should be treated as a call to ext2_get_page() for nesting purposes. */ -struct ext2_dir_entry_2 *ext2_dotdot(struct inode *dir, struct page **p, - void **pa) +struct ext2_dir_entry_2 *ext2_dotdot(struct inode *dir, struct page **p) { - void *page_addr; - struct page *page = ext2_get_page(dir, 0, 0, &page_addr); - ext2_dirent *de = NULL; + ext2_dirent *de = ext2_get_page(dir, 0, 0, p); - if (!IS_ERR(page)) { - de = ext2_next_entry((ext2_dirent *) page_addr); - *p = page; - *pa = page_addr; - } - return de; + if (!IS_ERR(de)) + return ext2_next_entry(de); + return NULL; } int ext2_inode_by_name(struct inode *dir, const struct qstr *child, ino_t *ino) { struct ext2_dir_entry_2 *de; struct page *page; - void *page_addr; - de = ext2_find_entry(dir, child, &page, &page_addr); + de = ext2_find_entry(dir, child, &page); if (IS_ERR(de)) return PTR_ERR(de); *ino = le32_to_cpu(de->inode); - ext2_put_page(page, page_addr); + ext2_put_page(page, de); return 0; } @@ -462,11 +452,9 @@ static int ext2_handle_dirsync(struct inode *dir) } int ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, - struct page *page, void *page_addr, struct inode *inode, - bool update_times) + struct page *page, struct inode *inode, bool update_times) { - loff_t pos = page_offset(page) + - (char *) de - (char *) page_addr; + loff_t pos = page_offset(page) + offset_in_page(de); unsigned len = ext2_rec_len_from_disk(de->rec_len); int err; @@ -498,7 +486,6 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode) unsigned reclen = EXT2_DIR_REC_LEN(namelen); unsigned short rec_len, name_len; struct page *page = NULL; - void *page_addr = NULL; ext2_dirent * de; unsigned long npages = dir_pages(dir); unsigned long n; @@ -511,15 +498,12 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode) * to protect that region. */ for (n = 0; n <= npages; n++) { - char *kaddr; + char *kaddr = ext2_get_page(dir, n, 0, &page); char *dir_end; - page = ext2_get_page(dir, n, 0, &page_addr); - err = PTR_ERR(page); - if (IS_ERR(page)) - goto out; + if (IS_ERR(kaddr)) + return PTR_ERR(kaddr); lock_page(page); - kaddr = page_addr; dir_end = kaddr + ext2_last_byte(dir, n); de = (ext2_dirent *)kaddr; kaddr += PAGE_SIZE - reclen; @@ -550,14 +534,13 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode) de = (ext2_dirent *) ((char *) de + rec_len); } unlock_page(page); - ext2_put_page(page, page_addr); + ext2_put_page(page, kaddr); } BUG(); return -EINVAL; got_it: - pos = page_offset(page) + - (char *)de - (char *)page_addr; + pos = page_offset(page) + offset_in_page(de); err = ext2_prepare_chunk(page, pos, rec_len); if (err) goto out_unlock; @@ -578,8 +561,7 @@ got_it: err = ext2_handle_dirsync(dir); /* OFFSET_CACHE */ out_put: - ext2_put_page(page, page_addr); -out: + ext2_put_page(page, de); return err; out_unlock: unlock_page(page); @@ -590,34 +572,36 @@ out_unlock: * ext2_delete_entry deletes a directory entry by merging it with the * previous entry. Page is up-to-date. */ -int ext2_delete_entry (struct ext2_dir_entry_2 *dir, struct page *page, - char *kaddr) +int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct page *page) { struct inode *inode = page->mapping->host; - unsigned from = ((char*)dir - kaddr) & ~(ext2_chunk_size(inode)-1); - unsigned to = ((char *)dir - kaddr) + + char *kaddr = (char *)((unsigned long)dir & PAGE_MASK); + unsigned from = offset_in_page(dir) & ~(ext2_chunk_size(inode)-1); + unsigned to = offset_in_page(dir) + ext2_rec_len_from_disk(dir->rec_len); loff_t pos; - ext2_dirent * pde = NULL; - ext2_dirent * de = (ext2_dirent *) (kaddr + from); + ext2_dirent *pde = NULL; + ext2_dirent *de = (ext2_dirent *)(kaddr + from); int err; while ((char*)de < (char*)dir) { if (de->rec_len == 0) { ext2_error(inode->i_sb, __func__, "zero-length directory entry"); - err = -EIO; - goto out; + return -EIO; } pde = de; de = ext2_next_entry(de); } if (pde) - from = (char *)pde - kaddr; + from = offset_in_page(pde); pos = page_offset(page) + from; lock_page(page); err = ext2_prepare_chunk(page, pos, to - from); - BUG_ON(err); + if (err) { + unlock_page(page); + return err; + } if (pde) pde->rec_len = ext2_rec_len_to_disk(to - from); dir->inode = 0; @@ -625,9 +609,7 @@ int ext2_delete_entry (struct ext2_dir_entry_2 *dir, struct page *page, inode->i_ctime = inode->i_mtime = current_time(inode); EXT2_I(inode)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(inode); - err = ext2_handle_dirsync(inode); -out: - return err; + return ext2_handle_dirsync(inode); } /* @@ -677,19 +659,17 @@ fail: */ int ext2_empty_dir (struct inode * inode) { - void *page_addr = NULL; - struct page *page = NULL; + struct page *page; + char *kaddr; unsigned long i, npages = dir_pages(inode); for (i = 0; i < npages; i++) { - char *kaddr; - ext2_dirent * de; - page = ext2_get_page(inode, i, 0, &page_addr); + ext2_dirent *de; - if (IS_ERR(page)) + kaddr = ext2_get_page(inode, i, 0, &page); + if (IS_ERR(kaddr)) return 0; - kaddr = page_addr; de = (ext2_dirent *)kaddr; kaddr += ext2_last_byte(inode, i) - EXT2_DIR_REC_LEN(1); @@ -715,12 +695,12 @@ int ext2_empty_dir (struct inode * inode) } de = ext2_next_entry(de); } - ext2_put_page(page, page_addr); + ext2_put_page(page, kaddr); } return 1; not_empty: - ext2_put_page(page, page_addr); + ext2_put_page(page, kaddr); return 0; } diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 8244366862e4..35a041c47c38 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -70,10 +70,7 @@ struct mb_cache; * second extended-fs super-block data in memory */ struct ext2_sb_info { - unsigned long s_frag_size; /* Size of a fragment in bytes */ - unsigned long s_frags_per_block;/* Number of fragments per block */ unsigned long s_inodes_per_block;/* Number of inodes per block */ - unsigned long s_frags_per_group;/* Number of fragments in a group */ unsigned long s_blocks_per_group;/* Number of blocks in a group */ unsigned long s_inodes_per_group;/* Number of inodes in a group */ unsigned long s_itb_per_group; /* Number of inode table blocks per group */ @@ -189,15 +186,6 @@ static inline struct ext2_sb_info *EXT2_SB(struct super_block *sb) #define EXT2_FIRST_INO(s) (EXT2_SB(s)->s_first_ino) /* - * Macro-instructions used to manage fragments - */ -#define EXT2_MIN_FRAG_SIZE 1024 -#define EXT2_MAX_FRAG_SIZE 4096 -#define EXT2_MIN_FRAG_LOG_SIZE 10 -#define EXT2_FRAG_SIZE(s) (EXT2_SB(s)->s_frag_size) -#define EXT2_FRAGS_PER_BLOCK(s) (EXT2_SB(s)->s_frags_per_block) - -/* * Structure of a blocks group descriptor */ struct ext2_group_desc @@ -730,14 +718,12 @@ extern int ext2_inode_by_name(struct inode *dir, const struct qstr *child, ino_t *ino); extern int ext2_make_empty(struct inode *, struct inode *); extern struct ext2_dir_entry_2 *ext2_find_entry(struct inode *, const struct qstr *, - struct page **, void **res_page_addr); -extern int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct page *page, - char *kaddr); + struct page **); +extern int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct page *page); extern int ext2_empty_dir (struct inode *); -extern struct ext2_dir_entry_2 *ext2_dotdot(struct inode *dir, struct page **p, void **pa); +extern struct ext2_dir_entry_2 *ext2_dotdot(struct inode *dir, struct page **p); int ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, - struct page *page, void *page_addr, struct inode *inode, - bool update_times); + struct page *page, struct inode *inode, bool update_times); static inline void ext2_put_page(struct page *page, void *page_addr) { kunmap_local(page_addr); @@ -754,6 +740,7 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned); extern struct inode *ext2_iget (struct super_block *, unsigned long); extern int ext2_write_inode (struct inode *, struct writeback_control *); extern void ext2_evict_inode(struct inode *); +void ext2_write_failed(struct address_space *mapping, loff_t to); extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); extern int ext2_setattr (struct mnt_idmap *, struct dentry *, struct iattr *); extern int ext2_getattr (struct mnt_idmap *, const struct path *, diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 6b4bebe982ca..0b4c91c62e1f 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -25,9 +25,11 @@ #include <linux/quotaops.h> #include <linux/iomap.h> #include <linux/uio.h> +#include <linux/buffer_head.h> #include "ext2.h" #include "xattr.h" #include "acl.h" +#include "trace.h" #ifdef CONFIG_FS_DAX static ssize_t ext2_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -153,7 +155,7 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync) int ret; struct super_block *sb = file->f_mapping->host->i_sb; - ret = generic_file_fsync(file, start, end, datasync); + ret = generic_buffers_fsync(file, start, end, datasync); if (ret == -EIO) /* We don't really know where the IO error happened... */ ext2_error(sb, __func__, @@ -161,12 +163,131 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync) return ret; } +static ssize_t ext2_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t ret; + + trace_ext2_dio_read_begin(iocb, to, 0); + inode_lock_shared(inode); + ret = iomap_dio_rw(iocb, to, &ext2_iomap_ops, NULL, 0, NULL, 0); + inode_unlock_shared(inode); + trace_ext2_dio_read_end(iocb, to, ret); + + return ret; +} + +static int ext2_dio_write_end_io(struct kiocb *iocb, ssize_t size, + int error, unsigned int flags) +{ + loff_t pos = iocb->ki_pos; + struct inode *inode = file_inode(iocb->ki_filp); + + if (error) + goto out; + + /* + * If we are extending the file, we have to update i_size here before + * page cache gets invalidated in iomap_dio_rw(). This prevents racing + * buffered reads from zeroing out too much from page cache pages. + * Note that all extending writes always happens synchronously with + * inode lock held by ext2_dio_write_iter(). So it is safe to update + * inode size here for extending file writes. + */ + pos += size; + if (pos > i_size_read(inode)) { + i_size_write(inode, pos); + mark_inode_dirty(inode); + } +out: + trace_ext2_dio_write_endio(iocb, size, error); + return error; +} + +static const struct iomap_dio_ops ext2_dio_write_ops = { + .end_io = ext2_dio_write_end_io, +}; + +static ssize_t ext2_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t ret; + unsigned int flags = 0; + unsigned long blocksize = inode->i_sb->s_blocksize; + loff_t offset = iocb->ki_pos; + loff_t count = iov_iter_count(from); + ssize_t status = 0; + + trace_ext2_dio_write_begin(iocb, from, 0); + inode_lock(inode); + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto out_unlock; + + ret = kiocb_modified(iocb); + if (ret) + goto out_unlock; + + /* use IOMAP_DIO_FORCE_WAIT for unaligned or extending writes */ + if (iocb->ki_pos + iov_iter_count(from) > i_size_read(inode) || + (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(from), blocksize))) + flags |= IOMAP_DIO_FORCE_WAIT; + + ret = iomap_dio_rw(iocb, from, &ext2_iomap_ops, &ext2_dio_write_ops, + flags, NULL, 0); + + /* ENOTBLK is magic return value for fallback to buffered-io */ + if (ret == -ENOTBLK) + ret = 0; + + if (ret < 0 && ret != -EIOCBQUEUED) + ext2_write_failed(inode->i_mapping, offset + count); + + /* handle case for partial write and for fallback to buffered write */ + if (ret >= 0 && iov_iter_count(from)) { + loff_t pos, endbyte; + int ret2; + + iocb->ki_flags &= ~IOCB_DIRECT; + pos = iocb->ki_pos; + status = generic_perform_write(iocb, from); + if (unlikely(status < 0)) { + ret = status; + goto out_unlock; + } + + iocb->ki_pos += status; + ret += status; + endbyte = pos + status - 1; + ret2 = filemap_write_and_wait_range(inode->i_mapping, pos, + endbyte); + if (!ret2) + invalidate_mapping_pages(inode->i_mapping, + pos >> PAGE_SHIFT, + endbyte >> PAGE_SHIFT); + if (ret > 0) + generic_write_sync(iocb, ret); + } + +out_unlock: + inode_unlock(inode); + if (status) + trace_ext2_dio_write_buff_end(iocb, from, status); + trace_ext2_dio_write_end(iocb, from, ret); + return ret; +} + static ssize_t ext2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { #ifdef CONFIG_FS_DAX if (IS_DAX(iocb->ki_filp->f_mapping->host)) return ext2_dax_read_iter(iocb, to); #endif + if (iocb->ki_flags & IOCB_DIRECT) + return ext2_dio_read_iter(iocb, to); + return generic_file_read_iter(iocb, to); } @@ -176,6 +297,9 @@ static ssize_t ext2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (IS_DAX(iocb->ki_filp->f_mapping->host)) return ext2_dax_write_iter(iocb, from); #endif + if (iocb->ki_flags & IOCB_DIRECT) + return ext2_dio_write_iter(iocb, from); + return generic_file_write_iter(iocb, from); } @@ -192,7 +316,7 @@ const struct file_operations ext2_file_operations = { .release = ext2_release_file, .fsync = ext2_fsync, .get_unmapped_area = thp_get_unmapped_area, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, }; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 26f135e7ffce..75983215c7a1 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -56,7 +56,7 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode) static void ext2_truncate_blocks(struct inode *inode, loff_t offset); -static void ext2_write_failed(struct address_space *mapping, loff_t to) +void ext2_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; @@ -809,9 +809,27 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, bool new = false, boundary = false; u32 bno; int ret; + bool create = flags & IOMAP_WRITE; + + /* + * For writes that could fill holes inside i_size on a + * DIO_SKIP_HOLES filesystem we forbid block creations: only + * overwrites are permitted. + */ + if ((flags & IOMAP_DIRECT) && + (first_block << blkbits) < i_size_read(inode)) + create = 0; + + /* + * Writes that span EOF might trigger an IO size update on completion, + * so consider them to be dirty for the purposes of O_DSYNC even if + * there is no other metadata changes pending or have been made here. + */ + if ((flags & IOMAP_WRITE) && offset + length > i_size_read(inode)) + iomap->flags |= IOMAP_F_DIRTY; ret = ext2_get_blocks(inode, first_block, max_blocks, - &bno, &new, &boundary, flags & IOMAP_WRITE); + &bno, &new, &boundary, create); if (ret < 0) return ret; @@ -823,6 +841,12 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->bdev = inode->i_sb->s_bdev; if (ret == 0) { + /* + * Switch to buffered-io for writing to holes in a non-extent + * based filesystem to avoid stale data exposure problem. + */ + if (!create && (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT)) + return -ENOTBLK; iomap->type = IOMAP_HOLE; iomap->addr = IOMAP_NULL_ADDR; iomap->length = 1 << blkbits; @@ -844,6 +868,13 @@ static int ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { + /* + * Switch to buffered-io in case of any error. + * Blocks allocated can be used by the buffered-io path. + */ + if ((flags & IOMAP_DIRECT) && (flags & IOMAP_WRITE) && written == 0) + return -ENOTBLK; + if (iomap->type == IOMAP_MAPPED && written < length && (flags & IOMAP_WRITE)) @@ -908,22 +939,6 @@ static sector_t ext2_bmap(struct address_space *mapping, sector_t block) return generic_block_bmap(mapping,block,ext2_get_block); } -static ssize_t -ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - size_t count = iov_iter_count(iter); - loff_t offset = iocb->ki_pos; - ssize_t ret; - - ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block); - if (ret < 0 && iov_iter_rw(iter) == WRITE) - ext2_write_failed(mapping, offset + count); - return ret; -} - static int ext2_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -946,7 +961,7 @@ const struct address_space_operations ext2_aops = { .write_begin = ext2_write_begin, .write_end = ext2_write_end, .bmap = ext2_bmap, - .direct_IO = ext2_direct_IO, + .direct_IO = noop_direct_IO, .writepages = ext2_writepages, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, @@ -1259,9 +1274,8 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) inode_dio_wait(inode); if (IS_DAX(inode)) - error = dax_zero_range(inode, newsize, - PAGE_ALIGN(newsize) - newsize, NULL, - &ext2_iomap_ops); + error = dax_truncate_page(inode, newsize, NULL, + &ext2_iomap_ops); else error = block_truncate_page(inode->i_mapping, newsize, ext2_get_block); diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 7f5dfa87cc95..937dd8f60f96 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -269,26 +269,25 @@ out_dir: goto out; } -static int ext2_unlink(struct inode * dir, struct dentry *dentry) +static int ext2_unlink(struct inode *dir, struct dentry *dentry) { - struct inode * inode = d_inode(dentry); - struct ext2_dir_entry_2 * de; - struct page * page; - void *page_addr; + struct inode *inode = d_inode(dentry); + struct ext2_dir_entry_2 *de; + struct page *page; int err; err = dquot_initialize(dir); if (err) goto out; - de = ext2_find_entry(dir, &dentry->d_name, &page, &page_addr); + de = ext2_find_entry(dir, &dentry->d_name, &page); if (IS_ERR(de)) { err = PTR_ERR(de); goto out; } - err = ext2_delete_entry (de, page, page_addr); - ext2_put_page(page, page_addr); + err = ext2_delete_entry(de, page); + ext2_put_page(page, de); if (err) goto out; @@ -323,10 +322,8 @@ static int ext2_rename (struct mnt_idmap * idmap, struct inode * old_inode = d_inode(old_dentry); struct inode * new_inode = d_inode(new_dentry); struct page * dir_page = NULL; - void *dir_page_addr; struct ext2_dir_entry_2 * dir_de = NULL; struct page * old_page; - void *old_page_addr; struct ext2_dir_entry_2 * old_de; int err; @@ -335,28 +332,24 @@ static int ext2_rename (struct mnt_idmap * idmap, err = dquot_initialize(old_dir); if (err) - goto out; + return err; err = dquot_initialize(new_dir); if (err) - goto out; + return err; - old_de = ext2_find_entry(old_dir, &old_dentry->d_name, &old_page, - &old_page_addr); - if (IS_ERR(old_de)) { - err = PTR_ERR(old_de); - goto out; - } + old_de = ext2_find_entry(old_dir, &old_dentry->d_name, &old_page); + if (IS_ERR(old_de)) + return PTR_ERR(old_de); if (S_ISDIR(old_inode->i_mode)) { err = -EIO; - dir_de = ext2_dotdot(old_inode, &dir_page, &dir_page_addr); + dir_de = ext2_dotdot(old_inode, &dir_page); if (!dir_de) goto out_old; } if (new_inode) { - void *page_addr; struct page *new_page; struct ext2_dir_entry_2 *new_de; @@ -365,14 +358,13 @@ static int ext2_rename (struct mnt_idmap * idmap, goto out_dir; new_de = ext2_find_entry(new_dir, &new_dentry->d_name, - &new_page, &page_addr); + &new_page); if (IS_ERR(new_de)) { err = PTR_ERR(new_de); goto out_dir; } - err = ext2_set_link(new_dir, new_de, new_page, page_addr, - old_inode, true); - ext2_put_page(new_page, page_addr); + err = ext2_set_link(new_dir, new_de, new_page, old_inode, true); + ext2_put_page(new_page, new_de); if (err) goto out_dir; new_inode->i_ctime = current_time(new_inode); @@ -394,27 +386,20 @@ static int ext2_rename (struct mnt_idmap * idmap, old_inode->i_ctime = current_time(old_inode); mark_inode_dirty(old_inode); - ext2_delete_entry(old_de, old_page, old_page_addr); - - if (dir_de) { - if (old_dir != new_dir) { + err = ext2_delete_entry(old_de, old_page); + if (!err && dir_de) { + if (old_dir != new_dir) err = ext2_set_link(old_inode, dir_de, dir_page, - dir_page_addr, new_dir, false); + new_dir, false); - } - ext2_put_page(dir_page, dir_page_addr); inode_dec_link_count(old_dir); } - -out_old: - ext2_put_page(old_page, old_page_addr); -out: - return err; - out_dir: if (dir_de) - ext2_put_page(dir_page, dir_page_addr); - goto out_old; + ext2_put_page(dir_page, dir_de); +out_old: + ext2_put_page(old_page, old_de); + return err; } const struct inode_operations ext2_dir_inode_operations = { diff --git a/fs/ext2/super.c b/fs/ext2/super.c index f342f347a695..2959afc7541c 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -668,10 +668,9 @@ static int ext2_setup_super (struct super_block * sb, es->s_max_mnt_count = cpu_to_le16(EXT2_DFL_MAX_MNT_COUNT); le16_add_cpu(&es->s_mnt_count, 1); if (test_opt (sb, DEBUG)) - ext2_msg(sb, KERN_INFO, "%s, %s, bs=%lu, fs=%lu, gc=%lu, " + ext2_msg(sb, KERN_INFO, "%s, %s, bs=%lu, gc=%lu, " "bpg=%lu, ipg=%lu, mo=%04lx]", EXT2FS_VERSION, EXT2FS_DATE, sb->s_blocksize, - sbi->s_frag_size, sbi->s_groups_count, EXT2_BLOCKS_PER_GROUP(sb), EXT2_INODES_PER_GROUP(sb), @@ -1012,14 +1011,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) } } - sbi->s_frag_size = EXT2_MIN_FRAG_SIZE << - le32_to_cpu(es->s_log_frag_size); - if (sbi->s_frag_size == 0) - goto cantfind_ext2; - sbi->s_frags_per_block = sb->s_blocksize / sbi->s_frag_size; - sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); - sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group); sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); sbi->s_inodes_per_block = sb->s_blocksize / EXT2_INODE_SIZE(sb); @@ -1045,11 +1037,10 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } - if (sb->s_blocksize != sbi->s_frag_size) { + if (es->s_log_frag_size != es->s_log_block_size) { ext2_msg(sb, KERN_ERR, - "error: fragsize %lu != blocksize %lu" - "(not supported yet)", - sbi->s_frag_size, sb->s_blocksize); + "error: fragsize log %u != blocksize log %u", + le32_to_cpu(es->s_log_frag_size), sb->s_blocksize_bits); goto failed_mount; } @@ -1066,12 +1057,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi->s_blocks_per_group, sbi->s_inodes_per_group + 3); goto failed_mount; } - if (sbi->s_frags_per_group > sb->s_blocksize * 8) { - ext2_msg(sb, KERN_ERR, - "error: #fragments per group too big: %lu", - sbi->s_frags_per_group); - goto failed_mount; - } if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || sbi->s_inodes_per_group > sb->s_blocksize * 8) { ext2_msg(sb, KERN_ERR, diff --git a/fs/ext2/trace.c b/fs/ext2/trace.c new file mode 100644 index 000000000000..b01cdf6526fd --- /dev/null +++ b/fs/ext2/trace.c @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "ext2.h" +#include <linux/uio.h> + +#define CREATE_TRACE_POINTS +#include "trace.h" diff --git a/fs/ext2/trace.h b/fs/ext2/trace.h new file mode 100644 index 000000000000..7d230e13576e --- /dev/null +++ b/fs/ext2/trace.h @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ext2 + +#if !defined(_EXT2_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _EXT2_TRACE_H + +#include <linux/tracepoint.h> + +DECLARE_EVENT_CLASS(ext2_dio_class, + TP_PROTO(struct kiocb *iocb, struct iov_iter *iter, ssize_t ret), + TP_ARGS(iocb, iter, ret), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(loff_t, isize) + __field(loff_t, pos) + __field(size_t, count) + __field(int, ki_flags) + __field(bool, aio) + __field(ssize_t, ret) + ), + TP_fast_assign( + __entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev; + __entry->ino = file_inode(iocb->ki_filp)->i_ino; + __entry->isize = file_inode(iocb->ki_filp)->i_size; + __entry->pos = iocb->ki_pos; + __entry->count = iov_iter_count(iter); + __entry->ki_flags = iocb->ki_flags; + __entry->aio = !is_sync_kiocb(iocb); + __entry->ret = ret; + ), + TP_printk("dev %d:%d ino 0x%lx isize 0x%llx pos 0x%llx len %zu flags %s aio %d ret %zd", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->isize, + __entry->pos, + __entry->count, + __print_flags(__entry->ki_flags, "|", TRACE_IOCB_STRINGS), + __entry->aio, + __entry->ret) +); + +#define DEFINE_DIO_RW_EVENT(name) \ +DEFINE_EVENT(ext2_dio_class, name, \ + TP_PROTO(struct kiocb *iocb, struct iov_iter *iter, ssize_t ret), \ + TP_ARGS(iocb, iter, ret)) +DEFINE_DIO_RW_EVENT(ext2_dio_write_begin); +DEFINE_DIO_RW_EVENT(ext2_dio_write_end); +DEFINE_DIO_RW_EVENT(ext2_dio_write_buff_end); +DEFINE_DIO_RW_EVENT(ext2_dio_read_begin); +DEFINE_DIO_RW_EVENT(ext2_dio_read_end); + +TRACE_EVENT(ext2_dio_write_endio, + TP_PROTO(struct kiocb *iocb, ssize_t size, int ret), + TP_ARGS(iocb, size, ret), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(loff_t, isize) + __field(loff_t, pos) + __field(ssize_t, size) + __field(int, ki_flags) + __field(bool, aio) + __field(int, ret) + ), + TP_fast_assign( + __entry->dev = file_inode(iocb->ki_filp)->i_sb->s_dev; + __entry->ino = file_inode(iocb->ki_filp)->i_ino; + __entry->isize = file_inode(iocb->ki_filp)->i_size; + __entry->pos = iocb->ki_pos; + __entry->size = size; + __entry->ki_flags = iocb->ki_flags; + __entry->aio = !is_sync_kiocb(iocb); + __entry->ret = ret; + ), + TP_printk("dev %d:%d ino 0x%lx isize 0x%llx pos 0x%llx len %zd flags %s aio %d ret %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->isize, + __entry->pos, + __entry->size, + __print_flags(__entry->ki_flags, "|", TRACE_IOCB_STRINGS), + __entry->aio, + __entry->ret) +); + +#endif /* _EXT2_TRACE_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace +#include <trace/define_trace.h> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8104a21b001a..0a2d55faa095 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -128,6 +128,58 @@ enum SHIFT_DIRECTION { }; /* + * For each criteria, mballoc has slightly different way of finding + * the required blocks nad usually, higher the criteria the slower the + * allocation. We start at lower criterias and keep falling back to + * higher ones if we are not able to find any blocks. Lower (earlier) + * criteria are faster. + */ +enum criteria { + /* + * Used when number of blocks needed is a power of 2. This + * doesn't trigger any disk IO except prefetch and is the + * fastest criteria. + */ + CR_POWER2_ALIGNED, + + /* + * Tries to lookup in-memory data structures to find the most + * suitable group that satisfies goal request. No disk IO + * except block prefetch. + */ + CR_GOAL_LEN_FAST, + + /* + * Same as CR_GOAL_LEN_FAST but is allowed to reduce the goal + * length to the best available length for faster allocation. + */ + CR_BEST_AVAIL_LEN, + + /* + * Reads each block group sequentially, performing disk IO if + * necessary, to find find_suitable block group. Tries to + * allocate goal length but might trim the request if nothing + * is found after enough tries. + */ + CR_GOAL_LEN_SLOW, + + /* + * Finds the first free set of blocks and allocates + * those. This is only used in rare cases when + * CR_GOAL_LEN_SLOW also fails to allocate anything. + */ + CR_ANY_FREE, + + /* + * Number of criterias defined. + */ + EXT4_MB_NUM_CRS +}; + +/* criteria below which we use fast block scanning and avoid unnecessary IO */ +#define CR_FAST CR_GOAL_LEN_SLOW + +/* * Flags used in mballoc's allocation_context flags field. * * Also used to show what's going on for debugging purposes when the @@ -165,9 +217,12 @@ enum SHIFT_DIRECTION { /* Do strict check for free blocks while retrying block allocation */ #define EXT4_MB_STRICT_CHECK 0x4000 /* Large fragment size list lookup succeeded at least once for cr = 0 */ -#define EXT4_MB_CR0_OPTIMIZED 0x8000 +#define EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED 0x8000 /* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ -#define EXT4_MB_CR1_OPTIMIZED 0x00010000 +#define EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED 0x00010000 +/* Avg fragment size rb tree lookup succeeded at least once for cr = 1.5 */ +#define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED 0x00020000 + struct ext4_allocation_request { /* target inode for block we're allocating */ struct inode *inode; @@ -1532,21 +1587,25 @@ struct ext4_sb_info { unsigned long s_mb_last_start; unsigned int s_mb_prefetch; unsigned int s_mb_prefetch_limit; + unsigned int s_mb_best_avail_max_trim_order; /* stats for buddy allocator */ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ atomic_t s_bal_success; /* we found long enough chunks */ atomic_t s_bal_allocated; /* in blocks */ atomic_t s_bal_ex_scanned; /* total extents scanned */ + atomic_t s_bal_cX_ex_scanned[EXT4_MB_NUM_CRS]; /* total extents scanned */ atomic_t s_bal_groups_scanned; /* number of groups scanned */ atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_len_goals; /* len goal hits */ atomic_t s_bal_breaks; /* too long searches */ atomic_t s_bal_2orders; /* 2^order hits */ - atomic_t s_bal_cr0_bad_suggestions; - atomic_t s_bal_cr1_bad_suggestions; - atomic64_t s_bal_cX_groups_considered[4]; - atomic64_t s_bal_cX_hits[4]; - atomic64_t s_bal_cX_failed[4]; /* cX loop didn't find blocks */ + atomic_t s_bal_p2_aligned_bad_suggestions; + atomic_t s_bal_goal_fast_bad_suggestions; + atomic_t s_bal_best_avail_bad_suggestions; + atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS]; + atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS]; + atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS]; /* cX loop didn't find blocks */ atomic_t s_mb_buddies_generated; /* number of buddies generated */ atomic64_t s_mb_generation_time; atomic_t s_mb_lost_chunks; @@ -2632,10 +2691,6 @@ extern void ext4_get_group_no_and_offset(struct super_block *sb, extern ext4_group_t ext4_get_group_number(struct super_block *sb, ext4_fsblk_t block); -extern unsigned int ext4_block_group(struct super_block *sb, - ext4_fsblk_t blocknr); -extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, - ext4_fsblk_t blocknr); extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); extern unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group); @@ -2841,8 +2896,6 @@ int ext4_fc_record_regions(struct super_block *sb, int ino, /* mballoc.c */ extern const struct seq_operations ext4_mb_seq_groups_ops; extern const struct seq_operations ext4_mb_seq_structs_summary_ops; -extern long ext4_mb_stats; -extern long ext4_mb_max_to_scan; extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset); extern int ext4_mb_init(struct super_block *); extern int ext4_mb_release(struct super_block *); @@ -2968,6 +3021,7 @@ int ext4_fileattr_set(struct mnt_idmap *idmap, int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa); extern void ext4_reset_inode_seed(struct inode *inode); int ext4_update_overhead(struct super_block *sb, bool force); +int ext4_force_shutdown(struct super_block *sb, u32 flags); /* migrate.c */ extern int ext4_ext_migrate(struct inode *); @@ -3480,14 +3534,8 @@ extern int ext4_try_to_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, struct page **pagep); -extern int ext4_write_inline_data_end(struct inode *inode, - loff_t pos, unsigned len, - unsigned copied, - struct page *page); -extern struct buffer_head * -ext4_journalled_write_inline_data(struct inode *inode, - unsigned len, - struct page *page); +int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, + unsigned copied, struct folio *folio); extern int ext4_da_write_inline_data_begin(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 35703dce23a3..e4115d338f10 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3123,7 +3123,7 @@ void ext4_ext_release(struct super_block *sb) #endif } -static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) +static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) { ext4_lblk_t ee_block; ext4_fsblk_t ee_pblock; @@ -3134,10 +3134,10 @@ static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) ee_pblock = ext4_ext_pblock(ex); if (ee_len == 0) - return 0; + return; - return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, - EXTENT_STATUS_WRITTEN); + ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, + EXTENT_STATUS_WRITTEN); } /* FIXME!! we need to try to merge to left or right after zero-out */ @@ -3287,7 +3287,7 @@ static int ext4_split_extent_at(handle_t *handle, err = ext4_ext_dirty(handle, inode, path + path->p_depth); if (!err) /* update extent status tree */ - err = ext4_zeroout_es(inode, &zero_ex); + ext4_zeroout_es(inode, &zero_ex); /* If we failed at this point, we don't know in which * state the extent tree exactly is so don't try to fix * length of the original extent as it may do even more @@ -3640,9 +3640,8 @@ fallback: out: /* If we have gotten a failure, don't zero out status tree */ if (!err) { - err = ext4_zeroout_es(inode, &zero_ex1); - if (!err) - err = ext4_zeroout_es(inode, &zero_ex2); + ext4_zeroout_es(inode, &zero_ex1); + ext4_zeroout_es(inode, &zero_ex2); } return err ? err : allocated; } @@ -4403,15 +4402,8 @@ int ext4_ext_truncate(handle_t *handle, struct inode *inode) last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); -retry: - err = ext4_es_remove_extent(inode, last_block, - EXT_MAX_BLOCKS - last_block); - if (err == -ENOMEM) { - memalloc_retry_wait(GFP_ATOMIC); - goto retry; - } - if (err) - return err; + ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); + retry_remove_space: err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); if (err == -ENOMEM) { @@ -5363,13 +5355,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode, 0); - - ret = ext4_es_remove_extent(inode, punch_start, - EXT_MAX_BLOCKS - punch_start); - if (ret) { - up_write(&EXT4_I(inode)->i_data_sem); - goto out_stop; - } + ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start); ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); if (ret) { @@ -5547,12 +5533,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) ext4_free_ext_path(path); } - ret = ext4_es_remove_extent(inode, offset_lblk, - EXT_MAX_BLOCKS - offset_lblk); - if (ret) { - up_write(&EXT4_I(inode)->i_data_sem); - goto out_stop; - } + ext4_es_remove_extent(inode, offset_lblk, EXT_MAX_BLOCKS - offset_lblk); /* * if offset_lblk lies in a hole which is at start of file, use @@ -5610,12 +5591,8 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, BUG_ON(!inode_is_locked(inode1)); BUG_ON(!inode_is_locked(inode2)); - *erp = ext4_es_remove_extent(inode1, lblk1, count); - if (unlikely(*erp)) - return 0; - *erp = ext4_es_remove_extent(inode2, lblk2, count); - if (unlikely(*erp)) - return 0; + ext4_es_remove_extent(inode1, lblk1, count); + ext4_es_remove_extent(inode2, lblk2, count); while (count) { struct ext4_extent *ex1, *ex2, tmp_ex; diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 595abb9e7d74..9b5b8951afb4 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -144,9 +144,11 @@ static struct kmem_cache *ext4_es_cachep; static struct kmem_cache *ext4_pending_cachep; -static int __es_insert_extent(struct inode *inode, struct extent_status *newes); +static int __es_insert_extent(struct inode *inode, struct extent_status *newes, + struct extent_status *prealloc); static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t end, int *reserved); + ext4_lblk_t end, int *reserved, + struct extent_status *prealloc); static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, struct ext4_inode_info *locked_ei); @@ -446,22 +448,36 @@ static void ext4_es_list_del(struct inode *inode) spin_unlock(&sbi->s_es_lock); } -static struct extent_status * -ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, - ext4_fsblk_t pblk) +/* + * Returns true if we cannot fail to allocate memory for this extent_status + * entry and cannot reclaim it until its status changes. + */ +static inline bool ext4_es_must_keep(struct extent_status *es) +{ + /* fiemap, bigalloc, and seek_data/hole need to use it. */ + if (ext4_es_is_delayed(es)) + return true; + + return false; +} + +static inline struct extent_status *__es_alloc_extent(bool nofail) +{ + if (!nofail) + return kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC); + + return kmem_cache_zalloc(ext4_es_cachep, GFP_KERNEL | __GFP_NOFAIL); +} + +static void ext4_es_init_extent(struct inode *inode, struct extent_status *es, + ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk) { - struct extent_status *es; - es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC); - if (es == NULL) - return NULL; es->es_lblk = lblk; es->es_len = len; es->es_pblk = pblk; - /* - * We don't count delayed extent because we never try to reclaim them - */ - if (!ext4_es_is_delayed(es)) { + /* We never try to reclaim a must kept extent, so we don't count it. */ + if (!ext4_es_must_keep(es)) { if (!EXT4_I(inode)->i_es_shk_nr++) ext4_es_list_add(inode); percpu_counter_inc(&EXT4_SB(inode->i_sb)-> @@ -470,8 +486,11 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, EXT4_I(inode)->i_es_all_nr++; percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); +} - return es; +static inline void __es_free_extent(struct extent_status *es) +{ + kmem_cache_free(ext4_es_cachep, es); } static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) @@ -479,8 +498,8 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) EXT4_I(inode)->i_es_all_nr--; percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); - /* Decrease the shrink counter when this es is not delayed */ - if (!ext4_es_is_delayed(es)) { + /* Decrease the shrink counter when we can reclaim the extent. */ + if (!ext4_es_must_keep(es)) { BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0); if (!--EXT4_I(inode)->i_es_shk_nr) ext4_es_list_del(inode); @@ -488,7 +507,7 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) s_es_stats.es_stats_shk_cnt); } - kmem_cache_free(ext4_es_cachep, es); + __es_free_extent(es); } /* @@ -749,7 +768,8 @@ static inline void ext4_es_insert_extent_check(struct inode *inode, } #endif -static int __es_insert_extent(struct inode *inode, struct extent_status *newes) +static int __es_insert_extent(struct inode *inode, struct extent_status *newes, + struct extent_status *prealloc) { struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; struct rb_node **p = &tree->root.rb_node; @@ -789,10 +809,15 @@ static int __es_insert_extent(struct inode *inode, struct extent_status *newes) } } - es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len, - newes->es_pblk); + if (prealloc) + es = prealloc; + else + es = __es_alloc_extent(false); if (!es) return -ENOMEM; + ext4_es_init_extent(inode, es, newes->es_lblk, newes->es_len, + newes->es_pblk); + rb_link_node(&es->rb_node, parent, p); rb_insert_color(&es->rb_node, &tree->root); @@ -804,26 +829,27 @@ out: /* * ext4_es_insert_extent() adds information to an inode's extent * status tree. - * - * Return 0 on success, error code on failure. */ -int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len, ext4_fsblk_t pblk, - unsigned int status) +void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status) { struct extent_status newes; ext4_lblk_t end = lblk + len - 1; - int err = 0; + int err1 = 0; + int err2 = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct extent_status *es1 = NULL; + struct extent_status *es2 = NULL; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) - return 0; + return; es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n", lblk, len, pblk, status, inode->i_ino); if (!len) - return 0; + return; BUG_ON(end < lblk); @@ -842,29 +868,40 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_es_insert_extent_check(inode, &newes); +retry: + if (err1 && !es1) + es1 = __es_alloc_extent(true); + if ((err1 || err2) && !es2) + es2 = __es_alloc_extent(true); write_lock(&EXT4_I(inode)->i_es_lock); - err = __es_remove_extent(inode, lblk, end, NULL); - if (err != 0) + + err1 = __es_remove_extent(inode, lblk, end, NULL, es1); + if (err1 != 0) + goto error; + + err2 = __es_insert_extent(inode, &newes, es2); + if (err2 == -ENOMEM && !ext4_es_must_keep(&newes)) + err2 = 0; + if (err2 != 0) goto error; -retry: - err = __es_insert_extent(inode, &newes); - if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb), - 128, EXT4_I(inode))) - goto retry; - if (err == -ENOMEM && !ext4_es_is_delayed(&newes)) - err = 0; if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) && (status & EXTENT_STATUS_WRITTEN || status & EXTENT_STATUS_UNWRITTEN)) __revise_pending(inode, lblk, len); + /* es is pre-allocated but not used, free it. */ + if (es1 && !es1->es_len) + __es_free_extent(es1); + if (es2 && !es2->es_len) + __es_free_extent(es2); error: write_unlock(&EXT4_I(inode)->i_es_lock); + if (err1 || err2) + goto retry; ext4_es_print_tree(inode); - - return err; + return; } /* @@ -897,7 +934,7 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk); if (!es || es->es_lblk > end) - __es_insert_extent(inode, &newes); + __es_insert_extent(inode, &newes, NULL); write_unlock(&EXT4_I(inode)->i_es_lock); } @@ -1287,6 +1324,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, * @lblk - first block in range * @end - last block in range * @reserved - number of cluster reservations released + * @prealloc - pre-allocated es to avoid memory allocation failures * * If @reserved is not NULL and delayed allocation is enabled, counts * block/cluster reservations freed by removing range and if bigalloc @@ -1294,7 +1332,8 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, * error code on failure. */ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t end, int *reserved) + ext4_lblk_t end, int *reserved, + struct extent_status *prealloc) { struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; struct rb_node *node; @@ -1302,14 +1341,12 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status orig_es; ext4_lblk_t len1, len2; ext4_fsblk_t block; - int err; + int err = 0; bool count_reserved = true; struct rsvd_count rc; if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC)) count_reserved = false; -retry: - err = 0; es = __es_tree_search(&tree->root, lblk); if (!es) @@ -1343,14 +1380,13 @@ retry: orig_es.es_len - len2; ext4_es_store_pblock_status(&newes, block, ext4_es_status(&orig_es)); - err = __es_insert_extent(inode, &newes); + err = __es_insert_extent(inode, &newes, prealloc); if (err) { + if (!ext4_es_must_keep(&newes)) + return 0; + es->es_lblk = orig_es.es_lblk; es->es_len = orig_es.es_len; - if ((err == -ENOMEM) && - __es_shrink(EXT4_SB(inode->i_sb), - 128, EXT4_I(inode))) - goto retry; goto out; } } else { @@ -1422,39 +1458,48 @@ out: * @len - number of blocks to remove * * Reduces block/cluster reservation count and for bigalloc cancels pending - * reservations as needed. Returns 0 on success, error code on failure. + * reservations as needed. */ -int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len) +void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) { ext4_lblk_t end; int err = 0; int reserved = 0; + struct extent_status *es = NULL; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) - return 0; + return; trace_ext4_es_remove_extent(inode, lblk, len); es_debug("remove [%u/%u) from extent status tree of inode %lu\n", lblk, len, inode->i_ino); if (!len) - return err; + return; end = lblk + len - 1; BUG_ON(end < lblk); +retry: + if (err && !es) + es = __es_alloc_extent(true); /* * ext4_clear_inode() depends on us taking i_es_lock unconditionally * so that we are sure __es_shrink() is done with the inode before it * is reclaimed. */ write_lock(&EXT4_I(inode)->i_es_lock); - err = __es_remove_extent(inode, lblk, end, &reserved); + err = __es_remove_extent(inode, lblk, end, &reserved, es); + if (es && !es->es_len) + __es_free_extent(es); write_unlock(&EXT4_I(inode)->i_es_lock); + if (err) + goto retry; + ext4_es_print_tree(inode); ext4_da_release_space(inode, reserved); - return err; + return; } static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, @@ -1702,11 +1747,8 @@ static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end, (*nr_to_scan)--; node = rb_next(&es->rb_node); - /* - * We can't reclaim delayed extent from status tree because - * fiemap, bigallic, and seek_data/hole need to use it. - */ - if (ext4_es_is_delayed(es)) + + if (ext4_es_must_keep(es)) goto next; if (ext4_es_is_referenced(es)) { ext4_es_clear_referenced(es); @@ -1770,7 +1812,7 @@ void ext4_clear_inode_es(struct inode *inode) while (node) { es = rb_entry(node, struct extent_status, rb_node); node = rb_next(node); - if (!ext4_es_is_delayed(es)) { + if (!ext4_es_must_keep(es)) { rb_erase(&es->rb_node, &tree->root); ext4_es_free_extent(inode, es); } @@ -1972,17 +2014,18 @@ bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk) * @lblk - logical block to be added * @allocated - indicates whether a physical cluster has been allocated for * the logical cluster that contains the block - * - * Returns 0 on success, negative error code on failure. */ -int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, - bool allocated) +void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, + bool allocated) { struct extent_status newes; - int err = 0; + int err1 = 0; + int err2 = 0; + struct extent_status *es1 = NULL; + struct extent_status *es2 = NULL; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) - return 0; + return; es_debug("add [%u/1) delayed to extent status tree of inode %lu\n", lblk, inode->i_ino); @@ -1994,29 +2037,37 @@ int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, ext4_es_insert_extent_check(inode, &newes); +retry: + if (err1 && !es1) + es1 = __es_alloc_extent(true); + if ((err1 || err2) && !es2) + es2 = __es_alloc_extent(true); write_lock(&EXT4_I(inode)->i_es_lock); - err = __es_remove_extent(inode, lblk, lblk, NULL); - if (err != 0) + err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1); + if (err1 != 0) goto error; -retry: - err = __es_insert_extent(inode, &newes); - if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb), - 128, EXT4_I(inode))) - goto retry; - if (err != 0) + + err2 = __es_insert_extent(inode, &newes, es2); + if (err2 != 0) goto error; if (allocated) __insert_pending(inode, lblk); + /* es is pre-allocated but not used, free it. */ + if (es1 && !es1->es_len) + __es_free_extent(es1); + if (es2 && !es2->es_len) + __es_free_extent(es2); error: write_unlock(&EXT4_I(inode)->i_es_lock); + if (err1 || err2) + goto retry; ext4_es_print_tree(inode); ext4_print_pending_tree(inode); - - return err; + return; } /* diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 4ec30a798260..d9847a4a25db 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -127,14 +127,14 @@ extern int __init ext4_init_es(void); extern void ext4_exit_es(void); extern void ext4_es_init_tree(struct ext4_es_tree *tree); -extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len, ext4_fsblk_t pblk, - unsigned int status); +extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status); extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status); -extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len); +extern void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); extern void ext4_es_find_extent_range(struct inode *inode, int (*match_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end, @@ -249,8 +249,8 @@ extern void ext4_exit_pending(void); extern void ext4_init_pending_tree(struct ext4_pending_tree *tree); extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); -extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, - bool allocated); +extern void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, + bool allocated); extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); extern void ext4_clear_inode_es(struct inode *inode); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index d101b3b0c7da..c457c8517f0f 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -147,6 +147,17 @@ static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) return generic_file_read_iter(iocb, to); } +static ssize_t ext4_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct inode *inode = file_inode(in); + + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + return -EIO; + return filemap_splice_read(in, ppos, pipe, len, flags); +} + /* * Called when an inode is released. Note that this is different * from ext4_file_open: open gets called at every open, but release @@ -285,18 +296,13 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, if (ret <= 0) goto out; - current->backing_dev_info = inode_to_bdi(inode); ret = generic_perform_write(iocb, from); - current->backing_dev_info = NULL; out: inode_unlock(inode); - if (likely(ret > 0)) { - iocb->ki_pos += ret; - ret = generic_write_sync(iocb, ret); - } - - return ret; + if (unlikely(ret <= 0)) + return ret; + return generic_write_sync(iocb, ret); } static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, @@ -444,13 +450,14 @@ static const struct iomap_dio_ops ext4_dio_write_ops = { */ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, bool *ilock_shared, bool *extend, - bool *unwritten) + bool *unwritten, int *dio_flags) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); loff_t offset; size_t count; ssize_t ret; + bool overwrite, unaligned_io; restart: ret = ext4_generic_write_checks(iocb, from); @@ -459,16 +466,20 @@ restart: offset = iocb->ki_pos; count = ret; - if (ext4_extending_io(inode, offset, count)) - *extend = true; + + unaligned_io = ext4_unaligned_io(inode, from, offset); + *extend = ext4_extending_io(inode, offset, count); + overwrite = ext4_overwrite_io(inode, offset, count, unwritten); + /* - * Determine whether the IO operation will overwrite allocated - * and initialized blocks. - * We need exclusive i_rwsem for changing security info - * in file_modified(). + * Determine whether we need to upgrade to an exclusive lock. This is + * required to change security info in file_modified(), for extending + * I/O, any form of non-overwrite I/O, and unaligned I/O to unwritten + * extents (as partial block zeroing may be required). */ - if (*ilock_shared && (!IS_NOSEC(inode) || *extend || - !ext4_overwrite_io(inode, offset, count, unwritten))) { + if (*ilock_shared && + ((!IS_NOSEC(inode) || *extend || !overwrite || + (unaligned_io && *unwritten)))) { if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; goto out; @@ -479,6 +490,32 @@ restart: goto restart; } + /* + * Now that locking is settled, determine dio flags and exclusivity + * requirements. Unaligned writes are allowed under shared lock so long + * as they are pure overwrites. Set the iomap overwrite only flag as an + * added precaution in this case. Even though this is unnecessary, we + * can detect and warn on unexpected -EAGAIN if an unsafe unaligned + * write is ever submitted. + * + * Otherwise, concurrent unaligned writes risk data corruption due to + * partial block zeroing in the dio layer, and so the I/O must occur + * exclusively. The inode lock is already held exclusive if the write is + * non-overwrite or extending, so drain all outstanding dio and set the + * force wait dio flag. + */ + if (*ilock_shared && unaligned_io) { + *dio_flags = IOMAP_DIO_OVERWRITE_ONLY; + } else if (!*ilock_shared && (unaligned_io || *extend)) { + if (iocb->ki_flags & IOCB_NOWAIT) { + ret = -EAGAIN; + goto out; + } + if (unaligned_io && (!overwrite || *unwritten)) + inode_dio_wait(inode); + *dio_flags = IOMAP_DIO_FORCE_WAIT; + } + ret = file_modified(file); if (ret < 0) goto out; @@ -500,18 +537,11 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) loff_t offset = iocb->ki_pos; size_t count = iov_iter_count(from); const struct iomap_ops *iomap_ops = &ext4_iomap_ops; - bool extend = false, unaligned_io = false, unwritten = false; + bool extend = false, unwritten = false; bool ilock_shared = true; + int dio_flags = 0; /* - * We initially start with shared inode lock unless it is - * unaligned IO which needs exclusive lock anyways. - */ - if (ext4_unaligned_io(inode, from, offset)) { - unaligned_io = true; - ilock_shared = false; - } - /* * Quick check here without any i_rwsem lock to see if it is extending * IO. A more reliable check is done in ext4_dio_write_checks() with * proper locking in place. @@ -543,16 +573,11 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) return ext4_buffered_write_iter(iocb, from); } - ret = ext4_dio_write_checks(iocb, from, - &ilock_shared, &extend, &unwritten); + ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend, + &unwritten, &dio_flags); if (ret <= 0) return ret; - /* if we're going to block and IOCB_NOWAIT is set, return -EAGAIN */ - if ((iocb->ki_flags & IOCB_NOWAIT) && (unaligned_io || extend)) { - ret = -EAGAIN; - goto out; - } /* * Make sure inline data cannot be created anymore since we are going * to allocate blocks for DIO. We know the inode does not have any @@ -563,19 +588,6 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) offset = iocb->ki_pos; count = ret; - /* - * Unaligned direct IO must be serialized among each other as zeroing - * of partial blocks of two competing unaligned IOs can result in data - * corruption. - * - * So we make sure we don't allow any unaligned IO in flight. - * For IOs where we need not wait (like unaligned non-AIO DIO), - * below inode_dio_wait() may anyway become a no-op, since we start - * with exclusive lock. - */ - if (unaligned_io) - inode_dio_wait(inode); - if (extend) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { @@ -595,8 +607,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ilock_shared && !unwritten) iomap_ops = &ext4_iomap_overwrite_ops; ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, - (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0, - NULL, 0); + dio_flags, NULL, 0); + WARN_ON_ONCE(ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)); if (ret == -ENOTBLK) ret = 0; @@ -957,7 +969,7 @@ const struct file_operations ext4_file_operations = { .release = ext4_release_file, .fsync = ext4_sync_file, .get_unmapped_area = thp_get_unmapped_area, - .splice_read = generic_file_splice_read, + .splice_read = ext4_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ext4_fallocate, }; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 2a143209aa0c..0c56f3a011a1 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -28,6 +28,7 @@ #include <linux/sched.h> #include <linux/writeback.h> #include <linux/blkdev.h> +#include <linux/buffer_head.h> #include "ext4.h" #include "ext4_jbd2.h" @@ -78,21 +79,13 @@ static int ext4_sync_parent(struct inode *inode) return ret; } -static int ext4_fsync_nojournal(struct inode *inode, bool datasync, - bool *needs_barrier) +static int ext4_fsync_nojournal(struct file *file, loff_t start, loff_t end, + int datasync, bool *needs_barrier) { - int ret, err; - - ret = sync_mapping_buffers(inode->i_mapping); - if (!(inode->i_state & I_DIRTY_ALL)) - return ret; - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - return ret; - - err = sync_inode_metadata(inode, 1); - if (!ret) - ret = err; + struct inode *inode = file->f_inode; + int ret; + ret = generic_buffers_fsync_noflush(file, start, end, datasync); if (!ret) ret = ext4_sync_parent(inode); if (test_opt(inode->i_sb, BARRIER)) @@ -155,6 +148,14 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; } + if (!sbi->s_journal) { + ret = ext4_fsync_nojournal(file, start, end, datasync, + &needs_barrier); + if (needs_barrier) + goto issue_flush; + goto out; + } + ret = file_write_and_wait_range(file, start, end); if (ret) goto out; @@ -164,11 +165,9 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * Metadata is in the journal, we wait for proper transaction to * commit here. */ - if (!sbi->s_journal) - ret = ext4_fsync_nojournal(inode, datasync, &needs_barrier); - else - ret = ext4_fsync_journal(inode, datasync, &needs_barrier); + ret = ext4_fsync_journal(inode, datasync, &needs_barrier); +issue_flush: if (needs_barrier) { err = blkdev_issue_flush(inode->i_sb->s_bdev); if (!ret) diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index c68bebe7ff4b..a9f3716119d3 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -651,6 +651,14 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, ext4_update_inode_fsync_trans(handle, inode, 1); count = ar.len; + + /* + * Update reserved blocks/metadata blocks after successful block + * allocation which had been deferred till now. + */ + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ext4_da_update_reserve_space(inode, count, 1); + got_it: map->m_flags |= EXT4_MAP_MAPPED; map->m_pblk = le32_to_cpu(chain[depth-1].key); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 5854bd5a3352..a4b7e4bc32d4 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -741,9 +741,8 @@ convert: } int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, - unsigned copied, struct page *page) + unsigned copied, struct folio *folio) { - struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); int no_expand; void *kaddr; @@ -823,30 +822,6 @@ out: return ret ? ret : copied; } -struct buffer_head * -ext4_journalled_write_inline_data(struct inode *inode, - unsigned len, - struct page *page) -{ - int ret, no_expand; - void *kaddr; - struct ext4_iloc iloc; - - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) { - ext4_std_error(inode->i_sb, ret); - return NULL; - } - - ext4_write_lock_xattr(inode, &no_expand); - kaddr = kmap_atomic(page); - ext4_write_inline_data(inode, &iloc, kaddr, 0, len); - kunmap_atomic(kaddr); - ext4_write_unlock_xattr(inode, &no_expand); - - return iloc.bh; -} - /* * Try to make the page cache and handle ready for the inline data case. * We can call this function in 2 cases: @@ -1964,16 +1939,8 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline) * the extent status cache must be cleared to avoid leaving * behind stale delayed allocated extent entries */ - if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { -retry: - err = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); - if (err == -ENOMEM) { - memalloc_retry_wait(GFP_ATOMIC); - goto retry; - } - if (err) - goto out_error; - } + if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) + ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); /* Clear the content in the xattr space. */ if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 02de439bf1f0..43775a6ca505 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -567,10 +567,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, map->m_lblk + map->m_len - 1)) status |= EXTENT_STATUS_DELAYED; - ret = ext4_es_insert_extent(inode, map->m_lblk, - map->m_len, map->m_pblk, status); - if (ret < 0) - retval = ret; + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status); } up_read((&EXT4_I(inode)->i_data_sem)); @@ -632,16 +630,6 @@ found: */ ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); } - - /* - * Update reserved blocks/metadata blocks after successful - * block allocation which had been deferred till now. We don't - * support fallocate for non extent files. So we can update - * reserve space here. - */ - if ((retval > 0) && - (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) - ext4_da_update_reserve_space(inode, retval, 1); } if (retval > 0) { @@ -689,12 +677,8 @@ found: ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, map->m_lblk + map->m_len - 1)) status |= EXTENT_STATUS_DELAYED; - ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status); - if (ret < 0) { - retval = ret; - goto out_sem; - } + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status); } out_sem: @@ -1093,7 +1077,7 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, err = -EIO; } if (unlikely(err)) { - page_zero_new_buffers(&folio->page, from, to); + folio_zero_new_buffers(folio, from, to); } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) { for (i = 0; i < nr_wait; i++) { int err2; @@ -1287,7 +1271,8 @@ static int ext4_write_end(struct file *file, if (ext4_has_inline_data(inode) && ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) - return ext4_write_inline_data_end(inode, pos, len, copied, page); + return ext4_write_inline_data_end(inode, pos, len, copied, + folio); copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); /* @@ -1339,7 +1324,7 @@ static int ext4_write_end(struct file *file, } /* - * This is a private version of page_zero_new_buffers() which doesn't + * This is a private version of folio_zero_new_buffers() which doesn't * set the buffer to be dirty, since in data=journalled mode we need * to call ext4_dirty_journalled_data() instead. */ @@ -1395,7 +1380,8 @@ static int ext4_journalled_write_end(struct file *file, BUG_ON(!ext4_handle_valid(handle)); if (ext4_has_inline_data(inode)) - return ext4_write_inline_data_end(inode, pos, len, copied, page); + return ext4_write_inline_data_end(inode, pos, len, copied, + folio); if (unlikely(copied < len) && !folio_test_uptodate(folio)) { copied = 0; @@ -1638,7 +1624,6 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int ret; bool allocated = false; - bool reserved = false; /* * If the cluster containing lblk is shared with a delayed, @@ -1654,8 +1639,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) if (sbi->s_cluster_ratio == 1) { ret = ext4_da_reserve_space(inode); if (ret != 0) /* ENOSPC */ - goto errout; - reserved = true; + return ret; } else { /* bigalloc */ if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) { if (!ext4_es_scan_clu(inode, @@ -1663,12 +1647,11 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk)); if (ret < 0) - goto errout; + return ret; if (ret == 0) { ret = ext4_da_reserve_space(inode); if (ret != 0) /* ENOSPC */ - goto errout; - reserved = true; + return ret; } else { allocated = true; } @@ -1678,12 +1661,8 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) } } - ret = ext4_es_insert_delayed_block(inode, lblk, allocated); - if (ret && reserved) - ext4_da_release_space(inode, 1); - -errout: - return ret; + ext4_es_insert_delayed_block(inode, lblk, allocated); + return 0; } /* @@ -1780,7 +1759,6 @@ add_delayed: set_buffer_new(bh); set_buffer_delay(bh); } else if (retval > 0) { - int ret; unsigned int status; if (unlikely(retval != map->m_len)) { @@ -1793,10 +1771,8 @@ add_delayed: status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status); - if (ret != 0) - retval = ret; + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status); } out_unlock: @@ -2321,11 +2297,11 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); } -static int ext4_journal_page_buffers(handle_t *handle, struct page *page, - int len) +static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio, + size_t len) { - struct buffer_head *page_bufs = page_buffers(page); - struct inode *inode = page->mapping->host; + struct buffer_head *page_bufs = folio_buffers(folio); + struct inode *inode = folio->mapping->host; int ret, err; ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, @@ -2334,7 +2310,7 @@ static int ext4_journal_page_buffers(handle_t *handle, struct page *page, NULL, write_end_fn); if (ret == 0) ret = err; - err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len); + err = ext4_jbd2_inode_add_write(handle, inode, folio_pos(folio), len); if (ret == 0) ret = err; EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; @@ -2344,22 +2320,20 @@ static int ext4_journal_page_buffers(handle_t *handle, struct page *page, static int mpage_journal_page_buffers(handle_t *handle, struct mpage_da_data *mpd, - struct page *page) + struct folio *folio) { struct inode *inode = mpd->inode; loff_t size = i_size_read(inode); - int len; + size_t len = folio_size(folio); - ClearPageChecked(page); + folio_clear_checked(folio); mpd->wbc->nr_to_write--; - if (page->index == size >> PAGE_SHIFT && + if (folio_pos(folio) + len > size && !ext4_verity_in_progress(inode)) - len = size & ~PAGE_MASK; - else - len = PAGE_SIZE; + len = size - folio_pos(folio); - return ext4_journal_page_buffers(handle, page, len); + return ext4_journal_folio_buffers(handle, folio, len); } /* @@ -2499,7 +2473,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) /* Pending dirtying of journalled data? */ if (folio_test_checked(folio)) { err = mpage_journal_page_buffers(handle, - mpd, &folio->page); + mpd, folio); if (err < 0) goto out; mpd->journalled_more_data = 1; @@ -2944,15 +2918,15 @@ retry: * Check if we should update i_disksize * when write to the end of file but not require block allocation */ -static int ext4_da_should_update_i_disksize(struct page *page, +static int ext4_da_should_update_i_disksize(struct folio *folio, unsigned long offset) { struct buffer_head *bh; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; unsigned int idx; int i; - bh = page_buffers(page); + bh = folio_buffers(folio); idx = offset >> inode->i_blkbits; for (i = 0; i < idx; i++) @@ -2972,17 +2946,19 @@ static int ext4_da_write_end(struct file *file, loff_t new_i_size; unsigned long start, end; int write_mode = (int)(unsigned long)fsdata; + struct folio *folio = page_folio(page); if (write_mode == FALL_BACK_TO_NONDELALLOC) return ext4_write_end(file, mapping, pos, - len, copied, page, fsdata); + len, copied, &folio->page, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); if (write_mode != CONVERT_INLINE_DATA && ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && ext4_has_inline_data(inode)) - return ext4_write_inline_data_end(inode, pos, len, copied, page); + return ext4_write_inline_data_end(inode, pos, len, copied, + folio); if (unlikely(copied < len) && !PageUptodate(page)) copied = 0; @@ -3006,10 +2982,11 @@ static int ext4_da_write_end(struct file *file, */ new_i_size = pos + copied; if (copied && new_i_size > inode->i_size && - ext4_da_should_update_i_disksize(page, end)) + ext4_da_should_update_i_disksize(folio, end)) ext4_update_i_disksize(inode, new_i_size); - return generic_write_end(file, mapping, pos, len, copied, page, fsdata); + return generic_write_end(file, mapping, pos, len, copied, &folio->page, + fsdata); } /* @@ -3105,7 +3082,7 @@ static int ext4_read_folio(struct file *file, struct folio *folio) int ret = -EAGAIN; struct inode *inode = folio->mapping->host; - trace_ext4_readpage(&folio->page); + trace_ext4_read_folio(inode, folio); if (ext4_has_inline_data(inode)) ret = ext4_readpage_inline(inode, folio); @@ -3164,9 +3141,10 @@ static void ext4_journalled_invalidate_folio(struct folio *folio, static bool ext4_release_folio(struct folio *folio, gfp_t wait) { - journal_t *journal = EXT4_JOURNAL(folio->mapping->host); + struct inode *inode = folio->mapping->host; + journal_t *journal = EXT4_JOURNAL(inode); - trace_ext4_releasepage(&folio->page); + trace_ext4_release_folio(inode, folio); /* Page has dirty journalled data -> cannot release */ if (folio_test_checked(folio)) @@ -3992,12 +3970,8 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode, 0); - ret = ext4_es_remove_extent(inode, first_block, - stop_block - first_block); - if (ret) { - up_write(&EXT4_I(inode)->i_data_sem); - goto out_stop; - } + ext4_es_remove_extent(inode, first_block, + stop_block - first_block); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_remove_space(inode, first_block, @@ -6156,7 +6130,7 @@ retry_alloc: err = __block_write_begin(&folio->page, 0, len, ext4_get_block); if (!err) { ret = VM_FAULT_SIGBUS; - if (ext4_journal_page_buffers(handle, &folio->page, len)) + if (ext4_journal_folio_buffers(handle, folio, len)) goto out_error; } else { folio_unlock(folio); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index f9a430152063..331859511f80 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -793,16 +793,10 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) } #endif -static int ext4_shutdown(struct super_block *sb, unsigned long arg) +int ext4_force_shutdown(struct super_block *sb, u32 flags) { struct ext4_sb_info *sbi = EXT4_SB(sb); - __u32 flags; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (get_user(flags, (__u32 __user *)arg)) - return -EFAULT; + int ret; if (flags > EXT4_GOING_FLAGS_NOLOGFLUSH) return -EINVAL; @@ -815,7 +809,9 @@ static int ext4_shutdown(struct super_block *sb, unsigned long arg) switch (flags) { case EXT4_GOING_FLAGS_DEFAULT: - freeze_bdev(sb->s_bdev); + ret = freeze_bdev(sb->s_bdev); + if (ret) + return ret; set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); thaw_bdev(sb->s_bdev); break; @@ -838,6 +834,19 @@ static int ext4_shutdown(struct super_block *sb, unsigned long arg) return 0; } +static int ext4_ioctl_shutdown(struct super_block *sb, unsigned long arg) +{ + u32 flags; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(flags, (__u32 __user *)arg)) + return -EFAULT; + + return ext4_force_shutdown(sb, flags); +} + struct getfsmap_info { struct super_block *gi_sb; struct fsmap_head __user *gi_data; @@ -1566,7 +1575,7 @@ resizefs_out: return ext4_ioctl_get_es_cache(filp, arg); case EXT4_IOC_SHUTDOWN: - return ext4_shutdown(sb, arg); + return ext4_ioctl_shutdown(sb, arg); case FS_IOC_ENABLE_VERITY: if (!ext4_has_feature_verity(sb)) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 20f67a260df5..a2475b8c9fb5 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -154,19 +154,31 @@ * structures to decide the order in which groups are to be traversed for * fulfilling an allocation request. * - * At CR = 0, we look for groups which have the largest_free_order >= the order - * of the request. We directly look at the largest free order list in the data - * structure (1) above where largest_free_order = order of the request. If that - * list is empty, we look at remaining list in the increasing order of - * largest_free_order. This allows us to perform CR = 0 lookup in O(1) time. + * At CR_POWER2_ALIGNED , we look for groups which have the largest_free_order + * >= the order of the request. We directly look at the largest free order list + * in the data structure (1) above where largest_free_order = order of the + * request. If that list is empty, we look at remaining list in the increasing + * order of largest_free_order. This allows us to perform CR_POWER2_ALIGNED + * lookup in O(1) time. * - * At CR = 1, we only consider groups where average fragment size > request - * size. So, we lookup a group which has average fragment size just above or - * equal to request size using our average fragment size group lists (data - * structure 2) in O(1) time. + * At CR_GOAL_LEN_FAST, we only consider groups where + * average fragment size > request size. So, we lookup a group which has average + * fragment size just above or equal to request size using our average fragment + * size group lists (data structure 2) in O(1) time. + * + * At CR_BEST_AVAIL_LEN, we aim to optimize allocations which can't be satisfied + * in CR_GOAL_LEN_FAST. The fact that we couldn't find a group in + * CR_GOAL_LEN_FAST suggests that there is no BG that has avg + * fragment size > goal length. So before falling to the slower + * CR_GOAL_LEN_SLOW, in CR_BEST_AVAIL_LEN we proactively trim goal length and + * then use the same fragment lists as CR_GOAL_LEN_FAST to find a BG with a big + * enough average fragment size. This increases the chances of finding a + * suitable block group in O(1) time and results in faster allocation at the + * cost of reduced size of allocation. * * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in - * linear order which requires O(N) search time for each CR 0 and CR 1 phase. + * linear order which requires O(N) search time for each CR_POWER2_ALIGNED and + * CR_GOAL_LEN_FAST phase. * * The regular allocator (using the buddy cache) supports a few tunables. * @@ -351,8 +363,8 @@ * - bitlock on a group (group) * - object (inode/locality) (object) * - per-pa lock (pa) - * - cr0 lists lock (cr0) - * - cr1 tree lock (cr1) + * - cr_power2_aligned lists lock (cr_power2_aligned) + * - cr_goal_len_fast lists lock (cr_goal_len_fast) * * Paths: * - new pa @@ -384,7 +396,7 @@ * * - allocation path (ext4_mb_regular_allocator) * group - * cr0/cr1 + * cr_power2_aligned/cr_goal_len_fast */ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; @@ -409,7 +421,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); static bool ext4_mb_good_group(struct ext4_allocation_context *ac, - ext4_group_t group, int cr); + ext4_group_t group, enum criteria cr); static int ext4_try_to_trim_range(struct super_block *sb, struct ext4_buddy *e4b, ext4_grpblk_t start, @@ -858,8 +870,8 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) * Choose next group by traversing largest_free_order lists. Updates *new_cr if * cr level needs an update. */ -static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, - int *new_cr, ext4_group_t *group, ext4_group_t ngroups) +static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context *ac, + enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *iter, *grp; @@ -868,8 +880,8 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, if (ac->ac_status == AC_STATUS_FOUND) return; - if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR0_OPTIMIZED)) - atomic_inc(&sbi->s_bal_cr0_bad_suggestions); + if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED)) + atomic_inc(&sbi->s_bal_p2_aligned_bad_suggestions); grp = NULL; for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { @@ -884,8 +896,8 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i], bb_largest_free_order_node) { if (sbi->s_mb_stats) - atomic64_inc(&sbi->s_bal_cX_groups_considered[0]); - if (likely(ext4_mb_good_group(ac, iter->bb_group, 0))) { + atomic64_inc(&sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]); + if (likely(ext4_mb_good_group(ac, iter->bb_group, CR_POWER2_ALIGNED))) { grp = iter; break; } @@ -897,57 +909,155 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, if (!grp) { /* Increment cr and search again */ - *new_cr = 1; + *new_cr = CR_GOAL_LEN_FAST; } else { *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED; + ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED; } } /* + * Find a suitable group of given order from the average fragments list. + */ +static struct ext4_group_info * +ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int order) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct list_head *frag_list = &sbi->s_mb_avg_fragment_size[order]; + rwlock_t *frag_list_lock = &sbi->s_mb_avg_fragment_size_locks[order]; + struct ext4_group_info *grp = NULL, *iter; + enum criteria cr = ac->ac_criteria; + + if (list_empty(frag_list)) + return NULL; + read_lock(frag_list_lock); + if (list_empty(frag_list)) { + read_unlock(frag_list_lock); + return NULL; + } + list_for_each_entry(iter, frag_list, bb_avg_fragment_size_node) { + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]); + if (likely(ext4_mb_good_group(ac, iter->bb_group, cr))) { + grp = iter; + break; + } + } + read_unlock(frag_list_lock); + return grp; +} + +/* * Choose next group by traversing average fragment size list of suitable * order. Updates *new_cr if cr level needs an update. */ -static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, - int *new_cr, ext4_group_t *group, ext4_group_t ngroups) +static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *ac, + enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_group_info *grp = NULL, *iter; + struct ext4_group_info *grp = NULL; int i; - if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) { + if (unlikely(ac->ac_flags & EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED)) { if (sbi->s_mb_stats) - atomic_inc(&sbi->s_bal_cr1_bad_suggestions); + atomic_inc(&sbi->s_bal_goal_fast_bad_suggestions); } for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); i < MB_NUM_ORDERS(ac->ac_sb); i++) { - if (list_empty(&sbi->s_mb_avg_fragment_size[i])) - continue; - read_lock(&sbi->s_mb_avg_fragment_size_locks[i]); - if (list_empty(&sbi->s_mb_avg_fragment_size[i])) { - read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); - continue; - } - list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i], - bb_avg_fragment_size_node) { - if (sbi->s_mb_stats) - atomic64_inc(&sbi->s_bal_cX_groups_considered[1]); - if (likely(ext4_mb_good_group(ac, iter->bb_group, 1))) { - grp = iter; - break; - } + grp = ext4_mb_find_good_group_avg_frag_lists(ac, i); + if (grp) + break; + } + + if (grp) { + *group = grp->bb_group; + ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED; + } else { + *new_cr = CR_BEST_AVAIL_LEN; + } +} + +/* + * We couldn't find a group in CR_GOAL_LEN_FAST so try to find the highest free fragment + * order we have and proactively trim the goal request length to that order to + * find a suitable group faster. + * + * This optimizes allocation speed at the cost of slightly reduced + * preallocations. However, we make sure that we don't trim the request too + * much and fall to CR_GOAL_LEN_SLOW in that case. + */ +static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context *ac, + enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_group_info *grp = NULL; + int i, order, min_order; + unsigned long num_stripe_clusters = 0; + + if (unlikely(ac->ac_flags & EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED)) { + if (sbi->s_mb_stats) + atomic_inc(&sbi->s_bal_best_avail_bad_suggestions); + } + + /* + * mb_avg_fragment_size_order() returns order in a way that makes + * retrieving back the length using (1 << order) inaccurate. Hence, use + * fls() instead since we need to know the actual length while modifying + * goal length. + */ + order = fls(ac->ac_g_ex.fe_len); + min_order = order - sbi->s_mb_best_avail_max_trim_order; + if (min_order < 0) + min_order = 0; + + if (1 << min_order < ac->ac_o_ex.fe_len) + min_order = fls(ac->ac_o_ex.fe_len) + 1; + + if (sbi->s_stripe > 0) { + /* + * We are assuming that stripe size is always a multiple of + * cluster ratio otherwise __ext4_fill_super exists early. + */ + num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe); + if (1 << min_order < num_stripe_clusters) + min_order = fls(num_stripe_clusters); + } + + for (i = order; i >= min_order; i--) { + int frag_order; + /* + * Scale down goal len to make sure we find something + * in the free fragments list. Basically, reduce + * preallocations. + */ + ac->ac_g_ex.fe_len = 1 << i; + + if (num_stripe_clusters > 0) { + /* + * Try to round up the adjusted goal length to + * stripe size (in cluster units) multiple for + * efficiency. + */ + ac->ac_g_ex.fe_len = roundup(ac->ac_g_ex.fe_len, + num_stripe_clusters); } - read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); + + frag_order = mb_avg_fragment_size_order(ac->ac_sb, + ac->ac_g_ex.fe_len); + + grp = ext4_mb_find_good_group_avg_frag_lists(ac, frag_order); if (grp) break; } if (grp) { *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; + ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED; } else { - *new_cr = 2; + /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */ + ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; + *new_cr = CR_GOAL_LEN_SLOW; } } @@ -955,7 +1065,7 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac) { if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN))) return 0; - if (ac->ac_criteria >= 2) + if (ac->ac_criteria >= CR_GOAL_LEN_SLOW) return 0; if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) return 0; @@ -1000,7 +1110,7 @@ inc_and_return: * @ngroups Total number of groups */ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, - int *new_cr, ext4_group_t *group, ext4_group_t ngroups) + enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) { *new_cr = ac->ac_criteria; @@ -1009,10 +1119,12 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, return; } - if (*new_cr == 0) { - ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups); - } else if (*new_cr == 1) { - ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups); + if (*new_cr == CR_POWER2_ALIGNED) { + ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group, ngroups); + } else if (*new_cr == CR_GOAL_LEN_FAST) { + ext4_mb_choose_next_group_goal_fast(ac, new_cr, group, ngroups); + } else if (*new_cr == CR_BEST_AVAIL_LEN) { + ext4_mb_choose_next_group_best_avail(ac, new_cr, group, ngroups); } else { /* * TODO: For CR=2, we can arrange groups in an rb tree sorted by @@ -2103,6 +2215,7 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); ac->ac_found++; + ac->ac_cX_found[ac->ac_criteria]++; /* * The special case - take what you catch first @@ -2207,11 +2320,11 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, ac->ac_g_ex.fe_len, &ex); ex.fe_logical = 0xDEADFA11; /* debug value */ - if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { + if (max >= ac->ac_g_ex.fe_len && + ac->ac_g_ex.fe_len == EXT4_B2C(sbi, sbi->s_stripe)) { ext4_fsblk_t start; - start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) + - ex.fe_start; + start = ext4_grp_offs_to_block(ac->ac_sb, &ex); /* use do_div to get remainder (would be 64-bit modulo) */ if (do_div(start, sbi->s_stripe) == 0) { ac->ac_found++; @@ -2277,6 +2390,7 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, break; } ac->ac_found++; + ac->ac_cX_found[ac->ac_criteria]++; ac->ac_b_ex.fe_len = 1 << i; ac->ac_b_ex.fe_start = k << i; @@ -2305,7 +2419,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, struct super_block *sb = ac->ac_sb; void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; - int i; + int i, j, freelen; int free; free = e4b->bd_info->bb_free; @@ -2332,6 +2446,24 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, break; } + if (ac->ac_criteria < CR_FAST) { + /* + * In CR_GOAL_LEN_FAST and CR_BEST_AVAIL_LEN, we are + * sure that this group will have a large enough + * continuous free extent, so skip over the smaller free + * extents + */ + j = mb_find_next_bit(bitmap, + EXT4_CLUSTERS_PER_GROUP(sb), i); + freelen = j - i; + + if (freelen < ac->ac_g_ex.fe_len) { + i = j; + free -= freelen; + continue; + } + } + mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex); if (WARN_ON(ex.fe_len <= 0)) break; @@ -2373,7 +2505,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, struct ext4_free_extent ex; ext4_fsblk_t first_group_block; ext4_fsblk_t a; - ext4_grpblk_t i; + ext4_grpblk_t i, stripe; int max; BUG_ON(sbi->s_stripe == 0); @@ -2385,18 +2517,21 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, do_div(a, sbi->s_stripe); i = (a * sbi->s_stripe) - first_group_block; + stripe = EXT4_B2C(sbi, sbi->s_stripe); + i = EXT4_B2C(sbi, i); while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { if (!mb_test_bit(i, bitmap)) { - max = mb_find_extent(e4b, i, sbi->s_stripe, &ex); - if (max >= sbi->s_stripe) { + max = mb_find_extent(e4b, i, stripe, &ex); + if (max >= stripe) { ac->ac_found++; + ac->ac_cX_found[ac->ac_criteria]++; ex.fe_logical = 0xDEADF00D; /* debug value */ ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); break; } } - i += sbi->s_stripe; + i += stripe; } } @@ -2406,13 +2541,13 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, * for the allocation or not. */ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, - ext4_group_t group, int cr) + ext4_group_t group, enum criteria cr) { ext4_grpblk_t free, fragments; int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); - BUG_ON(cr < 0 || cr >= 4); + BUG_ON(cr < CR_POWER2_ALIGNED || cr >= EXT4_MB_NUM_CRS); if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp) || !grp)) return false; @@ -2426,7 +2561,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, return false; switch (cr) { - case 0: + case CR_POWER2_ALIGNED: BUG_ON(ac->ac_2order == 0); /* Avoid using the first bg of a flexgroup for data files */ @@ -2445,15 +2580,16 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, return false; return true; - case 1: + case CR_GOAL_LEN_FAST: + case CR_BEST_AVAIL_LEN: if ((free / fragments) >= ac->ac_g_ex.fe_len) return true; break; - case 2: + case CR_GOAL_LEN_SLOW: if (free >= ac->ac_g_ex.fe_len) return true; break; - case 3: + case CR_ANY_FREE: return true; default: BUG(); @@ -2474,7 +2610,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, * out"! */ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, - ext4_group_t group, int cr) + ext4_group_t group, enum criteria cr) { struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); struct super_block *sb = ac->ac_sb; @@ -2494,7 +2630,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, free = grp->bb_free; if (free == 0) goto out; - if (cr <= 2 && free < ac->ac_g_ex.fe_len) + if (cr <= CR_FAST && free < ac->ac_g_ex.fe_len) goto out; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) goto out; @@ -2509,15 +2645,16 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, ext4_get_group_desc(sb, group, NULL); int ret; - /* cr=0/1 is a very optimistic search to find large - * good chunks almost for free. If buddy data is not - * ready, then this optimization makes no sense. But - * we never skip the first block group in a flex_bg, - * since this gets used for metadata block allocation, - * and we want to make sure we locate metadata blocks - * in the first block group in the flex_bg if possible. + /* + * cr=CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic + * search to find large good chunks almost for free. If buddy + * data is not ready, then this optimization makes no sense. But + * we never skip the first block group in a flex_bg, since this + * gets used for metadata block allocation, and we want to make + * sure we locate metadata blocks in the first block group in + * the flex_bg if possible. */ - if (cr < 2 && + if (cr < CR_FAST && (!sbi->s_log_groups_per_flex || ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) && !(ext4_has_group_desc_csum(sb) && @@ -2567,9 +2704,7 @@ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, */ if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) && EXT4_MB_GRP_NEED_INIT(grp) && - ext4_free_group_clusters(sb, gdp) > 0 && - !(ext4_has_group_desc_csum(sb) && - (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) { + ext4_free_group_clusters(sb, gdp) > 0 ) { bh = ext4_read_block_bitmap_nowait(sb, group, true); if (bh && !IS_ERR(bh)) { if (!buffer_uptodate(bh) && cnt) @@ -2610,9 +2745,7 @@ void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, grp = ext4_get_group_info(sb, group); if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) && - ext4_free_group_clusters(sb, gdp) > 0 && - !(ext4_has_group_desc_csum(sb) && - (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) { + ext4_free_group_clusters(sb, gdp) > 0) { if (ext4_mb_init_group(sb, group, GFP_NOFS)) break; } @@ -2623,7 +2756,7 @@ static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { ext4_group_t prefetch_grp = 0, ngroups, group, i; - int cr = -1, new_cr; + enum criteria new_cr, cr = CR_GOAL_LEN_FAST; int err = 0, first_err = 0; unsigned int nr = 0, prefetch_ios = 0; struct ext4_sb_info *sbi; @@ -2680,14 +2813,15 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) spin_unlock(&sbi->s_md_lock); } - /* Let's just scan groups to find more-less suitable blocks */ - cr = ac->ac_2order ? 0 : 1; /* - * cr == 0 try to get exact allocation, - * cr == 3 try to get anything + * Let's just scan groups to find more-less suitable blocks We + * start with CR_GOAL_LEN_FAST, unless it is power of 2 + * aligned, in which case let's do that faster approach first. */ + if (ac->ac_2order) + cr = CR_POWER2_ALIGNED; repeat: - for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { + for (; cr < EXT4_MB_NUM_CRS && ac->ac_status == AC_STATUS_CONTINUE; cr++) { ac->ac_criteria = cr; /* * searching for the right group start @@ -2714,10 +2848,8 @@ repeat: * spend a lot of time loading imperfect groups */ if ((prefetch_grp == group) && - (cr > 1 || + (cr >= CR_FAST || prefetch_ios < sbi->s_mb_prefetch_limit)) { - unsigned int curr_ios = prefetch_ios; - nr = sbi->s_mb_prefetch; if (ext4_has_feature_flex_bg(sb)) { nr = 1 << sbi->s_log_groups_per_flex; @@ -2726,8 +2858,6 @@ repeat: } prefetch_grp = ext4_mb_prefetch(sb, group, nr, &prefetch_ios); - if (prefetch_ios == curr_ios) - nr = 0; } /* This now checks without needing the buddy page */ @@ -2756,10 +2886,13 @@ repeat: } ac->ac_groups_scanned++; - if (cr == 0) + if (cr == CR_POWER2_ALIGNED) ext4_mb_simple_scan_group(ac, &e4b); - else if (cr == 1 && sbi->s_stripe && - !(ac->ac_g_ex.fe_len % sbi->s_stripe)) + else if ((cr == CR_GOAL_LEN_FAST || + cr == CR_BEST_AVAIL_LEN) && + sbi->s_stripe && + !(ac->ac_g_ex.fe_len % + EXT4_B2C(sbi, sbi->s_stripe))) ext4_mb_scan_aligned(ac, &e4b); else ext4_mb_complex_scan_group(ac, &e4b); @@ -2773,6 +2906,11 @@ repeat: /* Processed all groups and haven't found blocks */ if (sbi->s_mb_stats && i == ngroups) atomic64_inc(&sbi->s_bal_cX_failed[cr]); + + if (i == ngroups && ac->ac_criteria == CR_BEST_AVAIL_LEN) + /* Reset goal length to original goal length before + * falling into CR_GOAL_LEN_SLOW */ + ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; } if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && @@ -2798,7 +2936,7 @@ repeat: ac->ac_b_ex.fe_len = 0; ac->ac_status = AC_STATUS_CONTINUE; ac->ac_flags |= EXT4_MB_HINT_FIRST; - cr = 3; + cr = CR_ANY_FREE; goto repeat; } } @@ -2914,51 +3052,94 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) seq_puts(seq, "mballoc:\n"); if (!sbi->s_mb_stats) { seq_puts(seq, "\tmb stats collection turned off.\n"); - seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n"); + seq_puts( + seq, + "\tTo enable, please write \"1\" to sysfs file mb_stats.\n"); return 0; } seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs)); seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success)); - seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned)); - - seq_puts(seq, "\tcr0_stats:\n"); - seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0])); - seq_printf(seq, "\t\tgroups_considered: %llu\n", - atomic64_read(&sbi->s_bal_cX_groups_considered[0])); + seq_printf(seq, "\tgroups_scanned: %u\n", + atomic_read(&sbi->s_bal_groups_scanned)); + + /* CR_POWER2_ALIGNED stats */ + seq_puts(seq, "\tcr_p2_aligned_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", + atomic64_read(&sbi->s_bal_cX_hits[CR_POWER2_ALIGNED])); + seq_printf( + seq, "\t\tgroups_considered: %llu\n", + atomic64_read( + &sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED])); + seq_printf(seq, "\t\textents_scanned: %u\n", + atomic_read(&sbi->s_bal_cX_ex_scanned[CR_POWER2_ALIGNED])); seq_printf(seq, "\t\tuseless_loops: %llu\n", - atomic64_read(&sbi->s_bal_cX_failed[0])); + atomic64_read(&sbi->s_bal_cX_failed[CR_POWER2_ALIGNED])); seq_printf(seq, "\t\tbad_suggestions: %u\n", - atomic_read(&sbi->s_bal_cr0_bad_suggestions)); + atomic_read(&sbi->s_bal_p2_aligned_bad_suggestions)); - seq_puts(seq, "\tcr1_stats:\n"); - seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1])); + /* CR_GOAL_LEN_FAST stats */ + seq_puts(seq, "\tcr_goal_fast_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", + atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_FAST])); seq_printf(seq, "\t\tgroups_considered: %llu\n", - atomic64_read(&sbi->s_bal_cX_groups_considered[1])); + atomic64_read( + &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_FAST])); + seq_printf(seq, "\t\textents_scanned: %u\n", + atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_FAST])); seq_printf(seq, "\t\tuseless_loops: %llu\n", - atomic64_read(&sbi->s_bal_cX_failed[1])); + atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_FAST])); seq_printf(seq, "\t\tbad_suggestions: %u\n", - atomic_read(&sbi->s_bal_cr1_bad_suggestions)); - - seq_puts(seq, "\tcr2_stats:\n"); - seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2])); - seq_printf(seq, "\t\tgroups_considered: %llu\n", - atomic64_read(&sbi->s_bal_cX_groups_considered[2])); + atomic_read(&sbi->s_bal_goal_fast_bad_suggestions)); + + /* CR_BEST_AVAIL_LEN stats */ + seq_puts(seq, "\tcr_best_avail_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", + atomic64_read(&sbi->s_bal_cX_hits[CR_BEST_AVAIL_LEN])); + seq_printf( + seq, "\t\tgroups_considered: %llu\n", + atomic64_read( + &sbi->s_bal_cX_groups_considered[CR_BEST_AVAIL_LEN])); + seq_printf(seq, "\t\textents_scanned: %u\n", + atomic_read(&sbi->s_bal_cX_ex_scanned[CR_BEST_AVAIL_LEN])); seq_printf(seq, "\t\tuseless_loops: %llu\n", - atomic64_read(&sbi->s_bal_cX_failed[2])); + atomic64_read(&sbi->s_bal_cX_failed[CR_BEST_AVAIL_LEN])); + seq_printf(seq, "\t\tbad_suggestions: %u\n", + atomic_read(&sbi->s_bal_best_avail_bad_suggestions)); - seq_puts(seq, "\tcr3_stats:\n"); - seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3])); + /* CR_GOAL_LEN_SLOW stats */ + seq_puts(seq, "\tcr_goal_slow_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", + atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_SLOW])); seq_printf(seq, "\t\tgroups_considered: %llu\n", - atomic64_read(&sbi->s_bal_cX_groups_considered[3])); + atomic64_read( + &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_SLOW])); + seq_printf(seq, "\t\textents_scanned: %u\n", + atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_SLOW])); + seq_printf(seq, "\t\tuseless_loops: %llu\n", + atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_SLOW])); + + /* CR_ANY_FREE stats */ + seq_puts(seq, "\tcr_any_free_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", + atomic64_read(&sbi->s_bal_cX_hits[CR_ANY_FREE])); + seq_printf( + seq, "\t\tgroups_considered: %llu\n", + atomic64_read(&sbi->s_bal_cX_groups_considered[CR_ANY_FREE])); + seq_printf(seq, "\t\textents_scanned: %u\n", + atomic_read(&sbi->s_bal_cX_ex_scanned[CR_ANY_FREE])); seq_printf(seq, "\t\tuseless_loops: %llu\n", - atomic64_read(&sbi->s_bal_cX_failed[3])); - seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned)); + atomic64_read(&sbi->s_bal_cX_failed[CR_ANY_FREE])); + + /* Aggregates */ + seq_printf(seq, "\textents_scanned: %u\n", + atomic_read(&sbi->s_bal_ex_scanned)); seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); + seq_printf(seq, "\t\tlen_goal_hits: %u\n", + atomic_read(&sbi->s_bal_len_goals)); seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks)); seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks)); - seq_printf(seq, "\tbuddies_generated: %u/%u\n", atomic_read(&sbi->s_mb_buddies_generated), ext4_get_groups_count(sb)); @@ -2966,8 +3147,7 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) atomic64_read(&sbi->s_mb_generation_time)); seq_printf(seq, "\tpreallocated: %u\n", atomic_read(&sbi->s_mb_preallocated)); - seq_printf(seq, "\tdiscarded: %u\n", - atomic_read(&sbi->s_mb_discarded)); + seq_printf(seq, "\tdiscarded: %u\n", atomic_read(&sbi->s_mb_discarded)); return 0; } @@ -3454,6 +3634,8 @@ int ext4_mb_init(struct super_block *sb) sbi->s_mb_stats = MB_DEFAULT_STATS; sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; + sbi->s_mb_best_avail_max_trim_order = MB_DEFAULT_BEST_AVAIL_TRIM_ORDER; + /* * The default group preallocation is 512, which for 4k block * sizes translates to 2 megabytes. However for bigalloc file @@ -3478,7 +3660,7 @@ int ext4_mb_init(struct super_block *sb) */ if (sbi->s_stripe > 1) { sbi->s_mb_group_prealloc = roundup( - sbi->s_mb_group_prealloc, sbi->s_stripe); + sbi->s_mb_group_prealloc, EXT4_B2C(sbi, sbi->s_stripe)); } sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); @@ -4283,7 +4465,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (22 - bsbits)) << 22; size = 4 * 1024 * 1024; - } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, + } else if (NRL_CHECK_SIZE(EXT4_C2B(sbi, ac->ac_o_ex.fe_len), (8<<20)>>bsbits, max, 8 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (23 - bsbits)) << 23; @@ -4357,6 +4539,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, * placement or satisfy big request as is */ ac->ac_g_ex.fe_logical = start; ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size); + ac->ac_orig_goal_len = ac->ac_g_ex.fe_len; /* define goal start in order to merge */ if (ar->pright && (ar->lright == (start + size)) && @@ -4390,11 +4573,20 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) atomic_inc(&sbi->s_bal_success); + atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); + for (int i=0; i<EXT4_MB_NUM_CRS; i++) { + atomic_add(ac->ac_cX_found[i], &sbi->s_bal_cX_ex_scanned[i]); + } + atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned); if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) atomic_inc(&sbi->s_bal_goals); + /* did we allocate as much as normalizer originally wanted? */ + if (ac->ac_f_ex.fe_len == ac->ac_orig_goal_len) + atomic_inc(&sbi->s_bal_len_goals); + if (ac->ac_found > sbi->s_mb_max_to_scan) atomic_inc(&sbi->s_bal_breaks); } @@ -4529,6 +4721,37 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block, } /* + * check if found pa meets EXT4_MB_HINT_GOAL_ONLY + */ +static bool +ext4_mb_pa_goal_check(struct ext4_allocation_context *ac, + struct ext4_prealloc_space *pa) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + ext4_fsblk_t start; + + if (likely(!(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))) + return true; + + /* + * If EXT4_MB_HINT_GOAL_ONLY is set, ac_g_ex will not be adjusted + * in ext4_mb_normalize_request and will keep same with ac_o_ex + * from ext4_mb_initialize_context. Choose ac_g_ex here to keep + * consistent with ext4_mb_find_by_goal. + */ + start = pa->pa_pstart + + (ac->ac_g_ex.fe_logical - pa->pa_lstart); + if (ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex) != start) + return false; + + if (ac->ac_g_ex.fe_len > pa->pa_len - + EXT4_B2C(sbi, ac->ac_g_ex.fe_logical - pa->pa_lstart)) + return false; + + return true; +} + +/* * search goal blocks in preallocated space */ static noinline_for_stack bool @@ -4578,11 +4801,11 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) /* found preallocated blocks, use them */ spin_lock(&tmp_pa->pa_lock); - if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free) { + if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free && + likely(ext4_mb_pa_goal_check(ac, tmp_pa))) { atomic_inc(&tmp_pa->pa_count); ext4_mb_use_inode_pa(ac, tmp_pa); spin_unlock(&tmp_pa->pa_lock); - ac->ac_criteria = 10; read_unlock(&ei->i_prealloc_lock); return true; } @@ -4625,7 +4848,6 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) } if (cpa) { ext4_mb_use_group_pa(ac, cpa); - ac->ac_criteria = 20; return true; } return false; @@ -4849,7 +5071,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa = ac->ac_pa; - if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { + if (ac->ac_b_ex.fe_len < ac->ac_orig_goal_len) { int new_bex_start; int new_bex_end; @@ -4864,14 +5086,14 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) * fragmentation in check while ensuring logical range of best * extent doesn't overflow out of goal extent: * - * 1. Check if best ex can be kept at end of goal and still - * cover original start + * 1. Check if best ex can be kept at end of goal (before + * cr_best_avail trimmed it) and still cover original start * 2. Else, check if best ex can be kept at start of goal and * still cover original start * 3. Else, keep the best ex at start of original request. */ new_bex_end = ac->ac_g_ex.fe_logical + - EXT4_C2B(sbi, ac->ac_g_ex.fe_len); + EXT4_C2B(sbi, ac->ac_orig_goal_len); new_bex_start = new_bex_end - EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (ac->ac_o_ex.fe_logical >= new_bex_start) goto adjust_bex; @@ -4892,7 +5114,7 @@ adjust_bex: BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); BUG_ON(new_bex_end > (ac->ac_g_ex.fe_logical + - EXT4_C2B(sbi, ac->ac_g_ex.fe_len))); + EXT4_C2B(sbi, ac->ac_orig_goal_len))); } pa->pa_lstart = ac->ac_b_ex.fe_logical; @@ -5399,6 +5621,10 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); mb_debug(sb, "%u found", ac->ac_found); + mb_debug(sb, "used pa: %s, ", ac->ac_pa ? "yes" : "no"); + if (ac->ac_pa) + mb_debug(sb, "pa_type %s\n", ac->ac_pa->pa_type == MB_GROUP_PA ? + "group pa" : "inode pa"); ext4_mb_show_pa(sb); } #else @@ -5508,6 +5734,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, ac->ac_o_ex.fe_start = block; ac->ac_o_ex.fe_len = len; ac->ac_g_ex = ac->ac_o_ex; + ac->ac_orig_goal_len = ac->ac_g_ex.fe_len; ac->ac_flags = ar->flags; /* we have to define context: we'll work with a file or @@ -5751,8 +5978,72 @@ out_dbg: return ret; } -static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, - struct ext4_allocation_request *ar, int *errp); +/* + * Simple allocator for Ext4 fast commit replay path. It searches for blocks + * linearly starting at the goal block and also excludes the blocks which + * are going to be in use after fast commit replay. + */ +static ext4_fsblk_t +ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp) +{ + struct buffer_head *bitmap_bh; + struct super_block *sb = ar->inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t group, nr; + ext4_grpblk_t blkoff; + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); + ext4_grpblk_t i = 0; + ext4_fsblk_t goal, block; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + goal = ar->goal; + if (goal < le32_to_cpu(es->s_first_data_block) || + goal >= ext4_blocks_count(es)) + goal = le32_to_cpu(es->s_first_data_block); + + ar->len = 0; + ext4_get_group_no_and_offset(sb, goal, &group, &blkoff); + for (nr = ext4_get_groups_count(sb); nr > 0; nr--) { + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(bitmap_bh)) { + *errp = PTR_ERR(bitmap_bh); + pr_warn("Failed to read block bitmap\n"); + return 0; + } + + while (1) { + i = mb_find_next_zero_bit(bitmap_bh->b_data, max, + blkoff); + if (i >= max) + break; + if (ext4_fc_replay_check_excluded(sb, + ext4_group_first_block_no(sb, group) + + EXT4_C2B(sbi, i))) { + blkoff = i + 1; + } else + break; + } + brelse(bitmap_bh); + if (i < max) + break; + + if (++group >= ext4_get_groups_count(sb)) + group = 0; + + blkoff = 0; + } + + if (i >= max) { + *errp = -ENOSPC; + return 0; + } + + block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i); + ext4_mb_mark_bb(sb, block, 1, 1); + ar->len = 1; + + return block; +} /* * Main entry point into mballoc to allocate blocks @@ -5777,7 +6068,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, trace_ext4_request_blocks(ar); if (sbi->s_mount_state & EXT4_FC_REPLAY) - return ext4_mb_new_blocks_simple(handle, ar, errp); + return ext4_mb_new_blocks_simple(ar, errp); /* Allow to use superuser reservation for quota file */ if (ext4_is_quota_file(ar->inode)) @@ -6001,68 +6292,6 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, spin_unlock(&sbi->s_md_lock); } -/* - * Simple allocator for Ext4 fast commit replay path. It searches for blocks - * linearly starting at the goal block and also excludes the blocks which - * are going to be in use after fast commit replay. - */ -static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, - struct ext4_allocation_request *ar, int *errp) -{ - struct buffer_head *bitmap_bh; - struct super_block *sb = ar->inode->i_sb; - ext4_group_t group; - ext4_grpblk_t blkoff; - ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); - ext4_grpblk_t i = 0; - ext4_fsblk_t goal, block; - struct ext4_super_block *es = EXT4_SB(sb)->s_es; - - goal = ar->goal; - if (goal < le32_to_cpu(es->s_first_data_block) || - goal >= ext4_blocks_count(es)) - goal = le32_to_cpu(es->s_first_data_block); - - ar->len = 0; - ext4_get_group_no_and_offset(sb, goal, &group, &blkoff); - for (; group < ext4_get_groups_count(sb); group++) { - bitmap_bh = ext4_read_block_bitmap(sb, group); - if (IS_ERR(bitmap_bh)) { - *errp = PTR_ERR(bitmap_bh); - pr_warn("Failed to read block bitmap\n"); - return 0; - } - - while (1) { - i = mb_find_next_zero_bit(bitmap_bh->b_data, max, - blkoff); - if (i >= max) - break; - if (ext4_fc_replay_check_excluded(sb, - ext4_group_first_block_no(sb, group) + i)) { - blkoff = i + 1; - } else - break; - } - brelse(bitmap_bh); - if (i < max) - break; - - blkoff = 0; - } - - if (group >= ext4_get_groups_count(sb) || i >= max) { - *errp = -ENOSPC; - return 0; - } - - block = ext4_group_first_block_no(sb, group) + i; - ext4_mb_mark_bb(sb, block, 1, 1); - ar->len = 1; - - return block; -} - static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block, unsigned long count) { @@ -6243,8 +6472,8 @@ do_more: * them with group lock_held */ if (test_opt(sb, DISCARD)) { - err = ext4_issue_discard(sb, block_group, bit, count, - NULL); + err = ext4_issue_discard(sb, block_group, bit, + count_clusters, NULL); if (err && err != -EOPNOTSUPP) ext4_msg(sb, KERN_WARNING, "discard request in" " group:%u block:%d count:%lu failed" @@ -6328,12 +6557,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, sbi = EXT4_SB(sb); - if (sbi->s_mount_state & EXT4_FC_REPLAY) { - ext4_free_blocks_simple(inode, block, count); - return; - } - - might_sleep(); if (bh) { if (block) BUG_ON(block != bh->b_blocknr); @@ -6341,6 +6564,13 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, block = bh->b_blocknr; } + if (sbi->s_mount_state & EXT4_FC_REPLAY) { + ext4_free_blocks_simple(inode, block, EXT4_NUM_B2C(sbi, count)); + return; + } + + might_sleep(); + if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && !ext4_inode_block_valid(inode, block, count)) { ext4_error(sb, "Freeing blocks not in datazone - " diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 6d85ee8674a6..df6b5e7c2274 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -49,7 +49,7 @@ #define MB_DEFAULT_MIN_TO_SCAN 10 /* - * with 'ext4_mb_stats' allocator will collect stats that will be + * with 's_mb_stats' allocator will collect stats that will be * shown at umount. The collecting costs though! */ #define MB_DEFAULT_STATS 0 @@ -86,6 +86,13 @@ #define MB_DEFAULT_LINEAR_SCAN_THRESHOLD 16 /* + * The maximum order upto which CR_BEST_AVAIL_LEN can trim a particular + * allocation request. Example, if we have an order 7 request and max trim order + * of 3, we can trim this request upto order 4. + */ +#define MB_DEFAULT_BEST_AVAIL_TRIM_ORDER 3 + +/* * Number of valid buddy orders */ #define MB_NUM_ORDERS(sb) ((sb)->s_blocksize_bits + 2) @@ -179,11 +186,18 @@ struct ext4_allocation_context { /* copy of the best found extent taken before preallocation efforts */ struct ext4_free_extent ac_f_ex; + /* + * goal len can change in CR1.5, so save the original len. This is + * used while adjusting the PA window and for accounting. + */ + ext4_grpblk_t ac_orig_goal_len; + __u32 ac_groups_considered; __u32 ac_flags; /* allocation hints */ __u16 ac_groups_scanned; __u16 ac_groups_linear_remaining; __u16 ac_found; + __u16 ac_cX_found[EXT4_MB_NUM_CRS]; __u16 ac_tail; __u16 ac_buddy; __u8 ac_status; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 45b579805c95..0caf6c730ce3 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3834,19 +3834,10 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, return retval; } - /* - * We need to protect against old.inode directory getting converted - * from inline directory format into a normal one. - */ - if (S_ISDIR(old.inode->i_mode)) - inode_lock_nested(old.inode, I_MUTEX_NONDIR2); - old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, &old.inlined); - if (IS_ERR(old.bh)) { - retval = PTR_ERR(old.bh); - goto unlock_moved_dir; - } + if (IS_ERR(old.bh)) + return PTR_ERR(old.bh); /* * Check for inode number is _not_ due to possible IO errors. @@ -4043,10 +4034,6 @@ release_bh: brelse(old.bh); brelse(new.bh); -unlock_moved_dir: - if (S_ISDIR(old.inode->i_mode)) - inode_unlock(old.inode); - return retval; } diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 6f46823fba61..3e7d160f543f 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -334,7 +334,7 @@ int ext4_mpage_readpages(struct inode *inode, folio_size(folio)); if (first_hole == 0) { if (ext4_need_verity(inode, folio->index) && - !fsverity_verify_page(&folio->page)) + !fsverity_verify_folio(folio)) goto set_error_page; folio_mark_uptodate(folio); folio_unlock(folio); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 05fcecc36244..c94ebf704616 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1096,6 +1096,15 @@ void ext4_update_dynamic_rev(struct super_block *sb) */ } +static void ext4_bdev_mark_dead(struct block_device *bdev) +{ + ext4_force_shutdown(bdev->bd_holder, EXT4_GOING_FLAGS_NOLOGFLUSH); +} + +static const struct blk_holder_ops ext4_holder_ops = { + .mark_dead = ext4_bdev_mark_dead, +}; + /* * Open the external journal device */ @@ -1103,7 +1112,8 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) { struct block_device *bdev; - bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); + bdev = blkdev_get_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb, + &ext4_holder_ops); if (IS_ERR(bdev)) goto fail; return bdev; @@ -1118,17 +1128,18 @@ fail: /* * Release the journal device */ -static void ext4_blkdev_put(struct block_device *bdev) -{ - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); -} - static void ext4_blkdev_remove(struct ext4_sb_info *sbi) { struct block_device *bdev; bdev = sbi->s_journal_bdev; if (bdev) { - ext4_blkdev_put(bdev); + /* + * Invalidate the journal device's buffers. We don't want them + * floating about in memory - the physical journal device may + * hotswapped, and it breaks the `ro-after' testing code. + */ + invalidate_bdev(bdev); + blkdev_put(bdev, sbi->s_sb); sbi->s_journal_bdev = NULL; } } @@ -1159,12 +1170,12 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) #ifdef CONFIG_QUOTA static int ext4_quota_off(struct super_block *sb, int type); -static inline void ext4_quota_off_umount(struct super_block *sb) +static inline void ext4_quotas_off(struct super_block *sb, int type) { - int type; + BUG_ON(type > EXT4_MAXQUOTAS); /* Use our quota_off function to clear inode flags etc. */ - for (type = 0; type < EXT4_MAXQUOTAS; type++) + for (type--; type >= 0; type--) ext4_quota_off(sb, type); } @@ -1180,7 +1191,7 @@ static inline char *get_qf_name(struct super_block *sb, lockdep_is_held(&sb->s_umount)); } #else -static inline void ext4_quota_off_umount(struct super_block *sb) +static inline void ext4_quotas_off(struct super_block *sb, int type) { } #endif @@ -1280,7 +1291,7 @@ static void ext4_put_super(struct super_block *sb) &sb->s_uuid); ext4_unregister_li_request(sb); - ext4_quota_off_umount(sb); + ext4_quotas_off(sb, EXT4_MAXQUOTAS); flush_work(&sbi->s_error_work); destroy_workqueue(sbi->rsv_conversion_wq); @@ -1327,14 +1338,8 @@ static void ext4_put_super(struct super_block *sb) sync_blockdev(sb->s_bdev); invalidate_bdev(sb->s_bdev); - if (sbi->s_journal_bdev && sbi->s_journal_bdev != sb->s_bdev) { - /* - * Invalidate the journal device's buffers. We don't want them - * floating about in memory - the physical journal device may - * hotswapped, and it breaks the `ro-after' testing code. - */ + if (sbi->s_journal_bdev) { sync_blockdev(sbi->s_journal_bdev); - invalidate_bdev(sbi->s_journal_bdev); ext4_blkdev_remove(sbi); } @@ -1449,6 +1454,11 @@ static void ext4_destroy_inode(struct inode *inode) EXT4_I(inode)->i_reserved_data_blocks); } +static void ext4_shutdown(struct super_block *sb) +{ + ext4_force_shutdown(sb, EXT4_GOING_FLAGS_NOLOGFLUSH); +} + static void init_once(void *foo) { struct ext4_inode_info *ei = foo; @@ -1609,6 +1619,7 @@ static const struct super_operations ext4_sops = { .unfreeze_fs = ext4_unfreeze, .statfs = ext4_statfs, .show_options = ext4_show_options, + .shutdown = ext4_shutdown, #ifdef CONFIG_QUOTA .quota_read = ext4_quota_read, .quota_write = ext4_quota_write, @@ -3692,16 +3703,13 @@ static int ext4_run_li_request(struct ext4_li_request *elr) ext4_group_t group = elr->lr_next_group; unsigned int prefetch_ios = 0; int ret = 0; + int nr = EXT4_SB(sb)->s_mb_prefetch; u64 start_time; if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) { - elr->lr_next_group = ext4_mb_prefetch(sb, group, - EXT4_SB(sb)->s_mb_prefetch, &prefetch_ios); - if (prefetch_ios) - ext4_mb_prefetch_fini(sb, elr->lr_next_group, - prefetch_ios); - trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group, - prefetch_ios); + elr->lr_next_group = ext4_mb_prefetch(sb, group, nr, &prefetch_ios); + ext4_mb_prefetch_fini(sb, elr->lr_next_group, nr); + trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group, nr); if (group >= elr->lr_next_group) { ret = 1; if (elr->lr_first_not_zeroed != ngroups && @@ -5297,6 +5305,19 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) goto failed_mount3; sbi->s_stripe = ext4_get_stripe_size(sbi); + /* + * It's hard to get stripe aligned blocks if stripe is not aligned with + * cluster, just disable stripe and alert user to simpfy code and avoid + * stripe aligned allocation which will rarely successes. + */ + if (sbi->s_stripe > 0 && sbi->s_cluster_ratio > 1 && + sbi->s_stripe % sbi->s_cluster_ratio != 0) { + ext4_msg(sb, KERN_WARNING, + "stripe (%lu) is not aligned with cluster size (%u), " + "stripe is disabled", + sbi->s_stripe, sbi->s_cluster_ratio); + sbi->s_stripe = 0; + } sbi->s_extent_max_zeroout_kb = 32; /* @@ -5567,7 +5588,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) ext4_msg(sb, KERN_INFO, "recovery complete"); err = ext4_mark_recovery_complete(sb, es); if (err) - goto failed_mount9; + goto failed_mount10; } if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) @@ -5586,7 +5607,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) return 0; -failed_mount9: +failed_mount10: + ext4_quotas_off(sb, EXT4_MAXQUOTAS); +failed_mount9: __maybe_unused ext4_release_orphan_info(sb); failed_mount8: ext4_unregister_sysfs(sb); @@ -5645,6 +5668,7 @@ failed_mount: brelse(sbi->s_sbh); ext4_blkdev_remove(sbi); out_fail: + invalidate_bdev(sb->s_bdev); sb->s_fs_info = NULL; return err; } @@ -5727,6 +5751,11 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; else journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; + /* + * Always enable journal cycle record option, letting the journal + * records log transactions continuously between each mount. + */ + journal->j_flags |= JBD2_CYCLE_RECORD; write_unlock(&journal->j_state_lock); } @@ -5899,7 +5928,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, out_journal: jbd2_journal_destroy(journal); out_bdev: - ext4_blkdev_put(bdev); + blkdev_put(bdev, sb); return NULL; } @@ -5979,19 +6008,27 @@ static int ext4_load_journal(struct super_block *sb, err = jbd2_journal_wipe(journal, !really_read_only); if (!err) { char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL); + __le16 orig_state; + bool changed = false; if (save) memcpy(save, ((char *) es) + EXT4_S_ERR_START, EXT4_S_ERR_LEN); err = jbd2_journal_load(journal); - if (save) + if (save && memcmp(((char *) es) + EXT4_S_ERR_START, + save, EXT4_S_ERR_LEN)) { memcpy(((char *) es) + EXT4_S_ERR_START, save, EXT4_S_ERR_LEN); + changed = true; + } kfree(save); + orig_state = es->s_state; es->s_state |= cpu_to_le16(EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS); + if (orig_state != es->s_state) + changed = true; /* Write out restored error information to the superblock */ - if (!bdev_read_only(sb->s_bdev)) { + if (changed && !really_read_only) { int err2; err2 = ext4_commit_super(sb); err = err ? : err2; @@ -7026,20 +7063,8 @@ int ext4_enable_quotas(struct super_block *sb) "(type=%d, err=%d, ino=%lu). " "Please run e2fsck to fix.", type, err, qf_inums[type]); - for (type--; type >= 0; type--) { - struct inode *inode; - - inode = sb_dqopt(sb)->files[type]; - if (inode) - inode = igrab(inode); - dquot_quota_off(sb, type); - if (inode) { - lockdep_set_quota_inode(inode, - I_DATA_SEM_NORMAL); - iput(inode); - } - } + ext4_quotas_off(sb, type); return err; } } diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 3042bc605bbf..6d332dff79dd 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -223,6 +223,7 @@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.int EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_UI(mb_best_avail_max_trim_order, s_mb_best_avail_max_trim_order); #ifdef CONFIG_EXT4_DEBUG EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail); #endif @@ -273,6 +274,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(warning_ratelimit_burst), ATTR_LIST(msg_ratelimit_interval_ms), ATTR_LIST(msg_ratelimit_burst), + ATTR_LIST(mb_best_avail_max_trim_order), ATTR_LIST(errors_count), ATTR_LIST(warning_count), ATTR_LIST(msg_count), diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 64b3860f50ee..8fd3b7f9fb88 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -30,12 +30,9 @@ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, unsigned char reason) { f2fs_build_fault_attr(sbi, 0, 0); - set_ckpt_flags(sbi, CP_ERROR_FLAG); - if (!end_io) { + if (!end_io) f2fs_flush_merged_writes(sbi); - - f2fs_handle_stop(sbi, reason); - } + f2fs_handle_critical_error(sbi, reason, end_io); } /* diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 11653fa79289..236d890f560b 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -55,6 +55,7 @@ struct f2fs_compress_ops { int (*init_decompress_ctx)(struct decompress_io_ctx *dic); void (*destroy_decompress_ctx)(struct decompress_io_ctx *dic); int (*decompress_pages)(struct decompress_io_ctx *dic); + bool (*is_level_valid)(int level); }; static unsigned int offset_in_cluster(struct compress_ctx *cc, pgoff_t index) @@ -308,17 +309,25 @@ static int lz4_decompress_pages(struct decompress_io_ctx *dic) return 0; } +static bool lz4_is_level_valid(int lvl) +{ +#ifdef CONFIG_F2FS_FS_LZ4HC + return !lvl || (lvl >= LZ4HC_MIN_CLEVEL && lvl <= LZ4HC_MAX_CLEVEL); +#else + return lvl == 0; +#endif +} + static const struct f2fs_compress_ops f2fs_lz4_ops = { .init_compress_ctx = lz4_init_compress_ctx, .destroy_compress_ctx = lz4_destroy_compress_ctx, .compress_pages = lz4_compress_pages, .decompress_pages = lz4_decompress_pages, + .is_level_valid = lz4_is_level_valid, }; #endif #ifdef CONFIG_F2FS_FS_ZSTD -#define F2FS_ZSTD_DEFAULT_CLEVEL 1 - static int zstd_init_compress_ctx(struct compress_ctx *cc) { zstd_parameters params; @@ -327,6 +336,7 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc) unsigned int workspace_size; unsigned char level = F2FS_I(cc->inode)->i_compress_level; + /* Need to remain this for backward compatibility */ if (!level) level = F2FS_ZSTD_DEFAULT_CLEVEL; @@ -477,6 +487,11 @@ static int zstd_decompress_pages(struct decompress_io_ctx *dic) return 0; } +static bool zstd_is_level_valid(int lvl) +{ + return lvl >= zstd_min_clevel() && lvl <= zstd_max_clevel(); +} + static const struct f2fs_compress_ops f2fs_zstd_ops = { .init_compress_ctx = zstd_init_compress_ctx, .destroy_compress_ctx = zstd_destroy_compress_ctx, @@ -484,6 +499,7 @@ static const struct f2fs_compress_ops f2fs_zstd_ops = { .init_decompress_ctx = zstd_init_decompress_ctx, .destroy_decompress_ctx = zstd_destroy_decompress_ctx, .decompress_pages = zstd_decompress_pages, + .is_level_valid = zstd_is_level_valid, }; #endif @@ -542,6 +558,16 @@ bool f2fs_is_compress_backend_ready(struct inode *inode) return f2fs_cops[F2FS_I(inode)->i_compress_algorithm]; } +bool f2fs_is_compress_level_valid(int alg, int lvl) +{ + const struct f2fs_compress_ops *cops = f2fs_cops[alg]; + + if (cops->is_level_valid) + return cops->is_level_valid(lvl); + + return lvl == 0; +} + static mempool_t *compress_page_pool; static int num_compress_pages = 512; module_param(num_compress_pages, uint, 0444); @@ -743,8 +769,8 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) ret = -EFSCORRUPTED; /* Avoid f2fs_commit_super in irq context */ - if (in_task) - f2fs_save_errors(sbi, ERROR_FAIL_DECOMPRESSION); + if (!in_task) + f2fs_handle_error_async(sbi, ERROR_FAIL_DECOMPRESSION); else f2fs_handle_error(sbi, ERROR_FAIL_DECOMPRESSION); goto out_release; @@ -1215,6 +1241,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, unsigned int last_index = cc->cluster_size - 1; loff_t psize; int i, err; + bool quota_inode = IS_NOQUOTA(inode); /* we should bypass data pages to proceed the kworker jobs */ if (unlikely(f2fs_cp_error(sbi))) { @@ -1222,7 +1249,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, goto out_free; } - if (IS_NOQUOTA(inode)) { + if (quota_inode) { /* * We need to wait for node_write to avoid block allocation during * checkpoint. This can only happen to quota writes which can cause @@ -1344,7 +1371,7 @@ unlock_continue: set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); f2fs_put_dnode(&dn); - if (IS_NOQUOTA(inode)) + if (quota_inode) f2fs_up_read(&sbi->node_write); else f2fs_unlock_op(sbi); @@ -1370,7 +1397,7 @@ out_put_cic: out_put_dnode: f2fs_put_dnode(&dn); out_unlock_op: - if (IS_NOQUOTA(inode)) + if (quota_inode) f2fs_up_read(&sbi->node_write); else f2fs_unlock_op(sbi); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7165b1202f53..5882afe71d82 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -383,6 +383,17 @@ static void f2fs_write_end_io(struct bio *bio) bio_put(bio); } +#ifdef CONFIG_BLK_DEV_ZONED +static void f2fs_zone_write_end_io(struct bio *bio) +{ + struct f2fs_bio_info *io = (struct f2fs_bio_info *)bio->bi_private; + + bio->bi_private = io->bi_private; + complete(&io->zone_wait); + f2fs_write_end_io(bio); +} +#endif + struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, block_t blk_addr, sector_t *sector) { @@ -639,6 +650,11 @@ int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list); init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock); +#ifdef CONFIG_BLK_DEV_ZONED + init_completion(&sbi->write_io[i][j].zone_wait); + sbi->write_io[i][j].zone_pending_bio = NULL; + sbi->write_io[i][j].bi_private = NULL; +#endif } } @@ -965,6 +981,26 @@ alloc_new: return 0; } +#ifdef CONFIG_BLK_DEV_ZONED +static bool is_end_zone_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + int devi = 0; + + if (f2fs_is_multi_device(sbi)) { + devi = f2fs_target_device_index(sbi, blkaddr); + if (blkaddr < FDEV(devi).start_blk || + blkaddr > FDEV(devi).end_blk) { + f2fs_err(sbi, "Invalid block %x", blkaddr); + return false; + } + blkaddr -= FDEV(devi).start_blk; + } + return bdev_zoned_model(FDEV(devi).bdev) == BLK_ZONED_HM && + f2fs_blkz_is_seq(sbi, devi, blkaddr) && + (blkaddr % sbi->blocks_per_blkz == sbi->blocks_per_blkz - 1); +} +#endif + void f2fs_submit_page_write(struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; @@ -975,6 +1011,16 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) f2fs_bug_on(sbi, is_read_io(fio->op)); f2fs_down_write(&io->io_rwsem); + +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_has_blkzoned(sbi) && btype < META && io->zone_pending_bio) { + wait_for_completion_io(&io->zone_wait); + bio_put(io->zone_pending_bio); + io->zone_pending_bio = NULL; + io->bi_private = NULL; + } +#endif + next: if (fio->in_list) { spin_lock(&io->io_lock); @@ -1038,6 +1084,18 @@ skip: if (fio->in_list) goto next; out: +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_has_blkzoned(sbi) && btype < META && + is_end_zone_blkaddr(sbi, fio->new_blkaddr)) { + bio_get(io->bio); + reinit_completion(&io->zone_wait); + io->bi_private = io->bio->bi_private; + io->bio->bi_private = io; + io->bio->bi_end_io = f2fs_zone_write_end_io; + io->zone_pending_bio = io->bio; + __submit_merged_bio(io); + } +#endif if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || !f2fs_is_checkpoint_ready(sbi)) __submit_merged_bio(io); @@ -2173,7 +2231,6 @@ submit_and_realloc: f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); *last_block_in_bio = block_nr; - goto out; out: *bio_ret = bio; return ret; @@ -2775,6 +2832,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, loff_t psize = (loff_t)(page->index + 1) << PAGE_SHIFT; unsigned offset = 0; bool need_balance_fs = false; + bool quota_inode = IS_NOQUOTA(inode); int err = 0; struct f2fs_io_info fio = { .sbi = sbi, @@ -2807,6 +2865,10 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, if (S_ISDIR(inode->i_mode) && !is_sbi_flag_set(sbi, SBI_IS_CLOSE)) goto redirty_out; + + /* keep data pages in remount-ro mode */ + if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_READONLY) + goto redirty_out; goto out; } @@ -2832,19 +2894,19 @@ write: goto out; /* Dentry/quota blocks are controlled by checkpoint */ - if (S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) { + if (S_ISDIR(inode->i_mode) || quota_inode) { /* * We need to wait for node_write to avoid block allocation during * checkpoint. This can only happen to quota writes which can cause * the below discard race condition. */ - if (IS_NOQUOTA(inode)) + if (quota_inode) f2fs_down_read(&sbi->node_write); fio.need_lock = LOCK_DONE; err = f2fs_do_write_data_page(&fio); - if (IS_NOQUOTA(inode)) + if (quota_inode) f2fs_up_read(&sbi->node_write); goto done; @@ -4067,7 +4129,6 @@ const struct address_space_operations f2fs_dblock_aops = { .migrate_folio = filemap_migrate_folio, .invalidate_folio = f2fs_invalidate_folio, .release_folio = f2fs_release_folio, - .direct_IO = noop_direct_IO, .bmap = f2fs_bmap, .swap_activate = f2fs_swap_activate, .swap_deactivate = f2fs_swap_deactivate, diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 887e55988450..d635c58cf5a3 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -775,8 +775,15 @@ int f2fs_add_dentry(struct inode *dir, const struct f2fs_filename *fname, { int err = -EAGAIN; - if (f2fs_has_inline_dentry(dir)) + if (f2fs_has_inline_dentry(dir)) { + /* + * Should get i_xattr_sem to keep the lock order: + * i_xattr_sem -> inode_page lock used by f2fs_setxattr. + */ + f2fs_down_read(&F2FS_I(dir)->i_xattr_sem); err = f2fs_add_inline_entry(dir, fname, inode, ino, mode); + f2fs_up_read(&F2FS_I(dir)->i_xattr_sem); + } if (err == -EAGAIN) err = f2fs_add_regular_entry(dir, fname, inode, ino, mode); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d211ee89c158..c7cb2177b252 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -80,34 +80,34 @@ extern const char *f2fs_fault_name[FAULT_MAX]; /* * For mount options */ -#define F2FS_MOUNT_DISABLE_ROLL_FORWARD 0x00000002 -#define F2FS_MOUNT_DISCARD 0x00000004 -#define F2FS_MOUNT_NOHEAP 0x00000008 -#define F2FS_MOUNT_XATTR_USER 0x00000010 -#define F2FS_MOUNT_POSIX_ACL 0x00000020 -#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040 -#define F2FS_MOUNT_INLINE_XATTR 0x00000080 -#define F2FS_MOUNT_INLINE_DATA 0x00000100 -#define F2FS_MOUNT_INLINE_DENTRY 0x00000200 -#define F2FS_MOUNT_FLUSH_MERGE 0x00000400 -#define F2FS_MOUNT_NOBARRIER 0x00000800 -#define F2FS_MOUNT_FASTBOOT 0x00001000 -#define F2FS_MOUNT_READ_EXTENT_CACHE 0x00002000 -#define F2FS_MOUNT_DATA_FLUSH 0x00008000 -#define F2FS_MOUNT_FAULT_INJECTION 0x00010000 -#define F2FS_MOUNT_USRQUOTA 0x00080000 -#define F2FS_MOUNT_GRPQUOTA 0x00100000 -#define F2FS_MOUNT_PRJQUOTA 0x00200000 -#define F2FS_MOUNT_QUOTA 0x00400000 -#define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000 -#define F2FS_MOUNT_RESERVE_ROOT 0x01000000 -#define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000 -#define F2FS_MOUNT_NORECOVERY 0x04000000 -#define F2FS_MOUNT_ATGC 0x08000000 -#define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000 -#define F2FS_MOUNT_GC_MERGE 0x20000000 -#define F2FS_MOUNT_COMPRESS_CACHE 0x40000000 -#define F2FS_MOUNT_AGE_EXTENT_CACHE 0x80000000 +#define F2FS_MOUNT_DISABLE_ROLL_FORWARD 0x00000001 +#define F2FS_MOUNT_DISCARD 0x00000002 +#define F2FS_MOUNT_NOHEAP 0x00000004 +#define F2FS_MOUNT_XATTR_USER 0x00000008 +#define F2FS_MOUNT_POSIX_ACL 0x00000010 +#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000020 +#define F2FS_MOUNT_INLINE_XATTR 0x00000040 +#define F2FS_MOUNT_INLINE_DATA 0x00000080 +#define F2FS_MOUNT_INLINE_DENTRY 0x00000100 +#define F2FS_MOUNT_FLUSH_MERGE 0x00000200 +#define F2FS_MOUNT_NOBARRIER 0x00000400 +#define F2FS_MOUNT_FASTBOOT 0x00000800 +#define F2FS_MOUNT_READ_EXTENT_CACHE 0x00001000 +#define F2FS_MOUNT_DATA_FLUSH 0x00002000 +#define F2FS_MOUNT_FAULT_INJECTION 0x00004000 +#define F2FS_MOUNT_USRQUOTA 0x00008000 +#define F2FS_MOUNT_GRPQUOTA 0x00010000 +#define F2FS_MOUNT_PRJQUOTA 0x00020000 +#define F2FS_MOUNT_QUOTA 0x00040000 +#define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00080000 +#define F2FS_MOUNT_RESERVE_ROOT 0x00100000 +#define F2FS_MOUNT_DISABLE_CHECKPOINT 0x00200000 +#define F2FS_MOUNT_NORECOVERY 0x00400000 +#define F2FS_MOUNT_ATGC 0x00800000 +#define F2FS_MOUNT_MERGE_CHECKPOINT 0x01000000 +#define F2FS_MOUNT_GC_MERGE 0x02000000 +#define F2FS_MOUNT_COMPRESS_CACHE 0x04000000 +#define F2FS_MOUNT_AGE_EXTENT_CACHE 0x08000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -162,6 +162,7 @@ struct f2fs_mount_info { int fs_mode; /* fs mode: LFS or ADAPTIVE */ int bggc_mode; /* bggc mode: off, on or sync */ int memory_mode; /* memory mode */ + int errors; /* errors parameter */ int discard_unit; /* * discard command's offset/size should * be aligned to this unit: block, @@ -185,21 +186,21 @@ struct f2fs_mount_info { unsigned char noextensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ }; -#define F2FS_FEATURE_ENCRYPT 0x0001 -#define F2FS_FEATURE_BLKZONED 0x0002 -#define F2FS_FEATURE_ATOMIC_WRITE 0x0004 -#define F2FS_FEATURE_EXTRA_ATTR 0x0008 -#define F2FS_FEATURE_PRJQUOTA 0x0010 -#define F2FS_FEATURE_INODE_CHKSUM 0x0020 -#define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR 0x0040 -#define F2FS_FEATURE_QUOTA_INO 0x0080 -#define F2FS_FEATURE_INODE_CRTIME 0x0100 -#define F2FS_FEATURE_LOST_FOUND 0x0200 -#define F2FS_FEATURE_VERITY 0x0400 -#define F2FS_FEATURE_SB_CHKSUM 0x0800 -#define F2FS_FEATURE_CASEFOLD 0x1000 -#define F2FS_FEATURE_COMPRESSION 0x2000 -#define F2FS_FEATURE_RO 0x4000 +#define F2FS_FEATURE_ENCRYPT 0x00000001 +#define F2FS_FEATURE_BLKZONED 0x00000002 +#define F2FS_FEATURE_ATOMIC_WRITE 0x00000004 +#define F2FS_FEATURE_EXTRA_ATTR 0x00000008 +#define F2FS_FEATURE_PRJQUOTA 0x00000010 +#define F2FS_FEATURE_INODE_CHKSUM 0x00000020 +#define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR 0x00000040 +#define F2FS_FEATURE_QUOTA_INO 0x00000080 +#define F2FS_FEATURE_INODE_CRTIME 0x00000100 +#define F2FS_FEATURE_LOST_FOUND 0x00000200 +#define F2FS_FEATURE_VERITY 0x00000400 +#define F2FS_FEATURE_SB_CHKSUM 0x00000800 +#define F2FS_FEATURE_CASEFOLD 0x00001000 +#define F2FS_FEATURE_COMPRESSION 0x00002000 +#define F2FS_FEATURE_RO 0x00004000 #define __F2FS_HAS_FEATURE(raw_super, mask) \ ((raw_super->feature & cpu_to_le32(mask)) != 0) @@ -1175,6 +1176,7 @@ enum iostat_type { /* other */ FS_DISCARD_IO, /* discard */ FS_FLUSH_IO, /* flush */ + FS_ZONE_RESET_IO, /* zone reset */ NR_IO_TYPE, }; @@ -1217,6 +1219,11 @@ struct f2fs_bio_info { struct bio *bio; /* bios to merge */ sector_t last_block_in_bio; /* last block number */ struct f2fs_io_info fio; /* store buffered io info. */ +#ifdef CONFIG_BLK_DEV_ZONED + struct completion zone_wait; /* condition value for the previous open zone to close */ + struct bio *zone_pending_bio; /* pending bio for the previous zone */ + void *bi_private; /* previous bi_private for pending bio */ +#endif struct f2fs_rwsem io_rwsem; /* blocking op for bio */ spinlock_t io_lock; /* serialize DATA/NODE IOs */ struct list_head io_list; /* track fios */ @@ -1370,6 +1377,12 @@ enum { MEMORY_MODE_LOW, /* memory mode for low memry devices */ }; +enum errors_option { + MOUNT_ERRORS_READONLY, /* remount fs ro on errors */ + MOUNT_ERRORS_CONTINUE, /* continue on errors */ + MOUNT_ERRORS_PANIC, /* panic on errors */ +}; + static inline int f2fs_test_bit(unsigned int nr, char *addr); static inline void f2fs_set_bit(unsigned int nr, char *addr); static inline void f2fs_clear_bit(unsigned int nr, char *addr); @@ -1427,6 +1440,8 @@ struct compress_data { #define F2FS_COMPRESSED_PAGE_MAGIC 0xF5F2C000 +#define F2FS_ZSTD_DEFAULT_CLEVEL 1 + #define COMPRESS_LEVEL_OFFSET 8 /* compress context */ @@ -1721,8 +1736,14 @@ struct f2fs_sb_info { struct workqueue_struct *post_read_wq; /* post read workqueue */ - unsigned char errors[MAX_F2FS_ERRORS]; /* error flags */ - spinlock_t error_lock; /* protect errors array */ + /* + * If we are in irq context, let's update error information into + * on-disk superblock in the work. + */ + struct work_struct s_error_work; + unsigned char errors[MAX_F2FS_ERRORS]; /* error flags */ + unsigned char stop_reason[MAX_STOP_REASON]; /* stop reason */ + spinlock_t error_lock; /* protect errors/stop_reason array */ bool error_dirty; /* errors of sb is dirty */ struct kmem_cache *inline_xattr_slab; /* inline xattr entry */ @@ -2941,6 +2962,8 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) #define F2FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define F2FS_CASEFOLD_FL 0x40000000 /* Casefolded file */ +#define F2FS_QUOTA_DEFAULT_FL (F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL) + /* Flags that should be inherited by new inodes from their parent. */ #define F2FS_FL_INHERITED (F2FS_SYNC_FL | F2FS_NODUMP_FL | F2FS_NOATIME_FL | \ F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ @@ -3394,6 +3417,8 @@ static inline int get_inline_xattr_addrs(struct inode *inode) ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) +#define F2FS_MIN_EXTRA_ATTR_SIZE (sizeof(__le32)) + #define F2FS_TOTAL_EXTRA_ATTR_SIZE \ (offsetof(struct f2fs_inode, i_extra_end) - \ offsetof(struct f2fs_inode, i_extra_isize)) \ @@ -3432,7 +3457,6 @@ static inline bool __is_valid_data_blkaddr(block_t blkaddr) * file.c */ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); -void f2fs_truncate_data_blocks(struct dnode_of_data *dn); int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate(struct inode *inode); @@ -3541,9 +3565,11 @@ int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); int f2fs_quota_sync(struct super_block *sb, int type); loff_t max_file_blocks(struct inode *inode); void f2fs_quota_off_umount(struct super_block *sb); -void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason); void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag); +void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason, + bool irq_context); void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error); +void f2fs_handle_error_async(struct f2fs_sb_info *sbi, unsigned char error); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi); @@ -3815,7 +3841,7 @@ void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi); block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode); int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control); void f2fs_build_gc_manager(struct f2fs_sb_info *sbi); -int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count); +int f2fs_resize_fs(struct file *filp, __u64 block_count); int __init f2fs_create_garbage_collection_cache(void); void f2fs_destroy_garbage_collection_cache(void); /* victim selection function for cleaning and SSR */ @@ -4213,6 +4239,7 @@ bool f2fs_compress_write_end(struct inode *inode, void *fsdata, int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock); void f2fs_compress_write_end_io(struct bio *bio, struct page *page); bool f2fs_is_compress_backend_ready(struct inode *inode); +bool f2fs_is_compress_level_valid(int alg, int lvl); int __init f2fs_init_compress_mempool(void); void f2fs_destroy_compress_mempool(void); void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task); @@ -4277,6 +4304,7 @@ static inline bool f2fs_is_compress_backend_ready(struct inode *inode) /* not support compression */ return false; } +static inline bool f2fs_is_compress_level_valid(int alg, int lvl) { return false; } static inline struct page *f2fs_compress_control_page(struct page *page) { WARN_ON_ONCE(1); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5ac53d2627d2..093039dee992 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -149,8 +149,6 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) zero_user_segment(page, offset, PAGE_SIZE); } set_page_dirty(page); - if (!PageUptodate(page)) - SetPageUptodate(page); f2fs_update_iostat(sbi, inode, APP_MAPPED_IO, F2FS_BLKSIZE); f2fs_update_time(sbi, REQ_TIME); @@ -546,7 +544,8 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) if (err) return err; - filp->f_mode |= FMODE_NOWAIT; + filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; + filp->f_mode |= FMODE_CAN_ODIRECT; return dquot_file_open(inode, filp); } @@ -627,11 +626,6 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) dn->ofs_in_node, nr_free); } -void f2fs_truncate_data_blocks(struct dnode_of_data *dn) -{ - f2fs_truncate_data_blocks_range(dn, ADDRS_PER_BLOCK(dn->inode)); -} - static int truncate_partial_data_page(struct inode *inode, u64 from, bool cache_only) { @@ -2225,7 +2219,6 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) ret = 0; f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); - set_sbi_flag(sbi, SBI_IS_SHUTDOWN); trace_f2fs_shutdown(sbi, in, ret); } return ret; @@ -2238,7 +2231,6 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) if (ret) goto out; f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); - set_sbi_flag(sbi, SBI_IS_SHUTDOWN); thaw_bdev(sb->s_bdev); break; case F2FS_GOING_DOWN_METASYNC: @@ -2247,16 +2239,13 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) if (ret) goto out; f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); - set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_NOSYNC: f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); - set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_METAFLUSH: f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); - set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_NEED_FSCK: set_sbi_flag(sbi, SBI_NEED_FSCK); @@ -2593,6 +2582,11 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, inode_lock(inode); + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { + err = -EINVAL; + goto unlock_out; + } + /* if in-place-update policy is enabled, don't waste time here */ set_inode_flag(inode, FI_OPU_WRITE); if (f2fs_should_update_inplace(inode, NULL)) { @@ -2717,6 +2711,7 @@ clear_out: clear_inode_flag(inode, FI_SKIP_WRITES); out: clear_inode_flag(inode, FI_OPU_WRITE); +unlock_out: inode_unlock(inode); if (!err) range->len = (u64)total << PAGE_SHIFT; @@ -2876,6 +2871,17 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, f2fs_up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); out_src: f2fs_up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); + if (ret) + goto out_unlock; + + src->i_mtime = src->i_ctime = current_time(src); + f2fs_mark_inode_dirty_sync(src, false); + if (src != dst) { + dst->i_mtime = dst->i_ctime = current_time(dst); + f2fs_mark_inode_dirty_sync(dst, false); + } + f2fs_update_time(sbi, REQ_TIME); + out_unlock: if (src != dst) inode_unlock(dst); @@ -3278,7 +3284,7 @@ static int f2fs_ioc_resize_fs(struct file *filp, unsigned long arg) sizeof(block_count))) return -EFAULT; - return f2fs_resize_fs(sbi, block_count); + return f2fs_resize_fs(filp, block_count); } static int f2fs_ioc_enable_verity(struct file *filp, unsigned long arg) @@ -3375,18 +3381,29 @@ out: return err; } -static int f2fs_get_compress_blocks(struct file *filp, unsigned long arg) +static int f2fs_get_compress_blocks(struct inode *inode, __u64 *blocks) { - struct inode *inode = file_inode(filp); - __u64 blocks; - if (!f2fs_sb_has_compression(F2FS_I_SB(inode))) return -EOPNOTSUPP; if (!f2fs_compressed_file(inode)) return -EINVAL; - blocks = atomic_read(&F2FS_I(inode)->i_compr_blocks); + *blocks = atomic_read(&F2FS_I(inode)->i_compr_blocks); + + return 0; +} + +static int f2fs_ioc_get_compress_blocks(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + __u64 blocks; + int ret; + + ret = f2fs_get_compress_blocks(inode, &blocks); + if (ret < 0) + return ret; + return put_user(blocks, (u64 __user *)arg); } @@ -3455,7 +3472,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) int ret; int writecount; - if (!f2fs_sb_has_compression(F2FS_I_SB(inode))) + if (!f2fs_sb_has_compression(sbi)) return -EOPNOTSUPP; if (!f2fs_compressed_file(inode)) @@ -3468,7 +3485,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) if (ret) return ret; - f2fs_balance_fs(F2FS_I_SB(inode), true); + f2fs_balance_fs(sbi, true); inode_lock(inode); @@ -3488,13 +3505,15 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) if (ret) goto out; + if (!atomic_read(&F2FS_I(inode)->i_compr_blocks)) { + ret = -EPERM; + goto out; + } + set_inode_flag(inode, FI_COMPRESS_RELEASED); inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, true); - if (!atomic_read(&F2FS_I(inode)->i_compr_blocks)) - goto out; - f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); @@ -3625,7 +3644,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) unsigned int reserved_blocks = 0; int ret; - if (!f2fs_sb_has_compression(F2FS_I_SB(inode))) + if (!f2fs_sb_has_compression(sbi)) return -EOPNOTSUPP; if (!f2fs_compressed_file(inode)) @@ -3641,7 +3660,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) if (atomic_read(&F2FS_I(inode)->i_compr_blocks)) goto out; - f2fs_balance_fs(F2FS_I_SB(inode), true); + f2fs_balance_fs(sbi, true); inode_lock(inode); @@ -4035,7 +4054,7 @@ static int f2fs_ioc_decompress_file(struct file *filp) if (!f2fs_compressed_file(inode)) return -EINVAL; - f2fs_balance_fs(F2FS_I_SB(inode), true); + f2fs_balance_fs(sbi, true); file_start_write(filp); inode_lock(inode); @@ -4110,7 +4129,7 @@ static int f2fs_ioc_compress_file(struct file *filp) if (!f2fs_compressed_file(inode)) return -EINVAL; - f2fs_balance_fs(F2FS_I_SB(inode), true); + f2fs_balance_fs(sbi, true); file_start_write(filp); inode_lock(inode); @@ -4238,7 +4257,7 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case FS_IOC_SETFSLABEL: return f2fs_ioc_setfslabel(filp, arg); case F2FS_IOC_GET_COMPRESS_BLOCKS: - return f2fs_get_compress_blocks(filp, arg); + return f2fs_ioc_get_compress_blocks(filp, arg); case F2FS_IOC_RELEASE_COMPRESS_BLOCKS: return f2fs_release_compress_blocks(filp, arg); case F2FS_IOC_RESERVE_COMPRESS_BLOCKS: @@ -4367,22 +4386,23 @@ out: return ret; } -static void f2fs_trace_rw_file_path(struct kiocb *iocb, size_t count, int rw) +static void f2fs_trace_rw_file_path(struct file *file, loff_t pos, size_t count, + int rw) { - struct inode *inode = file_inode(iocb->ki_filp); + struct inode *inode = file_inode(file); char *buf, *path; buf = f2fs_getname(F2FS_I_SB(inode)); if (!buf) return; - path = dentry_path_raw(file_dentry(iocb->ki_filp), buf, PATH_MAX); + path = dentry_path_raw(file_dentry(file), buf, PATH_MAX); if (IS_ERR(path)) goto free_buf; if (rw == WRITE) - trace_f2fs_datawrite_start(inode, iocb->ki_pos, count, + trace_f2fs_datawrite_start(inode, pos, count, current->pid, path, current->comm); else - trace_f2fs_dataread_start(inode, iocb->ki_pos, count, + trace_f2fs_dataread_start(inode, pos, count, current->pid, path, current->comm); free_buf: f2fs_putname(buf); @@ -4398,7 +4418,8 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) return -EOPNOTSUPP; if (trace_f2fs_dataread_start_enabled()) - f2fs_trace_rw_file_path(iocb, iov_iter_count(to), READ); + f2fs_trace_rw_file_path(iocb->ki_filp, iocb->ki_pos, + iov_iter_count(to), READ); if (f2fs_should_use_dio(inode, iocb, to)) { ret = f2fs_dio_read_iter(iocb, to); @@ -4413,6 +4434,30 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) return ret; } +static ssize_t f2fs_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct inode *inode = file_inode(in); + const loff_t pos = *ppos; + ssize_t ret; + + if (!f2fs_is_compress_backend_ready(inode)) + return -EOPNOTSUPP; + + if (trace_f2fs_dataread_start_enabled()) + f2fs_trace_rw_file_path(in, pos, len, READ); + + ret = filemap_splice_read(in, ppos, pipe, len, flags); + if (ret > 0) + f2fs_update_iostat(F2FS_I_SB(inode), inode, + APP_BUFFERED_READ_IO, ret); + + if (trace_f2fs_dataread_end_enabled()) + trace_f2fs_dataread_end(inode, pos, ret); + return ret; +} + static ssize_t f2fs_write_checks(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; @@ -4517,12 +4562,9 @@ static ssize_t f2fs_buffered_write_iter(struct kiocb *iocb, if (iocb->ki_flags & IOCB_NOWAIT) return -EOPNOTSUPP; - current->backing_dev_info = inode_to_bdi(inode); ret = generic_perform_write(iocb, from); - current->backing_dev_info = NULL; if (ret > 0) { - iocb->ki_pos += ret; f2fs_update_iostat(F2FS_I_SB(inode), inode, APP_BUFFERED_IO, ret); } @@ -4714,7 +4756,8 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = preallocated; } else { if (trace_f2fs_datawrite_start_enabled()) - f2fs_trace_rw_file_path(iocb, orig_count, WRITE); + f2fs_trace_rw_file_path(iocb->ki_filp, iocb->ki_pos, + orig_count, WRITE); /* Do the actual write. */ ret = dio ? @@ -4919,7 +4962,7 @@ const struct file_operations f2fs_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = f2fs_compat_ioctl, #endif - .splice_read = generic_file_splice_read, + .splice_read = f2fs_file_splice_read, .splice_write = iter_file_splice_write, .fadvise = f2fs_file_fadvise, }; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 61c5f9d26018..01effd3fcb6c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -59,7 +59,7 @@ static int gc_thread_func(void *data) if (gc_th->gc_wake) gc_th->gc_wake = false; - if (try_to_freeze()) { + if (try_to_freeze() || f2fs_readonly(sbi->sb)) { stat_other_skip_bggc_count(sbi); continue; } @@ -1797,7 +1797,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control) { int gc_type = gc_control->init_gc_type; unsigned int segno = gc_control->victim_segno; - int sec_freed = 0, seg_freed = 0, total_freed = 0; + int sec_freed = 0, seg_freed = 0, total_freed = 0, total_sec_freed = 0; int ret = 0; struct cp_control cpc; struct gc_inode_list gc_list = { @@ -1842,6 +1842,8 @@ gc_more: ret = f2fs_write_checkpoint(sbi, &cpc); if (ret) goto stop; + /* Reset due to checkpoint */ + sec_freed = 0; } } @@ -1866,15 +1868,17 @@ retry: gc_control->should_migrate_blocks); total_freed += seg_freed; - if (seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) + if (seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) { sec_freed++; + total_sec_freed++; + } if (gc_type == FG_GC) { sbi->cur_victim_sec = NULL_SEGNO; if (has_enough_free_secs(sbi, sec_freed, 0)) { if (!gc_control->no_bg_gc && - sec_freed < gc_control->nr_free_secs) + total_sec_freed < gc_control->nr_free_secs) goto go_gc_more; goto stop; } @@ -1901,6 +1905,8 @@ retry: ret = f2fs_write_checkpoint(sbi, &cpc); if (ret) goto stop; + /* Reset due to checkpoint */ + sec_freed = 0; } go_gc_more: segno = NULL_SEGNO; @@ -1913,7 +1919,7 @@ stop: if (gc_type == FG_GC) f2fs_unpin_all_sections(sbi, true); - trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed, + trace_f2fs_gc_end(sbi->sb, ret, total_freed, total_sec_freed, get_pages(sbi, F2FS_DIRTY_NODES), get_pages(sbi, F2FS_DIRTY_DENTS), get_pages(sbi, F2FS_DIRTY_IMETA), @@ -1927,7 +1933,7 @@ stop: put_gc_inode(&gc_list); if (gc_control->err_gc_skipped && !ret) - ret = sec_freed ? 0 : -EAGAIN; + ret = total_sec_freed ? 0 : -EAGAIN; return ret; } @@ -2099,8 +2105,9 @@ static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) } } -int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) +int f2fs_resize_fs(struct file *filp, __u64 block_count) { + struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp)); __u64 old_block_count, shrunk_blocks; struct cp_control cpc = { CP_RESIZE, 0, 0, 0 }; unsigned int secs; @@ -2138,12 +2145,18 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) return -EINVAL; } + err = mnt_want_write_file(filp); + if (err) + return err; + shrunk_blocks = old_block_count - block_count; secs = div_u64(shrunk_blocks, BLKS_PER_SEC(sbi)); /* stop other GC */ - if (!f2fs_down_write_trylock(&sbi->gc_lock)) - return -EAGAIN; + if (!f2fs_down_write_trylock(&sbi->gc_lock)) { + err = -EAGAIN; + goto out_drop_write; + } /* stop CP to protect MAIN_SEC in free_segment_range */ f2fs_lock_op(sbi); @@ -2163,10 +2176,20 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) out_unlock: f2fs_unlock_op(sbi); f2fs_up_write(&sbi->gc_lock); +out_drop_write: + mnt_drop_write_file(filp); if (err) return err; - freeze_super(sbi->sb); + err = freeze_super(sbi->sb); + if (err) + return err; + + if (f2fs_readonly(sbi->sb)) { + thaw_super(sbi->sb); + return -EROFS; + } + f2fs_down_write(&sbi->gc_lock); f2fs_down_write(&sbi->cp_global_sem); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index cf4327ad106c..09e986b050c6 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -10,6 +10,8 @@ #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/sched/mm.h> +#include <linux/lz4.h> +#include <linux/zstd.h> #include "f2fs.h" #include "node.h" @@ -202,6 +204,80 @@ void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page) ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page)); } +static bool sanity_check_compress_inode(struct inode *inode, + struct f2fs_inode *ri) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned char clevel; + + if (ri->i_compress_algorithm >= COMPRESS_MAX) { + f2fs_warn(sbi, + "%s: inode (ino=%lx) has unsupported compress algorithm: %u, run fsck to fix", + __func__, inode->i_ino, ri->i_compress_algorithm); + goto err; + } + if (le64_to_cpu(ri->i_compr_blocks) > + SECTOR_TO_BLOCK(inode->i_blocks)) { + f2fs_warn(sbi, + "%s: inode (ino=%lx) has inconsistent i_compr_blocks:%llu, i_blocks:%llu, run fsck to fix", + __func__, inode->i_ino, le64_to_cpu(ri->i_compr_blocks), + SECTOR_TO_BLOCK(inode->i_blocks)); + goto err; + } + if (ri->i_log_cluster_size < MIN_COMPRESS_LOG_SIZE || + ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) { + f2fs_warn(sbi, + "%s: inode (ino=%lx) has unsupported log cluster size: %u, run fsck to fix", + __func__, inode->i_ino, ri->i_log_cluster_size); + goto err; + } + + clevel = le16_to_cpu(ri->i_compress_flag) >> + COMPRESS_LEVEL_OFFSET; + switch (ri->i_compress_algorithm) { + case COMPRESS_LZO: +#ifdef CONFIG_F2FS_FS_LZO + if (clevel) + goto err_level; +#endif + break; + case COMPRESS_LZORLE: +#ifdef CONFIG_F2FS_FS_LZORLE + if (clevel) + goto err_level; +#endif + break; + case COMPRESS_LZ4: +#ifdef CONFIG_F2FS_FS_LZ4 +#ifdef CONFIG_F2FS_FS_LZ4HC + if (clevel && + (clevel < LZ4HC_MIN_CLEVEL || clevel > LZ4HC_MAX_CLEVEL)) + goto err_level; +#else + if (clevel) + goto err_level; +#endif +#endif + break; + case COMPRESS_ZSTD: +#ifdef CONFIG_F2FS_FS_ZSTD + if (clevel < zstd_min_clevel() || clevel > zstd_max_clevel()) + goto err_level; +#endif + break; + default: + goto err_level; + } + + return true; +err_level: + f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported compress level: %u, run fsck to fix", + __func__, inode->i_ino, clevel); +err: + set_sbi_flag(sbi, SBI_NEED_FSCK); + return false; +} + static bool sanity_check_inode(struct inode *inode, struct page *node_page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -225,41 +301,77 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } - if (f2fs_sb_has_flexible_inline_xattr(sbi) - && !f2fs_has_extra_attr(inode)) { + if (f2fs_has_extra_attr(inode)) { + if (!f2fs_sb_has_extra_attr(sbi)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: inode (ino=%lx) is with extra_attr, but extra_attr feature is off", + __func__, inode->i_ino); + return false; + } + if (fi->i_extra_isize > F2FS_TOTAL_EXTRA_ATTR_SIZE || + fi->i_extra_isize < F2FS_MIN_EXTRA_ATTR_SIZE || + fi->i_extra_isize % sizeof(__le32)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_extra_isize: %d, max: %zu", + __func__, inode->i_ino, fi->i_extra_isize, + F2FS_TOTAL_EXTRA_ATTR_SIZE); + return false; + } + if (f2fs_sb_has_flexible_inline_xattr(sbi) && + f2fs_has_inline_xattr(inode) && + (!fi->i_inline_xattr_size || + fi->i_inline_xattr_size > MAX_INLINE_XATTR_SIZE)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_inline_xattr_size: %d, max: %zu", + __func__, inode->i_ino, fi->i_inline_xattr_size, + MAX_INLINE_XATTR_SIZE); + return false; + } + if (f2fs_sb_has_compression(sbi) && + fi->i_flags & F2FS_COMPR_FL && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, + i_compress_flag)) { + if (!sanity_check_compress_inode(inode, ri)) + return false; + } + } else if (f2fs_sb_has_flexible_inline_xattr(sbi)) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: corrupted inode ino=%lx, run fsck to fix.", __func__, inode->i_ino); return false; } - if (f2fs_has_extra_attr(inode) && - !f2fs_sb_has_extra_attr(sbi)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_warn(sbi, "%s: inode (ino=%lx) is with extra_attr, but extra_attr feature is off", - __func__, inode->i_ino); - return false; - } - - if (fi->i_extra_isize > F2FS_TOTAL_EXTRA_ATTR_SIZE || - fi->i_extra_isize % sizeof(__le32)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_extra_isize: %d, max: %zu", - __func__, inode->i_ino, fi->i_extra_isize, - F2FS_TOTAL_EXTRA_ATTR_SIZE); - return false; - } - - if (f2fs_has_extra_attr(inode) && - f2fs_sb_has_flexible_inline_xattr(sbi) && - f2fs_has_inline_xattr(inode) && - (!fi->i_inline_xattr_size || - fi->i_inline_xattr_size > MAX_INLINE_XATTR_SIZE)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_inline_xattr_size: %d, max: %zu", - __func__, inode->i_ino, fi->i_inline_xattr_size, - MAX_INLINE_XATTR_SIZE); - return false; + if (!f2fs_sb_has_extra_attr(sbi)) { + if (f2fs_sb_has_project_quota(sbi)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", + __func__, inode->i_ino, F2FS_FEATURE_PRJQUOTA); + return false; + } + if (f2fs_sb_has_inode_chksum(sbi)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", + __func__, inode->i_ino, F2FS_FEATURE_INODE_CHKSUM); + return false; + } + if (f2fs_sb_has_flexible_inline_xattr(sbi)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", + __func__, inode->i_ino, F2FS_FEATURE_FLEXIBLE_INLINE_XATTR); + return false; + } + if (f2fs_sb_has_inode_crtime(sbi)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", + __func__, inode->i_ino, F2FS_FEATURE_INODE_CRTIME); + return false; + } + if (f2fs_sb_has_compression(sbi)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", + __func__, inode->i_ino, F2FS_FEATURE_COMPRESSION); + return false; + } } if (f2fs_sanity_check_inline_data(inode)) { @@ -283,39 +395,6 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } - if (f2fs_has_extra_attr(inode) && f2fs_sb_has_compression(sbi) && - fi->i_flags & F2FS_COMPR_FL && - F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, - i_log_cluster_size)) { - if (ri->i_compress_algorithm >= COMPRESS_MAX) { - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported " - "compress algorithm: %u, run fsck to fix", - __func__, inode->i_ino, - ri->i_compress_algorithm); - return false; - } - if (le64_to_cpu(ri->i_compr_blocks) > - SECTOR_TO_BLOCK(inode->i_blocks)) { - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_warn(sbi, "%s: inode (ino=%lx) has inconsistent " - "i_compr_blocks:%llu, i_blocks:%llu, run fsck to fix", - __func__, inode->i_ino, - le64_to_cpu(ri->i_compr_blocks), - SECTOR_TO_BLOCK(inode->i_blocks)); - return false; - } - if (ri->i_log_cluster_size < MIN_COMPRESS_LOG_SIZE || - ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) { - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported " - "log cluster size: %u, run fsck to fix", - __func__, inode->i_ino, - ri->i_log_cluster_size); - return false; - } - } - return true; } @@ -442,7 +521,7 @@ static int do_read_inode(struct inode *inode) if (f2fs_has_extra_attr(inode) && f2fs_sb_has_compression(sbi) && (fi->i_flags & F2FS_COMPR_FL)) { if (F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, - i_log_cluster_size)) { + i_compress_flag)) { unsigned short compress_flag; atomic_set(&fi->i_compr_blocks, @@ -680,7 +759,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) if (f2fs_sb_has_compression(F2FS_I_SB(inode)) && F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, - i_log_cluster_size)) { + i_compress_flag)) { unsigned short compress_flag; ri->i_compr_blocks = diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c index 3d5bfb1ad585..f8703038e1d8 100644 --- a/fs/f2fs/iostat.c +++ b/fs/f2fs/iostat.c @@ -80,6 +80,7 @@ int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset) seq_puts(seq, "[OTHER]\n"); IOSTAT_INFO_SHOW("fs discard", FS_DISCARD_IO); IOSTAT_INFO_SHOW("fs flush", FS_FLUSH_IO); + IOSTAT_INFO_SHOW("fs zone reset", FS_ZONE_RESET_IO); return 0; } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 77a71276ecb1..bee0568888da 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -23,7 +23,7 @@ #include <trace/events/f2fs.h> static inline bool is_extension_exist(const unsigned char *s, const char *sub, - bool tmp_ext) + bool tmp_ext, bool tmp_dot) { size_t slen = strlen(s); size_t sublen = strlen(sub); @@ -49,13 +49,27 @@ static inline bool is_extension_exist(const unsigned char *s, const char *sub, for (i = 1; i < slen - sublen; i++) { if (s[i] != '.') continue; - if (!strncasecmp(s + i + 1, sub, sublen)) - return true; + if (!strncasecmp(s + i + 1, sub, sublen)) { + if (!tmp_dot) + return true; + if (i == slen - sublen - 1 || s[i + 1 + sublen] == '.') + return true; + } } return false; } +static inline bool is_temperature_extension(const unsigned char *s, const char *sub) +{ + return is_extension_exist(s, sub, true, false); +} + +static inline bool is_compress_extension(const unsigned char *s, const char *sub) +{ + return is_extension_exist(s, sub, true, true); +} + int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, bool hot, bool set) { @@ -148,7 +162,7 @@ static void set_compress_new_inode(struct f2fs_sb_info *sbi, struct inode *dir, cold_count = le32_to_cpu(sbi->raw_super->extension_count); hot_count = sbi->raw_super->hot_ext_count; for (i = cold_count; i < cold_count + hot_count; i++) - if (is_extension_exist(name, extlist[i], false)) + if (is_temperature_extension(name, extlist[i])) break; f2fs_up_read(&sbi->sb_lock); if (i < (cold_count + hot_count)) @@ -156,12 +170,12 @@ static void set_compress_new_inode(struct f2fs_sb_info *sbi, struct inode *dir, /* Don't compress unallowed extension. */ for (i = 0; i < noext_cnt; i++) - if (is_extension_exist(name, noext[i], false)) + if (is_compress_extension(name, noext[i])) return; /* Compress wanting extension. */ for (i = 0; i < ext_cnt; i++) { - if (is_extension_exist(name, ext[i], false)) { + if (is_compress_extension(name, ext[i])) { set_compress_context(inode); return; } @@ -189,7 +203,7 @@ static void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode, cold_count = le32_to_cpu(sbi->raw_super->extension_count); hot_count = sbi->raw_super->hot_ext_count; for (i = 0; i < cold_count + hot_count; i++) - if (is_extension_exist(name, extlist[i], true)) + if (is_temperature_extension(name, extlist[i])) break; f2fs_up_read(&sbi->sb_lock); @@ -576,8 +590,8 @@ out_splice: } #endif new = d_splice_alias(inode, dentry); - err = PTR_ERR_OR_ZERO(new); - trace_f2fs_lookup_end(dir, dentry, ino, !new ? -ENOENT : err); + trace_f2fs_lookup_end(dir, !IS_ERR_OR_NULL(new) ? new : dentry, + ino, IS_ERR(new) ? PTR_ERR(new) : err); return new; out_iput: iput(inode); @@ -995,20 +1009,12 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, goto out; } - /* - * Copied from ext4_rename: we need to protect against old.inode - * directory getting converted from inline directory format into - * a normal one. - */ - if (S_ISDIR(old_inode->i_mode)) - inode_lock_nested(old_inode, I_MUTEX_NONDIR2); - err = -ENOENT; old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) err = PTR_ERR(old_page); - goto out_unlock_old; + goto out; } if (S_ISDIR(old_inode->i_mode)) { @@ -1116,9 +1122,6 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, f2fs_unlock_op(sbi); - if (S_ISDIR(old_inode->i_mode)) - inode_unlock(old_inode); - if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) f2fs_sync_fs(sbi->sb, 1); @@ -1133,9 +1136,6 @@ out_dir: f2fs_put_page(old_dir_page, 0); out_old: f2fs_put_page(old_page, 0); -out_unlock_old: - if (S_ISDIR(old_inode->i_mode)) - inode_unlock(old_inode); out: iput(whiteout); return err; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index bd1dad523796..ee2e1dd64f25 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -925,6 +925,7 @@ static int truncate_node(struct dnode_of_data *dn) static int truncate_dnode(struct dnode_of_data *dn) { + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct page *page; int err; @@ -932,19 +933,30 @@ static int truncate_dnode(struct dnode_of_data *dn) return 1; /* get direct node */ - page = f2fs_get_node_page(F2FS_I_SB(dn->inode), dn->nid); + page = f2fs_get_node_page(sbi, dn->nid); if (PTR_ERR(page) == -ENOENT) return 1; else if (IS_ERR(page)) return PTR_ERR(page); + if (IS_INODE(page) || ino_of_node(page) != dn->inode->i_ino) { + f2fs_err(sbi, "incorrect node reference, ino: %lu, nid: %u, ino_of_node: %u", + dn->inode->i_ino, dn->nid, ino_of_node(page)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INVALID_NODE_REFERENCE); + f2fs_put_page(page, 1); + return -EFSCORRUPTED; + } + /* Make dnode_of_data for parameter */ dn->node_page = page; dn->ofs_in_node = 0; - f2fs_truncate_data_blocks(dn); + f2fs_truncate_data_blocks_range(dn, ADDRS_PER_BLOCK(dn->inode)); err = truncate_node(dn); - if (err) + if (err) { + f2fs_put_page(page, 1); return err; + } return 1; } @@ -1596,6 +1608,9 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, trace_f2fs_writepage(page, NODE); if (unlikely(f2fs_cp_error(sbi))) { + /* keep node pages in remount-ro mode */ + if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_READONLY) + goto redirty_out; ClearPageUptodate(page); dec_page_count(sbi, F2FS_DIRTY_NODES); unlock_page(page); @@ -2063,7 +2078,6 @@ int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, struct list_head *head = &sbi->fsync_node_list; unsigned long flags; unsigned int cur_seq_id = 0; - int ret2, ret = 0; while (seq_id && cur_seq_id < seq_id) { spin_lock_irqsave(&sbi->fsync_node_lock, flags); @@ -2084,16 +2098,9 @@ int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, f2fs_wait_on_page_writeback(page, NODE, true, false); put_page(page); - - if (ret) - break; } - ret2 = filemap_check_errors(NODE_MAPPING(sbi)); - if (!ret) - ret = ret2; - - return ret; + return filemap_check_errors(NODE_MAPPING(sbi)); } static int f2fs_write_node_pages(struct address_space *mapping, @@ -3065,7 +3072,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_journal *journal = curseg->journal; - struct nat_entry_set *setvec[SETVEC_SIZE]; + struct nat_entry_set *setvec[NAT_VEC_SIZE]; struct nat_entry_set *set, *tmp; unsigned int found; nid_t set_idx = 0; @@ -3098,7 +3105,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) remove_nats_in_journal(sbi); while ((found = __gang_lookup_nat_set(nm_i, - set_idx, SETVEC_SIZE, setvec))) { + set_idx, NAT_VEC_SIZE, setvec))) { unsigned idx; set_idx = setvec[found - 1]->set + 1; @@ -3319,8 +3326,9 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i, *next_i; - struct nat_entry *natvec[NATVEC_SIZE]; - struct nat_entry_set *setvec[SETVEC_SIZE]; + void *vec[NAT_VEC_SIZE]; + struct nat_entry **natvec = (struct nat_entry **)vec; + struct nat_entry_set **setvec = (struct nat_entry_set **)vec; nid_t nid = 0; unsigned int found; @@ -3343,7 +3351,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) /* destroy nat cache */ f2fs_down_write(&nm_i->nat_tree_lock); while ((found = __gang_lookup_nat_cache(nm_i, - nid, NATVEC_SIZE, natvec))) { + nid, NAT_VEC_SIZE, natvec))) { unsigned idx; nid = nat_get_nid(natvec[found - 1]) + 1; @@ -3359,8 +3367,9 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) /* destroy nat set cache */ nid = 0; + memset(vec, 0, sizeof(void *) * NAT_VEC_SIZE); while ((found = __gang_lookup_nat_set(nm_i, - nid, SETVEC_SIZE, setvec))) { + nid, NAT_VEC_SIZE, setvec))) { unsigned idx; nid = setvec[found - 1]->set + 1; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 906fb67a99da..5bd16a95eef8 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -35,8 +35,7 @@ #define DEF_RF_NODE_BLOCKS 0 /* vector size for gang look-up from nat cache that consists of radix tree */ -#define NATVEC_SIZE 64 -#define SETVEC_SIZE 32 +#define NAT_VEC_SIZE 32 /* return value for read_node_page */ #define LOCKED_PAGE 1 diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 58c1a0096f7d..4e7d4ceeb084 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -360,21 +360,63 @@ static unsigned int adjust_por_ra_blocks(struct f2fs_sb_info *sbi, return ra_blocks; } +/* Detect looped node chain with Floyd's cycle detection algorithm. */ +static int sanity_check_node_chain(struct f2fs_sb_info *sbi, block_t blkaddr, + block_t *blkaddr_fast, bool *is_detecting) +{ + unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS; + struct page *page = NULL; + int i; + + if (!*is_detecting) + return 0; + + for (i = 0; i < 2; i++) { + if (!f2fs_is_valid_blkaddr(sbi, *blkaddr_fast, META_POR)) { + *is_detecting = false; + return 0; + } + + page = f2fs_get_tmp_page(sbi, *blkaddr_fast); + if (IS_ERR(page)) + return PTR_ERR(page); + + if (!is_recoverable_dnode(page)) { + f2fs_put_page(page, 1); + *is_detecting = false; + return 0; + } + + ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, *blkaddr_fast, + next_blkaddr_of_node(page)); + + *blkaddr_fast = next_blkaddr_of_node(page); + f2fs_put_page(page, 1); + + f2fs_ra_meta_pages_cond(sbi, *blkaddr_fast, ra_blocks); + } + + if (*blkaddr_fast == blkaddr) { + f2fs_notice(sbi, "%s: Detect looped node chain on blkaddr:%u." + " Run fsck to fix it.", __func__, blkaddr); + return -EINVAL; + } + return 0; +} + static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, bool check_only) { struct curseg_info *curseg; struct page *page = NULL; - block_t blkaddr; - unsigned int loop_cnt = 0; - unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS; - unsigned int free_blocks = MAIN_SEGS(sbi) * sbi->blocks_per_seg - - valid_user_blocks(sbi); + block_t blkaddr, blkaddr_fast; + bool is_detecting = true; int err = 0; /* get node pages in the current segment */ curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + blkaddr_fast = blkaddr; while (1) { struct fsync_inode_entry *entry; @@ -418,10 +460,8 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, quota_inode); if (IS_ERR(entry)) { err = PTR_ERR(entry); - if (err == -ENOENT) { - err = 0; + if (err == -ENOENT) goto next; - } f2fs_put_page(page, 1); break; } @@ -431,25 +471,14 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, if (IS_INODE(page) && is_dent_dnode(page)) entry->last_dentry = blkaddr; next: - /* sanity check in order to detect looped node chain */ - if (++loop_cnt >= free_blocks || - blkaddr == next_blkaddr_of_node(page)) { - f2fs_notice(sbi, "%s: detect looped node chain, blkaddr:%u, next:%u", - __func__, blkaddr, - next_blkaddr_of_node(page)); - f2fs_put_page(page, 1); - err = -EINVAL; - break; - } - - ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, blkaddr, - next_blkaddr_of_node(page)); - /* check next segment */ blkaddr = next_blkaddr_of_node(page); f2fs_put_page(page, 1); - f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks); + err = sanity_check_node_chain(sbi, blkaddr, &blkaddr_fast, + &is_detecting); + if (err) + break; } return err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6db410f1bb8c..0457d620011f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1196,6 +1196,45 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, static void __update_discard_tree_range(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, block_t start, block_t len); + +#ifdef CONFIG_BLK_DEV_ZONED +static void __submit_zone_reset_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc, blk_opf_t flag, + struct list_head *wait_list, + unsigned int *issued) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct block_device *bdev = dc->bdev; + struct bio *bio = bio_alloc(bdev, 0, REQ_OP_ZONE_RESET | flag, GFP_NOFS); + unsigned long flags; + + trace_f2fs_issue_reset_zone(bdev, dc->di.start); + + spin_lock_irqsave(&dc->lock, flags); + dc->state = D_SUBMIT; + dc->bio_ref++; + spin_unlock_irqrestore(&dc->lock, flags); + + if (issued) + (*issued)++; + + atomic_inc(&dcc->queued_discard); + dc->queued++; + list_move_tail(&dc->list, wait_list); + + /* sanity check on discard range */ + __check_sit_bitmap(sbi, dc->di.lstart, dc->di.lstart + dc->di.len); + + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(dc->di.start); + bio->bi_private = dc; + bio->bi_end_io = f2fs_submit_discard_endio; + submit_bio(bio); + + atomic_inc(&dcc->issued_discard); + f2fs_update_iostat(sbi, NULL, FS_ZONE_RESET_IO, dc->di.len * F2FS_BLKSIZE); +} +#endif + /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, @@ -1217,6 +1256,13 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) return 0; +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev)) { + __submit_zone_reset_cmd(sbi, dc, flag, wait_list, issued); + return 0; + } +#endif + trace_f2fs_issue_discard(bdev, dc->di.start, dc->di.len); lstart = dc->di.lstart; @@ -1461,6 +1507,19 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, } } +#ifdef CONFIG_BLK_DEV_ZONED +static void __queue_zone_reset_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t lblkstart, + block_t blklen) +{ + trace_f2fs_queue_reset_zone(bdev, blkstart); + + mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock); + __insert_discard_cmd(sbi, bdev, lblkstart, blkstart, blklen); + mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock); +} +#endif + static void __queue_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { @@ -1724,6 +1783,19 @@ static void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) mutex_lock(&dcc->cmd_lock); dc = __lookup_discard_cmd(sbi, blkaddr); +#ifdef CONFIG_BLK_DEV_ZONED + if (dc && f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(dc->bdev)) { + /* force submit zone reset */ + if (dc->state == D_PREP) + __submit_zone_reset_cmd(sbi, dc, REQ_SYNC, + &dcc->wait_list, NULL); + dc->ref++; + mutex_unlock(&dcc->cmd_lock); + /* wait zone reset */ + __wait_one_discard_bio(sbi, dc); + return; + } +#endif if (dc) { if (dc->state == D_PREP) { __punch_discard_cmd(sbi, dc, blkaddr); @@ -1876,9 +1948,15 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, blkstart, blklen); return -EIO; } - trace_f2fs_issue_reset_zone(bdev, blkstart); - return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, - sector, nr_sects, GFP_NOFS); + + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) { + trace_f2fs_issue_reset_zone(bdev, blkstart); + return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, + sector, nr_sects, GFP_NOFS); + } + + __queue_zone_reset_cmd(sbi, bdev, blkstart, lblkstart, blklen); + return 0; } /* For conventional zones, use regular discard if supported */ @@ -2115,7 +2193,7 @@ find_next: len = next_pos - cur_pos; if (f2fs_sb_has_blkzoned(sbi) || - (force && len < cpc->trim_minlen)) + !force || len < cpc->trim_minlen) goto skip; f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos, @@ -4768,17 +4846,17 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, { unsigned int wp_segno, wp_blkoff, zone_secno, zone_segno, segno; block_t zone_block, wp_block, last_valid_block; - unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; int i, s, b, ret; struct seg_entry *se; if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ) return 0; - wp_block = fdev->start_blk + (zone->wp >> log_sectors_per_block); + wp_block = fdev->start_blk + (zone->wp >> sbi->log_sectors_per_block); wp_segno = GET_SEGNO(sbi, wp_block); wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno); - zone_block = fdev->start_blk + (zone->start >> log_sectors_per_block); + zone_block = fdev->start_blk + (zone->start >> + sbi->log_sectors_per_block); zone_segno = GET_SEGNO(sbi, zone_block); zone_secno = GET_SEC_FROM_SEG(sbi, zone_segno); @@ -4811,39 +4889,52 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, } /* - * If last valid block is beyond the write pointer, report the - * inconsistency. This inconsistency does not cause write error - * because the zone will not be selected for write operation until - * it get discarded. Just report it. + * The write pointer matches with the valid blocks or + * already points to the end of the zone. */ - if (last_valid_block >= wp_block) { - f2fs_notice(sbi, "Valid block beyond write pointer: " - "valid block[0x%x,0x%x] wp[0x%x,0x%x]", - GET_SEGNO(sbi, last_valid_block), - GET_BLKOFF_FROM_SEG0(sbi, last_valid_block), - wp_segno, wp_blkoff); + if ((last_valid_block + 1 == wp_block) || + (zone->wp == zone->start + zone->len)) return 0; - } - /* - * If there is no valid block in the zone and if write pointer is - * not at zone start, reset the write pointer. - */ - if (last_valid_block + 1 == zone_block && zone->wp != zone->start) { + if (last_valid_block + 1 == zone_block) { + /* + * If there is no valid block in the zone and if write pointer + * is not at zone start, reset the write pointer. + */ f2fs_notice(sbi, "Zone without valid block has non-zero write " "pointer. Reset the write pointer: wp[0x%x,0x%x]", wp_segno, wp_blkoff); ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block, - zone->len >> log_sectors_per_block); - if (ret) { + zone->len >> sbi->log_sectors_per_block); + if (ret) f2fs_err(sbi, "Discard zone failed: %s (errno=%d)", fdev->path, ret); - return ret; - } + + return ret; } - return 0; + /* + * If there are valid blocks and the write pointer doesn't + * match with them, we need to report the inconsistency and + * fill the zone till the end to close the zone. This inconsistency + * does not cause write error because the zone will not be selected + * for write operation until it get discarded. + */ + f2fs_notice(sbi, "Valid blocks are not aligned with write pointer: " + "valid block[0x%x,0x%x] wp[0x%x,0x%x]", + GET_SEGNO(sbi, last_valid_block), + GET_BLKOFF_FROM_SEG0(sbi, last_valid_block), + wp_segno, wp_blkoff); + + ret = blkdev_issue_zeroout(fdev->bdev, zone->wp, + zone->len - (zone->wp - zone->start), + GFP_NOFS, 0); + if (ret) + f2fs_err(sbi, "Fill up zone failed: %s (errno=%d)", + fdev->path, ret); + + return ret; } static struct f2fs_dev_info *get_target_zoned_dev(struct f2fs_sb_info *sbi, @@ -4876,7 +4967,6 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) struct blk_zone zone; unsigned int cs_section, wp_segno, wp_blkoff, wp_sector_off; block_t cs_zone_block, wp_block; - unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; sector_t zone_sector; int err; @@ -4888,8 +4978,8 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) return 0; /* report zone for the sector the curseg points to */ - zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) - << log_sectors_per_block; + zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) << + sbi->log_sectors_per_block; err = blkdev_report_zones(zbd->bdev, zone_sector, 1, report_one_zone_cb, &zone); if (err != 1) { @@ -4901,10 +4991,10 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ) return 0; - wp_block = zbd->start_blk + (zone.wp >> log_sectors_per_block); + wp_block = zbd->start_blk + (zone.wp >> sbi->log_sectors_per_block); wp_segno = GET_SEGNO(sbi, wp_block); wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno); - wp_sector_off = zone.wp & GENMASK(log_sectors_per_block - 1, 0); + wp_sector_off = zone.wp & GENMASK(sbi->log_sectors_per_block - 1, 0); if (cs->segno == wp_segno && cs->next_blkoff == wp_blkoff && wp_sector_off == 0) @@ -4931,8 +5021,8 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) if (!zbd) return 0; - zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) - << log_sectors_per_block; + zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) << + sbi->log_sectors_per_block; err = blkdev_report_zones(zbd->bdev, zone_sector, 1, report_one_zone_cb, &zone); if (err != 1) { @@ -4950,7 +5040,7 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) "Reset the zone: curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff); err = __f2fs_issue_discard_zone(sbi, zbd->bdev, cs_zone_block, - zone.len >> log_sectors_per_block); + zone.len >> sbi->log_sectors_per_block); if (err) { f2fs_err(sbi, "Discard zone failed: %s (errno=%d)", zbd->path, err); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 9f15b03037db..ca31163da00a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -164,6 +164,7 @@ enum { Opt_discard_unit, Opt_memory_mode, Opt_age_extent_cache, + Opt_errors, Opt_err, }; @@ -243,6 +244,7 @@ static match_table_t f2fs_tokens = { {Opt_discard_unit, "discard_unit=%s"}, {Opt_memory_mode, "memory=%s"}, {Opt_age_extent_cache, "age_extent_cache"}, + {Opt_errors, "errors=%s"}, {Opt_err, NULL}, }; @@ -587,14 +589,12 @@ static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str) { #ifdef CONFIG_F2FS_FS_LZ4HC unsigned int level; -#endif if (strlen(str) == 3) { - F2FS_OPTION(sbi).compress_level = 0; + F2FS_OPTION(sbi).compress_level = LZ4HC_DEFAULT_CLEVEL; return 0; } -#ifdef CONFIG_F2FS_FS_LZ4HC str += 3; if (str[0] != ':') { @@ -604,7 +604,7 @@ static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str) if (kstrtouint(str + 1, 10, &level)) return -EINVAL; - if (level < LZ4HC_MIN_CLEVEL || level > LZ4HC_MAX_CLEVEL) { + if (!f2fs_is_compress_level_valid(COMPRESS_LZ4, level)) { f2fs_info(sbi, "invalid lz4hc compress level: %d", level); return -EINVAL; } @@ -612,6 +612,10 @@ static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str) F2FS_OPTION(sbi).compress_level = level; return 0; #else + if (strlen(str) == 3) { + F2FS_OPTION(sbi).compress_level = 0; + return 0; + } f2fs_info(sbi, "kernel doesn't support lz4hc compression"); return -EINVAL; #endif @@ -625,7 +629,7 @@ static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str) int len = 4; if (strlen(str) == len) { - F2FS_OPTION(sbi).compress_level = 0; + F2FS_OPTION(sbi).compress_level = F2FS_ZSTD_DEFAULT_CLEVEL; return 0; } @@ -638,7 +642,7 @@ static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str) if (kstrtouint(str + 1, 10, &level)) return -EINVAL; - if (!level || level > zstd_max_clevel()) { + if (!f2fs_is_compress_level_valid(COMPRESS_ZSTD, level)) { f2fs_info(sbi, "invalid zstd compress level: %d", level); return -EINVAL; } @@ -1268,6 +1272,25 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_age_extent_cache: set_opt(sbi, AGE_EXTENT_CACHE); break; + case Opt_errors: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (!strcmp(name, "remount-ro")) { + F2FS_OPTION(sbi).errors = + MOUNT_ERRORS_READONLY; + } else if (!strcmp(name, "continue")) { + F2FS_OPTION(sbi).errors = + MOUNT_ERRORS_CONTINUE; + } else if (!strcmp(name, "panic")) { + F2FS_OPTION(sbi).errors = + MOUNT_ERRORS_PANIC; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); @@ -1340,7 +1363,7 @@ default_check: return -EINVAL; } - min_size = sizeof(struct f2fs_xattr_header) / sizeof(__le32); + min_size = MIN_INLINE_XATTR_SIZE; max_size = MAX_INLINE_XATTR_SIZE; if (F2FS_OPTION(sbi).inline_xattr_size < min_size || @@ -1538,7 +1561,7 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) int i; for (i = 0; i < sbi->s_ndevs; i++) { - blkdev_put(FDEV(i).bdev, FMODE_EXCL); + blkdev_put(FDEV(i).bdev, sbi->sb->s_type); #ifdef CONFIG_BLK_DEV_ZONED kvfree(FDEV(i).blkz_seq); #endif @@ -1550,6 +1573,7 @@ static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); int i; + int err = 0; bool done; /* unregister procfs/sysfs entries in advance to avoid race case */ @@ -1576,7 +1600,7 @@ static void f2fs_put_super(struct super_block *sb) struct cp_control cpc = { .reason = CP_UMOUNT, }; - f2fs_write_checkpoint(sbi, &cpc); + err = f2fs_write_checkpoint(sbi, &cpc); } /* be sure to wait for any on-going discard commands */ @@ -1585,7 +1609,7 @@ static void f2fs_put_super(struct super_block *sb) struct cp_control cpc = { .reason = CP_UMOUNT | CP_TRIMMED, }; - f2fs_write_checkpoint(sbi, &cpc); + err = f2fs_write_checkpoint(sbi, &cpc); } /* @@ -1602,6 +1626,19 @@ static void f2fs_put_super(struct super_block *sb) f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); + if (err) { + truncate_inode_pages_final(NODE_MAPPING(sbi)); + truncate_inode_pages_final(META_MAPPING(sbi)); + } + + for (i = 0; i < NR_COUNT_TYPE; i++) { + if (!get_pages(sbi, i)) + continue; + f2fs_err(sbi, "detect filesystem reference count leak during " + "umount, type: %d, count: %lld", i, get_pages(sbi, i)); + f2fs_bug_on(sbi, 1); + } + f2fs_bug_on(sbi, sbi->fsync_node_num); f2fs_destroy_compress_inode(sbi); @@ -1622,6 +1659,9 @@ static void f2fs_put_super(struct super_block *sb) f2fs_destroy_node_manager(sbi); f2fs_destroy_segment_manager(sbi); + /* flush s_error_work before sbi destroy */ + flush_work(&sbi->s_error_work); + f2fs_destroy_post_read_wq(sbi); kvfree(sbi->ckpt); @@ -2052,12 +2092,32 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_LOW) seq_printf(seq, ",memory=%s", "low"); + if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_READONLY) + seq_printf(seq, ",errors=%s", "remount-ro"); + else if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_CONTINUE) + seq_printf(seq, ",errors=%s", "continue"); + else if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_PANIC) + seq_printf(seq, ",errors=%s", "panic"); + return 0; } -static void default_options(struct f2fs_sb_info *sbi) +static void default_options(struct f2fs_sb_info *sbi, bool remount) { /* init some FS parameters */ + if (!remount) { + set_opt(sbi, READ_EXTENT_CACHE); + clear_opt(sbi, DISABLE_CHECKPOINT); + + if (f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) + set_opt(sbi, DISCARD); + + if (f2fs_sb_has_blkzoned(sbi)) + F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_SECTION; + else + F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_BLOCK; + } + if (f2fs_sb_has_readonly(sbi)) F2FS_OPTION(sbi).active_logs = NR_CURSEG_RO_TYPE; else @@ -2080,29 +2140,23 @@ static void default_options(struct f2fs_sb_info *sbi) } F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON; F2FS_OPTION(sbi).memory_mode = MEMORY_MODE_NORMAL; + F2FS_OPTION(sbi).errors = MOUNT_ERRORS_CONTINUE; sbi->sb->s_flags &= ~SB_INLINECRYPT; set_opt(sbi, INLINE_XATTR); set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); - set_opt(sbi, READ_EXTENT_CACHE); set_opt(sbi, NOHEAP); - clear_opt(sbi, DISABLE_CHECKPOINT); set_opt(sbi, MERGE_CHECKPOINT); F2FS_OPTION(sbi).unusable_cap = 0; sbi->sb->s_flags |= SB_LAZYTIME; if (!f2fs_is_readonly(sbi)) set_opt(sbi, FLUSH_MERGE); - if (f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) - set_opt(sbi, DISCARD); - if (f2fs_sb_has_blkzoned(sbi)) { + if (f2fs_sb_has_blkzoned(sbi)) F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS; - F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_SECTION; - } else { + else F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE; - F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_BLOCK; - } #ifdef CONFIG_F2FS_FS_XATTR set_opt(sbi, XATTR_USER); @@ -2274,13 +2328,16 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) clear_sbi_flag(sbi, SBI_NEED_SB_WRITE); } - default_options(sbi); + default_options(sbi, true); /* parse mount options */ err = parse_options(sb, data, true); if (err) goto restore_opts; + /* flush outstanding errors before changing fs state */ + flush_work(&sbi->s_error_work); + /* * Previous and new state of filesystem is RO, * so skip checking GC and FLUSH_MERGE conditions. @@ -2713,6 +2770,7 @@ static int f2fs_quota_enable(struct super_block *sb, int type, int format_id, { struct inode *qf_inode; unsigned long qf_inum; + unsigned long qf_flag = F2FS_QUOTA_DEFAULT_FL; int err; BUG_ON(!f2fs_sb_has_quota_ino(F2FS_SB(sb))); @@ -2728,7 +2786,15 @@ static int f2fs_quota_enable(struct super_block *sb, int type, int format_id, } /* Don't account quota for quota files to avoid recursion */ + inode_lock(qf_inode); qf_inode->i_flags |= S_NOQUOTA; + + if ((F2FS_I(qf_inode)->i_flags & qf_flag) != qf_flag) { + F2FS_I(qf_inode)->i_flags |= qf_flag; + f2fs_set_inode_flags(qf_inode); + } + inode_unlock(qf_inode); + err = dquot_load_quota_inode(qf_inode, type, format_id, flags); iput(qf_inode); return err; @@ -2859,18 +2925,29 @@ static int f2fs_quota_on(struct super_block *sb, int type, int format_id, return -EBUSY; } + if (path->dentry->d_sb != sb) + return -EXDEV; + err = f2fs_quota_sync(sb, type); if (err) return err; - err = dquot_quota_on(sb, type, format_id, path); + inode = d_inode(path->dentry); + + err = filemap_fdatawrite(inode->i_mapping); if (err) return err; - inode = d_inode(path->dentry); + err = filemap_fdatawait(inode->i_mapping); + if (err) + return err; + + err = dquot_quota_on(sb, type, format_id, path); + if (err) + return err; inode_lock(inode); - F2FS_I(inode)->i_flags |= F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL; + F2FS_I(inode)->i_flags |= F2FS_QUOTA_DEFAULT_FL; f2fs_set_inode_flags(inode); inode_unlock(inode); f2fs_mark_inode_dirty_sync(inode, false); @@ -2895,7 +2972,7 @@ static int __f2fs_quota_off(struct super_block *sb, int type) goto out_put; inode_lock(inode); - F2FS_I(inode)->i_flags &= ~(F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL); + F2FS_I(inode)->i_flags &= ~F2FS_QUOTA_DEFAULT_FL; f2fs_set_inode_flags(inode); inode_unlock(inode); f2fs_mark_inode_dirty_sync(inode, false); @@ -3926,55 +4003,73 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) return err; } -void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason) +static void save_stop_reason(struct f2fs_sb_info *sbi, unsigned char reason) +{ + unsigned long flags; + + spin_lock_irqsave(&sbi->error_lock, flags); + if (sbi->stop_reason[reason] < GENMASK(BITS_PER_BYTE - 1, 0)) + sbi->stop_reason[reason]++; + spin_unlock_irqrestore(&sbi->error_lock, flags); +} + +static void f2fs_record_stop_reason(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + unsigned long flags; int err; f2fs_down_write(&sbi->sb_lock); - if (raw_super->s_stop_reason[reason] < GENMASK(BITS_PER_BYTE - 1, 0)) - raw_super->s_stop_reason[reason]++; + spin_lock_irqsave(&sbi->error_lock, flags); + if (sbi->error_dirty) { + memcpy(F2FS_RAW_SUPER(sbi)->s_errors, sbi->errors, + MAX_F2FS_ERRORS); + sbi->error_dirty = false; + } + memcpy(raw_super->s_stop_reason, sbi->stop_reason, MAX_STOP_REASON); + spin_unlock_irqrestore(&sbi->error_lock, flags); err = f2fs_commit_super(sbi, false); - if (err) - f2fs_err(sbi, "f2fs_commit_super fails to record reason:%u err:%d", - reason, err); + f2fs_up_write(&sbi->sb_lock); + if (err) + f2fs_err(sbi, "f2fs_commit_super fails to record err:%d", err); } void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag) { - spin_lock(&sbi->error_lock); + unsigned long flags; + + spin_lock_irqsave(&sbi->error_lock, flags); if (!test_bit(flag, (unsigned long *)sbi->errors)) { set_bit(flag, (unsigned long *)sbi->errors); sbi->error_dirty = true; } - spin_unlock(&sbi->error_lock); + spin_unlock_irqrestore(&sbi->error_lock, flags); } static bool f2fs_update_errors(struct f2fs_sb_info *sbi) { + unsigned long flags; bool need_update = false; - spin_lock(&sbi->error_lock); + spin_lock_irqsave(&sbi->error_lock, flags); if (sbi->error_dirty) { memcpy(F2FS_RAW_SUPER(sbi)->s_errors, sbi->errors, MAX_F2FS_ERRORS); sbi->error_dirty = false; need_update = true; } - spin_unlock(&sbi->error_lock); + spin_unlock_irqrestore(&sbi->error_lock, flags); return need_update; } -void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error) +static void f2fs_record_errors(struct f2fs_sb_info *sbi, unsigned char error) { int err; - f2fs_save_errors(sbi, error); - f2fs_down_write(&sbi->sb_lock); if (!f2fs_update_errors(sbi)) @@ -3988,11 +4083,89 @@ out_unlock: f2fs_up_write(&sbi->sb_lock); } +void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error) +{ + f2fs_save_errors(sbi, error); + f2fs_record_errors(sbi, error); +} + +void f2fs_handle_error_async(struct f2fs_sb_info *sbi, unsigned char error) +{ + f2fs_save_errors(sbi, error); + + if (!sbi->error_dirty) + return; + if (!test_bit(error, (unsigned long *)sbi->errors)) + return; + schedule_work(&sbi->s_error_work); +} + +static bool system_going_down(void) +{ + return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF + || system_state == SYSTEM_RESTART; +} + +void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason, + bool irq_context) +{ + struct super_block *sb = sbi->sb; + bool shutdown = reason == STOP_CP_REASON_SHUTDOWN; + bool continue_fs = !shutdown && + F2FS_OPTION(sbi).errors == MOUNT_ERRORS_CONTINUE; + + set_ckpt_flags(sbi, CP_ERROR_FLAG); + + if (!f2fs_hw_is_readonly(sbi)) { + save_stop_reason(sbi, reason); + + if (irq_context && !shutdown) + schedule_work(&sbi->s_error_work); + else + f2fs_record_stop_reason(sbi); + } + + /* + * We force ERRORS_RO behavior when system is rebooting. Otherwise we + * could panic during 'reboot -f' as the underlying device got already + * disabled. + */ + if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_PANIC && + !shutdown && !system_going_down() && + !is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) + panic("F2FS-fs (device %s): panic forced after error\n", + sb->s_id); + + if (shutdown) + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); + + /* continue filesystem operators if errors=continue */ + if (continue_fs || f2fs_readonly(sb)) + return; + + f2fs_warn(sbi, "Remounting filesystem read-only"); + /* + * Make sure updated value of ->s_mount_flags will be visible before + * ->s_flags update + */ + smp_wmb(); + sb->s_flags |= SB_RDONLY; +} + +static void f2fs_record_error_work(struct work_struct *work) +{ + struct f2fs_sb_info *sbi = container_of(work, + struct f2fs_sb_info, s_error_work); + + f2fs_record_stop_reason(sbi); +} + static int f2fs_scan_devices(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); unsigned int max_devices = MAX_DEVICES; unsigned int logical_blksize; + blk_mode_t mode = sb_open_mode(sbi->sb->s_flags); int i; /* Initialize single device information */ @@ -4024,8 +4197,8 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) if (max_devices == 1) { /* Single zoned block device mount */ FDEV(0).bdev = - blkdev_get_by_dev(sbi->sb->s_bdev->bd_dev, - sbi->sb->s_mode, sbi->sb->s_type); + blkdev_get_by_dev(sbi->sb->s_bdev->bd_dev, mode, + sbi->sb->s_type, NULL); } else { /* Multi-device mount */ memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); @@ -4043,8 +4216,9 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) (FDEV(i).total_segments << sbi->log_blocks_per_seg) - 1; } - FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, - sbi->sb->s_mode, sbi->sb->s_type); + FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, mode, + sbi->sb->s_type, + NULL); } if (IS_ERR(FDEV(i).bdev)) return PTR_ERR(FDEV(i).bdev); @@ -4218,14 +4392,16 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; + INIT_WORK(&sbi->s_error_work, f2fs_record_error_work); memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS); + memcpy(sbi->stop_reason, raw_super->s_stop_reason, MAX_STOP_REASON); /* precompute checksum seed for metadata */ if (f2fs_sb_has_inode_chksum(sbi)) sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, sizeof(raw_super->uuid)); - default_options(sbi); + default_options(sbi, false); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); if (data && !options) { @@ -4615,6 +4791,8 @@ free_sm: f2fs_destroy_segment_manager(sbi); stop_ckpt_thread: f2fs_stop_ckpt_thread(sbi); + /* flush s_error_work before sbi destroy */ + flush_work(&sbi->s_error_work); f2fs_destroy_post_read_wq(sbi); free_devices: destroy_device_list(sbi); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 8ea05340bad9..48b7e0073884 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -842,68 +842,160 @@ static struct f2fs_attr f2fs_attr_##_name = { \ #define F2FS_GENERAL_RO_ATTR(name) \ static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) -#define F2FS_STAT_ATTR(_struct_type, _struct_name, _name, _elname) \ -static struct f2fs_attr f2fs_attr_##_name = { \ - .attr = {.name = __stringify(_name), .mode = 0444 }, \ - .show = f2fs_sbi_show, \ - .struct_type = _struct_type, \ - .offset = offsetof(struct _struct_name, _elname), \ -} +#ifdef CONFIG_F2FS_STAT_FS +#define STAT_INFO_RO_ATTR(name, elname) \ + F2FS_RO_ATTR(STAT_INFO, f2fs_stat_info, name, elname) +#endif -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent_sleep_time, - urgent_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle, gc_mode); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent, gc_mode); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); -F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); -F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_request, max_discard_request); -F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_issue_time); -F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_issue_time); -F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time); -F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_io_aware_gran, discard_io_aware_gran); -F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_urgent_util, discard_urgent_util); -F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); -F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_ordered_discard, max_ordered_discard); -F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_seq_blocks, min_seq_blocks); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ssr_sections, min_ssr_sections); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, max_roll_forward_node_blocks, max_rf_node_blocks); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, migration_granularity, migration_granularity); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, discard_idle_interval, - interval_time[DISCARD_TIME]); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle_interval, interval_time[GC_TIME]); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, - umount_discard_timeout, interval_time[UMOUNT_DISCARD_TIMEOUT]); -#ifdef CONFIG_F2FS_IOSTAT -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_period_ms, iostat_period_ms); +#define GC_THREAD_RW_ATTR(name, elname) \ + F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, name, elname) + +#define SM_INFO_RW_ATTR(name, elname) \ + F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, name, elname) + +#define SM_INFO_GENERAL_RW_ATTR(elname) \ + SM_INFO_RW_ATTR(elname, elname) + +#define DCC_INFO_RW_ATTR(name, elname) \ + F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, name, elname) + +#define DCC_INFO_GENERAL_RW_ATTR(elname) \ + DCC_INFO_RW_ATTR(elname, elname) + +#define NM_INFO_RW_ATTR(name, elname) \ + F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, name, elname) + +#define NM_INFO_GENERAL_RW_ATTR(elname) \ + NM_INFO_RW_ATTR(elname, elname) + +#define F2FS_SBI_RW_ATTR(name, elname) \ + F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, name, elname) + +#define F2FS_SBI_GENERAL_RW_ATTR(elname) \ + F2FS_SBI_RW_ATTR(elname, elname) + +#define F2FS_SBI_GENERAL_RO_ATTR(elname) \ + F2FS_RO_ATTR(F2FS_SBI, f2fs_sb_info, elname, elname) + +#ifdef CONFIG_F2FS_FAULT_INJECTION +#define FAULT_INFO_GENERAL_RW_ATTR(type, elname) \ + F2FS_RW_ATTR(type, f2fs_fault_info, elname, elname) #endif -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_io_bytes, max_io_bytes); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold); + +#define RESERVED_BLOCKS_GENERAL_RW_ATTR(elname) \ + F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, elname, elname) + +#define CPRC_INFO_GENERAL_RW_ATTR(elname) \ + F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, elname, elname) + +#define ATGC_INFO_RW_ATTR(name, elname) \ + F2FS_RW_ATTR(ATGC_INFO, atgc_management, name, elname) + +/* GC_THREAD ATTR */ +GC_THREAD_RW_ATTR(gc_urgent_sleep_time, urgent_sleep_time); +GC_THREAD_RW_ATTR(gc_min_sleep_time, min_sleep_time); +GC_THREAD_RW_ATTR(gc_max_sleep_time, max_sleep_time); +GC_THREAD_RW_ATTR(gc_no_gc_sleep_time, no_gc_sleep_time); + +/* SM_INFO ATTR */ +SM_INFO_RW_ATTR(reclaim_segments, rec_prefree_segments); +SM_INFO_GENERAL_RW_ATTR(ipu_policy); +SM_INFO_GENERAL_RW_ATTR(min_ipu_util); +SM_INFO_GENERAL_RW_ATTR(min_fsync_blocks); +SM_INFO_GENERAL_RW_ATTR(min_seq_blocks); +SM_INFO_GENERAL_RW_ATTR(min_hot_blocks); +SM_INFO_GENERAL_RW_ATTR(min_ssr_sections); + +/* DCC_INFO ATTR */ +DCC_INFO_RW_ATTR(max_small_discards, max_discards); +DCC_INFO_GENERAL_RW_ATTR(max_discard_request); +DCC_INFO_GENERAL_RW_ATTR(min_discard_issue_time); +DCC_INFO_GENERAL_RW_ATTR(mid_discard_issue_time); +DCC_INFO_GENERAL_RW_ATTR(max_discard_issue_time); +DCC_INFO_GENERAL_RW_ATTR(discard_io_aware_gran); +DCC_INFO_GENERAL_RW_ATTR(discard_urgent_util); +DCC_INFO_GENERAL_RW_ATTR(discard_granularity); +DCC_INFO_GENERAL_RW_ATTR(max_ordered_discard); + +/* NM_INFO ATTR */ +NM_INFO_RW_ATTR(max_roll_forward_node_blocks, max_rf_node_blocks); +NM_INFO_GENERAL_RW_ATTR(ram_thresh); +NM_INFO_GENERAL_RW_ATTR(ra_nid_pages); +NM_INFO_GENERAL_RW_ATTR(dirty_nats_ratio); + +/* F2FS_SBI ATTR */ F2FS_RW_ATTR(F2FS_SBI, f2fs_super_block, extension_list, extension_list); +F2FS_SBI_RW_ATTR(gc_idle, gc_mode); +F2FS_SBI_RW_ATTR(gc_urgent, gc_mode); +F2FS_SBI_RW_ATTR(cp_interval, interval_time[CP_TIME]); +F2FS_SBI_RW_ATTR(idle_interval, interval_time[REQ_TIME]); +F2FS_SBI_RW_ATTR(discard_idle_interval, interval_time[DISCARD_TIME]); +F2FS_SBI_RW_ATTR(gc_idle_interval, interval_time[GC_TIME]); +F2FS_SBI_RW_ATTR(umount_discard_timeout, interval_time[UMOUNT_DISCARD_TIMEOUT]); +F2FS_SBI_RW_ATTR(gc_pin_file_thresh, gc_pin_file_threshold); +F2FS_SBI_RW_ATTR(gc_reclaimed_segments, gc_reclaimed_segs); +F2FS_SBI_GENERAL_RW_ATTR(max_victim_search); +F2FS_SBI_GENERAL_RW_ATTR(migration_granularity); +F2FS_SBI_GENERAL_RW_ATTR(dir_level); +#ifdef CONFIG_F2FS_IOSTAT +F2FS_SBI_GENERAL_RW_ATTR(iostat_enable); +F2FS_SBI_GENERAL_RW_ATTR(iostat_period_ms); +#endif +F2FS_SBI_GENERAL_RW_ATTR(readdir_ra); +F2FS_SBI_GENERAL_RW_ATTR(max_io_bytes); +F2FS_SBI_GENERAL_RW_ATTR(data_io_flag); +F2FS_SBI_GENERAL_RW_ATTR(node_io_flag); +F2FS_SBI_GENERAL_RW_ATTR(gc_remaining_trials); +F2FS_SBI_GENERAL_RW_ATTR(seq_file_ra_mul); +F2FS_SBI_GENERAL_RW_ATTR(gc_segment_mode); +F2FS_SBI_GENERAL_RW_ATTR(max_fragment_chunk); +F2FS_SBI_GENERAL_RW_ATTR(max_fragment_hole); +#ifdef CONFIG_F2FS_FS_COMPRESSION +F2FS_SBI_GENERAL_RW_ATTR(compr_written_block); +F2FS_SBI_GENERAL_RW_ATTR(compr_saved_block); +F2FS_SBI_GENERAL_RW_ATTR(compr_new_inode); +F2FS_SBI_GENERAL_RW_ATTR(compress_percent); +F2FS_SBI_GENERAL_RW_ATTR(compress_watermark); +#endif +/* atomic write */ +F2FS_SBI_GENERAL_RO_ATTR(current_atomic_write); +F2FS_SBI_GENERAL_RW_ATTR(peak_atomic_write); +F2FS_SBI_GENERAL_RW_ATTR(committed_atomic_block); +F2FS_SBI_GENERAL_RW_ATTR(revoked_atomic_block); +/* block age extent cache */ +F2FS_SBI_GENERAL_RW_ATTR(hot_data_age_threshold); +F2FS_SBI_GENERAL_RW_ATTR(warm_data_age_threshold); +F2FS_SBI_GENERAL_RW_ATTR(last_age_weight); +#ifdef CONFIG_BLK_DEV_ZONED +F2FS_SBI_GENERAL_RO_ATTR(unusable_blocks_per_sec); +#endif + +/* STAT_INFO ATTR */ +#ifdef CONFIG_F2FS_STAT_FS +STAT_INFO_RO_ATTR(cp_foreground_calls, cp_count); +STAT_INFO_RO_ATTR(cp_background_calls, bg_cp_count); +STAT_INFO_RO_ATTR(gc_foreground_calls, call_count); +STAT_INFO_RO_ATTR(gc_background_calls, bg_gc); +#endif + +/* FAULT_INFO ATTR */ #ifdef CONFIG_F2FS_FAULT_INJECTION -F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); -F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); +FAULT_INFO_GENERAL_RW_ATTR(FAULT_INFO_RATE, inject_rate); +FAULT_INFO_GENERAL_RW_ATTR(FAULT_INFO_TYPE, inject_type); #endif -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_remaining_trials, gc_remaining_trials); -F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio); + +/* RESERVED_BLOCKS ATTR */ +RESERVED_BLOCKS_GENERAL_RW_ATTR(reserved_blocks); + +/* CPRC_INFO ATTR */ +CPRC_INFO_GENERAL_RW_ATTR(ckpt_thread_ioprio); + +/* ATGC_INFO ATTR */ +ATGC_INFO_RW_ATTR(atgc_candidate_ratio, candidate_ratio); +ATGC_INFO_RW_ATTR(atgc_candidate_count, max_candidate_count); +ATGC_INFO_RW_ATTR(atgc_age_weight, age_weight); +ATGC_INFO_RW_ATTR(atgc_age_threshold, age_threshold); + F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(free_segments); F2FS_GENERAL_RO_ATTR(ovp_segments); @@ -917,10 +1009,6 @@ F2FS_GENERAL_RO_ATTR(main_blkaddr); F2FS_GENERAL_RO_ATTR(pending_discard); F2FS_GENERAL_RO_ATTR(gc_mode); #ifdef CONFIG_F2FS_STAT_FS -F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count); -F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count); -F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, gc_foreground_calls, call_count); -F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, gc_background_calls, bg_gc); F2FS_GENERAL_RO_ATTR(moved_blocks_background); F2FS_GENERAL_RO_ATTR(moved_blocks_foreground); F2FS_GENERAL_RO_ATTR(avg_vblocks); @@ -935,8 +1023,6 @@ F2FS_FEATURE_RO_ATTR(encrypted_casefold); #endif /* CONFIG_FS_ENCRYPTION */ #ifdef CONFIG_BLK_DEV_ZONED F2FS_FEATURE_RO_ATTR(block_zoned); -F2FS_RO_ATTR(F2FS_SBI, f2fs_sb_info, unusable_blocks_per_sec, - unusable_blocks_per_sec); #endif F2FS_FEATURE_RO_ATTR(atomic_write); F2FS_FEATURE_RO_ATTR(extra_attr); @@ -956,37 +1042,9 @@ F2FS_FEATURE_RO_ATTR(casefold); F2FS_FEATURE_RO_ATTR(readonly); #ifdef CONFIG_F2FS_FS_COMPRESSION F2FS_FEATURE_RO_ATTR(compression); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_written_block, compr_written_block); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_saved_block, compr_saved_block); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_new_inode, compr_new_inode); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compress_percent, compress_percent); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compress_watermark, compress_watermark); #endif F2FS_FEATURE_RO_ATTR(pin_file); -/* For ATGC */ -F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_candidate_ratio, candidate_ratio); -F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_candidate_count, max_candidate_count); -F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_weight, age_weight); -F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_threshold, age_threshold); - -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, seq_file_ra_mul, seq_file_ra_mul); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_segment_mode, gc_segment_mode); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_reclaimed_segments, gc_reclaimed_segs); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_chunk, max_fragment_chunk); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_hole, max_fragment_hole); - -/* For atomic write */ -F2FS_RO_ATTR(F2FS_SBI, f2fs_sb_info, current_atomic_write, current_atomic_write); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, peak_atomic_write, peak_atomic_write); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, committed_atomic_block, committed_atomic_block); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, revoked_atomic_block, revoked_atomic_block); - -/* For block age extent cache */ -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, hot_data_age_threshold, hot_data_age_threshold); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, warm_data_age_threshold, warm_data_age_threshold); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, last_age_weight, last_age_weight); - #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_urgent_sleep_time), @@ -1386,12 +1444,19 @@ int __init f2fs_init_sysfs(void) ret = kobject_init_and_add(&f2fs_feat, &f2fs_feat_ktype, NULL, "features"); - if (ret) { - kobject_put(&f2fs_feat); - kset_unregister(&f2fs_kset); - } else { - f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + if (ret) + goto put_kobject; + + f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + if (!f2fs_proc_root) { + ret = -ENOMEM; + goto put_kobject; } + + return 0; +put_kobject: + kobject_put(&f2fs_feat); + kset_unregister(&f2fs_kset); return ret; } @@ -1430,23 +1495,24 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) if (err) goto put_feature_list_kobj; - if (f2fs_proc_root) - sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); + sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); + if (!sbi->s_proc) { + err = -ENOMEM; + goto put_feature_list_kobj; + } - if (sbi->s_proc) { - proc_create_single_data("segment_info", 0444, sbi->s_proc, + proc_create_single_data("segment_info", 0444, sbi->s_proc, segment_info_seq_show, sb); - proc_create_single_data("segment_bits", 0444, sbi->s_proc, + proc_create_single_data("segment_bits", 0444, sbi->s_proc, segment_bits_seq_show, sb); #ifdef CONFIG_F2FS_IOSTAT - proc_create_single_data("iostat_info", 0444, sbi->s_proc, + proc_create_single_data("iostat_info", 0444, sbi->s_proc, iostat_info_seq_show, sb); #endif - proc_create_single_data("victim_bits", 0444, sbi->s_proc, + proc_create_single_data("victim_bits", 0444, sbi->s_proc, victim_bits_seq_show, sb); - proc_create_single_data("discard_plist_info", 0444, sbi->s_proc, + proc_create_single_data("discard_plist_info", 0444, sbi->s_proc, discard_plist_seq_show, sb); - } return 0; put_feature_list_kobj: kobject_put(&sbi->s_feature_list_kobj); @@ -1462,8 +1528,7 @@ put_sb_kobj: void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) { - if (sbi->s_proc) - remove_proc_subtree(sbi->sb->s_id, f2fs_proc_root); + remove_proc_subtree(sbi->sb->s_id, f2fs_proc_root); kobject_put(&sbi->s_stat_kobj); wait_for_completion(&sbi->s_stat_kobj_unregister); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 213805d3592c..476b186b90a6 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -528,10 +528,12 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (len > F2FS_NAME_LEN) return -ERANGE; - f2fs_down_read(&F2FS_I(inode)->i_xattr_sem); + if (!ipage) + f2fs_down_read(&F2FS_I(inode)->i_xattr_sem); error = lookup_all_xattrs(inode, ipage, index, len, name, &entry, &base_addr, &base_size, &is_inline); - f2fs_up_read(&F2FS_I(inode)->i_xattr_sem); + if (!ipage) + f2fs_up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 416d652774a3..b1811c392e6f 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -83,6 +83,7 @@ struct f2fs_xattr_entry { sizeof(struct f2fs_xattr_header) - \ sizeof(struct f2fs_xattr_entry)) +#define MIN_INLINE_XATTR_SIZE (sizeof(struct f2fs_xattr_header) / sizeof(__le32)) #define MAX_INLINE_XATTR_SIZE \ (DEF_ADDRS_PER_INODE - \ F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - \ diff --git a/fs/fat/file.c b/fs/fat/file.c index 795a4fad5c40..456477946dd9 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -209,7 +209,7 @@ const struct file_operations fat_file_operations = { .unlocked_ioctl = fat_generic_ioctl, .compat_ioctl = compat_ptr_ioctl, .fsync = fat_file_fsync, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .fallocate = fat_fallocate, }; diff --git a/fs/fhandle.c b/fs/fhandle.c index f2bc27d1975e..6ea8d35a9382 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -16,7 +16,7 @@ static long do_sys_name_to_handle(const struct path *path, struct file_handle __user *ufh, - int __user *mnt_id) + int __user *mnt_id, int fh_flags) { long retval; struct file_handle f_handle; @@ -24,11 +24,14 @@ static long do_sys_name_to_handle(const struct path *path, struct file_handle *handle = NULL; /* - * We need to make sure whether the file system - * support decoding of the file handle + * We need to make sure whether the file system support decoding of + * the file handle if decodeable file handle was requested. + * Otherwise, even empty export_operations are sufficient to opt-in + * to encoding FIDs. */ if (!path->dentry->d_sb->s_export_op || - !path->dentry->d_sb->s_export_op->fh_to_dentry) + (!(fh_flags & EXPORT_FH_FID) && + !path->dentry->d_sb->s_export_op->fh_to_dentry)) return -EOPNOTSUPP; if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) @@ -45,27 +48,28 @@ static long do_sys_name_to_handle(const struct path *path, /* convert handle size to multiple of sizeof(u32) */ handle_dwords = f_handle.handle_bytes >> 2; - /* we ask for a non connected handle */ + /* we ask for a non connectable maybe decodeable file handle */ retval = exportfs_encode_fh(path->dentry, (struct fid *)handle->f_handle, - &handle_dwords, 0); + &handle_dwords, fh_flags); handle->handle_type = retval; /* convert handle size to bytes */ handle_bytes = handle_dwords * sizeof(u32); handle->handle_bytes = handle_bytes; if ((handle->handle_bytes > f_handle.handle_bytes) || - (retval == FILEID_INVALID) || (retval == -ENOSPC)) { + (retval == FILEID_INVALID) || (retval < 0)) { /* As per old exportfs_encode_fh documentation * we could return ENOSPC to indicate overflow * But file system returned 255 always. So handle * both the values */ + if (retval == FILEID_INVALID || retval == -ENOSPC) + retval = -EOVERFLOW; /* * set the handle size to zero so we copy only * non variable part of the file_handle */ handle_bytes = 0; - retval = -EOVERFLOW; } else retval = 0; /* copy the mount id */ @@ -84,6 +88,7 @@ static long do_sys_name_to_handle(const struct path *path, * @handle: resulting file handle * @mnt_id: mount id of the file system containing the file * @flag: flag value to indicate whether to follow symlink or not + * and whether a decodable file handle is required. * * @handle->handle_size indicate the space available to store the * variable part of the file handle in bytes. If there is not @@ -96,17 +101,19 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, { struct path path; int lookup_flags; + int fh_flags; int err; - if ((flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) + if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID)) return -EINVAL; lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0; + fh_flags = (flag & AT_HANDLE_FID) ? EXPORT_FH_FID : 0; if (flag & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; err = user_path_at(dfd, name, lookup_flags, &path); if (!err) { - err = do_sys_name_to_handle(&path, handle, mnt_id); + err = do_sys_name_to_handle(&path, handle, mnt_id, fh_flags); path_put(&path); } return err; @@ -235,7 +242,6 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, retval = PTR_ERR(file); } else { retval = fd; - fsnotify_open(file); fd_install(fd, file); } path_put(&path); diff --git a/fs/file_table.c b/fs/file_table.c index 372653b92617..fc7d677ff5ad 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -44,18 +44,40 @@ static struct kmem_cache *filp_cachep __read_mostly; static struct percpu_counter nr_files __cacheline_aligned_in_smp; +/* Container for backing file with optional real path */ +struct backing_file { + struct file file; + struct path real_path; +}; + +static inline struct backing_file *backing_file(struct file *f) +{ + return container_of(f, struct backing_file, file); +} + +struct path *backing_file_real_path(struct file *f) +{ + return &backing_file(f)->real_path; +} +EXPORT_SYMBOL_GPL(backing_file_real_path); + static void file_free_rcu(struct rcu_head *head) { struct file *f = container_of(head, struct file, f_rcuhead); put_cred(f->f_cred); - kmem_cache_free(filp_cachep, f); + if (unlikely(f->f_mode & FMODE_BACKING)) + kfree(backing_file(f)); + else + kmem_cache_free(filp_cachep, f); } static inline void file_free(struct file *f) { security_file_free(f); - if (!(f->f_mode & FMODE_NOACCOUNT)) + if (unlikely(f->f_mode & FMODE_BACKING)) + path_put(backing_file_real_path(f)); + if (likely(!(f->f_mode & FMODE_NOACCOUNT))) percpu_counter_dec(&nr_files); call_rcu(&f->f_rcuhead, file_free_rcu); } @@ -131,20 +153,15 @@ static int __init init_fs_stat_sysctls(void) fs_initcall(init_fs_stat_sysctls); #endif -static struct file *__alloc_file(int flags, const struct cred *cred) +static int init_file(struct file *f, int flags, const struct cred *cred) { - struct file *f; int error; - f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); - if (unlikely(!f)) - return ERR_PTR(-ENOMEM); - f->f_cred = get_cred(cred); error = security_file_alloc(f); if (unlikely(error)) { - file_free_rcu(&f->f_rcuhead); - return ERR_PTR(error); + put_cred(f->f_cred); + return error; } atomic_long_set(&f->f_count, 1); @@ -155,7 +172,7 @@ static struct file *__alloc_file(int flags, const struct cred *cred) f->f_mode = OPEN_FMODE(flags); /* f->f_version: 0 */ - return f; + return 0; } /* Find an unused file structure and return a pointer to it. @@ -172,6 +189,7 @@ struct file *alloc_empty_file(int flags, const struct cred *cred) { static long old_max; struct file *f; + int error; /* * Privileged users can go above max_files @@ -185,9 +203,17 @@ struct file *alloc_empty_file(int flags, const struct cred *cred) goto over; } - f = __alloc_file(flags, cred); - if (!IS_ERR(f)) - percpu_counter_inc(&nr_files); + f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); + if (unlikely(!f)) + return ERR_PTR(-ENOMEM); + + error = init_file(f, flags, cred); + if (unlikely(error)) { + kmem_cache_free(filp_cachep, f); + return ERR_PTR(error); + } + + percpu_counter_inc(&nr_files); return f; @@ -203,18 +229,55 @@ over: /* * Variant of alloc_empty_file() that doesn't check and modify nr_files. * - * Should not be used unless there's a very good reason to do so. + * This is only for kernel internal use, and the allocate file must not be + * installed into file tables or such. */ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred) { - struct file *f = __alloc_file(flags, cred); + struct file *f; + int error; + + f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); + if (unlikely(!f)) + return ERR_PTR(-ENOMEM); - if (!IS_ERR(f)) - f->f_mode |= FMODE_NOACCOUNT; + error = init_file(f, flags, cred); + if (unlikely(error)) { + kmem_cache_free(filp_cachep, f); + return ERR_PTR(error); + } + + f->f_mode |= FMODE_NOACCOUNT; return f; } +/* + * Variant of alloc_empty_file() that allocates a backing_file container + * and doesn't check and modify nr_files. + * + * This is only for kernel internal use, and the allocate file must not be + * installed into file tables or such. + */ +struct file *alloc_empty_backing_file(int flags, const struct cred *cred) +{ + struct backing_file *ff; + int error; + + ff = kzalloc(sizeof(struct backing_file), GFP_KERNEL); + if (unlikely(!ff)) + return ERR_PTR(-ENOMEM); + + error = init_file(&ff->file, flags, cred); + if (unlikely(error)) { + kfree(ff); + return ERR_PTR(error); + } + + ff->file.f_mode |= FMODE_BACKING | FMODE_NOACCOUNT; + return &ff->file; +} + /** * alloc_file - allocate and initialize a 'struct file' * diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index ae4e51e91ee3..aca4b4811394 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2024,7 +2024,6 @@ static long wb_writeback(struct bdi_writeback *wb, struct blk_plug plug; blk_start_plug(&plug); - spin_lock(&wb->list_lock); for (;;) { /* * Stop writeback when nr_pages has been consumed @@ -2049,6 +2048,9 @@ static long wb_writeback(struct bdi_writeback *wb, if (work->for_background && !wb_over_bg_thresh(wb)) break; + + spin_lock(&wb->list_lock); + /* * Kupdate and background works are special and we want to * include all inodes that need writing. Livelock avoidance is @@ -2078,13 +2080,19 @@ static long wb_writeback(struct bdi_writeback *wb, * mean the overall work is done. So we keep looping as long * as made some progress on cleaning pages or inodes. */ - if (progress) + if (progress) { + spin_unlock(&wb->list_lock); continue; + } + /* * No more inodes for IO, bail */ - if (list_empty(&wb->b_more_io)) + if (list_empty(&wb->b_more_io)) { + spin_unlock(&wb->list_lock); break; + } + /* * Nothing written. Wait for some inode to * become available for writeback. Otherwise @@ -2096,9 +2104,7 @@ static long wb_writeback(struct bdi_writeback *wb, spin_unlock(&wb->list_lock); /* This function drops i_lock... */ inode_sleep_on_writeback(inode); - spin_lock(&wb->list_lock); } - spin_unlock(&wb->list_lock); blk_finish_plug(&plug); return nr_pages - work->nr_pages; diff --git a/fs/fs_context.c b/fs/fs_context.c index 24ce12f0db32..851214d1d013 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -561,7 +561,8 @@ static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param) return -ENOMEM; } - ctx->legacy_data[size++] = ','; + if (size) + ctx->legacy_data[size++] = ','; len = strlen(param->key); memcpy(ctx->legacy_data + size, param->key, len); size += len; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 89d97f6188e0..bc4115288eec 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1280,13 +1280,13 @@ static inline unsigned int fuse_wr_pages(loff_t pos, size_t len, max_pages); } -static ssize_t fuse_perform_write(struct kiocb *iocb, - struct address_space *mapping, - struct iov_iter *ii, loff_t pos) +static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii) { + struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); + loff_t pos = iocb->ki_pos; int err = 0; ssize_t res = 0; @@ -1329,7 +1329,10 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, fuse_write_update_attr(inode, pos, res); clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); - return res > 0 ? res : err; + if (!res) + return err; + iocb->ki_pos += res; + return res; } static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) @@ -1337,11 +1340,9 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; ssize_t written = 0; - ssize_t written_buffered = 0; struct inode *inode = mapping->host; ssize_t err; struct fuse_conn *fc = get_fuse_conn(inode); - loff_t endbyte = 0; if (fc->writeback_cache) { /* Update size (EOF optimization) and mode (SUID clearing) */ @@ -1362,9 +1363,6 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) writethrough: inode_lock(inode); - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(inode); - err = generic_write_checks(iocb, from); if (err <= 0) goto out; @@ -1378,38 +1376,15 @@ writethrough: goto out; if (iocb->ki_flags & IOCB_DIRECT) { - loff_t pos = iocb->ki_pos; written = generic_file_direct_write(iocb, from); if (written < 0 || !iov_iter_count(from)) goto out; - - pos += written; - - written_buffered = fuse_perform_write(iocb, mapping, from, pos); - if (written_buffered < 0) { - err = written_buffered; - goto out; - } - endbyte = pos + written_buffered - 1; - - err = filemap_write_and_wait_range(file->f_mapping, pos, - endbyte); - if (err) - goto out; - - invalidate_mapping_pages(file->f_mapping, - pos >> PAGE_SHIFT, - endbyte >> PAGE_SHIFT); - - written += written_buffered; - iocb->ki_pos = pos + written_buffered; + written = direct_write_fallback(iocb, from, written, + fuse_perform_write(iocb, from)); } else { - written = fuse_perform_write(iocb, mapping, from, iocb->ki_pos); - if (written >= 0) - iocb->ki_pos += written; + written = fuse_perform_write(iocb, from); } out: - current->backing_dev_info = NULL; inode_unlock(inode); if (written > 0) written = generic_write_sync(iocb, written); @@ -3252,7 +3227,7 @@ static const struct file_operations fuse_file_operations = { .lock = fuse_file_lock, .get_unmapped_area = thp_get_unmapped_area, .flock = fuse_file_flock, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .unlocked_ioctl = fuse_file_ioctl, .compat_ioctl = fuse_file_compat_ioctl, diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 4d8d4f16c727..5f1be1da92ce 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -775,7 +775,8 @@ static int virtio_fs_zero_page_range(struct dax_device *dax_dev, rc = dax_direct_access(dax_dev, pgoff, nr_pages, DAX_ACCESS, &kaddr, NULL); if (rc < 0) - return rc; + return dax_mem2blk_err(rc); + memset(kaddr, 0, nr_pages << PAGE_SHIFT); dax_flush(dax_dev, kaddr, nr_pages << PAGE_SHIFT); return 0; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index a5f4be6b9213..ae49256b7c8c 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -38,13 +38,13 @@ void gfs2_trans_add_databufs(struct gfs2_inode *ip, struct folio *folio, - unsigned int from, unsigned int len) + size_t from, size_t len) { struct buffer_head *head = folio_buffers(folio); unsigned int bsize = head->b_size; struct buffer_head *bh; - unsigned int to = from + len; - unsigned int start, end; + size_t to = from + len; + size_t start, end; for (bh = head, start = 0; bh != head || !start; bh = bh->b_this_page, start = end) { @@ -82,61 +82,61 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock, } /** - * gfs2_write_jdata_page - gfs2 jdata-specific version of block_write_full_page - * @page: The page to write + * gfs2_write_jdata_folio - gfs2 jdata-specific version of block_write_full_page + * @folio: The folio to write * @wbc: The writeback control * * This is the same as calling block_write_full_page, but it also * writes pages outside of i_size */ -static int gfs2_write_jdata_page(struct page *page, +static int gfs2_write_jdata_folio(struct folio *folio, struct writeback_control *wbc) { - struct inode * const inode = page->mapping->host; + struct inode * const inode = folio->mapping->host; loff_t i_size = i_size_read(inode); - const pgoff_t end_index = i_size >> PAGE_SHIFT; - unsigned offset; /* - * The page straddles i_size. It must be zeroed out on each and every + * The folio straddles i_size. It must be zeroed out on each and every * writepage invocation because it may be mmapped. "A file is mapped * in multiples of the page size. For a file that is not a multiple of - * the page size, the remaining memory is zeroed when mapped, and + * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." */ - offset = i_size & (PAGE_SIZE - 1); - if (page->index == end_index && offset) - zero_user_segment(page, offset, PAGE_SIZE); + if (folio_pos(folio) < i_size && + i_size < folio_pos(folio) + folio_size(folio)) + folio_zero_segment(folio, offset_in_folio(folio, i_size), + folio_size(folio)); - return __block_write_full_page(inode, page, gfs2_get_block_noalloc, wbc, - end_buffer_async_write); + return __block_write_full_folio(inode, folio, gfs2_get_block_noalloc, + wbc, end_buffer_async_write); } /** - * __gfs2_jdata_writepage - The core of jdata writepage - * @page: The page to write + * __gfs2_jdata_write_folio - The core of jdata writepage + * @folio: The folio to write * @wbc: The writeback control * * This is shared between writepage and writepages and implements the * core of the writepage operation. If a transaction is required then - * PageChecked will have been set and the transaction will have + * the checked flag will have been set and the transaction will have * already been started before this is called. */ - -static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc) +static int __gfs2_jdata_write_folio(struct folio *folio, + struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); - if (PageChecked(page)) { - ClearPageChecked(page); - if (!page_has_buffers(page)) { - create_empty_buffers(page, inode->i_sb->s_blocksize, - BIT(BH_Dirty)|BIT(BH_Uptodate)); + if (folio_test_checked(folio)) { + folio_clear_checked(folio); + if (!folio_buffers(folio)) { + folio_create_empty_buffers(folio, + inode->i_sb->s_blocksize, + BIT(BH_Dirty)|BIT(BH_Uptodate)); } - gfs2_trans_add_databufs(ip, page_folio(page), 0, PAGE_SIZE); + gfs2_trans_add_databufs(ip, folio, 0, folio_size(folio)); } - return gfs2_write_jdata_page(page, wbc); + return gfs2_write_jdata_folio(folio, wbc); } /** @@ -150,20 +150,21 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc) { + struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) goto out; - if (PageChecked(page) || current->journal_info) + if (folio_test_checked(folio) || current->journal_info) goto out_ignore; - return __gfs2_jdata_writepage(page, wbc); + return __gfs2_jdata_write_folio(folio, wbc); out_ignore: - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); out: - unlock_page(page); + folio_unlock(folio); return 0; } @@ -255,7 +256,7 @@ continue_unlock: trace_wbc_writepage(wbc, inode_to_bdi(inode)); - ret = __gfs2_jdata_writepage(&folio->page, wbc); + ret = __gfs2_jdata_write_folio(folio, wbc); if (unlikely(ret)) { if (ret == AOP_WRITEPAGE_ACTIVATE) { folio_unlock(folio); @@ -431,10 +432,10 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) if (error) return error; - kaddr = kmap_atomic(page); + kaddr = kmap_local_page(page); memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); memset(kaddr + dsize, 0, PAGE_SIZE - dsize); - kunmap_atomic(kaddr); + kunmap_local(kaddr); flush_dcache_page(page); brelse(dibh); SetPageUptodate(page); @@ -488,18 +489,18 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, unsigned copied = 0; unsigned amt; struct page *page; - void *p; do { + page = read_cache_page(mapping, index, gfs2_read_folio, NULL); + if (IS_ERR(page)) { + if (PTR_ERR(page) == -EINTR) + continue; + return PTR_ERR(page); + } amt = size - copied; if (offset + size > PAGE_SIZE) amt = PAGE_SIZE - offset; - page = read_cache_page(mapping, index, gfs2_read_folio, NULL); - if (IS_ERR(page)) - return PTR_ERR(page); - p = kmap_atomic(page); - memcpy(buf + copied, p + offset, amt); - kunmap_atomic(p); + memcpy_from_page(buf + copied, page, offset, amt); put_page(page); copied += amt; index++; @@ -750,7 +751,6 @@ static const struct address_space_operations gfs2_aops = { .release_folio = iomap_release_folio, .invalidate_folio = iomap_invalidate_folio, .bmap = gfs2_bmap, - .direct_IO = noop_direct_IO, .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_page = generic_error_remove_page, diff --git a/fs/gfs2/aops.h b/fs/gfs2/aops.h index 09db1914425e..f08322ef41cf 100644 --- a/fs/gfs2/aops.h +++ b/fs/gfs2/aops.h @@ -10,6 +10,6 @@ extern void adjust_fs_space(struct inode *inode); extern void gfs2_trans_add_databufs(struct gfs2_inode *ip, struct folio *folio, - unsigned int from, unsigned int len); + size_t from, size_t len); #endif /* __AOPS_DOT_H__ */ diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index c739b258a2d9..8d611fbcf0bd 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1729,8 +1729,8 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length) if (offset >= maxsize) { /* - * The starting point lies beyond the allocated meta-data; - * there are no blocks do deallocate. + * The starting point lies beyond the allocated metadata; + * there are no blocks to deallocate. */ return 0; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index cb62c8f07d1e..1bf3c4453516 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -630,6 +630,9 @@ int gfs2_open_common(struct inode *inode, struct file *file) ret = generic_file_open(inode, file); if (ret) return ret; + + if (!gfs2_is_jdata(GFS2_I(inode))) + file->f_mode |= FMODE_CAN_ODIRECT; } fp = kzalloc(sizeof(struct gfs2_file), GFP_NOFS); @@ -1030,8 +1033,8 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, } gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh); -retry: if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { +retry: window_size -= fault_in_iov_iter_readable(from, window_size); if (!window_size) { ret = -EFAULT; @@ -1052,15 +1055,11 @@ retry: goto out_unlock; } - current->backing_dev_info = inode_to_bdi(inode); pagefault_disable(); ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); pagefault_enable(); - current->backing_dev_info = NULL; - if (ret > 0) { - iocb->ki_pos += ret; + if (ret > 0) written += ret; - } if (inode == sdp->sd_rindex) gfs2_glock_dq_uninit(statfs_gh); @@ -1579,7 +1578,7 @@ const struct file_operations gfs2_file_fops = { .fsync = gfs2_fsync, .lock = gfs2_lock, .flock = gfs2_flock, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, .splice_write = gfs2_file_splice_write, .setlease = simple_nosetlease, .fallocate = gfs2_fallocate, @@ -1610,7 +1609,7 @@ const struct file_operations gfs2_file_fops_nolock = { .open = gfs2_open, .release = gfs2_release, .fsync = gfs2_fsync, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, .splice_write = gfs2_file_splice_write, .setlease = generic_setlease, .fallocate = gfs2_fallocate, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 5adc7d85dbf3..1438e7465e30 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -145,8 +145,8 @@ static void gfs2_glock_dealloc(struct rcu_head *rcu) * * We need to allow some glocks to be enqueued, dequeued, promoted, and demoted * when we're withdrawn. For example, to maintain metadata integrity, we should - * disallow the use of inode and rgrp glocks when withdrawn. Other glocks, like - * iopen or the transaction glocks may be safely used because none of their + * disallow the use of inode and rgrp glocks when withdrawn. Other glocks like + * the iopen or freeze glock may be safely used because none of their * metadata goes through the journal. So in general, we should disallow all * glocks that are journaled, and allow all the others. One exception is: * we need to allow our active journal to be promoted and demoted so others diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 01d433ed6ce7..54319328b16b 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -236,7 +236,7 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags) truncate_inode_pages_range(mapping, start, end); } -static void gfs2_rgrp_go_dump(struct seq_file *seq, struct gfs2_glock *gl, +static void gfs2_rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl, const char *fs_id_buf) { struct gfs2_rgrpd *rgd = gl->gl_object; @@ -536,72 +536,53 @@ static int inode_go_held(struct gfs2_holder *gh) * */ -static void inode_go_dump(struct seq_file *seq, struct gfs2_glock *gl, +static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl, const char *fs_id_buf) { struct gfs2_inode *ip = gl->gl_object; - struct inode *inode; - unsigned long nrpages; + const struct inode *inode = &ip->i_inode; if (ip == NULL) return; - inode = &ip->i_inode; - xa_lock_irq(&inode->i_data.i_pages); - nrpages = inode->i_data.nrpages; - xa_unlock_irq(&inode->i_data.i_pages); - gfs2_print_dbg(seq, "%s I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu " "p:%lu\n", fs_id_buf, (unsigned long long)ip->i_no_formal_ino, (unsigned long long)ip->i_no_addr, - IF2DT(ip->i_inode.i_mode), ip->i_flags, + IF2DT(inode->i_mode), ip->i_flags, (unsigned int)ip->i_diskflags, - (unsigned long long)i_size_read(inode), nrpages); + (unsigned long long)i_size_read(inode), + inode->i_data.nrpages); } /** - * freeze_go_sync - promote/demote the freeze glock + * freeze_go_callback - A cluster node is requesting a freeze * @gl: the glock + * @remote: true if this came from a different cluster node */ -static int freeze_go_sync(struct gfs2_glock *gl) +static void freeze_go_callback(struct gfs2_glock *gl, bool remote) { - int error = 0; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct super_block *sb = sdp->sd_vfs; + + if (!remote || + gl->gl_state != LM_ST_SHARED || + gl->gl_demote_state != LM_ST_UNLOCKED) + return; /* - * We need to check gl_state == LM_ST_SHARED here and not gl_req == - * LM_ST_EXCLUSIVE. That's because when any node does a freeze, - * all the nodes should have the freeze glock in SH mode and they all - * call do_xmote: One for EX and the others for UN. They ALL must - * freeze locally, and they ALL must queue freeze work. The freeze_work - * calls freeze_func, which tries to reacquire the freeze glock in SH, - * effectively waiting for the thaw on the node who holds it in EX. - * Once thawed, the work func acquires the freeze glock in - * SH and everybody goes back to thawed. + * Try to get an active super block reference to prevent racing with + * unmount (see trylock_super()). But note that unmount isn't the only + * place where a write lock on s_umount is taken, and we can fail here + * because of things like remount as well. */ - if (gl->gl_state == LM_ST_SHARED && !gfs2_withdrawn(sdp) && - !test_bit(SDF_NORECOVERY, &sdp->sd_flags)) { - atomic_set(&sdp->sd_freeze_state, SFS_STARTING_FREEZE); - error = freeze_super(sdp->sd_vfs); - if (error) { - fs_info(sdp, "GFS2: couldn't freeze filesystem: %d\n", - error); - if (gfs2_withdrawn(sdp)) { - atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN); - return 0; - } - gfs2_assert_withdraw(sdp, 0); - } - queue_work(gfs2_freeze_wq, &sdp->sd_freeze_work); - if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) - gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE | - GFS2_LFC_FREEZE_GO_SYNC); - else /* read-only mounts */ - atomic_set(&sdp->sd_freeze_state, SFS_FROZEN); + if (down_read_trylock(&sb->s_umount)) { + atomic_inc(&sb->s_active); + up_read(&sb->s_umount); + if (!queue_work(gfs2_freeze_wq, &sdp->sd_freeze_work)) + deactivate_super(sb); } - return 0; } /** @@ -761,9 +742,9 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { }; const struct gfs2_glock_operations gfs2_freeze_glops = { - .go_sync = freeze_go_sync, .go_xmote_bh = freeze_go_xmote_bh, .go_demote_ok = freeze_go_demote_ok, + .go_callback = freeze_go_callback, .go_type = LM_TYPE_NONDISK, .go_flags = GLOF_NONDISK, }; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 79485329118b..04f2d78e8658 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -221,7 +221,7 @@ struct gfs2_glock_operations { int (*go_demote_ok) (const struct gfs2_glock *gl); int (*go_instantiate) (struct gfs2_glock *gl); int (*go_held)(struct gfs2_holder *gh); - void (*go_dump)(struct seq_file *seq, struct gfs2_glock *gl, + void (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl, const char *fs_id_buf); void (*go_callback)(struct gfs2_glock *gl, bool remote); void (*go_free)(struct gfs2_glock *gl); @@ -600,7 +600,7 @@ enum { SDF_RORECOVERY = 7, /* read only recovery */ SDF_SKIP_DLM_UNLOCK = 8, SDF_FORCE_AIL_FLUSH = 9, - SDF_FS_FROZEN = 10, + SDF_FREEZE_INITIATOR = 10, SDF_WITHDRAWING = 11, /* Will withdraw eventually */ SDF_WITHDRAW_IN_PROG = 12, /* Withdraw is in progress */ SDF_REMOTE_WITHDRAW = 13, /* Performing remote recovery */ @@ -608,12 +608,7 @@ enum { withdrawing */ SDF_DEACTIVATING = 15, SDF_EVICTING = 16, -}; - -enum gfs2_freeze_state { - SFS_UNFROZEN = 0, - SFS_STARTING_FREEZE = 1, - SFS_FROZEN = 2, + SDF_FROZEN = 17, }; #define GFS2_FSNAME_LEN 256 @@ -841,7 +836,6 @@ struct gfs2_sbd { /* For quiescing the filesystem */ struct gfs2_holder sd_freeze_gh; - atomic_t sd_freeze_state; struct mutex sd_freeze_mutex; char sd_fsname[GFS2_FSNAME_LEN + 3 * sizeof(int) + 2]; diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 71911bf9ab34..54911294687c 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -296,10 +296,8 @@ static void gdlm_put_lock(struct gfs2_glock *gl) struct lm_lockstruct *ls = &sdp->sd_lockstruct; int error; - if (gl->gl_lksb.sb_lkid == 0) { - gfs2_glock_free(gl); - return; - } + if (gl->gl_lksb.sb_lkid == 0) + goto out_free; clear_bit(GLF_BLOCKING, &gl->gl_flags); gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT); @@ -307,17 +305,13 @@ static void gdlm_put_lock(struct gfs2_glock *gl) gfs2_update_request_times(gl); /* don't want to call dlm if we've unmounted the lock protocol */ - if (test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) { - gfs2_glock_free(gl); - return; - } + if (test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) + goto out_free; /* don't want to skip dlm_unlock writing the lvb when lock has one */ if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) && - !gl->gl_lksb.sb_lvbptr) { - gfs2_glock_free(gl); - return; - } + !gl->gl_lksb.sb_lvbptr) + goto out_free; again: error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK, @@ -331,8 +325,11 @@ again: fs_err(sdp, "gdlm_unlock %x,%llx err=%d\n", gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number, error); - return; } + return; + +out_free: + gfs2_glock_free(gl); } static void gdlm_cancel(struct gfs2_glock *gl) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index d750d1128bed..aa568796207c 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -914,9 +914,8 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, static void log_write_header(struct gfs2_sbd *sdp, u32 flags) { blk_opf_t op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC; - enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); - gfs2_assert_withdraw(sdp, (state != SFS_FROZEN)); + gfs2_assert_withdraw(sdp, !test_bit(SDF_FROZEN, &sdp->sd_flags)); if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) { gfs2_ordered_wait(sdp); @@ -1036,7 +1035,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) { struct gfs2_trans *tr = NULL; unsigned int reserved_blocks = 0, used_blocks = 0; - enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); + bool frozen = test_bit(SDF_FROZEN, &sdp->sd_flags); unsigned int first_log_head; unsigned int reserved_revokes = 0; @@ -1067,7 +1066,7 @@ repeat: if (tr) { sdp->sd_log_tr = NULL; tr->tr_first = first_log_head; - if (unlikely (state == SFS_FROZEN)) { + if (unlikely(frozen)) { if (gfs2_assert_withdraw_delayed(sdp, !tr->tr_num_buf_new && !tr->tr_num_databuf_new)) goto out_withdraw; @@ -1092,7 +1091,7 @@ repeat: if (flags & GFS2_LOG_HEAD_FLUSH_SHUTDOWN) clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); - if (unlikely(state == SFS_FROZEN)) + if (unlikely(frozen)) if (gfs2_assert_withdraw_delayed(sdp, !reserved_revokes)) goto out_withdraw; @@ -1136,8 +1135,6 @@ repeat: if (flags & (GFS2_LOG_HEAD_FLUSH_SHUTDOWN | GFS2_LOG_HEAD_FLUSH_FREEZE)) gfs2_log_shutdown(sdp); - if (flags & GFS2_LOG_HEAD_FLUSH_FREEZE) - atomic_set(&sdp->sd_freeze_state, SFS_FROZEN); } out_end: diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 1902413d5d12..251322b01631 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -427,10 +427,11 @@ static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd, { struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct gfs2_log_header_host lh; - void *kaddr = kmap_atomic(page); + void *kaddr; unsigned int offset; bool ret = false; + kaddr = kmap_local_page(page); for (offset = 0; offset < PAGE_SIZE; offset += sdp->sd_sb.sb_bsize) { if (!__get_log_header(sdp, kaddr + offset, 0, &lh)) { if (lh.lh_sequence >= head->lh_sequence) @@ -441,7 +442,7 @@ static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd, } } } - kunmap_atomic(kaddr); + kunmap_local(kaddr); return ret; } @@ -626,11 +627,11 @@ static void gfs2_check_magic(struct buffer_head *bh) __be32 *ptr; clear_buffer_escaped(bh); - kaddr = kmap_atomic(bh->b_page); + kaddr = kmap_local_page(bh->b_page); ptr = kaddr + bh_offset(bh); if (*ptr == cpu_to_be32(GFS2_MAGIC)) set_buffer_escaped(bh); - kunmap_atomic(kaddr); + kunmap_local(kaddr); } static int blocknr_cmp(void *priv, const struct list_head *a, @@ -696,14 +697,12 @@ static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit, lock_buffer(bd2->bd_bh); if (buffer_escaped(bd2->bd_bh)) { - void *kaddr; + void *p; + page = mempool_alloc(gfs2_page_pool, GFP_NOIO); - ptr = page_address(page); - kaddr = kmap_atomic(bd2->bd_bh->b_page); - memcpy(ptr, kaddr + bh_offset(bd2->bd_bh), - bd2->bd_bh->b_size); - kunmap_atomic(kaddr); - *(__be32 *)ptr = 0; + p = page_address(page); + memcpy_from_page(p, page, bh_offset(bd2->bd_bh), bd2->bd_bh->b_size); + *(__be32 *)p = 0; clear_buffer_escaped(bd2->bd_bh); unlock_buffer(bd2->bd_bh); brelse(bd2->bd_bh); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 9af9ddb61ca0..8a27957dbfee 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -135,7 +135,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) init_rwsem(&sdp->sd_log_flush_lock); atomic_set(&sdp->sd_log_in_flight, 0); init_waitqueue_head(&sdp->sd_log_flush_wait); - atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN); mutex_init(&sdp->sd_freeze_mutex); return sdp; @@ -254,7 +253,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) bio = bio_alloc(sb->s_bdev, 1, REQ_OP_READ | REQ_META, GFP_NOFS); bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9); - bio_add_page(bio, page, PAGE_SIZE, 0); + __bio_add_page(bio, page, PAGE_SIZE, 0); bio->bi_end_io = end_bio_io_page; bio->bi_private = page; @@ -434,7 +433,7 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, error = gfs2_glock_get(sdp, GFS2_FREEZE_LOCK, &gfs2_freeze_glops, CREATE, &sdp->sd_freeze_gl); if (error) { - fs_err(sdp, "can't create transaction glock: %d\n", error); + fs_err(sdp, "can't create freeze glock: %d\n", error); goto fail_rename; } @@ -1140,7 +1139,6 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) int silent = fc->sb_flags & SB_SILENT; struct gfs2_sbd *sdp; struct gfs2_holder mount_gh; - struct gfs2_holder freeze_gh; int error; sdp = init_sbd(sb); @@ -1269,15 +1267,15 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) } } - error = gfs2_freeze_lock(sdp, &freeze_gh, 0); + error = gfs2_freeze_lock_shared(sdp); if (error) goto fail_per_node; if (!sb_rdonly(sb)) error = gfs2_make_fs_rw(sdp); - gfs2_freeze_unlock(&freeze_gh); if (error) { + gfs2_freeze_unlock(&sdp->sd_freeze_gh); if (sdp->sd_quotad_process) kthread_stop(sdp->sd_quotad_process); sdp->sd_quotad_process = NULL; @@ -1590,12 +1588,6 @@ static int gfs2_reconfigure(struct fs_context *fc) fc->sb_flags |= SB_RDONLY; if ((sb->s_flags ^ fc->sb_flags) & SB_RDONLY) { - struct gfs2_holder freeze_gh; - - error = gfs2_freeze_lock(sdp, &freeze_gh, 0); - if (error) - return -EINVAL; - if (fc->sb_flags & SB_RDONLY) { gfs2_make_fs_ro(sdp); } else { @@ -1603,7 +1595,6 @@ static int gfs2_reconfigure(struct fs_context *fc) if (error) errorfc(fc, "unable to remount read-write"); } - gfs2_freeze_unlock(&freeze_gh); } sdp->sd_args = *newargs; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 1ed17226d9ed..704192b73605 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -75,6 +75,9 @@ #define GFS2_QD_HASH_SIZE BIT(GFS2_QD_HASH_SHIFT) #define GFS2_QD_HASH_MASK (GFS2_QD_HASH_SIZE - 1) +#define QC_CHANGE 0 +#define QC_SYNC 1 + /* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */ /* -> sd_bitmap_lock */ static DEFINE_SPINLOCK(qd_lock); @@ -470,7 +473,6 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp) spin_unlock(&qd_lock); if (qd) { - gfs2_assert_warn(sdp, qd->qd_change_sync); error = bh_get(qd); if (error) { clear_bit(QDF_LOCKED, &qd->qd_flags); @@ -591,6 +593,7 @@ int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) if (gfs2_assert_warn(sdp, !ip->i_qadata->qa_qd_num) || gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags))) { error = -EIO; + gfs2_qa_put(ip); goto out; } @@ -661,7 +664,7 @@ static int sort_qd(const void *a, const void *b) return 0; } -static void do_qc(struct gfs2_quota_data *qd, s64 change) +static void do_qc(struct gfs2_quota_data *qd, s64 change, int qc_type) { struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); @@ -686,16 +689,18 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change) qd->qd_change = x; spin_unlock(&qd_lock); - if (!x) { + if (qc_type == QC_CHANGE) { + if (!test_and_set_bit(QDF_CHANGE, &qd->qd_flags)) { + qd_hold(qd); + slot_hold(qd); + } + } else { gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags)); clear_bit(QDF_CHANGE, &qd->qd_flags); qc->qc_flags = 0; qc->qc_id = 0; slot_put(qd); qd_put(qd); - } else if (!test_and_set_bit(QDF_CHANGE, &qd->qd_flags)) { - qd_hold(qd); - slot_hold(qd); } if (change < 0) /* Reset quiet flag if we freed some blocks */ @@ -711,7 +716,6 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index, struct address_space *mapping = inode->i_mapping; struct page *page; struct buffer_head *bh; - void *kaddr; u64 blk; unsigned bsize = sdp->sd_sb.sb_bsize, bnum = 0, boff = 0; unsigned to_write = bytes, pg_off = off; @@ -763,10 +767,8 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index, } /* Write to the page, now that we have setup the buffer(s) */ - kaddr = kmap_atomic(page); - memcpy(kaddr + off, buf, bytes); + memcpy_to_page(page, off, buf, bytes); flush_dcache_page(page); - kunmap_atomic(kaddr); unlock_page(page); put_page(page); @@ -955,7 +957,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) if (error) goto out_end_trans; - do_qc(qd, -qd->qd_change_sync); + do_qc(qd, -qd->qd_change_sync, QC_SYNC); set_bit(QDF_REFRESH, &qd->qd_flags); } @@ -1281,7 +1283,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change, if (qid_eq(qd->qd_id, make_kqid_uid(uid)) || qid_eq(qd->qd_id, make_kqid_gid(gid))) { - do_qc(qd, change); + do_qc(qd, change, QC_CHANGE); } } } diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 2bb085a72e8e..9c7a9f640bad 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -404,7 +404,7 @@ void gfs2_recover_func(struct work_struct *work) struct gfs2_inode *ip = GFS2_I(jd->jd_inode); struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct gfs2_log_header_host head; - struct gfs2_holder j_gh, ji_gh, thaw_gh; + struct gfs2_holder j_gh, ji_gh; ktime_t t_start, t_jlck, t_jhd, t_tlck, t_rep; int ro = 0; unsigned int pass; @@ -420,10 +420,10 @@ void gfs2_recover_func(struct work_struct *work) if (sdp->sd_args.ar_spectator) goto fail; if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) { - fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n", + fs_info(sdp, "jid=%u: Trying to acquire journal glock...\n", jd->jd_jid); jlocked = 1; - /* Acquire the journal lock so we can do recovery */ + /* Acquire the journal glock so we can do recovery */ error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops, LM_ST_EXCLUSIVE, @@ -465,14 +465,14 @@ void gfs2_recover_func(struct work_struct *work) ktime_ms_delta(t_jhd, t_jlck)); if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { - fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n", - jd->jd_jid); - - /* Acquire a shared hold on the freeze lock */ + mutex_lock(&sdp->sd_freeze_mutex); - error = gfs2_freeze_lock(sdp, &thaw_gh, LM_FLAG_PRIORITY); - if (error) + if (test_bit(SDF_FROZEN, &sdp->sd_flags)) { + mutex_unlock(&sdp->sd_freeze_mutex); + fs_warn(sdp, "jid=%u: Can't replay: filesystem " + "is frozen\n", jd->jd_jid); goto fail_gunlock_ji; + } if (test_bit(SDF_RORECOVERY, &sdp->sd_flags)) { ro = 1; @@ -496,7 +496,7 @@ void gfs2_recover_func(struct work_struct *work) fs_warn(sdp, "jid=%u: Can't replay: read-only block " "device\n", jd->jd_jid); error = -EROFS; - goto fail_gunlock_thaw; + goto fail_gunlock_nofreeze; } t_tlck = ktime_get(); @@ -514,7 +514,7 @@ void gfs2_recover_func(struct work_struct *work) lops_after_scan(jd, error, pass); if (error) { up_read(&sdp->sd_log_flush_lock); - goto fail_gunlock_thaw; + goto fail_gunlock_nofreeze; } } @@ -522,7 +522,7 @@ void gfs2_recover_func(struct work_struct *work) clean_journal(jd, &head); up_read(&sdp->sd_log_flush_lock); - gfs2_freeze_unlock(&thaw_gh); + mutex_unlock(&sdp->sd_freeze_mutex); t_rep = ktime_get(); fs_info(sdp, "jid=%u: Journal replayed in %lldms [jlck:%lldms, " "jhead:%lldms, tlck:%lldms, replay:%lldms]\n", @@ -543,8 +543,8 @@ void gfs2_recover_func(struct work_struct *work) fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); goto done; -fail_gunlock_thaw: - gfs2_freeze_unlock(&thaw_gh); +fail_gunlock_nofreeze: + mutex_unlock(&sdp->sd_freeze_mutex); fail_gunlock_ji: if (jlocked) { gfs2_glock_dq_uninit(&ji_gh); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 3b9b76e980ad..9308190895c8 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -2584,8 +2584,8 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); - rgrp_unlock_local(rgd); be32_add_cpu(&rgd->rd_rgl->rl_unlinked, -1); + rgrp_unlock_local(rgd); gfs2_statfs_change(sdp, 0, +1, -1); trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index a84bf6444bba..9f4d5d6549ee 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -332,7 +332,12 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp) struct lfcc *lfcc; LIST_HEAD(list); struct gfs2_log_header_host lh; - int error; + int error, error2; + + /* + * Grab all the journal glocks in SH mode. We are *probably* doing + * that to prevent recovery. + */ list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL); @@ -349,11 +354,13 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp) list_add(&lfcc->list, &list); } + gfs2_freeze_unlock(&sdp->sd_freeze_gh); + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOPID, &sdp->sd_freeze_gh); if (error) - goto out; + goto relock_shared; list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { error = gfs2_jdesc_check(jd); @@ -368,8 +375,14 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp) } } - if (error) - gfs2_freeze_unlock(&sdp->sd_freeze_gh); + if (!error) + goto out; /* success */ + + gfs2_freeze_unlock(&sdp->sd_freeze_gh); + +relock_shared: + error2 = gfs2_freeze_lock_shared(sdp); + gfs2_assert_withdraw(sdp, !error2); out: while (!list_empty(&list)) { @@ -463,7 +476,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) * @flags: The type of dirty * * Unfortunately it can be called under any combination of inode - * glock and transaction lock, so we have to check carefully. + * glock and freeze glock, so we have to check carefully. * * At the moment this deals only with atime - it should be possible * to expand that role in future, once a review of the locking has @@ -615,6 +628,8 @@ restart: /* Release stuff */ + gfs2_freeze_unlock(&sdp->sd_freeze_gh); + iput(sdp->sd_jindex); iput(sdp->sd_statfs_inode); iput(sdp->sd_rindex); @@ -669,59 +684,109 @@ static int gfs2_sync_fs(struct super_block *sb, int wait) return sdp->sd_log_error; } -void gfs2_freeze_func(struct work_struct *work) +static int gfs2_freeze_locally(struct gfs2_sbd *sdp) { - int error; - struct gfs2_holder freeze_gh; - struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_freeze_work); struct super_block *sb = sdp->sd_vfs; + int error; - atomic_inc(&sb->s_active); - error = gfs2_freeze_lock(sdp, &freeze_gh, 0); - if (error) { - gfs2_assert_withdraw(sdp, 0); - } else { - atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN); - error = thaw_super(sb); - if (error) { - fs_info(sdp, "GFS2: couldn't thaw filesystem: %d\n", - error); - gfs2_assert_withdraw(sdp, 0); + error = freeze_super(sb); + if (error) + return error; + + if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { + gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE | + GFS2_LFC_FREEZE_GO_SYNC); + if (gfs2_withdrawn(sdp)) { + thaw_super(sb); + return -EIO; } - gfs2_freeze_unlock(&freeze_gh); } + return 0; +} + +static int gfs2_do_thaw(struct gfs2_sbd *sdp) +{ + struct super_block *sb = sdp->sd_vfs; + int error; + + error = gfs2_freeze_lock_shared(sdp); + if (error) + goto fail; + error = thaw_super(sb); + if (!error) + return 0; + +fail: + fs_info(sdp, "GFS2: couldn't thaw filesystem: %d\n", error); + gfs2_assert_withdraw(sdp, 0); + return error; +} + +void gfs2_freeze_func(struct work_struct *work) +{ + struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_freeze_work); + struct super_block *sb = sdp->sd_vfs; + int error; + + mutex_lock(&sdp->sd_freeze_mutex); + error = -EBUSY; + if (test_bit(SDF_FROZEN, &sdp->sd_flags)) + goto freeze_failed; + + error = gfs2_freeze_locally(sdp); + if (error) + goto freeze_failed; + + gfs2_freeze_unlock(&sdp->sd_freeze_gh); + set_bit(SDF_FROZEN, &sdp->sd_flags); + + error = gfs2_do_thaw(sdp); + if (error) + goto out; + + clear_bit(SDF_FROZEN, &sdp->sd_flags); + goto out; + +freeze_failed: + fs_info(sdp, "GFS2: couldn't freeze filesystem: %d\n", error); + +out: + mutex_unlock(&sdp->sd_freeze_mutex); deactivate_super(sb); - clear_bit_unlock(SDF_FS_FROZEN, &sdp->sd_flags); - wake_up_bit(&sdp->sd_flags, SDF_FS_FROZEN); - return; } /** - * gfs2_freeze - prevent further writes to the filesystem + * gfs2_freeze_super - prevent further writes to the filesystem * @sb: the VFS structure for the filesystem * */ -static int gfs2_freeze(struct super_block *sb) +static int gfs2_freeze_super(struct super_block *sb) { struct gfs2_sbd *sdp = sb->s_fs_info; int error; - mutex_lock(&sdp->sd_freeze_mutex); - if (atomic_read(&sdp->sd_freeze_state) != SFS_UNFROZEN) { - error = -EBUSY; + if (!mutex_trylock(&sdp->sd_freeze_mutex)) + return -EBUSY; + error = -EBUSY; + if (test_bit(SDF_FROZEN, &sdp->sd_flags)) goto out; - } for (;;) { - if (gfs2_withdrawn(sdp)) { - error = -EINVAL; + error = gfs2_freeze_locally(sdp); + if (error) { + fs_info(sdp, "GFS2: couldn't freeze filesystem: %d\n", + error); goto out; } error = gfs2_lock_fs_check_clean(sdp); if (!error) - break; + break; /* success */ + + error = gfs2_do_thaw(sdp); + if (error) + goto out; if (error == -EBUSY) fs_err(sdp, "waiting for recovery before freeze\n"); @@ -735,32 +800,58 @@ static int gfs2_freeze(struct super_block *sb) fs_err(sdp, "retrying...\n"); msleep(1000); } - set_bit(SDF_FS_FROZEN, &sdp->sd_flags); + out: + if (!error) { + set_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags); + set_bit(SDF_FROZEN, &sdp->sd_flags); + } mutex_unlock(&sdp->sd_freeze_mutex); return error; } /** - * gfs2_unfreeze - reallow writes to the filesystem + * gfs2_thaw_super - reallow writes to the filesystem * @sb: the VFS structure for the filesystem * */ -static int gfs2_unfreeze(struct super_block *sb) +static int gfs2_thaw_super(struct super_block *sb) { struct gfs2_sbd *sdp = sb->s_fs_info; + int error; - mutex_lock(&sdp->sd_freeze_mutex); - if (atomic_read(&sdp->sd_freeze_state) != SFS_FROZEN || - !gfs2_holder_initialized(&sdp->sd_freeze_gh)) { - mutex_unlock(&sdp->sd_freeze_mutex); - return -EINVAL; + if (!mutex_trylock(&sdp->sd_freeze_mutex)) + return -EBUSY; + error = -EINVAL; + if (!test_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags)) + goto out; + + gfs2_freeze_unlock(&sdp->sd_freeze_gh); + + error = gfs2_do_thaw(sdp); + + if (!error) { + clear_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags); + clear_bit(SDF_FROZEN, &sdp->sd_flags); } +out: + mutex_unlock(&sdp->sd_freeze_mutex); + return error; +} + +void gfs2_thaw_freeze_initiator(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + + mutex_lock(&sdp->sd_freeze_mutex); + if (!test_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags)) + goto out; gfs2_freeze_unlock(&sdp->sd_freeze_gh); + +out: mutex_unlock(&sdp->sd_freeze_mutex); - return wait_on_bit(&sdp->sd_flags, SDF_FS_FROZEN, TASK_INTERRUPTIBLE); } /** @@ -1004,7 +1095,14 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) { struct gfs2_sbd *sdp = root->d_sb->s_fs_info; struct gfs2_args *args = &sdp->sd_args; - int val; + unsigned int logd_secs, statfs_slow, statfs_quantum, quota_quantum; + + spin_lock(&sdp->sd_tune.gt_spin); + logd_secs = sdp->sd_tune.gt_logd_secs; + quota_quantum = sdp->sd_tune.gt_quota_quantum; + statfs_quantum = sdp->sd_tune.gt_statfs_quantum; + statfs_slow = sdp->sd_tune.gt_statfs_slow; + spin_unlock(&sdp->sd_tune.gt_spin); if (is_ancestor(root, sdp->sd_master_dir)) seq_puts(s, ",meta"); @@ -1059,17 +1157,14 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) } if (args->ar_discard) seq_puts(s, ",discard"); - val = sdp->sd_tune.gt_logd_secs; - if (val != 30) - seq_printf(s, ",commit=%d", val); - val = sdp->sd_tune.gt_statfs_quantum; - if (val != 30) - seq_printf(s, ",statfs_quantum=%d", val); - else if (sdp->sd_tune.gt_statfs_slow) + if (logd_secs != 30) + seq_printf(s, ",commit=%d", logd_secs); + if (statfs_quantum != 30) + seq_printf(s, ",statfs_quantum=%d", statfs_quantum); + else if (statfs_slow) seq_puts(s, ",statfs_quantum=0"); - val = sdp->sd_tune.gt_quota_quantum; - if (val != 60) - seq_printf(s, ",quota_quantum=%d", val); + if (quota_quantum != 60) + seq_printf(s, ",quota_quantum=%d", quota_quantum); if (args->ar_statfs_percent) seq_printf(s, ",statfs_percent=%d", args->ar_statfs_percent); if (args->ar_errors != GFS2_ERRORS_DEFAULT) { @@ -1131,9 +1226,7 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip) return -EIO; } - error = gfs2_rindex_update(sdp); - if (error) - return error; + gfs2_rindex_update(sdp); error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (error) @@ -1334,9 +1427,6 @@ static int evict_unlinked_inode(struct inode *inode) goto out; } - if (ip->i_gl) - gfs2_inode_remember_delete(ip->i_gl, ip->i_no_formal_ino); - /* * As soon as we clear the bitmap for the dinode, gfs2_create_inode() * can get called to recreate it, or even gfs2_inode_lookup() if the @@ -1350,6 +1440,9 @@ static int evict_unlinked_inode(struct inode *inode) */ ret = gfs2_dinode_dealloc(ip); + if (!ret && ip->i_gl) + gfs2_inode_remember_delete(ip->i_gl, ip->i_no_formal_ino); + out: return ret; } @@ -1528,8 +1621,8 @@ const struct super_operations gfs2_super_ops = { .evict_inode = gfs2_evict_inode, .put_super = gfs2_put_super, .sync_fs = gfs2_sync_fs, - .freeze_super = gfs2_freeze, - .thaw_super = gfs2_unfreeze, + .freeze_super = gfs2_freeze_super, + .thaw_super = gfs2_thaw_super, .statfs = gfs2_statfs, .drop_inode = gfs2_drop_inode, .show_options = gfs2_show_options, diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index 58d13fd77aed..bba58629bc45 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -46,6 +46,7 @@ extern void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh); extern int gfs2_statfs_sync(struct super_block *sb, int type); extern void gfs2_freeze_func(struct work_struct *work); +extern void gfs2_thaw_freeze_initiator(struct super_block *sb); extern void free_local_statfs_inodes(struct gfs2_sbd *sdp); extern struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp, diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 454dc2ff8b5e..2dfbe2f188dd 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -82,6 +82,7 @@ static ssize_t status_show(struct gfs2_sbd *sdp, char *buf) "RO Recovery: %d\n" "Skip DLM Unlock: %d\n" "Force AIL Flush: %d\n" + "FS Freeze Initiator: %d\n" "FS Frozen: %d\n" "Withdrawing: %d\n" "Withdraw In Prog: %d\n" @@ -111,7 +112,8 @@ static ssize_t status_show(struct gfs2_sbd *sdp, char *buf) test_bit(SDF_RORECOVERY, &f), test_bit(SDF_SKIP_DLM_UNLOCK, &f), test_bit(SDF_FORCE_AIL_FLUSH, &f), - test_bit(SDF_FS_FROZEN, &f), + test_bit(SDF_FREEZE_INITIATOR, &f), + test_bit(SDF_FROZEN, &f), test_bit(SDF_WITHDRAWING, &f), test_bit(SDF_WITHDRAW_IN_PROG, &f), test_bit(SDF_REMOTE_WITHDRAW, &f), diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 63fec11ef2ce..ec1631257978 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -233,7 +233,6 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) struct gfs2_bufdata *bd; struct gfs2_meta_header *mh; struct gfs2_trans *tr = current->journal_info; - enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); lock_buffer(bh); if (buffer_pinned(bh)) { @@ -267,7 +266,7 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) (unsigned long long)bd->bd_bh->b_blocknr); BUG(); } - if (unlikely(state == SFS_FROZEN)) { + if (unlikely(test_bit(SDF_FROZEN, &sdp->sd_flags))) { fs_info(sdp, "GFS2:adding buf while frozen\n"); gfs2_assert_withdraw(sdp, 0); } diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 7a6aeffcdf5c..dac22b1c1a2e 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -93,21 +93,18 @@ out_unlock: } /** - * gfs2_freeze_lock - hold the freeze glock + * gfs2_freeze_lock_shared - hold the freeze glock * @sdp: the superblock - * @freeze_gh: pointer to the requested holder - * @caller_flags: any additional flags needed by the caller */ -int gfs2_freeze_lock(struct gfs2_sbd *sdp, struct gfs2_holder *freeze_gh, - int caller_flags) +int gfs2_freeze_lock_shared(struct gfs2_sbd *sdp) { - int flags = LM_FLAG_NOEXP | GL_EXACT | caller_flags; int error; - error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, flags, - freeze_gh); - if (error && error != GLR_TRYFAILED) - fs_err(sdp, "can't lock the freeze lock: %d\n", error); + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, + LM_FLAG_NOEXP | GL_EXACT, + &sdp->sd_freeze_gh); + if (error) + fs_err(sdp, "can't lock the freeze glock: %d\n", error); return error; } @@ -124,7 +121,6 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) struct gfs2_inode *ip; struct gfs2_glock *i_gl; u64 no_formal_ino; - int log_write_allowed = test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); int ret = 0; int tries; @@ -152,24 +148,18 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) */ clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); if (!sb_rdonly(sdp->sd_vfs)) { - struct gfs2_holder freeze_gh; - - gfs2_holder_mark_uninitialized(&freeze_gh); - if (sdp->sd_freeze_gl && - !gfs2_glock_is_locked_by_me(sdp->sd_freeze_gl)) { - ret = gfs2_freeze_lock(sdp, &freeze_gh, - log_write_allowed ? 0 : LM_FLAG_TRY); - if (ret == GLR_TRYFAILED) - ret = 0; - } - if (!ret) - gfs2_make_fs_ro(sdp); + bool locked = mutex_trylock(&sdp->sd_freeze_mutex); + + gfs2_make_fs_ro(sdp); + + if (locked) + mutex_unlock(&sdp->sd_freeze_mutex); + /* * Dequeue any pending non-system glock holders that can no * longer be granted because the file system is withdrawn. */ gfs2_gl_dq_holders(sdp); - gfs2_freeze_unlock(&freeze_gh); } if (sdp->sd_lockstruct.ls_ops->lm_lock == NULL) { /* lock_nolock */ @@ -187,15 +177,8 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) } sdp->sd_jinode_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq(&sdp->sd_jinode_gh); - if (test_bit(SDF_FS_FROZEN, &sdp->sd_flags)) { - /* Make sure gfs2_unfreeze works if partially-frozen */ - flush_work(&sdp->sd_freeze_work); - atomic_set(&sdp->sd_freeze_state, SFS_FROZEN); - thaw_super(sdp->sd_vfs); - } else { - wait_on_bit(&i_gl->gl_flags, GLF_DEMOTE, - TASK_UNINTERRUPTIBLE); - } + gfs2_thaw_freeze_initiator(sdp->sd_vfs); + wait_on_bit(&i_gl->gl_flags, GLF_DEMOTE, TASK_UNINTERRUPTIBLE); /* * holder_uninit to force glock_put, to force dlm to let go diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 78ec190f4155..cdb839529175 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -149,8 +149,7 @@ int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, extern int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, bool verbose); -extern int gfs2_freeze_lock(struct gfs2_sbd *sdp, - struct gfs2_holder *freeze_gh, int caller_flags); +extern int gfs2_freeze_lock_shared(struct gfs2_sbd *sdp); extern void gfs2_freeze_unlock(struct gfs2_holder *freeze_gh); #define gfs2_io_error(sdp) \ diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 1f7bd068acf0..441d7fc952e3 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -694,7 +694,7 @@ static const struct file_operations hfs_file_operations = { .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, .fsync = hfs_file_fsync, .open = hfs_file_open, .release = hfs_file_release, diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index b21660475ac1..7d1a675e037d 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -372,7 +372,7 @@ static const struct file_operations hfsplus_file_operations = { .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, .fsync = hfsplus_file_fsync, .open = hfsplus_file_open, .release = hfsplus_file_release, diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index 69cb796f6270..0239e3af3945 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -65,6 +65,7 @@ struct hostfs_stat { unsigned long long blocks; unsigned int maj; unsigned int min; + dev_t dev; }; extern int stat_file(const char *path, struct hostfs_stat *p, int fd); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 28b4f15c19eb..46387090eb76 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -26,6 +26,7 @@ struct hostfs_inode_info { fmode_t mode; struct inode vfs_inode; struct mutex open_mutex; + dev_t dev; }; static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode) @@ -182,14 +183,6 @@ static char *follow_link(char *link) return ERR_PTR(n); } -static struct inode *hostfs_iget(struct super_block *sb) -{ - struct inode *inode = new_inode(sb); - if (!inode) - return ERR_PTR(-ENOMEM); - return inode; -} - static int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf) { /* @@ -228,6 +221,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb) return NULL; hi->fd = -1; hi->mode = 0; + hi->dev = 0; inode_init_once(&hi->vfs_inode); mutex_init(&hi->open_mutex); return &hi->vfs_inode; @@ -240,6 +234,7 @@ static void hostfs_evict_inode(struct inode *inode) if (HOSTFS_I(inode)->fd != -1) { close_file(&HOSTFS_I(inode)->fd); HOSTFS_I(inode)->fd = -1; + HOSTFS_I(inode)->dev = 0; } } @@ -265,6 +260,7 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root) static const struct super_operations hostfs_sbops = { .alloc_inode = hostfs_alloc_inode, .free_inode = hostfs_free_inode, + .drop_inode = generic_delete_inode, .evict_inode = hostfs_evict_inode, .statfs = hostfs_statfs, .show_options = hostfs_show_options, @@ -381,7 +377,7 @@ static int hostfs_fsync(struct file *file, loff_t start, loff_t end, static const struct file_operations hostfs_file_fops = { .llseek = generic_file_llseek, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, @@ -512,18 +508,31 @@ static const struct address_space_operations hostfs_aops = { .write_end = hostfs_write_end, }; -static int read_name(struct inode *ino, char *name) +static int hostfs_inode_update(struct inode *ino, const struct hostfs_stat *st) +{ + set_nlink(ino, st->nlink); + i_uid_write(ino, st->uid); + i_gid_write(ino, st->gid); + ino->i_atime = + (struct timespec64){ st->atime.tv_sec, st->atime.tv_nsec }; + ino->i_mtime = + (struct timespec64){ st->mtime.tv_sec, st->mtime.tv_nsec }; + ino->i_ctime = + (struct timespec64){ st->ctime.tv_sec, st->ctime.tv_nsec }; + ino->i_size = st->size; + ino->i_blocks = st->blocks; + return 0; +} + +static int hostfs_inode_set(struct inode *ino, void *data) { + struct hostfs_stat *st = data; dev_t rdev; - struct hostfs_stat st; - int err = stat_file(name, &st, -1); - if (err) - return err; /* Reencode maj and min with the kernel encoding.*/ - rdev = MKDEV(st.maj, st.min); + rdev = MKDEV(st->maj, st->min); - switch (st.mode & S_IFMT) { + switch (st->mode & S_IFMT) { case S_IFLNK: ino->i_op = &hostfs_link_iops; break; @@ -535,7 +544,7 @@ static int read_name(struct inode *ino, char *name) case S_IFBLK: case S_IFIFO: case S_IFSOCK: - init_special_inode(ino, st.mode & S_IFMT, rdev); + init_special_inode(ino, st->mode & S_IFMT, rdev); ino->i_op = &hostfs_iops; break; case S_IFREG: @@ -547,17 +556,42 @@ static int read_name(struct inode *ino, char *name) return -EIO; } - ino->i_ino = st.ino; - ino->i_mode = st.mode; - set_nlink(ino, st.nlink); - i_uid_write(ino, st.uid); - i_gid_write(ino, st.gid); - ino->i_atime = (struct timespec64){ st.atime.tv_sec, st.atime.tv_nsec }; - ino->i_mtime = (struct timespec64){ st.mtime.tv_sec, st.mtime.tv_nsec }; - ino->i_ctime = (struct timespec64){ st.ctime.tv_sec, st.ctime.tv_nsec }; - ino->i_size = st.size; - ino->i_blocks = st.blocks; - return 0; + HOSTFS_I(ino)->dev = st->dev; + ino->i_ino = st->ino; + ino->i_mode = st->mode; + return hostfs_inode_update(ino, st); +} + +static int hostfs_inode_test(struct inode *inode, void *data) +{ + const struct hostfs_stat *st = data; + + return inode->i_ino == st->ino && HOSTFS_I(inode)->dev == st->dev; +} + +static struct inode *hostfs_iget(struct super_block *sb, char *name) +{ + struct inode *inode; + struct hostfs_stat st; + int err = stat_file(name, &st, -1); + + if (err) + return ERR_PTR(err); + + inode = iget5_locked(sb, st.ino, hostfs_inode_test, hostfs_inode_set, + &st); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + unlock_new_inode(inode); + } else { + spin_lock(&inode->i_lock); + hostfs_inode_update(inode, &st); + spin_unlock(&inode->i_lock); + } + + return inode; } static int hostfs_create(struct mnt_idmap *idmap, struct inode *dir, @@ -565,62 +599,48 @@ static int hostfs_create(struct mnt_idmap *idmap, struct inode *dir, { struct inode *inode; char *name; - int error, fd; - - inode = hostfs_iget(dir->i_sb); - if (IS_ERR(inode)) { - error = PTR_ERR(inode); - goto out; - } + int fd; - error = -ENOMEM; name = dentry_name(dentry); if (name == NULL) - goto out_put; + return -ENOMEM; fd = file_create(name, mode & 0777); - if (fd < 0) - error = fd; - else - error = read_name(inode, name); + if (fd < 0) { + __putname(name); + return fd; + } + inode = hostfs_iget(dir->i_sb, name); __putname(name); - if (error) - goto out_put; + if (IS_ERR(inode)) + return PTR_ERR(inode); HOSTFS_I(inode)->fd = fd; HOSTFS_I(inode)->mode = FMODE_READ | FMODE_WRITE; d_instantiate(dentry, inode); return 0; - - out_put: - iput(inode); - out: - return error; } static struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, unsigned int flags) { - struct inode *inode; + struct inode *inode = NULL; char *name; - int err; - - inode = hostfs_iget(ino->i_sb); - if (IS_ERR(inode)) - goto out; - err = -ENOMEM; name = dentry_name(dentry); - if (name) { - err = read_name(inode, name); - __putname(name); - } - if (err) { - iput(inode); - inode = (err == -ENOENT) ? NULL : ERR_PTR(err); + if (name == NULL) + return ERR_PTR(-ENOMEM); + + inode = hostfs_iget(ino->i_sb, name); + __putname(name); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -ENOENT) + inode = NULL; + else + return ERR_CAST(inode); } - out: + return d_splice_alias(inode, dentry); } @@ -704,35 +724,23 @@ static int hostfs_mknod(struct mnt_idmap *idmap, struct inode *dir, char *name; int err; - inode = hostfs_iget(dir->i_sb); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - goto out; - } - - err = -ENOMEM; name = dentry_name(dentry); if (name == NULL) - goto out_put; + return -ENOMEM; err = do_mknod(name, mode, MAJOR(dev), MINOR(dev)); - if (err) - goto out_free; + if (err) { + __putname(name); + return err; + } - err = read_name(inode, name); + inode = hostfs_iget(dir->i_sb, name); __putname(name); - if (err) - goto out_put; + if (IS_ERR(inode)) + return PTR_ERR(inode); d_instantiate(dentry, inode); return 0; - - out_free: - __putname(name); - out_put: - iput(inode); - out: - return err; } static int hostfs_rename2(struct mnt_idmap *idmap, @@ -929,49 +937,40 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) sb->s_maxbytes = MAX_LFS_FILESIZE; err = super_setup_bdi(sb); if (err) - goto out; + return err; /* NULL is printed as '(null)' by printf(): avoid that. */ if (req_root == NULL) req_root = ""; - err = -ENOMEM; sb->s_fs_info = host_root_path = kasprintf(GFP_KERNEL, "%s/%s", root_ino, req_root); if (host_root_path == NULL) - goto out; - - root_inode = new_inode(sb); - if (!root_inode) - goto out; + return -ENOMEM; - err = read_name(root_inode, host_root_path); - if (err) - goto out_put; + root_inode = hostfs_iget(sb, host_root_path); + if (IS_ERR(root_inode)) + return PTR_ERR(root_inode); if (S_ISLNK(root_inode->i_mode)) { - char *name = follow_link(host_root_path); - if (IS_ERR(name)) { - err = PTR_ERR(name); - goto out_put; - } - err = read_name(root_inode, name); + char *name; + + iput(root_inode); + name = follow_link(host_root_path); + if (IS_ERR(name)) + return PTR_ERR(name); + + root_inode = hostfs_iget(sb, name); kfree(name); - if (err) - goto out_put; + if (IS_ERR(root_inode)) + return PTR_ERR(root_inode); } - err = -ENOMEM; sb->s_root = d_make_root(root_inode); if (sb->s_root == NULL) - goto out; + return -ENOMEM; return 0; - -out_put: - iput(root_inode); -out: - return err; } static struct dentry *hostfs_read_sb(struct file_system_type *type, diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index 5ecc4706172b..840619e39a1a 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -36,6 +36,7 @@ static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p) p->blocks = buf->st_blocks; p->maj = os_major(buf->st_rdev); p->min = os_minor(buf->st_rdev); + p->dev = buf->st_dev; } int stat_file(const char *path, struct hostfs_stat *p, int fd) diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 88952d4a631e..1bb8d97cd9ae 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -259,7 +259,7 @@ const struct file_operations hpfs_file_ops = .mmap = generic_file_mmap, .release = hpfs_file_release, .fsync = hpfs_file_fsync, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, .unlocked_ioctl = hpfs_ioctl, .compat_ioctl = compat_ptr_ioctl, }; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index ecfdfb2529a3..7b17ccfa039d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -821,7 +821,6 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, */ struct folio *folio; unsigned long addr; - bool present; cond_resched(); @@ -834,9 +833,6 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, break; } - /* Set numa allocation policy based on index */ - hugetlb_set_vma_policy(&pseudo_vma, inode, index); - /* addr is the offset within the file (zero based) */ addr = index * hpage_size; @@ -845,12 +841,10 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, mutex_lock(&hugetlb_fault_mutex_table[hash]); /* See if already present in mapping to avoid alloc/free */ - rcu_read_lock(); - present = page_cache_next_miss(mapping, index, 1) != index; - rcu_read_unlock(); - if (present) { + folio = filemap_get_folio(mapping, index); + if (!IS_ERR(folio)) { + folio_put(folio); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - hugetlb_drop_vma_policy(&pseudo_vma); continue; } @@ -862,6 +856,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * folios in these areas, we need to consume the reserves * to keep reservation accounting consistent. */ + hugetlb_set_vma_policy(&pseudo_vma, inode, index); folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0); hugetlb_drop_vma_policy(&pseudo_vma); if (IS_ERR(folio)) { diff --git a/fs/inode.c b/fs/inode.c index 577799b7855f..8fefb69e1f84 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1104,9 +1104,51 @@ void discard_new_inode(struct inode *inode) EXPORT_SYMBOL(discard_new_inode); /** + * lock_two_inodes - lock two inodes (may be regular files but also dirs) + * + * Lock any non-NULL argument. The caller must make sure that if he is passing + * in two directories, one is not ancestor of the other. Zero, one or two + * objects may be locked by this function. + * + * @inode1: first inode to lock + * @inode2: second inode to lock + * @subclass1: inode lock subclass for the first lock obtained + * @subclass2: inode lock subclass for the second lock obtained + */ +void lock_two_inodes(struct inode *inode1, struct inode *inode2, + unsigned subclass1, unsigned subclass2) +{ + if (!inode1 || !inode2) { + /* + * Make sure @subclass1 will be used for the acquired lock. + * This is not strictly necessary (no current caller cares) but + * let's keep things consistent. + */ + if (!inode1) + swap(inode1, inode2); + goto lock; + } + + /* + * If one object is directory and the other is not, we must make sure + * to lock directory first as the other object may be its child. + */ + if (S_ISDIR(inode2->i_mode) == S_ISDIR(inode1->i_mode)) { + if (inode1 > inode2) + swap(inode1, inode2); + } else if (!S_ISDIR(inode1->i_mode)) + swap(inode1, inode2); +lock: + if (inode1) + inode_lock_nested(inode1, subclass1); + if (inode2 && inode2 != inode1) + inode_lock_nested(inode2, subclass2); +} + +/** * lock_two_nondirectories - take two i_mutexes on non-directory objects * - * Lock any non-NULL argument that is not a directory. + * Lock any non-NULL argument. Passed objects must not be directories. * Zero, one or two objects may be locked by this function. * * @inode1: first inode to lock @@ -1114,13 +1156,11 @@ EXPORT_SYMBOL(discard_new_inode); */ void lock_two_nondirectories(struct inode *inode1, struct inode *inode2) { - if (inode1 > inode2) - swap(inode1, inode2); - - if (inode1 && !S_ISDIR(inode1->i_mode)) - inode_lock(inode1); - if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) - inode_lock_nested(inode2, I_MUTEX_NONDIR2); + if (inode1) + WARN_ON_ONCE(S_ISDIR(inode1->i_mode)); + if (inode2) + WARN_ON_ONCE(S_ISDIR(inode2->i_mode)); + lock_two_inodes(inode1, inode2, I_MUTEX_NORMAL, I_MUTEX_NONDIR2); } EXPORT_SYMBOL(lock_two_nondirectories); @@ -1131,10 +1171,14 @@ EXPORT_SYMBOL(lock_two_nondirectories); */ void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2) { - if (inode1 && !S_ISDIR(inode1->i_mode)) + if (inode1) { + WARN_ON_ONCE(S_ISDIR(inode1->i_mode)); inode_unlock(inode1); - if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) + } + if (inode2 && inode2 != inode1) { + WARN_ON_ONCE(S_ISDIR(inode2->i_mode)); inode_unlock(inode2); + } } EXPORT_SYMBOL(unlock_two_nondirectories); @@ -2264,7 +2308,8 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) inode->i_fop = &def_chr_fops; inode->i_rdev = rdev; } else if (S_ISBLK(mode)) { - inode->i_fop = &def_blk_fops; + if (IS_ENABLED(CONFIG_BLOCK)) + inode->i_fop = &def_blk_fops; inode->i_rdev = rdev; } else if (S_ISFIFO(mode)) inode->i_fop = &pipefifo_fops; diff --git a/fs/internal.h b/fs/internal.h index bd3b2810a36b..f7a3dc111026 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -97,8 +97,9 @@ extern void chroot_fs_refs(const struct path *, const struct path *); /* * file_table.c */ -extern struct file *alloc_empty_file(int, const struct cred *); -extern struct file *alloc_empty_file_noaccount(int, const struct cred *); +struct file *alloc_empty_file(int flags, const struct cred *cred); +struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred); +struct file *alloc_empty_backing_file(int flags, const struct cred *cred); static inline void put_file_access(struct file *file) { @@ -121,6 +122,47 @@ extern bool mount_capable(struct fs_context *); int sb_init_dio_done_wq(struct super_block *sb); /* + * Prepare superblock for changing its read-only state (i.e., either remount + * read-write superblock read-only or vice versa). After this function returns + * mnt_is_readonly() will return true for any mount of the superblock if its + * caller is able to observe any changes done by the remount. This holds until + * sb_end_ro_state_change() is called. + */ +static inline void sb_start_ro_state_change(struct super_block *sb) +{ + WRITE_ONCE(sb->s_readonly_remount, 1); + /* + * For RO->RW transition, the barrier pairs with the barrier in + * mnt_is_readonly() making sure if mnt_is_readonly() sees SB_RDONLY + * cleared, it will see s_readonly_remount set. + * For RW->RO transition, the barrier pairs with the barrier in + * __mnt_want_write() before the mnt_is_readonly() check. The barrier + * makes sure if __mnt_want_write() sees MNT_WRITE_HOLD already + * cleared, it will see s_readonly_remount set. + */ + smp_wmb(); +} + +/* + * Ends section changing read-only state of the superblock. After this function + * returns if mnt_is_readonly() returns false, the caller will be able to + * observe all the changes remount did to the superblock. + */ +static inline void sb_end_ro_state_change(struct super_block *sb) +{ + /* + * This barrier provides release semantics that pairs with + * the smp_rmb() acquire semantics in mnt_is_readonly(). + * This barrier pair ensure that when mnt_is_readonly() sees + * 0 for sb->s_readonly_remount, it will also see all the + * preceding flag changes that were made during the RO state + * change. + */ + smp_wmb(); + WRITE_ONCE(sb->s_readonly_remount, 0); +} + +/* * open.c */ struct open_flags { @@ -152,6 +194,8 @@ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); int dentry_needs_remove_privs(struct mnt_idmap *, struct dentry *dentry); bool in_group_or_capable(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t vfsgid); +void lock_two_inodes(struct inode *inode1, struct inode *inode2, + unsigned subclass1, unsigned subclass2); /* * fs-writeback.c diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 063133ec77f4..adb92cdb24b0 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -312,7 +312,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, ctx->bio->bi_opf |= REQ_RAHEAD; ctx->bio->bi_iter.bi_sector = sector; ctx->bio->bi_end_io = iomap_read_end_io; - bio_add_folio(ctx->bio, folio, plen, poff); + bio_add_folio_nofail(ctx->bio, folio, plen, poff); } done: @@ -539,7 +539,7 @@ static int iomap_read_folio_sync(loff_t block_start, struct folio *folio, bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ); bio.bi_iter.bi_sector = iomap_sector(iomap, block_start); - bio_add_folio(&bio, folio, plen, poff); + bio_add_folio_nofail(&bio, folio, plen, poff); return submit_bio_wait(&bio); } @@ -864,16 +864,19 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, .len = iov_iter_count(i), .flags = IOMAP_WRITE, }; - int ret; + ssize_t ret; if (iocb->ki_flags & IOCB_NOWAIT) iter.flags |= IOMAP_NOWAIT; while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = iomap_write_iter(&iter, i); - if (iter.pos == iocb->ki_pos) + + if (unlikely(ret < 0)) return ret; - return iter.pos - iocb->ki_pos; + ret = iter.pos - iocb->ki_pos; + iocb->ki_pos += ret; + return ret; } EXPORT_SYMBOL_GPL(iomap_file_buffered_write); @@ -1073,7 +1076,7 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode, { loff_t start_byte; loff_t end_byte; - int blocksize = i_blocksize(inode); + unsigned int blocksize = i_blocksize(inode); if (iomap->type != IOMAP_DELALLOC) return 0; @@ -1582,7 +1585,7 @@ iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio, if (!bio_add_folio(wpc->ioend->io_bio, folio, len, poff)) { wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio); - bio_add_folio(wpc->ioend->io_bio, folio, len, poff); + bio_add_folio_nofail(wpc->ioend->io_bio, folio, len, poff); } if (iop) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 019cc87d0fb3..ea3b868c8355 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -81,7 +81,6 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) { const struct iomap_dio_ops *dops = dio->dops; struct kiocb *iocb = dio->iocb; - struct inode *inode = file_inode(iocb->ki_filp); loff_t offset = iocb->ki_pos; ssize_t ret = dio->error; @@ -94,7 +93,6 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) if (offset + ret > dio->i_size && !(dio->flags & IOMAP_DIO_WRITE)) ret = dio->i_size - offset; - iocb->ki_pos += ret; } /* @@ -109,30 +107,25 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) * ->end_io() when necessary, otherwise a racing buffer read would cache * zeros from unwritten extents. */ - if (!dio->error && dio->size && - (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) { - int err; - err = invalidate_inode_pages2_range(inode->i_mapping, - offset >> PAGE_SHIFT, - (offset + dio->size - 1) >> PAGE_SHIFT); - if (err) - dio_warn_stale_pagecache(iocb->ki_filp); - } + if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE)) + kiocb_invalidate_post_direct_write(iocb, dio->size); inode_dio_end(file_inode(iocb->ki_filp)); - /* - * If this is a DSYNC write, make sure we push it to stable storage now - * that we've written data. - */ - if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC)) - ret = generic_write_sync(iocb, ret); - if (ret > 0) - ret += dio->done_before; + if (ret > 0) { + iocb->ki_pos += ret; + /* + * If this is a DSYNC write, make sure we push it to stable + * storage now that we've written data. + */ + if (dio->flags & IOMAP_DIO_NEED_SYNC) + ret = generic_write_sync(iocb, ret); + if (ret > 0) + ret += dio->done_before; + } trace_iomap_dio_complete(iocb, dio->error, ret); kfree(dio); - return ret; } EXPORT_SYMBOL_GPL(iomap_dio_complete); @@ -203,7 +196,6 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; - get_page(page); __bio_add_page(bio, page, len, 0); iomap_dio_submit_bio(iter, dio, bio, pos); } @@ -479,7 +471,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags, void *private, size_t done_before) { - struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = file_inode(iocb->ki_filp); struct iomap_iter iomi = { .inode = inode, @@ -488,11 +479,11 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, .flags = IOMAP_DIRECT, .private = private, }; - loff_t end = iomi.pos + iomi.len - 1, ret = 0; bool wait_for_completion = is_sync_kiocb(iocb) || (dio_flags & IOMAP_DIO_FORCE_WAIT); struct blk_plug plug; struct iomap_dio *dio; + loff_t ret = 0; trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before); @@ -516,31 +507,29 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->submit.waiter = current; dio->submit.poll_bio = NULL; + if (iocb->ki_flags & IOCB_NOWAIT) + iomi.flags |= IOMAP_NOWAIT; + if (iov_iter_rw(iter) == READ) { if (iomi.pos >= dio->i_size) goto out_free_dio; - if (iocb->ki_flags & IOCB_NOWAIT) { - if (filemap_range_needs_writeback(mapping, iomi.pos, - end)) { - ret = -EAGAIN; - goto out_free_dio; - } - iomi.flags |= IOMAP_NOWAIT; - } - if (user_backed_iter(iter)) dio->flags |= IOMAP_DIO_DIRTY; + + ret = kiocb_write_and_wait(iocb, iomi.len); + if (ret) + goto out_free_dio; } else { iomi.flags |= IOMAP_WRITE; dio->flags |= IOMAP_DIO_WRITE; - if (iocb->ki_flags & IOCB_NOWAIT) { - if (filemap_range_has_page(mapping, iomi.pos, end)) { - ret = -EAGAIN; + if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) { + ret = -EAGAIN; + if (iomi.pos >= dio->i_size || + iomi.pos + iomi.len > dio->i_size) goto out_free_dio; - } - iomi.flags |= IOMAP_NOWAIT; + iomi.flags |= IOMAP_OVERWRITE_ONLY; } /* for data sync or sync, we need sync completion processing */ @@ -556,31 +545,19 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (!(iocb->ki_flags & IOCB_SYNC)) dio->flags |= IOMAP_DIO_WRITE_FUA; } - } - if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) { - ret = -EAGAIN; - if (iomi.pos >= dio->i_size || - iomi.pos + iomi.len > dio->i_size) - goto out_free_dio; - iomi.flags |= IOMAP_OVERWRITE_ONLY; - } - - ret = filemap_write_and_wait_range(mapping, iomi.pos, end); - if (ret) - goto out_free_dio; - - if (iov_iter_rw(iter) == WRITE) { /* * Try to invalidate cache pages for the range we are writing. * If this invalidation fails, let the caller fall back to * buffered I/O. */ - if (invalidate_inode_pages2_range(mapping, - iomi.pos >> PAGE_SHIFT, end >> PAGE_SHIFT)) { - trace_iomap_dio_invalidate_fail(inode, iomi.pos, - iomi.len); - ret = -ENOTBLK; + ret = kiocb_invalidate_pages(iocb, iomi.len); + if (ret) { + if (ret != -EAGAIN) { + trace_iomap_dio_invalidate_fail(inode, iomi.pos, + iomi.len); + ret = -ENOTBLK; + } goto out_free_dio; } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 8ae419152ff6..fbce16fedaa4 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1491,7 +1491,6 @@ journal_t *jbd2_journal_init_inode(struct inode *inode) { journal_t *journal; sector_t blocknr; - char *p; int err = 0; blocknr = 0; @@ -1515,9 +1514,8 @@ journal_t *jbd2_journal_init_inode(struct inode *inode) journal->j_inode = inode; snprintf(journal->j_devname, sizeof(journal->j_devname), - "%pg", journal->j_dev); - p = strreplace(journal->j_devname, '/', '!'); - sprintf(p, "-%lu", journal->j_inode->i_ino); + "%pg-%lu", journal->j_dev, journal->j_inode->i_ino); + strreplace(journal->j_devname, '/', '!'); jbd2_stats_proc_init(journal); return journal; @@ -1559,8 +1557,21 @@ static int journal_reset(journal_t *journal) journal->j_first = first; journal->j_last = last; - journal->j_head = journal->j_first; - journal->j_tail = journal->j_first; + if (journal->j_head != 0 && journal->j_flags & JBD2_CYCLE_RECORD) { + /* + * Disable the cycled recording mode if the journal head block + * number is not correct. + */ + if (journal->j_head < first || journal->j_head >= last) { + printk(KERN_WARNING "JBD2: Incorrect Journal head block %lu, " + "disable journal_cycle_record\n", + journal->j_head); + journal->j_head = journal->j_first; + } + } else { + journal->j_head = journal->j_first; + } + journal->j_tail = journal->j_head; journal->j_free = journal->j_last - journal->j_first; journal->j_tail_sequence = journal->j_transaction_sequence; @@ -1732,6 +1743,7 @@ static void jbd2_mark_journal_empty(journal_t *journal, blk_opf_t write_flags) sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); sb->s_start = cpu_to_be32(0); + sb->s_head = cpu_to_be32(journal->j_head); if (jbd2_has_feature_fast_commit(journal)) { /* * When journal is clean, no need to commit fast commit flag and @@ -1905,6 +1917,9 @@ static int journal_get_superblock(journal_t *journal) bh = journal->j_sb_buffer; J_ASSERT(bh != NULL); + if (buffer_verified(bh)) + return 0; + err = bh_read(bh, 0); if (err < 0) { printk(KERN_ERR @@ -1912,9 +1927,6 @@ static int journal_get_superblock(journal_t *journal) goto out; } - if (buffer_verified(bh)) - return 0; - sb = journal->j_superblock; err = -EINVAL; @@ -1925,21 +1937,13 @@ static int journal_get_superblock(journal_t *journal) goto out; } - switch(be32_to_cpu(sb->s_header.h_blocktype)) { - case JBD2_SUPERBLOCK_V1: - journal->j_format_version = 1; - break; - case JBD2_SUPERBLOCK_V2: - journal->j_format_version = 2; - break; - default: + if (be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V1 && + be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V2) { printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n"); goto out; } - if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len) - journal->j_total_len = be32_to_cpu(sb->s_maxlen); - else if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) { + if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) { printk(KERN_WARNING "JBD2: journal file too short\n"); goto out; } @@ -1982,25 +1986,14 @@ static int journal_get_superblock(journal_t *journal) journal->j_chksum_driver = NULL; goto out; } - } - - if (jbd2_journal_has_csum_v2or3(journal)) { /* Check superblock checksum */ if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) { printk(KERN_ERR "JBD2: journal checksum error\n"); err = -EFSBADCRC; goto out; } - - /* Precompute checksum seed for all metadata */ - journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, - sizeof(sb->s_uuid)); } - - journal->j_revoke_records_per_block = - journal_revoke_records_per_block(journal); set_buffer_verified(bh); - return 0; out: @@ -2031,6 +2024,15 @@ static int load_superblock(journal_t *journal) journal->j_errno = be32_to_cpu(sb->s_errno); journal->j_last = be32_to_cpu(sb->s_maxlen); + if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len) + journal->j_total_len = be32_to_cpu(sb->s_maxlen); + /* Precompute checksum seed for all metadata */ + if (jbd2_journal_has_csum_v2or3(journal)) + journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, + sizeof(sb->s_uuid)); + journal->j_revoke_records_per_block = + journal_revoke_records_per_block(journal); + if (jbd2_has_feature_fast_commit(journal)) { journal->j_fc_last = be32_to_cpu(sb->s_maxlen); num_fc_blocks = jbd2_journal_get_num_fc_blks(sb); @@ -2062,10 +2064,12 @@ int jbd2_journal_load(journal_t *journal) return err; sb = journal->j_superblock; - /* If this is a V2 superblock, then we have to check the - * features flags on it. */ - if (journal->j_format_version >= 2) { + /* + * If this is a V2 superblock, then we have to check the + * features flags on it. + */ + if (jbd2_format_support_feature(journal)) { if ((sb->s_feature_ro_compat & ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) || (sb->s_feature_incompat & @@ -2223,11 +2227,9 @@ int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat, if (!compat && !ro && !incompat) return 1; - /* Load journal superblock if it is not loaded yet. */ - if (journal->j_format_version == 0 && - journal_get_superblock(journal) != 0) + if (journal_get_superblock(journal)) return 0; - if (journal->j_format_version == 1) + if (!jbd2_format_support_feature(journal)) return 0; sb = journal->j_superblock; @@ -2257,11 +2259,7 @@ int jbd2_journal_check_available_features(journal_t *journal, unsigned long comp if (!compat && !ro && !incompat) return 1; - /* We can support any known requested features iff the - * superblock is in version 2. Otherwise we fail to support any - * extended sb features. */ - - if (journal->j_format_version != 2) + if (!jbd2_format_support_feature(journal)) return 0; if ((compat & JBD2_KNOWN_COMPAT_FEATURES) == compat && diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 8286a9ec122f..0184931d47f7 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -29,6 +29,7 @@ struct recovery_info { tid_t start_transaction; tid_t end_transaction; + unsigned long head_block; int nr_replays; int nr_revokes; @@ -301,11 +302,11 @@ int jbd2_journal_recover(journal_t *journal) * is always zero if, and only if, the journal was cleanly * unmounted. */ - if (!sb->s_start) { - jbd2_debug(1, "No recovery required, last transaction %d\n", - be32_to_cpu(sb->s_sequence)); + jbd2_debug(1, "No recovery required, last transaction %d, head block %u\n", + be32_to_cpu(sb->s_sequence), be32_to_cpu(sb->s_head)); journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1; + journal->j_head = be32_to_cpu(sb->s_head); return 0; } @@ -324,6 +325,9 @@ int jbd2_journal_recover(journal_t *journal) /* Restart the log at the next transaction ID, thus invalidating * any existing commit records in the log. */ journal->j_transaction_sequence = ++info.end_transaction; + journal->j_head = info.head_block; + jbd2_debug(1, "JBD2: last transaction %d, head block %lu\n", + journal->j_transaction_sequence, journal->j_head); jbd2_journal_clear_revoke(journal); err2 = sync_blockdev(journal->j_fs_dev); @@ -364,6 +368,7 @@ int jbd2_journal_skip_recovery(journal_t *journal) if (err) { printk(KERN_ERR "JBD2: error %d scanning journal\n", err); ++journal->j_transaction_sequence; + journal->j_head = journal->j_first; } else { #ifdef CONFIG_JBD2_DEBUG int dropped = info.end_transaction - @@ -373,6 +378,7 @@ int jbd2_journal_skip_recovery(journal_t *journal) dropped, (dropped == 1) ? "" : "s"); #endif journal->j_transaction_sequence = ++info.end_transaction; + journal->j_head = info.head_block; } journal->j_tail = 0; @@ -462,7 +468,7 @@ static int do_one_pass(journal_t *journal, struct recovery_info *info, enum passtype pass) { unsigned int first_commit_ID, next_commit_ID; - unsigned long next_log_block; + unsigned long next_log_block, head_block; int err, success = 0; journal_superblock_t * sb; journal_header_t * tmp; @@ -485,6 +491,7 @@ static int do_one_pass(journal_t *journal, sb = journal->j_superblock; next_commit_ID = be32_to_cpu(sb->s_sequence); next_log_block = be32_to_cpu(sb->s_start); + head_block = next_log_block; first_commit_ID = next_commit_ID; if (pass == PASS_SCAN) @@ -809,6 +816,7 @@ static int do_one_pass(journal_t *journal, if (commit_time < last_trans_commit_time) goto ignore_crc_mismatch; info->end_transaction = next_commit_ID; + info->head_block = head_block; if (!jbd2_has_feature_async_commit(journal)) { journal->j_failed_commit = @@ -817,8 +825,10 @@ static int do_one_pass(journal_t *journal, break; } } - if (pass == PASS_SCAN) + if (pass == PASS_SCAN) { last_trans_commit_time = commit_time; + head_block = next_log_block; + } brelse(bh); next_commit_ID++; continue; @@ -868,6 +878,8 @@ static int do_one_pass(journal_t *journal, if (pass == PASS_SCAN) { if (!info->end_transaction) info->end_transaction = next_commit_ID; + if (!info->head_block) + info->head_block = head_block; } else { /* It's really bad news if different passes end up at * different places (but possible due to IO errors). */ diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c index 837cd55fd4c5..6ae9d6fefb86 100644 --- a/fs/jffs2/build.c +++ b/fs/jffs2/build.c @@ -211,7 +211,10 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c) ic->scan_dents = NULL; cond_resched(); } - jffs2_build_xattr_subsystem(c); + ret = jffs2_build_xattr_subsystem(c); + if (ret) + goto exit; + c->flags &= ~JFFS2_SB_FLAG_BUILDING; dbg_fsbuild("FS build complete\n"); diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 96b0275ce957..2345ca3f09ee 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -56,7 +56,7 @@ const struct file_operations jffs2_file_operations = .unlocked_ioctl=jffs2_ioctl, .mmap = generic_file_readonly_mmap, .fsync = jffs2_fsync, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, }; diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index aa4048a27f31..3b6bdc9a49e1 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -772,10 +772,10 @@ void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c) } #define XREF_TMPHASH_SIZE (128) -void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c) +int jffs2_build_xattr_subsystem(struct jffs2_sb_info *c) { struct jffs2_xattr_ref *ref, *_ref; - struct jffs2_xattr_ref *xref_tmphash[XREF_TMPHASH_SIZE]; + struct jffs2_xattr_ref **xref_tmphash; struct jffs2_xattr_datum *xd, *_xd; struct jffs2_inode_cache *ic; struct jffs2_raw_node_ref *raw; @@ -784,9 +784,12 @@ void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c) BUG_ON(!(c->flags & JFFS2_SB_FLAG_BUILDING)); + xref_tmphash = kcalloc(XREF_TMPHASH_SIZE, + sizeof(struct jffs2_xattr_ref *), GFP_KERNEL); + if (!xref_tmphash) + return -ENOMEM; + /* Phase.1 : Merge same xref */ - for (i=0; i < XREF_TMPHASH_SIZE; i++) - xref_tmphash[i] = NULL; for (ref=c->xref_temp; ref; ref=_ref) { struct jffs2_xattr_ref *tmp; @@ -884,6 +887,8 @@ void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c) "%u of xref (%u dead, %u orphan) found.\n", xdatum_count, xdatum_unchecked_count, xdatum_orphan_count, xref_count, xref_dead_count, xref_orphan_count); + kfree(xref_tmphash); + return 0; } struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c, diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h index 720007b2fd65..1b5030a3349d 100644 --- a/fs/jffs2/xattr.h +++ b/fs/jffs2/xattr.h @@ -71,7 +71,7 @@ static inline int is_xattr_ref_dead(struct jffs2_xattr_ref *ref) #ifdef CONFIG_JFFS2_FS_XATTR extern void jffs2_init_xattr_subsystem(struct jffs2_sb_info *c); -extern void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c); +extern int jffs2_build_xattr_subsystem(struct jffs2_sb_info *c); extern void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c); extern struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c, @@ -103,7 +103,7 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t); #else #define jffs2_init_xattr_subsystem(c) -#define jffs2_build_xattr_subsystem(c) +#define jffs2_build_xattr_subsystem(c) (0) #define jffs2_clear_xattr_subsystem(c) #define jffs2_xattr_do_crccheck_inode(c, ic) diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 2ee35be49de1..01b6912e60f8 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -144,7 +144,7 @@ const struct file_operations jfs_file_operations = { .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .fsync = jfs_fsync, .release = jfs_release, diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index a3eb1e826947..a14a0f18a4c4 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -178,7 +178,13 @@ int dbMount(struct inode *ipbmap) dbmp_le = (struct dbmap_disk *) mp->data; bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize); bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree); + bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage); + if (bmp->db_l2nbperpage > L2PSIZE - L2MINBLOCKSIZE) { + err = -EINVAL; + goto err_release_metapage; + } + bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag); if (!bmp->db_numag) { err = -EINVAL; @@ -1953,6 +1959,9 @@ dbAllocDmapLev(struct bmap * bmp, if (dbFindLeaf((dmtree_t *) & dp->tree, l2nb, &leafidx)) return -ENOSPC; + if (leafidx < 0) + return -EIO; + /* determine the block number within the file system corresponding * to the leaf at which free space was found. */ @@ -3851,7 +3860,7 @@ static int dbInitTree(struct dmaptree * dtp) l2max = le32_to_cpu(dtp->l2nleafs) + dtp->budmin; /* - * configure the leaf levevl into binary buddy system + * configure the leaf level into binary buddy system * * Try to combine buddies starting with a buddy size of 1 * (i.e. two leaves). At a buddy size of 1 two buddy leaves diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h index b5d702df7111..33ef13a0b110 100644 --- a/fs/jfs/jfs_filsys.h +++ b/fs/jfs/jfs_filsys.h @@ -122,7 +122,9 @@ #define NUM_INODE_PER_IAG INOSPERIAG #define MINBLOCKSIZE 512 +#define L2MINBLOCKSIZE 9 #define MAXBLOCKSIZE 4096 +#define L2MAXBLOCKSIZE 12 #define MAXFILESIZE ((s64)1 << 52) #define JFS_LINK_MAX 0xffffffff diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 695415cbfe98..e855b8fde76c 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1100,8 +1100,8 @@ int lmLogOpen(struct super_block *sb) * file systems to log may have n-to-1 relationship; */ - bdev = blkdev_get_by_dev(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, - log); + bdev = blkdev_get_by_dev(sbi->logdev, BLK_OPEN_READ | BLK_OPEN_WRITE, + log, NULL); if (IS_ERR(bdev)) { rc = PTR_ERR(bdev); goto free; @@ -1141,7 +1141,7 @@ journal_found: lbmLogShutdown(log); close: /* close external log device */ - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + blkdev_put(bdev, log); free: /* free log descriptor */ mutex_unlock(&jfs_log_mutex); @@ -1485,7 +1485,7 @@ int lmLogClose(struct super_block *sb) bdev = log->bdev; rc = lmLogShutdown(log); - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + blkdev_put(bdev, log); kfree(log); @@ -1974,7 +1974,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bio = bio_alloc(log->bdev, 1, REQ_OP_READ, GFP_NOFS); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); - bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); + __bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); bio->bi_end_io = lbmIODone; @@ -2115,7 +2115,7 @@ static void lbmStartIO(struct lbuf * bp) bio = bio_alloc(log->bdev, 1, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); - bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); + __bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); bio->bi_end_io = lbmIODone; diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index ffd4feece078..ce4b4760fcb1 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -354,6 +354,11 @@ tid_t txBegin(struct super_block *sb, int flag) jfs_info("txBegin: flag = 0x%x", flag); log = JFS_SBI(sb)->log; + if (!log) { + jfs_error(sb, "read-only filesystem\n"); + return 0; + } + TXN_LOCK(); INCREMENT(TxStat.txBegin); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index b29d68b5eec5..9b030297aa64 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -799,6 +799,11 @@ static int jfs_link(struct dentry *old_dentry, if (rc) goto out; + if (isReadOnly(ip)) { + jfs_error(ip->i_sb, "read-only filesystem\n"); + return -EROFS; + } + tid = txBegin(ip->i_sb, 0); mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT); @@ -876,7 +881,7 @@ static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip, tid_t tid; ino_t ino = 0; struct component_name dname; - int ssize; /* source pathname size */ + u32 ssize; /* source pathname size */ struct btstack btstack; struct inode *ip = d_inode(dentry); s64 xlen = 0; @@ -957,7 +962,7 @@ static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip, if (ssize > sizeof (JFS_IP(ip)->i_inline)) JFS_IP(ip)->mode2 &= ~INLINEEA; - jfs_info("jfs_symlink: fast symlink added ssize:%d name:%s ", + jfs_info("jfs_symlink: fast symlink added ssize:%u name:%s ", ssize, name); } /* @@ -987,7 +992,7 @@ static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip, ip->i_size = ssize - 1; while (ssize) { /* This is kind of silly since PATH_MAX == 4K */ - int copy_size = min(ssize, PSIZE); + u32 copy_size = min_t(u32, ssize, PSIZE); mp = get_metapage(ip, xaddr, PSIZE, 1); diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 45b6919903e6..5a1a4af9d3d2 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -655,7 +655,9 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, return kn; err_out3: + spin_lock(&kernfs_idr_lock); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); + spin_unlock(&kernfs_idr_lock); err_out2: kmem_cache_free(kernfs_node_cache, kn); err_out1: diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 40c4661f15b7..180906c36f51 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -1011,7 +1011,7 @@ const struct file_operations kernfs_file_fops = { .release = kernfs_fop_release, .poll = kernfs_fop_poll, .fsync = noop_fsync, - .splice_read = generic_file_splice_read, + .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, }; diff --git a/fs/libfs.c b/fs/libfs.c index 89cf614a3271..5b851315eeed 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1613,3 +1613,44 @@ u64 inode_query_iversion(struct inode *inode) return cur >> I_VERSION_QUERIED_SHIFT; } EXPORT_SYMBOL(inode_query_iversion); + +ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter, + ssize_t direct_written, ssize_t buffered_written) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + loff_t pos = iocb->ki_pos - buffered_written; + loff_t end = iocb->ki_pos - 1; + int err; + + /* + * If the buffered write fallback returned an error, we want to return + * the number of bytes which were written by direct I/O, or the error + * code if that was zero. + * + * Note that this differs from normal direct-io semantics, which will + * return -EFOO even if some bytes were written. + */ + if (unlikely(buffered_written < 0)) { + if (direct_written) + return direct_written; + return buffered_written; + } + + /* + * We need to ensure that the page cache pages are written to disk and + * invalidated to preserve the expected O_DIRECT semantics. + */ + err = filemap_write_and_wait_range(mapping, pos, end); + if (err < 0) { + /* + * We don't know how much we wrote, so just return the number of + * bytes which were direct-written + */ + if (direct_written) + return direct_written; + return err; + } + invalidate_mapping_pages(mapping, pos >> PAGE_SHIFT, end >> PAGE_SHIFT); + return direct_written + buffered_written; +} +EXPORT_SYMBOL_GPL(direct_write_fallback); diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index e3972aa3045a..5d85715be763 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -93,6 +93,12 @@ void nlmclnt_prepare_block(struct nlm_wait *block, struct nlm_host *host, struct block->b_status = nlm_lck_blocked; } +struct rpc_clnt *nlmclnt_rpc_clnt(struct nlm_host *host) +{ + return host->h_rpcclnt; +} +EXPORT_SYMBOL_GPL(nlmclnt_rpc_clnt); + /* * Queue up a lock for blocking so that the GRANTED request can see it */ diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 04ba95b83d16..22d3ff3818f5 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -355,7 +355,6 @@ static int lockd_get(void) int error; if (nlmsvc_serv) { - svc_get(nlmsvc_serv); nlmsvc_users++; return 0; } diff --git a/fs/minix/file.c b/fs/minix/file.c index 0dd05d47724a..906d192ab7f3 100644 --- a/fs/minix/file.c +++ b/fs/minix/file.c @@ -19,7 +19,7 @@ const struct file_operations minix_file_operations = { .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .fsync = generic_file_fsync, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, }; static int minix_setattr(struct mnt_idmap *idmap, diff --git a/fs/namei.c b/fs/namei.c index e4fe0879ae55..e56ff39a79bc 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3028,8 +3028,8 @@ static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2) return p; } - inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); - inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); + lock_two_inodes(p1->d_inode, p2->d_inode, + I_MUTEX_PARENT, I_MUTEX_PARENT2); return NULL; } @@ -3703,7 +3703,7 @@ static int vfs_tmpfile(struct mnt_idmap *idmap, } /** - * vfs_tmpfile_open - open a tmpfile for kernel internal use + * kernel_tmpfile_open - open a tmpfile for kernel internal use * @idmap: idmap of the mount the inode was found from * @parentpath: path of the base directory * @mode: mode of the new tmpfile @@ -3714,24 +3714,26 @@ static int vfs_tmpfile(struct mnt_idmap *idmap, * hence this is only for kernel internal use, and must not be installed into * file tables or such. */ -struct file *vfs_tmpfile_open(struct mnt_idmap *idmap, - const struct path *parentpath, - umode_t mode, int open_flag, const struct cred *cred) +struct file *kernel_tmpfile_open(struct mnt_idmap *idmap, + const struct path *parentpath, + umode_t mode, int open_flag, + const struct cred *cred) { struct file *file; int error; file = alloc_empty_file_noaccount(open_flag, cred); - if (!IS_ERR(file)) { - error = vfs_tmpfile(idmap, parentpath, file, mode); - if (error) { - fput(file); - file = ERR_PTR(error); - } + if (IS_ERR(file)) + return file; + + error = vfs_tmpfile(idmap, parentpath, file, mode); + if (error) { + fput(file); + file = ERR_PTR(error); } return file; } -EXPORT_SYMBOL(vfs_tmpfile_open); +EXPORT_SYMBOL(kernel_tmpfile_open); static int do_tmpfile(struct nameidata *nd, unsigned flags, const struct open_flags *op, @@ -4731,7 +4733,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * sb->s_vfs_rename_mutex. We might be more accurate, but that's another * story. * c) we have to lock _four_ objects - parents and victim (if it exists), - * and source (if it is not a directory). + * and source. * And that - after we got ->i_mutex on parents (until then we don't know * whether the target exists). Solution: try to be smart with locking * order for inodes. We rely on the fact that tree topology may change @@ -4815,10 +4817,16 @@ int vfs_rename(struct renamedata *rd) take_dentry_name_snapshot(&old_name, old_dentry); dget(new_dentry); - if (!is_dir || (flags & RENAME_EXCHANGE)) - lock_two_nondirectories(source, target); - else if (target) - inode_lock(target); + /* + * Lock all moved children. Moved directories may need to change parent + * pointer so they need the lock to prevent against concurrent + * directory changes moving parent pointer. For regular files we've + * historically always done this. The lockdep locking subclasses are + * somewhat arbitrary but RENAME_EXCHANGE in particular can swap + * regular files and directories so it's difficult to tell which + * subclasses to use. + */ + lock_two_inodes(source, target, I_MUTEX_NORMAL, I_MUTEX_NONDIR2); error = -EPERM; if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target))) @@ -4866,9 +4874,8 @@ int vfs_rename(struct renamedata *rd) d_exchange(old_dentry, new_dentry); } out: - if (!is_dir || (flags & RENAME_EXCHANGE)) - unlock_two_nondirectories(source, target); - else if (target) + inode_unlock(source); + if (target) inode_unlock(target); dput(new_dentry); if (!error) { diff --git a/fs/namespace.c b/fs/namespace.c index 54847db5b819..e157efc54023 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -309,9 +309,16 @@ static unsigned int mnt_get_writers(struct mount *mnt) static int mnt_is_readonly(struct vfsmount *mnt) { - if (mnt->mnt_sb->s_readonly_remount) + if (READ_ONCE(mnt->mnt_sb->s_readonly_remount)) return 1; - /* Order wrt setting s_flags/s_readonly_remount in do_remount() */ + /* + * The barrier pairs with the barrier in sb_start_ro_state_change() + * making sure if we don't see s_readonly_remount set yet, we also will + * not see any superblock / mount flag changes done by remount. + * It also pairs with the barrier in sb_end_ro_state_change() + * assuring that if we see s_readonly_remount already cleared, we will + * see the values of superblock / mount flags updated by remount. + */ smp_rmb(); return __mnt_is_readonly(mnt); } @@ -364,9 +371,11 @@ int __mnt_want_write(struct vfsmount *m) } } /* - * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will - * be set to match its requirements. So we must not load that until - * MNT_WRITE_HOLD is cleared. + * The barrier pairs with the barrier sb_start_ro_state_change() making + * sure that if we see MNT_WRITE_HOLD cleared, we will also see + * s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in + * mnt_is_readonly() and bail in case we are racing with remount + * read-only. */ smp_rmb(); if (mnt_is_readonly(m)) { @@ -588,10 +597,8 @@ int sb_prepare_remount_readonly(struct super_block *sb) if (!err && atomic_long_read(&sb->s_remove_count)) err = -EBUSY; - if (!err) { - sb->s_readonly_remount = 1; - smp_wmb(); - } + if (!err) + sb_start_ro_state_change(sb); list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; @@ -658,9 +665,25 @@ static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) return false; } -/* - * find the first mount at @dentry on vfsmount @mnt. - * call under rcu_read_lock() +/** + * __lookup_mnt - find first child mount + * @mnt: parent mount + * @dentry: mountpoint + * + * If @mnt has a child mount @c mounted @dentry find and return it. + * + * Note that the child mount @c need not be unique. There are cases + * where shadow mounts are created. For example, during mount + * propagation when a source mount @mnt whose root got overmounted by a + * mount @o after path lookup but before @namespace_sem could be + * acquired gets copied and propagated. So @mnt gets copied including + * @o. When @mnt is propagated to a destination mount @d that already + * has another mount @n mounted at the same mountpoint then the source + * mount @mnt will be tucked beneath @n, i.e., @n will be mounted on + * @mnt and @mnt mounted on @d. Now both @n and @o are mounted at @mnt + * on @dentry. + * + * Return: The first child of @mnt mounted @dentry or NULL. */ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) { @@ -910,6 +933,33 @@ void mnt_set_mountpoint(struct mount *mnt, hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); } +/** + * mnt_set_mountpoint_beneath - mount a mount beneath another one + * + * @new_parent: the source mount + * @top_mnt: the mount beneath which @new_parent is mounted + * @new_mp: the new mountpoint of @top_mnt on @new_parent + * + * Remove @top_mnt from its current mountpoint @top_mnt->mnt_mp and + * parent @top_mnt->mnt_parent and mount it on top of @new_parent at + * @new_mp. And mount @new_parent on the old parent and old + * mountpoint of @top_mnt. + * + * Context: This function expects namespace_lock() and lock_mount_hash() + * to have been acquired in that order. + */ +static void mnt_set_mountpoint_beneath(struct mount *new_parent, + struct mount *top_mnt, + struct mountpoint *new_mp) +{ + struct mount *old_top_parent = top_mnt->mnt_parent; + struct mountpoint *old_top_mp = top_mnt->mnt_mp; + + mnt_set_mountpoint(old_top_parent, old_top_mp, new_parent); + mnt_change_mountpoint(new_parent, new_mp, top_mnt); +} + + static void __attach_mnt(struct mount *mnt, struct mount *parent) { hlist_add_head_rcu(&mnt->mnt_hash, @@ -917,15 +967,42 @@ static void __attach_mnt(struct mount *mnt, struct mount *parent) list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); } -/* - * vfsmount lock must be held for write +/** + * attach_mnt - mount a mount, attach to @mount_hashtable and parent's + * list of child mounts + * @parent: the parent + * @mnt: the new mount + * @mp: the new mountpoint + * @beneath: whether to mount @mnt beneath or on top of @parent + * + * If @beneath is false, mount @mnt at @mp on @parent. Then attach @mnt + * to @parent's child mount list and to @mount_hashtable. + * + * If @beneath is true, remove @mnt from its current parent and + * mountpoint and mount it on @mp on @parent, and mount @parent on the + * old parent and old mountpoint of @mnt. Finally, attach @parent to + * @mnt_hashtable and @parent->mnt_parent->mnt_mounts. + * + * Note, when __attach_mnt() is called @mnt->mnt_parent already points + * to the correct parent. + * + * Context: This function expects namespace_lock() and lock_mount_hash() + * to have been acquired in that order. */ -static void attach_mnt(struct mount *mnt, - struct mount *parent, - struct mountpoint *mp) +static void attach_mnt(struct mount *mnt, struct mount *parent, + struct mountpoint *mp, bool beneath) { - mnt_set_mountpoint(parent, mp, mnt); - __attach_mnt(mnt, parent); + if (beneath) + mnt_set_mountpoint_beneath(mnt, parent, mp); + else + mnt_set_mountpoint(parent, mp, mnt); + /* + * Note, @mnt->mnt_parent has to be used. If @mnt was mounted + * beneath @parent then @mnt will need to be attached to + * @parent's old parent, not @parent. IOW, @mnt->mnt_parent + * isn't the same mount as @parent. + */ + __attach_mnt(mnt, mnt->mnt_parent); } void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt) @@ -937,7 +1014,7 @@ void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct m hlist_del_init(&mnt->mnt_mp_list); hlist_del_init_rcu(&mnt->mnt_hash); - attach_mnt(mnt, parent, mp); + attach_mnt(mnt, parent, mp, false); put_mountpoint(old_mp); mnt_add_count(old_parent, -1); @@ -1767,6 +1844,19 @@ bool may_mount(void) return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); } +/** + * path_mounted - check whether path is mounted + * @path: path to check + * + * Determine whether @path refers to the root of a mount. + * + * Return: true if @path is the root of a mount, false if not. + */ +static inline bool path_mounted(const struct path *path) +{ + return path->mnt->mnt_root == path->dentry; +} + static void warn_mandlock(void) { pr_warn_once("=======================================================\n" @@ -1782,7 +1872,7 @@ static int can_umount(const struct path *path, int flags) if (!may_mount()) return -EPERM; - if (path->dentry != path->mnt->mnt_root) + if (!path_mounted(path)) return -EINVAL; if (!check_mnt(mnt)) return -EINVAL; @@ -1925,7 +2015,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, goto out; lock_mount_hash(); list_add_tail(&q->mnt_list, &res->mnt_list); - attach_mnt(q, parent, p->mnt_mp); + attach_mnt(q, parent, p->mnt_mp, false); unlock_mount_hash(); } } @@ -2134,12 +2224,17 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt) return 0; } -/* - * @source_mnt : mount tree to be attached - * @nd : place the mount tree @source_mnt is attached - * @parent_nd : if non-null, detach the source_mnt from its parent and - * store the parent mount and mountpoint dentry. - * (done when source_mnt is moved) +enum mnt_tree_flags_t { + MNT_TREE_MOVE = BIT(0), + MNT_TREE_BENEATH = BIT(1), +}; + +/** + * attach_recursive_mnt - attach a source mount tree + * @source_mnt: mount tree to be attached + * @top_mnt: mount that @source_mnt will be mounted on or mounted beneath + * @dest_mp: the mountpoint @source_mnt will be mounted at + * @flags: modify how @source_mnt is supposed to be attached * * NOTE: in the table below explains the semantics when a source mount * of a given type is attached to a destination mount of a given type. @@ -2196,22 +2291,28 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt) * applied to each mount in the tree. * Must be called without spinlocks held, since this function can sleep * in allocations. + * + * Context: The function expects namespace_lock() to be held. + * Return: If @source_mnt was successfully attached 0 is returned. + * Otherwise a negative error code is returned. */ static int attach_recursive_mnt(struct mount *source_mnt, - struct mount *dest_mnt, - struct mountpoint *dest_mp, - bool moving) + struct mount *top_mnt, + struct mountpoint *dest_mp, + enum mnt_tree_flags_t flags) { struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; HLIST_HEAD(tree_list); - struct mnt_namespace *ns = dest_mnt->mnt_ns; + struct mnt_namespace *ns = top_mnt->mnt_ns; struct mountpoint *smp; - struct mount *child, *p; + struct mount *child, *dest_mnt, *p; struct hlist_node *n; - int err; + int err = 0; + bool moving = flags & MNT_TREE_MOVE, beneath = flags & MNT_TREE_BENEATH; - /* Preallocate a mountpoint in case the new mounts need - * to be tucked under other mounts. + /* + * Preallocate a mountpoint in case the new mounts need to be + * mounted beneath mounts on the same mountpoint. */ smp = get_mountpoint(source_mnt->mnt.mnt_root); if (IS_ERR(smp)) @@ -2224,29 +2325,41 @@ static int attach_recursive_mnt(struct mount *source_mnt, goto out; } + if (beneath) + dest_mnt = top_mnt->mnt_parent; + else + dest_mnt = top_mnt; + if (IS_MNT_SHARED(dest_mnt)) { err = invent_group_ids(source_mnt, true); if (err) goto out; err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); - lock_mount_hash(); - if (err) - goto out_cleanup_ids; + } + lock_mount_hash(); + if (err) + goto out_cleanup_ids; + + if (IS_MNT_SHARED(dest_mnt)) { for (p = source_mnt; p; p = next_mnt(p, source_mnt)) set_mnt_shared(p); - } else { - lock_mount_hash(); } + if (moving) { + if (beneath) + dest_mp = smp; unhash_mnt(source_mnt); - attach_mnt(source_mnt, dest_mnt, dest_mp); + attach_mnt(source_mnt, top_mnt, dest_mp, beneath); touch_mnt_namespace(source_mnt->mnt_ns); } else { if (source_mnt->mnt_ns) { /* move from anon - the caller will destroy */ list_del_init(&source_mnt->mnt_ns->list); } - mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); + if (beneath) + mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp); + else + mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); commit_tree(source_mnt); } @@ -2286,33 +2399,101 @@ static int attach_recursive_mnt(struct mount *source_mnt, return err; } -static struct mountpoint *lock_mount(struct path *path) +/** + * do_lock_mount - lock mount and mountpoint + * @path: target path + * @beneath: whether the intention is to mount beneath @path + * + * Follow the mount stack on @path until the top mount @mnt is found. If + * the initial @path->{mnt,dentry} is a mountpoint lookup the first + * mount stacked on top of it. Then simply follow @{mnt,mnt->mnt_root} + * until nothing is stacked on top of it anymore. + * + * Acquire the inode_lock() on the top mount's ->mnt_root to protect + * against concurrent removal of the new mountpoint from another mount + * namespace. + * + * If @beneath is requested, acquire inode_lock() on @mnt's mountpoint + * @mp on @mnt->mnt_parent must be acquired. This protects against a + * concurrent unlink of @mp->mnt_dentry from another mount namespace + * where @mnt doesn't have a child mount mounted @mp. A concurrent + * removal of @mnt->mnt_root doesn't matter as nothing will be mounted + * on top of it for @beneath. + * + * In addition, @beneath needs to make sure that @mnt hasn't been + * unmounted or moved from its current mountpoint in between dropping + * @mount_lock and acquiring @namespace_sem. For the !@beneath case @mnt + * being unmounted would be detected later by e.g., calling + * check_mnt(mnt) in the function it's called from. For the @beneath + * case however, it's useful to detect it directly in do_lock_mount(). + * If @mnt hasn't been unmounted then @mnt->mnt_mountpoint still points + * to @mnt->mnt_mp->m_dentry. But if @mnt has been unmounted it will + * point to @mnt->mnt_root and @mnt->mnt_mp will be NULL. + * + * Return: Either the target mountpoint on the top mount or the top + * mount's mountpoint. + */ +static struct mountpoint *do_lock_mount(struct path *path, bool beneath) { - struct vfsmount *mnt; - struct dentry *dentry = path->dentry; -retry: - inode_lock(dentry->d_inode); - if (unlikely(cant_mount(dentry))) { - inode_unlock(dentry->d_inode); - return ERR_PTR(-ENOENT); - } - namespace_lock(); - mnt = lookup_mnt(path); - if (likely(!mnt)) { - struct mountpoint *mp = get_mountpoint(dentry); - if (IS_ERR(mp)) { + struct vfsmount *mnt = path->mnt; + struct dentry *dentry; + struct mountpoint *mp = ERR_PTR(-ENOENT); + + for (;;) { + struct mount *m; + + if (beneath) { + m = real_mount(mnt); + read_seqlock_excl(&mount_lock); + dentry = dget(m->mnt_mountpoint); + read_sequnlock_excl(&mount_lock); + } else { + dentry = path->dentry; + } + + inode_lock(dentry->d_inode); + if (unlikely(cant_mount(dentry))) { + inode_unlock(dentry->d_inode); + goto out; + } + + namespace_lock(); + + if (beneath && (!is_mounted(mnt) || m->mnt_mountpoint != dentry)) { namespace_unlock(); inode_unlock(dentry->d_inode); - return mp; + goto out; } - return mp; + + mnt = lookup_mnt(path); + if (likely(!mnt)) + break; + + namespace_unlock(); + inode_unlock(dentry->d_inode); + if (beneath) + dput(dentry); + path_put(path); + path->mnt = mnt; + path->dentry = dget(mnt->mnt_root); } - namespace_unlock(); - inode_unlock(path->dentry->d_inode); - path_put(path); - path->mnt = mnt; - dentry = path->dentry = dget(mnt->mnt_root); - goto retry; + + mp = get_mountpoint(dentry); + if (IS_ERR(mp)) { + namespace_unlock(); + inode_unlock(dentry->d_inode); + } + +out: + if (beneath) + dput(dentry); + + return mp; +} + +static inline struct mountpoint *lock_mount(struct path *path) +{ + return do_lock_mount(path, false); } static void unlock_mount(struct mountpoint *where) @@ -2336,7 +2517,7 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) d_is_dir(mnt->mnt.mnt_root)) return -ENOTDIR; - return attach_recursive_mnt(mnt, p, mp, false); + return attach_recursive_mnt(mnt, p, mp, 0); } /* @@ -2367,7 +2548,7 @@ static int do_change_type(struct path *path, int ms_flags) int type; int err = 0; - if (path->dentry != path->mnt->mnt_root) + if (!path_mounted(path)) return -EINVAL; type = flags_to_propagation_type(ms_flags); @@ -2643,7 +2824,7 @@ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags) if (!check_mnt(mnt)) return -EINVAL; - if (path->dentry != mnt->mnt.mnt_root) + if (!path_mounted(path)) return -EINVAL; if (!can_change_locked_flags(mnt, mnt_flags)) @@ -2682,7 +2863,7 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags, if (!check_mnt(mnt)) return -EINVAL; - if (path->dentry != path->mnt->mnt_root) + if (!path_mounted(path)) return -EINVAL; if (!can_change_locked_flags(mnt, mnt_flags)) @@ -2772,9 +2953,9 @@ static int do_set_group(struct path *from_path, struct path *to_path) err = -EINVAL; /* To and From paths should be mount roots */ - if (from_path->dentry != from_path->mnt->mnt_root) + if (!path_mounted(from_path)) goto out; - if (to_path->dentry != to_path->mnt->mnt_root) + if (!path_mounted(to_path)) goto out; /* Setting sharing groups is only allowed across same superblock */ @@ -2818,7 +2999,110 @@ out: return err; } -static int do_move_mount(struct path *old_path, struct path *new_path) +/** + * path_overmounted - check if path is overmounted + * @path: path to check + * + * Check if path is overmounted, i.e., if there's a mount on top of + * @path->mnt with @path->dentry as mountpoint. + * + * Context: This function expects namespace_lock() to be held. + * Return: If path is overmounted true is returned, false if not. + */ +static inline bool path_overmounted(const struct path *path) +{ + rcu_read_lock(); + if (unlikely(__lookup_mnt(path->mnt, path->dentry))) { + rcu_read_unlock(); + return true; + } + rcu_read_unlock(); + return false; +} + +/** + * can_move_mount_beneath - check that we can mount beneath the top mount + * @from: mount to mount beneath + * @to: mount under which to mount + * + * - Make sure that @to->dentry is actually the root of a mount under + * which we can mount another mount. + * - Make sure that nothing can be mounted beneath the caller's current + * root or the rootfs of the namespace. + * - Make sure that the caller can unmount the topmost mount ensuring + * that the caller could reveal the underlying mountpoint. + * - Ensure that nothing has been mounted on top of @from before we + * grabbed @namespace_sem to avoid creating pointless shadow mounts. + * - Prevent mounting beneath a mount if the propagation relationship + * between the source mount, parent mount, and top mount would lead to + * nonsensical mount trees. + * + * Context: This function expects namespace_lock() to be held. + * Return: On success 0, and on error a negative error code is returned. + */ +static int can_move_mount_beneath(const struct path *from, + const struct path *to, + const struct mountpoint *mp) +{ + struct mount *mnt_from = real_mount(from->mnt), + *mnt_to = real_mount(to->mnt), + *parent_mnt_to = mnt_to->mnt_parent; + + if (!mnt_has_parent(mnt_to)) + return -EINVAL; + + if (!path_mounted(to)) + return -EINVAL; + + if (IS_MNT_LOCKED(mnt_to)) + return -EINVAL; + + /* Avoid creating shadow mounts during mount propagation. */ + if (path_overmounted(from)) + return -EINVAL; + + /* + * Mounting beneath the rootfs only makes sense when the + * semantics of pivot_root(".", ".") are used. + */ + if (&mnt_to->mnt == current->fs->root.mnt) + return -EINVAL; + if (parent_mnt_to == current->nsproxy->mnt_ns->root) + return -EINVAL; + + for (struct mount *p = mnt_from; mnt_has_parent(p); p = p->mnt_parent) + if (p == mnt_to) + return -EINVAL; + + /* + * If the parent mount propagates to the child mount this would + * mean mounting @mnt_from on @mnt_to->mnt_parent and then + * propagating a copy @c of @mnt_from on top of @mnt_to. This + * defeats the whole purpose of mounting beneath another mount. + */ + if (propagation_would_overmount(parent_mnt_to, mnt_to, mp)) + return -EINVAL; + + /* + * If @mnt_to->mnt_parent propagates to @mnt_from this would + * mean propagating a copy @c of @mnt_from on top of @mnt_from. + * Afterwards @mnt_from would be mounted on top of + * @mnt_to->mnt_parent and @mnt_to would be unmounted from + * @mnt->mnt_parent and remounted on @mnt_from. But since @c is + * already mounted on @mnt_from, @mnt_to would ultimately be + * remounted on top of @c. Afterwards, @mnt_from would be + * covered by a copy @c of @mnt_from and @c would be covered by + * @mnt_from itself. This defeats the whole purpose of mounting + * @mnt_from beneath @mnt_to. + */ + if (propagation_would_overmount(parent_mnt_to, mnt_from, mp)) + return -EINVAL; + + return 0; +} + +static int do_move_mount(struct path *old_path, struct path *new_path, + bool beneath) { struct mnt_namespace *ns; struct mount *p; @@ -2827,8 +3111,9 @@ static int do_move_mount(struct path *old_path, struct path *new_path) struct mountpoint *mp, *old_mp; int err; bool attached; + enum mnt_tree_flags_t flags = 0; - mp = lock_mount(new_path); + mp = do_lock_mount(new_path, beneath); if (IS_ERR(mp)) return PTR_ERR(mp); @@ -2836,6 +3121,8 @@ static int do_move_mount(struct path *old_path, struct path *new_path) p = real_mount(new_path->mnt); parent = old->mnt_parent; attached = mnt_has_parent(old); + if (attached) + flags |= MNT_TREE_MOVE; old_mp = old->mnt_mp; ns = old->mnt_ns; @@ -2855,7 +3142,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path) if (old->mnt.mnt_flags & MNT_LOCKED) goto out; - if (old_path->dentry != old_path->mnt->mnt_root) + if (!path_mounted(old_path)) goto out; if (d_is_dir(new_path->dentry) != @@ -2866,6 +3153,17 @@ static int do_move_mount(struct path *old_path, struct path *new_path) */ if (attached && IS_MNT_SHARED(parent)) goto out; + + if (beneath) { + err = can_move_mount_beneath(old_path, new_path, mp); + if (err) + goto out; + + err = -EINVAL; + p = p->mnt_parent; + flags |= MNT_TREE_BENEATH; + } + /* * Don't move a mount tree containing unbindable mounts to a destination * mount which is shared. @@ -2879,8 +3177,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path) if (p == old) goto out; - err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, - attached); + err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, flags); if (err) goto out; @@ -2912,7 +3209,7 @@ static int do_move_mount_old(struct path *path, const char *old_name) if (err) return err; - err = do_move_mount(&old_path, path); + err = do_move_mount(&old_path, path, false); path_put(&old_path); return err; } @@ -2937,8 +3234,7 @@ static int do_add_mount(struct mount *newmnt, struct mountpoint *mp, } /* Refuse the same filesystem on the same mount point */ - if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && - path->mnt->mnt_root == path->dentry) + if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path_mounted(path)) return -EBUSY; if (d_is_symlink(newmnt->mnt.mnt_root)) @@ -3079,13 +3375,10 @@ int finish_automount(struct vfsmount *m, const struct path *path) err = -ENOENT; goto discard_locked; } - rcu_read_lock(); - if (unlikely(__lookup_mnt(path->mnt, dentry))) { - rcu_read_unlock(); + if (path_overmounted(path)) { err = 0; goto discard_locked; } - rcu_read_unlock(); mp = get_mountpoint(dentry); if (IS_ERR(mp)) { err = PTR_ERR(mp); @@ -3777,6 +4070,10 @@ SYSCALL_DEFINE5(move_mount, if (flags & ~MOVE_MOUNT__MASK) return -EINVAL; + if ((flags & (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) == + (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) + return -EINVAL; + /* If someone gives a pathname, they aren't permitted to move * from an fd that requires unmount as we can't get at the flag * to clear it afterwards. @@ -3806,7 +4103,8 @@ SYSCALL_DEFINE5(move_mount, if (flags & MOVE_MOUNT_SET_GROUP) ret = do_set_group(&from_path, &to_path); else - ret = do_move_mount(&from_path, &to_path); + ret = do_move_mount(&from_path, &to_path, + (flags & MOVE_MOUNT_BENEATH)); out_to: path_put(&to_path); @@ -3917,11 +4215,11 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, if (new_mnt == root_mnt || old_mnt == root_mnt) goto out4; /* loop, on the same file system */ error = -EINVAL; - if (root.mnt->mnt_root != root.dentry) + if (!path_mounted(&root)) goto out4; /* not a mountpoint */ if (!mnt_has_parent(root_mnt)) goto out4; /* not attached */ - if (new.mnt->mnt_root != new.dentry) + if (!path_mounted(&new)) goto out4; /* not a mountpoint */ if (!mnt_has_parent(new_mnt)) goto out4; /* not attached */ @@ -3939,9 +4237,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, root_mnt->mnt.mnt_flags &= ~MNT_LOCKED; } /* mount old root on put_old */ - attach_mnt(root_mnt, old_mnt, old_mp); + attach_mnt(root_mnt, old_mnt, old_mp, false); /* mount new_root on / */ - attach_mnt(new_mnt, root_parent, root_mp); + attach_mnt(new_mnt, root_parent, root_mp, false); mnt_add_count(root_parent, -1); touch_mnt_namespace(current->nsproxy->mnt_ns); /* A moved mount should not expire automatically */ @@ -4124,7 +4422,7 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) struct mount *mnt = real_mount(path->mnt); int err = 0; - if (path->dentry != mnt->mnt.mnt_root) + if (!path_mounted(path)) return -EINVAL; if (kattr->mnt_userns) { diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c index 8a4c86687429..2ff07ba655a0 100644 --- a/fs/netfs/iterator.c +++ b/fs/netfs/iterator.c @@ -101,269 +101,3 @@ ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len, return npages; } EXPORT_SYMBOL_GPL(netfs_extract_user_iter); - -/* - * Extract and pin a list of up to sg_max pages from UBUF- or IOVEC-class - * iterators, and add them to the scatterlist. - */ -static ssize_t netfs_extract_user_to_sg(struct iov_iter *iter, - ssize_t maxsize, - struct sg_table *sgtable, - unsigned int sg_max, - iov_iter_extraction_t extraction_flags) -{ - struct scatterlist *sg = sgtable->sgl + sgtable->nents; - struct page **pages; - unsigned int npages; - ssize_t ret = 0, res; - size_t len, off; - - /* We decant the page list into the tail of the scatterlist */ - pages = (void *)sgtable->sgl + array_size(sg_max, sizeof(struct scatterlist)); - pages -= sg_max; - - do { - res = iov_iter_extract_pages(iter, &pages, maxsize, sg_max, - extraction_flags, &off); - if (res < 0) - goto failed; - - len = res; - maxsize -= len; - ret += len; - npages = DIV_ROUND_UP(off + len, PAGE_SIZE); - sg_max -= npages; - - for (; npages > 0; npages--) { - struct page *page = *pages; - size_t seg = min_t(size_t, PAGE_SIZE - off, len); - - *pages++ = NULL; - sg_set_page(sg, page, seg, off); - sgtable->nents++; - sg++; - len -= seg; - off = 0; - } - } while (maxsize > 0 && sg_max > 0); - - return ret; - -failed: - while (sgtable->nents > sgtable->orig_nents) - put_page(sg_page(&sgtable->sgl[--sgtable->nents])); - return res; -} - -/* - * Extract up to sg_max pages from a BVEC-type iterator and add them to the - * scatterlist. The pages are not pinned. - */ -static ssize_t netfs_extract_bvec_to_sg(struct iov_iter *iter, - ssize_t maxsize, - struct sg_table *sgtable, - unsigned int sg_max, - iov_iter_extraction_t extraction_flags) -{ - const struct bio_vec *bv = iter->bvec; - struct scatterlist *sg = sgtable->sgl + sgtable->nents; - unsigned long start = iter->iov_offset; - unsigned int i; - ssize_t ret = 0; - - for (i = 0; i < iter->nr_segs; i++) { - size_t off, len; - - len = bv[i].bv_len; - if (start >= len) { - start -= len; - continue; - } - - len = min_t(size_t, maxsize, len - start); - off = bv[i].bv_offset + start; - - sg_set_page(sg, bv[i].bv_page, len, off); - sgtable->nents++; - sg++; - sg_max--; - - ret += len; - maxsize -= len; - if (maxsize <= 0 || sg_max == 0) - break; - start = 0; - } - - if (ret > 0) - iov_iter_advance(iter, ret); - return ret; -} - -/* - * Extract up to sg_max pages from a KVEC-type iterator and add them to the - * scatterlist. This can deal with vmalloc'd buffers as well as kmalloc'd or - * static buffers. The pages are not pinned. - */ -static ssize_t netfs_extract_kvec_to_sg(struct iov_iter *iter, - ssize_t maxsize, - struct sg_table *sgtable, - unsigned int sg_max, - iov_iter_extraction_t extraction_flags) -{ - const struct kvec *kv = iter->kvec; - struct scatterlist *sg = sgtable->sgl + sgtable->nents; - unsigned long start = iter->iov_offset; - unsigned int i; - ssize_t ret = 0; - - for (i = 0; i < iter->nr_segs; i++) { - struct page *page; - unsigned long kaddr; - size_t off, len, seg; - - len = kv[i].iov_len; - if (start >= len) { - start -= len; - continue; - } - - kaddr = (unsigned long)kv[i].iov_base + start; - off = kaddr & ~PAGE_MASK; - len = min_t(size_t, maxsize, len - start); - kaddr &= PAGE_MASK; - - maxsize -= len; - ret += len; - do { - seg = min_t(size_t, len, PAGE_SIZE - off); - if (is_vmalloc_or_module_addr((void *)kaddr)) - page = vmalloc_to_page((void *)kaddr); - else - page = virt_to_page(kaddr); - - sg_set_page(sg, page, len, off); - sgtable->nents++; - sg++; - sg_max--; - - len -= seg; - kaddr += PAGE_SIZE; - off = 0; - } while (len > 0 && sg_max > 0); - - if (maxsize <= 0 || sg_max == 0) - break; - start = 0; - } - - if (ret > 0) - iov_iter_advance(iter, ret); - return ret; -} - -/* - * Extract up to sg_max folios from an XARRAY-type iterator and add them to - * the scatterlist. The pages are not pinned. - */ -static ssize_t netfs_extract_xarray_to_sg(struct iov_iter *iter, - ssize_t maxsize, - struct sg_table *sgtable, - unsigned int sg_max, - iov_iter_extraction_t extraction_flags) -{ - struct scatterlist *sg = sgtable->sgl + sgtable->nents; - struct xarray *xa = iter->xarray; - struct folio *folio; - loff_t start = iter->xarray_start + iter->iov_offset; - pgoff_t index = start / PAGE_SIZE; - ssize_t ret = 0; - size_t offset, len; - XA_STATE(xas, xa, index); - - rcu_read_lock(); - - xas_for_each(&xas, folio, ULONG_MAX) { - if (xas_retry(&xas, folio)) - continue; - if (WARN_ON(xa_is_value(folio))) - break; - if (WARN_ON(folio_test_hugetlb(folio))) - break; - - offset = offset_in_folio(folio, start); - len = min_t(size_t, maxsize, folio_size(folio) - offset); - - sg_set_page(sg, folio_page(folio, 0), len, offset); - sgtable->nents++; - sg++; - sg_max--; - - maxsize -= len; - ret += len; - if (maxsize <= 0 || sg_max == 0) - break; - } - - rcu_read_unlock(); - if (ret > 0) - iov_iter_advance(iter, ret); - return ret; -} - -/** - * netfs_extract_iter_to_sg - Extract pages from an iterator and add ot an sglist - * @iter: The iterator to extract from - * @maxsize: The amount of iterator to copy - * @sgtable: The scatterlist table to fill in - * @sg_max: Maximum number of elements in @sgtable that may be filled - * @extraction_flags: Flags to qualify the request - * - * Extract the page fragments from the given amount of the source iterator and - * add them to a scatterlist that refers to all of those bits, to a maximum - * addition of @sg_max elements. - * - * The pages referred to by UBUF- and IOVEC-type iterators are extracted and - * pinned; BVEC-, KVEC- and XARRAY-type are extracted but aren't pinned; PIPE- - * and DISCARD-type are not supported. - * - * No end mark is placed on the scatterlist; that's left to the caller. - * - * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA - * be allowed on the pages extracted. - * - * If successul, @sgtable->nents is updated to include the number of elements - * added and the number of bytes added is returned. @sgtable->orig_nents is - * left unaltered. - * - * The iov_iter_extract_mode() function should be used to query how cleanup - * should be performed. - */ -ssize_t netfs_extract_iter_to_sg(struct iov_iter *iter, size_t maxsize, - struct sg_table *sgtable, unsigned int sg_max, - iov_iter_extraction_t extraction_flags) -{ - if (maxsize == 0) - return 0; - - switch (iov_iter_type(iter)) { - case ITER_UBUF: - case ITER_IOVEC: - return netfs_extract_user_to_sg(iter, maxsize, sgtable, sg_max, - extraction_flags); - case ITER_BVEC: - return netfs_extract_bvec_to_sg(iter, maxsize, sgtable, sg_max, - extraction_flags); - case ITER_KVEC: - return netfs_extract_kvec_to_sg(iter, maxsize, sgtable, sg_max, - extraction_flags); - case ITER_XARRAY: - return netfs_extract_xarray_to_sg(iter, maxsize, sgtable, sg_max, - extraction_flags); - default: - pr_err("%s(%u) unsupported\n", __func__, iov_iter_type(iter)); - WARN_ON_ONCE(1); - return -EIO; - } -} -EXPORT_SYMBOL_GPL(netfs_extract_iter_to_sg); diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index fea5f8821da5..70f5563a8e81 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -35,7 +35,7 @@ bl_free_device(struct pnfs_block_dev *dev) } if (dev->bdev) - blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE); + blkdev_put(dev->bdev, NULL); } } @@ -243,7 +243,8 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d, if (!dev) return -EIO; - bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL); + bdev = blkdev_get_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, NULL, + NULL); if (IS_ERR(bdev)) { printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", MAJOR(dev), MINOR(dev), PTR_ERR(bdev)); @@ -312,7 +313,8 @@ bl_open_path(struct pnfs_block_volume *v, const char *prefix) if (!devname) return ERR_PTR(-ENOMEM); - bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL); + bdev = blkdev_get_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE, NULL, + NULL); if (IS_ERR(bdev)) { pr_warn("pNFS: failed to open device %s (%ld)\n", devname, PTR_ERR(bdev)); @@ -373,7 +375,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, return 0; out_blkdev_put: - blkdev_put(d->bdev, FMODE_READ | FMODE_WRITE); + blkdev_put(d->bdev, NULL); return error; } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index f50e025ae406..e4c5f193ed5e 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -184,6 +184,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) clp->cl_net = get_net(cl_init->net); clp->cl_principal = "*"; + clp->cl_xprtsec = cl_init->xprtsec; return clp; error_cleanup: @@ -326,6 +327,10 @@ again: sap)) continue; + /* Match the xprt security policy */ + if (clp->cl_xprtsec.policy != data->xprtsec.policy) + continue; + refcount_inc(&clp->cl_count); return clp; } @@ -458,6 +463,7 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto, switch (proto) { case XPRT_TRANSPORT_TCP: + case XPRT_TRANSPORT_TCP_TLS: case XPRT_TRANSPORT_RDMA: if (retrans == NFS_UNSPEC_RETRANS) to->to_retries = NFS_DEF_TCP_RETRANS; @@ -510,6 +516,7 @@ int nfs_create_rpc_client(struct nfs_client *clp, .version = clp->rpc_ops->version, .authflavor = flavor, .cred = cl_init->cred, + .xprtsec = cl_init->xprtsec, }; if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags)) @@ -592,6 +599,7 @@ static int nfs_start_lockd(struct nfs_server *server) server->nlm_host = host; server->destroy = nfs_destroy_server; + nfs_sysfs_link_rpc_client(server, nlmclnt_rpc_clnt(host), NULL); return 0; } @@ -621,6 +629,7 @@ int nfs_init_server_rpcclient(struct nfs_server *server, if (server->flags & NFS_MOUNT_SOFT) server->client->cl_softrtry = 1; + nfs_sysfs_link_rpc_client(server, server->client, NULL); return 0; } EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient); @@ -675,6 +684,7 @@ static int nfs_init_server(struct nfs_server *server, .cred = server->cred, .nconnect = ctx->nfs_server.nconnect, .init_flags = (1UL << NFS_CS_REUSEPORT), + .xprtsec = ctx->xprtsec, }; struct nfs_client *clp; int error; @@ -690,6 +700,8 @@ static int nfs_init_server(struct nfs_server *server, return PTR_ERR(clp); server->nfs_client = clp; + nfs_sysfs_add_server(server); + nfs_sysfs_link_rpc_client(server, clp->cl_rpcclient, "_state"); /* Initialise the client representation from the mount data */ server->flags = ctx->flags; @@ -944,6 +956,8 @@ void nfs_server_remove_lists(struct nfs_server *server) } EXPORT_SYMBOL_GPL(nfs_server_remove_lists); +static DEFINE_IDA(s_sysfs_ids); + /* * Allocate and initialise a server record */ @@ -955,6 +969,12 @@ struct nfs_server *nfs_alloc_server(void) if (!server) return NULL; + server->s_sysfs_id = ida_alloc(&s_sysfs_ids, GFP_KERNEL); + if (server->s_sysfs_id < 0) { + kfree(server); + return NULL; + } + server->client = server->client_acl = ERR_PTR(-EINVAL); /* Zero out the NFS state stuff */ @@ -1001,6 +1021,12 @@ void nfs_free_server(struct nfs_server *server) nfs_put_client(server->nfs_client); + if (server->kobj.state_initialized) { + nfs_sysfs_remove_server(server); + kobject_put(&server->kobj); + } + ida_free(&s_sysfs_ids, server->s_sysfs_id); + ida_destroy(&server->lockowner_id); ida_destroy(&server->openowner_id); nfs_free_iostats(server->io_stats); @@ -1102,6 +1128,11 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, server->fsid = fattr->fsid; + nfs_sysfs_add_server(server); + + nfs_sysfs_link_rpc_client(server, + server->nfs_client->cl_rpcclient, "_state"); + error = nfs_init_server_rpcclient(server, source->client->cl_timeout, flavor); @@ -1385,6 +1416,7 @@ error_0: void nfs_fs_proc_exit(void) { remove_proc_subtree("fs/nfsfs", NULL); + ida_destroy(&s_sysfs_ids); } #endif /* CONFIG_PROC_FS */ diff --git a/fs/nfs/file.c b/fs/nfs/file.c index f0edf5a36237..79b1b3fcd3fc 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -178,6 +178,27 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) } EXPORT_SYMBOL_GPL(nfs_file_read); +ssize_t +nfs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct inode *inode = file_inode(in); + ssize_t result; + + dprintk("NFS: splice_read(%pD2, %zu@%llu)\n", in, len, *ppos); + + nfs_start_io_read(inode); + result = nfs_revalidate_mapping(inode, in->f_mapping); + if (!result) { + result = filemap_splice_read(in, ppos, pipe, len, flags); + if (result > 0) + nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); + } + nfs_end_io_read(inode); + return result; +} +EXPORT_SYMBOL_GPL(nfs_file_splice_read); + int nfs_file_mmap(struct file * file, struct vm_area_struct * vma) { @@ -648,17 +669,13 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) since = filemap_sample_wb_err(file->f_mapping); nfs_start_io_write(inode); result = generic_write_checks(iocb, from); - if (result > 0) { - current->backing_dev_info = inode_to_bdi(inode); + if (result > 0) result = generic_perform_write(iocb, from); - current->backing_dev_info = NULL; - } nfs_end_io_write(inode); if (result <= 0) goto out; written = result; - iocb->ki_pos += written; nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); if (mntflags & NFS_MOUNT_WRITE_EAGER) { @@ -879,7 +896,7 @@ const struct file_operations nfs_file_operations = { .fsync = nfs_file_fsync, .lock = nfs_lock, .flock = nfs_flock, - .splice_read = generic_file_splice_read, + .splice_read = nfs_file_splice_read, .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, .setlease = simple_nosetlease, diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 9bcd53d5c7d4..853e8d609bb3 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -18,6 +18,9 @@ #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> #include <linux/nfs4_mount.h> + +#include <net/handshake.h> + #include "nfs.h" #include "internal.h" @@ -88,6 +91,7 @@ enum nfs_param { Opt_vers, Opt_wsize, Opt_write, + Opt_xprtsec, }; enum { @@ -194,6 +198,7 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = { fsparam_string("vers", Opt_vers), fsparam_enum ("write", Opt_write, nfs_param_enums_write), fsparam_u32 ("wsize", Opt_wsize), + fsparam_string("xprtsec", Opt_xprtsec), {} }; @@ -267,6 +272,20 @@ static const struct constant_table nfs_secflavor_tokens[] = { {} }; +enum { + Opt_xprtsec_none, + Opt_xprtsec_tls, + Opt_xprtsec_mtls, + nr__Opt_xprtsec +}; + +static const struct constant_table nfs_xprtsec_policies[] = { + { "none", Opt_xprtsec_none }, + { "tls", Opt_xprtsec_tls }, + { "mtls", Opt_xprtsec_mtls }, + {} +}; + /* * Sanity-check a server address provided by the mount command. * @@ -320,9 +339,21 @@ static int nfs_validate_transport_protocol(struct fs_context *fc, default: ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP; } + + if (ctx->xprtsec.policy != RPC_XPRTSEC_NONE) + switch (ctx->nfs_server.protocol) { + case XPRT_TRANSPORT_TCP: + ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP_TLS; + break; + default: + goto out_invalid_xprtsec_policy; + } + return 0; out_invalid_transport_udp: return nfs_invalf(fc, "NFS: Unsupported transport protocol udp"); +out_invalid_xprtsec_policy: + return nfs_invalf(fc, "NFS: Transport does not support xprtsec"); } /* @@ -430,6 +461,29 @@ static int nfs_parse_security_flavors(struct fs_context *fc, return 0; } +static int nfs_parse_xprtsec_policy(struct fs_context *fc, + struct fs_parameter *param) +{ + struct nfs_fs_context *ctx = nfs_fc2context(fc); + + trace_nfs_mount_assign(param->key, param->string); + + switch (lookup_constant(nfs_xprtsec_policies, param->string, -1)) { + case Opt_xprtsec_none: + ctx->xprtsec.policy = RPC_XPRTSEC_NONE; + break; + case Opt_xprtsec_tls: + ctx->xprtsec.policy = RPC_XPRTSEC_TLS_ANON; + break; + case Opt_xprtsec_mtls: + ctx->xprtsec.policy = RPC_XPRTSEC_TLS_X509; + break; + default: + return nfs_invalf(fc, "NFS: Unrecognized transport security policy"); + } + return 0; +} + static int nfs_parse_version_string(struct fs_context *fc, const char *string) { @@ -696,6 +750,11 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, if (ret < 0) return ret; break; + case Opt_xprtsec: + ret = nfs_parse_xprtsec_policy(fc, param); + if (ret < 0) + return ret; + break; case Opt_proto: if (!param->string) @@ -791,16 +850,19 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, ctx->mount_server.addrlen = len; break; case Opt_nconnect: + trace_nfs_mount_assign(param->key, param->string); if (result.uint_32 < 1 || result.uint_32 > NFS_MAX_CONNECTIONS) goto out_of_bounds; ctx->nfs_server.nconnect = result.uint_32; break; case Opt_max_connect: + trace_nfs_mount_assign(param->key, param->string); if (result.uint_32 < 1 || result.uint_32 > NFS_MAX_TRANSPORTS) goto out_of_bounds; ctx->nfs_server.max_connect = result.uint_32; break; case Opt_lookupcache: + trace_nfs_mount_assign(param->key, param->string); switch (result.uint_32) { case Opt_lookupcache_all: ctx->flags &= ~(NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE); @@ -817,6 +879,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, } break; case Opt_local_lock: + trace_nfs_mount_assign(param->key, param->string); switch (result.uint_32) { case Opt_local_lock_all: ctx->flags |= (NFS_MOUNT_LOCAL_FLOCK | @@ -837,6 +900,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, } break; case Opt_write: + trace_nfs_mount_assign(param->key, param->string); switch (result.uint_32) { case Opt_write_lazy: ctx->flags &= @@ -1569,6 +1633,9 @@ static int nfs_init_fs_context(struct fs_context *fc) ctx->selected_flavor = RPC_AUTH_MAXFLAVOR; ctx->minorversion = 0; ctx->need_mount = true; + ctx->xprtsec.policy = RPC_XPRTSEC_NONE; + ctx->xprtsec.cert_serial = TLS_NO_CERT; + ctx->xprtsec.privkey_serial = TLS_NO_PRIVKEY; fc->s_iflags |= SB_I_STABLE_WRITES; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index a910b9a638c5..8172dd4135a1 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -845,7 +845,7 @@ int nfs_getattr(struct mnt_idmap *idmap, const struct path *path, request_mask &= STATX_TYPE | STATX_MODE | STATX_NLINK | STATX_UID | STATX_GID | STATX_ATIME | STATX_MTIME | STATX_CTIME | - STATX_INO | STATX_SIZE | STATX_BLOCKS | STATX_BTIME | + STATX_INO | STATX_SIZE | STATX_BLOCKS | STATX_CHANGE_COOKIE; if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) { diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 3cc027d3bd58..913c09806c7f 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -81,6 +81,7 @@ struct nfs_client_initdata { struct net *net; const struct rpc_timeout *timeparms; const struct cred *cred; + struct xprtsec_parms xprtsec; }; /* @@ -101,6 +102,7 @@ struct nfs_fs_context { unsigned int bsize; struct nfs_auth_info auth_info; rpc_authflavor_t selected_flavor; + struct xprtsec_parms xprtsec; char *client_address; unsigned int version; unsigned int minorversion; @@ -416,6 +418,8 @@ static inline __u32 nfs_access_xattr_mask(const struct nfs_server *server) int nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); loff_t nfs_file_llseek(struct file *, loff_t, int); ssize_t nfs_file_read(struct kiocb *, struct iov_iter *); +ssize_t nfs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, + size_t len, unsigned int flags); int nfs_file_mmap(struct file *, struct vm_area_struct *); ssize_t nfs_file_write(struct kiocb *, struct iov_iter *); int nfs_file_release(struct inode *, struct file *); diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index 669cda757a5c..eff3802c5e03 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -4,6 +4,8 @@ #include <linux/sunrpc/addr.h> #include "internal.h" #include "nfs3_fs.h" +#include "netns.h" +#include "sysfs.h" #ifdef CONFIG_NFS_V3_ACL static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program }; @@ -31,6 +33,8 @@ static void nfs_init_server_aclclient(struct nfs_server *server) if (IS_ERR(server->client_acl)) goto out_noacl; + nfs_sysfs_link_rpc_client(server, server->client_acl, NULL); + /* No errors! Assume that Sun nfsacls are supported */ server->caps |= NFS_CAP_ACLS; return; @@ -93,6 +97,7 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, .net = mds_clp->cl_net, .timeparms = &ds_timeout, .cred = mds_srv->cred, + .xprtsec = mds_clp->cl_xprtsec, }; struct nfs_client *clp; char buf[INET6_ADDRSTRLEN + 1]; @@ -102,8 +107,12 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, return ERR_PTR(-EINVAL); cl_init.hostname = buf; - if (mds_clp->cl_nconnect > 1 && ds_proto == XPRT_TRANSPORT_TCP) - cl_init.nconnect = mds_clp->cl_nconnect; + switch (ds_proto) { + case XPRT_TRANSPORT_TCP: + case XPRT_TRANSPORT_TCP_TLS: + if (mds_clp->cl_nconnect > 1) + cl_init.nconnect = mds_clp->cl_nconnect; + } if (mds_srv->flags & NFS_MOUNT_NORESVPORT) __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 93e306bf4430..63802d195556 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -1190,15 +1190,19 @@ static int _nfs42_proc_setxattr(struct inode *inode, const char *name, const void *buf, size_t buflen, int flags) { struct nfs_server *server = NFS_SERVER(inode); + __u32 bitmask[NFS_BITMASK_SZ]; struct page *pages[NFS4XATTR_MAXPAGES]; struct nfs42_setxattrargs arg = { .fh = NFS_FH(inode), + .bitmask = bitmask, .xattr_pages = pages, .xattr_len = buflen, .xattr_name = name, .xattr_flags = flags, }; - struct nfs42_setxattrres res; + struct nfs42_setxattrres res = { + .server = server, + }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETXATTR], .rpc_argp = &arg, @@ -1210,13 +1214,22 @@ static int _nfs42_proc_setxattr(struct inode *inode, const char *name, if (buflen > server->sxasize) return -ERANGE; + res.fattr = nfs_alloc_fattr(); + if (!res.fattr) + return -ENOMEM; + if (buflen > 0) { np = nfs4_buf_to_pages_noslab(buf, buflen, arg.xattr_pages); - if (np < 0) - return np; + if (np < 0) { + ret = np; + goto out; + } } else np = 0; + nfs4_bitmask_set(bitmask, server->cache_consistency_bitmask, + inode, NFS_INO_INVALID_CHANGE); + ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); trace_nfs4_setxattr(inode, name, ret); @@ -1224,9 +1237,13 @@ static int _nfs42_proc_setxattr(struct inode *inode, const char *name, for (; np > 0; np--) put_page(pages[np - 1]); - if (!ret) + if (!ret) { nfs4_update_changeattr(inode, &res.cinfo, timestamp, 0); + ret = nfs_post_op_update_inode(inode, res.fattr); + } +out: + kfree(res.fattr); return ret; } diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c index 76ae11834206..911f634ba3da 100644 --- a/fs/nfs/nfs42xattr.c +++ b/fs/nfs/nfs42xattr.c @@ -991,6 +991,29 @@ static void nfs4_xattr_cache_init_once(void *p) INIT_LIST_HEAD(&cache->dispose); } +static int nfs4_xattr_shrinker_init(struct shrinker *shrinker, + struct list_lru *lru, const char *name) +{ + int ret = 0; + + ret = register_shrinker(shrinker, name); + if (ret) + return ret; + + ret = list_lru_init_memcg(lru, shrinker); + if (ret) + unregister_shrinker(shrinker); + + return ret; +} + +static void nfs4_xattr_shrinker_destroy(struct shrinker *shrinker, + struct list_lru *lru) +{ + unregister_shrinker(shrinker); + list_lru_destroy(lru); +} + int __init nfs4_xattr_cache_init(void) { int ret = 0; @@ -1002,44 +1025,30 @@ int __init nfs4_xattr_cache_init(void) if (nfs4_xattr_cache_cachep == NULL) return -ENOMEM; - ret = list_lru_init_memcg(&nfs4_xattr_large_entry_lru, - &nfs4_xattr_large_entry_shrinker); - if (ret) - goto out4; - - ret = list_lru_init_memcg(&nfs4_xattr_entry_lru, - &nfs4_xattr_entry_shrinker); - if (ret) - goto out3; - - ret = list_lru_init_memcg(&nfs4_xattr_cache_lru, - &nfs4_xattr_cache_shrinker); - if (ret) - goto out2; - - ret = register_shrinker(&nfs4_xattr_cache_shrinker, "nfs-xattr_cache"); + ret = nfs4_xattr_shrinker_init(&nfs4_xattr_cache_shrinker, + &nfs4_xattr_cache_lru, + "nfs-xattr_cache"); if (ret) goto out1; - ret = register_shrinker(&nfs4_xattr_entry_shrinker, "nfs-xattr_entry"); + ret = nfs4_xattr_shrinker_init(&nfs4_xattr_entry_shrinker, + &nfs4_xattr_entry_lru, + "nfs-xattr_entry"); if (ret) - goto out; + goto out2; - ret = register_shrinker(&nfs4_xattr_large_entry_shrinker, - "nfs-xattr_large_entry"); + ret = nfs4_xattr_shrinker_init(&nfs4_xattr_large_entry_shrinker, + &nfs4_xattr_large_entry_lru, + "nfs-xattr_large_entry"); if (!ret) return 0; - unregister_shrinker(&nfs4_xattr_entry_shrinker); -out: - unregister_shrinker(&nfs4_xattr_cache_shrinker); -out1: - list_lru_destroy(&nfs4_xattr_cache_lru); + nfs4_xattr_shrinker_destroy(&nfs4_xattr_entry_shrinker, + &nfs4_xattr_entry_lru); out2: - list_lru_destroy(&nfs4_xattr_entry_lru); -out3: - list_lru_destroy(&nfs4_xattr_large_entry_lru); -out4: + nfs4_xattr_shrinker_destroy(&nfs4_xattr_cache_shrinker, + &nfs4_xattr_cache_lru); +out1: kmem_cache_destroy(nfs4_xattr_cache_cachep); return ret; @@ -1047,11 +1056,11 @@ out4: void nfs4_xattr_cache_exit(void) { - unregister_shrinker(&nfs4_xattr_large_entry_shrinker); - unregister_shrinker(&nfs4_xattr_entry_shrinker); - unregister_shrinker(&nfs4_xattr_cache_shrinker); - list_lru_destroy(&nfs4_xattr_large_entry_lru); - list_lru_destroy(&nfs4_xattr_entry_lru); - list_lru_destroy(&nfs4_xattr_cache_lru); + nfs4_xattr_shrinker_destroy(&nfs4_xattr_large_entry_shrinker, + &nfs4_xattr_large_entry_lru); + nfs4_xattr_shrinker_destroy(&nfs4_xattr_entry_shrinker, + &nfs4_xattr_entry_lru); + nfs4_xattr_shrinker_destroy(&nfs4_xattr_cache_shrinker, + &nfs4_xattr_cache_lru); kmem_cache_destroy(nfs4_xattr_cache_cachep); } diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index a6df815a140c..95234208dc9e 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -7,6 +7,9 @@ #include "nfs42.h" +/* Not limited by NFS itself, limited by the generic xattr code */ +#define nfs4_xattr_name_maxsz XDR_QUADLEN(XATTR_NAME_MAX) + #define encode_fallocate_maxsz (encode_stateid_maxsz + \ 2 /* offset */ + \ 2 /* length */) @@ -89,6 +92,18 @@ 2 /* dst offset */ + \ 2 /* count */) #define decode_clone_maxsz (op_decode_hdr_maxsz) +#define encode_getxattr_maxsz (op_encode_hdr_maxsz + 1 + \ + nfs4_xattr_name_maxsz) +#define decode_getxattr_maxsz (op_decode_hdr_maxsz + 1 + pagepad_maxsz) +#define encode_setxattr_maxsz (op_encode_hdr_maxsz + \ + 1 + nfs4_xattr_name_maxsz + 1) +#define decode_setxattr_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz) +#define encode_listxattrs_maxsz (op_encode_hdr_maxsz + 2 + 1) +#define decode_listxattrs_maxsz (op_decode_hdr_maxsz + 2 + 1 + 1 + 1) +#define encode_removexattr_maxsz (op_encode_hdr_maxsz + 1 + \ + nfs4_xattr_name_maxsz) +#define decode_removexattr_maxsz (op_decode_hdr_maxsz + \ + decode_change_info_maxsz) #define NFS4_enc_allocate_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ @@ -186,55 +201,40 @@ decode_putfh_maxsz + \ decode_clone_maxsz + \ decode_getattr_maxsz) - -/* Not limited by NFS itself, limited by the generic xattr code */ -#define nfs4_xattr_name_maxsz XDR_QUADLEN(XATTR_NAME_MAX) - -#define encode_getxattr_maxsz (op_encode_hdr_maxsz + 1 + \ - nfs4_xattr_name_maxsz) -#define decode_getxattr_maxsz (op_decode_hdr_maxsz + 1 + pagepad_maxsz) -#define encode_setxattr_maxsz (op_encode_hdr_maxsz + \ - 1 + nfs4_xattr_name_maxsz + 1) -#define decode_setxattr_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz) -#define encode_listxattrs_maxsz (op_encode_hdr_maxsz + 2 + 1) -#define decode_listxattrs_maxsz (op_decode_hdr_maxsz + 2 + 1 + 1 + 1) -#define encode_removexattr_maxsz (op_encode_hdr_maxsz + 1 + \ - nfs4_xattr_name_maxsz) -#define decode_removexattr_maxsz (op_decode_hdr_maxsz + \ - decode_change_info_maxsz) - -#define NFS4_enc_getxattr_sz (compound_encode_hdr_maxsz + \ - encode_sequence_maxsz + \ - encode_putfh_maxsz + \ - encode_getxattr_maxsz) -#define NFS4_dec_getxattr_sz (compound_decode_hdr_maxsz + \ - decode_sequence_maxsz + \ - decode_putfh_maxsz + \ - decode_getxattr_maxsz) -#define NFS4_enc_setxattr_sz (compound_encode_hdr_maxsz + \ - encode_sequence_maxsz + \ - encode_putfh_maxsz + \ - encode_setxattr_maxsz) -#define NFS4_dec_setxattr_sz (compound_decode_hdr_maxsz + \ - decode_sequence_maxsz + \ - decode_putfh_maxsz + \ - decode_setxattr_maxsz) -#define NFS4_enc_listxattrs_sz (compound_encode_hdr_maxsz + \ - encode_sequence_maxsz + \ - encode_putfh_maxsz + \ - encode_listxattrs_maxsz) -#define NFS4_dec_listxattrs_sz (compound_decode_hdr_maxsz + \ - decode_sequence_maxsz + \ - decode_putfh_maxsz + \ - decode_listxattrs_maxsz) -#define NFS4_enc_removexattr_sz (compound_encode_hdr_maxsz + \ - encode_sequence_maxsz + \ - encode_putfh_maxsz + \ - encode_removexattr_maxsz) -#define NFS4_dec_removexattr_sz (compound_decode_hdr_maxsz + \ - decode_sequence_maxsz + \ - decode_putfh_maxsz + \ - decode_removexattr_maxsz) +#define NFS4_enc_getxattr_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_getxattr_maxsz) +#define NFS4_dec_getxattr_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_getxattr_maxsz) +#define NFS4_enc_setxattr_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_setxattr_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_setxattr_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_setxattr_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_listxattrs_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_listxattrs_maxsz) +#define NFS4_dec_listxattrs_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_listxattrs_maxsz) +#define NFS4_enc_removexattr_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_removexattr_maxsz) +#define NFS4_dec_removexattr_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_removexattr_maxsz) /* * These values specify the maximum amount of data that is not @@ -317,6 +317,18 @@ static void encode_copy(struct xdr_stream *xdr, encode_nl4_server(xdr, args->cp_src); } +static void encode_copy_commit(struct xdr_stream *xdr, + const struct nfs42_copy_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_COMMIT, decode_commit_maxsz, hdr); + p = reserve_space(xdr, 12); + p = xdr_encode_hyper(p, args->dst_pos); + *p = cpu_to_be32(args->count); +} + static void encode_offload_cancel(struct xdr_stream *xdr, const struct nfs42_offload_status_args *args, struct compound_hdr *hdr) @@ -452,20 +464,6 @@ static void encode_setxattr(struct xdr_stream *xdr, xdr_write_pages(xdr, arg->xattr_pages, 0, arg->xattr_len); } -static int decode_setxattr(struct xdr_stream *xdr, - struct nfs4_change_info *cinfo) -{ - int status; - - status = decode_op_hdr(xdr, OP_SETXATTR); - if (status) - goto out; - status = decode_change_info(xdr, cinfo); -out: - return status; -} - - static void encode_getxattr(struct xdr_stream *xdr, const char *name, struct compound_hdr *hdr) { @@ -473,43 +471,6 @@ static void encode_getxattr(struct xdr_stream *xdr, const char *name, encode_string(xdr, strlen(name), name); } -static int decode_getxattr(struct xdr_stream *xdr, - struct nfs42_getxattrres *res, - struct rpc_rqst *req) -{ - int status; - __be32 *p; - u32 len, rdlen; - - status = decode_op_hdr(xdr, OP_GETXATTR); - if (status) - return status; - - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - return -EIO; - - len = be32_to_cpup(p); - - /* - * Only check against the page length here. The actual - * requested length may be smaller, but that is only - * checked against after possibly caching a valid reply. - */ - if (len > req->rq_rcv_buf.page_len) - return -ERANGE; - - res->xattr_len = len; - - if (len > 0) { - rdlen = xdr_read_pages(xdr, len); - if (rdlen < len) - return -EIO; - } - - return 0; -} - static void encode_removexattr(struct xdr_stream *xdr, const char *name, struct compound_hdr *hdr) { @@ -517,21 +478,6 @@ static void encode_removexattr(struct xdr_stream *xdr, const char *name, encode_string(xdr, strlen(name), name); } - -static int decode_removexattr(struct xdr_stream *xdr, - struct nfs4_change_info *cinfo) -{ - int status; - - status = decode_op_hdr(xdr, OP_REMOVEXATTR); - if (status) - goto out; - - status = decode_change_info(xdr, cinfo); -out: - return status; -} - static void encode_listxattrs(struct xdr_stream *xdr, const struct nfs42_listxattrsargs *arg, struct compound_hdr *hdr) @@ -553,104 +499,6 @@ static void encode_listxattrs(struct xdr_stream *xdr, *p = cpu_to_be32(arg->count + 8 + 4); } -static int decode_listxattrs(struct xdr_stream *xdr, - struct nfs42_listxattrsres *res) -{ - int status; - __be32 *p; - u32 count, len, ulen; - size_t left, copied; - char *buf; - - status = decode_op_hdr(xdr, OP_LISTXATTRS); - if (status) { - /* - * Special case: for LISTXATTRS, NFS4ERR_TOOSMALL - * should be translated to ERANGE. - */ - if (status == -ETOOSMALL) - status = -ERANGE; - /* - * Special case: for LISTXATTRS, NFS4ERR_NOXATTR - * should be translated to success with zero-length reply. - */ - if (status == -ENODATA) { - res->eof = true; - status = 0; - } - goto out; - } - - p = xdr_inline_decode(xdr, 8); - if (unlikely(!p)) - return -EIO; - - xdr_decode_hyper(p, &res->cookie); - - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - return -EIO; - - left = res->xattr_len; - buf = res->xattr_buf; - - count = be32_to_cpup(p); - copied = 0; - - /* - * We have asked for enough room to encode the maximum number - * of possible attribute names, so everything should fit. - * - * But, don't rely on that assumption. Just decode entries - * until they don't fit anymore, just in case the server did - * something odd. - */ - while (count--) { - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - return -EIO; - - len = be32_to_cpup(p); - if (len > (XATTR_NAME_MAX - XATTR_USER_PREFIX_LEN)) { - status = -ERANGE; - goto out; - } - - p = xdr_inline_decode(xdr, len); - if (unlikely(!p)) - return -EIO; - - ulen = len + XATTR_USER_PREFIX_LEN + 1; - if (buf) { - if (ulen > left) { - status = -ERANGE; - goto out; - } - - memcpy(buf, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); - memcpy(buf + XATTR_USER_PREFIX_LEN, p, len); - - buf[ulen - 1] = 0; - buf += ulen; - left -= ulen; - } - copied += ulen; - } - - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - return -EIO; - - res->eof = be32_to_cpup(p); - res->copied = copied; - -out: - if (status == -ERANGE && res->xattr_len == XATTR_LIST_MAX) - status = -E2BIG; - - return status; -} - /* * Encode ALLOCATE request */ @@ -671,18 +519,6 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req, encode_nops(&hdr); } -static void encode_copy_commit(struct xdr_stream *xdr, - const struct nfs42_copy_args *args, - struct compound_hdr *hdr) -{ - __be32 *p; - - encode_op_hdr(xdr, OP_COMMIT, decode_commit_maxsz, hdr); - p = reserve_space(xdr, 12); - p = xdr_encode_hyper(p, args->dst_pos); - *p = cpu_to_be32(args->count); -} - /* * Encode COPY request */ @@ -871,6 +707,90 @@ static void nfs4_xdr_enc_layouterror(struct rpc_rqst *req, encode_nops(&hdr); } +/* + * Encode SETXATTR request + */ +static void nfs4_xdr_enc_setxattr(struct rpc_rqst *req, struct xdr_stream *xdr, + const void *data) +{ + const struct nfs42_setxattrargs *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_setxattr(xdr, args, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); +} + +/* + * Encode GETXATTR request + */ +static void nfs4_xdr_enc_getxattr(struct rpc_rqst *req, struct xdr_stream *xdr, + const void *data) +{ + const struct nfs42_getxattrargs *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + uint32_t replen; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + replen = hdr.replen + op_decode_hdr_maxsz + 1; + encode_getxattr(xdr, args->xattr_name, &hdr); + + rpc_prepare_reply_pages(req, args->xattr_pages, 0, args->xattr_len, + replen); + + encode_nops(&hdr); +} + +/* + * Encode LISTXATTR request + */ +static void nfs4_xdr_enc_listxattrs(struct rpc_rqst *req, + struct xdr_stream *xdr, const void *data) +{ + const struct nfs42_listxattrsargs *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + uint32_t replen; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + replen = hdr.replen + op_decode_hdr_maxsz + 2 + 1; + encode_listxattrs(xdr, args, &hdr); + + rpc_prepare_reply_pages(req, args->xattr_pages, 0, args->count, replen); + + encode_nops(&hdr); +} + +/* + * Encode REMOVEXATTR request + */ +static void nfs4_xdr_enc_removexattr(struct rpc_rqst *req, + struct xdr_stream *xdr, const void *data) +{ + const struct nfs42_removexattrargs *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + encode_removexattr(xdr, args->xattr_name, &hdr); + encode_nops(&hdr); +} + static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res) { return decode_op_hdr(xdr, OP_ALLOCATE); @@ -1192,6 +1112,168 @@ static int decode_layouterror(struct xdr_stream *xdr) return decode_op_hdr(xdr, OP_LAYOUTERROR); } +static int decode_setxattr(struct xdr_stream *xdr, + struct nfs4_change_info *cinfo) +{ + int status; + + status = decode_op_hdr(xdr, OP_SETXATTR); + if (status) + goto out; + status = decode_change_info(xdr, cinfo); +out: + return status; +} + +static int decode_getxattr(struct xdr_stream *xdr, + struct nfs42_getxattrres *res, + struct rpc_rqst *req) +{ + int status; + __be32 *p; + u32 len, rdlen; + + status = decode_op_hdr(xdr, OP_GETXATTR); + if (status) + return status; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + + len = be32_to_cpup(p); + + /* + * Only check against the page length here. The actual + * requested length may be smaller, but that is only + * checked against after possibly caching a valid reply. + */ + if (len > req->rq_rcv_buf.page_len) + return -ERANGE; + + res->xattr_len = len; + + if (len > 0) { + rdlen = xdr_read_pages(xdr, len); + if (rdlen < len) + return -EIO; + } + + return 0; +} + +static int decode_removexattr(struct xdr_stream *xdr, + struct nfs4_change_info *cinfo) +{ + int status; + + status = decode_op_hdr(xdr, OP_REMOVEXATTR); + if (status) + goto out; + + status = decode_change_info(xdr, cinfo); +out: + return status; +} + +static int decode_listxattrs(struct xdr_stream *xdr, + struct nfs42_listxattrsres *res) +{ + int status; + __be32 *p; + u32 count, len, ulen; + size_t left, copied; + char *buf; + + status = decode_op_hdr(xdr, OP_LISTXATTRS); + if (status) { + /* + * Special case: for LISTXATTRS, NFS4ERR_TOOSMALL + * should be translated to ERANGE. + */ + if (status == -ETOOSMALL) + status = -ERANGE; + /* + * Special case: for LISTXATTRS, NFS4ERR_NOXATTR + * should be translated to success with zero-length reply. + */ + if (status == -ENODATA) { + res->eof = true; + status = 0; + } + goto out; + } + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + return -EIO; + + xdr_decode_hyper(p, &res->cookie); + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + + left = res->xattr_len; + buf = res->xattr_buf; + + count = be32_to_cpup(p); + copied = 0; + + /* + * We have asked for enough room to encode the maximum number + * of possible attribute names, so everything should fit. + * + * But, don't rely on that assumption. Just decode entries + * until they don't fit anymore, just in case the server did + * something odd. + */ + while (count--) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + + len = be32_to_cpup(p); + if (len > (XATTR_NAME_MAX - XATTR_USER_PREFIX_LEN)) { + status = -ERANGE; + goto out; + } + + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + return -EIO; + + ulen = len + XATTR_USER_PREFIX_LEN + 1; + if (buf) { + if (ulen > left) { + status = -ERANGE; + goto out; + } + + memcpy(buf, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); + memcpy(buf + XATTR_USER_PREFIX_LEN, p, len); + + buf[ulen - 1] = 0; + buf += ulen; + left -= ulen; + } + copied += ulen; + } + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + + res->eof = be32_to_cpup(p); + res->copied = copied; + +out: + if (status == -ERANGE && res->xattr_len == XATTR_LIST_MAX) + status = -E2BIG; + + return status; +} + /* * Decode ALLOCATE request */ @@ -1481,22 +1563,9 @@ out: return status; } -#ifdef CONFIG_NFS_V4_2 -static void nfs4_xdr_enc_setxattr(struct rpc_rqst *req, struct xdr_stream *xdr, - const void *data) -{ - const struct nfs42_setxattrargs *args = data; - struct compound_hdr hdr = { - .minorversion = nfs4_xdr_minorversion(&args->seq_args), - }; - - encode_compound_hdr(xdr, req, &hdr); - encode_sequence(xdr, &args->seq_args, &hdr); - encode_putfh(xdr, args->fh, &hdr); - encode_setxattr(xdr, args, &hdr); - encode_nops(&hdr); -} - +/* + * Decode SETXATTR request + */ static int nfs4_xdr_dec_setxattr(struct rpc_rqst *req, struct xdr_stream *xdr, void *data) { @@ -1513,33 +1582,17 @@ static int nfs4_xdr_dec_setxattr(struct rpc_rqst *req, struct xdr_stream *xdr, status = decode_putfh(xdr); if (status) goto out; - status = decode_setxattr(xdr, &res->cinfo); + if (status) + goto out; + status = decode_getfattr(xdr, res->fattr, res->server); out: return status; } -static void nfs4_xdr_enc_getxattr(struct rpc_rqst *req, struct xdr_stream *xdr, - const void *data) -{ - const struct nfs42_getxattrargs *args = data; - struct compound_hdr hdr = { - .minorversion = nfs4_xdr_minorversion(&args->seq_args), - }; - uint32_t replen; - - encode_compound_hdr(xdr, req, &hdr); - encode_sequence(xdr, &args->seq_args, &hdr); - encode_putfh(xdr, args->fh, &hdr); - replen = hdr.replen + op_decode_hdr_maxsz + 1; - encode_getxattr(xdr, args->xattr_name, &hdr); - - rpc_prepare_reply_pages(req, args->xattr_pages, 0, args->xattr_len, - replen); - - encode_nops(&hdr); -} - +/* + * Decode GETXATTR request + */ static int nfs4_xdr_dec_getxattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr, void *data) { @@ -1561,26 +1614,9 @@ out: return status; } -static void nfs4_xdr_enc_listxattrs(struct rpc_rqst *req, - struct xdr_stream *xdr, const void *data) -{ - const struct nfs42_listxattrsargs *args = data; - struct compound_hdr hdr = { - .minorversion = nfs4_xdr_minorversion(&args->seq_args), - }; - uint32_t replen; - - encode_compound_hdr(xdr, req, &hdr); - encode_sequence(xdr, &args->seq_args, &hdr); - encode_putfh(xdr, args->fh, &hdr); - replen = hdr.replen + op_decode_hdr_maxsz + 2 + 1; - encode_listxattrs(xdr, args, &hdr); - - rpc_prepare_reply_pages(req, args->xattr_pages, 0, args->count, replen); - - encode_nops(&hdr); -} - +/* + * Decode LISTXATTR request + */ static int nfs4_xdr_dec_listxattrs(struct rpc_rqst *rqstp, struct xdr_stream *xdr, void *data) { @@ -1604,21 +1640,9 @@ out: return status; } -static void nfs4_xdr_enc_removexattr(struct rpc_rqst *req, - struct xdr_stream *xdr, const void *data) -{ - const struct nfs42_removexattrargs *args = data; - struct compound_hdr hdr = { - .minorversion = nfs4_xdr_minorversion(&args->seq_args), - }; - - encode_compound_hdr(xdr, req, &hdr); - encode_sequence(xdr, &args->seq_args, &hdr); - encode_putfh(xdr, args->fh, &hdr); - encode_removexattr(xdr, args->xattr_name, &hdr); - encode_nops(&hdr); -} - +/* + * Decode REMOVEXATTR request + */ static int nfs4_xdr_dec_removexattr(struct rpc_rqst *req, struct xdr_stream *xdr, void *data) { @@ -1640,5 +1664,4 @@ static int nfs4_xdr_dec_removexattr(struct rpc_rqst *req, out: return status; } -#endif #endif /* __LINUX_FS_NFS_NFS4_2XDR_H */ diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index d3051b051a56..d9114a754db7 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -18,6 +18,7 @@ #include "nfs4idmap.h" #include "pnfs.h" #include "netns.h" +#include "sysfs.h" #define NFSDBG_FACILITY NFSDBG_CLIENT @@ -896,7 +897,8 @@ static int nfs4_set_client(struct nfs_server *server, int proto, const struct rpc_timeout *timeparms, u32 minorversion, unsigned int nconnect, unsigned int max_connect, - struct net *net) + struct net *net, + struct xprtsec_parms *xprtsec) { struct nfs_client_initdata cl_init = { .hostname = hostname, @@ -909,6 +911,7 @@ static int nfs4_set_client(struct nfs_server *server, .net = net, .timeparms = timeparms, .cred = server->cred, + .xprtsec = *xprtsec, }; struct nfs_client *clp; @@ -916,8 +919,11 @@ static int nfs4_set_client(struct nfs_server *server, __set_bit(NFS_CS_REUSEPORT, &cl_init.init_flags); else cl_init.max_connect = max_connect; - if (proto == XPRT_TRANSPORT_TCP) + switch (proto) { + case XPRT_TRANSPORT_TCP: + case XPRT_TRANSPORT_TCP_TLS: cl_init.nconnect = nconnect; + } if (server->flags & NFS_MOUNT_NORESVPORT) __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); @@ -947,6 +953,9 @@ static int nfs4_set_client(struct nfs_server *server, set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state); server->nfs_client = clp; + nfs_sysfs_add_server(server); + nfs_sysfs_link_rpc_client(server, clp->cl_rpcclient, "_state"); + return 0; } @@ -978,6 +987,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, .net = mds_clp->cl_net, .timeparms = &ds_timeout, .cred = mds_srv->cred, + .xprtsec = mds_srv->nfs_client->cl_xprtsec, }; char buf[INET6_ADDRSTRLEN + 1]; @@ -985,9 +995,13 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, return ERR_PTR(-EINVAL); cl_init.hostname = buf; - if (mds_clp->cl_nconnect > 1 && ds_proto == XPRT_TRANSPORT_TCP) { - cl_init.nconnect = mds_clp->cl_nconnect; - cl_init.max_connect = NFS_MAX_TRANSPORTS; + switch (ds_proto) { + case XPRT_TRANSPORT_TCP: + case XPRT_TRANSPORT_TCP_TLS: + if (mds_clp->cl_nconnect > 1) { + cl_init.nconnect = mds_clp->cl_nconnect; + cl_init.max_connect = NFS_MAX_TRANSPORTS; + } } if (mds_srv->flags & NFS_MOUNT_NORESVPORT) @@ -1157,7 +1171,8 @@ static int nfs4_init_server(struct nfs_server *server, struct fs_context *fc) ctx->minorversion, ctx->nfs_server.nconnect, ctx->nfs_server.max_connect, - fc->net_ns); + fc->net_ns, + &ctx->xprtsec); if (error < 0) return error; @@ -1219,8 +1234,8 @@ struct nfs_server *nfs4_create_referral_server(struct fs_context *fc) struct nfs_fs_context *ctx = nfs_fc2context(fc); struct nfs_client *parent_client; struct nfs_server *server, *parent_server; + int proto, error; bool auth_probe; - int error; server = nfs_alloc_server(); if (!server) @@ -1247,23 +1262,28 @@ struct nfs_server *nfs4_create_referral_server(struct fs_context *fc) parent_client->cl_mvops->minor_version, parent_client->cl_nconnect, parent_client->cl_max_connect, - parent_client->cl_net); + parent_client->cl_net, + &parent_client->cl_xprtsec); if (!error) goto init_server; #endif /* IS_ENABLED(CONFIG_SUNRPC_XPRT_RDMA) */ + proto = XPRT_TRANSPORT_TCP; + if (parent_client->cl_xprtsec.policy != RPC_XPRTSEC_NONE) + proto = XPRT_TRANSPORT_TCP_TLS; rpc_set_port(&ctx->nfs_server.address, NFS_PORT); error = nfs4_set_client(server, ctx->nfs_server.hostname, &ctx->nfs_server._address, ctx->nfs_server.addrlen, parent_client->cl_ipaddr, - XPRT_TRANSPORT_TCP, + proto, parent_server->client->cl_timeout, parent_client->cl_mvops->minor_version, parent_client->cl_nconnect, parent_client->cl_max_connect, - parent_client->cl_net); + parent_client->cl_net, + &parent_client->cl_xprtsec); if (error < 0) goto error; @@ -1314,6 +1334,7 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, .dstaddr = (struct sockaddr *)sap, .addrlen = salen, .servername = hostname, + /* cel: bleh. We might need to pass TLS parameters here */ }; char buf[INET6_ADDRSTRLEN + 1]; struct sockaddr_storage address; @@ -1336,7 +1357,8 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, error = nfs4_set_client(server, hostname, sap, salen, buf, clp->cl_proto, clnt->cl_timeout, clp->cl_minorversion, - clp->cl_nconnect, clp->cl_max_connect, net); + clp->cl_nconnect, clp->cl_max_connect, + net, &clp->cl_xprtsec); clear_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status); if (error != 0) { nfs_server_insert_lists(server); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 2563ed8580f3..4aeadd6e1a6d 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -454,7 +454,7 @@ const struct file_operations nfs4_file_operations = { .fsync = nfs_file_fsync, .lock = nfs_lock, .flock = nfs_flock, - .splice_read = generic_file_splice_read, + .splice_read = nfs_file_splice_read, .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, .setlease = nfs4_setlease, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d3665390c4cb..e1a886b58354 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -921,6 +921,7 @@ out: out_noaction: return ret; session_recover: + set_bit(NFS4_SLOT_TBL_DRAINING, &session->fc_slot_table.slot_tbl_state); nfs4_schedule_session_recovery(session, status); dprintk("%s ERROR: %d Reset session\n", __func__, status); nfs41_sequence_free_slot(res); @@ -7159,7 +7160,6 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) { struct nfs4_lockdata *data = calldata; struct nfs4_lock_state *lsp = data->lsp; - struct nfs_server *server = NFS_SERVER(d_inode(data->ctx->dentry)); if (!nfs4_sequence_done(task, &data->res.seq_res)) return; @@ -7167,7 +7167,8 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) data->rpc_status = task->tk_status; switch (task->tk_status) { case 0: - renew_lease(server, data->timestamp); + renew_lease(NFS_SERVER(d_inode(data->ctx->dentry)), + data->timestamp); if (data->arg.new_lock && !data->cancelled) { data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS); if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0) @@ -7188,8 +7189,6 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) if (!nfs4_stateid_match(&data->arg.open_stateid, &lsp->ls_state->open_stateid)) goto out_restart; - else if (nfs4_async_handle_error(task, server, lsp->ls_state, NULL) == -EAGAIN) - goto out_restart; } else if (!nfs4_stateid_match(&data->arg.lock_stateid, &lsp->ls_stateid)) goto out_restart; @@ -9371,7 +9370,7 @@ static void nfs41_sequence_call_done(struct rpc_task *task, void *data) return; trace_nfs4_sequence(clp, task->tk_status); - if (task->tk_status < 0) { + if (task->tk_status < 0 && !task->tk_client->cl_shutdown) { dprintk("%s ERROR %d\n", __func__, task->tk_status); if (refcount_read(&clp->cl_count) == 1) return; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index bbe49315d99e..e079987af4a3 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1210,6 +1210,9 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) struct task_struct *task; char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1]; + if (clp->cl_rpcclient->cl_shutdown) + return; + set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); if (test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) != 0) { wake_up_var(&clp->cl_state); diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 620329b7e6ae..7600100ba26f 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -164,7 +164,7 @@ __setup("nfsroot=", nfs_root_setup); static int __init root_nfs_copy(char *dest, const char *src, const size_t destlen) { - if (strlcpy(dest, src, destlen) > destlen) + if (strscpy(dest, src, destlen) == -E2BIG) return -1; return 0; } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 30e53e93049e..2284f749d892 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -59,6 +59,8 @@ #include <linux/uaccess.h> #include <linux/nfs_ssc.h> +#include <uapi/linux/tls.h> + #include "nfs4_fs.h" #include "callback.h" #include "delegation.h" @@ -68,6 +70,8 @@ #include "nfs4session.h" #include "pnfs.h" #include "nfs.h" +#include "netns.h" +#include "sysfs.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -491,6 +495,16 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, seq_printf(m, ",timeo=%lu", 10U * nfss->client->cl_timeout->to_initval / HZ); seq_printf(m, ",retrans=%u", nfss->client->cl_timeout->to_retries); seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor)); + switch (clp->cl_xprtsec.policy) { + case RPC_XPRTSEC_TLS_ANON: + seq_puts(m, ",xprtsec=tls"); + break; + case RPC_XPRTSEC_TLS_X509: + seq_puts(m, ",xprtsec=mtls"); + break; + default: + break; + } if (version != 4) nfs_show_mountd_options(m, nfss, showdefaults); @@ -1077,6 +1091,7 @@ static void nfs_fill_super(struct super_block *sb, struct nfs_fs_context *ctx) &sb->s_blocksize_bits); nfs_super_set_maxbytes(sb, server->maxfilesize); + nfs_sysfs_move_server_to_sb(sb); server->has_sec_mnt_opts = ctx->has_sec_mnt_opts; } @@ -1319,13 +1334,14 @@ error_splat_super: } /* - * Destroy an NFS2/3 superblock + * Destroy an NFS superblock */ void nfs_kill_super(struct super_block *s) { struct nfs_server *server = NFS_SB(s); dev_t dev = s->s_dev; + nfs_sysfs_move_sb_to_server(server); generic_shutdown_super(s); nfs_fscache_release_super_cookie(s); diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c index 0cbcd2dfa732..acda8f033d30 100644 --- a/fs/nfs/sysfs.c +++ b/fs/nfs/sysfs.c @@ -12,17 +12,18 @@ #include <linux/string.h> #include <linux/nfs_fs.h> #include <linux/rcupdate.h> +#include <linux/lockd/lockd.h> #include "nfs4_fs.h" #include "netns.h" #include "sysfs.h" -struct kobject *nfs_client_kobj; -static struct kset *nfs_client_kset; +static struct kset *nfs_kset; -static void nfs_netns_object_release(struct kobject *kobj) +static void nfs_kset_release(struct kobject *kobj) { - kfree(kobj); + struct kset *kset = container_of(kobj, struct kset, kobj); + kfree(kset); } static const struct kobj_ns_type_operations *nfs_netns_object_child_ns_type( @@ -31,46 +32,42 @@ static const struct kobj_ns_type_operations *nfs_netns_object_child_ns_type( return &net_ns_type_operations; } -static struct kobj_type nfs_netns_object_type = { - .release = nfs_netns_object_release, +static struct kobj_type nfs_kset_type = { + .release = nfs_kset_release, .sysfs_ops = &kobj_sysfs_ops, .child_ns_type = nfs_netns_object_child_ns_type, }; -static struct kobject *nfs_netns_object_alloc(const char *name, - struct kset *kset, struct kobject *parent) +int nfs_sysfs_init(void) { - struct kobject *kobj; + int ret; + + nfs_kset = kzalloc(sizeof(*nfs_kset), GFP_KERNEL); + if (!nfs_kset) + return -ENOMEM; - kobj = kzalloc(sizeof(*kobj), GFP_KERNEL); - if (kobj) { - kobj->kset = kset; - if (kobject_init_and_add(kobj, &nfs_netns_object_type, - parent, "%s", name) == 0) - return kobj; - kobject_put(kobj); + ret = kobject_set_name(&nfs_kset->kobj, "nfs"); + if (ret) { + kfree(nfs_kset); + return ret; } - return NULL; -} -int nfs_sysfs_init(void) -{ - nfs_client_kset = kset_create_and_add("nfs", NULL, fs_kobj); - if (!nfs_client_kset) - return -ENOMEM; - nfs_client_kobj = nfs_netns_object_alloc("net", nfs_client_kset, NULL); - if (!nfs_client_kobj) { - kset_unregister(nfs_client_kset); - nfs_client_kset = NULL; - return -ENOMEM; + nfs_kset->kobj.parent = fs_kobj; + nfs_kset->kobj.ktype = &nfs_kset_type; + nfs_kset->kobj.kset = NULL; + + ret = kset_register(nfs_kset); + if (ret) { + kfree(nfs_kset); + return ret; } + return 0; } void nfs_sysfs_exit(void) { - kobject_put(nfs_client_kobj); - kset_unregister(nfs_client_kset); + kset_unregister(nfs_kset); } static ssize_t nfs_netns_identifier_show(struct kobject *kobj, @@ -127,7 +124,6 @@ static void nfs_netns_client_release(struct kobject *kobj) kobject); kfree(rcu_dereference_raw(c->identifier)); - kfree(c); } static const void *nfs_netns_client_namespace(const struct kobject *kobj) @@ -151,6 +147,25 @@ static struct kobj_type nfs_netns_client_type = { .namespace = nfs_netns_client_namespace, }; +static void nfs_netns_object_release(struct kobject *kobj) +{ + struct nfs_netns_client *c = container_of(kobj, + struct nfs_netns_client, + nfs_net_kobj); + kfree(c); +} + +static const void *nfs_netns_namespace(const struct kobject *kobj) +{ + return container_of(kobj, struct nfs_netns_client, nfs_net_kobj)->net; +} + +static struct kobj_type nfs_netns_object_type = { + .release = nfs_netns_object_release, + .sysfs_ops = &kobj_sysfs_ops, + .namespace = nfs_netns_namespace, +}; + static struct nfs_netns_client *nfs_netns_client_alloc(struct kobject *parent, struct net *net) { @@ -159,10 +174,19 @@ static struct nfs_netns_client *nfs_netns_client_alloc(struct kobject *parent, p = kzalloc(sizeof(*p), GFP_KERNEL); if (p) { p->net = net; - p->kobject.kset = nfs_client_kset; + p->kobject.kset = nfs_kset; + p->nfs_net_kobj.kset = nfs_kset; + + if (kobject_init_and_add(&p->nfs_net_kobj, &nfs_netns_object_type, + parent, "net") != 0) { + kobject_put(&p->nfs_net_kobj); + return NULL; + } + if (kobject_init_and_add(&p->kobject, &nfs_netns_client_type, - parent, "nfs_client") == 0) + &p->nfs_net_kobj, "nfs_client") == 0) return p; + kobject_put(&p->kobject); } return NULL; @@ -172,7 +196,7 @@ void nfs_netns_sysfs_setup(struct nfs_net *netns, struct net *net) { struct nfs_netns_client *clp; - clp = nfs_netns_client_alloc(nfs_client_kobj, net); + clp = nfs_netns_client_alloc(&nfs_kset->kobj, net); if (clp) { netns->nfs_client = clp; kobject_uevent(&clp->kobject, KOBJ_ADD); @@ -187,6 +211,149 @@ void nfs_netns_sysfs_destroy(struct nfs_net *netns) kobject_uevent(&clp->kobject, KOBJ_REMOVE); kobject_del(&clp->kobject); kobject_put(&clp->kobject); + kobject_del(&clp->nfs_net_kobj); + kobject_put(&clp->nfs_net_kobj); netns->nfs_client = NULL; } } + +static bool shutdown_match_client(const struct rpc_task *task, const void *data) +{ + return true; +} + +static void shutdown_client(struct rpc_clnt *clnt) +{ + clnt->cl_shutdown = 1; + rpc_cancel_tasks(clnt, -EIO, shutdown_match_client, NULL); +} + +static ssize_t +shutdown_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct nfs_server *server = container_of(kobj, struct nfs_server, kobj); + bool shutdown = server->flags & NFS_MOUNT_SHUTDOWN; + return sysfs_emit(buf, "%d\n", shutdown); +} + +static ssize_t +shutdown_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct nfs_server *server; + int ret, val; + + server = container_of(kobj, struct nfs_server, kobj); + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + if (val != 1) + return -EINVAL; + + /* already shut down? */ + if (server->flags & NFS_MOUNT_SHUTDOWN) + goto out; + + server->flags |= NFS_MOUNT_SHUTDOWN; + shutdown_client(server->client); + shutdown_client(server->nfs_client->cl_rpcclient); + + if (!IS_ERR(server->client_acl)) + shutdown_client(server->client_acl); + + if (server->nlm_host) + shutdown_client(server->nlm_host->h_rpcclnt); +out: + return count; +} + +static struct kobj_attribute nfs_sysfs_attr_shutdown = __ATTR_RW(shutdown); + +#define RPC_CLIENT_NAME_SIZE 64 + +void nfs_sysfs_link_rpc_client(struct nfs_server *server, + struct rpc_clnt *clnt, const char *uniq) +{ + char name[RPC_CLIENT_NAME_SIZE]; + int ret; + + strcpy(name, clnt->cl_program->name); + strcat(name, uniq ? uniq : ""); + strcat(name, "_client"); + + ret = sysfs_create_link_nowarn(&server->kobj, + &clnt->cl_sysfs->kobject, name); + if (ret < 0) + pr_warn("NFS: can't create link to %s in sysfs (%d)\n", + name, ret); +} +EXPORT_SYMBOL_GPL(nfs_sysfs_link_rpc_client); + +static void nfs_sysfs_sb_release(struct kobject *kobj) +{ + /* no-op: why? see lib/kobject.c kobject_cleanup() */ +} + +static const void *nfs_netns_server_namespace(const struct kobject *kobj) +{ + return container_of(kobj, struct nfs_server, kobj)->nfs_client->cl_net; +} + +static struct kobj_type nfs_sb_ktype = { + .release = nfs_sysfs_sb_release, + .sysfs_ops = &kobj_sysfs_ops, + .namespace = nfs_netns_server_namespace, + .child_ns_type = nfs_netns_object_child_ns_type, +}; + +void nfs_sysfs_add_server(struct nfs_server *server) +{ + int ret; + + ret = kobject_init_and_add(&server->kobj, &nfs_sb_ktype, + &nfs_kset->kobj, "server-%d", server->s_sysfs_id); + if (ret < 0) { + pr_warn("NFS: nfs sysfs add server-%d failed (%d)\n", + server->s_sysfs_id, ret); + return; + } + ret = sysfs_create_file_ns(&server->kobj, &nfs_sysfs_attr_shutdown.attr, + nfs_netns_server_namespace(&server->kobj)); + if (ret < 0) + pr_warn("NFS: sysfs_create_file_ns for server-%d failed (%d)\n", + server->s_sysfs_id, ret); +} +EXPORT_SYMBOL_GPL(nfs_sysfs_add_server); + +void nfs_sysfs_move_server_to_sb(struct super_block *s) +{ + struct nfs_server *server = s->s_fs_info; + int ret; + + ret = kobject_rename(&server->kobj, s->s_id); + if (ret < 0) + pr_warn("NFS: rename sysfs %s failed (%d)\n", + server->kobj.name, ret); +} + +void nfs_sysfs_move_sb_to_server(struct nfs_server *server) +{ + const char *s; + int ret = -ENOMEM; + + s = kasprintf(GFP_KERNEL, "server-%d", server->s_sysfs_id); + if (s) + ret = kobject_rename(&server->kobj, s); + if (ret < 0) + pr_warn("NFS: rename sysfs %s failed (%d)\n", + server->kobj.name, ret); +} + +/* unlink, not dec-ref */ +void nfs_sysfs_remove_server(struct nfs_server *server) +{ + kobject_del(&server->kobj); +} diff --git a/fs/nfs/sysfs.h b/fs/nfs/sysfs.h index 5501ef573c32..c5d1990cade5 100644 --- a/fs/nfs/sysfs.h +++ b/fs/nfs/sysfs.h @@ -10,11 +10,12 @@ struct nfs_netns_client { struct kobject kobject; + struct kobject nfs_net_kobj; struct net *net; const char __rcu *identifier; }; -extern struct kobject *nfs_client_kobj; +extern struct kobject *nfs_net_kobj; extern int nfs_sysfs_init(void); extern void nfs_sysfs_exit(void); @@ -22,4 +23,11 @@ extern void nfs_sysfs_exit(void); void nfs_netns_sysfs_setup(struct nfs_net *netns, struct net *net); void nfs_netns_sysfs_destroy(struct nfs_net *netns); +void nfs_sysfs_link_rpc_client(struct nfs_server *server, + struct rpc_clnt *clnt, const char *sysfs_prefix); +void nfs_sysfs_add_server(struct nfs_server *s); +void nfs_sysfs_move_server_to_sb(struct super_block *s); +void nfs_sysfs_move_sb_to_server(struct nfs_server *s); +void nfs_sysfs_remove_server(struct nfs_server *s); + #endif diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index f21259ead64b..4c9b87850ab1 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -80,6 +80,8 @@ enum { int nfsd_drc_slab_create(void); void nfsd_drc_slab_free(void); +int nfsd_net_reply_cache_init(struct nfsd_net *nn); +void nfsd_net_reply_cache_destroy(struct nfsd_net *nn); int nfsd_reply_cache_init(struct nfsd_net *); void nfsd_reply_cache_shutdown(struct nfsd_net *); int nfsd_cache_lookup(struct svc_rqst *); diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index ae85257b4238..11a0eaa2f914 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -97,7 +97,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) goto out; err = -EINVAL; - if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) + if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out; err = -ENOENT; @@ -107,7 +107,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) dprintk("found domain %s\n", buf); err = -EINVAL; - if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) + if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out; fsidtype = simple_strtoul(buf, &ep, 10); if (*ep) @@ -593,7 +593,6 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) { /* client path expiry [flags anonuid anongid fsid] */ char *buf; - int len; int err; struct auth_domain *dom = NULL; struct svc_export exp = {}, *expp; @@ -609,8 +608,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) /* client */ err = -EINVAL; - len = qword_get(&mesg, buf, PAGE_SIZE); - if (len <= 0) + if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out; err = -ENOENT; @@ -620,7 +618,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) /* path */ err = -EINVAL; - if ((len = qword_get(&mesg, buf, PAGE_SIZE)) <= 0) + if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out1; err = kern_path(buf, 0, &exp.ex_path); @@ -665,7 +663,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) goto out3; exp.ex_fsid = an_int; - while ((len = qword_get(&mesg, buf, PAGE_SIZE)) > 0) { + while (qword_get(&mesg, buf, PAGE_SIZE) > 0) { if (strcmp(buf, "fsloc") == 0) err = fsloc_parse(&mesg, buf, &exp.ex_fslocs); else if (strcmp(buf, "uuid") == 0) diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index e6bb8eeb5bc2..fc8d5b7db9f8 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -151,8 +151,6 @@ nfsd3_proc_read(struct svc_rqst *rqstp) { struct nfsd3_readargs *argp = rqstp->rq_argp; struct nfsd3_readres *resp = rqstp->rq_resp; - unsigned int len; - int v; dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n", SVCFH_fmt(&argp->fh), @@ -166,17 +164,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp) if (argp->offset + argp->count > (u64)OFFSET_MAX) argp->count = (u64)OFFSET_MAX - argp->offset; - v = 0; - len = argp->count; resp->pages = rqstp->rq_next_page; - while (len > 0) { - struct page *page = *(rqstp->rq_next_page++); - - rqstp->rq_vec[v].iov_base = page_address(page); - rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE); - len -= rqstp->rq_vec[v].iov_len; - v++; - } /* Obtain buffer pointer for payload. * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) @@ -187,7 +175,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp) fh_copy(&resp->fh, &argp->fh); resp->status = nfsd_read(rqstp, &resp->fh, argp->offset, - rqstp->rq_vec, v, &resp->count, &resp->eof); + &resp->count, &resp->eof); return rpc_success; } diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 3308dd671ef0..f32128955ec8 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -828,7 +828,8 @@ nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr) return false; if (xdr_stream_encode_u32(xdr, resp->len) < 0) return false; - xdr_write_pages(xdr, resp->pages, 0, resp->len); + svcxdr_encode_opaque_pages(rqstp, xdr, resp->pages, 0, + resp->len); if (svc_encode_result_payload(rqstp, head->iov_len, resp->len) < 0) return false; break; @@ -859,8 +860,9 @@ nfs3svc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr) return false; if (xdr_stream_encode_u32(xdr, resp->count) < 0) return false; - xdr_write_pages(xdr, resp->pages, rqstp->rq_res.page_base, - resp->count); + svcxdr_encode_opaque_pages(rqstp, xdr, resp->pages, + rqstp->rq_res.page_base, + resp->count); if (svc_encode_result_payload(rqstp, head->iov_len, resp->count) < 0) return false; break; @@ -961,7 +963,8 @@ nfs3svc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr) return false; if (!svcxdr_encode_cookieverf3(xdr, resp->verf)) return false; - xdr_write_pages(xdr, dirlist->pages, 0, dirlist->len); + svcxdr_encode_opaque_pages(rqstp, xdr, dirlist->pages, 0, + dirlist->len); /* no more entries */ if (xdr_stream_encode_item_absent(xdr) < 0) return false; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 76db2fe29624..b30dca7de8cc 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2541,6 +2541,20 @@ static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode, return p; } +static __be32 nfsd4_encode_nfstime4(struct xdr_stream *xdr, + struct timespec64 *tv) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, XDR_UNIT * 3); + if (!p) + return nfserr_resource; + + p = xdr_encode_hyper(p, (s64)tv->tv_sec); + *p = cpu_to_be32(tv->tv_nsec); + return nfs_ok; +} + /* * ctime (in NFSv4, time_metadata) is not writeable, and the client * doesn't really care what resolution could theoretically be stored by @@ -2566,12 +2580,16 @@ static __be32 *encode_time_delta(__be32 *p, struct inode *inode) return p; } -static __be32 *encode_cinfo(__be32 *p, struct nfsd4_change_info *c) +static __be32 +nfsd4_encode_change_info4(struct xdr_stream *xdr, struct nfsd4_change_info *c) { - *p++ = cpu_to_be32(c->atomic); - p = xdr_encode_hyper(p, c->before_change); - p = xdr_encode_hyper(p, c->after_change); - return p; + if (xdr_stream_encode_bool(xdr, c->atomic) < 0) + return nfserr_resource; + if (xdr_stream_encode_u64(xdr, c->before_change) < 0) + return nfserr_resource; + if (xdr_stream_encode_u64(xdr, c->after_change) < 0) + return nfserr_resource; + return nfs_ok; } /* Encode as an array of strings the string given with components @@ -3348,11 +3366,14 @@ out_acl: p = xdr_encode_hyper(p, dummy64); } if (bmval1 & FATTR4_WORD1_TIME_ACCESS) { - p = xdr_reserve_space(xdr, 12); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (s64)stat.atime.tv_sec); - *p++ = cpu_to_be32(stat.atime.tv_nsec); + status = nfsd4_encode_nfstime4(xdr, &stat.atime); + if (status) + goto out; + } + if (bmval1 & FATTR4_WORD1_TIME_CREATE) { + status = nfsd4_encode_nfstime4(xdr, &stat.btime); + if (status) + goto out; } if (bmval1 & FATTR4_WORD1_TIME_DELTA) { p = xdr_reserve_space(xdr, 12); @@ -3361,25 +3382,14 @@ out_acl: p = encode_time_delta(p, d_inode(dentry)); } if (bmval1 & FATTR4_WORD1_TIME_METADATA) { - p = xdr_reserve_space(xdr, 12); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (s64)stat.ctime.tv_sec); - *p++ = cpu_to_be32(stat.ctime.tv_nsec); + status = nfsd4_encode_nfstime4(xdr, &stat.ctime); + if (status) + goto out; } if (bmval1 & FATTR4_WORD1_TIME_MODIFY) { - p = xdr_reserve_space(xdr, 12); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (s64)stat.mtime.tv_sec); - *p++ = cpu_to_be32(stat.mtime.tv_nsec); - } - if (bmval1 & FATTR4_WORD1_TIME_CREATE) { - p = xdr_reserve_space(xdr, 12); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (s64)stat.btime.tv_sec); - *p++ = cpu_to_be32(stat.btime.tv_nsec); + status = nfsd4_encode_nfstime4(xdr, &stat.mtime); + if (status) + goto out; } if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) { u64 ino = stat.ino; @@ -3689,6 +3699,30 @@ fail: } static __be32 +nfsd4_encode_verifier4(struct xdr_stream *xdr, const nfs4_verifier *verf) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); + if (!p) + return nfserr_resource; + memcpy(p, verf->data, sizeof(verf->data)); + return nfs_ok; +} + +static __be32 +nfsd4_encode_clientid4(struct xdr_stream *xdr, const clientid_t *clientid) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, sizeof(__be64)); + if (!p) + return nfserr_resource; + memcpy(p, clientid, sizeof(*clientid)); + return nfs_ok; +} + +static __be32 nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid) { __be32 *p; @@ -3752,15 +3786,8 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, union nfsd4_op_u *u) { struct nfsd4_commit *commit = &u->commit; - struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); - if (!p) - return nfserr_resource; - p = xdr_encode_opaque_fixed(p, commit->co_verf.data, - NFS4_VERIFIER_SIZE); - return 0; + return nfsd4_encode_verifier4(resp->xdr, &commit->co_verf); } static __be32 @@ -3769,12 +3796,10 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_create *create = &u->create; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, 20); - if (!p) - return nfserr_resource; - encode_cinfo(p, &create->cr_cinfo); + nfserr = nfsd4_encode_change_info4(xdr, &create->cr_cinfo); + if (nfserr) + return nfserr; return nfsd4_encode_bitmap(xdr, create->cr_bmval[0], create->cr_bmval[1], create->cr_bmval[2]); } @@ -3892,13 +3917,8 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_link *link = &u->link; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, 20); - if (!p) - return nfserr_resource; - p = encode_cinfo(p, &link->li_cinfo); - return 0; + return nfsd4_encode_change_info4(xdr, &link->li_cinfo); } @@ -3913,11 +3933,11 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid); if (nfserr) return nfserr; - p = xdr_reserve_space(xdr, 24); - if (!p) + nfserr = nfsd4_encode_change_info4(xdr, &open->op_cinfo); + if (nfserr) + return nfserr; + if (xdr_stream_encode_u32(xdr, open->op_rflags) < 0) return nfserr_resource; - p = encode_cinfo(p, &open->op_cinfo); - *p++ = cpu_to_be32(open->op_rflags); nfserr = nfsd4_encode_bitmap(xdr, open->op_bmval[0], open->op_bmval[1], open->op_bmval[2]); @@ -3956,7 +3976,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, p = xdr_reserve_space(xdr, 32); if (!p) return nfserr_resource; - *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(open->op_recall); /* * TODO: space_limit's in delegations @@ -4018,6 +4038,11 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, return nfsd4_encode_stateid(xdr, &od->od_stateid); } +/* + * The operation of this function assumes that this is the only + * READ operation in the COMPOUND. If there are multiple READs, + * we use nfsd4_encode_readv(). + */ static __be32 nfsd4_encode_splice_read( struct nfsd4_compoundres *resp, struct nfsd4_read *read, @@ -4028,8 +4053,12 @@ static __be32 nfsd4_encode_splice_read( int status, space_left; __be32 nfserr; - /* Make sure there will be room for padding if needed */ - if (xdr->end - xdr->p < 1) + /* + * Make sure there is room at the end of buf->head for + * svcxdr_encode_opaque_pages() to create a tail buffer + * to XDR-pad the payload. + */ + if (xdr->iov != xdr->buf->head || xdr->end - xdr->p < 1) return nfserr_resource; nfserr = nfsd_splice_read(read->rd_rqstp, read->rd_fhp, @@ -4038,6 +4067,8 @@ static __be32 nfsd4_encode_splice_read( read->rd_length = maxcount; if (nfserr) goto out_err; + svcxdr_encode_opaque_pages(read->rd_rqstp, xdr, buf->pages, + buf->page_base, maxcount); status = svc_encode_result_payload(read->rd_rqstp, buf->head[0].iov_len, maxcount); if (status) { @@ -4045,31 +4076,19 @@ static __be32 nfsd4_encode_splice_read( goto out_err; } - buf->page_len = maxcount; - buf->len += maxcount; - xdr->page_ptr += (buf->page_base + maxcount + PAGE_SIZE - 1) - / PAGE_SIZE; - - /* Use rest of head for padding and remaining ops: */ - buf->tail[0].iov_base = xdr->p; - buf->tail[0].iov_len = 0; - xdr->iov = buf->tail; - if (maxcount&3) { - int pad = 4 - (maxcount&3); - - *(xdr->p++) = 0; - - buf->tail[0].iov_base += maxcount&3; - buf->tail[0].iov_len = pad; - buf->len += pad; - } - + /* + * Prepare to encode subsequent operations. + * + * xdr_truncate_encode() is not safe to use after a successful + * splice read has been done, so the following stream + * manipulations are open-coded. + */ space_left = min_t(int, (void *)xdr->end - (void *)xdr->p, buf->buflen - buf->len); buf->buflen = buf->len + space_left; xdr->end = (__be32 *)((void *)xdr->end + space_left); - return 0; + return nfs_ok; out_err: /* @@ -4090,13 +4109,13 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, __be32 zero = xdr_zero; __be32 nfserr; - read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, maxcount); - if (read->rd_vlen < 0) + if (xdr_reserve_space_vec(xdr, maxcount) < 0) return nfserr_resource; - nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset, - resp->rqstp->rq_vec, read->rd_vlen, &maxcount, - &read->rd_eof); + nfserr = nfsd_iter_read(resp->rqstp, read->rd_fhp, file, + read->rd_offset, &maxcount, + xdr->buf->page_len & ~PAGE_MASK, + &read->rd_eof); read->rd_length = maxcount; if (nfserr) return nfserr; @@ -4213,15 +4232,9 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, int starting_len = xdr->buf->len; __be32 *p; - p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); - if (!p) - return nfserr_resource; - - /* XXX: Following NFSv3, we ignore the READDIR verifier for now. */ - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(0); - xdr->buf->head[0].iov_len = (char *)xdr->p - - (char *)xdr->buf->head[0].iov_base; + nfserr = nfsd4_encode_verifier4(xdr, &readdir->rd_verf); + if (nfserr != nfs_ok) + return nfserr; /* * Number of bytes left for directory entries allowing for the @@ -4299,13 +4312,8 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_remove *remove = &u->remove; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, 20); - if (!p) - return nfserr_resource; - p = encode_cinfo(p, &remove->rm_cinfo); - return 0; + return nfsd4_encode_change_info4(xdr, &remove->rm_cinfo); } static __be32 @@ -4314,14 +4322,11 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_rename *rename = &u->rename; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, 40); - if (!p) - return nfserr_resource; - p = encode_cinfo(p, &rename->rn_sinfo); - p = encode_cinfo(p, &rename->rn_tinfo); - return 0; + nfserr = nfsd4_encode_change_info4(xdr, &rename->rn_sinfo); + if (nfserr) + return nfserr; + return nfsd4_encode_change_info4(xdr, &rename->rn_tinfo); } static __be32 @@ -4448,23 +4453,25 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_setclientid *scd = &u->setclientid; struct xdr_stream *xdr = resp->xdr; - __be32 *p; if (!nfserr) { - p = xdr_reserve_space(xdr, 8 + NFS4_VERIFIER_SIZE); - if (!p) - return nfserr_resource; - p = xdr_encode_opaque_fixed(p, &scd->se_clientid, 8); - p = xdr_encode_opaque_fixed(p, &scd->se_confirm, - NFS4_VERIFIER_SIZE); - } - else if (nfserr == nfserr_clid_inuse) { - p = xdr_reserve_space(xdr, 8); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(0); + nfserr = nfsd4_encode_clientid4(xdr, &scd->se_clientid); + if (nfserr != nfs_ok) + goto out; + nfserr = nfsd4_encode_verifier4(xdr, &scd->se_confirm); + } else if (nfserr == nfserr_clid_inuse) { + /* empty network id */ + if (xdr_stream_encode_u32(xdr, 0) < 0) { + nfserr = nfserr_resource; + goto out; + } + /* empty universal address */ + if (xdr_stream_encode_u32(xdr, 0) < 0) { + nfserr = nfserr_resource; + goto out; + } } +out: return nfserr; } @@ -4473,17 +4480,12 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, union nfsd4_op_u *u) { struct nfsd4_write *write = &u->write; - struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, 16); - if (!p) + if (xdr_stream_encode_u32(resp->xdr, write->wr_bytes_written) < 0) return nfserr_resource; - *p++ = cpu_to_be32(write->wr_bytes_written); - *p++ = cpu_to_be32(write->wr_how_written); - p = xdr_encode_opaque_fixed(p, write->wr_verifier.data, - NFS4_VERIFIER_SIZE); - return 0; + if (xdr_stream_encode_u32(resp->xdr, write->wr_how_written) < 0) + return nfserr_resource; + return nfsd4_encode_verifier4(resp->xdr, &write->wr_verifier); } static __be32 @@ -4505,20 +4507,15 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, server_scope = nn->nfsd_name; server_scope_sz = strlen(nn->nfsd_name); - p = xdr_reserve_space(xdr, - 8 /* eir_clientid */ + - 4 /* eir_sequenceid */ + - 4 /* eir_flags */ + - 4 /* spr_how */); - if (!p) + if (nfsd4_encode_clientid4(xdr, &exid->clientid) != nfs_ok) + return nfserr_resource; + if (xdr_stream_encode_u32(xdr, exid->seqid) < 0) + return nfserr_resource; + if (xdr_stream_encode_u32(xdr, exid->flags) < 0) return nfserr_resource; - p = xdr_encode_opaque_fixed(p, &exid->clientid, 8); - *p++ = cpu_to_be32(exid->seqid); - *p++ = cpu_to_be32(exid->flags); - - *p++ = cpu_to_be32(exid->spa_how); - + if (xdr_stream_encode_u32(xdr, exid->spa_how) < 0) + return nfserr_resource; switch (exid->spa_how) { case SP4_NONE: break; @@ -5099,15 +5096,8 @@ nfsd4_encode_setxattr(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_setxattr *setxattr = &u->setxattr; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - - p = xdr_reserve_space(xdr, 20); - if (!p) - return nfserr_resource; - encode_cinfo(p, &setxattr->setxa_cinfo); - - return 0; + return nfsd4_encode_change_info4(xdr, &setxattr->setxa_cinfo); } /* @@ -5253,14 +5243,8 @@ nfsd4_encode_removexattr(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_removexattr *removexattr = &u->removexattr; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - - p = xdr_reserve_space(xdr, 20); - if (!p) - return nfserr_resource; - p = encode_cinfo(p, &removexattr->rmxa_cinfo); - return 0; + return nfsd4_encode_change_info4(xdr, &removexattr->rmxa_cinfo); } typedef __be32(*nfsd4_enc)(struct nfsd4_compoundres *, __be32, union nfsd4_op_u *u); @@ -5460,6 +5444,12 @@ status: release: if (opdesc && opdesc->op_release) opdesc->op_release(&op->u); + + /* + * Account for pages consumed while encoding this operation. + * The xdr_stream primitives don't manage rq_next_page. + */ + rqstp->rq_next_page = xdr->page_ptr + 1; } /* @@ -5528,9 +5518,6 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, struct xdr_stream *xdr) p = resp->statusp; *p++ = resp->cstate.status; - - rqstp->rq_next_page = xdr->page_ptr + 1; - *p++ = htonl(resp->taglen); memcpy(p, resp->tag, resp->taglen); p += XDR_QUADLEN(resp->taglen); diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 041faa13b852..a8eda1c85829 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -148,12 +148,23 @@ void nfsd_drc_slab_free(void) kmem_cache_destroy(drc_slab); } -static int nfsd_reply_cache_stats_init(struct nfsd_net *nn) +/** + * nfsd_net_reply_cache_init - per net namespace reply cache set-up + * @nn: nfsd_net being initialized + * + * Returns zero on succes; otherwise a negative errno is returned. + */ +int nfsd_net_reply_cache_init(struct nfsd_net *nn) { return nfsd_percpu_counters_init(nn->counter, NFSD_NET_COUNTERS_NUM); } -static void nfsd_reply_cache_stats_destroy(struct nfsd_net *nn) +/** + * nfsd_net_reply_cache_destroy - per net namespace reply cache tear-down + * @nn: nfsd_net being freed + * + */ +void nfsd_net_reply_cache_destroy(struct nfsd_net *nn) { nfsd_percpu_counters_destroy(nn->counter, NFSD_NET_COUNTERS_NUM); } @@ -169,17 +180,13 @@ int nfsd_reply_cache_init(struct nfsd_net *nn) hashsize = nfsd_hashsize(nn->max_drc_entries); nn->maskbits = ilog2(hashsize); - status = nfsd_reply_cache_stats_init(nn); - if (status) - goto out_nomem; - nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan; nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count; nn->nfsd_reply_cache_shrinker.seeks = 1; status = register_shrinker(&nn->nfsd_reply_cache_shrinker, "nfsd-reply:%s", nn->nfsd_name); if (status) - goto out_stats_destroy; + return status; nn->drc_hashtbl = kvzalloc(array_size(hashsize, sizeof(*nn->drc_hashtbl)), GFP_KERNEL); @@ -195,9 +202,6 @@ int nfsd_reply_cache_init(struct nfsd_net *nn) return 0; out_shrinker: unregister_shrinker(&nn->nfsd_reply_cache_shrinker); -out_stats_destroy: - nfsd_reply_cache_stats_destroy(nn); -out_nomem: printk(KERN_ERR "nfsd: failed to allocate reply cache\n"); return -ENOMEM; } @@ -217,7 +221,6 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn) rp, nn); } } - nfsd_reply_cache_stats_destroy(nn); kvfree(nn->drc_hashtbl); nn->drc_hashtbl = NULL; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index b4fd7a7062d5..1b8b1aab9a15 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -25,6 +25,7 @@ #include "netns.h" #include "pnfs.h" #include "filecache.h" +#include "trace.h" /* * We have a single directory with several nodes in it. @@ -109,12 +110,12 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu if (IS_ERR(data)) return PTR_ERR(data); - rv = write_op[ino](file, data, size); - if (rv >= 0) { - simple_transaction_set(file, rv); - rv = size; - } - return rv; + rv = write_op[ino](file, data, size); + if (rv < 0) + return rv; + + simple_transaction_set(file, rv); + return size; } static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) @@ -230,6 +231,7 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) if (rpc_pton(net, fo_path, size, sap, salen) == 0) return -EINVAL; + trace_nfsd_ctl_unlock_ip(net, buf); return nlmsvc_unlock_all_by_ip(sap); } @@ -263,7 +265,7 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size) fo_path = buf; if (qword_get(&buf, fo_path, size) < 0) return -EINVAL; - + trace_nfsd_ctl_unlock_fs(netns(file), fo_path); error = kern_path(fo_path, 0, &path); if (error) return error; @@ -324,7 +326,7 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) len = qword_get(&mesg, dname, size); if (len <= 0) return -EINVAL; - + path = dname+len+1; len = qword_get(&mesg, path, size); if (len <= 0) @@ -338,15 +340,17 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) return -EINVAL; maxsize = min(maxsize, NFS3_FHSIZE); - if (qword_get(&mesg, mesg, size)>0) + if (qword_get(&mesg, mesg, size) > 0) return -EINVAL; + trace_nfsd_ctl_filehandle(netns(file), dname, path, maxsize); + /* we have all the words, they are in buf.. */ dom = unix_domain_find(dname); if (!dom) return -ENOMEM; - len = exp_rootfh(netns(file), dom, path, &fh, maxsize); + len = exp_rootfh(netns(file), dom, path, &fh, maxsize); auth_domain_put(dom); if (len) return len; @@ -399,6 +403,7 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size) return rv; if (newthreads < 0) return -EINVAL; + trace_nfsd_ctl_threads(net, newthreads); rv = nfsd_svc(newthreads, net, file->f_cred); if (rv < 0) return rv; @@ -418,8 +423,8 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size) * OR * * Input: - * buf: C string containing whitespace- - * separated unsigned integer values + * buf: C string containing whitespace- + * separated unsigned integer values * representing the number of NFSD * threads to start in each pool * size: non-zero length of C string in @buf @@ -471,6 +476,7 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) rv = -EINVAL; if (nthreads[i] < 0) goto out_free; + trace_nfsd_ctl_pool_threads(net, i, nthreads[i]); } rv = nfsd_set_nrthreads(i, nthreads, net); if (rv) @@ -526,7 +532,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) char *sep; struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); - if (size>0) { + if (size > 0) { if (nn->nfsd_serv) /* Cannot change versions without updating * nn->nfsd_serv->sv_xdrsize, and reallocing @@ -536,6 +542,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) if (buf[size-1] != '\n') return -EINVAL; buf[size-1] = 0; + trace_nfsd_ctl_version(netns(file), buf); vers = mesg; len = qword_get(&mesg, vers, size); @@ -637,11 +644,11 @@ out: * OR * * Input: - * buf: C string containing whitespace- - * separated positive or negative - * integer values representing NFS - * protocol versions to enable ("+n") - * or disable ("-n") + * buf: C string containing whitespace- + * separated positive or negative + * integer values representing NFS + * protocol versions to enable ("+n") + * or disable ("-n") * size: non-zero length of C string in @buf * Output: * On success: status of zero or more protocol versions has @@ -689,6 +696,7 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred err = get_int(&mesg, &fd); if (err != 0 || fd < 0) return -EINVAL; + trace_nfsd_ctl_ports_addfd(net, fd); err = nfsd_create_serv(net); if (err != 0) @@ -705,7 +713,7 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred } /* - * A transport listener is added by writing it's transport name and + * A transport listener is added by writing its transport name and * a port number. */ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cred *cred) @@ -720,6 +728,7 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr if (port < 1 || port > USHRT_MAX) return -EINVAL; + trace_nfsd_ctl_ports_addxprt(net, transport, port); err = nfsd_create_serv(net); if (err != 0) @@ -832,9 +841,9 @@ int nfsd_max_blksize; * OR * * Input: - * buf: C string containing an unsigned - * integer value representing the new - * NFS blksize + * buf: C string containing an unsigned + * integer value representing the new + * NFS blksize * size: non-zero length of C string in @buf * Output: * On success: passed-in buffer filled with '\n'-terminated C string @@ -853,6 +862,8 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) int rv = get_int(&mesg, &bsize); if (rv) return rv; + trace_nfsd_ctl_maxblksize(netns(file), bsize); + /* force bsize into allowed range and * required alignment. */ @@ -881,9 +892,9 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) * OR * * Input: - * buf: C string containing an unsigned - * integer value representing the new - * number of max connections + * buf: C string containing an unsigned + * integer value representing the new + * number of max connections * size: non-zero length of C string in @buf * Output: * On success: passed-in buffer filled with '\n'-terminated C string @@ -903,6 +914,7 @@ static ssize_t write_maxconn(struct file *file, char *buf, size_t size) if (rv) return rv; + trace_nfsd_ctl_maxconn(netns(file), maxconn); nn->max_connections = maxconn; } @@ -913,6 +925,7 @@ static ssize_t write_maxconn(struct file *file, char *buf, size_t size) static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, time64_t *time, struct nfsd_net *nn) { + struct dentry *dentry = file_dentry(file); char *mesg = buf; int rv, i; @@ -922,6 +935,9 @@ static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, rv = get_int(&mesg, &i); if (rv) return rv; + trace_nfsd_ctl_time(netns(file), dentry->d_name.name, + dentry->d_name.len, i); + /* * Some sanity checking. We don't have a reason for * these particular numbers, but problems with the @@ -1014,6 +1030,7 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size, len = qword_get(&mesg, recdir, size); if (len <= 0) return -EINVAL; + trace_nfsd_ctl_recoverydir(netns(file), recdir); status = nfs4_reset_recoverydir(recdir); if (status) @@ -1065,7 +1082,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) * OR * * Input: - * buf: any value + * buf: any value * size: non-zero length of C string in @buf * Output: * passed-in buffer filled with "Y" or "N" with a newline @@ -1087,7 +1104,7 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size) case '1': if (!nn->nfsd_serv) return -EBUSY; - nfsd4_end_grace(nn); + trace_nfsd_end_grace(netns(file)); break; default: return -EINVAL; @@ -1192,8 +1209,8 @@ static int __nfsd_symlink(struct inode *dir, struct dentry *dentry, * @content is assumed to be a NUL-terminated string that lives * longer than the symlink itself. */ -static void nfsd_symlink(struct dentry *parent, const char *name, - const char *content) +static void _nfsd_symlink(struct dentry *parent, const char *name, + const char *content) { struct inode *dir = parent->d_inode; struct dentry *dentry; @@ -1210,8 +1227,8 @@ out: inode_unlock(dir); } #else -static inline void nfsd_symlink(struct dentry *parent, const char *name, - const char *content) +static inline void _nfsd_symlink(struct dentry *parent, const char *name, + const char *content) { } @@ -1389,8 +1406,8 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc) ret = simple_fill_super(sb, 0x6e667364, nfsd_files); if (ret) return ret; - nfsd_symlink(sb->s_root, "supported_krb5_enctypes", - "/proc/net/rpc/gss_krb5_enctypes"); + _nfsd_symlink(sb->s_root, "supported_krb5_enctypes", + "/proc/net/rpc/gss_krb5_enctypes"); dentry = nfsd_mkdir(sb->s_root, NULL, "clients"); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -1477,7 +1494,17 @@ static int create_proc_exports_entry(void) unsigned int nfsd_net_id; -static __net_init int nfsd_init_net(struct net *net) +/** + * nfsd_net_init - Prepare the nfsd_net portion of a new net namespace + * @net: a freshly-created network namespace + * + * This information stays around as long as the network namespace is + * alive whether or not there is an NFSD instance running in the + * namespace. + * + * Returns zero on success, or a negative errno otherwise. + */ +static __net_init int nfsd_net_init(struct net *net) { int retval; struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -1488,6 +1515,9 @@ static __net_init int nfsd_init_net(struct net *net) retval = nfsd_idmap_init(net); if (retval) goto out_idmap_error; + retval = nfsd_net_reply_cache_init(nn); + if (retval) + goto out_repcache_error; nn->nfsd_versions = NULL; nn->nfsd4_minorversions = NULL; nfsd4_init_leases_net(nn); @@ -1496,22 +1526,32 @@ static __net_init int nfsd_init_net(struct net *net) return 0; +out_repcache_error: + nfsd_idmap_shutdown(net); out_idmap_error: nfsd_export_shutdown(net); out_export_error: return retval; } -static __net_exit void nfsd_exit_net(struct net *net) +/** + * nfsd_net_exit - Release the nfsd_net portion of a net namespace + * @net: a network namespace that is about to be destroyed + * + */ +static __net_exit void nfsd_net_exit(struct net *net) { + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + nfsd_net_reply_cache_destroy(nn); nfsd_idmap_shutdown(net); nfsd_export_shutdown(net); - nfsd_netns_free_versions(net_generic(net, nfsd_net_id)); + nfsd_netns_free_versions(nn); } static struct pernet_operations nfsd_net_ops = { - .init = nfsd_init_net, - .exit = nfsd_exit_net, + .init = nfsd_net_init, + .exit = nfsd_net_exit, .id = &nfsd_net_id, .size = sizeof(struct nfsd_net), }; diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index ccd8485fee04..c291389a1d71 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -414,10 +414,13 @@ static void _fh_update(struct svc_fh *fhp, struct svc_export *exp, struct fid *fid = (struct fid *) (fhp->fh_handle.fh_fsid + fhp->fh_handle.fh_size/4 - 1); int maxsize = (fhp->fh_maxsize - fhp->fh_handle.fh_size)/4; - int subtreecheck = !(exp->ex_flags & NFSEXP_NOSUBTREECHECK); + int fh_flags = (exp->ex_flags & NFSEXP_NOSUBTREECHECK) ? 0 : + EXPORT_FH_CONNECTABLE; + int fileid_type = + exportfs_encode_fh(dentry, fid, &maxsize, fh_flags); fhp->fh_handle.fh_fileid_type = - exportfs_encode_fh(dentry, fid, &maxsize, subtreecheck); + fileid_type > 0 ? fileid_type : FILEID_INVALID; fhp->fh_handle.fh_size += maxsize * 4; } else { fhp->fh_handle.fh_fileid_type = FILEID_ROOT; @@ -623,16 +626,9 @@ void fh_fill_pre_attrs(struct svc_fh *fhp) inode = d_inode(fhp->fh_dentry); err = fh_getattr(fhp, &stat); - if (err) { - /* Grab the times from inode anyway */ - stat.mtime = inode->i_mtime; - stat.ctime = inode->i_ctime; - stat.size = inode->i_size; - if (v4 && IS_I_VERSION(inode)) { - stat.change_cookie = inode_query_iversion(inode); - stat.result_mask |= STATX_CHANGE_COOKIE; - } - } + if (err) + return; + if (v4) fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); @@ -660,15 +656,10 @@ void fh_fill_post_attrs(struct svc_fh *fhp) printk("nfsd: inode locked twice during operation.\n"); err = fh_getattr(fhp, &fhp->fh_post_attr); - if (err) { - fhp->fh_post_saved = false; - fhp->fh_post_attr.ctime = inode->i_ctime; - if (v4 && IS_I_VERSION(inode)) { - fhp->fh_post_attr.change_cookie = inode_query_iversion(inode); - fhp->fh_post_attr.result_mask |= STATX_CHANGE_COOKIE; - } - } else - fhp->fh_post_saved = true; + if (err) + return; + + fhp->fh_post_saved = true; if (v4) fhp->fh_post_change = nfsd4_change_attribute(&fhp->fh_post_attr, inode); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index c37195572fd0..a7315928a760 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -176,9 +176,7 @@ nfsd_proc_read(struct svc_rqst *rqstp) { struct nfsd_readargs *argp = rqstp->rq_argp; struct nfsd_readres *resp = rqstp->rq_resp; - unsigned int len; u32 eof; - int v; dprintk("nfsd: READ %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), @@ -187,17 +185,7 @@ nfsd_proc_read(struct svc_rqst *rqstp) argp->count = min_t(u32, argp->count, NFSSVC_MAXBLKSIZE_V2); argp->count = min_t(u32, argp->count, rqstp->rq_res.buflen); - v = 0; - len = argp->count; resp->pages = rqstp->rq_next_page; - while (len > 0) { - struct page *page = *(rqstp->rq_next_page++); - - rqstp->rq_vec[v].iov_base = page_address(page); - rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE); - len -= rqstp->rq_vec[v].iov_len; - v++; - } /* Obtain buffer pointer for payload. 19 is 1 word for * status, 17 words for fattr, and 1 word for the byte count. @@ -207,7 +195,7 @@ nfsd_proc_read(struct svc_rqst *rqstp) resp->count = argp->count; fh_copy(&resp->fh, &argp->fh); resp->status = nfsd_read(rqstp, &resp->fh, argp->offset, - rqstp->rq_vec, v, &resp->count, &eof); + &resp->count, &eof); if (resp->status == nfs_ok) resp->status = fh_getattr(&resp->fh, &resp->stat); else if (resp->status == nfserr_jukebox) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 9c7b1ef5be40..2154fa63c5f2 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -402,6 +402,11 @@ void nfsd_reset_write_verifier(struct nfsd_net *nn) write_sequnlock(&nn->writeverf_lock); } +/* + * Crank up a set of per-namespace resources for a new NFSD instance, + * including lockd, a duplicate reply cache, an open file cache + * instance, and a cache of NFSv4 state objects. + */ static int nfsd_startup_net(struct net *net, const struct cred *cred) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index caf6355b18fa..5777f40c7353 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -468,7 +468,8 @@ nfssvc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr) case nfs_ok: if (xdr_stream_encode_u32(xdr, resp->len) < 0) return false; - xdr_write_pages(xdr, &resp->page, 0, resp->len); + svcxdr_encode_opaque_pages(rqstp, xdr, &resp->page, 0, + resp->len); if (svc_encode_result_payload(rqstp, head->iov_len, resp->len) < 0) return false; break; @@ -491,8 +492,9 @@ nfssvc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr) return false; if (xdr_stream_encode_u32(xdr, resp->count) < 0) return false; - xdr_write_pages(xdr, resp->pages, rqstp->rq_res.page_base, - resp->count); + svcxdr_encode_opaque_pages(rqstp, xdr, resp->pages, + rqstp->rq_res.page_base, + resp->count); if (svc_encode_result_payload(rqstp, head->iov_len, resp->count) < 0) return false; break; @@ -511,7 +513,8 @@ nfssvc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr) return false; switch (resp->status) { case nfs_ok: - xdr_write_pages(xdr, dirlist->pages, 0, dirlist->len); + svcxdr_encode_opaque_pages(rqstp, xdr, dirlist->pages, 0, + dirlist->len); /* no more entries */ if (xdr_stream_encode_item_absent(xdr) < 0) return false; diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 72a906a053dc..2af74983f146 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -1581,6 +1581,265 @@ TRACE_EVENT(nfsd_cb_recall_any_done, ) ); +TRACE_EVENT(nfsd_ctl_unlock_ip, + TP_PROTO( + const struct net *net, + const char *address + ), + TP_ARGS(net, address), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __string(address, address) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __assign_str(address, address); + ), + TP_printk("address=%s", + __get_str(address) + ) +); + +TRACE_EVENT(nfsd_ctl_unlock_fs, + TP_PROTO( + const struct net *net, + const char *path + ), + TP_ARGS(net, path), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __string(path, path) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __assign_str(path, path); + ), + TP_printk("path=%s", + __get_str(path) + ) +); + +TRACE_EVENT(nfsd_ctl_filehandle, + TP_PROTO( + const struct net *net, + const char *domain, + const char *path, + int maxsize + ), + TP_ARGS(net, domain, path, maxsize), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, maxsize) + __string(domain, domain) + __string(path, path) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->maxsize = maxsize; + __assign_str(domain, domain); + __assign_str(path, path); + ), + TP_printk("domain=%s path=%s maxsize=%d", + __get_str(domain), __get_str(path), __entry->maxsize + ) +); + +TRACE_EVENT(nfsd_ctl_threads, + TP_PROTO( + const struct net *net, + int newthreads + ), + TP_ARGS(net, newthreads), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, newthreads) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->newthreads = newthreads; + ), + TP_printk("newthreads=%d", + __entry->newthreads + ) +); + +TRACE_EVENT(nfsd_ctl_pool_threads, + TP_PROTO( + const struct net *net, + int pool, + int nrthreads + ), + TP_ARGS(net, pool, nrthreads), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, pool) + __field(int, nrthreads) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->pool = pool; + __entry->nrthreads = nrthreads; + ), + TP_printk("pool=%d nrthreads=%d", + __entry->pool, __entry->nrthreads + ) +); + +TRACE_EVENT(nfsd_ctl_version, + TP_PROTO( + const struct net *net, + const char *mesg + ), + TP_ARGS(net, mesg), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __string(mesg, mesg) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __assign_str(mesg, mesg); + ), + TP_printk("%s", + __get_str(mesg) + ) +); + +TRACE_EVENT(nfsd_ctl_ports_addfd, + TP_PROTO( + const struct net *net, + int fd + ), + TP_ARGS(net, fd), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, fd) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->fd = fd; + ), + TP_printk("fd=%d", + __entry->fd + ) +); + +TRACE_EVENT(nfsd_ctl_ports_addxprt, + TP_PROTO( + const struct net *net, + const char *transport, + int port + ), + TP_ARGS(net, transport, port), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, port) + __string(transport, transport) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->port = port; + __assign_str(transport, transport); + ), + TP_printk("transport=%s port=%d", + __get_str(transport), __entry->port + ) +); + +TRACE_EVENT(nfsd_ctl_maxblksize, + TP_PROTO( + const struct net *net, + int bsize + ), + TP_ARGS(net, bsize), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, bsize) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->bsize = bsize; + ), + TP_printk("bsize=%d", + __entry->bsize + ) +); + +TRACE_EVENT(nfsd_ctl_maxconn, + TP_PROTO( + const struct net *net, + int maxconn + ), + TP_ARGS(net, maxconn), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, maxconn) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->maxconn = maxconn; + ), + TP_printk("maxconn=%d", + __entry->maxconn + ) +); + +TRACE_EVENT(nfsd_ctl_time, + TP_PROTO( + const struct net *net, + const char *name, + size_t namelen, + int time + ), + TP_ARGS(net, name, namelen, time), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, time) + __string_len(name, name, namelen) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->time = time; + __assign_str_len(name, name, namelen); + ), + TP_printk("file=%s time=%d\n", + __get_str(name), __entry->time + ) +); + +TRACE_EVENT(nfsd_ctl_recoverydir, + TP_PROTO( + const struct net *net, + const char *recdir + ), + TP_ARGS(net, recdir), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __string(recdir, recdir) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __assign_str(recdir, recdir); + ), + TP_printk("recdir=%s", + __get_str(recdir) + ) +); + +TRACE_EVENT(nfsd_end_grace, + TP_PROTO( + const struct net *net + ), + TP_ARGS(net), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + ), + TP_printk("nn=%d", __entry->netns_ino + ) +); + #endif /* _NFSD_TRACE_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index db67f8e19344..8a2321d19194 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -388,7 +388,9 @@ nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap) iap->ia_mode &= ~S_ISGID; } else { /* set ATTR_KILL_* bits and let VFS handle it */ - iap->ia_valid |= (ATTR_KILL_SUID | ATTR_KILL_SGID); + iap->ia_valid |= ATTR_KILL_SUID; + iap->ia_valid |= + setattr_should_drop_sgid(&nop_mnt_idmap, inode); } } } @@ -936,7 +938,7 @@ nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, int may_flags, /* * Grab and keep cached pages associated with a file in the svc_rqst - * so that they can be passed to the network sendmsg/sendpage routines + * so that they can be passed to the network sendmsg routines * directly. They will be released after the sending has completed. * * Return values: Number of bytes consumed, or -EIO if there are no @@ -1001,6 +1003,18 @@ static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp, } } +/** + * nfsd_splice_read - Perform a VFS read using a splice pipe + * @rqstp: RPC transaction context + * @fhp: file handle of file to be read + * @file: opened struct file of file to be read + * @offset: starting byte offset + * @count: IN: requested number of bytes; OUT: number of bytes read + * @eof: OUT: set non-zero if operation reached the end of the file + * + * Returns nfs_ok on success, otherwise an nfserr stat value is + * returned. + */ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, unsigned long *count, u32 *eof) @@ -1014,22 +1028,50 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, ssize_t host_err; trace_nfsd_read_splice(rqstp, fhp, offset, *count); - rqstp->rq_next_page = rqstp->rq_respages + 1; host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } -__be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, - struct file *file, loff_t offset, - struct kvec *vec, int vlen, unsigned long *count, - u32 *eof) +/** + * nfsd_iter_read - Perform a VFS read using an iterator + * @rqstp: RPC transaction context + * @fhp: file handle of file to be read + * @file: opened struct file of file to be read + * @offset: starting byte offset + * @count: IN: requested number of bytes; OUT: number of bytes read + * @base: offset in first page of read buffer + * @eof: OUT: set non-zero if operation reached the end of the file + * + * Some filesystems or situations cannot use nfsd_splice_read. This + * function is the slightly less-performant fallback for those cases. + * + * Returns nfs_ok on success, otherwise an nfserr stat value is + * returned. + */ +__be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct file *file, loff_t offset, unsigned long *count, + unsigned int base, u32 *eof) { + unsigned long v, total; struct iov_iter iter; loff_t ppos = offset; + struct page *page; ssize_t host_err; + v = 0; + total = *count; + while (total) { + page = *(rqstp->rq_next_page++); + rqstp->rq_vec[v].iov_base = page_address(page) + base; + rqstp->rq_vec[v].iov_len = min_t(size_t, total, PAGE_SIZE - base); + total -= rqstp->rq_vec[v].iov_len; + ++v; + base = 0; + } + WARN_ON_ONCE(v > ARRAY_SIZE(rqstp->rq_vec)); + trace_nfsd_read_vector(rqstp, fhp, offset, *count); - iov_iter_kvec(&iter, ITER_DEST, vec, vlen, *count); + iov_iter_kvec(&iter, ITER_DEST, rqstp->rq_vec, v, *count); host_err = vfs_iter_read(file, &iter, &ppos, 0); return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } @@ -1159,14 +1201,24 @@ out_nfserr: return nfserr; } -/* - * Read data from a file. count must contain the requested read count - * on entry. On return, *count contains the number of bytes actually read. +/** + * nfsd_read - Read data from a file + * @rqstp: RPC transaction context + * @fhp: file handle of file to be read + * @offset: starting byte offset + * @count: IN: requested number of bytes; OUT: number of bytes read + * @eof: OUT: set non-zero if operation reached the end of the file + * + * The caller must verify that there is enough space in @rqstp.rq_res + * to perform this operation. + * * N.B. After this call fhp needs an fh_put + * + * Returns nfs_ok on success, otherwise an nfserr stat value is + * returned. */ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, - loff_t offset, struct kvec *vec, int vlen, unsigned long *count, - u32 *eof) + loff_t offset, unsigned long *count, u32 *eof) { struct nfsd_file *nf; struct file *file; @@ -1181,12 +1233,10 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags)) err = nfsd_splice_read(rqstp, fhp, file, offset, count, eof); else - err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count, eof); + err = nfsd_iter_read(rqstp, fhp, file, offset, count, 0, eof); nfsd_file_put(nf); - trace_nfsd_read_done(rqstp, fhp, offset, *count); - return err; } diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 43fb57a301d3..a6890ea7b765 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -110,13 +110,12 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, unsigned long *count, u32 *eof); -__be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, +__be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, - struct kvec *vec, int vlen, - unsigned long *count, + unsigned long *count, unsigned int base, u32 *eof); -__be32 nfsd_read(struct svc_rqst *, struct svc_fh *, - loff_t, struct kvec *, int, unsigned long *, +__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, + loff_t offset, unsigned long *count, u32 *eof); __be32 nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t, struct kvec *, int, unsigned long *, diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index a265d391ffe9..a9eb3487efb2 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -140,7 +140,7 @@ const struct file_operations nilfs_file_operations = { .open = generic_file_open, /* .release = nilfs_release_file, */ .fsync = nilfs_sync_file, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, }; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 9ba4933087af..0ef8c71bde8e 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1299,14 +1299,11 @@ nilfs_mount(struct file_system_type *fs_type, int flags, { struct nilfs_super_data sd; struct super_block *s; - fmode_t mode = FMODE_READ | FMODE_EXCL; struct dentry *root_dentry; int err, s_new = false; - if (!(flags & SB_RDONLY)) - mode |= FMODE_WRITE; - - sd.bdev = blkdev_get_by_path(dev_name, mode, fs_type); + sd.bdev = blkdev_get_by_path(dev_name, sb_open_mode(flags), fs_type, + NULL); if (IS_ERR(sd.bdev)) return ERR_CAST(sd.bdev); @@ -1340,7 +1337,6 @@ nilfs_mount(struct file_system_type *fs_type, int flags, s_new = true; /* New superblock instance created */ - s->s_mode = mode; snprintf(s->s_id, sizeof(s->s_id), "%pg", sd.bdev); sb_set_blocksize(s, block_size(sd.bdev)); @@ -1378,7 +1374,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, } if (!s_new) - blkdev_put(sd.bdev, mode); + blkdev_put(sd.bdev, fs_type); return root_dentry; @@ -1387,7 +1383,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, failed: if (!s_new) - blkdev_put(sd.bdev, mode); + blkdev_put(sd.bdev, fs_type); return ERR_PTR(err); } diff --git a/fs/no-block.c b/fs/no-block.c deleted file mode 100644 index 481c0f0ab4bd..000000000000 --- a/fs/no-block.c +++ /dev/null @@ -1,19 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* no-block.c: implementation of routines required for non-BLOCK configuration - * - * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - */ - -#include <linux/kernel.h> -#include <linux/fs.h> - -static int no_blkdev_open(struct inode * inode, struct file * filp) -{ - return -ENODEV; -} - -const struct file_operations def_blk_fops = { - .open = no_blkdev_open, - .llseek = noop_llseek, -}; diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 29bdd99b29fa..9dac7f6e72d2 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -380,7 +380,7 @@ static int fanotify_encode_fh_len(struct inode *inode) if (!inode) return 0; - exportfs_encode_inode_fh(inode, NULL, &dwords, NULL); + exportfs_encode_fid(inode, NULL, &dwords); fh_len = dwords << 2; /* @@ -443,9 +443,9 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode, } dwords = fh_len >> 2; - type = exportfs_encode_inode_fh(inode, buf, &dwords, NULL); + type = exportfs_encode_fid(inode, buf, &dwords); err = -EINVAL; - if (!type || type == FILEID_INVALID || fh_len != dwords << 2) + if (type <= 0 || type == FILEID_INVALID || fh_len != dwords << 2) goto out_err; fh->type = type; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 22fb1cf7e1fc..f69c451018e3 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1591,11 +1591,10 @@ static int fanotify_test_fid(struct dentry *dentry) * We need to make sure that the file system supports at least * encoding a file handle so user can use name_to_handle_at() to * compare fid returned with event to the file handle of watched - * objects. However, name_to_handle_at() requires that the - * filesystem also supports decoding file handles. + * objects. However, even the relaxed AT_HANDLE_FID flag requires + * at least empty export_operations for ecoding unique file ids. */ - if (!dentry->d_sb->s_export_op || - !dentry->d_sb->s_export_op->fh_to_dentry) + if (!dentry->d_sb->s_export_op) return -EOPNOTSUPP; return 0; @@ -1624,6 +1623,20 @@ static int fanotify_events_supported(struct fsnotify_group *group, return -EINVAL; /* + * mount and sb marks are not allowed on kernel internal pseudo fs, + * like pipe_mnt, because that would subscribe to events on all the + * anonynous pipes in the system. + * + * SB_NOUSER covers all of the internal pseudo fs whose objects are not + * exposed to user's mount namespace, but there are other SB_KERNMOUNT + * fs, like nsfs, debugfs, for which the value of allowing sb and mount + * mark is questionable. For now we leave them alone. + */ + if (mark_type != FAN_MARK_INODE && + path->mnt->mnt_sb->s_flags & SB_NOUSER) + return -EINVAL; + + /* * We shouldn't have allowed setting dirent events and the directory * flags FAN_ONDIR and FAN_EVENT_ON_CHILD in mask of non-dir inode, * but because we always allowed it, error only when using new APIs. diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 55081ae3a6ec..5c430736ec12 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -50,7 +50,7 @@ static void show_mark_fhandle(struct seq_file *m, struct inode *inode) f.handle.handle_bytes = sizeof(f.pad); size = f.handle.handle_bytes >> 2; - ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, NULL); + ret = exportfs_encode_fid(inode, (struct fid *)f.handle.f_handle, &size); if ((ret == FILEID_INVALID) || (ret < 0)) { WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret); return; diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index e8aeba124a95..4e158bce4192 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -526,7 +526,7 @@ err_out: * * Return 0 on success and -errno on error. * - * Based on ntfs_read_block() and __block_write_full_page(). + * Based on ntfs_read_block() and __block_write_full_folio(). */ static int ntfs_write_block(struct page *page, struct writeback_control *wbc) { diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index a3865bc4a0c6..f79408f9127a 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -2491,7 +2491,7 @@ conv_err_out: * byte offset @ofs inside the attribute with the constant byte @val. * * This function is effectively like memset() applied to an ntfs attribute. - * Note thie function actually only operates on the page cache pages belonging + * Note this function actually only operates on the page cache pages belonging * to the ntfs attribute and it marks them dirty after doing the memset(). * Thus it relies on the vm dirty page write code paths to cause the modified * pages to be written to the mft record/disk. diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c index f9cb180b6f6b..761aaa0195d6 100644 --- a/fs/ntfs/compress.c +++ b/fs/ntfs/compress.c @@ -161,7 +161,7 @@ static int ntfs_decompress(struct page *dest_pages[], int completed_pages[], */ u8 *cb_end = cb_start + cb_size; /* End of cb. */ u8 *cb = cb_start; /* Current position in cb. */ - u8 *cb_sb_start = cb; /* Beginning of the current sb in the cb. */ + u8 *cb_sb_start; /* Beginning of the current sb in the cb. */ u8 *cb_sb_end; /* End of current sb / beginning of next sb. */ /* Variables for uncompressed data / destination. */ diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index c481b14e4fd9..cbc545999cfe 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1911,11 +1911,9 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_lock(vi); /* We can write back this queue in page reclaim. */ - current->backing_dev_info = inode_to_bdi(vi); err = ntfs_prepare_file_for_write(iocb, from); if (iov_iter_count(from) && !err) written = ntfs_perform_write(file, from, iocb->ki_pos); - current->backing_dev_info = NULL; inode_unlock(vi); iocb->ki_pos += written; if (likely(written > 0)) @@ -1992,7 +1990,7 @@ const struct file_operations ntfs_file_ops = { #endif /* NTFS_RW */ .mmap = generic_file_mmap, .open = ntfs_file_open, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, }; const struct inode_operations ntfs_file_inode_ops = { diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 48030899dc6e..0155f106ec34 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -1955,36 +1955,38 @@ undo_alloc: "attribute.%s", es); NVolSetErrors(vol); } - a = ctx->attr; + if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) { ntfs_error(vol->sb, "Failed to truncate mft data attribute " "runlist.%s", es); NVolSetErrors(vol); } - if (mp_rebuilt && !IS_ERR(ctx->mrec)) { - if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( + if (ctx) { + a = ctx->attr; + if (mp_rebuilt && !IS_ERR(ctx->mrec)) { + if (ntfs_mapping_pairs_build(vol, (u8 *)a + le16_to_cpu( a->data.non_resident.mapping_pairs_offset), old_alen - le16_to_cpu( - a->data.non_resident.mapping_pairs_offset), + a->data.non_resident.mapping_pairs_offset), rl2, ll, -1, NULL)) { - ntfs_error(vol->sb, "Failed to restore mapping pairs " + ntfs_error(vol->sb, "Failed to restore mapping pairs " "array.%s", es); - NVolSetErrors(vol); - } - if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) { - ntfs_error(vol->sb, "Failed to restore attribute " + NVolSetErrors(vol); + } + if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) { + ntfs_error(vol->sb, "Failed to restore attribute " "record.%s", es); + NVolSetErrors(vol); + } + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + } else if (IS_ERR(ctx->mrec)) { + ntfs_error(vol->sb, "Failed to restore attribute search " + "context.%s", es); NVolSetErrors(vol); } - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - } else if (IS_ERR(ctx->mrec)) { - ntfs_error(vol->sb, "Failed to restore attribute search " - "context.%s", es); - NVolSetErrors(vol); - } - if (ctx) ntfs_attr_put_search_ctx(ctx); + } if (!IS_ERR(mrec)) unmap_mft_record(mft_ni); up_write(&mft_ni->runlist.lock); diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 2643a08182e1..56a7d5bd33e4 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -1620,7 +1620,7 @@ read_partial_attrdef_page: memcpy((u8*)vol->attrdef + (index++ << PAGE_SHIFT), page_address(page), size); ntfs_unmap_page(page); - }; + } if (size == PAGE_SIZE) { size = i_size & ~PAGE_MASK; if (size) @@ -1689,7 +1689,7 @@ read_partial_upcase_page: memcpy((char*)vol->upcase + (index++ << PAGE_SHIFT), page_address(page), size); ntfs_unmap_page(page); - }; + } if (size == PAGE_SIZE) { size = i_size & ~PAGE_MASK; if (size) diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 4c653945ef08..1d6c824246c4 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -744,6 +744,35 @@ static ssize_t ntfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) return generic_file_read_iter(iocb, iter); } +static ssize_t ntfs_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct inode *inode = in->f_mapping->host; + struct ntfs_inode *ni = ntfs_i(inode); + + if (is_encrypted(ni)) { + ntfs_inode_warn(inode, "encrypted i/o not supported"); + return -EOPNOTSUPP; + } + +#ifndef CONFIG_NTFS3_LZX_XPRESS + if (ni->ni_flags & NI_FLAG_COMPRESSED_MASK) { + ntfs_inode_warn( + inode, + "activate CONFIG_NTFS3_LZX_XPRESS to read external compressed files"); + return -EOPNOTSUPP; + } +#endif + + if (is_dedup(ni)) { + ntfs_inode_warn(inode, "read deduplicated not supported"); + return -EOPNOTSUPP; + } + + return filemap_splice_read(in, ppos, pipe, len, flags); +} + /* * ntfs_get_frame_pages * @@ -820,7 +849,6 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) if (!pages) return -ENOMEM; - current->backing_dev_info = inode_to_bdi(inode); err = file_remove_privs(file); if (err) goto out; @@ -993,8 +1021,6 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) out: kfree(pages); - current->backing_dev_info = NULL; - if (err < 0) return err; @@ -1159,7 +1185,7 @@ const struct file_operations ntfs_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = ntfs_compat_ioctl, #endif - .splice_read = generic_file_splice_read, + .splice_read = ntfs_file_splice_read, .mmap = ntfs_file_mmap, .open = ntfs_file_open, .fsync = generic_file_fsync, diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 60b97c92e2b2..21472e3ed182 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1503,7 +1503,7 @@ static void o2hb_region_release(struct config_item *item) } if (reg->hr_bdev) - blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(reg->hr_bdev, NULL); kfree(reg->hr_slots); @@ -1786,7 +1786,8 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, goto out2; reg->hr_bdev = blkdev_get_by_dev(f.file->f_mapping->host->i_rdev, - FMODE_WRITE | FMODE_READ, NULL); + BLK_OPEN_WRITE | BLK_OPEN_READ, NULL, + NULL); if (IS_ERR(reg->hr_bdev)) { ret = PTR_ERR(reg->hr_bdev); reg->hr_bdev = NULL; @@ -1893,7 +1894,7 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, out3: if (ret < 0) { - blkdev_put(reg->hr_bdev, FMODE_READ | FMODE_WRITE); + blkdev_put(reg->hr_bdev, NULL); reg->hr_bdev = NULL; } out2: diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index aecbd712a00c..960080753d3b 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -930,19 +930,22 @@ out: } static void o2net_sendpage(struct o2net_sock_container *sc, - void *kmalloced_virt, - size_t size) + void *virt, size_t size) { struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + struct msghdr msg = {}; + struct bio_vec bv; ssize_t ret; + bvec_set_virt(&bv, virt, size); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, size); + while (1) { + msg.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES; mutex_lock(&sc->sc_send_lock); - ret = sc->sc_sock->ops->sendpage(sc->sc_sock, - virt_to_page(kmalloced_virt), - offset_in_page(kmalloced_virt), - size, MSG_DONTWAIT); + ret = sock_sendmsg(sc->sc_sock, &msg); mutex_unlock(&sc->sc_send_lock); + if (ret == size) break; if (ret == (ssize_t)-EAGAIN) { @@ -2087,18 +2090,24 @@ void o2net_stop_listening(struct o2nm_node *node) int o2net_init(void) { + struct folio *folio; + void *p; unsigned long i; o2quo_init(); - o2net_debugfs_init(); - o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL); - o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); - o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); - if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) + folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, 0); + if (!folio) goto out; + p = folio_address(folio); + o2net_hand = p; + p += sizeof(struct o2net_handshake); + o2net_keep_req = p; + p += sizeof(struct o2net_msg); + o2net_keep_resp = p; + o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION); o2net_hand->connector_id = cpu_to_be64(1); @@ -2124,9 +2133,6 @@ int o2net_init(void) return 0; out: - kfree(o2net_hand); - kfree(o2net_keep_req); - kfree(o2net_keep_resp); o2net_debugfs_exit(); o2quo_exit(); return -ENOMEM; @@ -2135,8 +2141,6 @@ out: void o2net_exit(void) { o2quo_exit(); - kfree(o2net_hand); - kfree(o2net_keep_req); - kfree(o2net_keep_resp); o2net_debugfs_exit(); + folio_put(virt_to_folio(o2net_hand)); } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index b173c36bcab3..91a194596552 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2558,7 +2558,7 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, * * Take and drop the meta data lock to update inode fields * like i_size. This allows the checks down below - * generic_file_read_iter() a chance of actually working. + * copy_splice_read() a chance of actually working. */ ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level, !nowait); @@ -2587,6 +2587,43 @@ bail: return ret; } +static ssize_t ocfs2_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct inode *inode = file_inode(in); + ssize_t ret = 0; + int lock_level = 0; + + trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + in->f_path.dentry->d_name.len, + in->f_path.dentry->d_name.name, + flags); + + /* + * We're fine letting folks race truncates and extending writes with + * read across the cluster, just like they can locally. Hence no + * rw_lock during read. + * + * Take and drop the meta data lock to update inode fields like i_size. + * This allows the checks down below filemap_splice_read() a chance of + * actually working. + */ + ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level, 1); + if (ret < 0) { + if (ret != -EAGAIN) + mlog_errno(ret); + goto bail; + } + ocfs2_inode_unlock(inode, lock_level); + + ret = filemap_splice_read(in, ppos, pipe, len, flags); + trace_filemap_splice_read_ret(ret); +bail: + return ret; +} + /* Refer generic_file_llseek_unlocked() */ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) { @@ -2750,7 +2787,7 @@ const struct file_operations ocfs2_fops = { #endif .lock = ocfs2_lock, .flock = ocfs2_flock, - .splice_read = generic_file_splice_read, + .splice_read = ocfs2_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, .remap_file_range = ocfs2_remap_file_range, @@ -2796,7 +2833,7 @@ const struct file_operations ocfs2_fops_no_plocks = { .compat_ioctl = ocfs2_compat_ioctl, #endif .flock = ocfs2_flock, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, .remap_file_range = ocfs2_remap_file_range, diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index c4426d12a2ad..c803c10dd97e 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -973,7 +973,7 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, la_start_blk = ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(la->la_bm_off)); bitmap = la->la_bitmap; - start = count = bit_off = 0; + start = count = 0; left = le32_to_cpu(alloc->id1.bitmap1.i_total); while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start)) diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index dc4bce1649c1..ac4fd1d5b128 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -1315,10 +1315,10 @@ DEFINE_OCFS2_FILE_OPS(ocfs2_sync_file); DEFINE_OCFS2_FILE_OPS(ocfs2_file_write_iter); -DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_write); - DEFINE_OCFS2_FILE_OPS(ocfs2_file_read_iter); +DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_read); + DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_truncate_file); DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_truncate_file_error); @@ -1470,6 +1470,7 @@ TRACE_EVENT(ocfs2_prepare_inode_for_write, ); DEFINE_OCFS2_INT_EVENT(generic_file_read_iter_ret); +DEFINE_OCFS2_INT_EVENT(filemap_splice_read_ret); /* End of trace events for fs/ocfs2/file.c. */ diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 5022b3e9bfcd..dfaae1e52412 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -811,7 +811,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type) struct ocfs2_quota_chunk *chunk; struct ocfs2_local_disk_chunk *dchunk; int mark_clean = 1, len; - int status; + int status = 0; iput(oinfo->dqi_gqinode); ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock); @@ -853,17 +853,14 @@ static int ocfs2_local_free_info(struct super_block *sb, int type) oinfo->dqi_libh, olq_update_info, info); - if (status < 0) { + if (status < 0) mlog_errno(status); - goto out; - } - out: ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1); brelse(oinfo->dqi_libh); brelse(oinfo->dqi_lqi_bh); kfree(oinfo); - return 0; + return status; } static void olq_set_dquot(struct buffer_head *bh, void *private) diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 0101f1f87b56..de8f57ee39ec 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -334,7 +334,7 @@ const struct file_operations omfs_file_operations = { .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .fsync = generic_file_fsync, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, }; static int omfs_setattr(struct mnt_idmap *idmap, diff --git a/fs/open.c b/fs/open.c index 4478adcc4f3a..0c55c8e7f837 100644 --- a/fs/open.c +++ b/fs/open.c @@ -700,10 +700,7 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) return do_fchmodat(AT_FDCWD, filename, mode); } -/** - * setattr_vfsuid - check and set ia_fsuid attribute - * @kuid: new inode owner - * +/* * Check whether @kuid is valid and if so generate and set vfsuid_t in * ia_vfsuid. * @@ -718,10 +715,7 @@ static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid) return true; } -/** - * setattr_vfsgid - check and set ia_fsgid attribute - * @kgid: new inode owner - * +/* * Check whether @kgid is valid and if so generate and set vfsgid_t in * ia_vfsgid. * @@ -969,6 +963,11 @@ static int do_dentry_open(struct file *f, } } + /* + * Once we return a file with FMODE_OPENED, __fput() will call + * fsnotify_close(), so we need fsnotify_open() here for symmetry. + */ + fsnotify_open(f); return 0; cleanup_all: @@ -989,7 +988,6 @@ cleanup_file: * @file: file pointer * @dentry: pointer to dentry * @open: open callback - * @opened: state of open * * This can be used to finish opening a file passed to i_op->atomic_open(). * @@ -1043,7 +1041,6 @@ EXPORT_SYMBOL(file_path); * vfs_open - open the file at the given path * @path: path to open * @file: newly allocated file with f_flag initialized - * @cred: credentials to use */ int vfs_open(const struct path *path, struct file *file) { @@ -1116,23 +1113,77 @@ struct file *dentry_create(const struct path *path, int flags, umode_t mode, } EXPORT_SYMBOL(dentry_create); -struct file *open_with_fake_path(const struct path *path, int flags, +/** + * kernel_file_open - open a file for kernel internal use + * @path: path of the file to open + * @flags: open flags + * @inode: the inode + * @cred: credentials for open + * + * Open a file for use by in-kernel consumers. The file is not accounted + * against nr_files and must not be installed into the file descriptor + * table. + * + * Return: Opened file on success, an error pointer on failure. + */ +struct file *kernel_file_open(const struct path *path, int flags, struct inode *inode, const struct cred *cred) { - struct file *f = alloc_empty_file_noaccount(flags, cred); - if (!IS_ERR(f)) { - int error; + struct file *f; + int error; - f->f_path = *path; - error = do_dentry_open(f, inode, NULL); - if (error) { - fput(f); - f = ERR_PTR(error); - } + f = alloc_empty_file_noaccount(flags, cred); + if (IS_ERR(f)) + return f; + + f->f_path = *path; + error = do_dentry_open(f, inode, NULL); + if (error) { + fput(f); + f = ERR_PTR(error); + } + return f; +} +EXPORT_SYMBOL_GPL(kernel_file_open); + +/** + * backing_file_open - open a backing file for kernel internal use + * @path: path of the file to open + * @flags: open flags + * @path: path of the backing file + * @cred: credentials for open + * + * Open a backing file for a stackable filesystem (e.g., overlayfs). + * @path may be on the stackable filesystem and backing inode on the + * underlying filesystem. In this case, we want to be able to return + * the @real_path of the backing inode. This is done by embedding the + * returned file into a container structure that also stores the path of + * the backing inode on the underlying filesystem, which can be + * retrieved using backing_file_real_path(). + */ +struct file *backing_file_open(const struct path *path, int flags, + const struct path *real_path, + const struct cred *cred) +{ + struct file *f; + int error; + + f = alloc_empty_backing_file(flags, cred); + if (IS_ERR(f)) + return f; + + f->f_path = *path; + path_get(real_path); + *backing_file_real_path(f) = *real_path; + error = do_dentry_open(f, d_inode(real_path->dentry), NULL); + if (error) { + fput(f); + f = ERR_PTR(error); } + return f; } -EXPORT_SYMBOL(open_with_fake_path); +EXPORT_SYMBOL_GPL(backing_file_open); #define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE)) #define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC) @@ -1156,7 +1207,7 @@ inline struct open_how build_open_how(int flags, umode_t mode) inline int build_open_flags(const struct open_how *how, struct open_flags *op) { u64 flags = how->flags; - u64 strip = FMODE_NONOTIFY | O_CLOEXEC; + u64 strip = __FMODE_NONOTIFY | O_CLOEXEC; int lookup_flags = 0; int acc_mode = ACC_MODE(flags); @@ -1358,7 +1409,6 @@ static long do_sys_openat2(int dfd, const char __user *filename, put_unused_fd(fd); fd = PTR_ERR(f); } else { - fsnotify_open(f); fd_install(fd, f); } } diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 1a4301a38aa7..d68372241b30 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -337,6 +337,26 @@ out: return ret; } +static ssize_t orangefs_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct inode *inode = file_inode(in); + ssize_t ret; + + orangefs_stats.reads++; + + down_read(&inode->i_rwsem); + ret = orangefs_revalidate_mapping(inode); + if (ret) + goto out; + + ret = filemap_splice_read(in, ppos, pipe, len, flags); +out: + up_read(&inode->i_rwsem); + return ret; +} + static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) { @@ -556,7 +576,7 @@ const struct file_operations orangefs_file_operations = { .lock = orangefs_lock, .mmap = orangefs_file_mmap, .open = generic_file_open, - .splice_read = generic_file_splice_read, + .splice_read = orangefs_file_splice_read, .splice_write = iter_file_splice_write, .flush = orangefs_flush, .release = orangefs_file_release, diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile index 9164c585eb2f..4e173d56b11f 100644 --- a/fs/overlayfs/Makefile +++ b/fs/overlayfs/Makefile @@ -6,4 +6,4 @@ obj-$(CONFIG_OVERLAY_FS) += overlay.o overlay-objs := super.o namei.o util.o inode.o file.o dir.o readdir.o \ - copy_up.o export.o + copy_up.o export.o params.o diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index f658cc8ea492..568f743a5584 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -575,6 +575,7 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) /* Restore timestamps on parent (best effort) */ ovl_set_timestamps(ofs, upperdir, &c->pstat); ovl_dentry_set_upper_alias(c->dentry); + ovl_dentry_update_reval(c->dentry, upper); } } inode_unlock(udir); @@ -894,6 +895,7 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c) inode_unlock(udir); ovl_dentry_set_upper_alias(c->dentry); + ovl_dentry_update_reval(c->dentry, ovl_dentry_upper(c->dentry)); } out: @@ -1071,6 +1073,15 @@ static int ovl_copy_up_flags(struct dentry *dentry, int flags) if (WARN_ON(disconnected && d_is_dir(dentry))) return -EIO; + /* + * We may not need lowerdata if we are only doing metacopy up, but it is + * not very important to optimize this case, so do lazy lowerdata lookup + * before any copy up, so we can do it before taking ovl_inode_lock(). + */ + err = ovl_maybe_lookup_lowerdata(dentry); + if (err) + return err; + old_cred = ovl_override_creds(dentry->d_sb); while (!err) { struct dentry *next; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index fc25fb95d5fc..033fc0458a3d 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -83,7 +83,7 @@ static struct dentry *ovl_whiteout(struct ovl_fs *ofs) ofs->whiteout = whiteout; } - if (ofs->share_whiteout) { + if (!ofs->no_shared_whiteout) { whiteout = ovl_lookup_temp(ofs, workdir); if (IS_ERR(whiteout)) goto out; @@ -95,7 +95,7 @@ static struct dentry *ovl_whiteout(struct ovl_fs *ofs) if (err != -EMLINK) { pr_warn("Failed to link whiteout - disabling whiteout inode sharing(nlink=%u, err=%i)\n", ofs->whiteout->d_inode->i_nlink, err); - ofs->share_whiteout = false; + ofs->no_shared_whiteout = true; } dput(whiteout); } @@ -269,8 +269,7 @@ static int ovl_instantiate(struct dentry *dentry, struct inode *inode, ovl_dir_modified(dentry->d_parent, false); ovl_dentry_set_upper_alias(dentry); - ovl_dentry_update_reval(dentry, newdentry, - DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE); + ovl_dentry_init_reval(dentry, newdentry, NULL); if (!hardlink) { /* @@ -953,7 +952,7 @@ static bool ovl_type_merge_or_lower(struct dentry *dentry) static bool ovl_can_move(struct dentry *dentry) { - return ovl_redirect_dir(dentry->d_sb) || + return ovl_redirect_dir(OVL_FS(dentry->d_sb)) || !d_is_dir(dentry) || !ovl_type_merge_or_lower(dentry); } diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index defd4e231ad2..35680b6e175b 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -80,7 +80,7 @@ static int ovl_connectable_layer(struct dentry *dentry) /* We can get overlay root from root of any layer */ if (dentry == dentry->d_sb->s_root) - return oe->numlower; + return ovl_numlower(oe); /* * If it's an unindexed merge dir, then it's not connectable with any @@ -91,7 +91,7 @@ static int ovl_connectable_layer(struct dentry *dentry) return 0; /* We can get upper/overlay path from indexed/lower dentry */ - return oe->lowerstack[0].layer->idx; + return ovl_lowerstack(oe)->layer->idx; } /* @@ -105,6 +105,7 @@ static int ovl_connectable_layer(struct dentry *dentry) static int ovl_connect_layer(struct dentry *dentry) { struct dentry *next, *parent = NULL; + struct ovl_entry *oe = OVL_E(dentry); int origin_layer; int err = 0; @@ -112,7 +113,7 @@ static int ovl_connect_layer(struct dentry *dentry) WARN_ON(!ovl_dentry_lower(dentry))) return -EIO; - origin_layer = OVL_E(dentry)->lowerstack[0].layer->idx; + origin_layer = ovl_lowerstack(oe)->layer->idx; if (ovl_dentry_test_flag(OVL_E_CONNECTED, dentry)) return origin_layer; @@ -285,21 +286,29 @@ static struct dentry *ovl_obtain_alias(struct super_block *sb, struct dentry *lower = lowerpath ? lowerpath->dentry : NULL; struct dentry *upper = upper_alias ?: index; struct dentry *dentry; - struct inode *inode; + struct inode *inode = NULL; struct ovl_entry *oe; struct ovl_inode_params oip = { - .lowerpath = lowerpath, .index = index, - .numlower = !!lower }; /* We get overlay directory dentries with ovl_lookup_real() */ if (d_is_dir(upper ?: lower)) return ERR_PTR(-EIO); + oe = ovl_alloc_entry(!!lower); + if (!oe) + return ERR_PTR(-ENOMEM); + oip.upperdentry = dget(upper); + if (lower) { + ovl_lowerstack(oe)->dentry = dget(lower); + ovl_lowerstack(oe)->layer = lowerpath->layer; + } + oip.oe = oe; inode = ovl_get_inode(sb, &oip); if (IS_ERR(inode)) { + ovl_free_entry(oe); dput(upper); return ERR_CAST(inode); } @@ -314,20 +323,11 @@ static struct dentry *ovl_obtain_alias(struct super_block *sb, dentry = d_alloc_anon(inode->i_sb); if (unlikely(!dentry)) goto nomem; - oe = ovl_alloc_entry(lower ? 1 : 0); - if (!oe) - goto nomem; - if (lower) { - oe->lowerstack->dentry = dget(lower); - oe->lowerstack->layer = lowerpath->layer; - } - dentry->d_fsdata = oe; if (upper_alias) ovl_dentry_set_upper_alias(dentry); - ovl_dentry_update_reval(dentry, upper, - DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE); + ovl_dentry_init_reval(dentry, upper, OVL_I_E(inode)); return d_instantiate_anon(dentry, inode); @@ -342,15 +342,16 @@ out_iput: /* Get the upper or lower dentry in stack whose on layer @idx */ static struct dentry *ovl_dentry_real_at(struct dentry *dentry, int idx) { - struct ovl_entry *oe = dentry->d_fsdata; + struct ovl_entry *oe = OVL_E(dentry); + struct ovl_path *lowerstack = ovl_lowerstack(oe); int i; if (!idx) return ovl_dentry_upper(dentry); - for (i = 0; i < oe->numlower; i++) { - if (oe->lowerstack[i].layer->idx == idx) - return oe->lowerstack[i].dentry; + for (i = 0; i < ovl_numlower(oe); i++) { + if (lowerstack[i].layer->idx == idx) + return lowerstack[i].dentry; } return NULL; diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 7c04f033aadd..21245b00722a 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -34,8 +34,8 @@ static char ovl_whatisit(struct inode *inode, struct inode *realinode) return 'm'; } -/* No atime modification nor notify on underlying */ -#define OVL_OPEN_FLAGS (O_NOATIME | FMODE_NONOTIFY) +/* No atime modification on underlying */ +#define OVL_OPEN_FLAGS (O_NOATIME) static struct file *ovl_open_realfile(const struct file *file, const struct path *realpath) @@ -61,8 +61,8 @@ static struct file *ovl_open_realfile(const struct file *file, if (!inode_owner_or_capable(real_idmap, realinode)) flags &= ~O_NOATIME; - realfile = open_with_fake_path(&file->f_path, flags, realinode, - current_cred()); + realfile = backing_file_open(&file->f_path, flags, realpath, + current_cred()); } revert_creds(old_cred); @@ -107,14 +107,23 @@ static int ovl_real_fdget_meta(const struct file *file, struct fd *real, { struct dentry *dentry = file_dentry(file); struct path realpath; + int err; real->flags = 0; real->file = file->private_data; - if (allow_meta) + if (allow_meta) { ovl_path_real(dentry, &realpath); - else + } else { + /* lazy lookup of lowerdata */ + err = ovl_maybe_lookup_lowerdata(dentry); + if (err) + return err; + ovl_path_realdata(dentry, &realpath); + } + if (!realpath.dentry) + return -EIO; /* Has it been copied up since we'd opened it? */ if (unlikely(file_inode(real->file) != d_inode(realpath.dentry))) { @@ -150,6 +159,11 @@ static int ovl_open(struct inode *inode, struct file *file) struct path realpath; int err; + /* lazy lookup of lowerdata */ + err = ovl_maybe_lookup_lowerdata(dentry); + if (err) + return err; + err = ovl_maybe_copy_up(dentry, file->f_flags); if (err) return err; @@ -158,6 +172,9 @@ static int ovl_open(struct inode *inode, struct file *file) file->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); ovl_path_realdata(dentry, &realpath); + if (!realpath.dentry) + return -EIO; + realfile = ovl_open_realfile(file, &realpath); if (IS_ERR(realfile)) return PTR_ERR(realfile); @@ -419,6 +436,27 @@ out_unlock: return ret; } +static ssize_t ovl_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + const struct cred *old_cred; + struct fd real; + ssize_t ret; + + ret = ovl_real_fdget(in, &real); + if (ret) + return ret; + + old_cred = ovl_override_creds(file_inode(in)->i_sb); + ret = vfs_splice_read(real.file, ppos, pipe, len, flags); + revert_creds(old_cred); + ovl_file_accessed(in); + + fdput(real); + return ret; +} + /* * Calling iter_file_splice_write() directly from overlay's f_op may deadlock * due to lock order inversion between pipe->mutex in iter_file_splice_write() @@ -695,7 +733,7 @@ const struct file_operations ovl_file_operations = { .fallocate = ovl_fallocate, .fadvise = ovl_fadvise, .flush = ovl_flush, - .splice_read = generic_file_splice_read, + .splice_read = ovl_splice_read, .splice_write = ovl_splice_write, .copy_file_range = ovl_copy_file_range, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 541cf3717fc2..a63e57447be9 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -97,8 +97,9 @@ out: static void ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid) { - bool samefs = ovl_same_fs(dentry->d_sb); - unsigned int xinobits = ovl_xino_bits(dentry->d_sb); + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + bool samefs = ovl_same_fs(ofs); + unsigned int xinobits = ovl_xino_bits(ofs); unsigned int xinoshift = 64 - xinobits; if (samefs) { @@ -123,7 +124,7 @@ static void ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid) stat->ino |= ((u64)fsid) << (xinoshift + 1); stat->dev = dentry->d_sb->s_dev; return; - } else if (ovl_xino_warn(dentry->d_sb)) { + } else if (ovl_xino_warn(ofs)) { pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n", dentry, stat->ino, xinobits); } @@ -149,7 +150,7 @@ static void ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid) * is unique per underlying fs, so we use the unique anonymous * bdev assigned to the underlying fs. */ - stat->dev = OVL_FS(dentry->d_sb)->fs[fsid].pseudo_dev; + stat->dev = ofs->fs[fsid].pseudo_dev; } } @@ -186,7 +187,7 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, * If lower filesystem supports NFS file handles, this also guaranties * persistent st_ino across mount cycle. */ - if (!is_dir || ovl_same_dev(dentry->d_sb)) { + if (!is_dir || ovl_same_dev(OVL_FS(dentry->d_sb))) { if (!OVL_TYPE_UPPER(type)) { fsid = ovl_layer_lower(dentry)->fsid; } else if (OVL_TYPE_ORIGIN(type)) { @@ -240,15 +241,22 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path, /* * If lower is not same as lowerdata or if there was * no origin on upper, we can end up here. + * With lazy lowerdata lookup, guess lowerdata blocks + * from size to avoid lowerdata lookup on stat(2). */ struct kstat lowerdatastat; u32 lowermask = STATX_BLOCKS; ovl_path_lowerdata(dentry, &realpath); - err = vfs_getattr(&realpath, &lowerdatastat, - lowermask, flags); - if (err) - goto out; + if (realpath.dentry) { + err = vfs_getattr(&realpath, &lowerdatastat, + lowermask, flags); + if (err) + goto out; + } else { + lowerdatastat.blocks = + round_up(stat->size, stat->blksize) >> 9; + } stat->blocks = lowerdatastat.blocks; } } @@ -288,8 +296,8 @@ int ovl_permission(struct mnt_idmap *idmap, int err; /* Careful in RCU walk mode */ - ovl_i_path_real(inode, &realpath); - if (!realpath.dentry) { + realinode = ovl_i_path_real(inode, &realpath); + if (!realinode) { WARN_ON(!(mask & MAY_NOT_BLOCK)); return -ECHILD; } @@ -302,7 +310,6 @@ int ovl_permission(struct mnt_idmap *idmap, if (err) return err; - realinode = d_inode(realpath.dentry); old_cred = ovl_override_creds(inode->i_sb); if (!upperinode && !special_file(realinode->i_mode) && mask & MAY_WRITE) { @@ -559,20 +566,20 @@ struct posix_acl *do_ovl_get_acl(struct mnt_idmap *idmap, struct inode *inode, int type, bool rcu, bool noperm) { - struct inode *realinode = ovl_inode_real(inode); + struct inode *realinode; struct posix_acl *acl; struct path realpath; - if (!IS_POSIXACL(realinode)) - return NULL; - /* Careful in RCU walk mode */ - ovl_i_path_real(inode, &realpath); - if (!realpath.dentry) { + realinode = ovl_i_path_real(inode, &realpath); + if (!realinode) { WARN_ON(!rcu); return ERR_PTR(-ECHILD); } + if (!IS_POSIXACL(realinode)) + return NULL; + if (rcu) { /* * If the layer is idmapped drop out of RCU path walk @@ -710,6 +717,9 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct inode *realinode = ovl_inode_realdata(inode); const struct cred *old_cred; + if (!realinode) + return -EIO; + if (!realinode->i_op->fiemap) return -EOPNOTSUPP; @@ -952,7 +962,7 @@ static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode) static void ovl_next_ino(struct inode *inode) { - struct ovl_fs *ofs = inode->i_sb->s_fs_info; + struct ovl_fs *ofs = OVL_FS(inode->i_sb); inode->i_ino = atomic_long_inc_return(&ofs->last_ino); if (unlikely(!inode->i_ino)) @@ -961,7 +971,8 @@ static void ovl_next_ino(struct inode *inode) static void ovl_map_ino(struct inode *inode, unsigned long ino, int fsid) { - int xinobits = ovl_xino_bits(inode->i_sb); + struct ovl_fs *ofs = OVL_FS(inode->i_sb); + int xinobits = ovl_xino_bits(ofs); unsigned int xinoshift = 64 - xinobits; /* @@ -972,7 +983,7 @@ static void ovl_map_ino(struct inode *inode, unsigned long ino, int fsid) * with d_ino also causes nfsd readdirplus to fail. */ inode->i_ino = ino; - if (ovl_same_fs(inode->i_sb)) { + if (ovl_same_fs(ofs)) { return; } else if (xinobits && likely(!(ino >> xinoshift))) { inode->i_ino |= (unsigned long)fsid << (xinoshift + 1); @@ -1003,14 +1014,10 @@ void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip, struct inode *realinode; struct ovl_inode *oi = OVL_I(inode); - if (oip->upperdentry) - oi->__upperdentry = oip->upperdentry; - if (oip->lowerpath && oip->lowerpath->dentry) { - oi->lowerpath.dentry = dget(oip->lowerpath->dentry); - oi->lowerpath.layer = oip->lowerpath->layer; - } - if (oip->lowerdata) - oi->lowerdata = igrab(d_inode(oip->lowerdata)); + oi->__upperdentry = oip->upperdentry; + oi->oe = oip->oe; + oi->redirect = oip->redirect; + oi->lowerdata_redirect = oip->lowerdata_redirect; realinode = ovl_inode_real(inode); ovl_copyattr(inode); @@ -1325,7 +1332,7 @@ struct inode *ovl_get_inode(struct super_block *sb, { struct ovl_fs *ofs = OVL_FS(sb); struct dentry *upperdentry = oip->upperdentry; - struct ovl_path *lowerpath = oip->lowerpath; + struct ovl_path *lowerpath = ovl_lowerpath(oip->oe); struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL; struct inode *inode; struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL; @@ -1369,7 +1376,9 @@ struct inode *ovl_get_inode(struct super_block *sb, } dput(upperdentry); + ovl_free_entry(oip->oe); kfree(oip->redirect); + kfree(oip->lowerdata_redirect); goto out; } @@ -1398,14 +1407,12 @@ struct inode *ovl_get_inode(struct super_block *sb, if (oip->index) ovl_set_flag(OVL_INDEX, inode); - OVL_I(inode)->redirect = oip->redirect; - if (bylower) ovl_set_flag(OVL_CONST_INO, inode); /* Check for non-merge dir that may have whiteouts */ if (is_dir) { - if (((upperdentry && lowerdentry) || oip->numlower > 1) || + if (((upperdentry && lowerdentry) || ovl_numlower(oip->oe) > 1) || ovl_path_check_origin_xattr(ofs, &realpath)) { ovl_set_flag(OVL_WHITEOUTS, inode); } diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index cfb3420b7df0..57adf911735f 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -14,6 +14,8 @@ #include <linux/exportfs.h> #include "overlayfs.h" +#include "../internal.h" /* for vfs_path_lookup */ + struct ovl_lookup_data { struct super_block *sb; struct vfsmount *mnt; @@ -24,6 +26,8 @@ struct ovl_lookup_data { bool last; char *redirect; bool metacopy; + /* Referring to last redirect xattr */ + bool absolute_redirect; }; static int ovl_check_redirect(const struct path *path, struct ovl_lookup_data *d, @@ -33,11 +37,13 @@ static int ovl_check_redirect(const struct path *path, struct ovl_lookup_data *d char *buf; struct ovl_fs *ofs = OVL_FS(d->sb); + d->absolute_redirect = false; buf = ovl_get_redirect_xattr(ofs, path, prelen + strlen(post)); if (IS_ERR_OR_NULL(buf)) return PTR_ERR(buf); if (buf[0] == '/') { + d->absolute_redirect = true; /* * One of the ancestor path elements in an absolute path * lookup in ovl_lookup_layer() could have been opaque and @@ -349,6 +355,61 @@ static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d, return 0; } +static int ovl_lookup_data_layer(struct dentry *dentry, const char *redirect, + const struct ovl_layer *layer, + struct path *datapath) +{ + int err; + + err = vfs_path_lookup(layer->mnt->mnt_root, layer->mnt, redirect, + LOOKUP_BENEATH | LOOKUP_NO_SYMLINKS | LOOKUP_NO_XDEV, + datapath); + pr_debug("lookup lowerdata (%pd2, redirect=\"%s\", layer=%d, err=%i)\n", + dentry, redirect, layer->idx, err); + + if (err) + return err; + + err = -EREMOTE; + if (ovl_dentry_weird(datapath->dentry)) + goto out_path_put; + + err = -ENOENT; + /* Only regular file is acceptable as lower data */ + if (!d_is_reg(datapath->dentry)) + goto out_path_put; + + return 0; + +out_path_put: + path_put(datapath); + + return err; +} + +/* Lookup in data-only layers by absolute redirect to layer root */ +static int ovl_lookup_data_layers(struct dentry *dentry, const char *redirect, + struct ovl_path *lowerdata) +{ + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); + const struct ovl_layer *layer; + struct path datapath; + int err = -ENOENT; + int i; + + layer = &ofs->layers[ofs->numlayer - ofs->numdatalayer]; + for (i = 0; i < ofs->numdatalayer; i++, layer++) { + err = ovl_lookup_data_layer(dentry, redirect, layer, &datapath); + if (!err) { + mntput(datapath.mnt); + lowerdata->dentry = datapath.dentry; + lowerdata->layer = layer; + return 0; + } + } + + return err; +} int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, struct dentry *upperdentry, struct ovl_path **stackp) @@ -356,7 +417,7 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, struct dentry *origin = NULL; int i; - for (i = 1; i < ofs->numlayer; i++) { + for (i = 1; i <= ovl_numlowerlayer(ofs); i++) { /* * If lower fs uuid is not unique among lower fs we cannot match * fh->uuid to layer. @@ -790,20 +851,21 @@ fail: */ int ovl_path_next(int idx, struct dentry *dentry, struct path *path) { - struct ovl_entry *oe = dentry->d_fsdata; + struct ovl_entry *oe = OVL_E(dentry); + struct ovl_path *lowerstack = ovl_lowerstack(oe); BUG_ON(idx < 0); if (idx == 0) { ovl_path_upper(dentry, path); if (path->dentry) - return oe->numlower ? 1 : -1; + return ovl_numlower(oe) ? 1 : -1; idx++; } - BUG_ON(idx > oe->numlower); - path->dentry = oe->lowerstack[idx - 1].dentry; - path->mnt = oe->lowerstack[idx - 1].layer->mnt; + BUG_ON(idx > ovl_numlower(oe)); + path->dentry = lowerstack[idx - 1].dentry; + path->mnt = lowerstack[idx - 1].layer->mnt; - return (idx < oe->numlower) ? idx + 1 : -1; + return (idx < ovl_numlower(oe)) ? idx + 1 : -1; } /* Fix missing 'origin' xattr */ @@ -827,14 +889,60 @@ static int ovl_fix_origin(struct ovl_fs *ofs, struct dentry *dentry, return err; } +/* Lazy lookup of lowerdata */ +int ovl_maybe_lookup_lowerdata(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + const char *redirect = ovl_lowerdata_redirect(inode); + struct ovl_path datapath = {}; + const struct cred *old_cred; + int err; + + if (!redirect || ovl_dentry_lowerdata(dentry)) + return 0; + + if (redirect[0] != '/') + return -EIO; + + err = ovl_inode_lock_interruptible(inode); + if (err) + return err; + + err = 0; + /* Someone got here before us? */ + if (ovl_dentry_lowerdata(dentry)) + goto out; + + old_cred = ovl_override_creds(dentry->d_sb); + err = ovl_lookup_data_layers(dentry, redirect, &datapath); + revert_creds(old_cred); + if (err) + goto out_err; + + err = ovl_dentry_set_lowerdata(dentry, &datapath); + if (err) + goto out_err; + +out: + ovl_inode_unlock(inode); + dput(datapath.dentry); + + return err; + +out_err: + pr_warn_ratelimited("lazy lowerdata lookup failed (%pd2, err=%i)\n", + dentry, err); + goto out; +} + struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - struct ovl_entry *oe; + struct ovl_entry *oe = NULL; const struct cred *old_cred; struct ovl_fs *ofs = dentry->d_sb->s_fs_info; - struct ovl_entry *poe = dentry->d_parent->d_fsdata; - struct ovl_entry *roe = dentry->d_sb->s_root->d_fsdata; + struct ovl_entry *poe = OVL_E(dentry->d_parent); + struct ovl_entry *roe = OVL_E(dentry->d_sb->s_root); struct ovl_path *stack = NULL, *origin_path = NULL; struct dentry *upperdir, *upperdentry = NULL; struct dentry *origin = NULL; @@ -853,7 +961,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, .is_dir = false, .opaque = false, .stop = false, - .last = ofs->config.redirect_follow ? false : !poe->numlower, + .last = ovl_redirect_follow(ofs) ? false : !ovl_numlower(poe), .redirect = NULL, .metacopy = false, }; @@ -904,21 +1012,20 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, upperopaque = d.opaque; } - if (!d.stop && poe->numlower) { + if (!d.stop && ovl_numlower(poe)) { err = -ENOMEM; - stack = kcalloc(ofs->numlayer - 1, sizeof(struct ovl_path), - GFP_KERNEL); + stack = ovl_stack_alloc(ofs->numlayer - 1); if (!stack) goto out_put_upper; } - for (i = 0; !d.stop && i < poe->numlower; i++) { - struct ovl_path lower = poe->lowerstack[i]; + for (i = 0; !d.stop && i < ovl_numlower(poe); i++) { + struct ovl_path lower = ovl_lowerstack(poe)[i]; - if (!ofs->config.redirect_follow) - d.last = i == poe->numlower - 1; - else - d.last = lower.layer->idx == roe->numlower; + if (!ovl_redirect_follow(ofs)) + d.last = i == ovl_numlower(poe) - 1; + else if (d.is_dir || !ofs->numdatalayer) + d.last = lower.layer->idx == ovl_numlower(roe); d.mnt = lower.layer->mnt; err = ovl_lookup_layer(lower.dentry, &d, &this, false); @@ -995,7 +1102,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, * this attack vector when not necessary. */ err = -EPERM; - if (d.redirect && !ofs->config.redirect_follow) { + if (d.redirect && !ovl_redirect_follow(ofs)) { pr_warn_ratelimited("refusing to follow redirect for (%pd2)\n", dentry); goto out_put; @@ -1011,6 +1118,12 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, } } + /* Defer lookup of lowerdata in data-only layers to first access */ + if (d.metacopy && ctr && ofs->numdatalayer && d.absolute_redirect) { + d.metacopy = false; + ctr++; + } + /* * For regular non-metacopy upper dentries, there is no lower * path based lookup, hence ctr will be zero. If a dentry is found @@ -1067,13 +1180,14 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, } } - oe = ovl_alloc_entry(ctr); - err = -ENOMEM; - if (!oe) - goto out_put; + if (ctr) { + oe = ovl_alloc_entry(ctr); + err = -ENOMEM; + if (!oe) + goto out_put; - memcpy(oe->lowerstack, stack, sizeof(struct ovl_path) * ctr); - dentry->d_fsdata = oe; + ovl_stack_cpy(ovl_lowerstack(oe), stack, ctr); + } if (upperopaque) ovl_dentry_set_opaque(dentry); @@ -1106,14 +1220,16 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (upperdentry || ctr) { struct ovl_inode_params oip = { .upperdentry = upperdentry, - .lowerpath = stack, + .oe = oe, .index = index, - .numlower = ctr, .redirect = upperredirect, - .lowerdata = (ctr > 1 && !d.is_dir) ? - stack[ctr - 1].dentry : NULL, }; + /* Store lowerdata redirect for lazy lookup */ + if (ctr > 1 && !d.is_dir && !stack[ctr - 1].dentry) { + oip.lowerdata_redirect = d.redirect; + d.redirect = NULL; + } inode = ovl_get_inode(dentry->d_sb, &oip); err = PTR_ERR(inode); if (IS_ERR(inode)) @@ -1122,8 +1238,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, ovl_set_flag(OVL_UPPERDATA, inode); } - ovl_dentry_update_reval(dentry, upperdentry, - DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE); + ovl_dentry_init_reval(dentry, upperdentry, OVL_I_E(inode)); revert_creds(old_cred); if (origin_path) { @@ -1131,18 +1246,15 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, kfree(origin_path); } dput(index); - kfree(stack); + ovl_stack_free(stack, ctr); kfree(d.redirect); return d_splice_alias(inode, dentry); out_free_oe: - dentry->d_fsdata = NULL; - kfree(oe); + ovl_free_entry(oe); out_put: dput(index); - for (i = 0; i < ctr; i++) - dput(stack[i].dentry); - kfree(stack); + ovl_stack_free(stack, ctr); out_put_upper: if (origin_path) { dput(origin_path->dentry); @@ -1158,7 +1270,7 @@ out: bool ovl_lower_positive(struct dentry *dentry) { - struct ovl_entry *poe = dentry->d_parent->d_fsdata; + struct ovl_entry *poe = OVL_E(dentry->d_parent); const struct qstr *name = &dentry->d_name; const struct cred *old_cred; unsigned int i; @@ -1178,12 +1290,13 @@ bool ovl_lower_positive(struct dentry *dentry) old_cred = ovl_override_creds(dentry->d_sb); /* Positive upper -> have to look up lower to see whether it exists */ - for (i = 0; !done && !positive && i < poe->numlower; i++) { + for (i = 0; !done && !positive && i < ovl_numlower(poe); i++) { struct dentry *this; - struct dentry *lowerdir = poe->lowerstack[i].dentry; + struct ovl_path *parentpath = &ovl_lowerstack(poe)[i]; - this = lookup_one_positive_unlocked(mnt_idmap(poe->lowerstack[i].layer->mnt), - name->name, lowerdir, name->len); + this = lookup_one_positive_unlocked( + mnt_idmap(parentpath->layer->mnt), + name->name, parentpath->dentry, name->len); if (IS_ERR(this)) { switch (PTR_ERR(this)) { case -ENOENT: diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 4d0b278f5630..9402591f12aa 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -58,6 +58,13 @@ enum ovl_entry_flag { }; enum { + OVL_REDIRECT_OFF, /* "off" mode is never used. In effect */ + OVL_REDIRECT_FOLLOW, /* ...it translates to either "follow" */ + OVL_REDIRECT_NOFOLLOW, /* ...or "nofollow". */ + OVL_REDIRECT_ON, +}; + +enum { OVL_XINO_OFF, OVL_XINO_AUTO, OVL_XINO_ON, @@ -329,8 +336,9 @@ static inline struct file *ovl_do_tmpfile(struct ovl_fs *ofs, struct dentry *dentry, umode_t mode) { struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = dentry }; - struct file *file = vfs_tmpfile_open(ovl_upper_mnt_idmap(ofs), &path, mode, - O_LARGEFILE | O_WRONLY, current_cred()); + struct file *file = kernel_tmpfile_open(ovl_upper_mnt_idmap(ofs), &path, + mode, O_LARGEFILE | O_WRONLY, + current_cred()); int err = PTR_ERR_OR_ZERO(file); pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err); @@ -352,18 +360,6 @@ static inline bool ovl_open_flags_need_copy_up(int flags) return ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC)); } -static inline bool ovl_allow_offline_changes(struct ovl_fs *ofs) -{ - /* - * To avoid regressions in existing setups with overlay lower offline - * changes, we allow lower changes only if none of the new features - * are used. - */ - return (!ofs->config.index && !ofs->config.metacopy && - !ofs->config.redirect_dir && ofs->config.xino != OVL_XINO_ON); -} - - /* util.c */ int ovl_want_write(struct dentry *dentry); void ovl_drop_write(struct dentry *dentry); @@ -373,21 +369,30 @@ int ovl_can_decode_fh(struct super_block *sb); struct dentry *ovl_indexdir(struct super_block *sb); bool ovl_index_all(struct super_block *sb); bool ovl_verify_lower(struct super_block *sb); +struct ovl_path *ovl_stack_alloc(unsigned int n); +void ovl_stack_cpy(struct ovl_path *dst, struct ovl_path *src, unsigned int n); +void ovl_stack_put(struct ovl_path *stack, unsigned int n); +void ovl_stack_free(struct ovl_path *stack, unsigned int n); struct ovl_entry *ovl_alloc_entry(unsigned int numlower); +void ovl_free_entry(struct ovl_entry *oe); bool ovl_dentry_remote(struct dentry *dentry); -void ovl_dentry_update_reval(struct dentry *dentry, struct dentry *upperdentry, - unsigned int mask); +void ovl_dentry_update_reval(struct dentry *dentry, struct dentry *realdentry); +void ovl_dentry_init_reval(struct dentry *dentry, struct dentry *upperdentry, + struct ovl_entry *oe); +void ovl_dentry_init_flags(struct dentry *dentry, struct dentry *upperdentry, + struct ovl_entry *oe, unsigned int mask); bool ovl_dentry_weird(struct dentry *dentry); enum ovl_path_type ovl_path_type(struct dentry *dentry); void ovl_path_upper(struct dentry *dentry, struct path *path); void ovl_path_lower(struct dentry *dentry, struct path *path); void ovl_path_lowerdata(struct dentry *dentry, struct path *path); -void ovl_i_path_real(struct inode *inode, struct path *path); +struct inode *ovl_i_path_real(struct inode *inode, struct path *path); enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); enum ovl_path_type ovl_path_realdata(struct dentry *dentry, struct path *path); struct dentry *ovl_dentry_upper(struct dentry *dentry); struct dentry *ovl_dentry_lower(struct dentry *dentry); struct dentry *ovl_dentry_lowerdata(struct dentry *dentry); +int ovl_dentry_set_lowerdata(struct dentry *dentry, struct ovl_path *datapath); const struct ovl_layer *ovl_i_layer_lower(struct inode *inode); const struct ovl_layer *ovl_layer_lower(struct dentry *dentry); struct dentry *ovl_dentry_real(struct dentry *dentry); @@ -397,6 +402,7 @@ struct inode *ovl_inode_lower(struct inode *inode); struct inode *ovl_inode_lowerdata(struct inode *inode); struct inode *ovl_inode_real(struct inode *inode); struct inode *ovl_inode_realdata(struct inode *inode); +const char *ovl_lowerdata_redirect(struct inode *inode); struct ovl_dir_cache *ovl_dir_cache(struct inode *inode); void ovl_set_dir_cache(struct inode *inode, struct ovl_dir_cache *cache); void ovl_dentry_set_flag(unsigned long flag, struct dentry *dentry); @@ -411,7 +417,6 @@ bool ovl_dentry_needs_data_copy_up(struct dentry *dentry, int flags); bool ovl_dentry_needs_data_copy_up_locked(struct dentry *dentry, int flags); bool ovl_has_upperdata(struct inode *inode); void ovl_set_upperdata(struct inode *inode); -bool ovl_redirect_dir(struct super_block *sb); const char *ovl_dentry_get_redirect(struct dentry *dentry); void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect); void ovl_inode_update(struct inode *inode, struct dentry *upperdentry); @@ -479,31 +484,51 @@ static inline bool ovl_is_impuredir(struct super_block *sb, return ovl_path_check_dir_xattr(ofs, &upperpath, OVL_XATTR_IMPURE); } +static inline bool ovl_redirect_follow(struct ovl_fs *ofs) +{ + return ofs->config.redirect_mode != OVL_REDIRECT_NOFOLLOW; +} + +static inline bool ovl_redirect_dir(struct ovl_fs *ofs) +{ + return ofs->config.redirect_mode == OVL_REDIRECT_ON; +} + /* * With xino=auto, we do best effort to keep all inodes on same st_dev and * d_ino consistent with st_ino. * With xino=on, we do the same effort but we warn if we failed. */ -static inline bool ovl_xino_warn(struct super_block *sb) +static inline bool ovl_xino_warn(struct ovl_fs *ofs) { - return OVL_FS(sb)->config.xino == OVL_XINO_ON; + return ofs->config.xino == OVL_XINO_ON; +} + +/* + * To avoid regressions in existing setups with overlay lower offline changes, + * we allow lower changes only if none of the new features are used. + */ +static inline bool ovl_allow_offline_changes(struct ovl_fs *ofs) +{ + return (!ofs->config.index && !ofs->config.metacopy && + !ovl_redirect_dir(ofs) && !ovl_xino_warn(ofs)); } /* All layers on same fs? */ -static inline bool ovl_same_fs(struct super_block *sb) +static inline bool ovl_same_fs(struct ovl_fs *ofs) { - return OVL_FS(sb)->xino_mode == 0; + return ofs->xino_mode == 0; } /* All overlay inodes have same st_dev? */ -static inline bool ovl_same_dev(struct super_block *sb) +static inline bool ovl_same_dev(struct ovl_fs *ofs) { - return OVL_FS(sb)->xino_mode >= 0; + return ofs->xino_mode >= 0; } -static inline unsigned int ovl_xino_bits(struct super_block *sb) +static inline unsigned int ovl_xino_bits(struct ovl_fs *ofs) { - return ovl_same_dev(sb) ? OVL_FS(sb)->xino_mode : 0; + return ovl_same_dev(ofs) ? ofs->xino_mode : 0; } static inline void ovl_inode_lock(struct inode *inode) @@ -549,6 +574,7 @@ struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh); struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, struct dentry *origin, bool verify); int ovl_path_next(int idx, struct dentry *dentry, struct path *path); +int ovl_maybe_lookup_lowerdata(struct dentry *dentry); struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); bool ovl_lower_positive(struct dentry *dentry); @@ -645,11 +671,10 @@ bool ovl_is_private_xattr(struct super_block *sb, const char *name); struct ovl_inode_params { struct inode *newinode; struct dentry *upperdentry; - struct ovl_path *lowerpath; + struct ovl_entry *oe; bool index; - unsigned int numlower; char *redirect; - struct dentry *lowerdata; + char *lowerdata_redirect; }; void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip, unsigned long ino, int fsid); @@ -734,3 +759,12 @@ int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower, /* export.c */ extern const struct export_operations ovl_export_operations; + +/* super.c */ +int ovl_fill_super(struct super_block *sb, struct fs_context *fc); + +/* Will this overlay be forced to mount/remount ro? */ +static inline bool ovl_force_readonly(struct ovl_fs *ofs) +{ + return (!ovl_upper_mnt(ofs) || !ofs->workdir); +} diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index fd11fe6d6d45..306e1ecdc96d 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -6,13 +6,10 @@ */ struct ovl_config { - char *lowerdir; char *upperdir; char *workdir; bool default_permissions; - bool redirect_dir; - bool redirect_follow; - const char *redirect_mode; + int redirect_mode; bool index; bool uuid; bool nfs_export; @@ -32,6 +29,7 @@ struct ovl_sb { }; struct ovl_layer { + /* ovl_free_fs() relies on @mnt being the first member! */ struct vfsmount *mnt; /* Trap in ovl inode cache */ struct inode *trap; @@ -40,18 +38,34 @@ struct ovl_layer { int idx; /* One fsid per unique underlying sb (upper fsid == 0) */ int fsid; + char *name; }; +/* + * ovl_free_fs() relies on @mnt being the first member when unmounting + * the private mounts created for each layer. Let's check both the + * offset and type. + */ +static_assert(offsetof(struct ovl_layer, mnt) == 0); +static_assert(__same_type(typeof_member(struct ovl_layer, mnt), struct vfsmount *)); + struct ovl_path { const struct ovl_layer *layer; struct dentry *dentry; }; +struct ovl_entry { + unsigned int __numlower; + struct ovl_path __lowerstack[]; +}; + /* private information held for overlayfs's superblock */ struct ovl_fs { unsigned int numlayer; /* Number of unique fs among layers including upper fs */ unsigned int numfs; + /* Number of data-only lower layers */ + unsigned int numdatalayer; const struct ovl_layer *layers; struct ovl_sb *fs; /* workbasedir is the path at workdir= mount option */ @@ -70,7 +84,6 @@ struct ovl_fs { /* Did we take the inuse lock? */ bool upperdir_locked; bool workdir_locked; - bool share_whiteout; /* Traps in ovl inode cache */ struct inode *workbasedir_trap; struct inode *workdir_trap; @@ -79,12 +92,19 @@ struct ovl_fs { int xino_mode; /* For allocation of non-persistent inode numbers */ atomic_long_t last_ino; - /* Whiteout dentry cache */ + /* Shared whiteout cache */ struct dentry *whiteout; + bool no_shared_whiteout; /* r/o snapshot of upperdir sb's only taken on volatile mounts */ errseq_t errseq; }; +/* Number of lower layers, not including data-only layers */ +static inline unsigned int ovl_numlowerlayer(struct ovl_fs *ofs) +{ + return ofs->numlayer - ofs->numdatalayer - 1; +} + static inline struct vfsmount *ovl_upper_mnt(struct ovl_fs *ofs) { return ofs->layers[0].mnt; @@ -105,36 +125,53 @@ static inline bool ovl_should_sync(struct ovl_fs *ofs) return !ofs->config.ovl_volatile; } -/* private information held for every overlayfs dentry */ -struct ovl_entry { - union { - struct { - unsigned long flags; - }; - struct rcu_head rcu; - }; - unsigned numlower; - struct ovl_path lowerstack[]; -}; +static inline unsigned int ovl_numlower(struct ovl_entry *oe) +{ + return oe ? oe->__numlower : 0; +} -struct ovl_entry *ovl_alloc_entry(unsigned int numlower); +static inline struct ovl_path *ovl_lowerstack(struct ovl_entry *oe) +{ + return ovl_numlower(oe) ? oe->__lowerstack : NULL; +} -static inline struct ovl_entry *OVL_E(struct dentry *dentry) +static inline struct ovl_path *ovl_lowerpath(struct ovl_entry *oe) { - return (struct ovl_entry *) dentry->d_fsdata; + return ovl_lowerstack(oe); +} + +static inline struct ovl_path *ovl_lowerdata(struct ovl_entry *oe) +{ + struct ovl_path *lowerstack = ovl_lowerstack(oe); + + return lowerstack ? &lowerstack[oe->__numlower - 1] : NULL; +} + +/* May return NULL if lazy lookup of lowerdata is needed */ +static inline struct dentry *ovl_lowerdata_dentry(struct ovl_entry *oe) +{ + struct ovl_path *lowerdata = ovl_lowerdata(oe); + + return lowerdata ? READ_ONCE(lowerdata->dentry) : NULL; +} + +/* private information held for every overlayfs dentry */ +static inline unsigned long *OVL_E_FLAGS(struct dentry *dentry) +{ + return (unsigned long *) &dentry->d_fsdata; } struct ovl_inode { union { struct ovl_dir_cache *cache; /* directory */ - struct inode *lowerdata; /* regular file */ + const char *lowerdata_redirect; /* regular file */ }; const char *redirect; u64 version; unsigned long flags; struct inode vfs_inode; struct dentry *__upperdentry; - struct ovl_path lowerpath; + struct ovl_entry *oe; /* synchronize copy up and more */ struct mutex lock; @@ -145,6 +182,16 @@ static inline struct ovl_inode *OVL_I(struct inode *inode) return container_of(inode, struct ovl_inode, vfs_inode); } +static inline struct ovl_entry *OVL_I_E(struct inode *inode) +{ + return inode ? OVL_I(inode)->oe : NULL; +} + +static inline struct ovl_entry *OVL_E(struct dentry *dentry) +{ + return OVL_I_E(d_inode(dentry)); +} + static inline struct dentry *ovl_upperdentry_dereference(struct ovl_inode *oi) { return READ_ONCE(oi->__upperdentry); diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c new file mode 100644 index 000000000000..a63160dbb0f9 --- /dev/null +++ b/fs/overlayfs/params.c @@ -0,0 +1,913 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/fs.h> +#include <linux/module.h> +#include <linux/namei.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> +#include <linux/posix_acl_xattr.h> +#include <linux/seq_file.h> +#include <linux/xattr.h> +#include "overlayfs.h" +#include "params.h" + +static bool ovl_redirect_dir_def = IS_ENABLED(CONFIG_OVERLAY_FS_REDIRECT_DIR); +module_param_named(redirect_dir, ovl_redirect_dir_def, bool, 0644); +MODULE_PARM_DESC(redirect_dir, + "Default to on or off for the redirect_dir feature"); + +static bool ovl_redirect_always_follow = + IS_ENABLED(CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW); +module_param_named(redirect_always_follow, ovl_redirect_always_follow, + bool, 0644); +MODULE_PARM_DESC(redirect_always_follow, + "Follow redirects even if redirect_dir feature is turned off"); + +static bool ovl_xino_auto_def = IS_ENABLED(CONFIG_OVERLAY_FS_XINO_AUTO); +module_param_named(xino_auto, ovl_xino_auto_def, bool, 0644); +MODULE_PARM_DESC(xino_auto, + "Auto enable xino feature"); + +static bool ovl_index_def = IS_ENABLED(CONFIG_OVERLAY_FS_INDEX); +module_param_named(index, ovl_index_def, bool, 0644); +MODULE_PARM_DESC(index, + "Default to on or off for the inodes index feature"); + +static bool ovl_nfs_export_def = IS_ENABLED(CONFIG_OVERLAY_FS_NFS_EXPORT); +module_param_named(nfs_export, ovl_nfs_export_def, bool, 0644); +MODULE_PARM_DESC(nfs_export, + "Default to on or off for the NFS export feature"); + +static bool ovl_metacopy_def = IS_ENABLED(CONFIG_OVERLAY_FS_METACOPY); +module_param_named(metacopy, ovl_metacopy_def, bool, 0644); +MODULE_PARM_DESC(metacopy, + "Default to on or off for the metadata only copy up feature"); + +enum { + Opt_lowerdir, + Opt_upperdir, + Opt_workdir, + Opt_default_permissions, + Opt_redirect_dir, + Opt_index, + Opt_uuid, + Opt_nfs_export, + Opt_userxattr, + Opt_xino, + Opt_metacopy, + Opt_volatile, +}; + +static const struct constant_table ovl_parameter_bool[] = { + { "on", true }, + { "off", false }, + {} +}; + +static const struct constant_table ovl_parameter_xino[] = { + { "off", OVL_XINO_OFF }, + { "auto", OVL_XINO_AUTO }, + { "on", OVL_XINO_ON }, + {} +}; + +const char *ovl_xino_mode(struct ovl_config *config) +{ + return ovl_parameter_xino[config->xino].name; +} + +static int ovl_xino_def(void) +{ + return ovl_xino_auto_def ? OVL_XINO_AUTO : OVL_XINO_OFF; +} + +const struct constant_table ovl_parameter_redirect_dir[] = { + { "off", OVL_REDIRECT_OFF }, + { "follow", OVL_REDIRECT_FOLLOW }, + { "nofollow", OVL_REDIRECT_NOFOLLOW }, + { "on", OVL_REDIRECT_ON }, + {} +}; + +static const char *ovl_redirect_mode(struct ovl_config *config) +{ + return ovl_parameter_redirect_dir[config->redirect_mode].name; +} + +static int ovl_redirect_mode_def(void) +{ + return ovl_redirect_dir_def ? OVL_REDIRECT_ON : + ovl_redirect_always_follow ? OVL_REDIRECT_FOLLOW : + OVL_REDIRECT_NOFOLLOW; +} + +#define fsparam_string_empty(NAME, OPT) \ + __fsparam(fs_param_is_string, NAME, OPT, fs_param_can_be_empty, NULL) + +const struct fs_parameter_spec ovl_parameter_spec[] = { + fsparam_string_empty("lowerdir", Opt_lowerdir), + fsparam_string("upperdir", Opt_upperdir), + fsparam_string("workdir", Opt_workdir), + fsparam_flag("default_permissions", Opt_default_permissions), + fsparam_enum("redirect_dir", Opt_redirect_dir, ovl_parameter_redirect_dir), + fsparam_enum("index", Opt_index, ovl_parameter_bool), + fsparam_enum("uuid", Opt_uuid, ovl_parameter_bool), + fsparam_enum("nfs_export", Opt_nfs_export, ovl_parameter_bool), + fsparam_flag("userxattr", Opt_userxattr), + fsparam_enum("xino", Opt_xino, ovl_parameter_xino), + fsparam_enum("metacopy", Opt_metacopy, ovl_parameter_bool), + fsparam_flag("volatile", Opt_volatile), + {} +}; + +static ssize_t ovl_parse_param_split_lowerdirs(char *str) +{ + ssize_t nr_layers = 1, nr_colons = 0; + char *s, *d; + + for (s = d = str;; s++, d++) { + if (*s == '\\') { + s++; + } else if (*s == ':') { + bool next_colon = (*(s + 1) == ':'); + + nr_colons++; + if (nr_colons == 2 && next_colon) { + pr_err("only single ':' or double '::' sequences of unescaped colons in lowerdir mount option allowed.\n"); + return -EINVAL; + } + /* count layers, not colons */ + if (!next_colon) + nr_layers++; + + *d = '\0'; + continue; + } + + *d = *s; + if (!*s) { + /* trailing colons */ + if (nr_colons) { + pr_err("unescaped trailing colons in lowerdir mount option.\n"); + return -EINVAL; + } + break; + } + nr_colons = 0; + } + + return nr_layers; +} + +static int ovl_mount_dir_noesc(const char *name, struct path *path) +{ + int err = -EINVAL; + + if (!*name) { + pr_err("empty lowerdir\n"); + goto out; + } + err = kern_path(name, LOOKUP_FOLLOW, path); + if (err) { + pr_err("failed to resolve '%s': %i\n", name, err); + goto out; + } + err = -EINVAL; + if (ovl_dentry_weird(path->dentry)) { + pr_err("filesystem on '%s' not supported\n", name); + goto out_put; + } + if (!d_is_dir(path->dentry)) { + pr_err("'%s' not a directory\n", name); + goto out_put; + } + return 0; + +out_put: + path_put_init(path); +out: + return err; +} + +static void ovl_unescape(char *s) +{ + char *d = s; + + for (;; s++, d++) { + if (*s == '\\') + s++; + *d = *s; + if (!*s) + break; + } +} + +static int ovl_mount_dir(const char *name, struct path *path) +{ + int err = -ENOMEM; + char *tmp = kstrdup(name, GFP_KERNEL); + + if (tmp) { + ovl_unescape(tmp); + err = ovl_mount_dir_noesc(tmp, path); + + if (!err && path->dentry->d_flags & DCACHE_OP_REAL) { + pr_err("filesystem on '%s' not supported as upperdir\n", + tmp); + path_put_init(path); + err = -EINVAL; + } + kfree(tmp); + } + return err; +} + +static int ovl_parse_param_upperdir(const char *name, struct fs_context *fc, + bool workdir) +{ + int err; + struct ovl_fs *ofs = fc->s_fs_info; + struct ovl_config *config = &ofs->config; + struct ovl_fs_context *ctx = fc->fs_private; + struct path path; + char *dup; + + err = ovl_mount_dir(name, &path); + if (err) + return err; + + /* + * Check whether upper path is read-only here to report failures + * early. Don't forget to recheck when the superblock is created + * as the mount attributes could change. + */ + if (__mnt_is_readonly(path.mnt)) { + path_put(&path); + return -EINVAL; + } + + dup = kstrdup(name, GFP_KERNEL); + if (!dup) { + path_put(&path); + return -ENOMEM; + } + + if (workdir) { + kfree(config->workdir); + config->workdir = dup; + path_put(&ctx->work); + ctx->work = path; + } else { + kfree(config->upperdir); + config->upperdir = dup; + path_put(&ctx->upper); + ctx->upper = path; + } + return 0; +} + +static void ovl_parse_param_drop_lowerdir(struct ovl_fs_context *ctx) +{ + for (size_t nr = 0; nr < ctx->nr; nr++) { + path_put(&ctx->lower[nr].path); + kfree(ctx->lower[nr].name); + ctx->lower[nr].name = NULL; + } + ctx->nr = 0; + ctx->nr_data = 0; +} + +/* + * Parse lowerdir= mount option: + * + * (1) lowerdir=/lower1:/lower2:/lower3::/data1::/data2 + * Set "/lower1", "/lower2", and "/lower3" as lower layers and + * "/data1" and "/data2" as data lower layers. Any existing lower + * layers are replaced. + * (2) lowerdir=:/lower4 + * Append "/lower4" to current stack of lower layers. This requires + * that there already is at least one lower layer configured. + * (3) lowerdir=::/lower5 + * Append data "/lower5" as data lower layer. This requires that + * there's at least one regular lower layer present. + */ +static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) +{ + int err; + struct ovl_fs_context *ctx = fc->fs_private; + struct ovl_fs_context_layer *l; + char *dup = NULL, *dup_iter; + ssize_t nr_lower = 0, nr = 0, nr_data = 0; + bool append = false, data_layer = false; + + /* + * Ensure we're backwards compatible with mount(2) + * by allowing relative paths. + */ + + /* drop all existing lower layers */ + if (!*name) { + ovl_parse_param_drop_lowerdir(ctx); + return 0; + } + + if (strncmp(name, "::", 2) == 0) { + /* + * This is a data layer. + * There must be at least one regular lower layer + * specified. + */ + if (ctx->nr == 0) { + pr_err("data lower layers without regular lower layers not allowed"); + return -EINVAL; + } + + /* Skip the leading "::". */ + name += 2; + data_layer = true; + /* + * A data layer is automatically an append as there + * must've been at least one regular lower layer. + */ + append = true; + } else if (*name == ':') { + /* + * This is a regular lower layer. + * If users want to append a layer enforce that they + * have already specified a first layer before. It's + * better to be strict. + */ + if (ctx->nr == 0) { + pr_err("cannot append layer if no previous layer has been specified"); + return -EINVAL; + } + + /* + * Once a sequence of data layers has started regular + * lower layers are forbidden. + */ + if (ctx->nr_data > 0) { + pr_err("regular lower layers cannot follow data lower layers"); + return -EINVAL; + } + + /* Skip the leading ":". */ + name++; + append = true; + } + + dup = kstrdup(name, GFP_KERNEL); + if (!dup) + return -ENOMEM; + + err = -EINVAL; + nr_lower = ovl_parse_param_split_lowerdirs(dup); + if (nr_lower < 0) + goto out_err; + + if ((nr_lower > OVL_MAX_STACK) || + (append && (size_add(ctx->nr, nr_lower) > OVL_MAX_STACK))) { + pr_err("too many lower directories, limit is %d\n", OVL_MAX_STACK); + goto out_err; + } + + if (!append) + ovl_parse_param_drop_lowerdir(ctx); + + /* + * (1) append + * + * We want nr <= nr_lower <= capacity We know nr > 0 and nr <= + * capacity. If nr == 0 this wouldn't be append. If nr + + * nr_lower is <= capacity then nr <= nr_lower <= capacity + * already holds. If nr + nr_lower exceeds capacity, we realloc. + * + * (2) replace + * + * Ensure we're backwards compatible with mount(2) which allows + * "lowerdir=/a:/b:/c,lowerdir=/d:/e:/f" causing the last + * specified lowerdir mount option to win. + * + * We want nr <= nr_lower <= capacity We know either (i) nr == 0 + * or (ii) nr > 0. We also know nr_lower > 0. The capacity + * could've been changed multiple times already so we only know + * nr <= capacity. If nr + nr_lower > capacity we realloc, + * otherwise nr <= nr_lower <= capacity holds already. + */ + nr_lower += ctx->nr; + if (nr_lower > ctx->capacity) { + err = -ENOMEM; + l = krealloc_array(ctx->lower, nr_lower, sizeof(*ctx->lower), + GFP_KERNEL_ACCOUNT); + if (!l) + goto out_err; + + ctx->lower = l; + ctx->capacity = nr_lower; + } + + /* + * (3) By (1) and (2) we know nr <= nr_lower <= capacity. + * (4) If ctx->nr == 0 => replace + * We have verified above that the lowerdir mount option + * isn't an append, i.e., the lowerdir mount option + * doesn't start with ":" or "::". + * (4.1) The lowerdir mount options only contains regular lower + * layers ":". + * => Nothing to verify. + * (4.2) The lowerdir mount options contains regular ":" and + * data "::" layers. + * => We need to verify that data lower layers "::" aren't + * followed by regular ":" lower layers + * (5) If ctx->nr > 0 => append + * We know that there's at least one regular layer + * otherwise we would've failed when parsing the previous + * lowerdir mount option. + * (5.1) The lowerdir mount option is a regular layer ":" append + * => We need to verify that no data layers have been + * specified before. + * (5.2) The lowerdir mount option is a data layer "::" append + * We know that there's at least one regular layer or + * other data layers. => There's nothing to verify. + */ + dup_iter = dup; + for (nr = ctx->nr; nr < nr_lower; nr++) { + l = &ctx->lower[nr]; + memset(l, 0, sizeof(*l)); + + err = ovl_mount_dir_noesc(dup_iter, &l->path); + if (err) + goto out_put; + + err = -ENOMEM; + l->name = kstrdup(dup_iter, GFP_KERNEL_ACCOUNT); + if (!l->name) + goto out_put; + + if (data_layer) + nr_data++; + + /* Calling strchr() again would overrun. */ + if ((nr + 1) == nr_lower) + break; + + err = -EINVAL; + dup_iter = strchr(dup_iter, '\0') + 1; + if (*dup_iter) { + /* + * This is a regular layer so we require that + * there are no data layers. + */ + if ((ctx->nr_data + nr_data) > 0) { + pr_err("regular lower layers cannot follow data lower layers"); + goto out_put; + } + + data_layer = false; + continue; + } + + /* This is a data lower layer. */ + data_layer = true; + dup_iter++; + } + ctx->nr = nr_lower; + ctx->nr_data += nr_data; + kfree(dup); + return 0; + +out_put: + /* + * We know nr >= ctx->nr < nr_lower. If we failed somewhere + * we want to undo until nr == ctx->nr. This is correct for + * both ctx->nr == 0 and ctx->nr > 0. + */ + for (; nr >= ctx->nr; nr--) { + l = &ctx->lower[nr]; + kfree(l->name); + l->name = NULL; + path_put(&l->path); + + /* don't overflow */ + if (nr == 0) + break; + } + +out_err: + kfree(dup); + + /* Intentionally don't realloc to a smaller size. */ + return err; +} + +static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + int err = 0; + struct fs_parse_result result; + struct ovl_fs *ofs = fc->s_fs_info; + struct ovl_config *config = &ofs->config; + struct ovl_fs_context *ctx = fc->fs_private; + int opt; + + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + /* + * On remount overlayfs has always ignored all mount + * options no matter if malformed or not so for + * backwards compatibility we do the same here. + */ + if (fc->oldapi) + return 0; + + /* + * Give us the freedom to allow changing mount options + * with the new mount api in the future. So instead of + * silently ignoring everything we report a proper + * error. This is only visible for users of the new + * mount api. + */ + return invalfc(fc, "No changes allowed in reconfigure"); + } + + opt = fs_parse(fc, ovl_parameter_spec, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_lowerdir: + err = ovl_parse_param_lowerdir(param->string, fc); + break; + case Opt_upperdir: + fallthrough; + case Opt_workdir: + err = ovl_parse_param_upperdir(param->string, fc, + (Opt_workdir == opt)); + break; + case Opt_default_permissions: + config->default_permissions = true; + break; + case Opt_redirect_dir: + config->redirect_mode = result.uint_32; + if (config->redirect_mode == OVL_REDIRECT_OFF) { + config->redirect_mode = ovl_redirect_always_follow ? + OVL_REDIRECT_FOLLOW : + OVL_REDIRECT_NOFOLLOW; + } + ctx->set.redirect = true; + break; + case Opt_index: + config->index = result.uint_32; + ctx->set.index = true; + break; + case Opt_uuid: + config->uuid = result.uint_32; + break; + case Opt_nfs_export: + config->nfs_export = result.uint_32; + ctx->set.nfs_export = true; + break; + case Opt_xino: + config->xino = result.uint_32; + break; + case Opt_metacopy: + config->metacopy = result.uint_32; + ctx->set.metacopy = true; + break; + case Opt_volatile: + config->ovl_volatile = true; + break; + case Opt_userxattr: + config->userxattr = true; + break; + default: + pr_err("unrecognized mount option \"%s\" or missing value\n", + param->key); + return -EINVAL; + } + + return err; +} + +static int ovl_get_tree(struct fs_context *fc) +{ + return get_tree_nodev(fc, ovl_fill_super); +} + +static inline void ovl_fs_context_free(struct ovl_fs_context *ctx) +{ + ovl_parse_param_drop_lowerdir(ctx); + path_put(&ctx->upper); + path_put(&ctx->work); + kfree(ctx->lower); + kfree(ctx); +} + +static void ovl_free(struct fs_context *fc) +{ + struct ovl_fs *ofs = fc->s_fs_info; + struct ovl_fs_context *ctx = fc->fs_private; + + /* + * ofs is stored in the fs_context when it is initialized. + * ofs is transferred to the superblock on a successful mount, + * but if an error occurs before the transfer we have to free + * it here. + */ + if (ofs) + ovl_free_fs(ofs); + + if (ctx) + ovl_fs_context_free(ctx); +} + +static int ovl_reconfigure(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; + struct ovl_fs *ofs = sb->s_fs_info; + struct super_block *upper_sb; + int ret = 0; + + if (!(fc->sb_flags & SB_RDONLY) && ovl_force_readonly(ofs)) + return -EROFS; + + if (fc->sb_flags & SB_RDONLY && !sb_rdonly(sb)) { + upper_sb = ovl_upper_mnt(ofs)->mnt_sb; + if (ovl_should_sync(ofs)) { + down_read(&upper_sb->s_umount); + ret = sync_filesystem(upper_sb); + up_read(&upper_sb->s_umount); + } + } + + return ret; +} + +static const struct fs_context_operations ovl_context_ops = { + .parse_param = ovl_parse_param, + .get_tree = ovl_get_tree, + .reconfigure = ovl_reconfigure, + .free = ovl_free, +}; + +/* + * This is called during fsopen() and will record the user namespace of + * the caller in fc->user_ns since we've raised FS_USERNS_MOUNT. We'll + * need it when we actually create the superblock to verify that the + * process creating the superblock is in the same user namespace as + * process that called fsopen(). + */ +int ovl_init_fs_context(struct fs_context *fc) +{ + struct ovl_fs_context *ctx; + struct ovl_fs *ofs; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT); + if (!ctx) + return -ENOMEM; + + /* + * By default we allocate for three lower layers. It's likely + * that it'll cover most users. + */ + ctx->lower = kmalloc_array(3, sizeof(*ctx->lower), GFP_KERNEL_ACCOUNT); + if (!ctx->lower) + goto out_err; + ctx->capacity = 3; + + ofs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL); + if (!ofs) + goto out_err; + + ofs->config.redirect_mode = ovl_redirect_mode_def(); + ofs->config.index = ovl_index_def; + ofs->config.uuid = true; + ofs->config.nfs_export = ovl_nfs_export_def; + ofs->config.xino = ovl_xino_def(); + ofs->config.metacopy = ovl_metacopy_def; + + fc->s_fs_info = ofs; + fc->fs_private = ctx; + fc->ops = &ovl_context_ops; + return 0; + +out_err: + ovl_fs_context_free(ctx); + return -ENOMEM; + +} + +void ovl_free_fs(struct ovl_fs *ofs) +{ + struct vfsmount **mounts; + unsigned i; + + iput(ofs->workbasedir_trap); + iput(ofs->indexdir_trap); + iput(ofs->workdir_trap); + dput(ofs->whiteout); + dput(ofs->indexdir); + dput(ofs->workdir); + if (ofs->workdir_locked) + ovl_inuse_unlock(ofs->workbasedir); + dput(ofs->workbasedir); + if (ofs->upperdir_locked) + ovl_inuse_unlock(ovl_upper_mnt(ofs)->mnt_root); + + /* Hack! Reuse ofs->layers as a vfsmount array before freeing it */ + mounts = (struct vfsmount **) ofs->layers; + for (i = 0; i < ofs->numlayer; i++) { + iput(ofs->layers[i].trap); + mounts[i] = ofs->layers[i].mnt; + kfree(ofs->layers[i].name); + } + kern_unmount_array(mounts, ofs->numlayer); + kfree(ofs->layers); + for (i = 0; i < ofs->numfs; i++) + free_anon_bdev(ofs->fs[i].pseudo_dev); + kfree(ofs->fs); + + kfree(ofs->config.upperdir); + kfree(ofs->config.workdir); + if (ofs->creator_cred) + put_cred(ofs->creator_cred); + kfree(ofs); +} + +int ovl_fs_params_verify(const struct ovl_fs_context *ctx, + struct ovl_config *config) +{ + struct ovl_opt_set set = ctx->set; + + if (ctx->nr_data > 0 && !config->metacopy) { + pr_err("lower data-only dirs require metacopy support.\n"); + return -EINVAL; + } + + /* Workdir/index are useless in non-upper mount */ + if (!config->upperdir) { + if (config->workdir) { + pr_info("option \"workdir=%s\" is useless in a non-upper mount, ignore\n", + config->workdir); + kfree(config->workdir); + config->workdir = NULL; + } + if (config->index && set.index) { + pr_info("option \"index=on\" is useless in a non-upper mount, ignore\n"); + set.index = false; + } + config->index = false; + } + + if (!config->upperdir && config->ovl_volatile) { + pr_info("option \"volatile\" is meaningless in a non-upper mount, ignoring it.\n"); + config->ovl_volatile = false; + } + + /* + * This is to make the logic below simpler. It doesn't make any other + * difference, since redirect_dir=on is only used for upper. + */ + if (!config->upperdir && config->redirect_mode == OVL_REDIRECT_FOLLOW) + config->redirect_mode = OVL_REDIRECT_ON; + + /* Resolve metacopy -> redirect_dir dependency */ + if (config->metacopy && config->redirect_mode != OVL_REDIRECT_ON) { + if (set.metacopy && set.redirect) { + pr_err("conflicting options: metacopy=on,redirect_dir=%s\n", + ovl_redirect_mode(config)); + return -EINVAL; + } + if (set.redirect) { + /* + * There was an explicit redirect_dir=... that resulted + * in this conflict. + */ + pr_info("disabling metacopy due to redirect_dir=%s\n", + ovl_redirect_mode(config)); + config->metacopy = false; + } else { + /* Automatically enable redirect otherwise. */ + config->redirect_mode = OVL_REDIRECT_ON; + } + } + + /* Resolve nfs_export -> index dependency */ + if (config->nfs_export && !config->index) { + if (!config->upperdir && + config->redirect_mode != OVL_REDIRECT_NOFOLLOW) { + pr_info("NFS export requires \"redirect_dir=nofollow\" on non-upper mount, falling back to nfs_export=off.\n"); + config->nfs_export = false; + } else if (set.nfs_export && set.index) { + pr_err("conflicting options: nfs_export=on,index=off\n"); + return -EINVAL; + } else if (set.index) { + /* + * There was an explicit index=off that resulted + * in this conflict. + */ + pr_info("disabling nfs_export due to index=off\n"); + config->nfs_export = false; + } else { + /* Automatically enable index otherwise. */ + config->index = true; + } + } + + /* Resolve nfs_export -> !metacopy dependency */ + if (config->nfs_export && config->metacopy) { + if (set.nfs_export && set.metacopy) { + pr_err("conflicting options: nfs_export=on,metacopy=on\n"); + return -EINVAL; + } + if (set.metacopy) { + /* + * There was an explicit metacopy=on that resulted + * in this conflict. + */ + pr_info("disabling nfs_export due to metacopy=on\n"); + config->nfs_export = false; + } else { + /* + * There was an explicit nfs_export=on that resulted + * in this conflict. + */ + pr_info("disabling metacopy due to nfs_export=on\n"); + config->metacopy = false; + } + } + + + /* Resolve userxattr -> !redirect && !metacopy dependency */ + if (config->userxattr) { + if (set.redirect && + config->redirect_mode != OVL_REDIRECT_NOFOLLOW) { + pr_err("conflicting options: userxattr,redirect_dir=%s\n", + ovl_redirect_mode(config)); + return -EINVAL; + } + if (config->metacopy && set.metacopy) { + pr_err("conflicting options: userxattr,metacopy=on\n"); + return -EINVAL; + } + /* + * Silently disable default setting of redirect and metacopy. + * This shall be the default in the future as well: these + * options must be explicitly enabled if used together with + * userxattr. + */ + config->redirect_mode = OVL_REDIRECT_NOFOLLOW; + config->metacopy = false; + } + + return 0; +} + +/** + * ovl_show_options + * @m: the seq_file handle + * @dentry: The dentry to query + * + * Prints the mount options for a given superblock. + * Returns zero; does not fail. + */ +int ovl_show_options(struct seq_file *m, struct dentry *dentry) +{ + struct super_block *sb = dentry->d_sb; + struct ovl_fs *ofs = sb->s_fs_info; + size_t nr, nr_merged_lower = ofs->numlayer - ofs->numdatalayer; + const struct ovl_layer *data_layers = &ofs->layers[nr_merged_lower]; + + /* ofs->layers[0] is the upper layer */ + seq_printf(m, ",lowerdir=%s", ofs->layers[1].name); + /* dump regular lower layers */ + for (nr = 2; nr < nr_merged_lower; nr++) + seq_printf(m, ":%s", ofs->layers[nr].name); + /* dump data lower layers */ + for (nr = 0; nr < ofs->numdatalayer; nr++) + seq_printf(m, "::%s", data_layers[nr].name); + if (ofs->config.upperdir) { + seq_show_option(m, "upperdir", ofs->config.upperdir); + seq_show_option(m, "workdir", ofs->config.workdir); + } + if (ofs->config.default_permissions) + seq_puts(m, ",default_permissions"); + if (ofs->config.redirect_mode != ovl_redirect_mode_def()) + seq_printf(m, ",redirect_dir=%s", + ovl_redirect_mode(&ofs->config)); + if (ofs->config.index != ovl_index_def) + seq_printf(m, ",index=%s", ofs->config.index ? "on" : "off"); + if (!ofs->config.uuid) + seq_puts(m, ",uuid=off"); + if (ofs->config.nfs_export != ovl_nfs_export_def) + seq_printf(m, ",nfs_export=%s", ofs->config.nfs_export ? + "on" : "off"); + if (ofs->config.xino != ovl_xino_def() && !ovl_same_fs(ofs)) + seq_printf(m, ",xino=%s", ovl_xino_mode(&ofs->config)); + if (ofs->config.metacopy != ovl_metacopy_def) + seq_printf(m, ",metacopy=%s", + ofs->config.metacopy ? "on" : "off"); + if (ofs->config.ovl_volatile) + seq_puts(m, ",volatile"); + if (ofs->config.userxattr) + seq_puts(m, ",userxattr"); + return 0; +} diff --git a/fs/overlayfs/params.h b/fs/overlayfs/params.h new file mode 100644 index 000000000000..8750da68ab2a --- /dev/null +++ b/fs/overlayfs/params.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#include <linux/fs_context.h> +#include <linux/fs_parser.h> + +struct ovl_fs; +struct ovl_config; + +extern const struct fs_parameter_spec ovl_parameter_spec[]; +extern const struct constant_table ovl_parameter_redirect_dir[]; + +/* The set of options that user requested explicitly via mount options */ +struct ovl_opt_set { + bool metacopy; + bool redirect; + bool nfs_export; + bool index; +}; + +#define OVL_MAX_STACK 500 + +struct ovl_fs_context_layer { + char *name; + struct path path; +}; + +struct ovl_fs_context { + struct path upper; + struct path work; + size_t capacity; + size_t nr; /* includes nr_data */ + size_t nr_data; + struct ovl_opt_set set; + struct ovl_fs_context_layer *lower; +}; + +int ovl_init_fs_context(struct fs_context *fc); +void ovl_free_fs(struct ovl_fs *ofs); +int ovl_fs_params_verify(const struct ovl_fs_context *ctx, + struct ovl_config *config); +int ovl_show_options(struct seq_file *m, struct dentry *dentry); +const char *ovl_xino_mode(struct ovl_config *config); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index b6952b21a7ee..ee5c4736480f 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -118,7 +118,7 @@ static bool ovl_calc_d_ino(struct ovl_readdir_data *rdd, return false; /* Always recalc d_ino when remapping lower inode numbers */ - if (ovl_xino_bits(rdd->dentry->d_sb)) + if (ovl_xino_bits(OVL_FS(rdd->dentry->d_sb))) return true; /* Always recalc d_ino for parent */ @@ -460,13 +460,14 @@ static int ovl_cache_update_ino(const struct path *path, struct ovl_cache_entry { struct dentry *dir = path->dentry; + struct ovl_fs *ofs = OVL_FS(dir->d_sb); struct dentry *this = NULL; enum ovl_path_type type; u64 ino = p->real_ino; - int xinobits = ovl_xino_bits(dir->d_sb); + int xinobits = ovl_xino_bits(ofs); int err = 0; - if (!ovl_same_dev(dir->d_sb)) + if (!ovl_same_dev(ofs)) goto out; if (p->name[0] == '.') { @@ -515,7 +516,7 @@ get: ino = ovl_remap_lower_ino(ino, xinobits, ovl_layer_lower(this)->fsid, p->name, p->len, - ovl_xino_warn(dir->d_sb)); + ovl_xino_warn(ofs)); } out: @@ -694,12 +695,13 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx) int err; struct ovl_dir_file *od = file->private_data; struct dentry *dir = file->f_path.dentry; + struct ovl_fs *ofs = OVL_FS(dir->d_sb); const struct ovl_layer *lower_layer = ovl_layer_lower(dir); struct ovl_readdir_translate rdt = { .ctx.actor = ovl_fill_real, .orig_ctx = ctx, - .xinobits = ovl_xino_bits(dir->d_sb), - .xinowarn = ovl_xino_warn(dir->d_sb), + .xinobits = ovl_xino_bits(ofs), + .xinowarn = ovl_xino_warn(ofs), }; if (rdt.xinobits && lower_layer) @@ -735,6 +737,7 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) { struct ovl_dir_file *od = file->private_data; struct dentry *dentry = file->f_path.dentry; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct ovl_cache_entry *p; const struct cred *old_cred; int err; @@ -749,8 +752,8 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) * dir is impure then need to adjust d_ino for copied up * entries. */ - if (ovl_xino_bits(dentry->d_sb) || - (ovl_same_fs(dentry->d_sb) && + if (ovl_xino_bits(ofs) || + (ovl_same_fs(ofs) && (ovl_is_impure_dir(file) || OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent))))) { err = ovl_iterate_real(file, ctx); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index f97ad8b40dbb..5b069f1a1e44 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -16,7 +16,10 @@ #include <linux/posix_acl_xattr.h> #include <linux/exportfs.h> #include <linux/file.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include "overlayfs.h" +#include "params.h" MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Overlay filesystem"); @@ -25,58 +28,6 @@ MODULE_LICENSE("GPL"); struct ovl_dir_cache; -#define OVL_MAX_STACK 500 - -static bool ovl_redirect_dir_def = IS_ENABLED(CONFIG_OVERLAY_FS_REDIRECT_DIR); -module_param_named(redirect_dir, ovl_redirect_dir_def, bool, 0644); -MODULE_PARM_DESC(redirect_dir, - "Default to on or off for the redirect_dir feature"); - -static bool ovl_redirect_always_follow = - IS_ENABLED(CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW); -module_param_named(redirect_always_follow, ovl_redirect_always_follow, - bool, 0644); -MODULE_PARM_DESC(redirect_always_follow, - "Follow redirects even if redirect_dir feature is turned off"); - -static bool ovl_index_def = IS_ENABLED(CONFIG_OVERLAY_FS_INDEX); -module_param_named(index, ovl_index_def, bool, 0644); -MODULE_PARM_DESC(index, - "Default to on or off for the inodes index feature"); - -static bool ovl_nfs_export_def = IS_ENABLED(CONFIG_OVERLAY_FS_NFS_EXPORT); -module_param_named(nfs_export, ovl_nfs_export_def, bool, 0644); -MODULE_PARM_DESC(nfs_export, - "Default to on or off for the NFS export feature"); - -static bool ovl_xino_auto_def = IS_ENABLED(CONFIG_OVERLAY_FS_XINO_AUTO); -module_param_named(xino_auto, ovl_xino_auto_def, bool, 0644); -MODULE_PARM_DESC(xino_auto, - "Auto enable xino feature"); - -static void ovl_entry_stack_free(struct ovl_entry *oe) -{ - unsigned int i; - - for (i = 0; i < oe->numlower; i++) - dput(oe->lowerstack[i].dentry); -} - -static bool ovl_metacopy_def = IS_ENABLED(CONFIG_OVERLAY_FS_METACOPY); -module_param_named(metacopy, ovl_metacopy_def, bool, 0644); -MODULE_PARM_DESC(metacopy, - "Default to on or off for the metadata only copy up feature"); - -static void ovl_dentry_release(struct dentry *dentry) -{ - struct ovl_entry *oe = dentry->d_fsdata; - - if (oe) { - ovl_entry_stack_free(oe); - kfree_rcu(oe, rcu); - } -} - static struct dentry *ovl_d_real(struct dentry *dentry, const struct inode *inode) { @@ -99,6 +50,15 @@ static struct dentry *ovl_d_real(struct dentry *dentry, if (real && !inode && ovl_has_upperdata(d_inode(dentry))) return real; + /* + * Best effort lazy lookup of lowerdata for !inode case to return + * the real lowerdata dentry. The only current caller of d_real() with + * NULL inode is d_real_inode() from trace_uprobe and this caller is + * likely going to be followed reading from the file, before placing + * uprobes on offset within the file, so lowerdata should be available + * when setting the uprobe. + */ + ovl_maybe_lookup_lowerdata(dentry); lower = ovl_dentry_lowerdata(dentry); if (!lower) goto bug; @@ -121,6 +81,9 @@ static int ovl_revalidate_real(struct dentry *d, unsigned int flags, bool weak) { int ret = 1; + if (!d) + return 1; + if (weak) { if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE) ret = d->d_op->d_weak_revalidate(d, flags); @@ -138,7 +101,8 @@ static int ovl_revalidate_real(struct dentry *d, unsigned int flags, bool weak) static int ovl_dentry_revalidate_common(struct dentry *dentry, unsigned int flags, bool weak) { - struct ovl_entry *oe = dentry->d_fsdata; + struct ovl_entry *oe = OVL_E(dentry); + struct ovl_path *lowerstack = ovl_lowerstack(oe); struct inode *inode = d_inode_rcu(dentry); struct dentry *upper; unsigned int i; @@ -152,10 +116,9 @@ static int ovl_dentry_revalidate_common(struct dentry *dentry, if (upper) ret = ovl_revalidate_real(upper, flags, weak); - for (i = 0; ret > 0 && i < oe->numlower; i++) { - ret = ovl_revalidate_real(oe->lowerstack[i].dentry, flags, - weak); - } + for (i = 0; ret > 0 && i < ovl_numlower(oe); i++) + ret = ovl_revalidate_real(lowerstack[i].dentry, flags, weak); + return ret; } @@ -170,7 +133,6 @@ static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags) } static const struct dentry_operations ovl_dentry_operations = { - .d_release = ovl_dentry_release, .d_real = ovl_d_real, .d_revalidate = ovl_dentry_revalidate, .d_weak_revalidate = ovl_dentry_weak_revalidate, @@ -190,9 +152,8 @@ static struct inode *ovl_alloc_inode(struct super_block *sb) oi->version = 0; oi->flags = 0; oi->__upperdentry = NULL; - oi->lowerpath.dentry = NULL; - oi->lowerpath.layer = NULL; - oi->lowerdata = NULL; + oi->lowerdata_redirect = NULL; + oi->oe = NULL; mutex_init(&oi->lock); return &oi->vfs_inode; @@ -212,56 +173,19 @@ static void ovl_destroy_inode(struct inode *inode) struct ovl_inode *oi = OVL_I(inode); dput(oi->__upperdentry); - dput(oi->lowerpath.dentry); + ovl_free_entry(oi->oe); if (S_ISDIR(inode->i_mode)) ovl_dir_cache_free(inode); else - iput(oi->lowerdata); -} - -static void ovl_free_fs(struct ovl_fs *ofs) -{ - struct vfsmount **mounts; - unsigned i; - - iput(ofs->workbasedir_trap); - iput(ofs->indexdir_trap); - iput(ofs->workdir_trap); - dput(ofs->whiteout); - dput(ofs->indexdir); - dput(ofs->workdir); - if (ofs->workdir_locked) - ovl_inuse_unlock(ofs->workbasedir); - dput(ofs->workbasedir); - if (ofs->upperdir_locked) - ovl_inuse_unlock(ovl_upper_mnt(ofs)->mnt_root); - - /* Hack! Reuse ofs->layers as a vfsmount array before freeing it */ - mounts = (struct vfsmount **) ofs->layers; - for (i = 0; i < ofs->numlayer; i++) { - iput(ofs->layers[i].trap); - mounts[i] = ofs->layers[i].mnt; - } - kern_unmount_array(mounts, ofs->numlayer); - kfree(ofs->layers); - for (i = 0; i < ofs->numfs; i++) - free_anon_bdev(ofs->fs[i].pseudo_dev); - kfree(ofs->fs); - - kfree(ofs->config.lowerdir); - kfree(ofs->config.upperdir); - kfree(ofs->config.workdir); - kfree(ofs->config.redirect_mode); - if (ofs->creator_cred) - put_cred(ofs->creator_cred); - kfree(ofs); + kfree(oi->lowerdata_redirect); } static void ovl_put_super(struct super_block *sb) { struct ovl_fs *ofs = sb->s_fs_info; - ovl_free_fs(ofs); + if (ofs) + ovl_free_fs(ofs); } /* Sync real dirty inodes in upper filesystem (if it exists) */ @@ -331,90 +255,6 @@ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) return err; } -/* Will this overlay be forced to mount/remount ro? */ -static bool ovl_force_readonly(struct ovl_fs *ofs) -{ - return (!ovl_upper_mnt(ofs) || !ofs->workdir); -} - -static const char *ovl_redirect_mode_def(void) -{ - return ovl_redirect_dir_def ? "on" : "off"; -} - -static const char * const ovl_xino_str[] = { - "off", - "auto", - "on", -}; - -static inline int ovl_xino_def(void) -{ - return ovl_xino_auto_def ? OVL_XINO_AUTO : OVL_XINO_OFF; -} - -/** - * ovl_show_options - * @m: the seq_file handle - * @dentry: The dentry to query - * - * Prints the mount options for a given superblock. - * Returns zero; does not fail. - */ -static int ovl_show_options(struct seq_file *m, struct dentry *dentry) -{ - struct super_block *sb = dentry->d_sb; - struct ovl_fs *ofs = sb->s_fs_info; - - seq_show_option(m, "lowerdir", ofs->config.lowerdir); - if (ofs->config.upperdir) { - seq_show_option(m, "upperdir", ofs->config.upperdir); - seq_show_option(m, "workdir", ofs->config.workdir); - } - if (ofs->config.default_permissions) - seq_puts(m, ",default_permissions"); - if (strcmp(ofs->config.redirect_mode, ovl_redirect_mode_def()) != 0) - seq_printf(m, ",redirect_dir=%s", ofs->config.redirect_mode); - if (ofs->config.index != ovl_index_def) - seq_printf(m, ",index=%s", ofs->config.index ? "on" : "off"); - if (!ofs->config.uuid) - seq_puts(m, ",uuid=off"); - if (ofs->config.nfs_export != ovl_nfs_export_def) - seq_printf(m, ",nfs_export=%s", ofs->config.nfs_export ? - "on" : "off"); - if (ofs->config.xino != ovl_xino_def() && !ovl_same_fs(sb)) - seq_printf(m, ",xino=%s", ovl_xino_str[ofs->config.xino]); - if (ofs->config.metacopy != ovl_metacopy_def) - seq_printf(m, ",metacopy=%s", - ofs->config.metacopy ? "on" : "off"); - if (ofs->config.ovl_volatile) - seq_puts(m, ",volatile"); - if (ofs->config.userxattr) - seq_puts(m, ",userxattr"); - return 0; -} - -static int ovl_remount(struct super_block *sb, int *flags, char *data) -{ - struct ovl_fs *ofs = sb->s_fs_info; - struct super_block *upper_sb; - int ret = 0; - - if (!(*flags & SB_RDONLY) && ovl_force_readonly(ofs)) - return -EROFS; - - if (*flags & SB_RDONLY && !sb_rdonly(sb)) { - upper_sb = ovl_upper_mnt(ofs)->mnt_sb; - if (ovl_should_sync(ofs)) { - down_read(&upper_sb->s_umount); - ret = sync_filesystem(upper_sb); - up_read(&upper_sb->s_umount); - } - } - - return ret; -} - static const struct super_operations ovl_super_operations = { .alloc_inode = ovl_alloc_inode, .free_inode = ovl_free_inode, @@ -424,338 +264,8 @@ static const struct super_operations ovl_super_operations = { .sync_fs = ovl_sync_fs, .statfs = ovl_statfs, .show_options = ovl_show_options, - .remount_fs = ovl_remount, -}; - -enum { - OPT_LOWERDIR, - OPT_UPPERDIR, - OPT_WORKDIR, - OPT_DEFAULT_PERMISSIONS, - OPT_REDIRECT_DIR, - OPT_INDEX_ON, - OPT_INDEX_OFF, - OPT_UUID_ON, - OPT_UUID_OFF, - OPT_NFS_EXPORT_ON, - OPT_USERXATTR, - OPT_NFS_EXPORT_OFF, - OPT_XINO_ON, - OPT_XINO_OFF, - OPT_XINO_AUTO, - OPT_METACOPY_ON, - OPT_METACOPY_OFF, - OPT_VOLATILE, - OPT_ERR, }; -static const match_table_t ovl_tokens = { - {OPT_LOWERDIR, "lowerdir=%s"}, - {OPT_UPPERDIR, "upperdir=%s"}, - {OPT_WORKDIR, "workdir=%s"}, - {OPT_DEFAULT_PERMISSIONS, "default_permissions"}, - {OPT_REDIRECT_DIR, "redirect_dir=%s"}, - {OPT_INDEX_ON, "index=on"}, - {OPT_INDEX_OFF, "index=off"}, - {OPT_USERXATTR, "userxattr"}, - {OPT_UUID_ON, "uuid=on"}, - {OPT_UUID_OFF, "uuid=off"}, - {OPT_NFS_EXPORT_ON, "nfs_export=on"}, - {OPT_NFS_EXPORT_OFF, "nfs_export=off"}, - {OPT_XINO_ON, "xino=on"}, - {OPT_XINO_OFF, "xino=off"}, - {OPT_XINO_AUTO, "xino=auto"}, - {OPT_METACOPY_ON, "metacopy=on"}, - {OPT_METACOPY_OFF, "metacopy=off"}, - {OPT_VOLATILE, "volatile"}, - {OPT_ERR, NULL} -}; - -static char *ovl_next_opt(char **s) -{ - char *sbegin = *s; - char *p; - - if (sbegin == NULL) - return NULL; - - for (p = sbegin; *p; p++) { - if (*p == '\\') { - p++; - if (!*p) - break; - } else if (*p == ',') { - *p = '\0'; - *s = p + 1; - return sbegin; - } - } - *s = NULL; - return sbegin; -} - -static int ovl_parse_redirect_mode(struct ovl_config *config, const char *mode) -{ - if (strcmp(mode, "on") == 0) { - config->redirect_dir = true; - /* - * Does not make sense to have redirect creation without - * redirect following. - */ - config->redirect_follow = true; - } else if (strcmp(mode, "follow") == 0) { - config->redirect_follow = true; - } else if (strcmp(mode, "off") == 0) { - if (ovl_redirect_always_follow) - config->redirect_follow = true; - } else if (strcmp(mode, "nofollow") != 0) { - pr_err("bad mount option \"redirect_dir=%s\"\n", - mode); - return -EINVAL; - } - - return 0; -} - -static int ovl_parse_opt(char *opt, struct ovl_config *config) -{ - char *p; - int err; - bool metacopy_opt = false, redirect_opt = false; - bool nfs_export_opt = false, index_opt = false; - - config->redirect_mode = kstrdup(ovl_redirect_mode_def(), GFP_KERNEL); - if (!config->redirect_mode) - return -ENOMEM; - - while ((p = ovl_next_opt(&opt)) != NULL) { - int token; - substring_t args[MAX_OPT_ARGS]; - - if (!*p) - continue; - - token = match_token(p, ovl_tokens, args); - switch (token) { - case OPT_UPPERDIR: - kfree(config->upperdir); - config->upperdir = match_strdup(&args[0]); - if (!config->upperdir) - return -ENOMEM; - break; - - case OPT_LOWERDIR: - kfree(config->lowerdir); - config->lowerdir = match_strdup(&args[0]); - if (!config->lowerdir) - return -ENOMEM; - break; - - case OPT_WORKDIR: - kfree(config->workdir); - config->workdir = match_strdup(&args[0]); - if (!config->workdir) - return -ENOMEM; - break; - - case OPT_DEFAULT_PERMISSIONS: - config->default_permissions = true; - break; - - case OPT_REDIRECT_DIR: - kfree(config->redirect_mode); - config->redirect_mode = match_strdup(&args[0]); - if (!config->redirect_mode) - return -ENOMEM; - redirect_opt = true; - break; - - case OPT_INDEX_ON: - config->index = true; - index_opt = true; - break; - - case OPT_INDEX_OFF: - config->index = false; - index_opt = true; - break; - - case OPT_UUID_ON: - config->uuid = true; - break; - - case OPT_UUID_OFF: - config->uuid = false; - break; - - case OPT_NFS_EXPORT_ON: - config->nfs_export = true; - nfs_export_opt = true; - break; - - case OPT_NFS_EXPORT_OFF: - config->nfs_export = false; - nfs_export_opt = true; - break; - - case OPT_XINO_ON: - config->xino = OVL_XINO_ON; - break; - - case OPT_XINO_OFF: - config->xino = OVL_XINO_OFF; - break; - - case OPT_XINO_AUTO: - config->xino = OVL_XINO_AUTO; - break; - - case OPT_METACOPY_ON: - config->metacopy = true; - metacopy_opt = true; - break; - - case OPT_METACOPY_OFF: - config->metacopy = false; - metacopy_opt = true; - break; - - case OPT_VOLATILE: - config->ovl_volatile = true; - break; - - case OPT_USERXATTR: - config->userxattr = true; - break; - - default: - pr_err("unrecognized mount option \"%s\" or missing value\n", - p); - return -EINVAL; - } - } - - /* Workdir/index are useless in non-upper mount */ - if (!config->upperdir) { - if (config->workdir) { - pr_info("option \"workdir=%s\" is useless in a non-upper mount, ignore\n", - config->workdir); - kfree(config->workdir); - config->workdir = NULL; - } - if (config->index && index_opt) { - pr_info("option \"index=on\" is useless in a non-upper mount, ignore\n"); - index_opt = false; - } - config->index = false; - } - - if (!config->upperdir && config->ovl_volatile) { - pr_info("option \"volatile\" is meaningless in a non-upper mount, ignoring it.\n"); - config->ovl_volatile = false; - } - - err = ovl_parse_redirect_mode(config, config->redirect_mode); - if (err) - return err; - - /* - * This is to make the logic below simpler. It doesn't make any other - * difference, since config->redirect_dir is only used for upper. - */ - if (!config->upperdir && config->redirect_follow) - config->redirect_dir = true; - - /* Resolve metacopy -> redirect_dir dependency */ - if (config->metacopy && !config->redirect_dir) { - if (metacopy_opt && redirect_opt) { - pr_err("conflicting options: metacopy=on,redirect_dir=%s\n", - config->redirect_mode); - return -EINVAL; - } - if (redirect_opt) { - /* - * There was an explicit redirect_dir=... that resulted - * in this conflict. - */ - pr_info("disabling metacopy due to redirect_dir=%s\n", - config->redirect_mode); - config->metacopy = false; - } else { - /* Automatically enable redirect otherwise. */ - config->redirect_follow = config->redirect_dir = true; - } - } - - /* Resolve nfs_export -> index dependency */ - if (config->nfs_export && !config->index) { - if (!config->upperdir && config->redirect_follow) { - pr_info("NFS export requires \"redirect_dir=nofollow\" on non-upper mount, falling back to nfs_export=off.\n"); - config->nfs_export = false; - } else if (nfs_export_opt && index_opt) { - pr_err("conflicting options: nfs_export=on,index=off\n"); - return -EINVAL; - } else if (index_opt) { - /* - * There was an explicit index=off that resulted - * in this conflict. - */ - pr_info("disabling nfs_export due to index=off\n"); - config->nfs_export = false; - } else { - /* Automatically enable index otherwise. */ - config->index = true; - } - } - - /* Resolve nfs_export -> !metacopy dependency */ - if (config->nfs_export && config->metacopy) { - if (nfs_export_opt && metacopy_opt) { - pr_err("conflicting options: nfs_export=on,metacopy=on\n"); - return -EINVAL; - } - if (metacopy_opt) { - /* - * There was an explicit metacopy=on that resulted - * in this conflict. - */ - pr_info("disabling nfs_export due to metacopy=on\n"); - config->nfs_export = false; - } else { - /* - * There was an explicit nfs_export=on that resulted - * in this conflict. - */ - pr_info("disabling metacopy due to nfs_export=on\n"); - config->metacopy = false; - } - } - - - /* Resolve userxattr -> !redirect && !metacopy dependency */ - if (config->userxattr) { - if (config->redirect_follow && redirect_opt) { - pr_err("conflicting options: userxattr,redirect_dir=%s\n", - config->redirect_mode); - return -EINVAL; - } - if (config->metacopy && metacopy_opt) { - pr_err("conflicting options: userxattr,metacopy=on\n"); - return -EINVAL; - } - /* - * Silently disable default setting of redirect and metacopy. - * This shall be the default in the future as well: these - * options must be explicitly enabled if used together with - * userxattr. - */ - config->redirect_dir = config->redirect_follow = false; - config->metacopy = false; - } - - return 0; -} - #define OVL_WORKDIR_NAME "work" #define OVL_INDEXDIR_NAME "index" @@ -849,69 +359,6 @@ out_err: goto out_unlock; } -static void ovl_unescape(char *s) -{ - char *d = s; - - for (;; s++, d++) { - if (*s == '\\') - s++; - *d = *s; - if (!*s) - break; - } -} - -static int ovl_mount_dir_noesc(const char *name, struct path *path) -{ - int err = -EINVAL; - - if (!*name) { - pr_err("empty lowerdir\n"); - goto out; - } - err = kern_path(name, LOOKUP_FOLLOW, path); - if (err) { - pr_err("failed to resolve '%s': %i\n", name, err); - goto out; - } - err = -EINVAL; - if (ovl_dentry_weird(path->dentry)) { - pr_err("filesystem on '%s' not supported\n", name); - goto out_put; - } - if (!d_is_dir(path->dentry)) { - pr_err("'%s' not a directory\n", name); - goto out_put; - } - return 0; - -out_put: - path_put_init(path); -out: - return err; -} - -static int ovl_mount_dir(const char *name, struct path *path) -{ - int err = -ENOMEM; - char *tmp = kstrdup(name, GFP_KERNEL); - - if (tmp) { - ovl_unescape(tmp); - err = ovl_mount_dir_noesc(tmp, path); - - if (!err && path->dentry->d_flags & DCACHE_OP_REAL) { - pr_err("filesystem on '%s' not supported as upperdir\n", - tmp); - path_put_init(path); - err = -EINVAL; - } - kfree(tmp); - } - return err; -} - static int ovl_check_namelen(const struct path *path, struct ovl_fs *ofs, const char *name) { @@ -932,10 +379,6 @@ static int ovl_lower_dir(const char *name, struct path *path, int fh_type; int err; - err = ovl_mount_dir_noesc(name, path); - if (err) - return err; - err = ovl_check_namelen(path, ofs, name); if (err) return err; @@ -984,26 +427,6 @@ static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir) return ok; } -static unsigned int ovl_split_lowerdirs(char *str) -{ - unsigned int ctr = 1; - char *s, *d; - - for (s = d = str;; s++, d++) { - if (*s == '\\') { - s++; - } else if (*s == ':') { - *d = '\0'; - ctr++; - continue; - } - *d = *s; - if (!*s) - break; - } - return ctr; -} - static int ovl_own_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size) @@ -1104,15 +527,12 @@ static int ovl_report_in_use(struct ovl_fs *ofs, const char *name) } static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs, - struct ovl_layer *upper_layer, struct path *upperpath) + struct ovl_layer *upper_layer, + const struct path *upperpath) { struct vfsmount *upper_mnt; int err; - err = ovl_mount_dir(ofs->config.upperdir, upperpath); - if (err) - goto out; - /* Upperdir path should not be r/o */ if (__mnt_is_readonly(upperpath->mnt)) { pr_err("upper fs is r/o, try multi-lower layers mount\n"); @@ -1142,6 +562,11 @@ static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs, upper_layer->idx = 0; upper_layer->fsid = 0; + err = -ENOMEM; + upper_layer->name = kstrdup(ofs->config.upperdir, GFP_KERNEL); + if (!upper_layer->name) + goto out; + /* * Inherit SB_NOSEC flag from upperdir. * @@ -1333,10 +758,17 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, if (err) { pr_warn("failed to set xattr on upper\n"); ofs->noxattr = true; - if (ofs->config.index || ofs->config.metacopy) { - ofs->config.index = false; + if (ovl_redirect_follow(ofs)) { + ofs->config.redirect_mode = OVL_REDIRECT_NOFOLLOW; + pr_warn("...falling back to redirect_dir=nofollow.\n"); + } + if (ofs->config.metacopy) { ofs->config.metacopy = false; - pr_warn("...falling back to index=off,metacopy=off.\n"); + pr_warn("...falling back to metacopy=off.\n"); + } + if (ofs->config.index) { + ofs->config.index = false; + pr_warn("...falling back to index=off.\n"); } /* * xattr support is required for persistent st_ino. @@ -1399,46 +831,37 @@ out: } static int ovl_get_workdir(struct super_block *sb, struct ovl_fs *ofs, - const struct path *upperpath) + const struct path *upperpath, + const struct path *workpath) { int err; - struct path workpath = { }; - - err = ovl_mount_dir(ofs->config.workdir, &workpath); - if (err) - goto out; err = -EINVAL; - if (upperpath->mnt != workpath.mnt) { + if (upperpath->mnt != workpath->mnt) { pr_err("workdir and upperdir must reside under the same mount\n"); - goto out; + return err; } - if (!ovl_workdir_ok(workpath.dentry, upperpath->dentry)) { + if (!ovl_workdir_ok(workpath->dentry, upperpath->dentry)) { pr_err("workdir and upperdir must be separate subtrees\n"); - goto out; + return err; } - ofs->workbasedir = dget(workpath.dentry); + ofs->workbasedir = dget(workpath->dentry); if (ovl_inuse_trylock(ofs->workbasedir)) { ofs->workdir_locked = true; } else { err = ovl_report_in_use(ofs, "workdir"); if (err) - goto out; + return err; } err = ovl_setup_trap(sb, ofs->workbasedir, &ofs->workbasedir_trap, "workdir"); if (err) - goto out; - - err = ovl_make_workdir(sb, ofs, &workpath); - -out: - path_put(&workpath); + return err; - return err; + return ovl_make_workdir(sb, ofs, workpath); } static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, @@ -1454,7 +877,7 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, /* Verify lower root is upper root origin */ err = ovl_verify_origin(ofs, upperpath->dentry, - oe->lowerstack[0].dentry, true); + ovl_lowerstack(oe)->dentry, true); if (err) { pr_err("failed to verify upper root origin\n"); goto out; @@ -1574,7 +997,7 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path) pr_warn("%s uuid detected in lower fs '%pd2', falling back to xino=%s,index=off,nfs_export=off.\n", uuid_is_null(&sb->s_uuid) ? "null" : "conflicting", - path->dentry, ovl_xino_str[ofs->config.xino]); + path->dentry, ovl_xino_mode(&ofs->config)); } } @@ -1591,19 +1014,31 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path) return ofs->numfs++; } +/* + * The fsid after the last lower fsid is used for the data layers. + * It is a "null fs" with a null sb, null uuid, and no pseudo dev. + */ +static int ovl_get_data_fsid(struct ovl_fs *ofs) +{ + return ofs->numfs; +} + + static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, - struct path *stack, unsigned int numlower, - struct ovl_layer *layers) + struct ovl_fs_context *ctx, struct ovl_layer *layers) { int err; unsigned int i; + size_t nr_merged_lower; - err = -ENOMEM; - ofs->fs = kcalloc(numlower + 1, sizeof(struct ovl_sb), GFP_KERNEL); + ofs->fs = kcalloc(ctx->nr + 2, sizeof(struct ovl_sb), GFP_KERNEL); if (ofs->fs == NULL) - goto out; + return -ENOMEM; - /* idx/fsid 0 are reserved for upper fs even with lower only overlay */ + /* + * idx/fsid 0 are reserved for upper fs even with lower only overlay + * and the last fsid is reserved for "null fs" of the data layers. + */ ofs->numfs++; /* @@ -1615,7 +1050,7 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, err = get_anon_bdev(&ofs->fs[0].pseudo_dev); if (err) { pr_err("failed to get anonymous bdev for upper fs\n"); - goto out; + return err; } if (ovl_upper_mnt(ofs)) { @@ -1623,14 +1058,19 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, ofs->fs[0].is_lower = false; } - for (i = 0; i < numlower; i++) { + nr_merged_lower = ctx->nr - ctx->nr_data; + for (i = 0; i < ctx->nr; i++) { + struct ovl_fs_context_layer *l = &ctx->lower[i]; struct vfsmount *mnt; struct inode *trap; int fsid; - err = fsid = ovl_get_fsid(ofs, &stack[i]); - if (err < 0) - goto out; + if (i < nr_merged_lower) + fsid = ovl_get_fsid(ofs, &l->path); + else + fsid = ovl_get_data_fsid(ofs); + if (fsid < 0) + return fsid; /* * Check if lower root conflicts with this overlay layers before @@ -1639,24 +1079,24 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, * the upperdir/workdir is in fact in-use by our * upperdir/workdir. */ - err = ovl_setup_trap(sb, stack[i].dentry, &trap, "lowerdir"); + err = ovl_setup_trap(sb, l->path.dentry, &trap, "lowerdir"); if (err) - goto out; + return err; - if (ovl_is_inuse(stack[i].dentry)) { + if (ovl_is_inuse(l->path.dentry)) { err = ovl_report_in_use(ofs, "lowerdir"); if (err) { iput(trap); - goto out; + return err; } } - mnt = clone_private_mount(&stack[i]); + mnt = clone_private_mount(&l->path); err = PTR_ERR(mnt); if (IS_ERR(mnt)) { pr_err("failed to clone lowerpath\n"); iput(trap); - goto out; + return err; } /* @@ -1670,6 +1110,8 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, layers[ofs->numlayer].idx = ofs->numlayer; layers[ofs->numlayer].fsid = fsid; layers[ofs->numlayer].fs = &ofs->fs[fsid]; + layers[ofs->numlayer].name = l->name; + l->name = NULL; ofs->numlayer++; ofs->fs[fsid].is_lower = true; } @@ -1706,69 +1148,63 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, ofs->xino_mode); } - err = 0; -out: - return err; + return 0; } static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, - const char *lower, unsigned int numlower, - struct ovl_fs *ofs, struct ovl_layer *layers) + struct ovl_fs_context *ctx, + struct ovl_fs *ofs, + struct ovl_layer *layers) { int err; - struct path *stack = NULL; unsigned int i; + size_t nr_merged_lower; struct ovl_entry *oe; + struct ovl_path *lowerstack; - if (!ofs->config.upperdir && numlower == 1) { + struct ovl_fs_context_layer *l; + + if (!ofs->config.upperdir && ctx->nr == 1) { pr_err("at least 2 lowerdir are needed while upperdir nonexistent\n"); return ERR_PTR(-EINVAL); } - stack = kcalloc(numlower, sizeof(struct path), GFP_KERNEL); - if (!stack) - return ERR_PTR(-ENOMEM); - err = -EINVAL; - for (i = 0; i < numlower; i++) { - err = ovl_lower_dir(lower, &stack[i], ofs, &sb->s_stack_depth); - if (err) - goto out_err; + for (i = 0; i < ctx->nr; i++) { + l = &ctx->lower[i]; - lower = strchr(lower, '\0') + 1; + err = ovl_lower_dir(l->name, &l->path, ofs, &sb->s_stack_depth); + if (err) + return ERR_PTR(err); } err = -EINVAL; sb->s_stack_depth++; if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { pr_err("maximum fs stacking depth exceeded\n"); - goto out_err; + return ERR_PTR(err); } - err = ovl_get_layers(sb, ofs, stack, numlower, layers); + err = ovl_get_layers(sb, ofs, ctx, layers); if (err) - goto out_err; + return ERR_PTR(err); err = -ENOMEM; - oe = ovl_alloc_entry(numlower); + /* Data-only layers are not merged in root directory */ + nr_merged_lower = ctx->nr - ctx->nr_data; + oe = ovl_alloc_entry(nr_merged_lower); if (!oe) - goto out_err; + return ERR_PTR(err); - for (i = 0; i < numlower; i++) { - oe->lowerstack[i].dentry = dget(stack[i].dentry); - oe->lowerstack[i].layer = &ofs->layers[i+1]; + lowerstack = ovl_lowerstack(oe); + for (i = 0; i < nr_merged_lower; i++) { + l = &ctx->lower[i]; + lowerstack[i].dentry = dget(l->path.dentry); + lowerstack[i].layer = &ofs->layers[i + 1]; } - -out: - for (i = 0; i < numlower; i++) - path_put(&stack[i]); - kfree(stack); + ofs->numdatalayer = ctx->nr_data; return oe; - -out_err: - oe = ERR_PTR(err); - goto out; } /* @@ -1849,20 +1285,18 @@ static struct dentry *ovl_get_root(struct super_block *sb, struct ovl_entry *oe) { struct dentry *root; - struct ovl_path *lowerpath = &oe->lowerstack[0]; + struct ovl_path *lowerpath = ovl_lowerstack(oe); unsigned long ino = d_inode(lowerpath->dentry)->i_ino; int fsid = lowerpath->layer->fsid; struct ovl_inode_params oip = { .upperdentry = upperdentry, - .lowerpath = lowerpath, + .oe = oe, }; root = d_make_root(ovl_new_inode(sb, S_IFDIR, 0)); if (!root) return NULL; - root->d_fsdata = oe; - if (upperdentry) { /* Root inode uses upper st_ino/i_ino */ ino = d_inode(upperdentry)->i_ino; @@ -1877,73 +1311,47 @@ static struct dentry *ovl_get_root(struct super_block *sb, ovl_dentry_set_flag(OVL_E_CONNECTED, root); ovl_set_upperdata(d_inode(root)); ovl_inode_init(d_inode(root), &oip, ino, fsid); - ovl_dentry_update_reval(root, upperdentry, DCACHE_OP_WEAK_REVALIDATE); + ovl_dentry_init_flags(root, upperdentry, oe, DCACHE_OP_WEAK_REVALIDATE); + /* root keeps a reference of upperdentry */ + dget(upperdentry); return root; } -static int ovl_fill_super(struct super_block *sb, void *data, int silent) +int ovl_fill_super(struct super_block *sb, struct fs_context *fc) { - struct path upperpath = { }; + struct ovl_fs *ofs = sb->s_fs_info; + struct ovl_fs_context *ctx = fc->fs_private; struct dentry *root_dentry; struct ovl_entry *oe; - struct ovl_fs *ofs; struct ovl_layer *layers; struct cred *cred; - char *splitlower = NULL; - unsigned int numlower; int err; err = -EIO; - if (WARN_ON(sb->s_user_ns != current_user_ns())) - goto out; + if (WARN_ON(fc->user_ns != current_user_ns())) + goto out_err; sb->s_d_op = &ovl_dentry_operations; err = -ENOMEM; - ofs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL); - if (!ofs) - goto out; - - err = -ENOMEM; ofs->creator_cred = cred = prepare_creds(); if (!cred) goto out_err; - /* Is there a reason anyone would want not to share whiteouts? */ - ofs->share_whiteout = true; - - ofs->config.index = ovl_index_def; - ofs->config.uuid = true; - ofs->config.nfs_export = ovl_nfs_export_def; - ofs->config.xino = ovl_xino_def(); - ofs->config.metacopy = ovl_metacopy_def; - err = ovl_parse_opt((char *) data, &ofs->config); + err = ovl_fs_params_verify(ctx, &ofs->config); if (err) goto out_err; err = -EINVAL; - if (!ofs->config.lowerdir) { - if (!silent) + if (ctx->nr == 0) { + if (!(fc->sb_flags & SB_SILENT)) pr_err("missing 'lowerdir'\n"); goto out_err; } err = -ENOMEM; - splitlower = kstrdup(ofs->config.lowerdir, GFP_KERNEL); - if (!splitlower) - goto out_err; - - err = -EINVAL; - numlower = ovl_split_lowerdirs(splitlower); - if (numlower > OVL_MAX_STACK) { - pr_err("too many lower directories, limit is %d\n", - OVL_MAX_STACK); - goto out_err; - } - - err = -ENOMEM; - layers = kcalloc(numlower + 1, sizeof(struct ovl_layer), GFP_KERNEL); + layers = kcalloc(ctx->nr + 1, sizeof(struct ovl_layer), GFP_KERNEL); if (!layers) goto out_err; @@ -1975,7 +1383,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) goto out_err; } - err = ovl_get_upper(sb, ofs, &layers[0], &upperpath); + err = ovl_get_upper(sb, ofs, &layers[0], &ctx->upper); if (err) goto out_err; @@ -1989,7 +1397,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) } } - err = ovl_get_workdir(sb, ofs, &upperpath); + err = ovl_get_workdir(sb, ofs, &ctx->upper, &ctx->work); if (err) goto out_err; @@ -1999,7 +1407,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_stack_depth = upper_sb->s_stack_depth; sb->s_time_gran = upper_sb->s_time_gran; } - oe = ovl_get_lowerstack(sb, splitlower, numlower, ofs, layers); + oe = ovl_get_lowerstack(sb, ctx, ofs, layers); err = PTR_ERR(oe); if (IS_ERR(oe)) goto out_err; @@ -2014,7 +1422,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) } if (!ovl_force_readonly(ofs) && ofs->config.index) { - err = ovl_get_indexdir(sb, ofs, oe, &upperpath); + err = ovl_get_indexdir(sb, ofs, oe, &ctx->upper); if (err) goto out_free_oe; @@ -2055,40 +1463,29 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_iflags |= SB_I_SKIP_SYNC; err = -ENOMEM; - root_dentry = ovl_get_root(sb, upperpath.dentry, oe); + root_dentry = ovl_get_root(sb, ctx->upper.dentry, oe); if (!root_dentry) goto out_free_oe; - mntput(upperpath.mnt); - kfree(splitlower); - sb->s_root = root_dentry; return 0; out_free_oe: - ovl_entry_stack_free(oe); - kfree(oe); + ovl_free_entry(oe); out_err: - kfree(splitlower); - path_put(&upperpath); ovl_free_fs(ofs); -out: + sb->s_fs_info = NULL; return err; } -static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *raw_data) -{ - return mount_nodev(fs_type, flags, raw_data, ovl_fill_super); -} - static struct file_system_type ovl_fs_type = { - .owner = THIS_MODULE, - .name = "overlay", - .fs_flags = FS_USERNS_MOUNT, - .mount = ovl_mount, - .kill_sb = kill_anon_super, + .owner = THIS_MODULE, + .name = "overlay", + .init_fs_context = ovl_init_fs_context, + .parameters = ovl_parameter_spec, + .fs_flags = FS_USERNS_MOUNT, + .kill_sb = kill_anon_super, }; MODULE_ALIAS_FS("overlay"); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 923d66d131c1..7ef9e13c404a 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -83,33 +83,84 @@ bool ovl_verify_lower(struct super_block *sb) return ofs->config.nfs_export && ofs->config.index; } +struct ovl_path *ovl_stack_alloc(unsigned int n) +{ + return kcalloc(n, sizeof(struct ovl_path), GFP_KERNEL); +} + +void ovl_stack_cpy(struct ovl_path *dst, struct ovl_path *src, unsigned int n) +{ + unsigned int i; + + memcpy(dst, src, sizeof(struct ovl_path) * n); + for (i = 0; i < n; i++) + dget(src[i].dentry); +} + +void ovl_stack_put(struct ovl_path *stack, unsigned int n) +{ + unsigned int i; + + for (i = 0; stack && i < n; i++) + dput(stack[i].dentry); +} + +void ovl_stack_free(struct ovl_path *stack, unsigned int n) +{ + ovl_stack_put(stack, n); + kfree(stack); +} + struct ovl_entry *ovl_alloc_entry(unsigned int numlower) { - size_t size = offsetof(struct ovl_entry, lowerstack[numlower]); + size_t size = offsetof(struct ovl_entry, __lowerstack[numlower]); struct ovl_entry *oe = kzalloc(size, GFP_KERNEL); if (oe) - oe->numlower = numlower; + oe->__numlower = numlower; return oe; } +void ovl_free_entry(struct ovl_entry *oe) +{ + ovl_stack_put(ovl_lowerstack(oe), ovl_numlower(oe)); + kfree(oe); +} + +#define OVL_D_REVALIDATE (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE) + bool ovl_dentry_remote(struct dentry *dentry) { - return dentry->d_flags & - (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE); + return dentry->d_flags & OVL_D_REVALIDATE; } -void ovl_dentry_update_reval(struct dentry *dentry, struct dentry *upperdentry, - unsigned int mask) +void ovl_dentry_update_reval(struct dentry *dentry, struct dentry *realdentry) { - struct ovl_entry *oe = OVL_E(dentry); + if (!ovl_dentry_remote(realdentry)) + return; + + spin_lock(&dentry->d_lock); + dentry->d_flags |= realdentry->d_flags & OVL_D_REVALIDATE; + spin_unlock(&dentry->d_lock); +} + +void ovl_dentry_init_reval(struct dentry *dentry, struct dentry *upperdentry, + struct ovl_entry *oe) +{ + return ovl_dentry_init_flags(dentry, upperdentry, oe, OVL_D_REVALIDATE); +} + +void ovl_dentry_init_flags(struct dentry *dentry, struct dentry *upperdentry, + struct ovl_entry *oe, unsigned int mask) +{ + struct ovl_path *lowerstack = ovl_lowerstack(oe); unsigned int i, flags = 0; if (upperdentry) flags |= upperdentry->d_flags; - for (i = 0; i < oe->numlower; i++) - flags |= oe->lowerstack[i].dentry->d_flags; + for (i = 0; i < ovl_numlower(oe) && lowerstack[i].dentry; i++) + flags |= lowerstack[i].dentry->d_flags; spin_lock(&dentry->d_lock); dentry->d_flags &= ~mask; @@ -127,7 +178,7 @@ bool ovl_dentry_weird(struct dentry *dentry) enum ovl_path_type ovl_path_type(struct dentry *dentry) { - struct ovl_entry *oe = dentry->d_fsdata; + struct ovl_entry *oe = OVL_E(dentry); enum ovl_path_type type = 0; if (ovl_dentry_upper(dentry)) { @@ -136,7 +187,7 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry) /* * Non-dir dentry can hold lower dentry of its copy up origin. */ - if (oe->numlower) { + if (ovl_numlower(oe)) { if (ovl_test_flag(OVL_CONST_INO, d_inode(dentry))) type |= __OVL_PATH_ORIGIN; if (d_is_dir(dentry) || @@ -144,7 +195,7 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry) type |= __OVL_PATH_MERGE; } } else { - if (oe->numlower > 1) + if (ovl_numlower(oe) > 1) type |= __OVL_PATH_MERGE; } return type; @@ -160,11 +211,12 @@ void ovl_path_upper(struct dentry *dentry, struct path *path) void ovl_path_lower(struct dentry *dentry, struct path *path) { - struct ovl_entry *oe = dentry->d_fsdata; + struct ovl_entry *oe = OVL_E(dentry); + struct ovl_path *lowerpath = ovl_lowerstack(oe); - if (oe->numlower) { - path->mnt = oe->lowerstack[0].layer->mnt; - path->dentry = oe->lowerstack[0].dentry; + if (ovl_numlower(oe)) { + path->mnt = lowerpath->layer->mnt; + path->dentry = lowerpath->dentry; } else { *path = (struct path) { }; } @@ -172,11 +224,19 @@ void ovl_path_lower(struct dentry *dentry, struct path *path) void ovl_path_lowerdata(struct dentry *dentry, struct path *path) { - struct ovl_entry *oe = dentry->d_fsdata; + struct ovl_entry *oe = OVL_E(dentry); + struct ovl_path *lowerdata = ovl_lowerdata(oe); + struct dentry *lowerdata_dentry = ovl_lowerdata_dentry(oe); - if (oe->numlower) { - path->mnt = oe->lowerstack[oe->numlower - 1].layer->mnt; - path->dentry = oe->lowerstack[oe->numlower - 1].dentry; + if (lowerdata_dentry) { + path->dentry = lowerdata_dentry; + /* + * Pairs with smp_wmb() in ovl_dentry_set_lowerdata(). + * Make sure that if lowerdata->dentry is visible, then + * datapath->layer is visible as well. + */ + smp_rmb(); + path->mnt = READ_ONCE(lowerdata->layer)->mnt; } else { *path = (struct path) { }; } @@ -215,16 +275,16 @@ struct dentry *ovl_dentry_upper(struct dentry *dentry) struct dentry *ovl_dentry_lower(struct dentry *dentry) { - struct ovl_entry *oe = dentry->d_fsdata; + struct ovl_entry *oe = OVL_E(dentry); - return oe->numlower ? oe->lowerstack[0].dentry : NULL; + return ovl_numlower(oe) ? ovl_lowerstack(oe)->dentry : NULL; } const struct ovl_layer *ovl_layer_lower(struct dentry *dentry) { - struct ovl_entry *oe = dentry->d_fsdata; + struct ovl_entry *oe = OVL_E(dentry); - return oe->numlower ? oe->lowerstack[0].layer : NULL; + return ovl_numlower(oe) ? ovl_lowerstack(oe)->layer : NULL; } /* @@ -235,9 +295,30 @@ const struct ovl_layer *ovl_layer_lower(struct dentry *dentry) */ struct dentry *ovl_dentry_lowerdata(struct dentry *dentry) { - struct ovl_entry *oe = dentry->d_fsdata; + return ovl_lowerdata_dentry(OVL_E(dentry)); +} + +int ovl_dentry_set_lowerdata(struct dentry *dentry, struct ovl_path *datapath) +{ + struct ovl_entry *oe = OVL_E(dentry); + struct ovl_path *lowerdata = ovl_lowerdata(oe); + struct dentry *datadentry = datapath->dentry; + + if (WARN_ON_ONCE(ovl_numlower(oe) <= 1)) + return -EIO; + + WRITE_ONCE(lowerdata->layer, datapath->layer); + /* + * Pairs with smp_rmb() in ovl_path_lowerdata(). + * Make sure that if lowerdata->dentry is visible, then + * lowerdata->layer is visible as well. + */ + smp_wmb(); + WRITE_ONCE(lowerdata->dentry, dget(datadentry)); - return oe->numlower ? oe->lowerstack[oe->numlower - 1].dentry : NULL; + ovl_dentry_update_reval(dentry, datadentry); + + return 0; } struct dentry *ovl_dentry_real(struct dentry *dentry) @@ -250,15 +331,19 @@ struct dentry *ovl_i_dentry_upper(struct inode *inode) return ovl_upperdentry_dereference(OVL_I(inode)); } -void ovl_i_path_real(struct inode *inode, struct path *path) +struct inode *ovl_i_path_real(struct inode *inode, struct path *path) { + struct ovl_path *lowerpath = ovl_lowerpath(OVL_I_E(inode)); + path->dentry = ovl_i_dentry_upper(inode); if (!path->dentry) { - path->dentry = OVL_I(inode)->lowerpath.dentry; - path->mnt = OVL_I(inode)->lowerpath.layer->mnt; + path->dentry = lowerpath->dentry; + path->mnt = lowerpath->layer->mnt; } else { path->mnt = ovl_upper_mnt(OVL_FS(inode->i_sb)); } + + return path->dentry ? d_inode_rcu(path->dentry) : NULL; } struct inode *ovl_inode_upper(struct inode *inode) @@ -270,9 +355,9 @@ struct inode *ovl_inode_upper(struct inode *inode) struct inode *ovl_inode_lower(struct inode *inode) { - struct dentry *lowerdentry = OVL_I(inode)->lowerpath.dentry; + struct ovl_path *lowerpath = ovl_lowerpath(OVL_I_E(inode)); - return lowerdentry ? d_inode(lowerdentry) : NULL; + return lowerpath ? d_inode(lowerpath->dentry) : NULL; } struct inode *ovl_inode_real(struct inode *inode) @@ -283,10 +368,12 @@ struct inode *ovl_inode_real(struct inode *inode) /* Return inode which contains lower data. Do not return metacopy */ struct inode *ovl_inode_lowerdata(struct inode *inode) { + struct dentry *lowerdata = ovl_lowerdata_dentry(OVL_I_E(inode)); + if (WARN_ON(!S_ISREG(inode->i_mode))) return NULL; - return OVL_I(inode)->lowerdata ?: ovl_inode_lower(inode); + return lowerdata ? d_inode(lowerdata) : NULL; } /* Return real inode which contains data. Does not return metacopy inode */ @@ -301,9 +388,15 @@ struct inode *ovl_inode_realdata(struct inode *inode) return ovl_inode_lowerdata(inode); } +const char *ovl_lowerdata_redirect(struct inode *inode) +{ + return inode && S_ISREG(inode->i_mode) ? + OVL_I(inode)->lowerdata_redirect : NULL; +} + struct ovl_dir_cache *ovl_dir_cache(struct inode *inode) { - return OVL_I(inode)->cache; + return inode && S_ISDIR(inode->i_mode) ? OVL_I(inode)->cache : NULL; } void ovl_set_dir_cache(struct inode *inode, struct ovl_dir_cache *cache) @@ -313,17 +406,17 @@ void ovl_set_dir_cache(struct inode *inode, struct ovl_dir_cache *cache) void ovl_dentry_set_flag(unsigned long flag, struct dentry *dentry) { - set_bit(flag, &OVL_E(dentry)->flags); + set_bit(flag, OVL_E_FLAGS(dentry)); } void ovl_dentry_clear_flag(unsigned long flag, struct dentry *dentry) { - clear_bit(flag, &OVL_E(dentry)->flags); + clear_bit(flag, OVL_E_FLAGS(dentry)); } bool ovl_dentry_test_flag(unsigned long flag, struct dentry *dentry) { - return test_bit(flag, &OVL_E(dentry)->flags); + return test_bit(flag, OVL_E_FLAGS(dentry)); } bool ovl_dentry_is_opaque(struct dentry *dentry) @@ -413,13 +506,6 @@ bool ovl_dentry_needs_data_copy_up(struct dentry *dentry, int flags) return !ovl_has_upperdata(d_inode(dentry)); } -bool ovl_redirect_dir(struct super_block *sb) -{ - struct ovl_fs *ofs = sb->s_fs_info; - - return ofs->config.redirect_dir && !ofs->noxattr; -} - const char *ovl_dentry_get_redirect(struct dentry *dentry) { return OVL_I(d_inode(dentry))->redirect; @@ -999,7 +1085,7 @@ out: bool ovl_is_metacopy_dentry(struct dentry *dentry) { - struct ovl_entry *oe = dentry->d_fsdata; + struct ovl_entry *oe = OVL_E(dentry); if (!d_is_reg(dentry)) return false; @@ -1010,7 +1096,7 @@ bool ovl_is_metacopy_dentry(struct dentry *dentry) return false; } - return (oe->numlower > 1); + return (ovl_numlower(oe) > 1); } char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding) @@ -1105,8 +1191,7 @@ void ovl_copyattr(struct inode *inode) vfsuid_t vfsuid; vfsgid_t vfsgid; - ovl_i_path_real(inode, &realpath); - realinode = d_inode(realpath.dentry); + realinode = ovl_i_path_real(inode, &realpath); real_idmap = mnt_idmap(realpath.mnt); vfsuid = i_uid_into_vfsuid(real_idmap, realinode); diff --git a/fs/pnode.c b/fs/pnode.c index 3cede8b18c8b..e4d0340393d5 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -216,7 +216,7 @@ static struct mount *next_group(struct mount *m, struct mount *origin) static struct mount *last_dest, *first_source, *last_source, *dest_master; static struct hlist_head *list; -static inline bool peers(struct mount *m1, struct mount *m2) +static inline bool peers(const struct mount *m1, const struct mount *m2) { return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id; } @@ -354,6 +354,46 @@ static inline int do_refcount_check(struct mount *mnt, int count) return mnt_get_count(mnt) > count; } +/** + * propagation_would_overmount - check whether propagation from @from + * would overmount @to + * @from: shared mount + * @to: mount to check + * @mp: future mountpoint of @to on @from + * + * If @from propagates mounts to @to, @from and @to must either be peers + * or one of the masters in the hierarchy of masters of @to must be a + * peer of @from. + * + * If the root of the @to mount is equal to the future mountpoint @mp of + * the @to mount on @from then @to will be overmounted by whatever is + * propagated to it. + * + * Context: This function expects namespace_lock() to be held and that + * @mp is stable. + * Return: If @from overmounts @to, true is returned, false if not. + */ +bool propagation_would_overmount(const struct mount *from, + const struct mount *to, + const struct mountpoint *mp) +{ + if (!IS_MNT_SHARED(from)) + return false; + + if (IS_MNT_NEW(to)) + return false; + + if (to->mnt.mnt_root != mp->m_dentry) + return false; + + for (const struct mount *m = to; m; m = m->mnt_master) { + if (peers(from, m)) + return true; + } + + return false; +} + /* * check if the mount 'mnt' can be unmounted successfully. * @mnt: the mount to be checked for unmount diff --git a/fs/pnode.h b/fs/pnode.h index 988f1aa9b02a..0b02a6393891 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -53,4 +53,7 @@ struct mount *copy_tree(struct mount *, struct dentry *, int); bool is_path_reachable(struct mount *, struct dentry *, const struct path *root); int count_mounts(struct mnt_namespace *ns, struct mount *mnt); +bool propagation_would_overmount(const struct mount *from, + const struct mount *to, + const struct mountpoint *mp); #endif /* _LINUX_PNODE_H */ diff --git a/fs/proc/inode.c b/fs/proc/inode.c index f495fdb39151..67b09a1d9433 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -591,7 +591,7 @@ static const struct file_operations proc_iter_file_ops = { .llseek = proc_reg_llseek, .read_iter = proc_reg_read_iter, .write = proc_reg_write, - .splice_read = generic_file_splice_read, + .splice_read = copy_splice_read, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, .mmap = proc_reg_mmap, @@ -617,7 +617,7 @@ static const struct file_operations proc_reg_file_ops_compat = { static const struct file_operations proc_iter_file_ops_compat = { .llseek = proc_reg_llseek, .read_iter = proc_reg_read_iter, - .splice_read = generic_file_splice_read, + .splice_read = copy_splice_read, .write = proc_reg_write, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 25b44b303b35..9cb32e1a78a0 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -199,7 +199,7 @@ kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg) ent->addr = (unsigned long)page_to_virt(p); ent->size = nr_pages << PAGE_SHIFT; - if (!virt_addr_valid(ent->addr)) + if (!virt_addr_valid((void *)ent->addr)) goto free_out; /* cut not-mapped area. ....from ppc-32 code. */ @@ -419,7 +419,7 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) char *notes; size_t i = 0; - strlcpy(prpsinfo.pr_psargs, saved_command_line, + strscpy(prpsinfo.pr_psargs, saved_command_line, sizeof(prpsinfo.pr_psargs)); notes = kzalloc(notes_len, GFP_KERNEL); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index b43d0bd42762..8dca4d6d96c7 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -168,6 +168,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v) global_zone_page_state(NR_FREE_CMA_PAGES)); #endif +#ifdef CONFIG_UNACCEPTED_MEMORY + show_val_kb(m, "Unaccepted: ", + global_zone_page_state(NR_UNACCEPTED)); +#endif + hugetlb_report_meminfo(m); arch_report_meminfo(m); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 8038833ff5b0..5ea42653126e 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -29,9 +29,8 @@ static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; /* Support for permanently empty directories */ - -struct ctl_table sysctl_mount_point[] = { - { } +static struct ctl_table sysctl_mount_point[] = { + {.type = SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY } }; /** @@ -48,21 +47,14 @@ struct ctl_table_header *register_sysctl_mount_point(const char *path) } EXPORT_SYMBOL(register_sysctl_mount_point); -static bool is_empty_dir(struct ctl_table_header *head) -{ - return head->ctl_table[0].child == sysctl_mount_point; -} - -static void set_empty_dir(struct ctl_dir *dir) -{ - dir->header.ctl_table[0].child = sysctl_mount_point; -} - -static void clear_empty_dir(struct ctl_dir *dir) - -{ - dir->header.ctl_table[0].child = NULL; -} +#define sysctl_is_perm_empty_ctl_table(tptr) \ + (tptr[0].type == SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY) +#define sysctl_is_perm_empty_ctl_header(hptr) \ + (sysctl_is_perm_empty_ctl_table(hptr->ctl_table)) +#define sysctl_set_perm_empty_ctl_header(hptr) \ + (hptr->ctl_table[0].type = SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY) +#define sysctl_clear_perm_empty_ctl_header(hptr) \ + (hptr->ctl_table[0].type = SYSCTL_TABLE_TYPE_DEFAULT) void proc_sys_poll_notify(struct ctl_table_poll *poll) { @@ -230,20 +222,22 @@ static void erase_header(struct ctl_table_header *head) static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header) { struct ctl_table *entry; + struct ctl_table_header *dir_h = &dir->header; int err; + /* Is this a permanently empty directory? */ - if (is_empty_dir(&dir->header)) + if (sysctl_is_perm_empty_ctl_header(dir_h)) return -EROFS; /* Am I creating a permanently empty directory? */ - if (header->ctl_table == sysctl_mount_point) { + if (sysctl_is_perm_empty_ctl_table(header->ctl_table)) { if (!RB_EMPTY_ROOT(&dir->root)) return -EINVAL; - set_empty_dir(dir); + sysctl_set_perm_empty_ctl_header(dir_h); } - dir->header.nreg++; + dir_h->nreg++; header->parent = dir; err = insert_links(header); if (err) @@ -259,9 +253,9 @@ fail: put_links(header); fail_links: if (header->ctl_table == sysctl_mount_point) - clear_empty_dir(dir); + sysctl_clear_perm_empty_ctl_header(dir_h); header->parent = NULL; - drop_sysctl_table(&dir->header); + drop_sysctl_table(dir_h); return err; } @@ -479,7 +473,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, inode->i_mode |= S_IFDIR; inode->i_op = &proc_sys_dir_operations; inode->i_fop = &proc_sys_dir_file_operations; - if (is_empty_dir(head)) + if (sysctl_is_perm_empty_ctl_header(head)) make_empty_dir_inode(inode); } @@ -868,7 +862,7 @@ static const struct file_operations proc_sys_file_operations = { .poll = proc_sys_poll, .read_iter = proc_sys_read, .write_iter = proc_sys_write, - .splice_read = generic_file_splice_read, + .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, .llseek = default_llseek, }; @@ -1136,9 +1130,6 @@ static int sysctl_check_table(const char *path, struct ctl_table *table) struct ctl_table *entry; int err = 0; list_for_each_table_entry(entry, table) { - if (entry->child) - err |= sysctl_err(path, entry, "Not a file"); - if ((entry->proc_handler == proc_dostring) || (entry->proc_handler == proc_dobool) || (entry->proc_handler == proc_dointvec) || @@ -1406,7 +1397,6 @@ fail_put_dir_locked: spin_unlock(&sysctl_lock); fail: kfree(header); - dump_stack(); return NULL; } @@ -1466,185 +1456,6 @@ void __init __register_sysctl_init(const char *path, struct ctl_table *table, kmemleak_not_leak(hdr); } -static char *append_path(const char *path, char *pos, const char *name) -{ - int namelen; - namelen = strlen(name); - if (((pos - path) + namelen + 2) >= PATH_MAX) - return NULL; - memcpy(pos, name, namelen); - pos[namelen] = '/'; - pos[namelen + 1] = '\0'; - pos += namelen + 1; - return pos; -} - -static int count_subheaders(struct ctl_table *table) -{ - int has_files = 0; - int nr_subheaders = 0; - struct ctl_table *entry; - - /* special case: no directory and empty directory */ - if (!table || !table->procname) - return 1; - - list_for_each_table_entry(entry, table) { - if (entry->child) - nr_subheaders += count_subheaders(entry->child); - else - has_files = 1; - } - return nr_subheaders + has_files; -} - -static int register_leaf_sysctl_tables(const char *path, char *pos, - struct ctl_table_header ***subheader, struct ctl_table_set *set, - struct ctl_table *table) -{ - struct ctl_table *ctl_table_arg = NULL; - struct ctl_table *entry, *files; - int nr_files = 0; - int nr_dirs = 0; - int err = -ENOMEM; - - list_for_each_table_entry(entry, table) { - if (entry->child) - nr_dirs++; - else - nr_files++; - } - - files = table; - /* If there are mixed files and directories we need a new table */ - if (nr_dirs && nr_files) { - struct ctl_table *new; - files = kcalloc(nr_files + 1, sizeof(struct ctl_table), - GFP_KERNEL); - if (!files) - goto out; - - ctl_table_arg = files; - new = files; - - list_for_each_table_entry(entry, table) { - if (entry->child) - continue; - *new = *entry; - new++; - } - } - - /* Register everything except a directory full of subdirectories */ - if (nr_files || !nr_dirs) { - struct ctl_table_header *header; - header = __register_sysctl_table(set, path, files); - if (!header) { - kfree(ctl_table_arg); - goto out; - } - - /* Remember if we need to free the file table */ - header->ctl_table_arg = ctl_table_arg; - **subheader = header; - (*subheader)++; - } - - /* Recurse into the subdirectories. */ - list_for_each_table_entry(entry, table) { - char *child_pos; - - if (!entry->child) - continue; - - err = -ENAMETOOLONG; - child_pos = append_path(path, pos, entry->procname); - if (!child_pos) - goto out; - - err = register_leaf_sysctl_tables(path, child_pos, subheader, - set, entry->child); - pos[0] = '\0'; - if (err) - goto out; - } - err = 0; -out: - /* On failure our caller will unregister all registered subheaders */ - return err; -} - -/** - * register_sysctl_table - register a sysctl table hierarchy - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * We are slowly deprecating this call so avoid its use. - */ -struct ctl_table_header *register_sysctl_table(struct ctl_table *table) -{ - struct ctl_table *ctl_table_arg = table; - int nr_subheaders = count_subheaders(table); - struct ctl_table_header *header = NULL, **subheaders, **subheader; - char *new_path, *pos; - - pos = new_path = kmalloc(PATH_MAX, GFP_KERNEL); - if (!new_path) - return NULL; - - pos[0] = '\0'; - while (table->procname && table->child && !table[1].procname) { - pos = append_path(new_path, pos, table->procname); - if (!pos) - goto out; - table = table->child; - } - if (nr_subheaders == 1) { - header = __register_sysctl_table(&sysctl_table_root.default_set, new_path, table); - if (header) - header->ctl_table_arg = ctl_table_arg; - } else { - header = kzalloc(sizeof(*header) + - sizeof(*subheaders)*nr_subheaders, GFP_KERNEL); - if (!header) - goto out; - - subheaders = (struct ctl_table_header **) (header + 1); - subheader = subheaders; - header->ctl_table_arg = ctl_table_arg; - - if (register_leaf_sysctl_tables(new_path, pos, &subheader, - &sysctl_table_root.default_set, table)) - goto err_register_leaves; - } - -out: - kfree(new_path); - return header; - -err_register_leaves: - while (subheader > subheaders) { - struct ctl_table_header *subh = *(--subheader); - struct ctl_table *table = subh->ctl_table_arg; - unregister_sysctl_table(subh); - kfree(table); - } - kfree(header); - header = NULL; - goto out; -} -EXPORT_SYMBOL(register_sysctl_table); - -int __register_sysctl_base(struct ctl_table *base_table) -{ - struct ctl_table_header *hdr; - - hdr = register_sysctl_table(base_table); - kmemleak_not_leak(hdr); - return 0; -} - static void put_links(struct ctl_table_header *header) { struct ctl_table_set *root_set = &sysctl_table_root.default_set; @@ -1700,35 +1511,18 @@ static void drop_sysctl_table(struct ctl_table_header *header) /** * unregister_sysctl_table - unregister a sysctl table hierarchy - * @header: the header returned from register_sysctl_table + * @header: the header returned from register_sysctl or __register_sysctl_table * * Unregisters the sysctl table and all children. proc entries may not * actually be removed until they are no longer used by anyone. */ void unregister_sysctl_table(struct ctl_table_header * header) { - int nr_subheaders; might_sleep(); if (header == NULL) return; - nr_subheaders = count_subheaders(header->ctl_table_arg); - if (unlikely(nr_subheaders > 1)) { - struct ctl_table_header **subheaders; - int i; - - subheaders = (struct ctl_table_header **)(header + 1); - for (i = nr_subheaders -1; i >= 0; i--) { - struct ctl_table_header *subh = subheaders[i]; - struct ctl_table *table = subh->ctl_table_arg; - unregister_sysctl_table(subh); - kfree(table); - } - kfree(header); - return; - } - spin_lock(&sysctl_lock); drop_sysctl_table(header); spin_unlock(&sysctl_lock); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 420510f6a545..507cd4e59d07 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -538,13 +538,14 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; bool migration = false, young = false, dirty = false; + pte_t ptent = ptep_get(pte); - if (pte_present(*pte)) { - page = vm_normal_page(vma, addr, *pte); - young = pte_young(*pte); - dirty = pte_dirty(*pte); - } else if (is_swap_pte(*pte)) { - swp_entry_t swpent = pte_to_swp_entry(*pte); + if (pte_present(ptent)) { + page = vm_normal_page(vma, addr, ptent); + young = pte_young(ptent); + dirty = pte_dirty(ptent); + } else if (is_swap_pte(ptent)) { + swp_entry_t swpent = pte_to_swp_entry(ptent); if (!non_swap_entry(swpent)) { int mapcount; @@ -631,14 +632,11 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, goto out; } - if (pmd_trans_unstable(pmd)) - goto out; - /* - * The mmap_lock held all the way back in m_start() is what - * keeps khugepaged out of here and from collapsing things - * in here. - */ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + if (!pte) { + walk->action = ACTION_AGAIN; + return 0; + } for (; addr != end; pte++, addr += PAGE_SIZE) smaps_pte_entry(pte, addr, walk); pte_unmap_unlock(pte - 1, ptl); @@ -735,11 +733,12 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; struct page *page = NULL; + pte_t ptent = ptep_get(pte); - if (pte_present(*pte)) { - page = vm_normal_page(vma, addr, *pte); - } else if (is_swap_pte(*pte)) { - swp_entry_t swpent = pte_to_swp_entry(*pte); + if (pte_present(ptent)) { + page = vm_normal_page(vma, addr, ptent); + } else if (is_swap_pte(ptent)) { + swp_entry_t swpent = pte_to_swp_entry(ptent); if (is_pfn_swap_entry(swpent)) page = pfn_swap_entry_to_page(swpent); @@ -1108,7 +1107,7 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, * Documentation/admin-guide/mm/soft-dirty.rst for full description * of how soft-dirty works. */ - pte_t ptent = *pte; + pte_t ptent = ptep_get(pte); if (pte_present(ptent)) { pte_t old_pte; @@ -1191,12 +1190,13 @@ out: return 0; } - if (pmd_trans_unstable(pmd)) - return 0; - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + if (!pte) { + walk->action = ACTION_AGAIN; + return 0; + } for (; addr != end; pte++, addr += PAGE_SIZE) { - ptent = *pte; + ptent = ptep_get(pte); if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty(vma, addr, pte); @@ -1538,9 +1538,6 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, spin_unlock(ptl); return err; } - - if (pmd_trans_unstable(pmdp)) - return 0; #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* @@ -1548,10 +1545,14 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, * goes beyond vma->vm_end. */ orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl); + if (!pte) { + walk->action = ACTION_AGAIN; + return err; + } for (; addr < end; pte++, addr += PAGE_SIZE) { pagemap_entry_t pme; - pme = pte_to_pagemap_entry(pm, vma, addr, *pte); + pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte)); err = add_to_pagemap(addr, &pme, pm); if (err) break; @@ -1689,23 +1690,23 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, /* watch out for wraparound */ start_vaddr = end_vaddr; if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) { + unsigned long end; + ret = mmap_read_lock_killable(mm); if (ret) goto out_free; start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT); mmap_read_unlock(mm); + + end = start_vaddr + ((count / PM_ENTRY_BYTES) << PAGE_SHIFT); + if (end >= start_vaddr && end < mm->task_size) + end_vaddr = end; } /* Ensure the address is inside the task */ if (start_vaddr > mm->task_size) start_vaddr = end_vaddr; - /* - * The odds are that this will stop walking way - * before end_vaddr, because the length of the - * user buffer is tracked in "pm", and the walk - * will stop when we hit the end of the buffer. - */ ret = 0; while (count && (start_vaddr < end_vaddr)) { int len; @@ -1887,16 +1888,18 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, spin_unlock(ptl); return 0; } - - if (pmd_trans_unstable(pmd)) - return 0; #endif orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (!pte) { + walk->action = ACTION_AGAIN; + return 0; + } do { - struct page *page = can_gather_numa_stats(*pte, vma, addr); + pte_t ptent = ptep_get(pte); + struct page *page = can_gather_numa_stats(ptent, vma, addr); if (!page) continue; - gather_stats(page, md, pte_dirty(*pte), 1); + gather_stats(page, md, pte_dirty(ptent), 1); } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(orig_pte, ptl); diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 0ec35072a8e5..2c8b62265981 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -51,7 +51,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) sbytes += kobjsize(mm); else bytes += kobjsize(mm); - + if (current->fs && current->fs->users > 1) sbytes += kobjsize(current->fs); else @@ -69,13 +69,13 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) bytes += kobjsize(current); /* includes kernel stack */ + mmap_read_unlock(mm); + seq_printf(m, "Mem:\t%8lu bytes\n" "Slack:\t%8lu bytes\n" "Shared:\t%8lu bytes\n", bytes, slack, sbytes); - - mmap_read_unlock(mm); } unsigned long task_vsize(struct mm_struct *mm) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 03f5963914a1..cb80a7703d58 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -877,7 +877,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, phdr.p_offset = roundup(note_off, PAGE_SIZE); phdr.p_vaddr = phdr.p_paddr = 0; phdr.p_filesz = phdr.p_memsz = phdr_sz; - phdr.p_align = 0; + phdr.p_align = 4; /* Add merged PT_NOTE program header*/ tmp = elfptr + sizeof(Elf64_Ehdr); @@ -1068,7 +1068,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, phdr.p_offset = roundup(note_off, PAGE_SIZE); phdr.p_vaddr = phdr.p_paddr = 0; phdr.p_filesz = phdr.p_memsz = phdr_sz; - phdr.p_align = 0; + phdr.p_align = 4; /* Add merged PT_NOTE program header*/ tmp = elfptr + sizeof(Elf32_Ehdr); diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 846f9455ae22..250eb5bf7b52 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -324,7 +324,7 @@ static int mountstats_open(struct inode *inode, struct file *file) const struct file_operations proc_mounts_operations = { .open = mounts_open, .read_iter = seq_read_iter, - .splice_read = generic_file_splice_read, + .splice_read = copy_splice_read, .llseek = seq_lseek, .release = mounts_release, .poll = mounts_poll, @@ -333,7 +333,7 @@ const struct file_operations proc_mounts_operations = { const struct file_operations proc_mountinfo_operations = { .open = mountinfo_open, .read_iter = seq_read_iter, - .splice_read = generic_file_splice_read, + .splice_read = copy_splice_read, .llseek = seq_lseek, .release = mounts_release, .poll = mounts_poll, @@ -342,7 +342,7 @@ const struct file_operations proc_mountinfo_operations = { const struct file_operations proc_mountstats_operations = { .open = mountstats_open, .read_iter = seq_read_iter, - .splice_read = generic_file_splice_read, + .splice_read = copy_splice_read, .llseek = seq_lseek, .release = mounts_release, }; diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index 4ae0cfcd15f2..de8cf5d75f34 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -263,9 +263,9 @@ static __init const char *early_boot_devpath(const char *initial_devname) * same scheme to find the device that we use for mounting * the root file system. */ - dev_t dev = name_to_dev_t(initial_devname); + dev_t dev; - if (!dev) { + if (early_lookup_bdev(initial_devname, &dev)) { pr_err("failed to resolve '%s'!\n", initial_devname); return initial_devname; } diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index ade66dbe5f39..2f625e1fa8d8 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -875,7 +875,7 @@ fail_out: return err; } -static int ramoops_remove(struct platform_device *pdev) +static void ramoops_remove(struct platform_device *pdev) { struct ramoops_context *cxt = &oops_cxt; @@ -885,8 +885,6 @@ static int ramoops_remove(struct platform_device *pdev) cxt->pstore.bufsize = 0; ramoops_free_przs(cxt); - - return 0; } static const struct of_device_id dt_match[] = { @@ -896,7 +894,7 @@ static const struct of_device_id dt_match[] = { static struct platform_driver ramoops_driver = { .probe = ramoops_probe, - .remove = ramoops_remove, + .remove_new = ramoops_remove, .driver = { .name = "ramoops", .of_match_table = dt_match, diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 966191d3a5ba..85aaf0fc6d7d 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -599,6 +599,8 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, raw_spin_lock_init(&prz->buffer_lock); prz->flags = flags; prz->label = kstrdup(label, GFP_KERNEL); + if (!prz->label) + goto err; ret = persistent_ram_buffer_map(start, size, prz, memtype); if (ret) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index ffd40dc3e4e9..e3e4f4047657 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -555,7 +555,7 @@ restart: continue; /* Wait for dquot users */ if (atomic_read(&dquot->dq_count)) { - dqgrab(dquot); + atomic_inc(&dquot->dq_count); spin_unlock(&dq_list_lock); /* * Once dqput() wakes us up, we know it's time to free @@ -2420,7 +2420,8 @@ int dquot_load_quota_sb(struct super_block *sb, int type, int format_id, error = add_dquot_ref(sb, type); if (error) - dquot_disable(sb, type, flags); + dquot_disable(sb, type, + DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); return error; out_fmt: diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 052f143e2e0e..0e41fb84060f 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -895,8 +895,9 @@ retry: up_write(&sb->s_umount); else up_read(&sb->s_umount); - wait_event(sb->s_writers.wait_unfrozen, - sb->s_writers.frozen == SB_UNFROZEN); + /* Wait for sb to unfreeze */ + sb_start_write(sb); + sb_end_write(sb); put_super(sb); goto retry; } diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index 12af0490322f..c7a1aa3c882b 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -43,7 +43,7 @@ const struct file_operations ramfs_file_operations = { .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .fsync = noop_fsync, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .llseek = generic_file_llseek, .get_unmapped_area = ramfs_mmu_get_unmapped_area, diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 9fbb9b5256f7..efb1b4c1a0a4 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -43,7 +43,7 @@ const struct file_operations ramfs_file_operations = { .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .fsync = noop_fsync, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .llseek = generic_file_llseek, }; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 5ba580c78835..fef477c78107 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -278,7 +278,7 @@ int ramfs_init_fs_context(struct fs_context *fc) return 0; } -static void ramfs_kill_sb(struct super_block *sb) +void ramfs_kill_sb(struct super_block *sb) { kfree(sb->s_fs_info); kill_litter_super(sb); diff --git a/fs/read_write.c b/fs/read_write.c index a21ba3be7dbe..b07de77ef126 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -29,7 +29,7 @@ const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .mmap = generic_file_readonly_mmap, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, }; EXPORT_SYMBOL(generic_ro_fops); diff --git a/fs/readdir.c b/fs/readdir.c index 9c53edb60c03..b264ce60114d 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -131,7 +131,7 @@ struct old_linux_dirent { unsigned long d_ino; unsigned long d_offset; unsigned short d_namlen; - char d_name[1]; + char d_name[]; }; struct readdir_callback { @@ -208,7 +208,7 @@ struct linux_dirent { unsigned long d_ino; unsigned long d_off; unsigned short d_reclen; - char d_name[1]; + char d_name[]; }; struct getdents_callback { @@ -388,7 +388,7 @@ struct compat_old_linux_dirent { compat_ulong_t d_ino; compat_ulong_t d_offset; unsigned short d_namlen; - char d_name[1]; + char d_name[]; }; struct compat_readdir_callback { @@ -460,7 +460,7 @@ struct compat_linux_dirent { compat_ulong_t d_ino; compat_ulong_t d_off; unsigned short d_reclen; - char d_name[1]; + char d_name[]; }; struct compat_getdents_callback { diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index b54cc7048f02..8eb3ad3e8ae9 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -247,7 +247,7 @@ const struct file_operations reiserfs_file_operations = { .fsync = reiserfs_sync_file, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .llseek = generic_file_llseek, }; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index d8debbb6105f..77bd3b27059f 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2506,7 +2506,7 @@ out: /* * mason@suse.com: updated in 2.5.54 to follow the same general io - * start/recovery path as __block_write_full_page, along with special + * start/recovery path as __block_write_full_folio, along with special * code to handle reiserfs tails. */ static int reiserfs_write_full_page(struct page *page, @@ -2872,6 +2872,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { + struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; int ret = 0; int update_sd = 0; @@ -2887,12 +2888,12 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, start = pos & (PAGE_SIZE - 1); if (unlikely(copied < len)) { - if (!PageUptodate(page)) + if (!folio_test_uptodate(folio)) copied = 0; - page_zero_new_buffers(page, start + copied, start + len); + folio_zero_new_buffers(folio, start + copied, start + len); } - flush_dcache_page(page); + flush_dcache_folio(folio); reiserfs_commit_page(inode, page, start, start + copied); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 4d11d60f493c..479aa4a57602 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2589,7 +2589,12 @@ static void release_journal_dev(struct super_block *super, struct reiserfs_journal *journal) { if (journal->j_dev_bd != NULL) { - blkdev_put(journal->j_dev_bd, journal->j_dev_mode); + void *holder = NULL; + + if (journal->j_dev_bd->bd_dev != super->s_dev) + holder = journal; + + blkdev_put(journal->j_dev_bd, holder); journal->j_dev_bd = NULL; } } @@ -2598,9 +2603,10 @@ static int journal_init_dev(struct super_block *super, struct reiserfs_journal *journal, const char *jdev_name) { + blk_mode_t blkdev_mode = BLK_OPEN_READ; + void *holder = journal; int result; dev_t jdev; - fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL; result = 0; @@ -2608,16 +2614,15 @@ static int journal_init_dev(struct super_block *super, jdev = SB_ONDISK_JOURNAL_DEVICE(super) ? new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev; - if (bdev_read_only(super->s_bdev)) - blkdev_mode = FMODE_READ; + if (!bdev_read_only(super->s_bdev)) + blkdev_mode |= BLK_OPEN_WRITE; /* there is no "jdev" option and journal is on separate device */ if ((!jdev_name || !jdev_name[0])) { if (jdev == super->s_dev) - blkdev_mode &= ~FMODE_EXCL; - journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode, - journal); - journal->j_dev_mode = blkdev_mode; + holder = NULL; + journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode, holder, + NULL); if (IS_ERR(journal->j_dev_bd)) { result = PTR_ERR(journal->j_dev_bd); journal->j_dev_bd = NULL; @@ -2631,8 +2636,8 @@ static int journal_init_dev(struct super_block *super, return 0; } - journal->j_dev_mode = blkdev_mode; - journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, journal); + journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, holder, + NULL); if (IS_ERR(journal->j_dev_bd)) { result = PTR_ERR(journal->j_dev_bd); journal->j_dev_bd = NULL; diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 1bccf6a2e908..55e85256aae8 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -300,7 +300,6 @@ struct reiserfs_journal { struct reiserfs_journal_cnode *j_first; struct block_device *j_dev_bd; - fmode_t j_dev_mode; /* first block on s_dev of reserved area journal */ int j_1st_reserved_block; diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 6e0a099dd788..078dd8cc312f 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -67,6 +67,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode, sec->name = NULL; sec->value = NULL; + sec->length = 0; /* Don't add selinux attributes on xattrs - they'll never get used */ if (IS_PRIVATE(dir)) diff --git a/fs/remap_range.c b/fs/remap_range.c index 1331a890f2f2..87ae4f0dc3aa 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -15,6 +15,7 @@ #include <linux/mount.h> #include <linux/fs.h> #include <linux/dax.h> +#include <linux/overflow.h> #include "internal.h" #include <linux/uaccess.h> @@ -101,10 +102,12 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in, static int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write) { + loff_t tmp; + if (unlikely(pos < 0 || len < 0)) return -EINVAL; - if (unlikely((loff_t) (pos + len) < 0)) + if (unlikely(check_add_overflow(pos, len, &tmp))) return -EINVAL; return security_file_permission(file, write ? MAY_WRITE : MAY_READ); diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c index 4578dc45e50a..4520ca413867 100644 --- a/fs/romfs/mmap-nommu.c +++ b/fs/romfs/mmap-nommu.c @@ -78,7 +78,7 @@ static unsigned romfs_mmap_capabilities(struct file *file) const struct file_operations romfs_ro_fops = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, .mmap = romfs_mmap, .get_unmapped_area = romfs_get_unmapped_area, .mmap_capabilities = romfs_mmap_capabilities, diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index b279f745466e..fb4162a52844 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -122,6 +122,12 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon) seq_puts(m, " nosparse"); if (tcon->need_reconnect) seq_puts(m, "\tDISCONNECTED "); + spin_lock(&tcon->tc_lock); + if (tcon->origin_fullpath) { + seq_printf(m, "\n\tDFS origin fullpath: %s", + tcon->origin_fullpath); + } + spin_unlock(&tcon->tc_lock); seq_putc(m, '\n'); } @@ -330,6 +336,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) spin_lock(&server->srv_lock); if (server->hostname) seq_printf(m, "Hostname: %s ", server->hostname); + seq_printf(m, "\nClientGUID: %pUL", server->client_guid); spin_unlock(&server->srv_lock); #ifdef CONFIG_CIFS_SMB_DIRECT if (!server->rdma) @@ -427,13 +434,9 @@ skip_rdma: seq_printf(m, "\nIn Send: %d In MaxReq Wait: %d", atomic_read(&server->in_send), atomic_read(&server->num_waiters)); - if (IS_ENABLED(CONFIG_CIFS_DFS_UPCALL)) { - if (server->origin_fullpath) - seq_printf(m, "\nDFS origin full path: %s", - server->origin_fullpath); - if (server->leaf_fullpath) - seq_printf(m, "\nDFS leaf full path: %s", - server->leaf_fullpath); + if (server->leaf_fullpath) { + seq_printf(m, "\nDFS leaf full path: %s", + server->leaf_fullpath); } seq_printf(m, "\n\n\tSessions: "); diff --git a/fs/smb/client/cifs_dfs_ref.c b/fs/smb/client/cifs_dfs_ref.c index 0329a907bdfe..b1c2499b1c3b 100644 --- a/fs/smb/client/cifs_dfs_ref.c +++ b/fs/smb/client/cifs_dfs_ref.c @@ -118,12 +118,12 @@ cifs_build_devname(char *nodename, const char *prepath) return dev; } -static int set_dest_addr(struct smb3_fs_context *ctx, const char *full_path) +static int set_dest_addr(struct smb3_fs_context *ctx) { struct sockaddr *addr = (struct sockaddr *)&ctx->dstaddr; int rc; - rc = dns_resolve_server_name_to_ip(full_path, addr, NULL); + rc = dns_resolve_server_name_to_ip(ctx->source, addr, NULL); if (!rc) cifs_set_port(addr, ctx->port); return rc; @@ -171,10 +171,9 @@ static struct vfsmount *cifs_dfs_do_automount(struct path *path) mnt = ERR_CAST(full_path); goto out; } - cifs_dbg(FYI, "%s: full_path: %s\n", __func__, full_path); tmp = *cur_ctx; - tmp.source = full_path; + tmp.source = NULL; tmp.leaf_fullpath = NULL; tmp.UNC = tmp.prepath = NULL; tmp.dfs_root_ses = NULL; @@ -185,13 +184,22 @@ static struct vfsmount *cifs_dfs_do_automount(struct path *path) goto out; } - rc = set_dest_addr(ctx, full_path); + rc = smb3_parse_devname(full_path, ctx); if (rc) { mnt = ERR_PTR(rc); goto out; } - rc = smb3_parse_devname(full_path, ctx); + ctx->source = smb3_fs_context_fullpath(ctx, '/'); + if (IS_ERR(ctx->source)) { + mnt = ERR_CAST(ctx->source); + ctx->source = NULL; + goto out; + } + cifs_dbg(FYI, "%s: ctx: source=%s UNC=%s prepath=%s dstaddr=%pISpc\n", + __func__, ctx->source, ctx->UNC, ctx->prepath, &ctx->dstaddr); + + rc = set_dest_addr(ctx); if (!rc) mnt = fc_mount(fc); else diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 43a4d8603db3..a4d8b0ea1c8c 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -688,6 +688,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",noautotune"); if (tcon->ses->server->noblocksnd) seq_puts(s, ",noblocksend"); + if (tcon->ses->server->nosharesock) + seq_puts(s, ",nosharesock"); if (tcon->snapshot_time) seq_printf(s, ",snapshot=%llu", tcon->snapshot_time); @@ -884,11 +886,11 @@ struct dentry * cifs_smb3_do_mount(struct file_system_type *fs_type, int flags, struct smb3_fs_context *old_ctx) { - int rc; - struct super_block *sb = NULL; - struct cifs_sb_info *cifs_sb = NULL; struct cifs_mnt_data mnt_data; + struct cifs_sb_info *cifs_sb; + struct super_block *sb; struct dentry *root; + int rc; if (cifsFYI) { cifs_dbg(FYI, "%s: devname=%s flags=0x%x\n", __func__, @@ -897,11 +899,9 @@ cifs_smb3_do_mount(struct file_system_type *fs_type, cifs_info("Attempting to mount %s\n", old_ctx->source); } - cifs_sb = kzalloc(sizeof(struct cifs_sb_info), GFP_KERNEL); - if (cifs_sb == NULL) { - root = ERR_PTR(-ENOMEM); - goto out; - } + cifs_sb = kzalloc(sizeof(*cifs_sb), GFP_KERNEL); + if (!cifs_sb) + return ERR_PTR(-ENOMEM); cifs_sb->ctx = kzalloc(sizeof(struct smb3_fs_context), GFP_KERNEL); if (!cifs_sb->ctx) { @@ -938,10 +938,8 @@ cifs_smb3_do_mount(struct file_system_type *fs_type, sb = sget(fs_type, cifs_match_super, cifs_set_super, flags, &mnt_data); if (IS_ERR(sb)) { - root = ERR_CAST(sb); cifs_umount(cifs_sb); - cifs_sb = NULL; - goto out; + return ERR_CAST(sb); } if (sb->s_root) { @@ -972,13 +970,9 @@ out_super: deactivate_locked_super(sb); return root; out: - if (cifs_sb) { - if (!sb || IS_ERR(sb)) { /* otherwise kill_sb will handle */ - kfree(cifs_sb->prepath); - smb3_cleanup_fs_context(cifs_sb->ctx); - kfree(cifs_sb); - } - } + kfree(cifs_sb->prepath); + smb3_cleanup_fs_context(cifs_sb->ctx); + kfree(cifs_sb); return root; } @@ -1376,7 +1370,7 @@ const struct file_operations cifs_file_ops = { .fsync = cifs_fsync, .flush = cifs_flush, .mmap = cifs_file_mmap, - .splice_read = cifs_splice_read, + .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, @@ -1396,7 +1390,7 @@ const struct file_operations cifs_file_strict_ops = { .fsync = cifs_strict_fsync, .flush = cifs_flush, .mmap = cifs_file_strict_mmap, - .splice_read = cifs_splice_read, + .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, @@ -1416,7 +1410,7 @@ const struct file_operations cifs_file_direct_ops = { .fsync = cifs_fsync, .flush = cifs_flush, .mmap = cifs_file_mmap, - .splice_read = direct_splice_read, + .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, @@ -1434,7 +1428,7 @@ const struct file_operations cifs_file_nobrl_ops = { .fsync = cifs_fsync, .flush = cifs_flush, .mmap = cifs_file_mmap, - .splice_read = cifs_splice_read, + .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, @@ -1452,7 +1446,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = { .fsync = cifs_strict_fsync, .flush = cifs_flush, .mmap = cifs_file_strict_mmap, - .splice_read = cifs_splice_read, + .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, @@ -1470,7 +1464,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = { .fsync = cifs_fsync, .flush = cifs_flush, .mmap = cifs_file_mmap, - .splice_read = direct_splice_read, + .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, .unlocked_ioctl = cifs_ioctl, .copy_file_range = cifs_copy_file_range, diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h index 74cd6fafb33e..d7274eefc666 100644 --- a/fs/smb/client/cifsfs.h +++ b/fs/smb/client/cifsfs.h @@ -100,9 +100,6 @@ extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to); extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from); extern ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from); extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from); -extern ssize_t cifs_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags); extern int cifs_flock(struct file *pfile, int cmd, struct file_lock *plock); extern int cifs_lock(struct file *, int, struct file_lock *); extern int cifs_fsync(struct file *, loff_t, loff_t, int); diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index b212a4e16b39..b5808fe3469a 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -736,23 +736,20 @@ struct TCP_Server_Info { #endif struct mutex refpath_lock; /* protects leaf_fullpath */ /* - * origin_fullpath: Canonical copy of smb3_fs_context::source. - * It is used for matching existing DFS tcons. - * * leaf_fullpath: Canonical DFS referral path related to this * connection. * It is used in DFS cache refresher, reconnect and may * change due to nested DFS links. * - * Both protected by @refpath_lock and @srv_lock. The @refpath_lock is - * mosly used for not requiring a copy of @leaf_fullpath when getting + * Protected by @refpath_lock and @srv_lock. The @refpath_lock is + * mostly used for not requiring a copy of @leaf_fullpath when getting * cached or new DFS referrals (which might also sleep during I/O). * While @srv_lock is held for making string and NULL comparions against * both fields as in mount(2) and cache refresh. * * format: \\HOST\SHARE[\OPTIONAL PATH] */ - char *origin_fullpath, *leaf_fullpath; + char *leaf_fullpath; }; static inline bool is_smb1(struct TCP_Server_Info *server) @@ -1205,6 +1202,7 @@ struct cifs_tcon { struct delayed_work dfs_cache_work; #endif struct delayed_work query_interfaces; /* query interfaces workqueue job */ + char *origin_fullpath; /* canonical copy of smb3_fs_context::source */ }; /* @@ -2181,7 +2179,7 @@ static inline void cifs_sg_set_buf(struct sg_table *sgtable, } while (buflen); } else { sg_set_page(&sgtable->sgl[sgtable->nents++], - virt_to_page(addr), buflen, off); + virt_to_page((void *)addr), buflen, off); } } diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index d127aded2f28..1d71d658e167 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -85,6 +85,8 @@ extern void release_mid(struct mid_q_entry *mid); extern void cifs_wake_up_task(struct mid_q_entry *mid); extern int cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid); +extern char *smb3_fs_context_fullpath(const struct smb3_fs_context *ctx, + char dirsep); extern int smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx); extern int smb3_parse_opt(const char *options, const char *key, char **val); extern int cifs_ipaddr_cmp(struct sockaddr *srcaddr, struct sockaddr *rhs); @@ -650,7 +652,7 @@ int smb2_parse_query_directory(struct cifs_tcon *tcon, struct kvec *rsp_iov, int resp_buftype, struct cifs_search_info *srch_inf); -struct super_block *cifs_get_tcp_super(struct TCP_Server_Info *server); +struct super_block *cifs_get_dfs_tcon_super(struct cifs_tcon *tcon); void cifs_put_tcp_super(struct super_block *sb); int cifs_update_super_prepath(struct cifs_sb_info *cifs_sb, char *prefix); char *extract_hostname(const char *unc); diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 9d963caec35c..19f7385abeec 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -3958,11 +3958,12 @@ CIFSFindFirst(const unsigned int xid, struct cifs_tcon *tcon, TRANSACTION2_FFIRST_REQ *pSMB = NULL; TRANSACTION2_FFIRST_RSP *pSMBr = NULL; T2_FFIRST_RSP_PARMS *parms; - int rc = 0; + struct nls_table *nls_codepage; + unsigned int lnoff; + __u16 params, byte_count; int bytes_returned = 0; int name_len, remap; - __u16 params, byte_count; - struct nls_table *nls_codepage; + int rc = 0; cifs_dbg(FYI, "In FindFirst for %s\n", searchName); @@ -4043,63 +4044,52 @@ findFirstRetry: (struct smb_hdr *) pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_ffirst); - if (rc) {/* BB add logic to retry regular search if Unix search - rejected unexpectedly by server */ - /* BB Add code to handle unsupported level rc */ + if (rc) { + /* + * BB: add logic to retry regular search if Unix search rejected + * unexpectedly by server. + */ + /* BB: add code to handle unsupported level rc */ cifs_dbg(FYI, "Error in FindFirst = %d\n", rc); - cifs_buf_release(pSMB); - - /* BB eventually could optimize out free and realloc of buf */ - /* for this case */ + /* + * BB: eventually could optimize out free and realloc of buf for + * this case. + */ if (rc == -EAGAIN) goto findFirstRetry; - } else { /* decode response */ - /* BB remember to free buffer if error BB */ - rc = validate_t2((struct smb_t2_rsp *)pSMBr); - if (rc == 0) { - unsigned int lnoff; - - if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) - psrch_inf->unicode = true; - else - psrch_inf->unicode = false; - - psrch_inf->ntwrk_buf_start = (char *)pSMBr; - psrch_inf->smallBuf = false; - psrch_inf->srch_entries_start = - (char *) &pSMBr->hdr.Protocol + - le16_to_cpu(pSMBr->t2.DataOffset); - parms = (T2_FFIRST_RSP_PARMS *)((char *) &pSMBr->hdr.Protocol + - le16_to_cpu(pSMBr->t2.ParameterOffset)); - - if (parms->EndofSearch) - psrch_inf->endOfSearch = true; - else - psrch_inf->endOfSearch = false; - - psrch_inf->entries_in_buffer = - le16_to_cpu(parms->SearchCount); - psrch_inf->index_of_last_entry = 2 /* skip . and .. */ + - psrch_inf->entries_in_buffer; - lnoff = le16_to_cpu(parms->LastNameOffset); - if (CIFSMaxBufSize < lnoff) { - cifs_dbg(VFS, "ignoring corrupt resume name\n"); - psrch_inf->last_entry = NULL; - return rc; - } - - psrch_inf->last_entry = psrch_inf->srch_entries_start + - lnoff; - - if (pnetfid) - *pnetfid = parms->SearchHandle; - } else { - cifs_buf_release(pSMB); - } + return rc; + } + /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + if (rc) { + cifs_buf_release(pSMB); + return rc; } - return rc; + psrch_inf->unicode = !!(pSMBr->hdr.Flags2 & SMBFLG2_UNICODE); + psrch_inf->ntwrk_buf_start = (char *)pSMBr; + psrch_inf->smallBuf = false; + psrch_inf->srch_entries_start = (char *)&pSMBr->hdr.Protocol + + le16_to_cpu(pSMBr->t2.DataOffset); + + parms = (T2_FFIRST_RSP_PARMS *)((char *)&pSMBr->hdr.Protocol + + le16_to_cpu(pSMBr->t2.ParameterOffset)); + psrch_inf->endOfSearch = !!parms->EndofSearch; + + psrch_inf->entries_in_buffer = le16_to_cpu(parms->SearchCount); + psrch_inf->index_of_last_entry = 2 /* skip . and .. */ + + psrch_inf->entries_in_buffer; + lnoff = le16_to_cpu(parms->LastNameOffset); + if (CIFSMaxBufSize < lnoff) { + cifs_dbg(VFS, "ignoring corrupt resume name\n"); + psrch_inf->last_entry = NULL; + } else { + psrch_inf->last_entry = psrch_inf->srch_entries_start + lnoff; + if (pnetfid) + *pnetfid = parms->SearchHandle; + } + return 0; } int CIFSFindNext(const unsigned int xid, struct cifs_tcon *tcon, @@ -4109,11 +4099,12 @@ int CIFSFindNext(const unsigned int xid, struct cifs_tcon *tcon, TRANSACTION2_FNEXT_REQ *pSMB = NULL; TRANSACTION2_FNEXT_RSP *pSMBr = NULL; T2_FNEXT_RSP_PARMS *parms; - char *response_data; - int rc = 0; - int bytes_returned; unsigned int name_len; + unsigned int lnoff; __u16 params, byte_count; + char *response_data; + int bytes_returned; + int rc = 0; cifs_dbg(FYI, "In FindNext\n"); @@ -4158,8 +4149,8 @@ int CIFSFindNext(const unsigned int xid, struct cifs_tcon *tcon, pSMB->ResumeFileName[name_len] = 0; pSMB->ResumeFileName[name_len+1] = 0; } else { - rc = -EINVAL; - goto FNext2_err_exit; + cifs_buf_release(pSMB); + return -EINVAL; } byte_count = params + 1 /* pad */ ; pSMB->TotalParameterCount = cpu_to_le16(params); @@ -4170,71 +4161,61 @@ int CIFSFindNext(const unsigned int xid, struct cifs_tcon *tcon, rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->stats.cifs_stats.num_fnext); + if (rc) { + cifs_buf_release(pSMB); if (rc == -EBADF) { psrch_inf->endOfSearch = true; - cifs_buf_release(pSMB); rc = 0; /* search probably was closed at end of search*/ - } else + } else { cifs_dbg(FYI, "FindNext returned = %d\n", rc); - } else { /* decode response */ - rc = validate_t2((struct smb_t2_rsp *)pSMBr); - - if (rc == 0) { - unsigned int lnoff; - - /* BB fixme add lock for file (srch_info) struct here */ - if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) - psrch_inf->unicode = true; - else - psrch_inf->unicode = false; - response_data = (char *) &pSMBr->hdr.Protocol + - le16_to_cpu(pSMBr->t2.ParameterOffset); - parms = (T2_FNEXT_RSP_PARMS *)response_data; - response_data = (char *)&pSMBr->hdr.Protocol + - le16_to_cpu(pSMBr->t2.DataOffset); - if (psrch_inf->smallBuf) - cifs_small_buf_release( - psrch_inf->ntwrk_buf_start); - else - cifs_buf_release(psrch_inf->ntwrk_buf_start); - psrch_inf->srch_entries_start = response_data; - psrch_inf->ntwrk_buf_start = (char *)pSMB; - psrch_inf->smallBuf = false; - if (parms->EndofSearch) - psrch_inf->endOfSearch = true; - else - psrch_inf->endOfSearch = false; - psrch_inf->entries_in_buffer = - le16_to_cpu(parms->SearchCount); - psrch_inf->index_of_last_entry += - psrch_inf->entries_in_buffer; - lnoff = le16_to_cpu(parms->LastNameOffset); - if (CIFSMaxBufSize < lnoff) { - cifs_dbg(VFS, "ignoring corrupt resume name\n"); - psrch_inf->last_entry = NULL; - return rc; - } else - psrch_inf->last_entry = - psrch_inf->srch_entries_start + lnoff; - -/* cifs_dbg(FYI, "fnxt2 entries in buf %d index_of_last %d\n", - psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry); */ - - /* BB fixme add unlock here */ } - + return rc; } - /* BB On error, should we leave previous search buf (and count and - last entry fields) intact or free the previous one? */ - - /* Note: On -EAGAIN error only caller can retry on handle based calls - since file handle passed in no longer valid */ -FNext2_err_exit: - if (rc != 0) + /* decode response */ + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + if (rc) { cifs_buf_release(pSMB); - return rc; + return rc; + } + /* BB fixme add lock for file (srch_info) struct here */ + psrch_inf->unicode = !!(pSMBr->hdr.Flags2 & SMBFLG2_UNICODE); + response_data = (char *)&pSMBr->hdr.Protocol + + le16_to_cpu(pSMBr->t2.ParameterOffset); + parms = (T2_FNEXT_RSP_PARMS *)response_data; + response_data = (char *)&pSMBr->hdr.Protocol + + le16_to_cpu(pSMBr->t2.DataOffset); + + if (psrch_inf->smallBuf) + cifs_small_buf_release(psrch_inf->ntwrk_buf_start); + else + cifs_buf_release(psrch_inf->ntwrk_buf_start); + + psrch_inf->srch_entries_start = response_data; + psrch_inf->ntwrk_buf_start = (char *)pSMB; + psrch_inf->smallBuf = false; + psrch_inf->endOfSearch = !!parms->EndofSearch; + psrch_inf->entries_in_buffer = le16_to_cpu(parms->SearchCount); + psrch_inf->index_of_last_entry += psrch_inf->entries_in_buffer; + lnoff = le16_to_cpu(parms->LastNameOffset); + if (CIFSMaxBufSize < lnoff) { + cifs_dbg(VFS, "ignoring corrupt resume name\n"); + psrch_inf->last_entry = NULL; + } else { + psrch_inf->last_entry = + psrch_inf->srch_entries_start + lnoff; + } + /* BB fixme add unlock here */ + + /* + * BB: On error, should we leave previous search buf + * (and count and last entry fields) intact or free the previous one? + * + * Note: On -EAGAIN error only caller can retry on handle based calls + * since file handle passed in no longer valid. + */ + return 0; } int diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 9d16626e7a66..dab7bc876507 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -996,7 +996,6 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) */ } - kfree(server->origin_fullpath); kfree(server->leaf_fullpath); kfree(server); @@ -1436,7 +1435,9 @@ match_security(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) } /* this function must be called with srv_lock held */ -static int match_server(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) +static int match_server(struct TCP_Server_Info *server, + struct smb3_fs_context *ctx, + bool match_super) { struct sockaddr *addr = (struct sockaddr *)&ctx->dstaddr; @@ -1467,36 +1468,38 @@ static int match_server(struct TCP_Server_Info *server, struct smb3_fs_context * (struct sockaddr *)&server->srcaddr)) return 0; /* - * - Match for an DFS tcon (@server->origin_fullpath). - * - Match for an DFS root server connection (@server->leaf_fullpath). - * - If none of the above and @ctx->leaf_fullpath is set, then - * it is a new DFS connection. - * - If 'nodfs' mount option was passed, then match only connections - * that have no DFS referrals set - * (e.g. can't failover to other targets). + * When matching cifs.ko superblocks (@match_super == true), we can't + * really match either @server->leaf_fullpath or @server->dstaddr + * directly since this @server might belong to a completely different + * server -- in case of domain-based DFS referrals or DFS links -- as + * provided earlier by mount(2) through 'source' and 'ip' options. + * + * Otherwise, match the DFS referral in @server->leaf_fullpath or the + * destination address in @server->dstaddr. + * + * When using 'nodfs' mount option, we avoid sharing it with DFS + * connections as they might failover. */ - if (!ctx->nodfs) { - if (ctx->source && server->origin_fullpath) { - if (!dfs_src_pathname_equal(ctx->source, - server->origin_fullpath)) + if (!match_super) { + if (!ctx->nodfs) { + if (server->leaf_fullpath) { + if (!ctx->leaf_fullpath || + strcasecmp(server->leaf_fullpath, + ctx->leaf_fullpath)) + return 0; + } else if (ctx->leaf_fullpath) { return 0; + } } else if (server->leaf_fullpath) { - if (!ctx->leaf_fullpath || - strcasecmp(server->leaf_fullpath, - ctx->leaf_fullpath)) - return 0; - } else if (ctx->leaf_fullpath) { return 0; } - } else if (server->origin_fullpath || server->leaf_fullpath) { - return 0; } /* * Match for a regular connection (address/hostname/port) which has no * DFS referrals set. */ - if (!server->origin_fullpath && !server->leaf_fullpath && + if (!server->leaf_fullpath && (strcasecmp(server->hostname, ctx->server_hostname) || !match_server_address(server, addr) || !match_port(server, addr))) @@ -1532,7 +1535,8 @@ cifs_find_tcp_session(struct smb3_fs_context *ctx) * Skip ses channels since they're only handled in lower layers * (e.g. cifs_send_recv). */ - if (CIFS_SERVER_IS_CHAN(server) || !match_server(server, ctx)) { + if (CIFS_SERVER_IS_CHAN(server) || + !match_server(server, ctx, false)) { spin_unlock(&server->srv_lock); continue; } @@ -2320,10 +2324,16 @@ static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) if (tcon->status == TID_EXITING) return 0; - /* Skip UNC validation when matching DFS connections or superblocks */ - if (!server->origin_fullpath && !server->leaf_fullpath && - strncmp(tcon->tree_name, ctx->UNC, MAX_TREE_SIZE)) + + if (tcon->origin_fullpath) { + if (!ctx->source || + !dfs_src_pathname_equal(ctx->source, + tcon->origin_fullpath)) + return 0; + } else if (!server->leaf_fullpath && + strncmp(tcon->tree_name, ctx->UNC, MAX_TREE_SIZE)) { return 0; + } if (tcon->seal != ctx->seal) return 0; if (tcon->snapshot_time != ctx->snapshot_time) @@ -2722,7 +2732,7 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) } static int match_prepath(struct super_block *sb, - struct TCP_Server_Info *server, + struct cifs_tcon *tcon, struct cifs_mnt_data *mnt_data) { struct smb3_fs_context *ctx = mnt_data->ctx; @@ -2733,8 +2743,8 @@ static int match_prepath(struct super_block *sb, bool new_set = (new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) && new->prepath; - if (server->origin_fullpath && - dfs_src_pathname_equal(server->origin_fullpath, ctx->source)) + if (tcon->origin_fullpath && + dfs_src_pathname_equal(tcon->origin_fullpath, ctx->source)) return 1; if (old_set && new_set && !strcmp(new->prepath, old->prepath)) @@ -2767,8 +2777,9 @@ cifs_match_super(struct super_block *sb, void *data) } tlink = cifs_get_tlink(cifs_sb_master_tlink(cifs_sb)); - if (tlink == NULL) { - /* can not match superblock if tlink were ever null */ + if (IS_ERR_OR_NULL(tlink)) { + pr_warn_once("%s: skip super matching due to bad tlink(%p)\n", + __func__, tlink); spin_unlock(&cifs_tcp_ses_lock); return 0; } @@ -2782,10 +2793,10 @@ cifs_match_super(struct super_block *sb, void *data) spin_lock(&ses->ses_lock); spin_lock(&ses->chan_lock); spin_lock(&tcon->tc_lock); - if (!match_server(tcp_srv, ctx) || + if (!match_server(tcp_srv, ctx, true) || !match_session(ses, ctx) || !match_tcon(tcon, ctx) || - !match_prepath(sb, tcp_srv, mnt_data)) { + !match_prepath(sb, tcon, mnt_data)) { rc = 0; goto out; } @@ -2933,11 +2944,11 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) static int generic_ip_connect(struct TCP_Server_Info *server) { - int rc = 0; - __be16 sport; - int slen, sfamily; - struct socket *socket = server->ssocket; struct sockaddr *saddr; + struct socket *socket; + int slen, sfamily; + __be16 sport; + int rc = 0; saddr = (struct sockaddr *) &server->dstaddr; @@ -2959,18 +2970,19 @@ generic_ip_connect(struct TCP_Server_Info *server) ntohs(sport)); } - if (socket == NULL) { + if (server->ssocket) { + socket = server->ssocket; + } else { rc = __sock_create(cifs_net_ns(server), sfamily, SOCK_STREAM, - IPPROTO_TCP, &socket, 1); + IPPROTO_TCP, &server->ssocket, 1); if (rc < 0) { cifs_server_dbg(VFS, "Error %d creating socket\n", rc); - server->ssocket = NULL; return rc; } /* BB other socket options to set KEEPALIVE, NODELAY? */ cifs_dbg(FYI, "Socket created\n"); - server->ssocket = socket; + socket = server->ssocket; socket->sk->sk_allocation = GFP_NOFS; socket->sk->sk_use_task_frag = false; if (sfamily == AF_INET6) diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c index 2390b2fedd6a..26d14dd0482e 100644 --- a/fs/smb/client/dfs.c +++ b/fs/smb/client/dfs.c @@ -54,39 +54,6 @@ out: return rc; } -/* - * cifs_build_path_to_root returns full path to root when we do not have an - * existing connection (tcon) - */ -static char *build_unc_path_to_root(const struct smb3_fs_context *ctx, - const struct cifs_sb_info *cifs_sb, bool useppath) -{ - char *full_path, *pos; - unsigned int pplen = useppath && ctx->prepath ? strlen(ctx->prepath) + 1 : 0; - unsigned int unc_len = strnlen(ctx->UNC, MAX_TREE_SIZE + 1); - - if (unc_len > MAX_TREE_SIZE) - return ERR_PTR(-EINVAL); - - full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL); - if (full_path == NULL) - return ERR_PTR(-ENOMEM); - - memcpy(full_path, ctx->UNC, unc_len); - pos = full_path + unc_len; - - if (pplen) { - *pos = CIFS_DIR_SEP(cifs_sb); - memcpy(pos + 1, ctx->prepath, pplen); - pos += pplen; - } - - *pos = '\0'; /* add trailing null */ - convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb)); - cifs_dbg(FYI, "%s: full_path=%s\n", __func__, full_path); - return full_path; -} - static int get_session(struct cifs_mount_ctx *mnt_ctx, const char *full_path) { struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; @@ -179,6 +146,7 @@ static int __dfs_mount_share(struct cifs_mount_ctx *mnt_ctx) struct TCP_Server_Info *server; struct cifs_tcon *tcon; char *origin_fullpath = NULL; + char sep = CIFS_DIR_SEP(cifs_sb); int num_links = 0; int rc; @@ -186,7 +154,7 @@ static int __dfs_mount_share(struct cifs_mount_ctx *mnt_ctx) if (IS_ERR(ref_path)) return PTR_ERR(ref_path); - full_path = build_unc_path_to_root(ctx, cifs_sb, true); + full_path = smb3_fs_context_fullpath(ctx, sep); if (IS_ERR(full_path)) { rc = PTR_ERR(full_path); full_path = NULL; @@ -228,7 +196,7 @@ static int __dfs_mount_share(struct cifs_mount_ctx *mnt_ctx) kfree(full_path); ref_path = full_path = NULL; - full_path = build_unc_path_to_root(ctx, cifs_sb, true); + full_path = smb3_fs_context_fullpath(ctx, sep); if (IS_ERR(full_path)) { rc = PTR_ERR(full_path); full_path = NULL; @@ -249,14 +217,12 @@ static int __dfs_mount_share(struct cifs_mount_ctx *mnt_ctx) server = mnt_ctx->server; tcon = mnt_ctx->tcon; - mutex_lock(&server->refpath_lock); - spin_lock(&server->srv_lock); - if (!server->origin_fullpath) { - server->origin_fullpath = origin_fullpath; + spin_lock(&tcon->tc_lock); + if (!tcon->origin_fullpath) { + tcon->origin_fullpath = origin_fullpath; origin_fullpath = NULL; } - spin_unlock(&server->srv_lock); - mutex_unlock(&server->refpath_lock); + spin_unlock(&tcon->tc_lock); if (list_empty(&tcon->dfs_ses_list)) { list_replace_init(&mnt_ctx->dfs_ses_list, @@ -279,18 +245,13 @@ int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs) { struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; struct cifs_ses *ses; - char *source = ctx->source; bool nodfs = ctx->nodfs; int rc; *isdfs = false; - /* Temporarily set @ctx->source to NULL as we're not matching DFS - * superblocks yet. See cifs_match_super() and match_server(). - */ - ctx->source = NULL; rc = get_session(mnt_ctx, NULL); if (rc) - goto out; + return rc; ctx->dfs_root_ses = mnt_ctx->ses; /* @@ -303,8 +264,9 @@ int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs) if (!nodfs) { rc = dfs_get_referral(mnt_ctx, ctx->UNC + 1, NULL, NULL); if (rc) { - if (rc != -ENOENT && rc != -EOPNOTSUPP && rc != -EIO) - goto out; + cifs_dbg(FYI, "%s: no dfs referral for %s: %d\n", + __func__, ctx->UNC + 1, rc); + cifs_dbg(FYI, "%s: assuming non-dfs mount...\n", __func__); nodfs = true; } } @@ -312,7 +274,7 @@ int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs) rc = cifs_mount_get_tcon(mnt_ctx); if (!rc) rc = cifs_is_path_remote(mnt_ctx); - goto out; + return rc; } *isdfs = true; @@ -328,12 +290,7 @@ int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs) rc = __dfs_mount_share(mnt_ctx); if (ses == ctx->dfs_root_ses) cifs_put_smb_ses(ses); -out: - /* - * Restore previous value of @ctx->source so DFS superblock can be - * matched in cifs_match_super(). - */ - ctx->source = source; + return rc; } @@ -567,11 +524,11 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru int rc; struct TCP_Server_Info *server = tcon->ses->server; const struct smb_version_operations *ops = server->ops; - struct super_block *sb = NULL; - struct cifs_sb_info *cifs_sb; struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl); - char *tree; + struct cifs_sb_info *cifs_sb = NULL; + struct super_block *sb = NULL; struct dfs_info3_param ref = {0}; + char *tree; /* only send once per connect */ spin_lock(&tcon->tc_lock); @@ -603,19 +560,18 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru goto out; } - sb = cifs_get_tcp_super(server); - if (IS_ERR(sb)) { - rc = PTR_ERR(sb); - cifs_dbg(VFS, "%s: could not find superblock: %d\n", __func__, rc); - goto out; - } - - cifs_sb = CIFS_SB(sb); + sb = cifs_get_dfs_tcon_super(tcon); + if (!IS_ERR(sb)) + cifs_sb = CIFS_SB(sb); - /* If it is not dfs or there was no cached dfs referral, then reconnect to same share */ - if (!server->leaf_fullpath || + /* + * Tree connect to last share in @tcon->tree_name whether dfs super or + * cached dfs referral was not found. + */ + if (!cifs_sb || !server->leaf_fullpath || dfs_cache_noreq_find(server->leaf_fullpath + 1, &ref, &tl)) { - rc = ops->tree_connect(xid, tcon->ses, tcon->tree_name, tcon, cifs_sb->local_nls); + rc = ops->tree_connect(xid, tcon->ses, tcon->tree_name, tcon, + cifs_sb ? cifs_sb->local_nls : nlsc); goto out; } diff --git a/fs/smb/client/dfs.h b/fs/smb/client/dfs.h index 1c90df5ecfbd..98e9d2aca6a7 100644 --- a/fs/smb/client/dfs.h +++ b/fs/smb/client/dfs.h @@ -39,16 +39,15 @@ static inline char *dfs_get_automount_devname(struct dentry *dentry, void *page) { struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); - struct TCP_Server_Info *server = tcon->ses->server; size_t len; char *s; - spin_lock(&server->srv_lock); - if (unlikely(!server->origin_fullpath)) { - spin_unlock(&server->srv_lock); + spin_lock(&tcon->tc_lock); + if (unlikely(!tcon->origin_fullpath)) { + spin_unlock(&tcon->tc_lock); return ERR_PTR(-EREMOTE); } - spin_unlock(&server->srv_lock); + spin_unlock(&tcon->tc_lock); s = dentry_path_raw(dentry, page, PATH_MAX); if (IS_ERR(s)) @@ -57,16 +56,16 @@ static inline char *dfs_get_automount_devname(struct dentry *dentry, void *page) if (!s[1]) s++; - spin_lock(&server->srv_lock); - len = strlen(server->origin_fullpath); + spin_lock(&tcon->tc_lock); + len = strlen(tcon->origin_fullpath); if (s < (char *)page + len) { - spin_unlock(&server->srv_lock); + spin_unlock(&tcon->tc_lock); return ERR_PTR(-ENAMETOOLONG); } s -= len; - memcpy(s, server->origin_fullpath, len); - spin_unlock(&server->srv_lock); + memcpy(s, tcon->origin_fullpath, len); + spin_unlock(&tcon->tc_lock); convert_delimiter(s, '/'); return s; diff --git a/fs/smb/client/dfs_cache.c b/fs/smb/client/dfs_cache.c index 1513b2709889..33adf43a01f1 100644 --- a/fs/smb/client/dfs_cache.c +++ b/fs/smb/client/dfs_cache.c @@ -1248,18 +1248,20 @@ static int refresh_tcon(struct cifs_tcon *tcon, bool force_refresh) int dfs_cache_remount_fs(struct cifs_sb_info *cifs_sb) { struct cifs_tcon *tcon; - struct TCP_Server_Info *server; if (!cifs_sb || !cifs_sb->master_tlink) return -EINVAL; tcon = cifs_sb_master_tcon(cifs_sb); - server = tcon->ses->server; - if (!server->origin_fullpath) { + spin_lock(&tcon->tc_lock); + if (!tcon->origin_fullpath) { + spin_unlock(&tcon->tc_lock); cifs_dbg(FYI, "%s: not a dfs mount\n", __func__); return 0; } + spin_unlock(&tcon->tc_lock); + /* * After reconnecting to a different server, unique ids won't match anymore, so we disable * serverino. This prevents dentry revalidation to think the dentry are stale (ESTALE). diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 051283386e22..879bc8e6555c 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -4936,20 +4936,19 @@ oplock_break_ack: _cifsFileInfo_put(cfile, false /* do not wait for ourself */, false); /* - * releasing stale oplock after recent reconnect of smb session using - * a now incorrect file handle is not a data integrity issue but do - * not bother sending an oplock release if session to server still is - * disconnected since oplock already released by the server + * MS-SMB2 3.2.5.19.1 and 3.2.5.19.2 (and MS-CIFS 3.2.5.42) do not require + * an acknowledgment to be sent when the file has already been closed. + * check for server null, since can race with kill_sb calling tree disconnect. */ - if (!oplock_break_cancelled) { - /* check for server null since can race with kill_sb calling tree disconnect */ - if (tcon->ses && tcon->ses->server) { - rc = tcon->ses->server->ops->oplock_response(tcon, persistent_fid, - volatile_fid, net_fid, cinode); - cifs_dbg(FYI, "Oplock release rc = %d\n", rc); - } else - pr_warn_once("lease break not sent for unmounted share\n"); - } + spin_lock(&cinode->open_file_lock); + if (tcon->ses && tcon->ses->server && !oplock_break_cancelled && + !list_empty(&cinode->openFileList)) { + spin_unlock(&cinode->open_file_lock); + rc = tcon->ses->server->ops->oplock_response(tcon, persistent_fid, + volatile_fid, net_fid, cinode); + cifs_dbg(FYI, "Oplock release rc = %d\n", rc); + } else + spin_unlock(&cinode->open_file_lock); cifs_done_oplock_break(cinode); } @@ -5083,19 +5082,3 @@ const struct address_space_operations cifs_addr_ops_smallbuf = { .launder_folio = cifs_launder_folio, .migrate_folio = filemap_migrate_folio, }; - -/* - * Splice data from a file into a pipe. - */ -ssize_t cifs_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) -{ - if (unlikely(*ppos >= file_inode(in)->i_sb->s_maxbytes)) - return 0; - if (unlikely(!len)) - return 0; - if (in->f_flags & O_DIRECT) - return direct_splice_read(in, ppos, pipe, len, flags); - return filemap_splice_read(in, ppos, pipe, len, flags); -} diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index 1bda75609b64..4946a0c59600 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -441,14 +441,17 @@ out: * but there are some bugs that prevent rename from working if there are * multiple delimiters. * - * Returns a sanitized duplicate of @path. @gfp indicates the GFP_* flags - * for kstrdup. + * Return a sanitized duplicate of @path or NULL for empty prefix paths. + * Otherwise, return ERR_PTR. + * + * @gfp indicates the GFP_* flags for kstrdup. * The caller is responsible for freeing the original. */ #define IS_DELIM(c) ((c) == '/' || (c) == '\\') char *cifs_sanitize_prepath(char *prepath, gfp_t gfp) { char *cursor1 = prepath, *cursor2 = prepath; + char *s; /* skip all prepended delimiters */ while (IS_DELIM(*cursor1)) @@ -469,8 +472,39 @@ char *cifs_sanitize_prepath(char *prepath, gfp_t gfp) if (IS_DELIM(*(cursor2 - 1))) cursor2--; - *(cursor2) = '\0'; - return kstrdup(prepath, gfp); + *cursor2 = '\0'; + if (!*prepath) + return NULL; + s = kstrdup(prepath, gfp); + if (!s) + return ERR_PTR(-ENOMEM); + return s; +} + +/* + * Return full path based on the values of @ctx->{UNC,prepath}. + * + * It is assumed that both values were already parsed by smb3_parse_devname(). + */ +char *smb3_fs_context_fullpath(const struct smb3_fs_context *ctx, char dirsep) +{ + size_t ulen, plen; + char *s; + + ulen = strlen(ctx->UNC); + plen = ctx->prepath ? strlen(ctx->prepath) + 1 : 0; + + s = kmalloc(ulen + plen + 1, GFP_KERNEL); + if (!s) + return ERR_PTR(-ENOMEM); + memcpy(s, ctx->UNC, ulen); + if (plen) { + s[ulen] = dirsep; + memcpy(s + ulen + 1, ctx->prepath, plen); + } + s[ulen + plen] = '\0'; + convert_delimiter(s, dirsep); + return s; } /* @@ -484,6 +518,7 @@ smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx) char *pos; const char *delims = "/\\"; size_t len; + int rc; if (unlikely(!devname || !*devname)) { cifs_dbg(VFS, "Device name not specified\n"); @@ -511,6 +546,8 @@ smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx) /* now go until next delimiter or end of string */ len = strcspn(pos, delims); + if (!len) + return -EINVAL; /* move "pos" up to delimiter or NULL */ pos += len; @@ -533,8 +570,11 @@ smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx) return 0; ctx->prepath = cifs_sanitize_prepath(pos, GFP_KERNEL); - if (!ctx->prepath) - return -ENOMEM; + if (IS_ERR(ctx->prepath)) { + rc = PTR_ERR(ctx->prepath); + ctx->prepath = NULL; + return rc; + } return 0; } @@ -1146,12 +1186,13 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, cifs_errorf(fc, "Unknown error parsing devname\n"); goto cifs_parse_mount_err; } - ctx->source = kstrdup(param->string, GFP_KERNEL); - if (ctx->source == NULL) { + ctx->source = smb3_fs_context_fullpath(ctx, '/'); + if (IS_ERR(ctx->source)) { + ctx->source = NULL; cifs_errorf(fc, "OOM when copying UNC string\n"); goto cifs_parse_mount_err; } - fc->source = kstrdup(param->string, GFP_KERNEL); + fc->source = kstrdup(ctx->source, GFP_KERNEL); if (fc->source == NULL) { cifs_errorf(fc, "OOM when copying UNC string\n"); goto cifs_parse_mount_err; diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 1087ac6104a9..c3eeae07e139 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -2344,8 +2344,8 @@ cifs_invalidate_mapping(struct inode *inode) if (inode->i_mapping && inode->i_mapping->nrpages != 0) { rc = invalidate_inode_pages2(inode->i_mapping); if (rc) - cifs_dbg(VFS, "%s: Could not invalidate inode %p\n", - __func__, inode); + cifs_dbg(VFS, "%s: invalidate inode %p failed with rc %d\n", + __func__, inode, rc); } return rc; diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index cd914be905b2..70dbfe6584f9 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -156,6 +156,7 @@ tconInfoFree(struct cifs_tcon *tcon) #ifdef CONFIG_CIFS_DFS_UPCALL dfs_put_root_smb_sessions(&tcon->dfs_ses_list); #endif + kfree(tcon->origin_fullpath); kfree(tcon); } @@ -1106,20 +1107,25 @@ struct super_cb_data { struct super_block *sb; }; -static void tcp_super_cb(struct super_block *sb, void *arg) +static void tcon_super_cb(struct super_block *sb, void *arg) { struct super_cb_data *sd = arg; - struct TCP_Server_Info *server = sd->data; struct cifs_sb_info *cifs_sb; - struct cifs_tcon *tcon; + struct cifs_tcon *t1 = sd->data, *t2; if (sd->sb) return; cifs_sb = CIFS_SB(sb); - tcon = cifs_sb_master_tcon(cifs_sb); - if (tcon->ses->server == server) + t2 = cifs_sb_master_tcon(cifs_sb); + + spin_lock(&t2->tc_lock); + if (t1->ses == t2->ses && + t1->ses->server == t2->ses->server && + t2->origin_fullpath && + dfs_src_pathname_equal(t2->origin_fullpath, t1->origin_fullpath)) sd->sb = sb; + spin_unlock(&t2->tc_lock); } static struct super_block *__cifs_get_super(void (*f)(struct super_block *, void *), @@ -1145,6 +1151,7 @@ static struct super_block *__cifs_get_super(void (*f)(struct super_block *, void return sd.sb; } } + pr_warn_once("%s: could not find dfs superblock\n", __func__); return ERR_PTR(-EINVAL); } @@ -1154,9 +1161,15 @@ static void __cifs_put_super(struct super_block *sb) cifs_sb_deactive(sb); } -struct super_block *cifs_get_tcp_super(struct TCP_Server_Info *server) +struct super_block *cifs_get_dfs_tcon_super(struct cifs_tcon *tcon) { - return __cifs_get_super(tcp_super_cb, server); + spin_lock(&tcon->tc_lock); + if (!tcon->origin_fullpath) { + spin_unlock(&tcon->tc_lock); + return ERR_PTR(-ENOENT); + } + spin_unlock(&tcon->tc_lock); + return __cifs_get_super(tcon_super_cb, tcon); } void cifs_put_tcp_super(struct super_block *sb) @@ -1198,16 +1211,21 @@ int match_target_ip(struct TCP_Server_Info *server, int cifs_update_super_prepath(struct cifs_sb_info *cifs_sb, char *prefix) { + int rc; + kfree(cifs_sb->prepath); + cifs_sb->prepath = NULL; if (prefix && *prefix) { cifs_sb->prepath = cifs_sanitize_prepath(prefix, GFP_ATOMIC); - if (!cifs_sb->prepath) - return -ENOMEM; - - convert_delimiter(cifs_sb->prepath, CIFS_DIR_SEP(cifs_sb)); - } else - cifs_sb->prepath = NULL; + if (IS_ERR(cifs_sb->prepath)) { + rc = PTR_ERR(cifs_sb->prepath); + cifs_sb->prepath = NULL; + return rc; + } + if (cifs_sb->prepath) + convert_delimiter(cifs_sb->prepath, CIFS_DIR_SEP(cifs_sb)); + } cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; return 0; @@ -1238,9 +1256,16 @@ int cifs_inval_name_dfs_link_error(const unsigned int xid, */ if (strlen(full_path) < 2 || !cifs_sb || (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) || - !is_tcon_dfs(tcon) || !ses->server->origin_fullpath) + !is_tcon_dfs(tcon)) return 0; + spin_lock(&tcon->tc_lock); + if (!tcon->origin_fullpath) { + spin_unlock(&tcon->tc_lock); + return 0; + } + spin_unlock(&tcon->tc_lock); + /* * Slow path - tcon is DFS and @full_path has prefix path, so attempt * to get a referral to figure out whether it is an DFS link. @@ -1264,7 +1289,7 @@ int cifs_inval_name_dfs_link_error(const unsigned int xid, /* * XXX: we are not using dfs_cache_find() here because we might - * end filling all the DFS cache and thus potentially + * end up filling all the DFS cache and thus potentially * removing cached DFS targets that the client would eventually * need during failover. */ diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 163a03298430..8e696fbd72fa 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -398,9 +398,6 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, rsp_iov); finished: - if (cfile) - cifsFileInfo_put(cfile); - SMB2_open_free(&rqst[0]); if (rc == -EREMCHG) { pr_warn_once("server share %s deleted\n", tcon->tree_name); @@ -529,6 +526,9 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, break; } + if (cfile) + cifsFileInfo_put(cfile); + if (rc && err_iov && err_buftype) { memcpy(err_iov, rsp_iov, 3 * sizeof(*err_iov)); memcpy(err_buftype, resp_buftype, 3 * sizeof(*err_buftype)); @@ -609,9 +609,6 @@ int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, if (islink) rc = -EREMOTE; } - if (rc == -EREMOTE && IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) && cifs_sb && - (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS)) - rc = -EOPNOTSUPP; } out: diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index a8bb9d00d33a..87abce010974 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -109,7 +109,11 @@ smb2_add_credits(struct TCP_Server_Info *server, server->credits--; server->oplock_credits++; } - } + } else if ((server->in_flight > 0) && (server->oplock_credits > 3) && + ((optype & CIFS_OP_MASK) == CIFS_OBREAK_OP)) + /* if now have too many oplock credits, rebalance so don't starve normal ops */ + change_conf(server); + scredits = *val; in_flight = server->in_flight; spin_unlock(&server->req_lock); @@ -211,6 +215,16 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, spin_lock(&server->req_lock); while (1) { + spin_unlock(&server->req_lock); + + spin_lock(&server->srv_lock); + if (server->tcpStatus == CifsExiting) { + spin_unlock(&server->srv_lock); + return -ENOENT; + } + spin_unlock(&server->srv_lock); + + spin_lock(&server->req_lock); if (server->credits <= 0) { spin_unlock(&server->req_lock); cifs_num_waiters_inc(server); @@ -221,15 +235,6 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, return rc; spin_lock(&server->req_lock); } else { - spin_unlock(&server->req_lock); - spin_lock(&server->srv_lock); - if (server->tcpStatus == CifsExiting) { - spin_unlock(&server->srv_lock); - return -ENOENT; - } - spin_unlock(&server->srv_lock); - - spin_lock(&server->req_lock); scredits = server->credits; /* can deadlock with reopen */ if (scredits <= 8) { @@ -4373,8 +4378,8 @@ static void *smb2_get_aead_req(struct crypto_aead *tfm, struct smb_rqst *rqst, } sgtable.orig_nents = sgtable.nents; - rc = netfs_extract_iter_to_sg(iter, count, &sgtable, - num_sgs - sgtable.nents, 0); + rc = extract_iter_to_sg(iter, count, &sgtable, + num_sgs - sgtable.nents, 0); iov_iter_revert(iter, rc); sgtable.orig_nents = sgtable.nents; } @@ -4409,6 +4414,8 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key) } spin_unlock(&cifs_tcp_ses_lock); + trace_smb3_ses_not_found(ses_id); + return -EAGAIN; } /* @@ -4439,8 +4446,8 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, rc = smb2_get_enc_key(server, le64_to_cpu(tr_hdr->SessionId), enc, key); if (rc) { - cifs_server_dbg(VFS, "%s: Could not get %scryption key\n", __func__, - enc ? "en" : "de"); + cifs_server_dbg(FYI, "%s: Could not get %scryption key. sid: 0x%llx\n", __func__, + enc ? "en" : "de", le64_to_cpu(tr_hdr->SessionId)); return rc; } diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 17fe212ab895..e04766fe6f80 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -3797,6 +3797,12 @@ void smb2_reconnect_server(struct work_struct *work) spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { + spin_lock(&ses->ses_lock); + if (ses->ses_status == SES_EXITING) { + spin_unlock(&ses->ses_lock); + continue; + } + spin_unlock(&ses->ses_lock); tcon_selected = false; diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c index 790acf65a092..c6db898dab7c 100644 --- a/fs/smb/client/smb2transport.c +++ b/fs/smb/client/smb2transport.c @@ -92,7 +92,8 @@ int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key) if (ses->Suid == ses_id) goto found; } - cifs_server_dbg(VFS, "%s: Could not find session 0x%llx\n", + trace_smb3_ses_not_found(ses_id); + cifs_server_dbg(FYI, "%s: Could not find session 0x%llx\n", __func__, ses_id); rc = -ENOENT; goto out; @@ -153,7 +154,14 @@ smb2_find_smb_ses_unlocked(struct TCP_Server_Info *server, __u64 ses_id) list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { if (ses->Suid != ses_id) continue; + + spin_lock(&ses->ses_lock); + if (ses->ses_status == SES_EXITING) { + spin_unlock(&ses->ses_lock); + continue; + } ++ses->ses_count; + spin_unlock(&ses->ses_lock); return ses; } @@ -557,7 +565,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, rc = smb2_get_sign_key(le64_to_cpu(shdr->SessionId), server, key); if (unlikely(rc)) { - cifs_server_dbg(VFS, "%s: Could not get signing key\n", __func__); + cifs_server_dbg(FYI, "%s: Could not get signing key\n", __func__); return rc; } diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 0362ebd4fa0f..2a2aec8c6112 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -2227,7 +2227,7 @@ static int smbd_iter_to_mr(struct smbd_connection *info, memset(sgt->sgl, 0, max_sg * sizeof(struct scatterlist)); - ret = netfs_extract_iter_to_sg(iter, iov_iter_count(iter), sgt, max_sg, 0); + ret = extract_iter_to_sg(iter, iov_iter_count(iter), sgt, max_sg, 0); WARN_ON(ret < 0); if (sgt->nents > 0) sg_mark_end(&sgt->sgl[sgt->nents - 1]); @@ -2500,7 +2500,7 @@ static ssize_t smb_extract_kvec_to_rdma(struct iov_iter *iter, if (is_vmalloc_or_module_addr((void *)kaddr)) page = vmalloc_to_page((void *)kaddr); else - page = virt_to_page(kaddr); + page = virt_to_page((void *)kaddr); if (!smb_set_sge(rdma, page, off, seg)) return -EIO; diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index d3053bd8ae73..e671bd16f00c 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -1003,6 +1003,26 @@ DEFINE_EVENT(smb3_reconnect_class, smb3_##name, \ DEFINE_SMB3_RECONNECT_EVENT(reconnect); DEFINE_SMB3_RECONNECT_EVENT(partial_send_reconnect); +DECLARE_EVENT_CLASS(smb3_ses_class, + TP_PROTO(__u64 sesid), + TP_ARGS(sesid), + TP_STRUCT__entry( + __field(__u64, sesid) + ), + TP_fast_assign( + __entry->sesid = sesid; + ), + TP_printk("sid=0x%llx", + __entry->sesid) +) + +#define DEFINE_SMB3_SES_EVENT(name) \ +DEFINE_EVENT(smb3_ses_class, smb3_##name, \ + TP_PROTO(__u64 sesid), \ + TP_ARGS(sesid)) + +DEFINE_SMB3_SES_EVENT(ses_not_found); + DECLARE_EVENT_CLASS(smb3_credit_class, TP_PROTO(__u64 currmid, __u64 conn_id, diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index 0474d0bba0a2..f280502a2aee 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -522,6 +522,16 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, } while (1) { + spin_unlock(&server->req_lock); + + spin_lock(&server->srv_lock); + if (server->tcpStatus == CifsExiting) { + spin_unlock(&server->srv_lock); + return -ENOENT; + } + spin_unlock(&server->srv_lock); + + spin_lock(&server->req_lock); if (*credits < num_credits) { scredits = *credits; spin_unlock(&server->req_lock); @@ -547,15 +557,6 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, return -ERESTARTSYS; spin_lock(&server->req_lock); } else { - spin_unlock(&server->req_lock); - - spin_lock(&server->srv_lock); - if (server->tcpStatus == CifsExiting) { - spin_unlock(&server->srv_lock); - return -ENOENT; - } - spin_unlock(&server->srv_lock); - /* * For normal commands, reserve the last MAX_COMPOUND * credits to compound requests. @@ -569,7 +570,6 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, * for servers that are slow to hand out credits on * new sessions. */ - spin_lock(&server->req_lock); if (!optype && num_credits == 1 && server->in_flight > 2 * MAX_COMPOUND && *credits <= MAX_COMPOUND) { diff --git a/fs/smb/server/mgmt/tree_connect.c b/fs/smb/server/mgmt/tree_connect.c index f07a05f37651..408cddf2f094 100644 --- a/fs/smb/server/mgmt/tree_connect.c +++ b/fs/smb/server/mgmt/tree_connect.c @@ -120,17 +120,6 @@ struct ksmbd_tree_connect *ksmbd_tree_conn_lookup(struct ksmbd_session *sess, return tcon; } -struct ksmbd_share_config *ksmbd_tree_conn_share(struct ksmbd_session *sess, - unsigned int id) -{ - struct ksmbd_tree_connect *tc; - - tc = ksmbd_tree_conn_lookup(sess, id); - if (tc) - return tc->share_conf; - return NULL; -} - int ksmbd_tree_conn_session_logoff(struct ksmbd_session *sess) { int ret = 0; diff --git a/fs/smb/server/mgmt/tree_connect.h b/fs/smb/server/mgmt/tree_connect.h index 700df36cf3e3..562d647ad9fa 100644 --- a/fs/smb/server/mgmt/tree_connect.h +++ b/fs/smb/server/mgmt/tree_connect.h @@ -53,9 +53,6 @@ int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess, struct ksmbd_tree_connect *ksmbd_tree_conn_lookup(struct ksmbd_session *sess, unsigned int id); -struct ksmbd_share_config *ksmbd_tree_conn_share(struct ksmbd_session *sess, - unsigned int id); - int ksmbd_tree_conn_session_logoff(struct ksmbd_session *sess); #endif /* __TREE_CONNECT_MANAGEMENT_H__ */ diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index da1787c68ba0..cf8822103f50 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -543,7 +543,7 @@ int smb2_allocate_rsp_buf(struct ksmbd_work *work) if (le32_to_cpu(hdr->NextCommand) > 0) sz = large_sz; - work->response_buf = kvmalloc(sz, GFP_KERNEL | __GFP_ZERO); + work->response_buf = kvzalloc(sz, GFP_KERNEL); if (!work->response_buf) return -ENOMEM; @@ -1322,9 +1322,8 @@ static int decode_negotiation_token(struct ksmbd_conn *conn, static int ntlm_negotiate(struct ksmbd_work *work, struct negotiate_message *negblob, - size_t negblob_len) + size_t negblob_len, struct smb2_sess_setup_rsp *rsp) { - struct smb2_sess_setup_rsp *rsp = smb2_get_msg(work->response_buf); struct challenge_message *chgblob; unsigned char *spnego_blob = NULL; u16 spnego_blob_len; @@ -1429,10 +1428,10 @@ static struct ksmbd_user *session_user(struct ksmbd_conn *conn, return user; } -static int ntlm_authenticate(struct ksmbd_work *work) +static int ntlm_authenticate(struct ksmbd_work *work, + struct smb2_sess_setup_req *req, + struct smb2_sess_setup_rsp *rsp) { - struct smb2_sess_setup_req *req = smb2_get_msg(work->request_buf); - struct smb2_sess_setup_rsp *rsp = smb2_get_msg(work->response_buf); struct ksmbd_conn *conn = work->conn; struct ksmbd_session *sess = work->sess; struct channel *chann = NULL; @@ -1566,10 +1565,10 @@ binding_session: } #ifdef CONFIG_SMB_SERVER_KERBEROS5 -static int krb5_authenticate(struct ksmbd_work *work) +static int krb5_authenticate(struct ksmbd_work *work, + struct smb2_sess_setup_req *req, + struct smb2_sess_setup_rsp *rsp) { - struct smb2_sess_setup_req *req = smb2_get_msg(work->request_buf); - struct smb2_sess_setup_rsp *rsp = smb2_get_msg(work->response_buf); struct ksmbd_conn *conn = work->conn; struct ksmbd_session *sess = work->sess; char *in_blob, *out_blob; @@ -1647,7 +1646,9 @@ static int krb5_authenticate(struct ksmbd_work *work) return 0; } #else -static int krb5_authenticate(struct ksmbd_work *work) +static int krb5_authenticate(struct ksmbd_work *work, + struct smb2_sess_setup_req *req, + struct smb2_sess_setup_rsp *rsp) { return -EOPNOTSUPP; } @@ -1656,8 +1657,8 @@ static int krb5_authenticate(struct ksmbd_work *work) int smb2_sess_setup(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; - struct smb2_sess_setup_req *req = smb2_get_msg(work->request_buf); - struct smb2_sess_setup_rsp *rsp = smb2_get_msg(work->response_buf); + struct smb2_sess_setup_req *req; + struct smb2_sess_setup_rsp *rsp; struct ksmbd_session *sess; struct negotiate_message *negblob; unsigned int negblob_len, negblob_off; @@ -1665,6 +1666,8 @@ int smb2_sess_setup(struct ksmbd_work *work) ksmbd_debug(SMB, "Received request for session setup\n"); + WORK_BUFFERS(work, req, rsp); + rsp->StructureSize = cpu_to_le16(9); rsp->SessionFlags = 0; rsp->SecurityBufferOffset = cpu_to_le16(72); @@ -1786,7 +1789,7 @@ int smb2_sess_setup(struct ksmbd_work *work) if (conn->preferred_auth_mech & (KSMBD_AUTH_KRB5 | KSMBD_AUTH_MSKRB5)) { - rc = krb5_authenticate(work); + rc = krb5_authenticate(work, req, rsp); if (rc) { rc = -EINVAL; goto out_err; @@ -1800,7 +1803,7 @@ int smb2_sess_setup(struct ksmbd_work *work) sess->Preauth_HashValue = NULL; } else if (conn->preferred_auth_mech == KSMBD_AUTH_NTLMSSP) { if (negblob->MessageType == NtLmNegotiate) { - rc = ntlm_negotiate(work, negblob, negblob_len); + rc = ntlm_negotiate(work, negblob, negblob_len, rsp); if (rc) goto out_err; rsp->hdr.Status = @@ -1813,7 +1816,7 @@ int smb2_sess_setup(struct ksmbd_work *work) le16_to_cpu(rsp->SecurityBufferLength) - 1); } else if (negblob->MessageType == NtLmAuthenticate) { - rc = ntlm_authenticate(work); + rc = ntlm_authenticate(work, req, rsp); if (rc) goto out_err; @@ -1911,14 +1914,16 @@ out_err: int smb2_tree_connect(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; - struct smb2_tree_connect_req *req = smb2_get_msg(work->request_buf); - struct smb2_tree_connect_rsp *rsp = smb2_get_msg(work->response_buf); + struct smb2_tree_connect_req *req; + struct smb2_tree_connect_rsp *rsp; struct ksmbd_session *sess = work->sess; char *treename = NULL, *name = NULL; struct ksmbd_tree_conn_status status; struct ksmbd_share_config *share; int rc = -EINVAL; + WORK_BUFFERS(work, req, rsp); + treename = smb_strndup_from_utf16(req->Buffer, le16_to_cpu(req->PathLength), true, conn->local_nls); @@ -2087,19 +2092,19 @@ static int smb2_create_open_flags(bool file_present, __le32 access, */ int smb2_tree_disconnect(struct ksmbd_work *work) { - struct smb2_tree_disconnect_rsp *rsp = smb2_get_msg(work->response_buf); + struct smb2_tree_disconnect_rsp *rsp; + struct smb2_tree_disconnect_req *req; struct ksmbd_session *sess = work->sess; struct ksmbd_tree_connect *tcon = work->tcon; + WORK_BUFFERS(work, req, rsp); + rsp->StructureSize = cpu_to_le16(4); inc_rfc1001_len(work->response_buf, 4); ksmbd_debug(SMB, "request\n"); if (!tcon || test_and_set_bit(TREE_CONN_EXPIRE, &tcon->status)) { - struct smb2_tree_disconnect_req *req = - smb2_get_msg(work->request_buf); - ksmbd_debug(SMB, "Invalid tid %d\n", req->hdr.Id.SyncId.TreeId); rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED; @@ -2122,10 +2127,14 @@ int smb2_tree_disconnect(struct ksmbd_work *work) int smb2_session_logoff(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; - struct smb2_logoff_rsp *rsp = smb2_get_msg(work->response_buf); + struct smb2_logoff_req *req; + struct smb2_logoff_rsp *rsp; struct ksmbd_session *sess; - struct smb2_logoff_req *req = smb2_get_msg(work->request_buf); - u64 sess_id = le64_to_cpu(req->hdr.SessionId); + u64 sess_id; + + WORK_BUFFERS(work, req, rsp); + + sess_id = le64_to_cpu(req->hdr.SessionId); rsp->StructureSize = cpu_to_le16(4); inc_rfc1001_len(work->response_buf, 4); @@ -2165,12 +2174,14 @@ int smb2_session_logoff(struct ksmbd_work *work) */ static noinline int create_smb2_pipe(struct ksmbd_work *work) { - struct smb2_create_rsp *rsp = smb2_get_msg(work->response_buf); - struct smb2_create_req *req = smb2_get_msg(work->request_buf); + struct smb2_create_rsp *rsp; + struct smb2_create_req *req; int id; int err; char *name; + WORK_BUFFERS(work, req, rsp); + name = smb_strndup_from_utf16(req->Buffer, le16_to_cpu(req->NameLength), 1, work->conn->local_nls); if (IS_ERR(name)) { @@ -2872,11 +2883,9 @@ int smb2_open(struct ksmbd_work *work) if (!file_present) { daccess = cpu_to_le32(GENERIC_ALL_FLAGS); } else { - rc = ksmbd_vfs_query_maximal_access(idmap, + ksmbd_vfs_query_maximal_access(idmap, path.dentry, &daccess); - if (rc) - goto err_out; already_permitted = true; } maximal_access = daccess; @@ -5305,8 +5314,10 @@ int smb2_query_info(struct ksmbd_work *work) static noinline int smb2_close_pipe(struct ksmbd_work *work) { u64 id; - struct smb2_close_req *req = smb2_get_msg(work->request_buf); - struct smb2_close_rsp *rsp = smb2_get_msg(work->response_buf); + struct smb2_close_req *req; + struct smb2_close_rsp *rsp; + + WORK_BUFFERS(work, req, rsp); id = req->VolatileFileId; ksmbd_session_rpc_close(work->sess, id); @@ -5448,6 +5459,9 @@ int smb2_echo(struct ksmbd_work *work) { struct smb2_echo_rsp *rsp = smb2_get_msg(work->response_buf); + if (work->next_smb2_rcv_hdr_off) + rsp = ksmbd_resp_buf_next(work); + rsp->StructureSize = cpu_to_le16(4); rsp->Reserved = 0; inc_rfc1001_len(work->response_buf, 4); @@ -6082,8 +6096,10 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work) int nbytes = 0, err; u64 id; struct ksmbd_rpc_command *rpc_resp; - struct smb2_read_req *req = smb2_get_msg(work->request_buf); - struct smb2_read_rsp *rsp = smb2_get_msg(work->response_buf); + struct smb2_read_req *req; + struct smb2_read_rsp *rsp; + + WORK_BUFFERS(work, req, rsp); id = req->VolatileFileId; @@ -6096,7 +6112,7 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work) } work->aux_payload_buf = - kvmalloc(rpc_resp->payload_sz, GFP_KERNEL | __GFP_ZERO); + kvmalloc(rpc_resp->payload_sz, GFP_KERNEL); if (!work->aux_payload_buf) { err = -ENOMEM; goto out; @@ -6248,7 +6264,7 @@ int smb2_read(struct ksmbd_work *work) ksmbd_debug(SMB, "filename %pD, offset %lld, len %zu\n", fp->filp, offset, length); - work->aux_payload_buf = kvmalloc(length, GFP_KERNEL | __GFP_ZERO); + work->aux_payload_buf = kvzalloc(length, GFP_KERNEL); if (!work->aux_payload_buf) { err = -ENOMEM; goto out; @@ -6331,14 +6347,16 @@ out: */ static noinline int smb2_write_pipe(struct ksmbd_work *work) { - struct smb2_write_req *req = smb2_get_msg(work->request_buf); - struct smb2_write_rsp *rsp = smb2_get_msg(work->response_buf); + struct smb2_write_req *req; + struct smb2_write_rsp *rsp; struct ksmbd_rpc_command *rpc_resp; u64 id = 0; int err = 0, ret = 0; char *data_buf; size_t length; + WORK_BUFFERS(work, req, rsp); + length = le32_to_cpu(req->Length); id = req->VolatileFileId; @@ -6397,7 +6415,7 @@ static ssize_t smb2_write_rdma_channel(struct ksmbd_work *work, int ret; ssize_t nbytes; - data_buf = kvmalloc(length, GFP_KERNEL | __GFP_ZERO); + data_buf = kvzalloc(length, GFP_KERNEL); if (!data_buf) return -ENOMEM; @@ -6607,6 +6625,9 @@ int smb2_cancel(struct ksmbd_work *work) struct ksmbd_work *iter; struct list_head *command_list; + if (work->next_smb2_rcv_hdr_off) + hdr = ksmbd_resp_buf_next(work); + ksmbd_debug(SMB, "smb2 cancel called on mid %llu, async flags 0x%x\n", hdr->MessageId, hdr->Flags); @@ -6766,8 +6787,8 @@ static inline bool lock_defer_pending(struct file_lock *fl) */ int smb2_lock(struct ksmbd_work *work) { - struct smb2_lock_req *req = smb2_get_msg(work->request_buf); - struct smb2_lock_rsp *rsp = smb2_get_msg(work->response_buf); + struct smb2_lock_req *req; + struct smb2_lock_rsp *rsp; struct smb2_lock_element *lock_ele; struct ksmbd_file *fp = NULL; struct file_lock *flock = NULL; @@ -6784,6 +6805,8 @@ int smb2_lock(struct ksmbd_work *work) LIST_HEAD(rollback_list); int prior_lock = 0; + WORK_BUFFERS(work, req, rsp); + ksmbd_debug(SMB, "Received lock request\n"); fp = ksmbd_lookup_fd_slow(work, req->VolatileFileId, req->PersistentFileId); if (!fp) { @@ -7897,8 +7920,8 @@ out: */ static void smb20_oplock_break_ack(struct ksmbd_work *work) { - struct smb2_oplock_break *req = smb2_get_msg(work->request_buf); - struct smb2_oplock_break *rsp = smb2_get_msg(work->response_buf); + struct smb2_oplock_break *req; + struct smb2_oplock_break *rsp; struct ksmbd_file *fp; struct oplock_info *opinfo = NULL; __le32 err = 0; @@ -7907,6 +7930,8 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work) char req_oplevel = 0, rsp_oplevel = 0; unsigned int oplock_change_type; + WORK_BUFFERS(work, req, rsp); + volatile_id = req->VolatileFid; persistent_id = req->PersistentFid; req_oplevel = req->OplockLevel; @@ -8041,8 +8066,8 @@ static int check_lease_state(struct lease *lease, __le32 req_state) static void smb21_lease_break_ack(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; - struct smb2_lease_ack *req = smb2_get_msg(work->request_buf); - struct smb2_lease_ack *rsp = smb2_get_msg(work->response_buf); + struct smb2_lease_ack *req; + struct smb2_lease_ack *rsp; struct oplock_info *opinfo; __le32 err = 0; int ret = 0; @@ -8050,6 +8075,8 @@ static void smb21_lease_break_ack(struct ksmbd_work *work) __le32 lease_state; struct lease *lease; + WORK_BUFFERS(work, req, rsp); + ksmbd_debug(OPLOCK, "smb21 lease break, lease state(0x%x)\n", le32_to_cpu(req->LeaseState)); opinfo = lookup_lease_in_table(conn, req->LeaseKey); @@ -8175,8 +8202,10 @@ err_out: */ int smb2_oplock_break(struct ksmbd_work *work) { - struct smb2_oplock_break *req = smb2_get_msg(work->request_buf); - struct smb2_oplock_break *rsp = smb2_get_msg(work->response_buf); + struct smb2_oplock_break *req; + struct smb2_oplock_break *rsp; + + WORK_BUFFERS(work, req, rsp); switch (le16_to_cpu(req->StructureSize)) { case OP_BREAK_STRUCT_SIZE_20: diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c index 569e5eecdf3d..ef20f63e55e6 100644 --- a/fs/smb/server/smb_common.c +++ b/fs/smb/server/smb_common.c @@ -266,7 +266,7 @@ static int ksmbd_negotiate_smb_dialect(void *buf) if (smb2_neg_size > smb_buf_length) goto err_out; - if (smb2_neg_size + le16_to_cpu(req->DialectCount) * sizeof(__le16) > + if (struct_size(req, Dialects, le16_to_cpu(req->DialectCount)) > smb_buf_length) goto err_out; @@ -359,8 +359,8 @@ static int smb1_check_user_session(struct ksmbd_work *work) */ static int smb1_allocate_rsp_buf(struct ksmbd_work *work) { - work->response_buf = kmalloc(MAX_CIFS_SMALL_BUFFER_SIZE, - GFP_KERNEL | __GFP_ZERO); + work->response_buf = kzalloc(MAX_CIFS_SMALL_BUFFER_SIZE, + GFP_KERNEL); work->response_sz = MAX_CIFS_SMALL_BUFFER_SIZE; if (!work->response_buf) { @@ -536,7 +536,7 @@ int ksmbd_extract_shortname(struct ksmbd_conn *conn, const char *longname, out[baselen + 3] = PERIOD; if (dot_present) - memcpy(&out[baselen + 4], extension, 4); + memcpy(out + baselen + 4, extension, 4); else out[baselen + 4] = '\0'; smbConvertToUTF16((__le16 *)shortname, out, PATH_MAX, diff --git a/fs/smb/server/smb_common.h b/fs/smb/server/smb_common.h index 6b0d5f1fe85c..aeca0f46068f 100644 --- a/fs/smb/server/smb_common.h +++ b/fs/smb/server/smb_common.h @@ -200,7 +200,7 @@ struct smb_hdr { struct smb_negotiate_req { struct smb_hdr hdr; /* wct = 0 */ __le16 ByteCount; - unsigned char DialectsArray[1]; + unsigned char DialectsArray[]; } __packed; struct smb_negotiate_rsp { diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c index ad919a4239d0..e5e438bf5499 100644 --- a/fs/smb/server/smbacl.c +++ b/fs/smb/server/smbacl.c @@ -97,7 +97,7 @@ int compare_sids(const struct smb_sid *ctsid, const struct smb_sid *cwsid) /* compare all of the subauth values if any */ num_sat = ctsid->num_subauth; num_saw = cwsid->num_subauth; - num_subauth = num_sat < num_saw ? num_sat : num_saw; + num_subauth = min(num_sat, num_saw); if (num_subauth) { for (i = 0; i < num_subauth; ++i) { if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) { diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c index 40c721f9227e..b49d47bdafc9 100644 --- a/fs/smb/server/transport_ipc.c +++ b/fs/smb/server/transport_ipc.c @@ -229,7 +229,7 @@ static struct ksmbd_ipc_msg *ipc_msg_alloc(size_t sz) struct ksmbd_ipc_msg *msg; size_t msg_sz = sz + sizeof(struct ksmbd_ipc_msg); - msg = kvmalloc(msg_sz, GFP_KERNEL | __GFP_ZERO); + msg = kvzalloc(msg_sz, GFP_KERNEL); if (msg) msg->sz = sz; return msg; @@ -268,7 +268,7 @@ static int handle_response(int type, void *payload, size_t sz) entry->type + 1, type); } - entry->response = kvmalloc(sz, GFP_KERNEL | __GFP_ZERO); + entry->response = kvzalloc(sz, GFP_KERNEL); if (!entry->response) { ret = -ENOMEM; break; diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 81489fdedd8e..e35914457350 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -121,11 +121,9 @@ err_out: return -ENOENT; } -int ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap, +void ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap, struct dentry *dentry, __le32 *daccess) { - int ret = 0; - *daccess = cpu_to_le32(FILE_READ_ATTRIBUTES | READ_CONTROL); if (!inode_permission(idmap, d_inode(dentry), MAY_OPEN | MAY_WRITE)) @@ -142,8 +140,6 @@ int ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap, if (!inode_permission(idmap, d_inode(dentry->d_parent), MAY_EXEC | MAY_WRITE)) *daccess |= FILE_DELETE_LE; - - return ret; } /** @@ -440,7 +436,7 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, } if (v_len < size) { - wbuf = kvmalloc(size, GFP_KERNEL | __GFP_ZERO); + wbuf = kvzalloc(size, GFP_KERNEL); if (!wbuf) { err = -ENOMEM; goto out; @@ -857,7 +853,7 @@ ssize_t ksmbd_vfs_listxattr(struct dentry *dentry, char **list) if (size <= 0) return size; - vlist = kvmalloc(size, GFP_KERNEL | __GFP_ZERO); + vlist = kvzalloc(size, GFP_KERNEL); if (!vlist) return -ENOMEM; @@ -1207,7 +1203,7 @@ int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, err = ksmbd_vfs_path_lookup_locked(share_conf, name, flags, path); if (!err) - return err; + return 0; if (caseless) { char *filepath; diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h index 8c0931d4d531..80039312c255 100644 --- a/fs/smb/server/vfs.h +++ b/fs/smb/server/vfs.h @@ -72,7 +72,7 @@ struct ksmbd_kstat { }; int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child); -int ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap, +void ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap, struct dentry *dentry, __le32 *daccess); int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode); int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode); diff --git a/fs/splice.c b/fs/splice.c index 3e06611d19ae..004eb1c4ce31 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -33,6 +33,7 @@ #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/gfp.h> +#include <linux/net.h> #include <linux/socket.h> #include <linux/sched/signal.h> @@ -299,20 +300,36 @@ void splice_shrink_spd(struct splice_pipe_desc *spd) kfree(spd->partial); } -/* - * Splice data from an O_DIRECT file into pages and then add them to the output - * pipe. +/** + * copy_splice_read - Copy data from a file and splice the copy into a pipe + * @in: The file to read from + * @ppos: Pointer to the file position to read from + * @pipe: The pipe to splice into + * @len: The amount to splice + * @flags: The SPLICE_F_* flags + * + * This function allocates a bunch of pages sufficient to hold the requested + * amount of data (but limited by the remaining pipe capacity), passes it to + * the file's ->read_iter() to read into and then splices the used pages into + * the pipe. + * + * Return: On success, the number of bytes read will be returned and *@ppos + * will be updated if appropriate; 0 will be returned if there is no more data + * to be read; -EAGAIN will be returned if the pipe had no space, and some + * other negative error code will be returned on error. A short read may occur + * if the pipe has insufficient space, we reach the end of the data or we hit a + * hole. */ -ssize_t direct_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, - size_t len, unsigned int flags) +ssize_t copy_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) { struct iov_iter to; struct bio_vec *bv; struct kiocb kiocb; struct page **pages; ssize_t ret; - size_t used, npages, chunk, remain, reclaim; + size_t used, npages, chunk, remain, keep = 0; int i; /* Work out how much data we can actually add into the pipe */ @@ -326,7 +343,7 @@ ssize_t direct_splice_read(struct file *in, loff_t *ppos, if (!bv) return -ENOMEM; - pages = (void *)(bv + npages); + pages = (struct page **)(bv + npages); npages = alloc_pages_bulk_array(GFP_USER, npages, pages); if (!npages) { kfree(bv); @@ -349,31 +366,25 @@ ssize_t direct_splice_read(struct file *in, loff_t *ppos, kiocb.ki_pos = *ppos; ret = call_read_iter(in, &kiocb, &to); - reclaim = npages * PAGE_SIZE; - remain = 0; if (ret > 0) { - reclaim -= ret; - remain = ret; + keep = DIV_ROUND_UP(ret, PAGE_SIZE); *ppos = kiocb.ki_pos; - file_accessed(in); - } else if (ret < 0) { - /* - * callers of ->splice_read() expect -EAGAIN on - * "can't put anything in there", rather than -EFAULT. - */ - if (ret == -EFAULT) - ret = -EAGAIN; } + /* + * Callers of ->splice_read() expect -EAGAIN on "can't put anything in + * there", rather than -EFAULT. + */ + if (ret == -EFAULT) + ret = -EAGAIN; + /* Free any pages that didn't get touched at all. */ - reclaim /= PAGE_SIZE; - if (reclaim) { - npages -= reclaim; - release_pages(pages + npages, reclaim); - } + if (keep < npages) + release_pages(pages + keep, npages - keep); /* Push the remaining pages into the pipe. */ - for (i = 0; i < npages; i++) { + remain = ret; + for (i = 0; i < keep; i++) { struct pipe_buffer *buf = pipe_head_buf(pipe); chunk = min_t(size_t, remain, PAGE_SIZE); @@ -390,50 +401,7 @@ ssize_t direct_splice_read(struct file *in, loff_t *ppos, kfree(bv); return ret; } -EXPORT_SYMBOL(direct_splice_read); - -/** - * generic_file_splice_read - splice data from file to a pipe - * @in: file to splice from - * @ppos: position in @in - * @pipe: pipe to splice to - * @len: number of bytes to splice - * @flags: splice modifier flags - * - * Description: - * Will read pages from given file and fill them into a pipe. Can be - * used as long as it has more or less sane ->read_iter(). - * - */ -ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) -{ - struct iov_iter to; - struct kiocb kiocb; - int ret; - - iov_iter_pipe(&to, ITER_DEST, pipe, len); - init_sync_kiocb(&kiocb, in); - kiocb.ki_pos = *ppos; - ret = call_read_iter(in, &kiocb, &to); - if (ret > 0) { - *ppos = kiocb.ki_pos; - file_accessed(in); - } else if (ret < 0) { - /* free what was emitted */ - pipe_discard_from(pipe, to.start_head); - /* - * callers of ->splice_read() expect -EAGAIN on - * "can't put anything in there", rather than -EFAULT. - */ - if (ret == -EFAULT) - ret = -EAGAIN; - } - - return ret; -} -EXPORT_SYMBOL(generic_file_splice_read); +EXPORT_SYMBOL(copy_splice_read); const struct pipe_buf_operations default_pipe_buf_ops = { .release = generic_pipe_buf_release, @@ -448,30 +416,6 @@ const struct pipe_buf_operations nosteal_pipe_buf_ops = { }; EXPORT_SYMBOL(nosteal_pipe_buf_ops); -/* - * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' - * using sendpage(). Return the number of bytes sent. - */ -static int pipe_to_sendpage(struct pipe_inode_info *pipe, - struct pipe_buffer *buf, struct splice_desc *sd) -{ - struct file *file = sd->u.file; - loff_t pos = sd->pos; - int more; - - if (!likely(file->f_op->sendpage)) - return -EINVAL; - - more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0; - - if (sd->len < sd->total_len && - pipe_occupancy(pipe->head, pipe->tail) > 1) - more |= MSG_SENDPAGE_NOTLAST; - - return file->f_op->sendpage(file, buf->page, buf->offset, - sd->len, &pos, more); -} - static void wakeup_pipe_writers(struct pipe_inode_info *pipe) { smp_mb(); @@ -652,7 +596,7 @@ static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_des * Description: * This function does little more than loop over the pipe and call * @actor to do the actual moving of a single struct pipe_buffer to - * the desired destination. See pipe_to_file, pipe_to_sendpage, or + * the desired destination. See pipe_to_file, pipe_to_sendmsg, or * pipe_to_user. * */ @@ -833,8 +777,9 @@ done: EXPORT_SYMBOL(iter_file_splice_write); +#ifdef CONFIG_NET /** - * generic_splice_sendpage - splice data from a pipe to a socket + * splice_to_socket - splice data from a pipe to a socket * @pipe: pipe to splice from * @out: socket to write to * @ppos: position in @out @@ -846,13 +791,129 @@ EXPORT_SYMBOL(iter_file_splice_write); * is involved. * */ -ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags) +ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) { - return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); -} + struct socket *sock = sock_from_file(out); + struct bio_vec bvec[16]; + struct msghdr msg = {}; + ssize_t ret = 0; + size_t spliced = 0; + bool need_wakeup = false; + + pipe_lock(pipe); -EXPORT_SYMBOL(generic_splice_sendpage); + while (len > 0) { + unsigned int head, tail, mask, bc = 0; + size_t remain = len; + + /* + * Check for signal early to make process killable when there + * are always buffers available + */ + ret = -ERESTARTSYS; + if (signal_pending(current)) + break; + + while (pipe_empty(pipe->head, pipe->tail)) { + ret = 0; + if (!pipe->writers) + goto out; + + if (spliced) + goto out; + + ret = -EAGAIN; + if (flags & SPLICE_F_NONBLOCK) + goto out; + + ret = -ERESTARTSYS; + if (signal_pending(current)) + goto out; + + if (need_wakeup) { + wakeup_pipe_writers(pipe); + need_wakeup = false; + } + + pipe_wait_readable(pipe); + } + + head = pipe->head; + tail = pipe->tail; + mask = pipe->ring_size - 1; + + while (!pipe_empty(head, tail)) { + struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + size_t seg; + + if (!buf->len) { + tail++; + continue; + } + + seg = min_t(size_t, remain, buf->len); + + ret = pipe_buf_confirm(pipe, buf); + if (unlikely(ret)) { + if (ret == -ENODATA) + ret = 0; + break; + } + + bvec_set_page(&bvec[bc++], buf->page, seg, buf->offset); + remain -= seg; + if (remain == 0 || bc >= ARRAY_SIZE(bvec)) + break; + tail++; + } + + if (!bc) + break; + + msg.msg_flags = MSG_SPLICE_PAGES; + if (flags & SPLICE_F_MORE) + msg.msg_flags |= MSG_MORE; + if (remain && pipe_occupancy(pipe->head, tail) > 0) + msg.msg_flags |= MSG_MORE; + + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, bc, + len - remain); + ret = sock_sendmsg(sock, &msg); + if (ret <= 0) + break; + + spliced += ret; + len -= ret; + tail = pipe->tail; + while (ret > 0) { + struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + size_t seg = min_t(size_t, ret, buf->len); + + buf->offset += seg; + buf->len -= seg; + ret -= seg; + + if (!buf->len) { + pipe_buf_release(pipe, buf); + tail++; + } + } + + if (tail != pipe->tail) { + pipe->tail = tail; + if (pipe->files) + need_wakeup = true; + } + } + +out: + pipe_unlock(pipe); + if (need_wakeup) + wakeup_pipe_writers(pipe); + return spliced ?: ret; +} +#endif static int warn_unsupported(struct file *file, const char *op) { @@ -874,17 +935,42 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, } /* - * Attempt to initiate a splice from a file to a pipe. + * Indicate to the caller that there was a premature EOF when reading from the + * source and the caller didn't indicate they would be sending more data after + * this. */ -static long do_splice_to(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) +static void do_splice_eof(struct splice_desc *sd) +{ + if (sd->splice_eof) + sd->splice_eof(sd); +} + +/** + * vfs_splice_read - Read data from a file and splice it into a pipe + * @in: File to splice from + * @ppos: Input file offset + * @pipe: Pipe to splice to + * @len: Number of bytes to splice + * @flags: Splice modifier flags (SPLICE_F_*) + * + * Splice the requested amount of data from the input file to the pipe. This + * is synchronous as the caller must hold the pipe lock across the entire + * operation. + * + * If successful, it returns the amount of data spliced, 0 if it hit the EOF or + * a hole and a negative error code otherwise. + */ +long vfs_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { unsigned int p_space; int ret; if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; + if (!len) + return 0; /* Don't try to read more the pipe has space for. */ p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail); @@ -899,8 +985,15 @@ static long do_splice_to(struct file *in, loff_t *ppos, if (unlikely(!in->f_op->splice_read)) return warn_unsupported(in, "read"); + /* + * O_DIRECT and DAX don't deal with the pagecache, so we allocate a + * buffer, copy into it and splice that into the pipe. + */ + if ((in->f_flags & O_DIRECT) || IS_DAX(in->f_mapping->host)) + return copy_splice_read(in, ppos, pipe, len, flags); return in->f_op->splice_read(in, ppos, pipe, len, flags); } +EXPORT_SYMBOL_GPL(vfs_splice_read); /** * splice_direct_to_actor - splices data directly between two non-pipes @@ -956,13 +1049,17 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, */ bytes = 0; len = sd->total_len; + + /* Don't block on output, we have to drain the direct pipe. */ flags = sd->flags; + sd->flags &= ~SPLICE_F_NONBLOCK; /* - * Don't block on output, we have to drain the direct pipe. + * We signal MORE until we've read sufficient data to fulfill the + * request and we keep signalling it if the caller set it. */ - sd->flags &= ~SPLICE_F_NONBLOCK; more = sd->flags & SPLICE_F_MORE; + sd->flags |= SPLICE_F_MORE; WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail)); @@ -970,22 +1067,20 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, size_t read_len; loff_t pos = sd->pos, prev_pos = pos; - ret = do_splice_to(in, &pos, pipe, len, flags); + ret = vfs_splice_read(in, &pos, pipe, len, flags); if (unlikely(ret <= 0)) - goto out_release; + goto read_failure; read_len = ret; sd->total_len = read_len; /* - * If more data is pending, set SPLICE_F_MORE - * If this is the last data and SPLICE_F_MORE was not set - * initially, clears it. + * If we now have sufficient data to fulfill the request then + * we clear SPLICE_F_MORE if it was not set initially. */ - if (read_len < len) - sd->flags |= SPLICE_F_MORE; - else if (!more) + if (read_len >= len && !more) sd->flags &= ~SPLICE_F_MORE; + /* * NOTE: nonblocking mode only applies to the input. We * must not do the output in nonblocking mode as then we @@ -1012,6 +1107,15 @@ done: file_accessed(in); return bytes; +read_failure: + /* + * If the user did *not* set SPLICE_F_MORE *and* we didn't hit that + * "use all of len" case that cleared SPLICE_F_MORE, *and* we did a + * "->splice_in()" that returned EOF (ie zero) *and* we have sent at + * least 1 byte *then* we will also do the ->splice_eof() call. + */ + if (ret == 0 && !more && len > 0 && bytes) + do_splice_eof(sd); out_release: /* * If we did an incomplete transfer we must release @@ -1040,6 +1144,14 @@ static int direct_splice_actor(struct pipe_inode_info *pipe, sd->flags); } +static void direct_file_splice_eof(struct splice_desc *sd) +{ + struct file *file = sd->u.file; + + if (file->f_op->splice_eof) + file->f_op->splice_eof(file); +} + /** * do_splice_direct - splices data directly between two files * @in: file to splice from @@ -1065,6 +1177,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, .flags = flags, .pos = *ppos, .u.file = out, + .splice_eof = direct_file_splice_eof, .opos = opos, }; long ret; @@ -1118,7 +1231,7 @@ long splice_file_to_pipe(struct file *in, pipe_lock(opipe); ret = wait_for_space(opipe, flags); if (!ret) - ret = do_splice_to(in, offset, opipe, len, flags); + ret = vfs_splice_read(in, offset, opipe, len, flags); pipe_unlock(opipe); if (ret > 0) wakeup_pipe_readers(opipe); diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index bed3bb8b27fa..6aa9c2e1e8eb 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -17,8 +17,8 @@ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> +#include <linux/pagemap.h> #include <linux/string.h> -#include <linux/buffer_head.h> #include <linux/bio.h> #include "squashfs_fs.h" @@ -76,10 +76,101 @@ static int copy_bio_to_actor(struct bio *bio, return copied_bytes; } +static int squashfs_bio_read_cached(struct bio *fullbio, + struct address_space *cache_mapping, u64 index, int length, + u64 read_start, u64 read_end, int page_count) +{ + struct page *head_to_cache = NULL, *tail_to_cache = NULL; + struct block_device *bdev = fullbio->bi_bdev; + int start_idx = 0, end_idx = 0; + struct bvec_iter_all iter_all; + struct bio *bio = NULL; + struct bio_vec *bv; + int idx = 0; + int err = 0; + + bio_for_each_segment_all(bv, fullbio, iter_all) { + struct page *page = bv->bv_page; + + if (page->mapping == cache_mapping) { + idx++; + continue; + } + + /* + * We only use this when the device block size is the same as + * the page size, so read_start and read_end cover full pages. + * + * Compare these to the original required index and length to + * only cache pages which were requested partially, since these + * are the ones which are likely to be needed when reading + * adjacent blocks. + */ + if (idx == 0 && index != read_start) + head_to_cache = page; + else if (idx == page_count - 1 && index + length != read_end) + tail_to_cache = page; + + if (!bio || idx != end_idx) { + struct bio *new = bio_alloc_clone(bdev, fullbio, + GFP_NOIO, &fs_bio_set); + + if (bio) { + bio_trim(bio, start_idx * PAGE_SECTORS, + (end_idx - start_idx) * PAGE_SECTORS); + bio_chain(bio, new); + submit_bio(bio); + } + + bio = new; + start_idx = idx; + } + + idx++; + end_idx = idx; + } + + if (bio) { + bio_trim(bio, start_idx * PAGE_SECTORS, + (end_idx - start_idx) * PAGE_SECTORS); + err = submit_bio_wait(bio); + bio_put(bio); + } + + if (err) + return err; + + if (head_to_cache) { + int ret = add_to_page_cache_lru(head_to_cache, cache_mapping, + read_start >> PAGE_SHIFT, + GFP_NOIO); + + if (!ret) { + SetPageUptodate(head_to_cache); + unlock_page(head_to_cache); + } + + } + + if (tail_to_cache) { + int ret = add_to_page_cache_lru(tail_to_cache, cache_mapping, + (read_end >> PAGE_SHIFT) - 1, + GFP_NOIO); + + if (!ret) { + SetPageUptodate(tail_to_cache); + unlock_page(tail_to_cache); + } + } + + return 0; +} + static int squashfs_bio_read(struct super_block *sb, u64 index, int length, struct bio **biop, int *block_offset) { struct squashfs_sb_info *msblk = sb->s_fs_info; + struct address_space *cache_mapping = msblk->cache_mapping; const u64 read_start = round_down(index, msblk->devblksize); const sector_t block = read_start >> msblk->devblksize_log2; const u64 read_end = round_up(index + length, msblk->devblksize); @@ -99,21 +190,34 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length, for (i = 0; i < page_count; ++i) { unsigned int len = min_t(unsigned int, PAGE_SIZE - offset, total_len); - struct page *page = alloc_page(GFP_NOIO); + struct page *page = NULL; + + if (cache_mapping) + page = find_get_page(cache_mapping, + (read_start >> PAGE_SHIFT) + i); + if (!page) + page = alloc_page(GFP_NOIO); if (!page) { error = -ENOMEM; goto out_free_bio; } - if (!bio_add_page(bio, page, len, offset)) { - error = -EIO; - goto out_free_bio; - } + + /* + * Use the __ version to avoid merging since we need each page + * to be separate when we check for and avoid cached pages. + */ + __bio_add_page(bio, page, len, offset); offset = 0; total_len -= len; } - error = submit_bio_wait(bio); + if (cache_mapping) + error = squashfs_bio_read_cached(bio, cache_mapping, index, + length, read_start, read_end, + page_count); + else + error = submit_bio_wait(bio); if (error) goto out_free_bio; diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c index 8893cb9b4198..a676084be27e 100644 --- a/fs/squashfs/decompressor.c +++ b/fs/squashfs/decompressor.c @@ -11,7 +11,6 @@ #include <linux/types.h> #include <linux/mutex.h> #include <linux/slab.h> -#include <linux/buffer_head.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c index 1dfadf76ed9a..8a218e7c2390 100644 --- a/fs/squashfs/decompressor_multi_percpu.c +++ b/fs/squashfs/decompressor_multi_percpu.c @@ -7,7 +7,6 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/percpu.h> -#include <linux/buffer_head.h> #include <linux/local_lock.h> #include "squashfs_fs.h" diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h index 72f6f4b37863..c01998eec146 100644 --- a/fs/squashfs/squashfs_fs_sb.h +++ b/fs/squashfs/squashfs_fs_sb.h @@ -47,6 +47,7 @@ struct squashfs_sb_info { struct squashfs_cache *block_cache; struct squashfs_cache *fragment_cache; struct squashfs_cache *read_page; + struct address_space *cache_mapping; int next_meta_index; __le64 *id_table; __le64 *fragment_index; diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index e090fae48e68..22e812808e5c 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -329,6 +329,19 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) goto failed_mount; } + if (msblk->devblksize == PAGE_SIZE) { + struct inode *cache = new_inode(sb); + + if (cache == NULL) + goto failed_mount; + + set_nlink(cache, 1); + cache->i_size = OFFSET_MAX; + mapping_set_gfp_mask(cache->i_mapping, GFP_NOFS); + + msblk->cache_mapping = cache->i_mapping; + } + msblk->stream = squashfs_decompressor_setup(sb, flags); if (IS_ERR(msblk->stream)) { err = PTR_ERR(msblk->stream); @@ -454,6 +467,8 @@ failed_mount: squashfs_cache_delete(msblk->block_cache); squashfs_cache_delete(msblk->fragment_cache); squashfs_cache_delete(msblk->read_page); + if (msblk->cache_mapping) + iput(msblk->cache_mapping->host); msblk->thread_ops->destroy(msblk); kfree(msblk->inode_lookup_table); kfree(msblk->fragment_index); @@ -572,6 +587,8 @@ static void squashfs_put_super(struct super_block *sb) squashfs_cache_delete(sbi->block_cache); squashfs_cache_delete(sbi->fragment_cache); squashfs_cache_delete(sbi->read_page); + if (sbi->cache_mapping) + iput(sbi->cache_mapping->host); sbi->thread_ops->destroy(sbi); kfree(sbi->id_table); kfree(sbi->fragment_index); diff --git a/fs/super.c b/fs/super.c index 04bc62ab7dfe..e781226e2880 100644 --- a/fs/super.c +++ b/fs/super.c @@ -236,7 +236,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, &type->s_writers_key[i])) goto fail; } - init_waitqueue_head(&s->s_writers.wait_unfrozen); s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; if (s->s_user_ns != &init_user_ns) @@ -595,7 +594,7 @@ retry: fc->s_fs_info = NULL; s->s_type = fc->fs_type; s->s_iflags |= fc->s_iflags; - strlcpy(s->s_id, s->s_type->name, sizeof(s->s_id)); + strscpy(s->s_id, s->s_type->name, sizeof(s->s_id)); list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &s->s_type->fs_supers); spin_unlock(&sb_lock); @@ -674,7 +673,7 @@ retry: return ERR_PTR(err); } s->s_type = type; - strlcpy(s->s_id, type->name, sizeof(s->s_id)); + strscpy(s->s_id, type->name, sizeof(s->s_id)); list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &type->fs_supers); spin_unlock(&sb_lock); @@ -903,6 +902,7 @@ int reconfigure_super(struct fs_context *fc) struct super_block *sb = fc->root->d_sb; int retval; bool remount_ro = false; + bool remount_rw = false; bool force = fc->sb_flags & SB_FORCE; if (fc->sb_flags_mask & ~MS_RMT_MASK) @@ -920,7 +920,7 @@ int reconfigure_super(struct fs_context *fc) bdev_read_only(sb->s_bdev)) return -EACCES; #endif - + remount_rw = !(fc->sb_flags & SB_RDONLY) && sb_rdonly(sb); remount_ro = (fc->sb_flags & SB_RDONLY) && !sb_rdonly(sb); } @@ -943,13 +943,18 @@ int reconfigure_super(struct fs_context *fc) */ if (remount_ro) { if (force) { - sb->s_readonly_remount = 1; - smp_wmb(); + sb_start_ro_state_change(sb); } else { retval = sb_prepare_remount_readonly(sb); if (retval) return retval; } + } else if (remount_rw) { + /* + * Protect filesystem's reconfigure code from writes from + * userspace until reconfigure finishes. + */ + sb_start_ro_state_change(sb); } if (fc->ops->reconfigure) { @@ -965,9 +970,7 @@ int reconfigure_super(struct fs_context *fc) WRITE_ONCE(sb->s_flags, ((sb->s_flags & ~fc->sb_flags_mask) | (fc->sb_flags & fc->sb_flags_mask))); - /* Needs to be ordered wrt mnt_is_readonly() */ - smp_wmb(); - sb->s_readonly_remount = 0; + sb_end_ro_state_change(sb); /* * Some filesystems modify their metadata via some other path than the @@ -982,7 +985,7 @@ int reconfigure_super(struct fs_context *fc) return 0; cancel_readonly: - sb->s_readonly_remount = 0; + sb_end_ro_state_change(sb); return retval; } @@ -1206,6 +1209,22 @@ int get_tree_keyed(struct fs_context *fc, EXPORT_SYMBOL(get_tree_keyed); #ifdef CONFIG_BLOCK +static void fs_mark_dead(struct block_device *bdev) +{ + struct super_block *sb; + + sb = get_super(bdev); + if (!sb) + return; + + if (sb->s_op->shutdown) + sb->s_op->shutdown(sb); + drop_super(sb); +} + +static const struct blk_holder_ops fs_holder_ops = { + .mark_dead = fs_mark_dead, +}; static int set_bdev_super(struct super_block *s, void *data) { @@ -1239,16 +1258,13 @@ int get_tree_bdev(struct fs_context *fc, { struct block_device *bdev; struct super_block *s; - fmode_t mode = FMODE_READ | FMODE_EXCL; int error = 0; - if (!(fc->sb_flags & SB_RDONLY)) - mode |= FMODE_WRITE; - if (!fc->source) return invalf(fc, "No source specified"); - bdev = blkdev_get_by_path(fc->source, mode, fc->fs_type); + bdev = blkdev_get_by_path(fc->source, sb_open_mode(fc->sb_flags), + fc->fs_type, &fs_holder_ops); if (IS_ERR(bdev)) { errorf(fc, "%s: Can't open blockdev", fc->source); return PTR_ERR(bdev); @@ -1262,7 +1278,7 @@ int get_tree_bdev(struct fs_context *fc, if (bdev->bd_fsfreeze_count > 0) { mutex_unlock(&bdev->bd_fsfreeze_mutex); warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); - blkdev_put(bdev, mode); + blkdev_put(bdev, fc->fs_type); return -EBUSY; } @@ -1271,7 +1287,7 @@ int get_tree_bdev(struct fs_context *fc, s = sget_fc(fc, test_bdev_super_fc, set_bdev_super_fc); mutex_unlock(&bdev->bd_fsfreeze_mutex); if (IS_ERR(s)) { - blkdev_put(bdev, mode); + blkdev_put(bdev, fc->fs_type); return PTR_ERR(s); } @@ -1280,7 +1296,7 @@ int get_tree_bdev(struct fs_context *fc, if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) { warnf(fc, "%pg: Can't mount, would change RO state", bdev); deactivate_locked_super(s); - blkdev_put(bdev, mode); + blkdev_put(bdev, fc->fs_type); return -EBUSY; } @@ -1292,10 +1308,9 @@ int get_tree_bdev(struct fs_context *fc, * holding an active reference. */ up_write(&s->s_umount); - blkdev_put(bdev, mode); + blkdev_put(bdev, fc->fs_type); down_write(&s->s_umount); } else { - s->s_mode = mode; snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fc->fs_type->name, s->s_id); @@ -1327,13 +1342,10 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, { struct block_device *bdev; struct super_block *s; - fmode_t mode = FMODE_READ | FMODE_EXCL; int error = 0; - if (!(flags & SB_RDONLY)) - mode |= FMODE_WRITE; - - bdev = blkdev_get_by_path(dev_name, mode, fs_type); + bdev = blkdev_get_by_path(dev_name, sb_open_mode(flags), fs_type, + &fs_holder_ops); if (IS_ERR(bdev)) return ERR_CAST(bdev); @@ -1369,10 +1381,9 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, * holding an active reference. */ up_write(&s->s_umount); - blkdev_put(bdev, mode); + blkdev_put(bdev, fs_type); down_write(&s->s_umount); } else { - s->s_mode = mode; snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name, s->s_id); @@ -1392,7 +1403,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, error_s: error = PTR_ERR(s); error_bdev: - blkdev_put(bdev, mode); + blkdev_put(bdev, fs_type); error: return ERR_PTR(error); } @@ -1401,13 +1412,11 @@ EXPORT_SYMBOL(mount_bdev); void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; - fmode_t mode = sb->s_mode; bdev->bd_super = NULL; generic_shutdown_super(sb); sync_blockdev(bdev); - WARN_ON_ONCE(!(mode & FMODE_EXCL)); - blkdev_put(bdev, mode | FMODE_EXCL); + blkdev_put(bdev, sb->s_type); } EXPORT_SYMBOL(kill_block_super); @@ -1706,7 +1715,6 @@ int freeze_super(struct super_block *sb) if (ret) { sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT); - wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); return ret; } @@ -1722,7 +1730,6 @@ int freeze_super(struct super_block *sb) "VFS:Filesystem freeze failed\n"); sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_FS); - wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); return ret; } @@ -1768,7 +1775,6 @@ static int thaw_super_locked(struct super_block *sb) sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_FS); out: - wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); return 0; } diff --git a/fs/sysctls.c b/fs/sysctls.c index c701273c9432..76a0aee8c229 100644 --- a/fs/sysctls.c +++ b/fs/sysctls.c @@ -29,11 +29,10 @@ static struct ctl_table fs_shared_sysctls[] = { { } }; -DECLARE_SYSCTL_BASE(fs, fs_shared_sysctls); - static int __init init_fs_sysctls(void) { - return register_sysctl_base(fs); + register_sysctl_init("fs", fs_shared_sysctls); + return 0; } early_initcall(init_fs_sysctls); diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index eeb0e3099421..138676463336 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -118,11 +118,13 @@ static int internal_create_group(struct kobject *kobj, int update, /* Updates may happen before the object has been instantiated */ if (unlikely(update && !kobj->sd)) return -EINVAL; + if (!grp->attrs && !grp->bin_attrs) { - WARN(1, "sysfs: (bin_)attrs not set by subsystem for group: %s/%s\n", - kobj->name, grp->name ?: ""); - return -EINVAL; + pr_debug("sysfs: (bin_)attrs not set by subsystem for group: %s/%s, skipping\n", + kobj->name, grp->name ?: ""); + return 0; } + kobject_get_ownership(kobj, &uid, &gid); if (grp->name) { if (update) { @@ -142,8 +144,10 @@ static int internal_create_group(struct kobject *kobj, int update, return PTR_ERR(kn); } } - } else + } else { kn = kobj->sd; + } + kernfs_get(kn); error = create_files(kn, kobj, uid, gid, grp, update); if (error) { diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index cdb3d632c63d..0140010aa0c3 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -52,7 +52,7 @@ static int sysv_handle_dirsync(struct inode *dir) } /* - * Calls to dir_get_page()/put_and_unmap_page() must be nested according to the + * Calls to dir_get_page()/unmap_and_put_page() must be nested according to the * rules documented in mm/highmem.rst. * * NOTE: sysv_find_entry() and sysv_dotdot() act as calls to dir_get_page() @@ -103,11 +103,11 @@ static int sysv_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN), fs16_to_cpu(SYSV_SB(sb), de->inode), DT_UNKNOWN)) { - put_and_unmap_page(page, kaddr); + unmap_and_put_page(page, kaddr); return 0; } } - put_and_unmap_page(page, kaddr); + unmap_and_put_page(page, kaddr); } return 0; } @@ -131,7 +131,7 @@ static inline int namecompare(int len, int maxlen, * itself (as a parameter - res_dir). It does NOT read the inode of the * entry - you'll have to do that yourself if you want to. * - * On Success put_and_unmap_page() should be called on *res_page. + * On Success unmap_and_put_page() should be called on *res_page. * * sysv_find_entry() acts as a call to dir_get_page() and must be treated * accordingly for nesting purposes. @@ -166,7 +166,7 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_ name, de->name)) goto found; } - put_and_unmap_page(page, kaddr); + unmap_and_put_page(page, kaddr); } if (++n >= npages) @@ -209,7 +209,7 @@ int sysv_add_link(struct dentry *dentry, struct inode *inode) goto out_page; de++; } - put_and_unmap_page(page, kaddr); + unmap_and_put_page(page, kaddr); } BUG(); return -EINVAL; @@ -228,7 +228,7 @@ got_it: mark_inode_dirty(dir); err = sysv_handle_dirsync(dir); out_page: - put_and_unmap_page(page, kaddr); + unmap_and_put_page(page, kaddr); return err; out_unlock: unlock_page(page); @@ -321,12 +321,12 @@ int sysv_empty_dir(struct inode * inode) if (de->name[1] != '.' || de->name[2]) goto not_empty; } - put_and_unmap_page(page, kaddr); + unmap_and_put_page(page, kaddr); } return 1; not_empty: - put_and_unmap_page(page, kaddr); + unmap_and_put_page(page, kaddr); return 0; } @@ -352,7 +352,7 @@ int sysv_set_link(struct sysv_dir_entry *de, struct page *page, } /* - * Calls to dir_get_page()/put_and_unmap_page() must be nested according to the + * Calls to dir_get_page()/unmap_and_put_page() must be nested according to the * rules documented in mm/highmem.rst. * * sysv_dotdot() acts as a call to dir_get_page() and must be treated @@ -376,7 +376,7 @@ ino_t sysv_inode_by_name(struct dentry *dentry) if (de) { res = fs16_to_cpu(SYSV_SB(dentry->d_sb), de->inode); - put_and_unmap_page(page, de); + unmap_and_put_page(page, de); } return res; } diff --git a/fs/sysv/file.c b/fs/sysv/file.c index 50eb92557a0f..c645f60bdb7f 100644 --- a/fs/sysv/file.c +++ b/fs/sysv/file.c @@ -26,7 +26,7 @@ const struct file_operations sysv_file_operations = { .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .fsync = generic_file_fsync, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, }; static int sysv_setattr(struct mnt_idmap *idmap, diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index b22764fe669c..58d7f43a1371 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -145,6 +145,10 @@ static int alloc_branch(struct inode *inode, */ parent = block_to_cpu(SYSV_SB(inode->i_sb), branch[n-1].key); bh = sb_getblk(inode->i_sb, parent); + if (!bh) { + sysv_free_block(inode->i_sb, branch[n].key); + break; + } lock_buffer(bh); memset(bh->b_data, 0, blocksize); branch[n].bh = bh; diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index 2b2dba4c4f56..fcf163fea3ad 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -164,7 +164,7 @@ static int sysv_unlink(struct inode * dir, struct dentry * dentry) inode->i_ctime = dir->i_ctime; inode_dec_link_count(inode); } - put_and_unmap_page(page, de); + unmap_and_put_page(page, de); return err; } @@ -227,7 +227,7 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (!new_de) goto out_dir; err = sysv_set_link(new_de, new_page, old_inode); - put_and_unmap_page(new_page, new_de); + unmap_and_put_page(new_page, new_de); if (err) goto out_dir; new_inode->i_ctime = current_time(new_inode); @@ -256,9 +256,9 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, out_dir: if (dir_de) - put_and_unmap_page(dir_page, dir_de); + unmap_and_put_page(dir_page, dir_de); out_old: - put_and_unmap_page(old_page, old_de); + unmap_and_put_page(old_page, old_de); out: return err; } diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 979ab1d9d0c3..6738fe43040b 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1669,7 +1669,7 @@ const struct file_operations ubifs_file_operations = { .mmap = ubifs_file_mmap, .fsync = ubifs_fsync, .unlocked_ioctl = ubifs_ioctl, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .open = fscrypt_file_open, #ifdef CONFIG_COMPAT diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 14b9db4c80f0..ab3ffc355949 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * balloc.c * @@ -5,11 +6,6 @@ * Block allocation handling routines for the OSTA-UDF(tm) filesystem. * * COPYRIGHT - * This file is distributed under the terms of the GNU General Public - * License (GPL). Copies of the GPL can be obtained from: - * ftp://prep.ai.mit.edu/pub/gnu/GPL - * Each contributing author retains all rights to their own work. - * * (C) 1999-2001 Ben Fennema * (C) 1999 Stelias Computing Inc * diff --git a/fs/udf/dir.c b/fs/udf/dir.c index 212393b12c22..f6533f93851b 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * dir.c * @@ -5,11 +6,6 @@ * Directory handling routines for the OSTA-UDF(tm) filesystem. * * COPYRIGHT - * This file is distributed under the terms of the GNU General Public - * License (GPL). Copies of the GPL can be obtained from: - * ftp://prep.ai.mit.edu/pub/gnu/GPL - * Each contributing author retains all rights to their own work. - * * (C) 1998-2004 Ben Fennema * * HISTORY diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 654536d2b609..1c775e072b2f 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -1,14 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * directory.c * * PURPOSE * Directory related functions * - * COPYRIGHT - * This file is distributed under the terms of the GNU General Public - * License (GPL). Copies of the GPL can be obtained from: - * ftp://prep.ai.mit.edu/pub/gnu/GPL - * Each contributing author retains all rights to their own work. */ #include "udfdecl.h" diff --git a/fs/udf/file.c b/fs/udf/file.c index 8238f742377b..243840dc83ad 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * file.c * @@ -5,11 +6,6 @@ * File handling routines for the OSTA-UDF(tm) filesystem. * * COPYRIGHT - * This file is distributed under the terms of the GNU General Public - * License (GPL). Copies of the GPL can be obtained from: - * ftp://prep.ai.mit.edu/pub/gnu/GPL - * Each contributing author retains all rights to their own work. - * * (C) 1998-1999 Dave Boynton * (C) 1998-2004 Ben Fennema * (C) 1999-2000 Stelias Computing Inc @@ -209,7 +205,7 @@ const struct file_operations udf_file_operations = { .write_iter = udf_file_write_iter, .release = udf_release_file, .fsync = generic_file_fsync, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .llseek = generic_file_llseek, }; diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 8d50121778a5..5f7ac8c84798 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * ialloc.c * @@ -5,11 +6,6 @@ * Inode allocation handling routines for the OSTA-UDF(tm) filesystem. * * COPYRIGHT - * This file is distributed under the terms of the GNU General Public - * License (GPL). Copies of the GPL can be obtained from: - * ftp://prep.ai.mit.edu/pub/gnu/GPL - * Each contributing author retains all rights to their own work. - * * (C) 1998-2001 Ben Fennema * * HISTORY diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 1e71e04ae8f6..28cdfc57d946 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * inode.c * @@ -5,11 +6,6 @@ * Inode handling routines for the OSTA-UDF(tm) filesystem. * * COPYRIGHT - * This file is distributed under the terms of the GNU General Public - * License (GPL). Copies of the GPL can be obtained from: - * ftp://prep.ai.mit.edu/pub/gnu/GPL - * Each contributing author retains all rights to their own work. - * * (C) 1998 Dave Boynton * (C) 1998-2004 Ben Fennema * (C) 1999-2000 Stelias Computing Inc diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c index c87ed942d076..9d847a7a0905 100644 --- a/fs/udf/lowlevel.c +++ b/fs/udf/lowlevel.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * lowlevel.c * @@ -5,11 +6,6 @@ * Low Level Device Routines for the UDF filesystem * * COPYRIGHT - * This file is distributed under the terms of the GNU General Public - * License (GPL). Copies of the GPL can be obtained from: - * ftp://prep.ai.mit.edu/pub/gnu/GPL - * Each contributing author retains all rights to their own work. - * * (C) 1999-2001 Ben Fennema * * HISTORY diff --git a/fs/udf/misc.c b/fs/udf/misc.c index 3777468d06ce..0788593b6a1d 100644 --- a/fs/udf/misc.c +++ b/fs/udf/misc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * misc.c * @@ -5,11 +6,6 @@ * Miscellaneous routines for the OSTA-UDF(tm) filesystem. * * COPYRIGHT - * This file is distributed under the terms of the GNU General Public - * License (GPL). Copies of the GPL can be obtained from: - * ftp://prep.ai.mit.edu/pub/gnu/GPL - * Each contributing author retains all rights to their own work. - * * (C) 1998 Dave Boynton * (C) 1998-2004 Ben Fennema * (C) 1999-2000 Stelias Computing Inc diff --git a/fs/udf/namei.c b/fs/udf/namei.c index fd20423d3ed2..a95579b043ab 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * namei.c * @@ -5,11 +6,6 @@ * Inode name handling routines for the OSTA-UDF(tm) filesystem. * * COPYRIGHT - * This file is distributed under the terms of the GNU General Public - * License (GPL). Copies of the GPL can be obtained from: - * ftp://prep.ai.mit.edu/pub/gnu/GPL - * Each contributing author retains all rights to their own work. - * * (C) 1998-2004 Ben Fennema * (C) 1999-2000 Stelias Computing Inc * @@ -793,11 +789,6 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (!empty_dir(new_inode)) goto out_oiter; } - /* - * We need to protect against old_inode getting converted from - * ICB to normal directory. - */ - inode_lock_nested(old_inode, I_MUTEX_NONDIR2); retval = udf_fiiter_find_entry(old_inode, &dotdot_name, &diriter); if (retval == -ENOENT) { @@ -806,10 +797,8 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, old_inode->i_ino); retval = -EFSCORRUPTED; } - if (retval) { - inode_unlock(old_inode); + if (retval) goto out_oiter; - } has_diriter = true; tloc = lelb_to_cpu(diriter.fi.icb.extLocation); if (udf_get_lb_pblock(old_inode->i_sb, &tloc, 0) != @@ -889,7 +878,6 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, udf_dir_entry_len(&diriter.fi)); udf_fiiter_write_fi(&diriter, NULL); udf_fiiter_release(&diriter); - inode_unlock(old_inode); inode_dec_link_count(old_dir); if (new_inode) @@ -901,10 +889,8 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, } return 0; out_oiter: - if (has_diriter) { + if (has_diriter) udf_fiiter_release(&diriter); - inode_unlock(old_inode); - } udf_fiiter_release(&oiter); return retval; diff --git a/fs/udf/partition.c b/fs/udf/partition.c index 5bcfe78d5cab..af877991edc1 100644 --- a/fs/udf/partition.c +++ b/fs/udf/partition.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * partition.c * @@ -5,11 +6,6 @@ * Partition handling routines for the OSTA-UDF(tm) filesystem. * * COPYRIGHT - * This file is distributed under the terms of the GNU General Public - * License (GPL). Copies of the GPL can be obtained from: - * ftp://prep.ai.mit.edu/pub/gnu/GPL - * Each contributing author retains all rights to their own work. - * * (C) 1998-2001 Ben Fennema * * HISTORY diff --git a/fs/udf/super.c b/fs/udf/super.c index 6304e3c5c3d9..928a04d9d9e0 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * super.c * @@ -15,11 +16,6 @@ * https://www.iso.org/ * * COPYRIGHT - * This file is distributed under the terms of the GNU General Public - * License (GPL). Copies of the GPL can be obtained from: - * ftp://prep.ai.mit.edu/pub/gnu/GPL - * Each contributing author retains all rights to their own work. - * * (C) 1998 Dave Boynton * (C) 1998-2004 Ben Fennema * (C) 2000 Stelias Computing Inc diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index a34c8c4e6d21..779b5c2c75f6 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * symlink.c * @@ -5,11 +6,6 @@ * Symlink handling routines for the OSTA-UDF(tm) filesystem. * * COPYRIGHT - * This file is distributed under the terms of the GNU General Public - * License (GPL). Copies of the GPL can be obtained from: - * ftp://prep.ai.mit.edu/pub/gnu/GPL - * Each contributing author retains all rights to their own work. - * * (C) 1998-2001 Ben Fennema * (C) 1999 Stelias Computing Inc * diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 2e7ba234bab8..a686c10fd709 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * truncate.c * @@ -5,11 +6,6 @@ * Truncate handling routines for the OSTA-UDF(tm) filesystem. * * COPYRIGHT - * This file is distributed under the terms of the GNU General Public - * License (GPL). Copies of the GPL can be obtained from: - * ftp://prep.ai.mit.edu/pub/gnu/GPL - * Each contributing author retains all rights to their own work. - * * (C) 1999-2004 Ben Fennema * (C) 1999 Stelias Computing Inc * diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c index fce4ad976c8c..758163af39c2 100644 --- a/fs/udf/udftime.c +++ b/fs/udf/udftime.c @@ -1,21 +1,7 @@ +// SPDX-License-Identifier: LGPL-2.0+ /* Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc. This file is part of the GNU C Library. - Contributed by Paul Eggert (eggert@twinsun.com). - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Library General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Library General Public License for more details. - - You should have received a copy of the GNU Library General Public - License along with the GNU C Library; see the file COPYING.LIB. If not, - write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - Boston, MA 02111-1307, USA. */ + Contributed by Paul Eggert (eggert@twinsun.com). */ /* * dgb 10/02/98: ripped this from glibc source to help convert timestamps diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index 622569007b53..32c7f3d27f74 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * unicode.c * @@ -11,11 +12,6 @@ * UTF-8 is explained in the IETF RFC XXXX. * ftp://ftp.internic.net/rfc/rfcxxxx.txt * - * COPYRIGHT - * This file is distributed under the terms of the GNU General Public - * License (GPL). Copies of the GPL can be obtained from: - * ftp://prep.ai.mit.edu/pub/gnu/GPL - * Each contributing author retains all rights to their own work. */ #include "udfdecl.h" @@ -247,7 +243,7 @@ static int udf_name_from_CS0(struct super_block *sb, } if (translate) { - if (str_o_len <= 2 && str_o[0] == '.' && + if (str_o_len > 0 && str_o_len <= 2 && str_o[0] == '.' && (str_o_len == 1 || str_o[1] == '.')) needsCRC = 1; if (needsCRC) { diff --git a/fs/ufs/file.c b/fs/ufs/file.c index 7e087581be7e..6558882a89ef 100644 --- a/fs/ufs/file.c +++ b/fs/ufs/file.c @@ -41,5 +41,5 @@ const struct file_operations ufs_file_operations = { .mmap = generic_file_mmap, .open = generic_file_open, .fsync = generic_file_fsync, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, }; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 4e800bb7d2ab..7cecd49e078b 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -335,6 +335,7 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, pud_t *pud; pmd_t *pmd, _pmd; pte_t *pte; + pte_t ptent; bool ret = true; mmap_assert_locked(mm); @@ -349,20 +350,13 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, if (!pud_present(*pud)) goto out; pmd = pmd_offset(pud, address); - /* - * READ_ONCE must function as a barrier with narrower scope - * and it must be equivalent to: - * _pmd = *pmd; barrier(); - * - * This is to deal with the instability (as in - * pmd_trans_unstable) of the pmd. - */ - _pmd = READ_ONCE(*pmd); +again: + _pmd = pmdp_get_lockless(pmd); if (pmd_none(_pmd)) goto out; ret = false; - if (!pmd_present(_pmd)) + if (!pmd_present(_pmd) || pmd_devmap(_pmd)) goto out; if (pmd_trans_huge(_pmd)) { @@ -371,19 +365,20 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, goto out; } - /* - * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it - * and use the standard pte_offset_map() instead of parsing _pmd. - */ pte = pte_offset_map(pmd, address); + if (!pte) { + ret = true; + goto again; + } /* * Lockless access: we're in a wait_event so it's ok if it * changes under us. PTE markers should be handled the same as none * ptes here. */ - if (pte_none_mostly(*pte)) + ptent = ptep_get(pte); + if (pte_none_mostly(ptent)) ret = true; - if (!pte_write(*pte) && (reason & VM_UFFD_WP)) + if (!pte_write(ptent) && (reason & VM_UFFD_WP)) ret = true; pte_unmap(pte); @@ -857,31 +852,26 @@ static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps, return false; } -int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start, +int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *unmaps) { - VMA_ITERATOR(vmi, mm, start); - struct vm_area_struct *vma; - - for_each_vma_range(vmi, vma, end) { - struct userfaultfd_unmap_ctx *unmap_ctx; - struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; + struct userfaultfd_unmap_ctx *unmap_ctx; + struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; - if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) || - has_unmap_ctx(ctx, unmaps, start, end)) - continue; + if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) || + has_unmap_ctx(ctx, unmaps, start, end)) + return 0; - unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL); - if (!unmap_ctx) - return -ENOMEM; + unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL); + if (!unmap_ctx) + return -ENOMEM; - userfaultfd_ctx_get(ctx); - atomic_inc(&ctx->mmap_changing); - unmap_ctx->ctx = ctx; - unmap_ctx->start = start; - unmap_ctx->end = end; - list_add_tail(&unmap_ctx->list, unmaps); - } + userfaultfd_ctx_get(ctx); + atomic_inc(&ctx->mmap_changing); + unmap_ctx->ctx = ctx; + unmap_ctx->start = start; + unmap_ctx->end = end; + list_add_tail(&unmap_ctx->list, unmaps); return 0; } diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c index 572aa1c43b37..2307f8037efc 100644 --- a/fs/vboxsf/file.c +++ b/fs/vboxsf/file.c @@ -217,7 +217,7 @@ const struct file_operations vboxsf_reg_fops = { .open = vboxsf_file_open, .release = vboxsf_file_release, .fsync = noop_fsync, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, }; const struct inode_operations vboxsf_reg_iops = { diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c index d2f6df69f611..1fb8f4df60cb 100644 --- a/fs/vboxsf/super.c +++ b/fs/vboxsf/super.c @@ -176,7 +176,7 @@ static int vboxsf_fill_super(struct super_block *sb, struct fs_context *fc) } folder_name->size = size; folder_name->length = size - 1; - strlcpy(folder_name->string.utf8, fc->source, size); + strscpy(folder_name->string.utf8, fc->source, size); err = vboxsf_map_folder(folder_name, &sbi->root); kfree(folder_name); if (err) { diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig index a7ffd718f171..e1036e535352 100644 --- a/fs/verity/Kconfig +++ b/fs/verity/Kconfig @@ -39,14 +39,14 @@ config FS_VERITY_BUILTIN_SIGNATURES depends on FS_VERITY select SYSTEM_DATA_VERIFICATION help - Support verifying signatures of verity files against the X.509 - certificates that have been loaded into the ".fs-verity" - kernel keyring. + This option adds support for in-kernel verification of + fs-verity builtin signatures. - This is meant as a relatively simple mechanism that can be - used to provide an authenticity guarantee for verity files, as - an alternative to IMA appraisal. Userspace programs still - need to check that the verity bit is set in order to get an - authenticity guarantee. + Please take great care before using this feature. It is not + the only way to do signatures with fs-verity, and the + alternatives (such as userspace signature verification, and + IMA appraisal) can be much better. For details about the + limitations of this feature, see + Documentation/filesystems/fsverity.rst. If unsure, say N. diff --git a/fs/verity/enable.c b/fs/verity/enable.c index fc4c50e5219d..c284f46d1b53 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -7,6 +7,7 @@ #include "fsverity_private.h" +#include <crypto/hash.h> #include <linux/mount.h> #include <linux/sched/signal.h> #include <linux/uaccess.h> @@ -20,7 +21,7 @@ struct block_buffer { /* Hash a block, writing the result to the next level's pending block buffer. */ static int hash_one_block(struct inode *inode, const struct merkle_tree_params *params, - struct ahash_request *req, struct block_buffer *cur) + struct block_buffer *cur) { struct block_buffer *next = cur + 1; int err; @@ -36,8 +37,7 @@ static int hash_one_block(struct inode *inode, /* Zero-pad the block if it's shorter than the block size. */ memset(&cur->data[cur->filled], 0, params->block_size - cur->filled); - err = fsverity_hash_block(params, inode, req, virt_to_page(cur->data), - offset_in_page(cur->data), + err = fsverity_hash_block(params, inode, cur->data, &next->data[next->filled]); if (err) return err; @@ -76,7 +76,6 @@ static int build_merkle_tree(struct file *filp, struct inode *inode = file_inode(filp); const u64 data_size = inode->i_size; const int num_levels = params->num_levels; - struct ahash_request *req; struct block_buffer _buffers[1 + FS_VERITY_MAX_LEVELS + 1] = {}; struct block_buffer *buffers = &_buffers[1]; unsigned long level_offset[FS_VERITY_MAX_LEVELS]; @@ -90,9 +89,6 @@ static int build_merkle_tree(struct file *filp, return 0; } - /* This allocation never fails, since it's mempool-backed. */ - req = fsverity_alloc_hash_request(params->hash_alg, GFP_KERNEL); - /* * Allocate the block buffers. Buffer "-1" is for data blocks. * Buffers 0 <= level < num_levels are for the actual tree levels. @@ -130,7 +126,7 @@ static int build_merkle_tree(struct file *filp, fsverity_err(inode, "Short read of file data"); goto out; } - err = hash_one_block(inode, params, req, &buffers[-1]); + err = hash_one_block(inode, params, &buffers[-1]); if (err) goto out; for (level = 0; level < num_levels; level++) { @@ -141,8 +137,7 @@ static int build_merkle_tree(struct file *filp, } /* Next block at @level is full */ - err = hash_one_block(inode, params, req, - &buffers[level]); + err = hash_one_block(inode, params, &buffers[level]); if (err) goto out; err = write_merkle_tree_block(inode, @@ -162,8 +157,7 @@ static int build_merkle_tree(struct file *filp, /* Finish all nonempty pending tree blocks. */ for (level = 0; level < num_levels; level++) { if (buffers[level].filled != 0) { - err = hash_one_block(inode, params, req, - &buffers[level]); + err = hash_one_block(inode, params, &buffers[level]); if (err) goto out; err = write_merkle_tree_block(inode, @@ -183,7 +177,6 @@ static int build_merkle_tree(struct file *filp, out: for (level = -1; level < num_levels; level++) kfree(buffers[level].data); - fsverity_free_hash_request(params->hash_alg, req); return err; } @@ -215,7 +208,7 @@ static int enable_verity(struct file *filp, } desc->salt_size = arg->salt_size; - /* Get the signature if the user provided one */ + /* Get the builtin signature if the user provided one */ if (arg->sig_size && copy_from_user(desc->signature, u64_to_user_ptr(arg->sig_ptr), arg->sig_size)) { diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index d34dcc033d72..49bf3a1eb2a0 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -11,9 +11,6 @@ #define pr_fmt(fmt) "fs-verity: " fmt #include <linux/fsverity.h> -#include <linux/mempool.h> - -struct ahash_request; /* * Implementation limit: maximum depth of the Merkle tree. For now 8 is plenty; @@ -23,11 +20,10 @@ struct ahash_request; /* A hash algorithm supported by fs-verity */ struct fsverity_hash_alg { - struct crypto_ahash *tfm; /* hash tfm, allocated on demand */ + struct crypto_shash *tfm; /* hash tfm, allocated on demand */ const char *name; /* crypto API name, e.g. sha256 */ unsigned int digest_size; /* digest size in bytes, e.g. 32 for SHA-256 */ unsigned int block_size; /* block size in bytes, e.g. 64 for SHA-256 */ - mempool_t req_pool; /* mempool with a preallocated hash request */ /* * The HASH_ALGO_* constant for this algorithm. This is different from * FS_VERITY_HASH_ALG_*, which uses a different numbering scheme. @@ -37,7 +33,7 @@ struct fsverity_hash_alg { /* Merkle tree parameters: hash algorithm, initial hash state, and topology */ struct merkle_tree_params { - struct fsverity_hash_alg *hash_alg; /* the hash algorithm */ + const struct fsverity_hash_alg *hash_alg; /* the hash algorithm */ const u8 *hashstate; /* initial hash state or NULL */ unsigned int digest_size; /* same as hash_alg->digest_size */ unsigned int block_size; /* size of data and tree blocks */ @@ -83,18 +79,13 @@ struct fsverity_info { extern struct fsverity_hash_alg fsverity_hash_algs[]; -struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, - unsigned int num); -struct ahash_request *fsverity_alloc_hash_request(struct fsverity_hash_alg *alg, - gfp_t gfp_flags); -void fsverity_free_hash_request(struct fsverity_hash_alg *alg, - struct ahash_request *req); -const u8 *fsverity_prepare_hash_state(struct fsverity_hash_alg *alg, +const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, + unsigned int num); +const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, const u8 *salt, size_t salt_size); int fsverity_hash_block(const struct merkle_tree_params *params, - const struct inode *inode, struct ahash_request *req, - struct page *page, unsigned int offset, u8 *out); -int fsverity_hash_buffer(struct fsverity_hash_alg *alg, + const struct inode *inode, const void *data, u8 *out); +int fsverity_hash_buffer(const struct fsverity_hash_alg *alg, const void *data, size_t size, u8 *out); void __init fsverity_check_hash_algs(void); diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c index ea00dbedf756..c598d2035476 100644 --- a/fs/verity/hash_algs.c +++ b/fs/verity/hash_algs.c @@ -8,7 +8,6 @@ #include "fsverity_private.h" #include <crypto/hash.h> -#include <linux/scatterlist.h> /* The hash algorithms supported by fs-verity */ struct fsverity_hash_alg fsverity_hash_algs[] = { @@ -40,11 +39,11 @@ static DEFINE_MUTEX(fsverity_hash_alg_init_mutex); * * Return: pointer to the hash alg on success, else an ERR_PTR() */ -struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, - unsigned int num) +const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, + unsigned int num) { struct fsverity_hash_alg *alg; - struct crypto_ahash *tfm; + struct crypto_shash *tfm; int err; if (num >= ARRAY_SIZE(fsverity_hash_algs) || @@ -63,11 +62,7 @@ struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, if (alg->tfm != NULL) goto out_unlock; - /* - * Using the shash API would make things a bit simpler, but the ahash - * API is preferable as it allows the use of crypto accelerators. - */ - tfm = crypto_alloc_ahash(alg->name, 0, 0); + tfm = crypto_alloc_shash(alg->name, 0, 0); if (IS_ERR(tfm)) { if (PTR_ERR(tfm) == -ENOENT) { fsverity_warn(inode, @@ -84,26 +79,20 @@ struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, } err = -EINVAL; - if (WARN_ON_ONCE(alg->digest_size != crypto_ahash_digestsize(tfm))) + if (WARN_ON_ONCE(alg->digest_size != crypto_shash_digestsize(tfm))) goto err_free_tfm; - if (WARN_ON_ONCE(alg->block_size != crypto_ahash_blocksize(tfm))) - goto err_free_tfm; - - err = mempool_init_kmalloc_pool(&alg->req_pool, 1, - sizeof(struct ahash_request) + - crypto_ahash_reqsize(tfm)); - if (err) + if (WARN_ON_ONCE(alg->block_size != crypto_shash_blocksize(tfm))) goto err_free_tfm; pr_info("%s using implementation \"%s\"\n", - alg->name, crypto_ahash_driver_name(tfm)); + alg->name, crypto_shash_driver_name(tfm)); /* pairs with smp_load_acquire() above */ smp_store_release(&alg->tfm, tfm); goto out_unlock; err_free_tfm: - crypto_free_ahash(tfm); + crypto_free_shash(tfm); alg = ERR_PTR(err); out_unlock: mutex_unlock(&fsverity_hash_alg_init_mutex); @@ -111,42 +100,6 @@ out_unlock: } /** - * fsverity_alloc_hash_request() - allocate a hash request object - * @alg: the hash algorithm for which to allocate the request - * @gfp_flags: memory allocation flags - * - * This is mempool-backed, so this never fails if __GFP_DIRECT_RECLAIM is set in - * @gfp_flags. However, in that case this might need to wait for all - * previously-allocated requests to be freed. So to avoid deadlocks, callers - * must never need multiple requests at a time to make forward progress. - * - * Return: the request object on success; NULL on failure (but see above) - */ -struct ahash_request *fsverity_alloc_hash_request(struct fsverity_hash_alg *alg, - gfp_t gfp_flags) -{ - struct ahash_request *req = mempool_alloc(&alg->req_pool, gfp_flags); - - if (req) - ahash_request_set_tfm(req, alg->tfm); - return req; -} - -/** - * fsverity_free_hash_request() - free a hash request object - * @alg: the hash algorithm - * @req: the hash request object to free - */ -void fsverity_free_hash_request(struct fsverity_hash_alg *alg, - struct ahash_request *req) -{ - if (req) { - ahash_request_zero(req); - mempool_free(req, &alg->req_pool); - } -} - -/** * fsverity_prepare_hash_state() - precompute the initial hash state * @alg: hash algorithm * @salt: a salt which is to be prepended to all data to be hashed @@ -155,27 +108,24 @@ void fsverity_free_hash_request(struct fsverity_hash_alg *alg, * Return: NULL if the salt is empty, otherwise the kmalloc()'ed precomputed * initial hash state on success or an ERR_PTR() on failure. */ -const u8 *fsverity_prepare_hash_state(struct fsverity_hash_alg *alg, +const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, const u8 *salt, size_t salt_size) { u8 *hashstate = NULL; - struct ahash_request *req = NULL; + SHASH_DESC_ON_STACK(desc, alg->tfm); u8 *padded_salt = NULL; size_t padded_salt_size; - struct scatterlist sg; - DECLARE_CRYPTO_WAIT(wait); int err; + desc->tfm = alg->tfm; + if (salt_size == 0) return NULL; - hashstate = kmalloc(crypto_ahash_statesize(alg->tfm), GFP_KERNEL); + hashstate = kmalloc(crypto_shash_statesize(alg->tfm), GFP_KERNEL); if (!hashstate) return ERR_PTR(-ENOMEM); - /* This allocation never fails, since it's mempool-backed. */ - req = fsverity_alloc_hash_request(alg, GFP_KERNEL); - /* * Zero-pad the salt to the next multiple of the input size of the hash * algorithm's compression function, e.g. 64 bytes for SHA-256 or 128 @@ -190,26 +140,18 @@ const u8 *fsverity_prepare_hash_state(struct fsverity_hash_alg *alg, goto err_free; } memcpy(padded_salt, salt, salt_size); - - sg_init_one(&sg, padded_salt, padded_salt_size); - ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | - CRYPTO_TFM_REQ_MAY_BACKLOG, - crypto_req_done, &wait); - ahash_request_set_crypt(req, &sg, NULL, padded_salt_size); - - err = crypto_wait_req(crypto_ahash_init(req), &wait); + err = crypto_shash_init(desc); if (err) goto err_free; - err = crypto_wait_req(crypto_ahash_update(req), &wait); + err = crypto_shash_update(desc, padded_salt, padded_salt_size); if (err) goto err_free; - err = crypto_ahash_export(req, hashstate); + err = crypto_shash_export(desc, hashstate); if (err) goto err_free; out: - fsverity_free_hash_request(alg, req); kfree(padded_salt); return hashstate; @@ -223,9 +165,7 @@ err_free: * fsverity_hash_block() - hash a single data or hash block * @params: the Merkle tree's parameters * @inode: inode for which the hashing is being done - * @req: preallocated hash request - * @page: the page containing the block to hash - * @offset: the offset of the block within @page + * @data: virtual address of a buffer containing the block to hash * @out: output digest, size 'params->digest_size' bytes * * Hash a single data or hash block. The hash is salted if a salt is specified @@ -234,33 +174,24 @@ err_free: * Return: 0 on success, -errno on failure */ int fsverity_hash_block(const struct merkle_tree_params *params, - const struct inode *inode, struct ahash_request *req, - struct page *page, unsigned int offset, u8 *out) + const struct inode *inode, const void *data, u8 *out) { - struct scatterlist sg; - DECLARE_CRYPTO_WAIT(wait); + SHASH_DESC_ON_STACK(desc, params->hash_alg->tfm); int err; - sg_init_table(&sg, 1); - sg_set_page(&sg, page, params->block_size, offset); - ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | - CRYPTO_TFM_REQ_MAY_BACKLOG, - crypto_req_done, &wait); - ahash_request_set_crypt(req, &sg, out, params->block_size); + desc->tfm = params->hash_alg->tfm; if (params->hashstate) { - err = crypto_ahash_import(req, params->hashstate); + err = crypto_shash_import(desc, params->hashstate); if (err) { fsverity_err(inode, "Error %d importing hash state", err); return err; } - err = crypto_ahash_finup(req); + err = crypto_shash_finup(desc, data, params->block_size, out); } else { - err = crypto_ahash_digest(req); + err = crypto_shash_digest(desc, data, params->block_size, out); } - - err = crypto_wait_req(err, &wait); if (err) fsverity_err(inode, "Error %d computing block hash", err); return err; @@ -273,32 +204,12 @@ int fsverity_hash_block(const struct merkle_tree_params *params, * @size: size of data to hash, in bytes * @out: output digest, size 'alg->digest_size' bytes * - * Hash some data which is located in physically contiguous memory (i.e. memory - * allocated by kmalloc(), not by vmalloc()). No salt is used. - * * Return: 0 on success, -errno on failure */ -int fsverity_hash_buffer(struct fsverity_hash_alg *alg, +int fsverity_hash_buffer(const struct fsverity_hash_alg *alg, const void *data, size_t size, u8 *out) { - struct ahash_request *req; - struct scatterlist sg; - DECLARE_CRYPTO_WAIT(wait); - int err; - - /* This allocation never fails, since it's mempool-backed. */ - req = fsverity_alloc_hash_request(alg, GFP_KERNEL); - - sg_init_one(&sg, data, size); - ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | - CRYPTO_TFM_REQ_MAY_BACKLOG, - crypto_req_done, &wait); - ahash_request_set_crypt(req, &sg, out, size); - - err = crypto_wait_req(crypto_ahash_digest(req), &wait); - - fsverity_free_hash_request(alg, req); - return err; + return crypto_shash_tfm_digest(alg->tfm, data, size, out); } void __init fsverity_check_hash_algs(void) diff --git a/fs/verity/measure.c b/fs/verity/measure.c index 5c79ea1b2468..eec5956141da 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -61,27 +61,42 @@ EXPORT_SYMBOL_GPL(fsverity_ioctl_measure); /** * fsverity_get_digest() - get a verity file's digest * @inode: inode to get digest of - * @digest: (out) pointer to the digest - * @alg: (out) pointer to the hash algorithm enumeration + * @raw_digest: (out) the raw file digest + * @alg: (out) the digest's algorithm, as a FS_VERITY_HASH_ALG_* value + * @halg: (out) the digest's algorithm, as a HASH_ALGO_* value * - * Return the file hash algorithm and digest of an fsverity protected file. - * Assumption: before calling this, the file must have been opened. + * Retrieves the fsverity digest of the given file. The file must have been + * opened at least once since the inode was last loaded into the inode cache; + * otherwise this function will not recognize when fsverity is enabled. * - * Return: 0 on success, -errno on failure + * The file's fsverity digest consists of @raw_digest in combination with either + * @alg or @halg. (The caller can choose which one of @alg or @halg to use.) + * + * IMPORTANT: Callers *must* make use of one of the two algorithm IDs, since + * @raw_digest is meaningless without knowing which algorithm it uses! fsverity + * provides no security guarantee for users who ignore the algorithm ID, even if + * they use the digest size (since algorithms can share the same digest size). + * + * Return: The size of the raw digest in bytes, or 0 if the file doesn't have + * fsverity enabled. */ int fsverity_get_digest(struct inode *inode, - u8 digest[FS_VERITY_MAX_DIGEST_SIZE], - enum hash_algo *alg) + u8 raw_digest[FS_VERITY_MAX_DIGEST_SIZE], + u8 *alg, enum hash_algo *halg) { const struct fsverity_info *vi; const struct fsverity_hash_alg *hash_alg; vi = fsverity_get_info(inode); if (!vi) - return -ENODATA; /* not a verity file */ + return 0; /* not a verity file */ hash_alg = vi->tree_params.hash_alg; - memcpy(digest, vi->file_digest, hash_alg->digest_size); - *alg = hash_alg->algo_id; - return 0; + memcpy(raw_digest, vi->file_digest, hash_alg->digest_size); + if (alg) + *alg = hash_alg - fsverity_hash_algs; + if (halg) + *halg = hash_alg->algo_id; + return hash_alg->digest_size; } +EXPORT_SYMBOL_GPL(fsverity_get_digest); diff --git a/fs/verity/open.c b/fs/verity/open.c index 52048b7630dc..1db5106a9c38 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -32,7 +32,7 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, unsigned int log_blocksize, const u8 *salt, size_t salt_size) { - struct fsverity_hash_alg *hash_alg; + const struct fsverity_hash_alg *hash_alg; int err; u64 blocks; u64 blocks_in_level[FS_VERITY_MAX_LEVELS]; @@ -156,9 +156,9 @@ out_err: /* * Compute the file digest by hashing the fsverity_descriptor excluding the - * signature and with the sig_size field set to 0. + * builtin signature and with the sig_size field set to 0. */ -static int compute_file_digest(struct fsverity_hash_alg *hash_alg, +static int compute_file_digest(const struct fsverity_hash_alg *hash_alg, struct fsverity_descriptor *desc, u8 *file_digest) { @@ -174,7 +174,7 @@ static int compute_file_digest(struct fsverity_hash_alg *hash_alg, /* * Create a new fsverity_info from the given fsverity_descriptor (with optional - * appended signature), and check the signature if present. The + * appended builtin signature), and check the signature if present. The * fsverity_descriptor must have already undergone basic validation. */ struct fsverity_info *fsverity_create_info(const struct inode *inode, @@ -319,8 +319,8 @@ static bool validate_fsverity_descriptor(struct inode *inode, } /* - * Read the inode's fsverity_descriptor (with optional appended signature) from - * the filesystem, and do basic validation of it. + * Read the inode's fsverity_descriptor (with optional appended builtin + * signature) from the filesystem, and do basic validation of it. */ int fsverity_get_descriptor(struct inode *inode, struct fsverity_descriptor **desc_ret) diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c index 2aefc5565152..f58432772d9e 100644 --- a/fs/verity/read_metadata.c +++ b/fs/verity/read_metadata.c @@ -105,7 +105,7 @@ static int fsverity_read_descriptor(struct inode *inode, if (res) return res; - /* don't include the signature */ + /* don't include the builtin signature */ desc_size = offsetof(struct fsverity_descriptor, signature); desc->sig_size = 0; @@ -131,7 +131,7 @@ static int fsverity_read_signature(struct inode *inode, } /* - * Include only the signature. Note that fsverity_get_descriptor() + * Include only the builtin signature. fsverity_get_descriptor() * already verified that sig_size is in-bounds. */ res = fsverity_read_buffer(buf, offset, length, desc->signature, diff --git a/fs/verity/signature.c b/fs/verity/signature.c index b8c51ad40d3a..72034bc71c9d 100644 --- a/fs/verity/signature.c +++ b/fs/verity/signature.c @@ -5,6 +5,14 @@ * Copyright 2019 Google LLC */ +/* + * This file implements verification of fs-verity builtin signatures. Please + * take great care before using this feature. It is not the only way to do + * signatures with fs-verity, and the alternatives (such as userspace signature + * verification, and IMA appraisal) can be much better. For details about the + * limitations of this feature, see Documentation/filesystems/fsverity.rst. + */ + #include "fsverity_private.h" #include <linux/cred.h> diff --git a/fs/verity/verify.c b/fs/verity/verify.c index e2508222750b..433cef51f5f6 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -12,38 +12,6 @@ static struct workqueue_struct *fsverity_read_workqueue; -static inline int cmp_hashes(const struct fsverity_info *vi, - const u8 *want_hash, const u8 *real_hash, - u64 data_pos, int level) -{ - const unsigned int hsize = vi->tree_params.digest_size; - - if (memcmp(want_hash, real_hash, hsize) == 0) - return 0; - - fsverity_err(vi->inode, - "FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN", - data_pos, level, - vi->tree_params.hash_alg->name, hsize, want_hash, - vi->tree_params.hash_alg->name, hsize, real_hash); - return -EBADMSG; -} - -static bool data_is_zeroed(struct inode *inode, struct page *page, - unsigned int len, unsigned int offset) -{ - void *virt = kmap_local_page(page); - - if (memchr_inv(virt + offset, 0, len)) { - kunmap_local(virt); - fsverity_err(inode, - "FILE CORRUPTED! Data past EOF is not zeroed"); - return false; - } - kunmap_local(virt); - return true; -} - /* * Returns true if the hash block with index @hblock_idx in the tree, located in * @hpage, has already been verified. @@ -122,9 +90,7 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage, */ static bool verify_data_block(struct inode *inode, struct fsverity_info *vi, - struct ahash_request *req, struct page *data_page, - u64 data_pos, unsigned int dblock_offset_in_page, - unsigned long max_ra_pages) + const void *data, u64 data_pos, unsigned long max_ra_pages) { const struct merkle_tree_params *params = &vi->tree_params; const unsigned int hsize = params->digest_size; @@ -136,11 +102,11 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, struct { /* Page containing the hash block */ struct page *page; + /* Mapped address of the hash block (will be within @page) */ + const void *addr; /* Index of the hash block in the tree overall */ unsigned long index; - /* Byte offset of the hash block within @page */ - unsigned int offset_in_page; - /* Byte offset of the wanted hash within @page */ + /* Byte offset of the wanted hash relative to @addr */ unsigned int hoffset; } hblocks[FS_VERITY_MAX_LEVELS]; /* @@ -148,7 +114,9 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, * index of that block's hash within the current level. */ u64 hidx = data_pos >> params->log_blocksize; - int err; + + /* Up to 1 + FS_VERITY_MAX_LEVELS pages may be mapped at once */ + BUILD_BUG_ON(1 + FS_VERITY_MAX_LEVELS > KM_MAX_IDX); if (unlikely(data_pos >= inode->i_size)) { /* @@ -159,8 +127,12 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, * any part past EOF should be all zeroes. Therefore, we need * to verify that any data blocks fully past EOF are all zeroes. */ - return data_is_zeroed(inode, data_page, params->block_size, - dblock_offset_in_page); + if (memchr_inv(data, 0, params->block_size)) { + fsverity_err(inode, + "FILE CORRUPTED! Data past EOF is not zeroed"); + return false; + } + return true; } /* @@ -175,6 +147,7 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, unsigned int hblock_offset_in_page; unsigned int hoffset; struct page *hpage; + const void *haddr; /* * The index of the block in the current level; also the index @@ -192,30 +165,30 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, hblock_offset_in_page = (hblock_idx << params->log_blocksize) & ~PAGE_MASK; - /* Byte offset of the hash within the page */ - hoffset = hblock_offset_in_page + - ((hidx << params->log_digestsize) & - (params->block_size - 1)); + /* Byte offset of the hash within the block */ + hoffset = (hidx << params->log_digestsize) & + (params->block_size - 1); hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode, hpage_idx, level == 0 ? min(max_ra_pages, params->tree_pages - hpage_idx) : 0); if (IS_ERR(hpage)) { - err = PTR_ERR(hpage); fsverity_err(inode, - "Error %d reading Merkle tree page %lu", - err, hpage_idx); - goto out; + "Error %ld reading Merkle tree page %lu", + PTR_ERR(hpage), hpage_idx); + goto error; } + haddr = kmap_local_page(hpage) + hblock_offset_in_page; if (is_hash_block_verified(vi, hpage, hblock_idx)) { - memcpy_from_page(_want_hash, hpage, hoffset, hsize); + memcpy(_want_hash, haddr + hoffset, hsize); want_hash = _want_hash; + kunmap_local(haddr); put_page(hpage); goto descend; } hblocks[level].page = hpage; + hblocks[level].addr = haddr; hblocks[level].index = hblock_idx; - hblocks[level].offset_in_page = hblock_offset_in_page; hblocks[level].hoffset = hoffset; hidx = next_hidx; } @@ -225,18 +198,14 @@ descend: /* Descend the tree verifying hash blocks. */ for (; level > 0; level--) { struct page *hpage = hblocks[level - 1].page; + const void *haddr = hblocks[level - 1].addr; unsigned long hblock_idx = hblocks[level - 1].index; - unsigned int hblock_offset_in_page = - hblocks[level - 1].offset_in_page; unsigned int hoffset = hblocks[level - 1].hoffset; - err = fsverity_hash_block(params, inode, req, hpage, - hblock_offset_in_page, real_hash); - if (err) - goto out; - err = cmp_hashes(vi, want_hash, real_hash, data_pos, level - 1); - if (err) - goto out; + if (fsverity_hash_block(params, inode, haddr, real_hash) != 0) + goto error; + if (memcmp(want_hash, real_hash, hsize) != 0) + goto corrupted; /* * Mark the hash block as verified. This must be atomic and * idempotent, as the same hash block might be verified by @@ -246,29 +215,39 @@ descend: set_bit(hblock_idx, vi->hash_block_verified); else SetPageChecked(hpage); - memcpy_from_page(_want_hash, hpage, hoffset, hsize); + memcpy(_want_hash, haddr + hoffset, hsize); want_hash = _want_hash; + kunmap_local(haddr); put_page(hpage); } /* Finally, verify the data block. */ - err = fsverity_hash_block(params, inode, req, data_page, - dblock_offset_in_page, real_hash); - if (err) - goto out; - err = cmp_hashes(vi, want_hash, real_hash, data_pos, -1); -out: - for (; level > 0; level--) - put_page(hblocks[level - 1].page); + if (fsverity_hash_block(params, inode, data, real_hash) != 0) + goto error; + if (memcmp(want_hash, real_hash, hsize) != 0) + goto corrupted; + return true; - return err == 0; +corrupted: + fsverity_err(inode, + "FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN", + data_pos, level - 1, + params->hash_alg->name, hsize, want_hash, + params->hash_alg->name, hsize, real_hash); +error: + for (; level > 0; level--) { + kunmap_local(hblocks[level - 1].addr); + put_page(hblocks[level - 1].page); + } + return false; } static bool -verify_data_blocks(struct inode *inode, struct fsverity_info *vi, - struct ahash_request *req, struct folio *data_folio, - size_t len, size_t offset, unsigned long max_ra_pages) +verify_data_blocks(struct folio *data_folio, size_t len, size_t offset, + unsigned long max_ra_pages) { + struct inode *inode = data_folio->mapping->host; + struct fsverity_info *vi = inode->i_verity_info; const unsigned int block_size = vi->tree_params.block_size; u64 pos = (u64)data_folio->index << PAGE_SHIFT; @@ -278,11 +257,14 @@ verify_data_blocks(struct inode *inode, struct fsverity_info *vi, folio_test_uptodate(data_folio))) return false; do { - struct page *data_page = - folio_page(data_folio, offset >> PAGE_SHIFT); - - if (!verify_data_block(inode, vi, req, data_page, pos + offset, - offset & ~PAGE_MASK, max_ra_pages)) + void *data; + bool valid; + + data = kmap_local_folio(data_folio, offset); + valid = verify_data_block(inode, vi, data, pos + offset, + max_ra_pages); + kunmap_local(data); + if (!valid) return false; offset += block_size; len -= block_size; @@ -304,19 +286,7 @@ verify_data_blocks(struct inode *inode, struct fsverity_info *vi, */ bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset) { - struct inode *inode = folio->mapping->host; - struct fsverity_info *vi = inode->i_verity_info; - struct ahash_request *req; - bool valid; - - /* This allocation never fails, since it's mempool-backed. */ - req = fsverity_alloc_hash_request(vi->tree_params.hash_alg, GFP_NOFS); - - valid = verify_data_blocks(inode, vi, req, folio, len, offset, 0); - - fsverity_free_hash_request(vi->tree_params.hash_alg, req); - - return valid; + return verify_data_blocks(folio, len, offset, 0); } EXPORT_SYMBOL_GPL(fsverity_verify_blocks); @@ -337,15 +307,9 @@ EXPORT_SYMBOL_GPL(fsverity_verify_blocks); */ void fsverity_verify_bio(struct bio *bio) { - struct inode *inode = bio_first_page_all(bio)->mapping->host; - struct fsverity_info *vi = inode->i_verity_info; - struct ahash_request *req; struct folio_iter fi; unsigned long max_ra_pages = 0; - /* This allocation never fails, since it's mempool-backed. */ - req = fsverity_alloc_hash_request(vi->tree_params.hash_alg, GFP_NOFS); - if (bio->bi_opf & REQ_RAHEAD) { /* * If this bio is for data readahead, then we also do readahead @@ -360,14 +324,12 @@ void fsverity_verify_bio(struct bio *bio) } bio_for_each_folio_all(fi, bio) { - if (!verify_data_blocks(inode, vi, req, fi.folio, fi.length, - fi.offset, max_ra_pages)) { + if (!verify_data_blocks(fi.folio, fi.length, fi.offset, + max_ra_pages)) { bio->bi_status = BLK_STS_IOERR; break; } } - - fsverity_free_hash_request(vi->tree_params.hash_alg, req); } EXPORT_SYMBOL_GPL(fsverity_verify_bio); #endif /* CONFIG_BLOCK */ diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index ee84835ebc66..e9cc481b4ddf 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -985,7 +985,7 @@ xfs_ag_shrink_space( goto resv_err; err2 = __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, - true); + XFS_AG_RESV_NONE, true); if (err2) goto resv_err; diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index c20fe99405d8..3069194527dd 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1536,7 +1536,8 @@ xfs_alloc_ag_vextent_lastblock( */ STATIC int xfs_alloc_ag_vextent_near( - struct xfs_alloc_arg *args) + struct xfs_alloc_arg *args, + uint32_t alloc_flags) { struct xfs_alloc_cur acur = {}; int error; /* error code */ @@ -1555,6 +1556,8 @@ xfs_alloc_ag_vextent_near( if (args->agbno > args->max_agbno) args->agbno = args->max_agbno; + /* Retry once quickly if we find busy extents before blocking. */ + alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH; restart: len = 0; @@ -1610,9 +1613,20 @@ restart: */ if (!acur.len) { if (acur.busy) { + /* + * Our only valid extents must have been busy. Flush and + * retry the allocation again. If we get an -EAGAIN + * error, we're being told that a deadlock was avoided + * and the current transaction needs committing before + * the allocation can be retried. + */ trace_xfs_alloc_near_busy(args); - xfs_extent_busy_flush(args->mp, args->pag, - acur.busy_gen); + error = xfs_extent_busy_flush(args->tp, args->pag, + acur.busy_gen, alloc_flags); + if (error) + goto out; + + alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH; goto restart; } trace_xfs_alloc_size_neither(args); @@ -1635,22 +1649,25 @@ out: * and of the form k * prod + mod unless there's nothing that large. * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. */ -STATIC int /* error */ +static int xfs_alloc_ag_vextent_size( - xfs_alloc_arg_t *args) /* allocation argument structure */ + struct xfs_alloc_arg *args, + uint32_t alloc_flags) { - struct xfs_agf *agf = args->agbp->b_addr; - struct xfs_btree_cur *bno_cur; /* cursor for bno btree */ - struct xfs_btree_cur *cnt_cur; /* cursor for cnt btree */ - int error; /* error result */ - xfs_agblock_t fbno; /* start of found freespace */ - xfs_extlen_t flen; /* length of found freespace */ - int i; /* temp status variable */ - xfs_agblock_t rbno; /* returned block number */ - xfs_extlen_t rlen; /* length of returned extent */ - bool busy; - unsigned busy_gen; + struct xfs_agf *agf = args->agbp->b_addr; + struct xfs_btree_cur *bno_cur; + struct xfs_btree_cur *cnt_cur; + xfs_agblock_t fbno; /* start of found freespace */ + xfs_extlen_t flen; /* length of found freespace */ + xfs_agblock_t rbno; /* returned block number */ + xfs_extlen_t rlen; /* length of returned extent */ + bool busy; + unsigned busy_gen; + int error; + int i; + /* Retry once quickly if we find busy extents before blocking. */ + alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH; restart: /* * Allocate and initialize a cursor for the by-size btree. @@ -1708,19 +1725,25 @@ restart: error = xfs_btree_increment(cnt_cur, 0, &i); if (error) goto error0; - if (i == 0) { - /* - * Our only valid extents must have been busy. - * Make it unbusy by forcing the log out and - * retrying. - */ - xfs_btree_del_cursor(cnt_cur, - XFS_BTREE_NOERROR); - trace_xfs_alloc_size_busy(args); - xfs_extent_busy_flush(args->mp, - args->pag, busy_gen); - goto restart; - } + if (i) + continue; + + /* + * Our only valid extents must have been busy. Flush and + * retry the allocation again. If we get an -EAGAIN + * error, we're being told that a deadlock was avoided + * and the current transaction needs committing before + * the allocation can be retried. + */ + trace_xfs_alloc_size_busy(args); + error = xfs_extent_busy_flush(args->tp, args->pag, + busy_gen, alloc_flags); + if (error) + goto error0; + + alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH; + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + goto restart; } } @@ -1800,9 +1823,21 @@ restart: args->len = rlen; if (rlen < args->minlen) { if (busy) { - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + /* + * Our only valid extents must have been busy. Flush and + * retry the allocation again. If we get an -EAGAIN + * error, we're being told that a deadlock was avoided + * and the current transaction needs committing before + * the allocation can be retried. + */ trace_xfs_alloc_size_busy(args); - xfs_extent_busy_flush(args->mp, args->pag, busy_gen); + error = xfs_extent_busy_flush(args->tp, args->pag, + busy_gen, alloc_flags); + if (error) + goto error0; + + alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH; + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); goto restart; } goto out_nominleft; @@ -2435,23 +2470,25 @@ static int xfs_defer_agfl_block( struct xfs_trans *tp, xfs_agnumber_t agno, - xfs_fsblock_t agbno, + xfs_agblock_t agbno, struct xfs_owner_info *oinfo) { struct xfs_mount *mp = tp->t_mountp; struct xfs_extent_free_item *xefi; + xfs_fsblock_t fsbno = XFS_AGB_TO_FSB(mp, agno, agbno); ASSERT(xfs_extfree_item_cache != NULL); ASSERT(oinfo != NULL); + if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, fsbno))) + return -EFSCORRUPTED; + xefi = kmem_cache_zalloc(xfs_extfree_item_cache, GFP_KERNEL | __GFP_NOFAIL); - xefi->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); + xefi->xefi_startblock = fsbno; xefi->xefi_blockcount = 1; xefi->xefi_owner = oinfo->oi_owner; - - if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, xefi->xefi_startblock))) - return -EFSCORRUPTED; + xefi->xefi_agresv = XFS_AG_RESV_AGFL; trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); @@ -2470,6 +2507,7 @@ __xfs_free_extent_later( xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type type, bool skip_discard) { struct xfs_extent_free_item *xefi; @@ -2490,6 +2528,7 @@ __xfs_free_extent_later( ASSERT(agbno + len <= mp->m_sb.sb_agblocks); #endif ASSERT(xfs_extfree_item_cache != NULL); + ASSERT(type != XFS_AG_RESV_AGFL); if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len))) return -EFSCORRUPTED; @@ -2498,6 +2537,7 @@ __xfs_free_extent_later( GFP_KERNEL | __GFP_NOFAIL); xefi->xefi_startblock = bno; xefi->xefi_blockcount = (xfs_extlen_t)len; + xefi->xefi_agresv = type; if (skip_discard) xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD; if (oinfo) { @@ -2568,7 +2608,7 @@ out: int /* error */ xfs_alloc_fix_freelist( struct xfs_alloc_arg *args, /* allocation argument structure */ - int flags) /* XFS_ALLOC_FLAG_... */ + uint32_t alloc_flags) { struct xfs_mount *mp = args->mp; struct xfs_perag *pag = args->pag; @@ -2584,7 +2624,7 @@ xfs_alloc_fix_freelist( ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); if (!xfs_perag_initialised_agf(pag)) { - error = xfs_alloc_read_agf(pag, tp, flags, &agbp); + error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp); if (error) { /* Couldn't lock the AGF so skip this AG. */ if (error == -EAGAIN) @@ -2600,13 +2640,13 @@ xfs_alloc_fix_freelist( */ if (xfs_perag_prefers_metadata(pag) && (args->datatype & XFS_ALLOC_USERDATA) && - (flags & XFS_ALLOC_FLAG_TRYLOCK)) { - ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); + (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK)) { + ASSERT(!(alloc_flags & XFS_ALLOC_FLAG_FREEING)); goto out_agbp_relse; } need = xfs_alloc_min_freelist(mp, pag); - if (!xfs_alloc_space_available(args, need, flags | + if (!xfs_alloc_space_available(args, need, alloc_flags | XFS_ALLOC_FLAG_CHECK)) goto out_agbp_relse; @@ -2615,7 +2655,7 @@ xfs_alloc_fix_freelist( * Can fail if we're not blocking on locks, and it's held. */ if (!agbp) { - error = xfs_alloc_read_agf(pag, tp, flags, &agbp); + error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp); if (error) { /* Couldn't lock the AGF so skip this AG. */ if (error == -EAGAIN) @@ -2630,7 +2670,7 @@ xfs_alloc_fix_freelist( /* If there isn't enough total space or single-extent, reject it. */ need = xfs_alloc_min_freelist(mp, pag); - if (!xfs_alloc_space_available(args, need, flags)) + if (!xfs_alloc_space_available(args, need, alloc_flags)) goto out_agbp_relse; #ifdef DEBUG @@ -2668,11 +2708,12 @@ xfs_alloc_fix_freelist( */ memset(&targs, 0, sizeof(targs)); /* struct copy below */ - if (flags & XFS_ALLOC_FLAG_NORMAP) + if (alloc_flags & XFS_ALLOC_FLAG_NORMAP) targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; else targs.oinfo = XFS_RMAP_OINFO_AG; - while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) { + while (!(alloc_flags & XFS_ALLOC_FLAG_NOSHRINK) && + pag->pagf_flcount > need) { error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0); if (error) goto out_agbp_relse; @@ -2700,7 +2741,7 @@ xfs_alloc_fix_freelist( targs.resv = XFS_AG_RESV_AGFL; /* Allocate as many blocks as possible at once. */ - error = xfs_alloc_ag_vextent_size(&targs); + error = xfs_alloc_ag_vextent_size(&targs, alloc_flags); if (error) goto out_agflbp_relse; @@ -2710,7 +2751,7 @@ xfs_alloc_fix_freelist( * on a completely full ag. */ if (targs.agbno == NULLAGBLOCK) { - if (flags & XFS_ALLOC_FLAG_FREEING) + if (alloc_flags & XFS_ALLOC_FLAG_FREEING) break; goto out_agflbp_relse; } @@ -2916,6 +2957,47 @@ xfs_alloc_put_freelist( } /* + * Check that this AGF/AGI header's sequence number and length matches the AG + * number and size in fsblocks. + */ +xfs_failaddr_t +xfs_validate_ag_length( + struct xfs_buf *bp, + uint32_t seqno, + uint32_t length) +{ + struct xfs_mount *mp = bp->b_mount; + /* + * During growfs operations, the perag is not fully initialised, + * so we can't use it for any useful checking. growfs ensures we can't + * use it by using uncached buffers that don't have the perag attached + * so we can detect and avoid this problem. + */ + if (bp->b_pag && seqno != bp->b_pag->pag_agno) + return __this_address; + + /* + * Only the last AG in the filesystem is allowed to be shorter + * than the AG size recorded in the superblock. + */ + if (length != mp->m_sb.sb_agblocks) { + /* + * During growfs, the new last AG can get here before we + * have updated the superblock. Give it a pass on the seqno + * check. + */ + if (bp->b_pag && seqno != mp->m_sb.sb_agcount - 1) + return __this_address; + if (length < XFS_MIN_AG_BLOCKS) + return __this_address; + if (length > mp->m_sb.sb_agblocks) + return __this_address; + } + + return NULL; +} + +/* * Verify the AGF is consistent. * * We do not verify the AGFL indexes in the AGF are fully consistent here @@ -2934,6 +3016,9 @@ xfs_agf_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_agf *agf = bp->b_addr; + xfs_failaddr_t fa; + uint32_t agf_seqno = be32_to_cpu(agf->agf_seqno); + uint32_t agf_length = be32_to_cpu(agf->agf_length); if (xfs_has_crc(mp)) { if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid)) @@ -2945,18 +3030,26 @@ xfs_agf_verify( if (!xfs_verify_magic(bp, agf->agf_magicnum)) return __this_address; - if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && - be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && - be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) && - be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) && - be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp))) + if (!XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum))) return __this_address; - if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_dblocks) + /* + * Both agf_seqno and agf_length need to validated before anything else + * block number related in the AGF or AGFL can be checked. + */ + fa = xfs_validate_ag_length(bp, agf_seqno, agf_length); + if (fa) + return fa; + + if (be32_to_cpu(agf->agf_flfirst) >= xfs_agfl_size(mp)) + return __this_address; + if (be32_to_cpu(agf->agf_fllast) >= xfs_agfl_size(mp)) + return __this_address; + if (be32_to_cpu(agf->agf_flcount) > xfs_agfl_size(mp)) return __this_address; if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) || - be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length)) + be32_to_cpu(agf->agf_freeblks) > agf_length) return __this_address; if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 || @@ -2967,38 +3060,28 @@ xfs_agf_verify( mp->m_alloc_maxlevels) return __this_address; - if (xfs_has_rmapbt(mp) && - (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 || - be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > - mp->m_rmap_maxlevels)) - return __this_address; - - if (xfs_has_rmapbt(mp) && - be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length)) + if (xfs_has_lazysbcount(mp) && + be32_to_cpu(agf->agf_btreeblks) > agf_length) return __this_address; - /* - * during growfs operations, the perag is not fully initialised, - * so we can't use it for any useful checking. growfs ensures we can't - * use it by using uncached buffers that don't have the perag attached - * so we can detect and avoid this problem. - */ - if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno) - return __this_address; + if (xfs_has_rmapbt(mp)) { + if (be32_to_cpu(agf->agf_rmap_blocks) > agf_length) + return __this_address; - if (xfs_has_lazysbcount(mp) && - be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length)) - return __this_address; + if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > + mp->m_rmap_maxlevels) + return __this_address; + } - if (xfs_has_reflink(mp) && - be32_to_cpu(agf->agf_refcount_blocks) > - be32_to_cpu(agf->agf_length)) - return __this_address; + if (xfs_has_reflink(mp)) { + if (be32_to_cpu(agf->agf_refcount_blocks) > agf_length) + return __this_address; - if (xfs_has_reflink(mp) && - (be32_to_cpu(agf->agf_refcount_level) < 1 || - be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels)) - return __this_address; + if (be32_to_cpu(agf->agf_refcount_level) < 1 || + be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels) + return __this_address; + } return NULL; } @@ -3226,7 +3309,7 @@ xfs_alloc_vextent_check_args( static int xfs_alloc_vextent_prepare_ag( struct xfs_alloc_arg *args, - uint32_t flags) + uint32_t alloc_flags) { bool need_pag = !args->pag; int error; @@ -3235,7 +3318,7 @@ xfs_alloc_vextent_prepare_ag( args->pag = xfs_perag_get(args->mp, args->agno); args->agbp = NULL; - error = xfs_alloc_fix_freelist(args, flags); + error = xfs_alloc_fix_freelist(args, alloc_flags); if (error) { trace_xfs_alloc_vextent_nofix(args); if (need_pag) @@ -3357,6 +3440,7 @@ xfs_alloc_vextent_this_ag( { struct xfs_mount *mp = args->mp; xfs_agnumber_t minimum_agno; + uint32_t alloc_flags = 0; int error; ASSERT(args->pag != NULL); @@ -3375,9 +3459,9 @@ xfs_alloc_vextent_this_ag( return error; } - error = xfs_alloc_vextent_prepare_ag(args, 0); + error = xfs_alloc_vextent_prepare_ag(args, alloc_flags); if (!error && args->agbp) - error = xfs_alloc_ag_vextent_size(args); + error = xfs_alloc_ag_vextent_size(args, alloc_flags); return xfs_alloc_vextent_finish(args, minimum_agno, error, false); } @@ -3406,20 +3490,20 @@ xfs_alloc_vextent_iterate_ags( xfs_agnumber_t minimum_agno, xfs_agnumber_t start_agno, xfs_agblock_t target_agbno, - uint32_t flags) + uint32_t alloc_flags) { struct xfs_mount *mp = args->mp; xfs_agnumber_t restart_agno = minimum_agno; xfs_agnumber_t agno; int error = 0; - if (flags & XFS_ALLOC_FLAG_TRYLOCK) + if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK) restart_agno = 0; restart: for_each_perag_wrap_range(mp, start_agno, restart_agno, mp->m_sb.sb_agcount, agno, args->pag) { args->agno = agno; - error = xfs_alloc_vextent_prepare_ag(args, flags); + error = xfs_alloc_vextent_prepare_ag(args, alloc_flags); if (error) break; if (!args->agbp) { @@ -3433,10 +3517,10 @@ restart: */ if (args->agno == start_agno && target_agbno) { args->agbno = target_agbno; - error = xfs_alloc_ag_vextent_near(args); + error = xfs_alloc_ag_vextent_near(args, alloc_flags); } else { args->agbno = 0; - error = xfs_alloc_ag_vextent_size(args); + error = xfs_alloc_ag_vextent_size(args, alloc_flags); } break; } @@ -3453,8 +3537,8 @@ restart: * constraining flags by the caller, drop them and retry the allocation * without any constraints being set. */ - if (flags) { - flags = 0; + if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK) { + alloc_flags &= ~XFS_ALLOC_FLAG_TRYLOCK; restart_agno = minimum_agno; goto restart; } @@ -3482,6 +3566,7 @@ xfs_alloc_vextent_start_ag( xfs_agnumber_t start_agno; xfs_agnumber_t rotorstep = xfs_rotorstep; bool bump_rotor = false; + uint32_t alloc_flags = XFS_ALLOC_FLAG_TRYLOCK; int error; ASSERT(args->pag == NULL); @@ -3508,7 +3593,7 @@ xfs_alloc_vextent_start_ag( start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target)); error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno, - XFS_FSB_TO_AGBNO(mp, target), XFS_ALLOC_FLAG_TRYLOCK); + XFS_FSB_TO_AGBNO(mp, target), alloc_flags); if (bump_rotor) { if (args->agno == start_agno) @@ -3535,6 +3620,7 @@ xfs_alloc_vextent_first_ag( struct xfs_mount *mp = args->mp; xfs_agnumber_t minimum_agno; xfs_agnumber_t start_agno; + uint32_t alloc_flags = XFS_ALLOC_FLAG_TRYLOCK; int error; ASSERT(args->pag == NULL); @@ -3553,7 +3639,7 @@ xfs_alloc_vextent_first_ag( start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target)); error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno, - XFS_FSB_TO_AGBNO(mp, target), 0); + XFS_FSB_TO_AGBNO(mp, target), alloc_flags); return xfs_alloc_vextent_finish(args, minimum_agno, error, true); } @@ -3606,6 +3692,7 @@ xfs_alloc_vextent_near_bno( struct xfs_mount *mp = args->mp; xfs_agnumber_t minimum_agno; bool needs_perag = args->pag == NULL; + uint32_t alloc_flags = 0; int error; if (!needs_perag) @@ -3626,9 +3713,9 @@ xfs_alloc_vextent_near_bno( if (needs_perag) args->pag = xfs_perag_grab(mp, args->agno); - error = xfs_alloc_vextent_prepare_ag(args, 0); + error = xfs_alloc_vextent_prepare_ag(args, alloc_flags); if (!error && args->agbp) - error = xfs_alloc_ag_vextent_near(args); + error = xfs_alloc_ag_vextent_near(args, alloc_flags); return xfs_alloc_vextent_finish(args, minimum_agno, error, needs_perag); } @@ -3756,15 +3843,11 @@ xfs_alloc_query_range( xfs_alloc_query_range_fn fn, void *priv) { - union xfs_btree_irec low_brec; - union xfs_btree_irec high_brec; - struct xfs_alloc_query_range_info query; + union xfs_btree_irec low_brec = { .a = *low_rec }; + union xfs_btree_irec high_brec = { .a = *high_rec }; + struct xfs_alloc_query_range_info query = { .priv = priv, .fn = fn }; ASSERT(cur->bc_btnum == XFS_BTNUM_BNO); - low_brec.a = *low_rec; - high_brec.a = *high_rec; - query.priv = priv; - query.fn = fn; return xfs_btree_query_range(cur, &low_brec, &high_brec, xfs_alloc_query_range_helper, &query); } diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 85ac470be0da..6bb8d295c321 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -19,11 +19,12 @@ unsigned int xfs_agfl_size(struct xfs_mount *mp); /* * Flags for xfs_alloc_fix_freelist. */ -#define XFS_ALLOC_FLAG_TRYLOCK 0x00000001 /* use trylock for buffer locking */ -#define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents*/ -#define XFS_ALLOC_FLAG_NORMAP 0x00000004 /* don't modify the rmapbt */ -#define XFS_ALLOC_FLAG_NOSHRINK 0x00000008 /* don't shrink the freelist */ -#define XFS_ALLOC_FLAG_CHECK 0x00000010 /* test only, don't modify args */ +#define XFS_ALLOC_FLAG_TRYLOCK (1U << 0) /* use trylock for buffer locking */ +#define XFS_ALLOC_FLAG_FREEING (1U << 1) /* indicate caller is freeing extents*/ +#define XFS_ALLOC_FLAG_NORMAP (1U << 2) /* don't modify the rmapbt */ +#define XFS_ALLOC_FLAG_NOSHRINK (1U << 3) /* don't shrink the freelist */ +#define XFS_ALLOC_FLAG_CHECK (1U << 4) /* test only, don't modify args */ +#define XFS_ALLOC_FLAG_TRYFLUSH (1U << 5) /* don't wait in busy extent flush */ /* * Argument structure for xfs_alloc routines. @@ -195,7 +196,7 @@ int xfs_alloc_read_agfl(struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf **bpp); int xfs_free_agfl_block(struct xfs_trans *, xfs_agnumber_t, xfs_agblock_t, struct xfs_buf *, struct xfs_owner_info *); -int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags); +int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, uint32_t alloc_flags); int xfs_free_extent_fix_freelist(struct xfs_trans *tp, struct xfs_perag *pag, struct xfs_buf **agbp); @@ -232,7 +233,7 @@ xfs_buf_to_agfl_bno( int __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo, - bool skip_discard); + enum xfs_ag_resv_type type, bool skip_discard); /* * List of extents to be free "later". @@ -245,6 +246,7 @@ struct xfs_extent_free_item { xfs_extlen_t xefi_blockcount;/* number of blocks in extent */ struct xfs_perag *xefi_pag; unsigned int xefi_flags; + enum xfs_ag_resv_type xefi_agresv; }; void xfs_extent_free_get_group(struct xfs_mount *mp, @@ -259,9 +261,10 @@ xfs_free_extent_later( struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, - const struct xfs_owner_info *oinfo) + const struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type type) { - return __xfs_free_extent_later(tp, bno, len, oinfo, false); + return __xfs_free_extent_later(tp, bno, len, oinfo, type, false); } @@ -270,4 +273,7 @@ extern struct kmem_cache *xfs_extfree_item_cache; int __init xfs_extfree_intent_init_cache(void); void xfs_extfree_intent_destroy_cache(void); +xfs_failaddr_t xfs_validate_ag_length(struct xfs_buf *bp, uint32_t seqno, + uint32_t length); + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index beee51ad75ce..2580ae47209a 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -2293,8 +2293,6 @@ xfs_attr3_leaf_unbalance( trace_xfs_attr_leaf_unbalance(state->args); - drop_leaf = drop_blk->bp->b_addr; - save_leaf = save_blk->bp->b_addr; xfs_attr3_leaf_hdr_from_disk(state->args->geo, &drophdr, drop_leaf); xfs_attr3_leaf_hdr_from_disk(state->args->geo, &savehdr, save_leaf); entry = xfs_attr3_leaf_entryp(drop_leaf); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index fef35696adb7..30c931b38853 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -574,7 +574,8 @@ xfs_bmap_btree_to_extents( return error; xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); - error = xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo); + error = xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo, + XFS_AG_RESV_NONE); if (error) return error; @@ -5236,8 +5237,9 @@ xfs_bmap_del_extent_real( } else { error = __xfs_free_extent_later(tp, del->br_startblock, del->br_blockcount, NULL, - (bflags & XFS_BMAPI_NODISCARD) || - del->br_state == XFS_EXT_UNWRITTEN); + XFS_AG_RESV_NONE, + ((bflags & XFS_BMAPI_NODISCARD) || + del->br_state == XFS_EXT_UNWRITTEN)); if (error) goto done; } diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 36564ae3084f..bf3f1b36fdd2 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -271,7 +271,8 @@ xfs_bmbt_free_block( int error; xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); - error = xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo); + error = xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo, + XFS_AG_RESV_NONE); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index a2aa36b23e25..4d68a58be160 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -301,7 +301,7 @@ struct xfs_btree_cur static inline size_t xfs_btree_cur_sizeof(unsigned int nlevels) { - return struct_size((struct xfs_btree_cur *)NULL, bc_levels, nlevels); + return struct_size_t(struct xfs_btree_cur, bc_levels, nlevels); } /* cursor flags */ diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 1cfd5bc6520a..9c60ebb328b4 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -257,6 +257,8 @@ typedef struct xfs_fsop_resblks { #define XFS_MAX_AG_BLOCKS (XFS_MAX_AG_BYTES / XFS_MIN_BLOCKSIZE) #define XFS_MAX_CRC_AG_BLOCKS (XFS_MAX_AG_BYTES / XFS_MIN_CRC_BLOCKSIZE) +#define XFS_MAX_AGNUMBER ((xfs_agnumber_t)(NULLAGNUMBER - 1)) + /* keep the maximum size under 2^31 by a small amount */ #define XFS_MAX_LOG_BYTES \ ((2 * 1024 * 1024 * 1024ULL) - XFS_MIN_LOG_BYTES) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 34600f94c2f4..b83e54c70906 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1853,8 +1853,8 @@ xfs_difree_inode_chunk( /* not sparse, calculate extent info directly */ return xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, sagbno), - M_IGEO(mp)->ialloc_blks, - &XFS_RMAP_OINFO_INODES); + M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES, + XFS_AG_RESV_NONE); } /* holemask is only 16-bits (fits in an unsigned long) */ @@ -1899,8 +1899,8 @@ xfs_difree_inode_chunk( ASSERT(agbno % mp->m_sb.sb_spino_align == 0); ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); error = xfs_free_extent_later(tp, - XFS_AGB_TO_FSB(mp, agno, agbno), - contigblk, &XFS_RMAP_OINFO_INODES); + XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, + &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE); if (error) return error; @@ -2486,11 +2486,14 @@ xfs_ialloc_log_agi( static xfs_failaddr_t xfs_agi_verify( - struct xfs_buf *bp) + struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_mount; - struct xfs_agi *agi = bp->b_addr; - int i; + struct xfs_mount *mp = bp->b_mount; + struct xfs_agi *agi = bp->b_addr; + xfs_failaddr_t fa; + uint32_t agi_seqno = be32_to_cpu(agi->agi_seqno); + uint32_t agi_length = be32_to_cpu(agi->agi_length); + int i; if (xfs_has_crc(mp)) { if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid)) @@ -2507,6 +2510,10 @@ xfs_agi_verify( if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) return __this_address; + fa = xfs_validate_ag_length(bp, agi_seqno, agi_length); + if (fa) + return fa; + if (be32_to_cpu(agi->agi_level) < 1 || be32_to_cpu(agi->agi_level) > M_IGEO(mp)->inobt_maxlevels) return __this_address; @@ -2516,15 +2523,6 @@ xfs_agi_verify( be32_to_cpu(agi->agi_free_level) > M_IGEO(mp)->inobt_maxlevels)) return __this_address; - /* - * during growfs operations, the perag is not fully initialised, - * so we can't use it for any useful checking. growfs ensures we can't - * use it by using uncached buffers that don't have the perag attached - * so we can detect and avoid this problem. - */ - if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno) - return __this_address; - for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { if (agi->agi_unlinked[i] == cpu_to_be32(NULLAGINO)) continue; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 5a945ae21b5d..9258f01c0015 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -160,8 +160,7 @@ __xfs_inobt_free_block( xfs_inobt_mod_blockcount(cur, -1); fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); - return xfs_free_extent(cur->bc_tp, cur->bc_ag.pag, - XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1, + return xfs_free_extent_later(cur->bc_tp, fsbno, 1, &XFS_RMAP_OINFO_INOBT, resv); } diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index b6e21433925c..646b3fa362ad 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1152,7 +1152,8 @@ xfs_refcount_adjust_extents( cur->bc_ag.pag->pag_agno, tmp.rc_startblock); error = xfs_free_extent_later(cur->bc_tp, fsbno, - tmp.rc_blockcount, NULL); + tmp.rc_blockcount, NULL, + XFS_AG_RESV_NONE); if (error) goto out_error; } @@ -1213,7 +1214,8 @@ xfs_refcount_adjust_extents( cur->bc_ag.pag->pag_agno, ext.rc_startblock); error = xfs_free_extent_later(cur->bc_tp, fsbno, - ext.rc_blockcount, NULL); + ext.rc_blockcount, NULL, + XFS_AG_RESV_NONE); if (error) goto out_error; } @@ -1919,8 +1921,13 @@ xfs_refcount_recover_cow_leftovers( struct xfs_buf *agbp; struct xfs_refcount_recovery *rr, *n; struct list_head debris; - union xfs_btree_irec low; - union xfs_btree_irec high; + union xfs_btree_irec low = { + .rc.rc_domain = XFS_REFC_DOMAIN_COW, + }; + union xfs_btree_irec high = { + .rc.rc_domain = XFS_REFC_DOMAIN_COW, + .rc.rc_startblock = -1U, + }; xfs_fsblock_t fsb; int error; @@ -1951,10 +1958,6 @@ xfs_refcount_recover_cow_leftovers( cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); /* Find all the leftover CoW staging extents. */ - memset(&low, 0, sizeof(low)); - memset(&high, 0, sizeof(high)); - low.rc.rc_domain = high.rc.rc_domain = XFS_REFC_DOMAIN_COW; - high.rc.rc_startblock = -1U; error = xfs_btree_query_range(cur, &low, &high, xfs_refcount_recover_extent, &debris); xfs_btree_del_cursor(cur, error); @@ -1981,7 +1984,8 @@ xfs_refcount_recover_cow_leftovers( /* Free the block. */ error = xfs_free_extent_later(tp, fsb, - rr->rr_rrec.rc_blockcount, NULL); + rr->rr_rrec.rc_blockcount, NULL, + XFS_AG_RESV_NONE); if (error) goto out_trans; diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index d4afc5f4e6a5..5c3987d8dc24 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -106,19 +106,13 @@ xfs_refcountbt_free_block( struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); - int error; trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.pag->pag_agno, XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1); be32_add_cpu(&agf->agf_refcount_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); - error = xfs_free_extent(cur->bc_tp, cur->bc_ag.pag, - XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1, + return xfs_free_extent_later(cur->bc_tp, fsbno, 1, &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA); - if (error) - return error; - - return error; } STATIC int diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index f4dc23b3b837..fbb0b2637463 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -2389,14 +2389,10 @@ xfs_rmap_query_range( xfs_rmap_query_range_fn fn, void *priv) { - union xfs_btree_irec low_brec; - union xfs_btree_irec high_brec; - struct xfs_rmap_query_range_info query; + union xfs_btree_irec low_brec = { .r = *low_rec }; + union xfs_btree_irec high_brec = { .r = *high_rec }; + struct xfs_rmap_query_range_info query = { .priv = priv, .fn = fn }; - low_brec.r = *low_rec; - high_brec.r = *high_rec; - query.priv = priv; - query.fn = fn; return xfs_btree_query_range(cur, &low_brec, &high_brec, xfs_rmap_query_range_helper, &query); } diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index ba0f17bc1dc0..5e174685a77c 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -412,7 +412,6 @@ xfs_validate_sb_common( sbp->sb_inodelog < XFS_DINODE_MIN_LOG || sbp->sb_inodelog > XFS_DINODE_MAX_LOG || sbp->sb_inodesize != (1 << sbp->sb_inodelog) || - sbp->sb_logsunit > XLOG_MAX_RECORD_BSIZE || sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) || XFS_FSB_TO_B(mp, sbp->sb_agblocks) < XFS_MIN_AG_BYTES || XFS_FSB_TO_B(mp, sbp->sb_agblocks) > XFS_MAX_AG_BYTES || @@ -430,6 +429,61 @@ xfs_validate_sb_common( return -EFSCORRUPTED; } + /* + * Logs that are too large are not supported at all. Reject them + * outright. Logs that are too small are tolerated on v4 filesystems, + * but we can only check that when mounting the log. Hence we skip + * those checks here. + */ + if (sbp->sb_logblocks > XFS_MAX_LOG_BLOCKS) { + xfs_notice(mp, + "Log size 0x%x blocks too large, maximum size is 0x%llx blocks", + sbp->sb_logblocks, XFS_MAX_LOG_BLOCKS); + return -EFSCORRUPTED; + } + + if (XFS_FSB_TO_B(mp, sbp->sb_logblocks) > XFS_MAX_LOG_BYTES) { + xfs_warn(mp, + "log size 0x%llx bytes too large, maximum size is 0x%llx bytes", + XFS_FSB_TO_B(mp, sbp->sb_logblocks), + XFS_MAX_LOG_BYTES); + return -EFSCORRUPTED; + } + + /* + * Do not allow filesystems with corrupted log sector or stripe units to + * be mounted. We cannot safely size the iclogs or write to the log if + * the log stripe unit is not valid. + */ + if (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT) { + if (sbp->sb_logsectsize != (1U << sbp->sb_logsectlog)) { + xfs_notice(mp, + "log sector size in bytes/log2 (0x%x/0x%x) must match", + sbp->sb_logsectsize, 1U << sbp->sb_logsectlog); + return -EFSCORRUPTED; + } + } else if (sbp->sb_logsectsize || sbp->sb_logsectlog) { + xfs_notice(mp, + "log sector size in bytes/log2 (0x%x/0x%x) are not zero", + sbp->sb_logsectsize, sbp->sb_logsectlog); + return -EFSCORRUPTED; + } + + if (sbp->sb_logsunit > 1) { + if (sbp->sb_logsunit % sbp->sb_blocksize) { + xfs_notice(mp, + "log stripe unit 0x%x bytes must be a multiple of block size", + sbp->sb_logsunit); + return -EFSCORRUPTED; + } + if (sbp->sb_logsunit > XLOG_MAX_RECORD_BSIZE) { + xfs_notice(mp, + "log stripe unit 0x%x bytes over maximum size (0x%x bytes)", + sbp->sb_logsunit, XLOG_MAX_RECORD_BSIZE); + return -EFSCORRUPTED; + } + } + /* Validate the realtime geometry; stolen from xfs_repair */ if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) { diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h index 9d7b9ee8bef4..c32b5fad6174 100644 --- a/fs/xfs/scrub/btree.h +++ b/fs/xfs/scrub/btree.h @@ -60,7 +60,7 @@ struct xchk_btree { static inline size_t xchk_btree_sizeof(unsigned int nlevels) { - return struct_size((struct xchk_btree *)NULL, lastkey, nlevels - 1); + return struct_size_t(struct xchk_btree, lastkey, nlevels - 1); } int xchk_btree(struct xfs_scrub *sc, struct xfs_btree_cur *cur, diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 2ef78aa1d3f6..451942fb38ec 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -582,7 +582,6 @@ const struct address_space_operations xfs_address_space_operations = { .release_folio = iomap_release_folio, .invalidate_folio = iomap_invalidate_folio, .bmap = xfs_vm_bmap, - .direct_IO = noop_direct_IO, .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_page = generic_error_remove_page, @@ -591,7 +590,6 @@ const struct address_space_operations xfs_address_space_operations = { const struct address_space_operations xfs_dax_aops = { .writepages = xfs_dax_writepages, - .direct_IO = noop_direct_IO, .dirty_folio = noop_dirty_folio, .swap_activate = xfs_iomap_swapfile_activate, }; diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index f3d328e4a440..7c2fdc71e42d 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -566,20 +566,45 @@ xfs_extent_busy_clear( /* * Flush out all busy extents for this AG. + * + * If the current transaction is holding busy extents, the caller may not want + * to wait for committed busy extents to resolve. If we are being told just to + * try a flush or progress has been made since we last skipped a busy extent, + * return immediately to allow the caller to try again. + * + * If we are freeing extents, we might actually be holding the only free extents + * in the transaction busy list and the log force won't resolve that situation. + * In this case, we must return -EAGAIN to avoid a deadlock by informing the + * caller it needs to commit the busy extents it holds before retrying the + * extent free operation. */ -void +int xfs_extent_busy_flush( - struct xfs_mount *mp, + struct xfs_trans *tp, struct xfs_perag *pag, - unsigned busy_gen) + unsigned busy_gen, + uint32_t alloc_flags) { DEFINE_WAIT (wait); int error; - error = xfs_log_force(mp, XFS_LOG_SYNC); + error = xfs_log_force(tp->t_mountp, XFS_LOG_SYNC); if (error) - return; + return error; + + /* Avoid deadlocks on uncommitted busy extents. */ + if (!list_empty(&tp->t_busy)) { + if (alloc_flags & XFS_ALLOC_FLAG_TRYFLUSH) + return 0; + + if (busy_gen != READ_ONCE(pag->pagb_gen)) + return 0; + + if (alloc_flags & XFS_ALLOC_FLAG_FREEING) + return -EAGAIN; + } + /* Wait for committed busy extents to resolve. */ do { prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE); if (busy_gen != READ_ONCE(pag->pagb_gen)) @@ -588,6 +613,7 @@ xfs_extent_busy_flush( } while (1); finish_wait(&pag->pagb_wait, &wait); + return 0; } void diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h index 4a118131059f..c37bf87e6781 100644 --- a/fs/xfs/xfs_extent_busy.h +++ b/fs/xfs/xfs_extent_busy.h @@ -51,9 +51,9 @@ bool xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t *bno, xfs_extlen_t *len, unsigned *busy_gen); -void -xfs_extent_busy_flush(struct xfs_mount *mp, struct xfs_perag *pag, - unsigned busy_gen); +int +xfs_extent_busy_flush(struct xfs_trans *tp, struct xfs_perag *pag, + unsigned busy_gen, uint32_t alloc_flags); void xfs_extent_busy_wait_all(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index f9e36b810663..f1a5ecf099aa 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -337,6 +337,34 @@ xfs_trans_get_efd( } /* + * Fill the EFD with all extents from the EFI when we need to roll the + * transaction and continue with a new EFI. + * + * This simply copies all the extents in the EFI to the EFD rather than make + * assumptions about which extents in the EFI have already been processed. We + * currently keep the xefi list in the same order as the EFI extent list, but + * that may not always be the case. Copying everything avoids leaving a landmine + * were we fail to cancel all the extents in an EFI if the xefi list is + * processed in a different order to the extents in the EFI. + */ +static void +xfs_efd_from_efi( + struct xfs_efd_log_item *efdp) +{ + struct xfs_efi_log_item *efip = efdp->efd_efip; + uint i; + + ASSERT(efip->efi_format.efi_nextents > 0); + ASSERT(efdp->efd_next_extent < efip->efi_format.efi_nextents); + + for (i = 0; i < efip->efi_format.efi_nextents; i++) { + efdp->efd_format.efd_extents[i] = + efip->efi_format.efi_extents[i]; + } + efdp->efd_next_extent = efip->efi_format.efi_nextents; +} + +/* * Free an extent and log it to the EFD. Note that the transaction is marked * dirty regardless of whether the extent free succeeds or fails to support the * EFI/EFD lifecycle rules. @@ -365,7 +393,7 @@ xfs_trans_free_extent( agbno, xefi->xefi_blockcount); error = __xfs_free_extent(tp, xefi->xefi_pag, agbno, - xefi->xefi_blockcount, &oinfo, XFS_AG_RESV_NONE, + xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv, xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); /* @@ -378,6 +406,17 @@ xfs_trans_free_extent( tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); + /* + * If we need a new transaction to make progress, the caller will log a + * new EFI with the current contents. It will also log an EFD to cancel + * the existing EFI, and so we need to copy all the unprocessed extents + * in this EFI to the EFD so this works correctly. + */ + if (error == -EAGAIN) { + xfs_efd_from_efi(efdp); + return error; + } + next_extent = efdp->efd_next_extent; ASSERT(next_extent < efdp->efd_format.efd_nextents); extp = &(efdp->efd_format.efd_extents[next_extent]); @@ -495,6 +534,13 @@ xfs_extent_free_finish_item( error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi); + /* + * Don't free the XEFI if we need a new transaction to complete + * processing of it. + */ + if (error == -EAGAIN) + return error; + xfs_extent_free_put_group(xefi); kmem_cache_free(xfs_extfree_item_cache, xefi); return error; @@ -620,6 +666,7 @@ xfs_efi_item_recover( struct xfs_trans *tp; int i; int error = 0; + bool requeue_only = false; /* * First check the validity of the extents described by the @@ -644,6 +691,7 @@ xfs_efi_item_recover( for (i = 0; i < efip->efi_format.efi_nextents; i++) { struct xfs_extent_free_item fake = { .xefi_owner = XFS_RMAP_OWN_UNKNOWN, + .xefi_agresv = XFS_AG_RESV_NONE, }; struct xfs_extent *extp; @@ -652,9 +700,28 @@ xfs_efi_item_recover( fake.xefi_startblock = extp->ext_start; fake.xefi_blockcount = extp->ext_len; - xfs_extent_free_get_group(mp, &fake); - error = xfs_trans_free_extent(tp, efdp, &fake); - xfs_extent_free_put_group(&fake); + if (!requeue_only) { + xfs_extent_free_get_group(mp, &fake); + error = xfs_trans_free_extent(tp, efdp, &fake); + xfs_extent_free_put_group(&fake); + } + + /* + * If we can't free the extent without potentially deadlocking, + * requeue the rest of the extents to a new so that they get + * run again later with a new transaction context. + */ + if (error == -EAGAIN || requeue_only) { + error = xfs_free_extent_later(tp, fake.xefi_startblock, + fake.xefi_blockcount, + &XFS_RMAP_OINFO_ANY_OWNER, + fake.xefi_agresv); + if (!error) { + requeue_only = true; + continue; + } + } + if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, extp, sizeof(*extp)); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index aede746541f8..4f502219ae4f 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -306,6 +306,34 @@ xfs_file_read_iter( return ret; } +STATIC ssize_t +xfs_file_splice_read( + struct file *in, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, + unsigned int flags) +{ + struct inode *inode = file_inode(in); + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t ret = 0; + + XFS_STATS_INC(mp, xs_read_calls); + + if (xfs_is_shutdown(mp)) + return -EIO; + + trace_xfs_file_splice_read(ip, *ppos, len); + + xfs_ilock(ip, XFS_IOLOCK_SHARED); + ret = filemap_splice_read(in, ppos, pipe, len, flags); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + if (ret > 0) + XFS_STATS_ADD(mp, xs_read_bytes, ret); + return ret; +} + /* * Common pre-write limit and setup checks. * @@ -717,14 +745,9 @@ write_retry: if (ret) goto out; - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(inode); - trace_xfs_file_buffered_write(iocb, from); ret = iomap_file_buffered_write(iocb, from, &xfs_buffered_write_iomap_ops); - if (likely(ret >= 0)) - iocb->ki_pos += ret; /* * If we hit a space limit, try to free up some lingering preallocated @@ -753,7 +776,6 @@ write_retry: goto write_retry; } - current->backing_dev_info = NULL; out: if (iolock) xfs_iunlock(ip, iolock); @@ -1172,7 +1194,7 @@ xfs_file_open( if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC | - FMODE_DIO_PARALLEL_WRITE; + FMODE_DIO_PARALLEL_WRITE | FMODE_CAN_ODIRECT; return generic_file_open(inode, file); } @@ -1423,7 +1445,7 @@ const struct file_operations xfs_file_operations = { .llseek = xfs_file_llseek, .read_iter = xfs_file_read_iter, .write_iter = xfs_file_write_iter, - .splice_read = generic_file_splice_read, + .splice_read = xfs_file_splice_read, .splice_write = iter_file_splice_write, .iopoll = iocb_bio_iopoll, .unlocked_ioctl = xfs_file_ioctl, diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 59e7d1a14b67..10403ba9b58f 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -160,9 +160,18 @@ struct xfs_getfsmap_info { struct xfs_buf *agf_bp; /* AGF, for refcount queries */ struct xfs_perag *pag; /* AG info, if applicable */ xfs_daddr_t next_daddr; /* next daddr we expect */ + /* daddr of low fsmap key when we're using the rtbitmap */ + xfs_daddr_t low_daddr; u64 missing_owner; /* owner of holes */ u32 dev; /* device id */ - struct xfs_rmap_irec low; /* low rmap key */ + /* + * Low rmap key for the query. If low.rm_blockcount is nonzero, this + * is the second (or later) call to retrieve the recordset in pieces. + * xfs_getfsmap_rec_before_start will compare all records retrieved + * by the rmapbt query to filter out any records that start before + * the last record. + */ + struct xfs_rmap_irec low; struct xfs_rmap_irec high; /* high rmap key */ bool last; /* last extent? */ }; @@ -237,16 +246,31 @@ xfs_getfsmap_format( xfs_fsmap_from_internal(rec, xfm); } +static inline bool +xfs_getfsmap_rec_before_start( + struct xfs_getfsmap_info *info, + const struct xfs_rmap_irec *rec, + xfs_daddr_t rec_daddr) +{ + if (info->low_daddr != -1ULL) + return rec_daddr < info->low_daddr; + if (info->low.rm_blockcount) + return xfs_rmap_compare(rec, &info->low) < 0; + return false; +} + /* * Format a reverse mapping for getfsmap, having translated rm_startblock - * into the appropriate daddr units. + * into the appropriate daddr units. Pass in a nonzero @len_daddr if the + * length could be larger than rm_blockcount in struct xfs_rmap_irec. */ STATIC int xfs_getfsmap_helper( struct xfs_trans *tp, struct xfs_getfsmap_info *info, const struct xfs_rmap_irec *rec, - xfs_daddr_t rec_daddr) + xfs_daddr_t rec_daddr, + xfs_daddr_t len_daddr) { struct xfs_fsmap fmr; struct xfs_mount *mp = tp->t_mountp; @@ -256,12 +280,15 @@ xfs_getfsmap_helper( if (fatal_signal_pending(current)) return -EINTR; + if (len_daddr == 0) + len_daddr = XFS_FSB_TO_BB(mp, rec->rm_blockcount); + /* * Filter out records that start before our startpoint, if the * caller requested that. */ - if (xfs_rmap_compare(rec, &info->low) < 0) { - rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); + if (xfs_getfsmap_rec_before_start(info, rec, rec_daddr)) { + rec_daddr += len_daddr; if (info->next_daddr < rec_daddr) info->next_daddr = rec_daddr; return 0; @@ -280,7 +307,7 @@ xfs_getfsmap_helper( info->head->fmh_entries++; - rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); + rec_daddr += len_daddr; if (info->next_daddr < rec_daddr) info->next_daddr = rec_daddr; return 0; @@ -320,7 +347,7 @@ xfs_getfsmap_helper( if (error) return error; fmr.fmr_offset = XFS_FSB_TO_BB(mp, rec->rm_offset); - fmr.fmr_length = XFS_FSB_TO_BB(mp, rec->rm_blockcount); + fmr.fmr_length = len_daddr; if (rec->rm_flags & XFS_RMAP_UNWRITTEN) fmr.fmr_flags |= FMR_OF_PREALLOC; if (rec->rm_flags & XFS_RMAP_ATTR_FORK) @@ -337,7 +364,7 @@ xfs_getfsmap_helper( xfs_getfsmap_format(mp, &fmr, info); out: - rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); + rec_daddr += len_daddr; if (info->next_daddr < rec_daddr) info->next_daddr = rec_daddr; return 0; @@ -358,7 +385,7 @@ xfs_getfsmap_datadev_helper( fsb = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno, rec->rm_startblock); rec_daddr = XFS_FSB_TO_DADDR(mp, fsb); - return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr); + return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr, 0); } /* Transform a bnobt irec into a fsmap */ @@ -382,7 +409,7 @@ xfs_getfsmap_datadev_bnobt_helper( irec.rm_offset = 0; irec.rm_flags = 0; - return xfs_getfsmap_helper(cur->bc_tp, info, &irec, rec_daddr); + return xfs_getfsmap_helper(cur->bc_tp, info, &irec, rec_daddr, 0); } /* Set rmap flags based on the getfsmap flags */ @@ -409,31 +436,25 @@ xfs_getfsmap_logdev( { struct xfs_mount *mp = tp->t_mountp; struct xfs_rmap_irec rmap; - int error; + xfs_daddr_t rec_daddr, len_daddr; + xfs_fsblock_t start_fsb, end_fsb; + uint64_t eofs; - /* Set up search keys */ - info->low.rm_startblock = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical); - info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset); - error = xfs_fsmap_owner_to_rmap(&info->low, keys); - if (error) - return error; - info->low.rm_blockcount = 0; - xfs_getfsmap_set_irec_flags(&info->low, &keys[0]); + eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); + if (keys[0].fmr_physical >= eofs) + return 0; + start_fsb = XFS_BB_TO_FSBT(mp, + keys[0].fmr_physical + keys[0].fmr_length); + end_fsb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical)); - error = xfs_fsmap_owner_to_rmap(&info->high, keys + 1); - if (error) - return error; - info->high.rm_startblock = -1U; - info->high.rm_owner = ULLONG_MAX; - info->high.rm_offset = ULLONG_MAX; - info->high.rm_blockcount = 0; - info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS; - info->missing_owner = XFS_FMR_OWN_FREE; + /* Adjust the low key if we are continuing from where we left off. */ + if (keys[0].fmr_length > 0) + info->low_daddr = XFS_FSB_TO_BB(mp, start_fsb); - trace_xfs_fsmap_low_key(mp, info->dev, NULLAGNUMBER, &info->low); - trace_xfs_fsmap_high_key(mp, info->dev, NULLAGNUMBER, &info->high); + trace_xfs_fsmap_low_key_linear(mp, info->dev, start_fsb); + trace_xfs_fsmap_high_key_linear(mp, info->dev, end_fsb); - if (keys[0].fmr_physical > 0) + if (start_fsb > 0) return 0; /* Fabricate an rmap entry for the external log device. */ @@ -443,7 +464,9 @@ xfs_getfsmap_logdev( rmap.rm_offset = 0; rmap.rm_flags = 0; - return xfs_getfsmap_helper(tp, info, &rmap, 0); + rec_daddr = XFS_FSB_TO_BB(mp, rmap.rm_startblock); + len_daddr = XFS_FSB_TO_BB(mp, rmap.rm_blockcount); + return xfs_getfsmap_helper(tp, info, &rmap, rec_daddr, len_daddr); } #ifdef CONFIG_XFS_RT @@ -457,72 +480,58 @@ xfs_getfsmap_rtdev_rtbitmap_helper( { struct xfs_getfsmap_info *info = priv; struct xfs_rmap_irec irec; - xfs_daddr_t rec_daddr; + xfs_rtblock_t rtbno; + xfs_daddr_t rec_daddr, len_daddr; + + rtbno = rec->ar_startext * mp->m_sb.sb_rextsize; + rec_daddr = XFS_FSB_TO_BB(mp, rtbno); + irec.rm_startblock = rtbno; + + rtbno = rec->ar_extcount * mp->m_sb.sb_rextsize; + len_daddr = XFS_FSB_TO_BB(mp, rtbno); + irec.rm_blockcount = rtbno; - irec.rm_startblock = rec->ar_startext * mp->m_sb.sb_rextsize; - rec_daddr = XFS_FSB_TO_BB(mp, irec.rm_startblock); - irec.rm_blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize; irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */ irec.rm_offset = 0; irec.rm_flags = 0; - return xfs_getfsmap_helper(tp, info, &irec, rec_daddr); + return xfs_getfsmap_helper(tp, info, &irec, rec_daddr, len_daddr); } -/* Execute a getfsmap query against the realtime device. */ +/* Execute a getfsmap query against the realtime device rtbitmap. */ STATIC int -__xfs_getfsmap_rtdev( +xfs_getfsmap_rtdev_rtbitmap( struct xfs_trans *tp, const struct xfs_fsmap *keys, - int (*query_fn)(struct xfs_trans *, - struct xfs_getfsmap_info *), struct xfs_getfsmap_info *info) { + + struct xfs_rtalloc_rec alow = { 0 }; + struct xfs_rtalloc_rec ahigh = { 0 }; struct xfs_mount *mp = tp->t_mountp; - xfs_fsblock_t start_fsb; - xfs_fsblock_t end_fsb; + xfs_rtblock_t start_rtb; + xfs_rtblock_t end_rtb; uint64_t eofs; - int error = 0; + int error; - eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); + eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rextents * mp->m_sb.sb_rextsize); if (keys[0].fmr_physical >= eofs) return 0; - start_fsb = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical); - end_fsb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical)); + start_rtb = XFS_BB_TO_FSBT(mp, + keys[0].fmr_physical + keys[0].fmr_length); + end_rtb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical)); - /* Set up search keys */ - info->low.rm_startblock = start_fsb; - error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]); - if (error) - return error; - info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset); - info->low.rm_blockcount = 0; - xfs_getfsmap_set_irec_flags(&info->low, &keys[0]); - - info->high.rm_startblock = end_fsb; - error = xfs_fsmap_owner_to_rmap(&info->high, &keys[1]); - if (error) - return error; - info->high.rm_offset = XFS_BB_TO_FSBT(mp, keys[1].fmr_offset); - info->high.rm_blockcount = 0; - xfs_getfsmap_set_irec_flags(&info->high, &keys[1]); - - trace_xfs_fsmap_low_key(mp, info->dev, NULLAGNUMBER, &info->low); - trace_xfs_fsmap_high_key(mp, info->dev, NULLAGNUMBER, &info->high); + info->missing_owner = XFS_FMR_OWN_UNKNOWN; - return query_fn(tp, info); -} + /* Adjust the low key if we are continuing from where we left off. */ + if (keys[0].fmr_length > 0) { + info->low_daddr = XFS_FSB_TO_BB(mp, start_rtb); + if (info->low_daddr >= eofs) + return 0; + } -/* Actually query the realtime bitmap. */ -STATIC int -xfs_getfsmap_rtdev_rtbitmap_query( - struct xfs_trans *tp, - struct xfs_getfsmap_info *info) -{ - struct xfs_rtalloc_rec alow = { 0 }; - struct xfs_rtalloc_rec ahigh = { 0 }; - struct xfs_mount *mp = tp->t_mountp; - int error; + trace_xfs_fsmap_low_key_linear(mp, info->dev, start_rtb); + trace_xfs_fsmap_high_key_linear(mp, info->dev, end_rtb); xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); @@ -530,8 +539,8 @@ xfs_getfsmap_rtdev_rtbitmap_query( * Set up query parameters to return free rtextents covering the range * we want. */ - alow.ar_startext = info->low.rm_startblock; - ahigh.ar_startext = info->high.rm_startblock; + alow.ar_startext = start_rtb; + ahigh.ar_startext = end_rtb; do_div(alow.ar_startext, mp->m_sb.sb_rextsize); if (do_div(ahigh.ar_startext, mp->m_sb.sb_rextsize)) ahigh.ar_startext++; @@ -554,18 +563,6 @@ err: xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); return error; } - -/* Execute a getfsmap query against the realtime device rtbitmap. */ -STATIC int -xfs_getfsmap_rtdev_rtbitmap( - struct xfs_trans *tp, - const struct xfs_fsmap *keys, - struct xfs_getfsmap_info *info) -{ - info->missing_owner = XFS_FMR_OWN_UNKNOWN; - return __xfs_getfsmap_rtdev(tp, keys, xfs_getfsmap_rtdev_rtbitmap_query, - info); -} #endif /* CONFIG_XFS_RT */ /* Execute a getfsmap query against the regular data device. */ @@ -606,9 +603,27 @@ __xfs_getfsmap_datadev( error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]); if (error) return error; - info->low.rm_blockcount = 0; + info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, keys[0].fmr_length); xfs_getfsmap_set_irec_flags(&info->low, &keys[0]); + /* Adjust the low key if we are continuing from where we left off. */ + if (info->low.rm_blockcount == 0) { + /* empty */ + } else if (XFS_RMAP_NON_INODE_OWNER(info->low.rm_owner) || + (info->low.rm_flags & (XFS_RMAP_ATTR_FORK | + XFS_RMAP_BMBT_BLOCK | + XFS_RMAP_UNWRITTEN))) { + info->low.rm_startblock += info->low.rm_blockcount; + info->low.rm_owner = 0; + info->low.rm_offset = 0; + + start_fsb += info->low.rm_blockcount; + if (XFS_FSB_TO_DADDR(mp, start_fsb) >= eofs) + return 0; + } else { + info->low.rm_offset += info->low.rm_blockcount; + } + info->high.rm_startblock = -1U; info->high.rm_owner = ULLONG_MAX; info->high.rm_offset = ULLONG_MAX; @@ -659,12 +674,8 @@ __xfs_getfsmap_datadev( * Set the AG low key to the start of the AG prior to * moving on to the next AG. */ - if (pag->pag_agno == start_ag) { - info->low.rm_startblock = 0; - info->low.rm_owner = 0; - info->low.rm_offset = 0; - info->low.rm_flags = 0; - } + if (pag->pag_agno == start_ag) + memset(&info->low, 0, sizeof(info->low)); /* * If this is the last AG, report any gap at the end of it @@ -791,6 +802,19 @@ xfs_getfsmap_check_keys( struct xfs_fsmap *low_key, struct xfs_fsmap *high_key) { + if (low_key->fmr_flags & (FMR_OF_SPECIAL_OWNER | FMR_OF_EXTENT_MAP)) { + if (low_key->fmr_offset) + return false; + } + if (high_key->fmr_flags != -1U && + (high_key->fmr_flags & (FMR_OF_SPECIAL_OWNER | + FMR_OF_EXTENT_MAP))) { + if (high_key->fmr_offset && high_key->fmr_offset != -1ULL) + return false; + } + if (high_key->fmr_length && high_key->fmr_length != -1ULL) + return false; + if (low_key->fmr_device > high_key->fmr_device) return false; if (low_key->fmr_device < high_key->fmr_device) @@ -834,15 +858,15 @@ xfs_getfsmap_check_keys( * ---------------- * There are multiple levels of keys and counters at work here: * xfs_fsmap_head.fmh_keys -- low and high fsmap keys passed in; - * these reflect fs-wide sector addrs. + * these reflect fs-wide sector addrs. * dkeys -- fmh_keys used to query each device; - * these are fmh_keys but w/ the low key - * bumped up by fmr_length. + * these are fmh_keys but w/ the low key + * bumped up by fmr_length. * xfs_getfsmap_info.next_daddr -- next disk addr we expect to see; this * is how we detect gaps in the fsmap records and report them. * xfs_getfsmap_info.low/high -- per-AG low/high keys computed from - * dkeys; used to query the metadata. + * dkeys; used to query the metadata. */ int xfs_getfsmap( @@ -863,6 +887,8 @@ xfs_getfsmap( if (!xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[0]) || !xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[1])) return -EINVAL; + if (!xfs_getfsmap_check_keys(&head->fmh_keys[0], &head->fmh_keys[1])) + return -EINVAL; use_rmap = xfs_has_rmapbt(mp) && has_capability_noaudit(current, CAP_SYS_ADMIN); @@ -901,26 +927,15 @@ xfs_getfsmap( * blocks could be mapped to several other files/offsets. * According to rmapbt record ordering, the minimal next * possible record for the block range is the next starting - * offset in the same inode. Therefore, bump the file offset to - * continue the search appropriately. For all other low key - * mapping types (attr blocks, metadata), bump the physical - * offset as there can be no other mapping for the same physical - * block range. + * offset in the same inode. Therefore, each fsmap backend bumps + * the file offset to continue the search appropriately. For + * all other low key mapping types (attr blocks, metadata), each + * fsmap backend bumps the physical offset as there can be no + * other mapping for the same physical block range. */ dkeys[0] = head->fmh_keys[0]; - if (dkeys[0].fmr_flags & (FMR_OF_SPECIAL_OWNER | FMR_OF_EXTENT_MAP)) { - dkeys[0].fmr_physical += dkeys[0].fmr_length; - dkeys[0].fmr_owner = 0; - if (dkeys[0].fmr_offset) - return -EINVAL; - } else - dkeys[0].fmr_offset += dkeys[0].fmr_length; - dkeys[0].fmr_length = 0; memset(&dkeys[1], 0xFF, sizeof(struct xfs_fsmap)); - if (!xfs_getfsmap_check_keys(dkeys, &head->fmh_keys[1])) - return -EINVAL; - info.next_daddr = head->fmh_keys[0].fmr_physical + head->fmh_keys[0].fmr_length; info.fsmap_recs = fsmap_recs; @@ -960,6 +975,8 @@ xfs_getfsmap( info.dev = handlers[i].dev; info.last = false; info.pag = NULL; + info.low_daddr = -1ULL; + info.low.rm_blockcount = 0; error = handlers[i].fn(tp, dkeys, &info); if (error) break; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 13851c0d640b..122b83488a05 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -115,11 +115,16 @@ xfs_growfs_data_private( nb_div = nb; nb_mod = do_div(nb_div, mp->m_sb.sb_agblocks); - nagcount = nb_div + (nb_mod != 0); - if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) { - nagcount--; - nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks; + if (nb_mod && nb_mod >= XFS_MIN_AG_BLOCKS) + nb_div++; + else if (nb_mod) + nb = nb_div * mp->m_sb.sb_agblocks; + + if (nb_div > XFS_MAX_AGNUMBER + 1) { + nb_div = XFS_MAX_AGNUMBER + 1; + nb = nb_div * mp->m_sb.sb_agblocks; } + nagcount = nb_div; delta = nb - mp->m_sb.sb_dblocks; /* * Reject filesystems with a single AG because they are not @@ -140,9 +145,13 @@ xfs_growfs_data_private( return -EINVAL; } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, - (delta > 0 ? XFS_GROWFS_SPACE_RES(mp) : -delta), 0, - XFS_TRANS_RESERVE, &tp); + if (delta > 0) + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, + XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, + &tp); + else + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, -delta, 0, + 0, &tp); if (error) return error; @@ -534,6 +543,9 @@ xfs_do_force_shutdown( } else if (flags & SHUTDOWN_CORRUPT_ONDISK) { tag = XFS_PTAG_SHUTDOWN_CORRUPT; why = "Corruption of on-disk metadata"; + } else if (flags & SHUTDOWN_DEVICE_REMOVED) { + tag = XFS_PTAG_SHUTDOWN_IOERROR; + why = "Block device removal"; } else { tag = XFS_PTAG_SHUTDOWN_IOERROR; why = "Metadata I/O Error"; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index fc61cc024023..79004d193e54 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -639,7 +639,6 @@ xfs_log_mount( int num_bblks) { struct xlog *log; - bool fatal = xfs_has_crc(mp); int error = 0; int min_logfsbs; @@ -663,53 +662,37 @@ xfs_log_mount( mp->m_log = log; /* - * Validate the given log space and drop a critical message via syslog - * if the log size is too small that would lead to some unexpected - * situations in transaction log space reservation stage. + * Now that we have set up the log and it's internal geometry + * parameters, we can validate the given log space and drop a critical + * message via syslog if the log size is too small. A log that is too + * small can lead to unexpected situations in transaction log space + * reservation stage. The superblock verifier has already validated all + * the other log geometry constraints, so we don't have to check those + * here. * - * Note: we can't just reject the mount if the validation fails. This - * would mean that people would have to downgrade their kernel just to - * remedy the situation as there is no way to grow the log (short of - * black magic surgery with xfs_db). + * Note: For v4 filesystems, we can't just reject the mount if the + * validation fails. This would mean that people would have to + * downgrade their kernel just to remedy the situation as there is no + * way to grow the log (short of black magic surgery with xfs_db). * - * We can, however, reject mounts for CRC format filesystems, as the + * We can, however, reject mounts for V5 format filesystems, as the * mkfs binary being used to make the filesystem should never create a * filesystem with a log that is too small. */ min_logfsbs = xfs_log_calc_minimum_size(mp); - if (mp->m_sb.sb_logblocks < min_logfsbs) { xfs_warn(mp, "Log size %d blocks too small, minimum size is %d blocks", mp->m_sb.sb_logblocks, min_logfsbs); - error = -EINVAL; - } else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) { - xfs_warn(mp, - "Log size %d blocks too large, maximum size is %lld blocks", - mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS); - error = -EINVAL; - } else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) { - xfs_warn(mp, - "log size %lld bytes too large, maximum size is %lld bytes", - XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks), - XFS_MAX_LOG_BYTES); - error = -EINVAL; - } else if (mp->m_sb.sb_logsunit > 1 && - mp->m_sb.sb_logsunit % mp->m_sb.sb_blocksize) { - xfs_warn(mp, - "log stripe unit %u bytes must be a multiple of block size", - mp->m_sb.sb_logsunit); - error = -EINVAL; - fatal = true; - } - if (error) { + /* * Log check errors are always fatal on v5; or whenever bad * metadata leads to a crash. */ - if (fatal) { + if (xfs_has_crc(mp)) { xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!"); ASSERT(0); + error = -EINVAL; goto out_free_log; } xfs_crit(mp, "Log size out of supported range."); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 6c09f89534d3..e2866e7fa60c 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -458,12 +458,14 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, uint32_t flags, char *fname, #define SHUTDOWN_FORCE_UMOUNT (1u << 2) /* shutdown from a forced unmount */ #define SHUTDOWN_CORRUPT_INCORE (1u << 3) /* corrupt in-memory structures */ #define SHUTDOWN_CORRUPT_ONDISK (1u << 4) /* corrupt metadata on device */ +#define SHUTDOWN_DEVICE_REMOVED (1u << 5) /* device removed underneath us */ #define XFS_SHUTDOWN_STRINGS \ { SHUTDOWN_META_IO_ERROR, "metadata_io" }, \ { SHUTDOWN_LOG_IO_ERROR, "log_io" }, \ { SHUTDOWN_FORCE_UMOUNT, "force_umount" }, \ - { SHUTDOWN_CORRUPT_INCORE, "corruption" } + { SHUTDOWN_CORRUPT_INCORE, "corruption" }, \ + { SHUTDOWN_DEVICE_REMOVED, "device_removed" } /* * Flags for xfs_mountfs diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index c4078d0ec108..4a9bbd3fe120 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -114,7 +114,8 @@ xfs_dax_notify_ddev_failure( int error = 0; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, daddr); xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno); - xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp, daddr + bblen); + xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp, + daddr + bblen - 1); xfs_agnumber_t end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno); error = xfs_trans_alloc_empty(mp, &tp); @@ -210,7 +211,7 @@ xfs_dax_notify_failure( ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1; /* Ignore the range out of filesystem area */ - if (offset + len < ddev_start) + if (offset + len - 1 < ddev_start) return -ENXIO; if (offset > ddev_end) return -ENXIO; @@ -222,8 +223,8 @@ xfs_dax_notify_failure( len -= ddev_start - offset; offset = 0; } - if (offset + len > ddev_end) - len -= ddev_end - offset; + if (offset + len - 1 > ddev_end) + len = ddev_end - offset + 1; return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len), mf_flags); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index abcc559f3c64..eb9102453aff 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -617,7 +617,8 @@ xfs_reflink_cancel_cow_blocks( del.br_blockcount); error = xfs_free_extent_later(*tpp, del.br_startblock, - del.br_blockcount, NULL); + del.br_blockcount, NULL, + XFS_AG_RESV_NONE); if (error) break; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 4120bd1cba90..818510243130 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -377,6 +377,17 @@ disable_dax: return 0; } +static void +xfs_bdev_mark_dead( + struct block_device *bdev) +{ + xfs_force_shutdown(bdev->bd_holder, SHUTDOWN_DEVICE_REMOVED); +} + +static const struct blk_holder_ops xfs_holder_ops = { + .mark_dead = xfs_bdev_mark_dead, +}; + STATIC int xfs_blkdev_get( xfs_mount_t *mp, @@ -385,8 +396,8 @@ xfs_blkdev_get( { int error = 0; - *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL, - mp); + *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE, mp, + &xfs_holder_ops); if (IS_ERR(*bdevp)) { error = PTR_ERR(*bdevp); xfs_warn(mp, "Invalid device [%s], error=%d", name, error); @@ -397,10 +408,11 @@ xfs_blkdev_get( STATIC void xfs_blkdev_put( + struct xfs_mount *mp, struct block_device *bdev) { if (bdev) - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + blkdev_put(bdev, mp); } STATIC void @@ -411,13 +423,13 @@ xfs_close_devices( struct block_device *logdev = mp->m_logdev_targp->bt_bdev; xfs_free_buftarg(mp->m_logdev_targp); - xfs_blkdev_put(logdev); + xfs_blkdev_put(mp, logdev); } if (mp->m_rtdev_targp) { struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; xfs_free_buftarg(mp->m_rtdev_targp); - xfs_blkdev_put(rtdev); + xfs_blkdev_put(mp, rtdev); } xfs_free_buftarg(mp->m_ddev_targp); } @@ -492,10 +504,10 @@ xfs_open_devices( out_free_ddev_targ: xfs_free_buftarg(mp->m_ddev_targp); out_close_rtdev: - xfs_blkdev_put(rtdev); + xfs_blkdev_put(mp, rtdev); out_close_logdev: if (logdev && logdev != ddev) - xfs_blkdev_put(logdev); + xfs_blkdev_put(mp, logdev); return error; } @@ -1160,6 +1172,13 @@ xfs_fs_free_cached_objects( return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan); } +static void +xfs_fs_shutdown( + struct super_block *sb) +{ + xfs_force_shutdown(XFS_M(sb), SHUTDOWN_DEVICE_REMOVED); +} + static const struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, @@ -1173,6 +1192,7 @@ static const struct super_operations xfs_super_operations = { .show_options = xfs_fs_show_options, .nr_cached_objects = xfs_fs_nr_cached_objects, .free_cached_objects = xfs_fs_free_cached_objects, + .shutdown = xfs_fs_shutdown, }; static int @@ -1687,10 +1707,6 @@ xfs_fs_fill_super( goto out_filestream_unmount; } - if (xfs_has_large_extent_counts(mp)) - xfs_warn(mp, - "EXPERIMENTAL Large extent counts feature in use. Use at your own risk!"); - error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index cd4ca5b1fcb0..f3cc204bb4bf 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1445,7 +1445,6 @@ DEFINE_RW_EVENT(xfs_file_direct_write); DEFINE_RW_EVENT(xfs_file_dax_write); DEFINE_RW_EVENT(xfs_reflink_bounce_dio_write); - DECLARE_EVENT_CLASS(xfs_imap_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, int whichfork, struct xfs_bmbt_irec *irec), @@ -1535,6 +1534,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof); DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write); DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten); DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append); +DEFINE_SIMPLE_IO_EVENT(xfs_file_splice_read); DECLARE_EVENT_CLASS(xfs_itrunc_class, TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), @@ -3623,6 +3623,31 @@ DEFINE_FSMAP_EVENT(xfs_fsmap_low_key); DEFINE_FSMAP_EVENT(xfs_fsmap_high_key); DEFINE_FSMAP_EVENT(xfs_fsmap_mapping); +DECLARE_EVENT_CLASS(xfs_fsmap_linear_class, + TP_PROTO(struct xfs_mount *mp, u32 keydev, uint64_t bno), + TP_ARGS(mp, keydev, bno), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, keydev) + __field(xfs_fsblock_t, bno) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->keydev = new_decode_dev(keydev); + __entry->bno = bno; + ), + TP_printk("dev %d:%d keydev %d:%d bno 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->keydev), MINOR(__entry->keydev), + __entry->bno) +) +#define DEFINE_FSMAP_LINEAR_EVENT(name) \ +DEFINE_EVENT(xfs_fsmap_linear_class, name, \ + TP_PROTO(struct xfs_mount *mp, u32 keydev, uint64_t bno), \ + TP_ARGS(mp, keydev, bno)) +DEFINE_FSMAP_LINEAR_EVENT(xfs_fsmap_low_key_linear); +DEFINE_FSMAP_LINEAR_EVENT(xfs_fsmap_high_key_linear); + DECLARE_EVENT_CLASS(xfs_getfsmap_class, TP_PROTO(struct xfs_mount *mp, struct xfs_fsmap *fsmap), TP_ARGS(mp, fsmap), diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 7d4109af193e..1098452e7f95 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -823,7 +823,7 @@ xfs_trans_ail_update_bulk( trace_xfs_ail_insert(lip, 0, lsn); } lip->li_lsn = lsn; - list_add(&lip->li_ail, &tmp); + list_add_tail(&lip->li_ail, &tmp); } if (!list_empty(&tmp)) diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 132f01d3461f..92c9aaae3663 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -181,7 +181,6 @@ const struct address_space_operations zonefs_file_aops = { .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_page = generic_error_remove_page, - .direct_IO = noop_direct_IO, .swap_activate = zonefs_swap_activate, }; @@ -342,6 +341,77 @@ static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence) return generic_file_llseek_size(file, offset, whence, isize, isize); } +struct zonefs_zone_append_bio { + /* The target inode of the BIO */ + struct inode *inode; + + /* For sync writes, the target append write offset */ + u64 append_offset; + + /* + * This member must come last, bio_alloc_bioset will allocate enough + * bytes for entire zonefs_bio but relies on bio being last. + */ + struct bio bio; +}; + +static inline struct zonefs_zone_append_bio * +zonefs_zone_append_bio(struct bio *bio) +{ + return container_of(bio, struct zonefs_zone_append_bio, bio); +} + +static void zonefs_file_zone_append_dio_bio_end_io(struct bio *bio) +{ + struct zonefs_zone_append_bio *za_bio = zonefs_zone_append_bio(bio); + struct zonefs_zone *z = zonefs_inode_zone(za_bio->inode); + sector_t za_sector; + + if (bio->bi_status != BLK_STS_OK) + goto bio_end; + + /* + * If the file zone was written underneath the file system, the zone + * append operation can still succedd (if the zone is not full) but + * the write append location will not be where we expect it to be. + * Check that we wrote where we intended to, that is, at z->z_wpoffset. + */ + za_sector = z->z_sector + (za_bio->append_offset >> SECTOR_SHIFT); + if (bio->bi_iter.bi_sector != za_sector) { + zonefs_warn(za_bio->inode->i_sb, + "Invalid write sector %llu for zone at %llu\n", + bio->bi_iter.bi_sector, z->z_sector); + bio->bi_status = BLK_STS_IOERR; + } + +bio_end: + iomap_dio_bio_end_io(bio); +} + +static void zonefs_file_zone_append_dio_submit_io(const struct iomap_iter *iter, + struct bio *bio, + loff_t file_offset) +{ + struct zonefs_zone_append_bio *za_bio = zonefs_zone_append_bio(bio); + struct inode *inode = iter->inode; + struct zonefs_zone *z = zonefs_inode_zone(inode); + + /* + * Issue a zone append BIO to process sync dio writes. The append + * file offset is saved to check the zone append write location + * on completion of the BIO. + */ + za_bio->inode = inode; + za_bio->append_offset = file_offset; + + bio->bi_opf &= ~REQ_OP_WRITE; + bio->bi_opf |= REQ_OP_ZONE_APPEND; + bio->bi_iter.bi_sector = z->z_sector; + bio->bi_end_io = zonefs_file_zone_append_dio_bio_end_io; + + submit_bio(bio); +} + static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, int error, unsigned int flags) { @@ -372,93 +442,17 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, return 0; } -static const struct iomap_dio_ops zonefs_write_dio_ops = { - .end_io = zonefs_file_write_dio_end_io, -}; - -static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct zonefs_zone *z = zonefs_inode_zone(inode); - struct block_device *bdev = inode->i_sb->s_bdev; - unsigned int max = bdev_max_zone_append_sectors(bdev); - pgoff_t start, end; - struct bio *bio; - ssize_t size = 0; - int nr_pages; - ssize_t ret; - - max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize); - iov_iter_truncate(from, max); - - /* - * If the inode block size (zone write granularity) is smaller than the - * page size, we may be appending data belonging to the last page of the - * inode straddling inode->i_size, with that page already cached due to - * a buffered read or readahead. So make sure to invalidate that page. - * This will always be a no-op for the case where the block size is - * equal to the page size. - */ - start = iocb->ki_pos >> PAGE_SHIFT; - end = (iocb->ki_pos + iov_iter_count(from) - 1) >> PAGE_SHIFT; - if (invalidate_inode_pages2_range(inode->i_mapping, start, end)) - return -EBUSY; - - nr_pages = iov_iter_npages(from, BIO_MAX_VECS); - if (!nr_pages) - return 0; - - bio = bio_alloc(bdev, nr_pages, - REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS); - bio->bi_iter.bi_sector = z->z_sector; - bio->bi_ioprio = iocb->ki_ioprio; - if (iocb_is_dsync(iocb)) - bio->bi_opf |= REQ_FUA; - - ret = bio_iov_iter_get_pages(bio, from); - if (unlikely(ret)) - goto out_release; - - size = bio->bi_iter.bi_size; - task_io_account_write(size); - - if (iocb->ki_flags & IOCB_HIPRI) - bio_set_polled(bio, iocb); - - ret = submit_bio_wait(bio); - - /* - * If the file zone was written underneath the file system, the zone - * write pointer may not be where we expect it to be, but the zone - * append write can still succeed. So check manually that we wrote where - * we intended to, that is, at zi->i_wpoffset. - */ - if (!ret) { - sector_t wpsector = - z->z_sector + (z->z_wpoffset >> SECTOR_SHIFT); - - if (bio->bi_iter.bi_sector != wpsector) { - zonefs_warn(inode->i_sb, - "Corrupted write pointer %llu for zone at %llu\n", - bio->bi_iter.bi_sector, z->z_sector); - ret = -EIO; - } - } - - zonefs_file_write_dio_end_io(iocb, size, ret, 0); - trace_zonefs_file_dio_append(inode, size, ret); +static struct bio_set zonefs_zone_append_bio_set; -out_release: - bio_release_pages(bio, false); - bio_put(bio); - - if (ret >= 0) { - iocb->ki_pos += size; - return size; - } +static const struct iomap_dio_ops zonefs_zone_append_dio_ops = { + .submit_io = zonefs_file_zone_append_dio_submit_io, + .end_io = zonefs_file_write_dio_end_io, + .bio_set = &zonefs_zone_append_bio_set, +}; - return ret; -} +static const struct iomap_dio_ops zonefs_write_dio_ops = { + .end_io = zonefs_file_write_dio_end_io, +}; /* * Do not exceed the LFS limits nor the file zone size. If pos is under the @@ -539,6 +533,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) struct zonefs_inode_info *zi = ZONEFS_I(inode); struct zonefs_zone *z = zonefs_inode_zone(inode); struct super_block *sb = inode->i_sb; + const struct iomap_dio_ops *dio_ops; bool sync = is_sync_kiocb(iocb); bool append = false; ssize_t ret, count; @@ -582,20 +577,26 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) } if (append) { - ret = zonefs_file_dio_append(iocb, from); + unsigned int max = bdev_max_zone_append_sectors(sb->s_bdev); + + max = ALIGN_DOWN(max << SECTOR_SHIFT, sb->s_blocksize); + iov_iter_truncate(from, max); + + dio_ops = &zonefs_zone_append_dio_ops; } else { - /* - * iomap_dio_rw() may return ENOTBLK if there was an issue with - * page invalidation. Overwrite that error code with EBUSY to - * be consistent with zonefs_file_dio_append() return value for - * similar issues. - */ - ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, - &zonefs_write_dio_ops, 0, NULL, 0); - if (ret == -ENOTBLK) - ret = -EBUSY; + dio_ops = &zonefs_write_dio_ops; } + /* + * iomap_dio_rw() may return ENOTBLK if there was an issue with + * page invalidation. Overwrite that error code with EBUSY so that + * the user can make sense of the error. + */ + ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, + dio_ops, 0, NULL, 0); + if (ret == -ENOTBLK) + ret = -EBUSY; + if (zonefs_zone_is_seq(z) && (ret > 0 || ret == -EIOCBQUEUED)) { if (ret > 0) @@ -643,9 +644,7 @@ static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, goto inode_unlock; ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops); - if (ret > 0) - iocb->ki_pos += ret; - else if (ret == -EIO) + if (ret == -EIO) zonefs_io_error(inode, true); inode_unlock: @@ -752,6 +751,44 @@ inode_unlock: return ret; } +static ssize_t zonefs_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct inode *inode = file_inode(in); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_zone *z = zonefs_inode_zone(inode); + loff_t isize; + ssize_t ret = 0; + + /* Offline zones cannot be read */ + if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777))) + return -EPERM; + + if (*ppos >= z->z_capacity) + return 0; + + inode_lock_shared(inode); + + /* Limit read operations to written data */ + mutex_lock(&zi->i_truncate_mutex); + isize = i_size_read(inode); + if (*ppos >= isize) + len = 0; + else + len = min_t(loff_t, len, isize - *ppos); + mutex_unlock(&zi->i_truncate_mutex); + + if (len > 0) { + ret = filemap_splice_read(in, ppos, pipe, len, flags); + if (ret == -EIO) + zonefs_io_error(inode, false); + } + + inode_unlock_shared(inode); + return ret; +} + /* * Write open accounting is done only for sequential files. */ @@ -813,6 +850,7 @@ static int zonefs_file_open(struct inode *inode, struct file *file) { int ret; + file->f_mode |= FMODE_CAN_ODIRECT; ret = generic_file_open(inode, file); if (ret) return ret; @@ -896,7 +934,19 @@ const struct file_operations zonefs_file_operations = { .llseek = zonefs_file_llseek, .read_iter = zonefs_file_read_iter, .write_iter = zonefs_file_write_iter, - .splice_read = generic_file_splice_read, + .splice_read = zonefs_file_splice_read, .splice_write = iter_file_splice_write, .iopoll = iocb_bio_iopoll, }; + +int zonefs_file_bioset_init(void) +{ + return bioset_init(&zonefs_zone_append_bio_set, BIO_POOL_SIZE, + offsetof(struct zonefs_zone_append_bio, bio), + BIOSET_NEED_BVECS); +} + +void zonefs_file_bioset_exit(void) +{ + bioset_exit(&zonefs_zone_append_bio_set); +} diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 23b8b299c64e..bbe44a26a8e5 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -1128,7 +1128,7 @@ static int zonefs_read_super(struct super_block *sb) bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ); bio.bi_iter.bi_sector = 0; - bio_add_page(&bio, page, PAGE_SIZE, 0); + __bio_add_page(&bio, page, PAGE_SIZE, 0); ret = submit_bio_wait(&bio); if (ret) @@ -1412,10 +1412,14 @@ static int __init zonefs_init(void) BUILD_BUG_ON(sizeof(struct zonefs_super) != ZONEFS_SUPER_SIZE); - ret = zonefs_init_inodecache(); + ret = zonefs_file_bioset_init(); if (ret) return ret; + ret = zonefs_init_inodecache(); + if (ret) + goto destroy_bioset; + ret = zonefs_sysfs_init(); if (ret) goto destroy_inodecache; @@ -1430,6 +1434,8 @@ sysfs_exit: zonefs_sysfs_exit(); destroy_inodecache: zonefs_destroy_inodecache(); +destroy_bioset: + zonefs_file_bioset_exit(); return ret; } @@ -1439,6 +1445,7 @@ static void __exit zonefs_exit(void) unregister_filesystem(&zonefs_type); zonefs_sysfs_exit(); zonefs_destroy_inodecache(); + zonefs_file_bioset_exit(); } MODULE_AUTHOR("Damien Le Moal"); diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h index 8175652241b5..f663b8ebc2cb 100644 --- a/fs/zonefs/zonefs.h +++ b/fs/zonefs/zonefs.h @@ -279,6 +279,8 @@ extern const struct file_operations zonefs_dir_operations; extern const struct address_space_operations zonefs_file_aops; extern const struct file_operations zonefs_file_operations; int zonefs_file_truncate(struct inode *inode, loff_t isize); +int zonefs_file_bioset_init(void); +void zonefs_file_bioset_exit(void); /* In sysfs.c */ int zonefs_sysfs_register(struct super_block *sb); |