Diffstat (limited to 'fs')
257 files changed, 9189 insertions, 5089 deletions
@@ -70,6 +70,12 @@ struct aio_ring { struct io_event io_events[0]; }; /* 128 bytes + ring size */ +/* + * Plugging is meant to work with larger batches of IOs. If we don't + * have more than the below, then don't bother setting up a plug. + */ +#define AIO_PLUG_THRESHOLD 2 + #define AIO_RING_PAGES 8 struct kioctx_table { @@ -409,7 +415,7 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, BUG_ON(PageWriteback(old)); get_page(new); - rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1); + rc = migrate_page_move_mapping(mapping, new, old, mode, 1); if (rc != MIGRATEPAGE_SUCCESS) { put_page(new); goto out_unlock; @@ -902,7 +908,7 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr) local_irq_restore(flags); } -static bool get_reqs_available(struct kioctx *ctx) +static bool __get_reqs_available(struct kioctx *ctx) { struct kioctx_cpu *kcpu; bool ret = false; @@ -994,6 +1000,14 @@ static void user_refill_reqs_available(struct kioctx *ctx) spin_unlock_irq(&ctx->completion_lock); } +static bool get_reqs_available(struct kioctx *ctx) +{ + if (__get_reqs_available(ctx)) + return true; + user_refill_reqs_available(ctx); + return __get_reqs_available(ctx); +} + /* aio_get_req * Allocate a slot for an aio request. * Returns NULL if no requests are free. @@ -1002,24 +1016,16 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) { struct aio_kiocb *req; - if (!get_reqs_available(ctx)) { - user_refill_reqs_available(ctx); - if (!get_reqs_available(ctx)) - return NULL; - } - - req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); + req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); if (unlikely(!req)) - goto out_put; + return NULL; percpu_ref_get(&ctx->reqs); + req->ki_ctx = ctx; INIT_LIST_HEAD(&req->ki_list); refcount_set(&req->ki_refcnt, 0); - req->ki_ctx = ctx; + req->ki_eventfd = NULL; return req; -out_put: - put_reqs_available(ctx, 1); - return NULL; } static struct kioctx *lookup_ioctx(unsigned long ctx_id) @@ -1059,6 +1065,15 @@ static inline void iocb_put(struct aio_kiocb *iocb) } } +static void aio_fill_event(struct io_event *ev, struct aio_kiocb *iocb, + long res, long res2) +{ + ev->obj = (u64)(unsigned long)iocb->ki_user_iocb; + ev->data = iocb->ki_user_data; + ev->res = res; + ev->res2 = res2; +} + /* aio_complete * Called when the io request on the given iocb is complete. 
*/ @@ -1086,10 +1101,7 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2) ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); event = ev_page + pos % AIO_EVENTS_PER_PAGE; - event->obj = (u64)(unsigned long)iocb->ki_user_iocb; - event->data = iocb->ki_user_data; - event->res = res; - event->res2 = res2; + aio_fill_event(event, iocb, res, res2); kunmap_atomic(ev_page); flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); @@ -1416,7 +1428,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2) aio_complete(iocb, res, res2); } -static int aio_prep_rw(struct kiocb *req, struct iocb *iocb) +static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) { int ret; @@ -1438,21 +1450,26 @@ static int aio_prep_rw(struct kiocb *req, struct iocb *iocb) ret = ioprio_check_cap(iocb->aio_reqprio); if (ret) { pr_debug("aio ioprio check cap error: %d\n", ret); - fput(req->ki_filp); - return ret; + goto out_fput; } req->ki_ioprio = iocb->aio_reqprio; } else - req->ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); + req->ki_ioprio = get_current_ioprio(); ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); if (unlikely(ret)) - fput(req->ki_filp); + goto out_fput; + + req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */ + return 0; + +out_fput: + fput(req->ki_filp); return ret; } -static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec, +static int aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec, bool vectored, bool compat, struct iov_iter *iter) { void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf; @@ -1487,12 +1504,12 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret) ret = -EINTR; /*FALLTHRU*/ default: - aio_complete_rw(req, ret, 0); + req->ki_complete(req, ret, 0); } } -static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored, - bool compat) +static ssize_t aio_read(struct kiocb *req, const struct iocb *iocb, + bool vectored, bool compat) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; @@ -1524,8 +1541,8 @@ out_fput: return ret; } -static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored, - bool compat) +static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb, + bool vectored, bool compat) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; @@ -1580,7 +1597,8 @@ static void aio_fsync_work(struct work_struct *work) aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0); } -static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync) +static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, + bool datasync) { if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)) @@ -1708,7 +1726,7 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head, add_wait_queue(head, &pt->iocb->poll.wait); } -static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb) +static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) { struct kioctx *ctx = aiocb->ki_ctx; struct poll_iocb *req = &aiocb->poll; @@ -1728,6 +1746,10 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb) if (unlikely(!req->file)) return -EBADF; + req->head = NULL; + req->woken = false; + req->cancelled = false; + apt.pt._qproc = aio_poll_queue_proc; apt.pt._key = req->events; apt.iocb = aiocb; @@ -1776,44 +1798,44 @@ out: return 0; } -static int io_submit_one(struct 
kioctx *ctx, struct iocb __user *user_iocb, - bool compat) +static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb, + struct iocb __user *user_iocb, bool compat) { struct aio_kiocb *req; - struct iocb iocb; ssize_t ret; - if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb)))) - return -EFAULT; - /* enforce forwards compatibility on users */ - if (unlikely(iocb.aio_reserved2)) { + if (unlikely(iocb->aio_reserved2)) { pr_debug("EINVAL: reserve field set\n"); return -EINVAL; } /* prevent overflows */ if (unlikely( - (iocb.aio_buf != (unsigned long)iocb.aio_buf) || - (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) || - ((ssize_t)iocb.aio_nbytes < 0) + (iocb->aio_buf != (unsigned long)iocb->aio_buf) || + (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) || + ((ssize_t)iocb->aio_nbytes < 0) )) { pr_debug("EINVAL: overflow check\n"); return -EINVAL; } + if (!get_reqs_available(ctx)) + return -EAGAIN; + + ret = -EAGAIN; req = aio_get_req(ctx); if (unlikely(!req)) - return -EAGAIN; + goto out_put_reqs_available; - if (iocb.aio_flags & IOCB_FLAG_RESFD) { + if (iocb->aio_flags & IOCB_FLAG_RESFD) { /* * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an * instance of the file* now. The file descriptor must be * an eventfd() fd, and will be signaled for each completed * event using the eventfd_signal() function. */ - req->ki_eventfd = eventfd_ctx_fdget((int) iocb.aio_resfd); + req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd); if (IS_ERR(req->ki_eventfd)) { ret = PTR_ERR(req->ki_eventfd); req->ki_eventfd = NULL; @@ -1828,32 +1850,32 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, } req->ki_user_iocb = user_iocb; - req->ki_user_data = iocb.aio_data; + req->ki_user_data = iocb->aio_data; - switch (iocb.aio_lio_opcode) { + switch (iocb->aio_lio_opcode) { case IOCB_CMD_PREAD: - ret = aio_read(&req->rw, &iocb, false, compat); + ret = aio_read(&req->rw, iocb, false, compat); break; case IOCB_CMD_PWRITE: - ret = aio_write(&req->rw, &iocb, false, compat); + ret = aio_write(&req->rw, iocb, false, compat); break; case IOCB_CMD_PREADV: - ret = aio_read(&req->rw, &iocb, true, compat); + ret = aio_read(&req->rw, iocb, true, compat); break; case IOCB_CMD_PWRITEV: - ret = aio_write(&req->rw, &iocb, true, compat); + ret = aio_write(&req->rw, iocb, true, compat); break; case IOCB_CMD_FSYNC: - ret = aio_fsync(&req->fsync, &iocb, false); + ret = aio_fsync(&req->fsync, iocb, false); break; case IOCB_CMD_FDSYNC: - ret = aio_fsync(&req->fsync, &iocb, true); + ret = aio_fsync(&req->fsync, iocb, true); break; case IOCB_CMD_POLL: - ret = aio_poll(req, &iocb); + ret = aio_poll(req, iocb); break; default: - pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode); + pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode); ret = -EINVAL; break; } @@ -1867,14 +1889,25 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, goto out_put_req; return 0; out_put_req: - put_reqs_available(ctx, 1); - percpu_ref_put(&ctx->reqs); if (req->ki_eventfd) eventfd_ctx_put(req->ki_eventfd); - kmem_cache_free(kiocb_cachep, req); + iocb_put(req); +out_put_reqs_available: + put_reqs_available(ctx, 1); return ret; } +static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, + bool compat) +{ + struct iocb iocb; + + if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb)))) + return -EFAULT; + + return __io_submit_one(ctx, &iocb, user_iocb, compat); +} + /* sys_io_submit: * Queue the nr iocbs pointed to by iocbpp for processing. 
Returns * the number of iocbs queued. May return -EINVAL if the aio_context @@ -1907,7 +1940,8 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, if (nr > ctx->nr_events) nr = ctx->nr_events; - blk_start_plug(&plug); + if (nr > AIO_PLUG_THRESHOLD) + blk_start_plug(&plug); for (i = 0; i < nr; i++) { struct iocb __user *user_iocb; @@ -1920,7 +1954,8 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, if (ret) break; } - blk_finish_plug(&plug); + if (nr > AIO_PLUG_THRESHOLD) + blk_finish_plug(&plug); percpu_ref_put(&ctx->users); return i ? i : ret; @@ -1947,7 +1982,8 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, if (nr > ctx->nr_events) nr = ctx->nr_events; - blk_start_plug(&plug); + if (nr > AIO_PLUG_THRESHOLD) + blk_start_plug(&plug); for (i = 0; i < nr; i++) { compat_uptr_t user_iocb; @@ -1960,7 +1996,8 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, if (ret) break; } - blk_finish_plug(&plug); + if (nr > AIO_PLUG_THRESHOLD) + blk_finish_plug(&plug); percpu_ref_put(&ctx->users); return i ? i : ret; @@ -2065,11 +2102,13 @@ static long do_io_getevents(aio_context_t ctx_id, * specifies an infinite timeout. Note that the timeout pointed to by * timeout is relative. Will fail with -ENOSYS if not implemented. */ +#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT) + SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, long, min_nr, long, nr, struct io_event __user *, events, - struct timespec __user *, timeout) + struct __kernel_timespec __user *, timeout) { struct timespec64 ts; int ret; @@ -2083,6 +2122,8 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, return ret; } +#endif + struct __aio_sigset { const sigset_t __user *sigmask; size_t sigsetsize; @@ -2093,7 +2134,7 @@ SYSCALL_DEFINE6(io_pgetevents, long, min_nr, long, nr, struct io_event __user *, events, - struct timespec __user *, timeout, + struct __kernel_timespec __user *, timeout, const struct __aio_sigset __user *, usig) { struct __aio_sigset ksig = { NULL, }; @@ -2107,33 +2148,56 @@ SYSCALL_DEFINE6(io_pgetevents, if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) return -EFAULT; - if (ksig.sigmask) { - if (ksig.sigsetsize != sizeof(sigset_t)) - return -EINVAL; - if (copy_from_user(&ksigmask, ksig.sigmask, sizeof(ksigmask))) - return -EFAULT; - sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); - } + ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize); + if (ret) + return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? 
&ts : NULL); - if (signal_pending(current)) { - if (ksig.sigmask) { - current->saved_sigmask = sigsaved; - set_restore_sigmask(); - } + restore_user_sigmask(ksig.sigmask, &sigsaved); + if (signal_pending(current) && !ret) + ret = -ERESTARTNOHAND; - if (!ret) - ret = -ERESTARTNOHAND; - } else { - if (ksig.sigmask) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); - } + return ret; +} + +#if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT) + +SYSCALL_DEFINE6(io_pgetevents_time32, + aio_context_t, ctx_id, + long, min_nr, + long, nr, + struct io_event __user *, events, + struct old_timespec32 __user *, timeout, + const struct __aio_sigset __user *, usig) +{ + struct __aio_sigset ksig = { NULL, }; + sigset_t ksigmask, sigsaved; + struct timespec64 ts; + int ret; + + if (timeout && unlikely(get_old_timespec32(&ts, timeout))) + return -EFAULT; + + if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) + return -EFAULT; + + + ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize); + if (ret) + return ret; + + ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); + restore_user_sigmask(ksig.sigmask, &sigsaved); + if (signal_pending(current) && !ret) + ret = -ERESTARTNOHAND; return ret; } -#ifdef CONFIG_COMPAT +#endif + +#if defined(CONFIG_COMPAT_32BIT_TIME) + COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id, compat_long_t, min_nr, compat_long_t, nr, @@ -2152,12 +2216,17 @@ COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id, return ret; } +#endif + +#ifdef CONFIG_COMPAT struct __compat_aio_sigset { compat_sigset_t __user *sigmask; compat_size_t sigsetsize; }; +#if defined(CONFIG_COMPAT_32BIT_TIME) + COMPAT_SYSCALL_DEFINE6(io_pgetevents, compat_aio_context_t, ctx_id, compat_long_t, min_nr, @@ -2177,27 +2246,47 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents, if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) return -EFAULT; - if (ksig.sigmask) { - if (ksig.sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; - if (get_compat_sigset(&ksigmask, ksig.sigmask)) - return -EFAULT; - sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); - } + ret = set_compat_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize); + if (ret) + return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); - if (signal_pending(current)) { - if (ksig.sigmask) { - current->saved_sigmask = sigsaved; - set_restore_sigmask(); - } - if (!ret) - ret = -ERESTARTNOHAND; - } else { - if (ksig.sigmask) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); - } + restore_user_sigmask(ksig.sigmask, &sigsaved); + if (signal_pending(current) && !ret) + ret = -ERESTARTNOHAND; + + return ret; +} + +#endif + +COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64, + compat_aio_context_t, ctx_id, + compat_long_t, min_nr, + compat_long_t, nr, + struct io_event __user *, events, + struct __kernel_timespec __user *, timeout, + const struct __compat_aio_sigset __user *, usig) +{ + struct __compat_aio_sigset ksig = { NULL, }; + sigset_t ksigmask, sigsaved; + struct timespec64 t; + int ret; + + if (timeout && get_timespec64(&t, timeout)) + return -EFAULT; + + if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) + return -EFAULT; + + ret = set_compat_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize); + if (ret) + return ret; + + ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? 
&t : NULL); + restore_user_sigmask(ksig.sigmask, &sigsaved); + if (signal_pending(current) && !ret) + ret = -ERESTARTNOHAND; return ret; } diff --git a/fs/block_dev.c b/fs/block_dev.c index a80b4f0ee7c4..c546cdce77e6 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -181,7 +181,7 @@ static void blkdev_bio_end_io_simple(struct bio *bio) struct task_struct *waiter = bio->bi_private; WRITE_ONCE(bio->bi_private, NULL); - wake_up_process(waiter); + blk_wake_io_task(waiter); } static ssize_t @@ -232,6 +232,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, bio.bi_opf = dio_bio_write_op(iocb); task_io_account_write(ret); } + if (iocb->ki_flags & IOCB_HIPRI) + bio.bi_opf |= REQ_HIPRI; qc = submit_bio(&bio); for (;;) { @@ -239,7 +241,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, if (!READ_ONCE(bio.bi_private)) break; if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(bdev_get_queue(bdev), qc)) + !blk_poll(bdev_get_queue(bdev), qc, true)) io_schedule(); } __set_current_state(TASK_RUNNING); @@ -298,12 +300,13 @@ static void blkdev_bio_end_io(struct bio *bio) } dio->iocb->ki_complete(iocb, ret, 0); - bio_put(&dio->bio); + if (dio->multi_bio) + bio_put(&dio->bio); } else { struct task_struct *waiter = dio->waiter; WRITE_ONCE(dio->waiter, NULL); - wake_up_process(waiter); + blk_wake_io_task(waiter); } } @@ -328,6 +331,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) struct blk_plug plug; struct blkdev_dio *dio; struct bio *bio; + bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0; bool is_read = (iov_iter_rw(iter) == READ), is_sync; loff_t pos = iocb->ki_pos; blk_qc_t qc = BLK_QC_T_NONE; @@ -338,20 +342,27 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) return -EINVAL; bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool); - bio_get(bio); /* extra ref for the completion handler */ dio = container_of(bio, struct blkdev_dio, bio); dio->is_sync = is_sync = is_sync_kiocb(iocb); - if (dio->is_sync) + if (dio->is_sync) { dio->waiter = current; - else + bio_get(bio); + } else { dio->iocb = iocb; + } dio->size = 0; dio->multi_bio = false; dio->should_dirty = is_read && iter_is_iovec(iter); - blk_start_plug(&plug); + /* + * Don't plug for HIPRI/polled IO, as those should go straight + * to issue + */ + if (!is_poll) + blk_start_plug(&plug); + for (;;) { bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = pos >> 9; @@ -381,11 +392,21 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES); if (!nr_pages) { + if (iocb->ki_flags & IOCB_HIPRI) + bio->bi_opf |= REQ_HIPRI; + qc = submit_bio(bio); break; } if (!dio->multi_bio) { + /* + * AIO needs an extra reference to ensure the dio + * structure which is embedded into the first bio + * stays around. 
+ */ + if (!is_sync) + bio_get(bio); dio->multi_bio = true; atomic_set(&dio->ref, 2); } else { @@ -395,7 +416,9 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) submit_bio(bio); bio = bio_alloc(GFP_KERNEL, nr_pages); } - blk_finish_plug(&plug); + + if (!is_poll) + blk_finish_plug(&plug); if (!is_sync) return -EIOCBQUEUED; @@ -406,7 +429,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) break; if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(bdev_get_queue(bdev), qc)) + !blk_poll(bdev_get_queue(bdev), qc, true)) io_schedule(); } __set_current_state(TASK_RUNNING); @@ -1966,6 +1989,7 @@ static const struct address_space_operations def_blk_aops = { .writepages = blkdev_writepages, .releasepage = blkdev_releasepage, .direct_IO = blkdev_direct_IO, + .migratepage = buffer_migrate_page_norefs, .is_dirty_writeback = buffer_check_dirty_writeback, }; diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 68ebe188446a..78556447e1d5 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -591,7 +591,7 @@ unode_aux_to_inode_list(struct ulist_node *node) } /* - * We maintain three seperate rbtrees: one for direct refs, one for + * We maintain three separate rbtrees: one for direct refs, one for * indirect refs which have a key, and one for indirect refs which do not * have a key. Each tree does merge on insertion. * @@ -695,7 +695,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, } /* - * Now it's a direct ref, put it in the the direct tree. We must + * Now it's a direct ref, put it in the direct tree. We must * do this last because the ref could be merged/freed here. */ prelim_ref_insert(fs_info, &preftrees->direct, ref, NULL); @@ -2020,9 +2020,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, ret = -ENOMEM; break; } - extent_buffer_get(eb); - btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); btrfs_release_path(path); item = btrfs_item_nr(slot); @@ -2042,7 +2039,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, len = sizeof(*iref) + name_len; iref = (struct btrfs_inode_ref *)((char *)iref + len); } - btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); } @@ -2083,10 +2079,6 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, ret = -ENOMEM; break; } - extent_buffer_get(eb); - - btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); btrfs_release_path(path); item_size = btrfs_item_size_nr(eb, slot); @@ -2107,7 +2099,6 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, cur_offset += btrfs_inode_extref_name_len(eb, extref); cur_offset += sizeof(*extref); } - btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); offset++; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 97d91e55b70a..6f5d07415dab 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -20,7 +20,7 @@ * new data the application may have written before commit. */ enum { - BTRFS_INODE_ORDERED_DATA_CLOSE = 0, + BTRFS_INODE_ORDERED_DATA_CLOSE, BTRFS_INODE_DUMMY, BTRFS_INODE_IN_DEFRAG, BTRFS_INODE_HAS_ASYNC_EXTENT, @@ -29,6 +29,7 @@ enum { BTRFS_INODE_IN_DELALLOC_LIST, BTRFS_INODE_READDIO_NEED_LOCK, BTRFS_INODE_HAS_PROPS, + BTRFS_INODE_SNAPSHOT_FLUSH, }; /* in memory btrfs inode */ @@ -147,6 +148,12 @@ struct btrfs_inode { u64 last_unlink_trans; /* + * Track the transaction id of the last transaction used to create a + * hard link for the inode. 
This is used by the log tree (fsync). + */ + u64 last_link_trans; + + /* * Number of bytes outstanding that are going to need csums. This is * used in ENOSPC accounting. */ @@ -253,6 +260,11 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode) return false; } +static inline bool is_data_inode(struct inode *inode) +{ + return btrfs_ino(BTRFS_I(inode)) != BTRFS_BTREE_INODE_OBJECTID; +} + static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode, int mod) { diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 2e43fba44035..b0c8094528d1 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1202,24 +1202,24 @@ static void btrfsic_read_from_block_data( void *dstv, u32 offset, size_t len) { size_t cur; - size_t offset_in_page; + size_t pgoff; char *kaddr; char *dst = (char *)dstv; - size_t start_offset = block_ctx->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(block_ctx->start); unsigned long i = (start_offset + offset) >> PAGE_SHIFT; WARN_ON(offset + len > block_ctx->len); - offset_in_page = (start_offset + offset) & (PAGE_SIZE - 1); + pgoff = offset_in_page(start_offset + offset); while (len > 0) { - cur = min(len, ((size_t)PAGE_SIZE - offset_in_page)); + cur = min(len, ((size_t)PAGE_SIZE - pgoff)); BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_SIZE)); kaddr = block_ctx->datav[i]; - memcpy(dst, kaddr + offset_in_page, cur); + memcpy(dst, kaddr + pgoff, cur); dst += cur; len -= cur; - offset_in_page = 0; + pgoff = 0; i++; } } @@ -1601,7 +1601,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, BUG_ON(block_ctx->datav); BUG_ON(block_ctx->pagev); BUG_ON(block_ctx->mem_to_free); - if (block_ctx->dev_bytenr & ((u64)PAGE_SIZE - 1)) { + if (!PAGE_ALIGNED(block_ctx->dev_bytenr)) { pr_info("btrfsic: read_block() with unaligned bytenr %llu\n", block_ctx->dev_bytenr); return -1; @@ -1720,7 +1720,7 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state, num_pages = state->metablock_size >> PAGE_SHIFT; h = (struct btrfs_header *)datav[0]; - if (memcmp(h->fsid, fs_info->fsid, BTRFS_FSID_SIZE)) + if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE)) return 1; for (i = 0; i < num_pages; i++) { @@ -1778,7 +1778,7 @@ again: return; } is_metadata = 1; - BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_SIZE - 1)); + BUG_ON(!PAGE_ALIGNED(BTRFS_SUPER_INFO_SIZE)); processed_len = BTRFS_SUPER_INFO_SIZE; if (state->print_mask & BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { @@ -2327,7 +2327,7 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, * write operations. Therefore it keeps the linkage * information for a block until a block is * rewritten. This can temporarily cause incorrect - * and even circular linkage informations. This + * and even circular linkage information. This * causes no harm unless such blocks are referenced * by the most recent super block. 
*/ @@ -2892,12 +2892,12 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info, struct list_head *dev_head = &fs_devices->devices; struct btrfs_device *device; - if (fs_info->nodesize & ((u64)PAGE_SIZE - 1)) { + if (!PAGE_ALIGNED(fs_info->nodesize)) { pr_info("btrfsic: cannot handle nodesize %d not being a multiple of PAGE_SIZE %ld!\n", fs_info->nodesize, PAGE_SIZE); return -1; } - if (fs_info->sectorsize & ((u64)PAGE_SIZE - 1)) { + if (!PAGE_ALIGNED(fs_info->sectorsize)) { pr_info("btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_SIZE %ld!\n", fs_info->sectorsize, PAGE_SIZE); return -1; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 2955a4ea2fa8..548057630b69 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -229,7 +229,6 @@ static noinline void end_compressed_writeback(struct inode *inode, */ static void end_compressed_bio_write(struct bio *bio) { - struct extent_io_tree *tree; struct compressed_bio *cb = bio->bi_private; struct inode *inode; struct page *page; @@ -248,14 +247,10 @@ static void end_compressed_bio_write(struct bio *bio) * call back into the FS and do all the end_io operations */ inode = cb->inode; - tree = &BTRFS_I(inode)->io_tree; cb->compressed_pages[0]->mapping = cb->inode->i_mapping; - tree->ops->writepage_end_io_hook(cb->compressed_pages[0], - cb->start, - cb->start + cb->len - 1, - NULL, - bio->bi_status ? - BLK_STS_OK : BLK_STS_NOTSUPP); + btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0], + cb->start, cb->start + cb->len - 1, + bio->bi_status ? BLK_STS_OK : BLK_STS_NOTSUPP); cb->compressed_pages[0]->mapping = NULL; end_compressed_writeback(inode, cb); @@ -306,7 +301,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, blk_status_t ret; int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - WARN_ON(start & ((u64)PAGE_SIZE - 1)); + WARN_ON(!PAGE_ALIGNED(start)); cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); if (!cb) return BLK_STS_RESOURCE; @@ -337,7 +332,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, page = compressed_pages[pg_index]; page->mapping = inode->i_mapping; if (bio->bi_iter.bi_size) - submit = btrfs_merge_bio_hook(page, 0, PAGE_SIZE, bio, 0); + submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio, + 0); page->mapping = NULL; if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) < @@ -481,7 +477,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (page->index == end_index) { char *userpage; - size_t zero_offset = isize & (PAGE_SIZE - 1); + size_t zero_offset = offset_in_page(isize); if (zero_offset) { int zeros; @@ -615,8 +611,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, page->index = em_start >> PAGE_SHIFT; if (comp_bio->bi_iter.bi_size) - submit = btrfs_merge_bio_hook(page, 0, PAGE_SIZE, - comp_bio, 0); + submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, + comp_bio, 0); page->mapping = NULL; if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < @@ -1207,7 +1203,7 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, /* * Shannon Entropy calculation * - * Pure byte distribution analysis fails to determine compressiability of data. + * Pure byte distribution analysis fails to determine compressibility of data. * Try calculating entropy to estimate the average minimum number of bits * needed to encode the sampled data. 
* @@ -1271,7 +1267,7 @@ static u8 get4bits(u64 num, int shift) { /* * Use 4 bits as radix base - * Use 16 u32 counters for calculating new possition in buf array + * Use 16 u32 counters for calculating new position in buf array * * @array - array that will be sorted * @array_buf - buffer array to store sorting results diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 539901fb5165..d92462fe66c8 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -12,6 +12,7 @@ #include "transaction.h" #include "print-tree.h" #include "locking.h" +#include "volumes.h" static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level); @@ -224,7 +225,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, else btrfs_set_header_owner(cow, new_root_objectid); - write_extent_buffer_fsid(cow, fs_info->fsid); + write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); WARN_ON(btrfs_header_generation(buf) > trans->transid); if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) @@ -1050,7 +1051,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, else btrfs_set_header_owner(cow, root->root_key.objectid); - write_extent_buffer_fsid(cow, fs_info->fsid); + write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid); ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); if (ret) { @@ -1290,7 +1291,6 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); - extent_buffer_get(eb_rewin); btrfs_tree_read_lock(eb_rewin); __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm); WARN_ON(btrfs_header_nritems(eb_rewin) > @@ -1362,7 +1362,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq) if (!eb) return NULL; - extent_buffer_get(eb); btrfs_tree_read_lock(eb); if (old_root) { btrfs_set_header_bytenr(eb, eb->start); @@ -1415,7 +1414,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, * * What is forced COW: * when we create snapshot during committing the transaction, - * after we've finished coping src root, we must COW the shared + * after we've finished copying src root, we must COW the shared * block to ensure the metadata consistency. */ if (btrfs_header_generation(buf) == trans->transid && @@ -1441,6 +1440,10 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, u64 search_start; int ret; + if (test_bit(BTRFS_ROOT_DELETING, &root->state)) + btrfs_err(fs_info, + "COW'ing blocks on a fs root that's being dropped"); + if (trans->transaction != fs_info->running_transaction) WARN(1, KERN_CRIT "trans %llu running %llu\n", trans->transid, @@ -2584,14 +2587,27 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, root_lock = BTRFS_READ_LOCK; if (p->search_commit_root) { - /* The commit roots are read only so we always do read locks */ - if (p->need_commit_sem) + /* + * The commit roots are read only so we always do read locks, + * and we always must hold the commit_root_sem when doing + * searches on them, the only exception is send where we don't + * want to block transaction commits for a long time, so + * we need to clone the commit root in order to avoid races + * with transaction commits that create a snapshot of one of + * the roots used by a send operation. 
+ */ + if (p->need_commit_sem) { down_read(&fs_info->commit_root_sem); - b = root->commit_root; - extent_buffer_get(b); - level = btrfs_header_level(b); - if (p->need_commit_sem) + b = btrfs_clone_extent_buffer(root->commit_root); up_read(&fs_info->commit_root_sem); + if (!b) + return ERR_PTR(-ENOMEM); + + } else { + b = root->commit_root; + extent_buffer_get(b); + } + level = btrfs_header_level(b); /* * Ensure that all callers have set skip_locking when * p->search_commit_root = 1. @@ -2717,6 +2733,10 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, again: prev_cmp = -1; b = btrfs_search_slot_get_root(root, p, write_lock_level); + if (IS_ERR(b)) { + ret = PTR_ERR(b); + goto done; + } while (b) { level = btrfs_header_level(b); @@ -3751,7 +3771,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root /* Key greater than all keys in the leaf, right neighbor has * enough room for it and we're not emptying our leaf to delete * it, therefore use right neighbor to insert the new item and - * no need to touch/dirty our left leaft. */ + * no need to touch/dirty our left leaf. */ btrfs_tree_unlock(left); free_extent_buffer(left); path->nodes[0] = right; @@ -5390,7 +5410,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root, ret = -ENOMEM; goto out; } - extent_buffer_get(left_path->nodes[left_level]); right_level = btrfs_header_level(right_root->commit_root); right_root_level = right_level; @@ -5401,7 +5420,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root, ret = -ENOMEM; goto out; } - extent_buffer_get(right_path->nodes[right_level]); up_read(&fs_info->commit_root_sem); if (left_level == 0) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 68f322f600a0..f031a447a047 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -109,13 +109,26 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) } /* - * File system states + * Runtime (in-memory) states of filesystem */ -#define BTRFS_FS_STATE_ERROR 0 -#define BTRFS_FS_STATE_REMOUNTING 1 -#define BTRFS_FS_STATE_TRANS_ABORTED 2 -#define BTRFS_FS_STATE_DEV_REPLACING 3 -#define BTRFS_FS_STATE_DUMMY_FS_INFO 4 +enum { + /* Global indicator of serious filesystem errors */ + BTRFS_FS_STATE_ERROR, + /* + * Filesystem is being remounted, allow to skip some operations, like + * defrag + */ + BTRFS_FS_STATE_REMOUNTING, + /* Track if a transaction abort has been reported on this filesystem */ + BTRFS_FS_STATE_TRANS_ABORTED, + /* + * Bio operations should be blocked on this filesystem because a source + * or target device is being destroyed as part of a device replace + */ + BTRFS_FS_STATE_DEV_REPLACING, + /* The btrfs_fs_info created for self-tests */ + BTRFS_FS_STATE_DUMMY_FS_INFO, +}; #define BTRFS_BACKREF_REV_MAX 256 #define BTRFS_BACKREF_REV_SHIFT 56 @@ -195,9 +208,10 @@ struct btrfs_root_backup { * it currently lacks any block count etc etc */ struct btrfs_super_block { - u8 csum[BTRFS_CSUM_SIZE]; /* the first 4 fields must match struct btrfs_header */ - u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + u8 csum[BTRFS_CSUM_SIZE]; + /* FS specific UUID, visible to user */ + u8 fsid[BTRFS_FSID_SIZE]; __le64 bytenr; /* this block number */ __le64 flags; @@ -234,8 +248,11 @@ struct btrfs_super_block { __le64 cache_generation; __le64 uuid_tree_generation; + /* the UUID written into btree blocks */ + u8 metadata_uuid[BTRFS_FSID_SIZE]; + /* future expansion */ - __le64 reserved[30]; + __le64 reserved[28]; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; struct 
btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; } __attribute__ ((__packed__)); @@ -265,7 +282,8 @@ struct btrfs_super_block { BTRFS_FEATURE_INCOMPAT_RAID56 | \ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ - BTRFS_FEATURE_INCOMPAT_NO_HOLES) + BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ + BTRFS_FEATURE_INCOMPAT_METADATA_UUID) #define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) @@ -316,7 +334,7 @@ struct btrfs_node { * The slots array records the index of the item or block pointer * used while walking the tree. */ -enum { READA_NONE = 0, READA_BACK, READA_FORWARD }; +enum { READA_NONE, READA_BACK, READA_FORWARD }; struct btrfs_path { struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; int slots[BTRFS_MAX_LEVEL]; @@ -360,9 +378,7 @@ struct btrfs_dev_replace { struct btrfs_device *tgtdev; struct mutex lock_finishing_cancel_unmount; - rwlock_t lock; - atomic_t blocking_readers; - wait_queue_head_t read_lock_wq; + struct rw_semaphore rwsem; struct btrfs_scrub_progress scrub_progress; @@ -443,13 +459,19 @@ struct btrfs_space_info { struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES]; }; -#define BTRFS_BLOCK_RSV_GLOBAL 1 -#define BTRFS_BLOCK_RSV_DELALLOC 2 -#define BTRFS_BLOCK_RSV_TRANS 3 -#define BTRFS_BLOCK_RSV_CHUNK 4 -#define BTRFS_BLOCK_RSV_DELOPS 5 -#define BTRFS_BLOCK_RSV_EMPTY 6 -#define BTRFS_BLOCK_RSV_TEMP 7 +/* + * Types of block reserves + */ +enum { + BTRFS_BLOCK_RSV_GLOBAL, + BTRFS_BLOCK_RSV_DELALLOC, + BTRFS_BLOCK_RSV_TRANS, + BTRFS_BLOCK_RSV_CHUNK, + BTRFS_BLOCK_RSV_DELOPS, + BTRFS_BLOCK_RSV_DELREFS, + BTRFS_BLOCK_RSV_EMPTY, + BTRFS_BLOCK_RSV_TEMP, +}; struct btrfs_block_rsv { u64 size; @@ -509,18 +531,18 @@ struct btrfs_free_cluster { }; enum btrfs_caching_type { - BTRFS_CACHE_NO = 0, - BTRFS_CACHE_STARTED = 1, - BTRFS_CACHE_FAST = 2, - BTRFS_CACHE_FINISHED = 3, - BTRFS_CACHE_ERROR = 4, + BTRFS_CACHE_NO, + BTRFS_CACHE_STARTED, + BTRFS_CACHE_FAST, + BTRFS_CACHE_FINISHED, + BTRFS_CACHE_ERROR, }; enum btrfs_disk_cache_state { - BTRFS_DC_WRITTEN = 0, - BTRFS_DC_ERROR = 1, - BTRFS_DC_CLEAR = 2, - BTRFS_DC_SETUP = 3, + BTRFS_DC_WRITTEN, + BTRFS_DC_ERROR, + BTRFS_DC_CLEAR, + BTRFS_DC_SETUP, }; struct btrfs_caching_control { @@ -712,41 +734,61 @@ struct btrfs_fs_devices; struct btrfs_balance_control; struct btrfs_delayed_root; -#define BTRFS_FS_BARRIER 1 -#define BTRFS_FS_CLOSING_START 2 -#define BTRFS_FS_CLOSING_DONE 3 -#define BTRFS_FS_LOG_RECOVERING 4 -#define BTRFS_FS_OPEN 5 -#define BTRFS_FS_QUOTA_ENABLED 6 -#define BTRFS_FS_UPDATE_UUID_TREE_GEN 9 -#define BTRFS_FS_CREATING_FREE_SPACE_TREE 10 -#define BTRFS_FS_BTREE_ERR 11 -#define BTRFS_FS_LOG1_ERR 12 -#define BTRFS_FS_LOG2_ERR 13 -#define BTRFS_FS_QUOTA_OVERRIDE 14 -/* Used to record internally whether fs has been frozen */ -#define BTRFS_FS_FROZEN 15 - -/* - * Indicate that a whole-filesystem exclusive operation is running - * (device replace, resize, device add/delete, balance) - */ -#define BTRFS_FS_EXCL_OP 16 - /* - * To info transaction_kthread we need an immediate commit so it doesn't - * need to wait for commit_interval + * Block group or device which contains an active swapfile. Used for preventing + * unsafe operations while a swapfile is active. + * + * These are sorted on (ptr, inode) (note that a block group or device can + * contain more than one swapfile). We compare the pointer values because we + * don't actually care what the object is, we just need a quick check whether + * the object exists in the rbtree. 
*/ -#define BTRFS_FS_NEED_ASYNC_COMMIT 17 +struct btrfs_swapfile_pin { + struct rb_node node; + void *ptr; + struct inode *inode; + /* + * If true, ptr points to a struct btrfs_block_group_cache. Otherwise, + * ptr points to a struct btrfs_device. + */ + bool is_block_group; +}; -/* - * Indicate that balance has been set up from the ioctl and is in the main - * phase. The fs_info::balance_ctl is initialized. - */ -#define BTRFS_FS_BALANCE_RUNNING 18 +bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); + +enum { + BTRFS_FS_BARRIER, + BTRFS_FS_CLOSING_START, + BTRFS_FS_CLOSING_DONE, + BTRFS_FS_LOG_RECOVERING, + BTRFS_FS_OPEN, + BTRFS_FS_QUOTA_ENABLED, + BTRFS_FS_UPDATE_UUID_TREE_GEN, + BTRFS_FS_CREATING_FREE_SPACE_TREE, + BTRFS_FS_BTREE_ERR, + BTRFS_FS_LOG1_ERR, + BTRFS_FS_LOG2_ERR, + BTRFS_FS_QUOTA_OVERRIDE, + /* Used to record internally whether fs has been frozen */ + BTRFS_FS_FROZEN, + /* + * Indicate that a whole-filesystem exclusive operation is running + * (device replace, resize, device add/delete, balance) + */ + BTRFS_FS_EXCL_OP, + /* + * To info transaction_kthread we need an immediate commit so it + * doesn't need to wait for commit_interval + */ + BTRFS_FS_NEED_ASYNC_COMMIT, + /* + * Indicate that balance has been set up from the ioctl and is in the + * main phase. The fs_info::balance_ctl is initialized. + */ + BTRFS_FS_BALANCE_RUNNING, +}; struct btrfs_fs_info { - u8 fsid[BTRFS_FSID_SIZE]; u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; unsigned long flags; struct btrfs_root *extent_root; @@ -790,6 +832,8 @@ struct btrfs_fs_info { struct btrfs_block_rsv chunk_block_rsv; /* block reservation for delayed operations */ struct btrfs_block_rsv delayed_block_rsv; + /* block reservation for delayed refs */ + struct btrfs_block_rsv delayed_refs_rsv; struct btrfs_block_rsv empty_block_rsv; @@ -1114,6 +1158,10 @@ struct btrfs_fs_info { u32 sectorsize; u32 stripesize; + /* Block groups and devices containing active swapfiles. */ + spinlock_t swapfile_pins_lock; + struct rb_root swapfile_pins; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; struct rb_root block_tree; @@ -1133,22 +1181,24 @@ struct btrfs_subvolume_writers { /* * The state of btrfs root */ -/* - * btrfs_record_root_in_trans is a multi-step process, - * and it can race with the balancing code. But the - * race is very small, and only the first time the root - * is added to each transaction. So IN_TRANS_SETUP - * is used to tell us when more checks are required - */ -#define BTRFS_ROOT_IN_TRANS_SETUP 0 -#define BTRFS_ROOT_REF_COWS 1 -#define BTRFS_ROOT_TRACK_DIRTY 2 -#define BTRFS_ROOT_IN_RADIX 3 -#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 4 -#define BTRFS_ROOT_DEFRAG_RUNNING 5 -#define BTRFS_ROOT_FORCE_COW 6 -#define BTRFS_ROOT_MULTI_LOG_TASKS 7 -#define BTRFS_ROOT_DIRTY 8 +enum { + /* + * btrfs_record_root_in_trans is a multi-step process, and it can race + * with the balancing code. But the race is very small, and only the + * first time the root is added to each transaction. So IN_TRANS_SETUP + * is used to tell us when more checks are required + */ + BTRFS_ROOT_IN_TRANS_SETUP, + BTRFS_ROOT_REF_COWS, + BTRFS_ROOT_TRACK_DIRTY, + BTRFS_ROOT_IN_RADIX, + BTRFS_ROOT_ORPHAN_ITEM_INSERTED, + BTRFS_ROOT_DEFRAG_RUNNING, + BTRFS_ROOT_FORCE_COW, + BTRFS_ROOT_MULTI_LOG_TASKS, + BTRFS_ROOT_DIRTY, + BTRFS_ROOT_DELETING, +}; /* * in ram representation of the tree. 
extent_root is used for all allocations @@ -1274,6 +1324,9 @@ struct btrfs_root { u64 qgroup_meta_rsv_pertrans; u64 qgroup_meta_rsv_prealloc; + /* Number of active swapfiles */ + atomic_t nr_swapfiles; + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS u64 alloc_bytenr; #endif @@ -2570,10 +2623,10 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) /* extent-tree.c */ enum btrfs_inline_ref_type { - BTRFS_REF_TYPE_INVALID = 0, - BTRFS_REF_TYPE_BLOCK = 1, - BTRFS_REF_TYPE_DATA = 2, - BTRFS_REF_TYPE_ANY = 3, + BTRFS_REF_TYPE_INVALID, + BTRFS_REF_TYPE_BLOCK, + BTRFS_REF_TYPE_DATA, + BTRFS_REF_TYPE_ANY, }; int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, @@ -2599,7 +2652,7 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info, } int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans); -int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans); +bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, const u64 start); void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); @@ -2713,10 +2766,12 @@ enum btrfs_reserve_flush_enum { enum btrfs_flush_state { FLUSH_DELAYED_ITEMS_NR = 1, FLUSH_DELAYED_ITEMS = 2, - FLUSH_DELALLOC = 3, - FLUSH_DELALLOC_WAIT = 4, - ALLOC_CHUNK = 5, - COMMIT_TRANS = 6, + FLUSH_DELAYED_REFS_NR = 3, + FLUSH_DELAYED_REFS = 4, + FLUSH_DELALLOC = 5, + FLUSH_DELALLOC_WAIT = 6, + ALLOC_CHUNK = 7, + COMMIT_TRANS = 8, }; int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); @@ -2767,6 +2822,13 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 num_bytes); +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr); +void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); +int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, + enum btrfs_reserve_flush_enum flush); +void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *src, + u64 num_bytes); int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache); void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); @@ -3141,7 +3203,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct inode *inode, u64 new_size, u32 min_type); -int btrfs_start_delalloc_inodes(struct btrfs_root *root); +int btrfs_start_delalloc_snapshot(struct btrfs_root *root); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, unsigned int extra_bits, @@ -3150,9 +3212,16 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, struct btrfs_root *parent_root, u64 new_dirid); -int btrfs_merge_bio_hook(struct page *page, unsigned long offset, - size_t size, struct bio *bio, - unsigned long bio_flags); + void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, + unsigned *bits); +void btrfs_clear_delalloc_extent(struct inode *inode, + struct extent_state *state, unsigned *bits); +void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, + struct extent_state *other); +void btrfs_split_delalloc_extent(struct inode *inode, + struct extent_state *orig, u64 split); +int 
btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, + unsigned long bio_flags); void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end); vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); @@ -3189,6 +3258,12 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, struct btrfs_trans_handle *trans, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); +int btrfs_run_delalloc_range(void *private_data, struct page *locked_page, + u64 start, u64 end, int *page_started, unsigned long *nr_written, + struct writeback_control *wbc); +int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end); +void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, + u64 end, int uptodate); extern const struct dentry_operations btrfs_dentry_operations; /* ioctl.c */ @@ -3428,6 +3503,16 @@ static inline void assfail(const char *expr, const char *file, int line) #define ASSERT(expr) ((void)0) #endif +/* + * Use that for functions that are conditionally exported for sanity tests but + * otherwise static + */ +#ifndef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +#define EXPORT_FOR_TESTS static +#else +#define EXPORT_FOR_TESTS +#endif + __cold static inline void btrfs_print_v0_err(struct btrfs_fs_info *fs_info) { diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 9301b3ad9217..cad36c99a483 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -251,8 +251,6 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, ref->in_tree = 0; btrfs_put_delayed_ref(ref); atomic_dec(&delayed_refs->num_entries); - if (trans->delayed_ref_updates) - trans->delayed_ref_updates--; } static bool merge_ref(struct btrfs_trans_handle *trans, @@ -400,6 +398,20 @@ again: return head; } +void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) +{ + lockdep_assert_held(&delayed_refs->lock); + lockdep_assert_held(&head->lock); + + rb_erase_cached(&head->href_node, &delayed_refs->href_root); + RB_CLEAR_NODE(&head->href_node); + atomic_dec(&delayed_refs->num_entries); + delayed_refs->num_heads--; + if (head->processing == 0) + delayed_refs->num_heads_ready--; +} + /* * Helper to insert the ref_node to the tail or merge with tail. * @@ -453,7 +465,6 @@ inserted: if (ref->action == BTRFS_ADD_DELAYED_REF) list_add_tail(&ref->add_list, &href->ref_add_list); atomic_inc(&root->num_entries); - trans->delayed_ref_updates++; spin_unlock(&href->lock); return ret; } @@ -462,12 +473,14 @@ inserted: * helper function to update the accounting in the head ref * existing and update must have the same bytenr */ -static noinline void -update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, +static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *existing, struct btrfs_delayed_ref_head *update, int *old_ref_mod_ret) { + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + struct btrfs_fs_info *fs_info = trans->fs_info; int old_ref_mod; BUG_ON(existing->is_data != update->is_data); @@ -525,10 +538,18 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, * versa we need to make sure to adjust pending_csums accordingly. 
*/ if (existing->is_data) { - if (existing->total_ref_mod >= 0 && old_ref_mod < 0) + u64 csum_leaves = + btrfs_csum_bytes_to_leaves(fs_info, + existing->num_bytes); + + if (existing->total_ref_mod >= 0 && old_ref_mod < 0) { delayed_refs->pending_csums -= existing->num_bytes; - if (existing->total_ref_mod < 0 && old_ref_mod >= 0) + btrfs_delayed_refs_rsv_release(fs_info, csum_leaves); + } + if (existing->total_ref_mod < 0 && old_ref_mod >= 0) { delayed_refs->pending_csums += existing->num_bytes; + trans->delayed_ref_updates += csum_leaves; + } } spin_unlock(&existing->lock); } @@ -634,7 +655,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, && head_ref->qgroup_reserved && existing->qgroup_ref_root && existing->qgroup_reserved); - update_existing_head_ref(delayed_refs, existing, head_ref, + update_existing_head_ref(trans, existing, head_ref, old_ref_mod); /* * we've updated the existing ref, free the newly @@ -645,8 +666,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, } else { if (old_ref_mod) *old_ref_mod = 0; - if (head_ref->is_data && head_ref->ref_mod < 0) + if (head_ref->is_data && head_ref->ref_mod < 0) { delayed_refs->pending_csums += head_ref->num_bytes; + trans->delayed_ref_updates += + btrfs_csum_bytes_to_leaves(trans->fs_info, + head_ref->num_bytes); + } delayed_refs->num_heads++; delayed_refs->num_heads_ready++; atomic_inc(&delayed_refs->num_entries); @@ -782,6 +807,12 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); + /* + * Need to update the delayed_refs_rsv with any changes we may have + * made. + */ + btrfs_update_delayed_refs_rsv(trans); + trace_add_delayed_tree_ref(fs_info, &ref->node, ref, action == BTRFS_ADD_DELAYED_EXTENT ? BTRFS_ADD_DELAYED_REF : action); @@ -863,6 +894,12 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); + /* + * Need to update the delayed_refs_rsv with any changes we may have + * made. + */ + btrfs_update_delayed_refs_rsv(trans); + trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref, action == BTRFS_ADD_DELAYED_EXTENT ? BTRFS_ADD_DELAYED_REF : action); @@ -899,6 +936,12 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, NULL, NULL, NULL); spin_unlock(&delayed_refs->lock); + + /* + * Need to update the delayed_refs_rsv with any changes we may have + * made. 
+ */ + btrfs_update_delayed_refs_rsv(trans); return 0; } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 8e20c5cb5404..d2af974f68a1 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -261,7 +261,8 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) { mutex_unlock(&head->mutex); } - +void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); struct btrfs_delayed_ref_head *btrfs_select_ref_head( struct btrfs_delayed_ref_root *delayed_refs); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 2aa48aecc52b..8750c835f535 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -59,7 +59,6 @@ no_valid_dev_replace_entry_found: BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED; dev_replace->cont_reading_from_srcdev_mode = BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS; - dev_replace->replace_state = 0; dev_replace->time_started = 0; dev_replace->time_stopped = 0; atomic64_set(&dev_replace->num_write_errors, 0); @@ -285,13 +284,13 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, struct btrfs_dev_replace_item *ptr; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - btrfs_dev_replace_read_lock(dev_replace); + down_read(&dev_replace->rwsem); if (!dev_replace->is_valid || !dev_replace->item_needs_writeback) { - btrfs_dev_replace_read_unlock(dev_replace); + up_read(&dev_replace->rwsem); return 0; } - btrfs_dev_replace_read_unlock(dev_replace); + up_read(&dev_replace->rwsem); key.objectid = 0; key.type = BTRFS_DEV_REPLACE_KEY; @@ -349,7 +348,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_replace_item); - btrfs_dev_replace_write_lock(dev_replace); + down_write(&dev_replace->rwsem); if (dev_replace->srcdev) btrfs_set_dev_replace_src_devid(eb, ptr, dev_replace->srcdev->devid); @@ -372,7 +371,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, btrfs_set_dev_replace_cursor_right(eb, ptr, dev_replace->cursor_right); dev_replace->item_needs_writeback = 0; - btrfs_dev_replace_write_unlock(dev_replace); + up_write(&dev_replace->rwsem); btrfs_mark_buffer_dirty(eb); @@ -390,7 +389,7 @@ static char* btrfs_dev_name(struct btrfs_device *device) return rcu_str_deref(device->name); } -int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, +static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, const char *tgtdev_name, u64 srcdevid, const char *srcdev_name, int read_src) { @@ -407,6 +406,13 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, if (IS_ERR(src_device)) return PTR_ERR(src_device); + if (btrfs_pinned_by_swapfile(fs_info, src_device)) { + btrfs_warn_in_rcu(fs_info, + "cannot replace device %s (devid %llu) due to active swapfile", + btrfs_dev_name(src_device), src_device->devid); + return -ETXTBSY; + } + ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name, src_device, &tgt_device); if (ret) @@ -426,7 +432,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, } need_unlock = true; - btrfs_dev_replace_write_lock(dev_replace); + down_write(&dev_replace->rwsem); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: @@ -464,7 +470,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, dev_replace->item_needs_writeback = 1; atomic64_set(&dev_replace->num_write_errors, 0); 
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); - btrfs_dev_replace_write_unlock(dev_replace); + up_write(&dev_replace->rwsem); need_unlock = false; ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); @@ -478,7 +484,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, if (IS_ERR(trans)) { ret = PTR_ERR(trans); need_unlock = true; - btrfs_dev_replace_write_lock(dev_replace); + down_write(&dev_replace->rwsem); dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; dev_replace->srcdev = NULL; @@ -497,7 +503,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, ret = btrfs_dev_replace_finishing(fs_info, ret); if (ret == -EINPROGRESS) { ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; - } else { + } else if (ret != -ECANCELED) { WARN_ON(ret); } @@ -505,7 +511,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, leave: if (need_unlock) - btrfs_dev_replace_write_unlock(dev_replace); + up_write(&dev_replace->rwsem); btrfs_destroy_dev_replace_tgtdev(tgt_device); return ret; } @@ -533,8 +539,9 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, args->start.cont_reading_from_srcdev_mode); args->result = ret; /* don't warn if EINPROGRESS, someone else might be running scrub */ - if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS) - ret = 0; + if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS || + ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR) + return 0; return ret; } @@ -572,18 +579,18 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, /* don't allow cancel or unmount to disturb the finishing procedure */ mutex_lock(&dev_replace->lock_finishing_cancel_unmount); - btrfs_dev_replace_read_lock(dev_replace); + down_read(&dev_replace->rwsem); /* was the operation canceled, or is it finished? */ if (dev_replace->replace_state != BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) { - btrfs_dev_replace_read_unlock(dev_replace); + up_read(&dev_replace->rwsem); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return 0; } tgt_device = dev_replace->tgtdev; src_device = dev_replace->srcdev; - btrfs_dev_replace_read_unlock(dev_replace); + up_read(&dev_replace->rwsem); /* * flush all outstanding I/O and inode extent mappings before the @@ -607,7 +614,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, /* keep away write_all_supers() during the finishing procedure */ mutex_lock(&fs_info->fs_devices->device_list_mutex); mutex_lock(&fs_info->chunk_mutex); - btrfs_dev_replace_write_lock(dev_replace); + down_write(&dev_replace->rwsem); dev_replace->replace_state = scrub_ret ? 
BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; @@ -622,12 +629,13 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, src_device, tgt_device); } else { - btrfs_err_in_rcu(fs_info, + if (scrub_ret != -ECANCELED) + btrfs_err_in_rcu(fs_info, "btrfs_scrub_dev(%s, %llu, %s) failed %d", btrfs_dev_name(src_device), src_device->devid, rcu_str_deref(tgt_device->name), scrub_ret); - btrfs_dev_replace_write_unlock(dev_replace); + up_write(&dev_replace->rwsem); mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); btrfs_rm_dev_replace_blocked(fs_info); @@ -663,8 +671,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); fs_info->fs_devices->rw_devices++; - btrfs_dev_replace_write_unlock(dev_replace); - + up_write(&dev_replace->rwsem); btrfs_rm_dev_replace_blocked(fs_info); btrfs_rm_dev_replace_remove_srcdev(src_device); @@ -761,7 +768,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - btrfs_dev_replace_read_lock(dev_replace); + down_read(&dev_replace->rwsem); /* even if !dev_replace_is_valid, the values are good enough for * the replace_status ioctl */ args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; @@ -773,7 +780,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, args->status.num_uncorrectable_read_errors = atomic64_read(&dev_replace->num_uncorrectable_read_errors); args->status.progress_1000 = btrfs_dev_replace_progress(fs_info); - btrfs_dev_replace_read_unlock(dev_replace); + up_read(&dev_replace->rwsem); } int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) @@ -790,46 +797,74 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) return -EROFS; mutex_lock(&dev_replace->lock_finishing_cancel_unmount); - btrfs_dev_replace_write_lock(dev_replace); + down_write(&dev_replace->rwsem); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; - btrfs_dev_replace_write_unlock(dev_replace); - goto leave; + up_write(&dev_replace->rwsem); + break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + tgt_device = dev_replace->tgtdev; + src_device = dev_replace->srcdev; + up_write(&dev_replace->rwsem); + ret = btrfs_scrub_cancel(fs_info); + if (ret < 0) { + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; + } else { + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + /* + * btrfs_dev_replace_finishing() will handle the + * cleanup part + */ + btrfs_info_in_rcu(fs_info, + "dev_replace from %s (devid %llu) to %s canceled", + btrfs_dev_name(src_device), src_device->devid, + btrfs_dev_name(tgt_device)); + } + break; case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + /* + * Scrub doing the replace isn't running so we need to do the + * cleanup step of btrfs_dev_replace_finishing() here + */ result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; tgt_device = dev_replace->tgtdev; src_device = dev_replace->srcdev; dev_replace->tgtdev = NULL; dev_replace->srcdev = NULL; - break; - } - dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; - dev_replace->time_stopped = ktime_get_real_seconds(); - dev_replace->item_needs_writeback = 1; - btrfs_dev_replace_write_unlock(dev_replace); - btrfs_scrub_cancel(fs_info); + dev_replace->replace_state = + 
BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; + dev_replace->time_stopped = ktime_get_real_seconds(); + dev_replace->item_needs_writeback = 1; - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); - return PTR_ERR(trans); - } - ret = btrfs_commit_transaction(trans); - WARN_ON(ret); + up_write(&dev_replace->rwsem); - btrfs_info_in_rcu(fs_info, - "dev_replace from %s (devid %llu) to %s canceled", - btrfs_dev_name(src_device), src_device->devid, - btrfs_dev_name(tgt_device)); + /* Scrub for replace must not be running in suspended state */ + ret = btrfs_scrub_cancel(fs_info); + ASSERT(ret != -ENOTCONN); + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans); + WARN_ON(ret); - if (tgt_device) - btrfs_destroy_dev_replace_tgtdev(tgt_device); + btrfs_info_in_rcu(fs_info, + "suspended dev_replace from %s (devid %llu) to %s canceled", + btrfs_dev_name(src_device), src_device->devid, + btrfs_dev_name(tgt_device)); + + if (tgt_device) + btrfs_destroy_dev_replace_tgtdev(tgt_device); + break; + default: + result = -EINVAL; + } -leave: mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return result; } @@ -839,7 +874,8 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; mutex_lock(&dev_replace->lock_finishing_cancel_unmount); - btrfs_dev_replace_write_lock(dev_replace); + down_write(&dev_replace->rwsem); + switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: @@ -855,7 +891,7 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) break; } - btrfs_dev_replace_write_unlock(dev_replace); + up_write(&dev_replace->rwsem); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); } @@ -865,12 +901,13 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) struct task_struct *task; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - btrfs_dev_replace_write_lock(dev_replace); + down_write(&dev_replace->rwsem); + switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: - btrfs_dev_replace_write_unlock(dev_replace); + up_write(&dev_replace->rwsem); return 0; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: break; @@ -884,10 +921,12 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) "cannot continue dev_replace, tgtdev is missing"); btrfs_info(fs_info, "you may cancel the operation after 'mount -o degraded'"); - btrfs_dev_replace_write_unlock(dev_replace); + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; + up_write(&dev_replace->rwsem); return 0; } - btrfs_dev_replace_write_unlock(dev_replace); + up_write(&dev_replace->rwsem); /* * This could collide with a paused balance, but the exclusive op logic @@ -895,6 +934,10 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) * dev-replace to start anyway. 
*/ if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + down_write(&dev_replace->rwsem); + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; + up_write(&dev_replace->rwsem); btrfs_info(fs_info, "cannot resume dev-replace, other exclusive operation running"); return 0; @@ -925,7 +968,7 @@ static int btrfs_dev_replace_kthread(void *data) btrfs_device_get_total_bytes(dev_replace->srcdev), &dev_replace->scrub_progress, 0, 1); ret = btrfs_dev_replace_finishing(fs_info, ret); - WARN_ON(ret); + WARN_ON(ret && ret != -ECANCELED); clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); return 0; @@ -948,7 +991,7 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) * something that can happen if the dev_replace * procedure is suspended by an umount and then * the tgtdev is missing (or "btrfs dev scan") was - * not called and the the filesystem is remounted + * not called and the filesystem is remounted * in degraded state. This does not stop the * dev_replace procedure. It needs to be canceled * manually if the cancellation is wanted. @@ -958,42 +1001,6 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) return 1; } -void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace) -{ - read_lock(&dev_replace->lock); -} - -void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace) -{ - read_unlock(&dev_replace->lock); -} - -void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace) -{ -again: - wait_event(dev_replace->read_lock_wq, - atomic_read(&dev_replace->blocking_readers) == 0); - write_lock(&dev_replace->lock); - if (atomic_read(&dev_replace->blocking_readers)) { - write_unlock(&dev_replace->lock); - goto again; - } -} - -void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace) -{ - write_unlock(&dev_replace->lock); -} - -/* inc blocking cnt and release read lock */ -void btrfs_dev_replace_set_lock_blocking( - struct btrfs_dev_replace *dev_replace) -{ - /* only set blocking for read lock */ - atomic_inc(&dev_replace->blocking_readers); - read_unlock(&dev_replace->lock); -} - void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info) { percpu_counter_inc(&fs_info->dev_replace.bio_counter); diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 795c551f5b5e..4aa40bacc6cc 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -13,19 +13,11 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); -int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, - const char *tgtdev_name, u64 srcdevid, const char *srcdev_name, - int read_src); void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace); -void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace); #endif diff --git 
a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6d776717d8b3..8da2f380d3c0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -279,6 +279,12 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, len = buf->len - offset; while (len > 0) { + /* + * Note: we don't need to check for the err == 1 case here, as + * with the given combination of 'start = BTRFS_CSUM_SIZE (32)' + * and 'min_len = 32' and the currently implemented mapping + * algorithm we cannot cross a page boundary. + */ err = map_private_extent_buffer(buf, offset, 32, &kaddr, &map_start, &map_len); if (err) @@ -542,7 +548,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) if (WARN_ON(!PageUptodate(page))) return -EUCLEAN; - ASSERT(memcmp_extent_buffer(eb, fs_info->fsid, + ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0); return csum_tree_block(fs_info, eb, 0); @@ -557,7 +563,20 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info, read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); while (fs_devices) { - if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) { + u8 *metadata_uuid; + + /* + * Checking the incompat flag is only valid for the current + * fs. For seed devices it's forbidden to have their uuid + * changed so reading ->fsid in this case is fine + */ + if (fs_devices == fs_info->fs_devices && + btrfs_fs_incompat(fs_info, METADATA_UUID)) + metadata_uuid = fs_devices->metadata_uuid; + else + metadata_uuid = fs_devices->fsid; + + if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) { ret = 0; break; } @@ -660,19 +679,6 @@ out: return ret; } -static int btree_io_failed_hook(struct page *page, int failed_mirror) -{ - struct extent_buffer *eb; - - eb = (struct extent_buffer *)page->private; - set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); - eb->read_mirror = failed_mirror; - atomic_dec(&eb->io_pages); - if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) - btree_readahead_hook(eb, -EIO); - return -EIO; /* we fixed nothing */ -} - static void end_workqueue_bio(struct bio *bio) { struct btrfs_end_io_wq *end_io_wq = bio->bi_private; @@ -751,11 +757,22 @@ static void run_one_async_start(struct btrfs_work *work) async->status = ret; } +/* + * In order to insert checksums into the metadata in large chunks, we wait + * until bio submission time. All the pages in the bio are checksummed and + * sums are attached onto the ordered extent record. + * + * At IO completion time the csums attached on the ordered extent record are + * inserted into the tree. 
+ */ static void run_one_async_done(struct btrfs_work *work) { struct async_submit_bio *async; + struct inode *inode; + blk_status_t ret; async = container_of(work, struct async_submit_bio, work); + inode = async->private_data; /* If an error occurred we just want to clean up the bio and move on */ if (async->status) { @@ -764,7 +781,12 @@ static void run_one_async_done(struct btrfs_work *work) return; } - btrfs_submit_bio_done(async->private_data, async->bio, async->mirror_num); + ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, + async->mirror_num, 1); + if (ret) { + async->bio->bi_status = ret; + bio_endio(async->bio); + } } static void run_one_async_free(struct btrfs_work *work) @@ -1178,6 +1200,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, refcount_set(&root->refs, 1); atomic_set(&root->will_be_snapshotted, 0); atomic_set(&root->snapshot_force_cow, 0); + atomic_set(&root->nr_swapfiles, 0); root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; @@ -2118,10 +2141,8 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info) { mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); - rwlock_init(&fs_info->dev_replace.lock); - atomic_set(&fs_info->dev_replace.blocking_readers, 0); + init_rwsem(&fs_info->dev_replace.rwsem); init_waitqueue_head(&fs_info->dev_replace.replace_wait); - init_waitqueue_head(&fs_info->dev_replace.read_lock_wq); } static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) @@ -2442,10 +2463,11 @@ static int validate_super(struct btrfs_fs_info *fs_info, ret = -EINVAL; } - if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) { + if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid, + BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, - "dev_item UUID does not match fsid: %pU != %pU", - fs_info->fsid, sb->dev_item.fsid); + "dev_item UUID does not match metadata fsid: %pU != %pU", + fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid); ret = -EINVAL; } @@ -2656,6 +2678,9 @@ int open_ctree(struct super_block *sb, btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); btrfs_init_block_rsv(&fs_info->delayed_block_rsv, BTRFS_BLOCK_RSV_DELOPS); + btrfs_init_block_rsv(&fs_info->delayed_refs_rsv, + BTRFS_BLOCK_RSV_DELREFS); + atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->defrag_running, 0); atomic_set(&fs_info->qgroup_op_seq, 0); @@ -2745,6 +2770,9 @@ int open_ctree(struct super_block *sb, fs_info->sectorsize = 4096; fs_info->stripesize = 4096; + spin_lock_init(&fs_info->swapfile_pins_lock); + fs_info->swapfile_pins = RB_ROOT; + ret = btrfs_alloc_stripe_hash_table(fs_info); if (ret) { err = ret; @@ -2781,11 +2809,29 @@ int open_ctree(struct super_block *sb, * the whole block of INFO_SIZE */ memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy)); - memcpy(fs_info->super_for_commit, fs_info->super_copy, - sizeof(*fs_info->super_for_commit)); brelse(bh); - memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE); + disk_super = fs_info->super_copy; + + ASSERT(!memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid, + BTRFS_FSID_SIZE)); + + if (btrfs_fs_incompat(fs_info, METADATA_UUID)) { + ASSERT(!memcmp(fs_info->fs_devices->metadata_uuid, + fs_info->super_copy->metadata_uuid, + BTRFS_FSID_SIZE)); + } + + features = btrfs_super_flags(disk_super); + if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) { + features &= 
~BTRFS_SUPER_FLAG_CHANGING_FSID_V2; + btrfs_set_super_flags(disk_super, features); + btrfs_info(fs_info, + "found metadata UUID change in progress flag, clearing"); + } + + memcpy(fs_info->super_for_commit, fs_info->super_copy, + sizeof(*fs_info->super_for_commit)); ret = btrfs_validate_mount_super(fs_info); if (ret) { @@ -2794,7 +2840,6 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } - disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) goto fail_alloc; @@ -2906,7 +2951,7 @@ int open_ctree(struct super_block *sb, sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); - memcpy(&sb->s_uuid, fs_info->fsid, BTRFS_FSID_SIZE); + memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE); mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_sys_array(fs_info); @@ -3055,7 +3100,7 @@ retry_root_backup: if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, - "writeable mount is not allowed due to too many missing devices"); + "writable mount is not allowed due to too many missing devices"); goto fail_sysfs; } @@ -3724,7 +3769,8 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) btrfs_set_stack_device_io_width(dev_item, dev->io_width); btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE); - memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_FSID_SIZE); + memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid, + BTRFS_FSID_SIZE); flags = btrfs_super_flags(sb); btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); @@ -4031,7 +4077,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* * This is a fast path so only do this check if we have sanity tests - * enabled. Normal people shouldn't be using umapped buffers as dirty + * enabled. Normal people shouldn't be using unmapped buffers as dirty * outside of the sanity tests. */ if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags))) @@ -4329,6 +4375,8 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, unpin = pinned_extents; again: while (1) { + struct extent_state *cached_state = NULL; + /* * The btrfs_finish_extent_commit() may get the same range as * ours between find_first_extent_bit and clear_extent_dirty. 
@@ -4337,13 +4385,14 @@ again: */ mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY, NULL); + EXTENT_DIRTY, &cached_state); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; } - clear_extent_dirty(unpin, start, end); + clear_extent_dirty(unpin, start, end, &cached_state); + free_extent_state(cached_state); btrfs_error_unpin_extent_range(fs_info, start, end); mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); @@ -4400,6 +4449,7 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, spin_unlock(&cur_trans->dirty_bgs_lock); btrfs_put_block_group(cache); + btrfs_delayed_refs_rsv_release(fs_info, 1); spin_lock(&cur_trans->dirty_bgs_lock); } spin_unlock(&cur_trans->dirty_bgs_lock); @@ -4505,7 +4555,4 @@ static const struct extent_io_ops btree_extent_io_ops = { /* mandatory callbacks */ .submit_bio_hook = btree_submit_bio_hook, .readpage_end_io_hook = btree_readpage_end_io_hook, - .readpage_io_failed_hook = btree_io_failed_hook, - - /* optional callbacks */ }; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 4cccba22640f..987a64bc0c66 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -21,11 +21,11 @@ #define BTRFS_BDEV_BLOCKSIZE (4096) enum btrfs_wq_endio_type { - BTRFS_WQ_ENDIO_DATA = 0, - BTRFS_WQ_ENDIO_METADATA = 1, - BTRFS_WQ_ENDIO_FREE_SPACE = 2, - BTRFS_WQ_ENDIO_RAID56 = 3, - BTRFS_WQ_ENDIO_DIO_REPAIR = 4, + BTRFS_WQ_ENDIO_DATA, + BTRFS_WQ_ENDIO_METADATA, + BTRFS_WQ_ENDIO_FREE_SPACE, + BTRFS_WQ_ENDIO_RAID56, + BTRFS_WQ_ENDIO_DIO_REPAIR, }; static inline u64 btrfs_sb_offset(int mirror) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a1febf155747..b15afeae16df 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -51,6 +51,24 @@ enum { CHUNK_ALLOC_FORCE = 2, }; +/* + * Declare a helper function to detect underflow of various space info members + */ +#define DECLARE_SPACE_INFO_UPDATE(name) \ +static inline void update_##name(struct btrfs_space_info *sinfo, \ + s64 bytes) \ +{ \ + if (bytes < 0 && sinfo->name < -bytes) { \ + WARN_ON(1); \ + sinfo->name = 0; \ + return; \ + } \ + sinfo->name += bytes; \ +} + +DECLARE_SPACE_INFO_UPDATE(bytes_may_use); +DECLARE_SPACE_INFO_UPDATE(bytes_pinned); + static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, @@ -1037,7 +1055,7 @@ out_free: /* * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required, - * is_data == BTRFS_REF_TYPE_DATA, data type is requried, + * is_data == BTRFS_REF_TYPE_DATA, data type is required, * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
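The DECLARE_SPACE_INFO_UPDATE() macro introduced above generates one underflow-checking helper per space_info counter; later hunks in this file switch open-coded "sinfo->bytes_may_use += ..." updates over to these helpers. For reference, a hand-written expansion for bytes_may_use (illustrative only, not taken from a preprocessed tree):

	static inline void update_bytes_may_use(struct btrfs_space_info *sinfo, s64 bytes)
	{
		/* a negative delta larger than the counter would underflow: warn and clamp */
		if (bytes < 0 && sinfo->bytes_may_use < -bytes) {
			WARN_ON(1);
			sinfo->bytes_may_use = 0;
			return;
		}
		sinfo->bytes_may_use += bytes;
	}

Callers pass a signed delta, e.g. update_bytes_may_use(data_sinfo, -len) on release paths, so the WARN_ON/clamp logic no longer has to be repeated at every call site.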
*/ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, @@ -2406,25 +2424,82 @@ static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_ref btrfs_delayed_ref_unlock(head); } -static int cleanup_extent_op(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head) +static struct btrfs_delayed_extent_op *cleanup_extent_op( + struct btrfs_delayed_ref_head *head) { struct btrfs_delayed_extent_op *extent_op = head->extent_op; - int ret; if (!extent_op) - return 0; - head->extent_op = NULL; + return NULL; + if (head->must_insert_reserved) { + head->extent_op = NULL; btrfs_free_delayed_extent_op(extent_op); - return 0; + return NULL; } + return extent_op; +} + +static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_extent_op *extent_op; + int ret; + + extent_op = cleanup_extent_op(head); + if (!extent_op) + return 0; + head->extent_op = NULL; spin_unlock(&head->lock); ret = run_delayed_extent_op(trans, head, extent_op); btrfs_free_delayed_extent_op(extent_op); return ret ? ret : 1; } +static void cleanup_ref_head_accounting(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_root *delayed_refs = + &trans->transaction->delayed_refs; + int nr_items = 1; /* Dropping this ref head update. */ + + if (head->total_ref_mod < 0) { + struct btrfs_space_info *space_info; + u64 flags; + + if (head->is_data) + flags = BTRFS_BLOCK_GROUP_DATA; + else if (head->is_system) + flags = BTRFS_BLOCK_GROUP_SYSTEM; + else + flags = BTRFS_BLOCK_GROUP_METADATA; + space_info = __find_space_info(fs_info, flags); + ASSERT(space_info); + percpu_counter_add_batch(&space_info->total_bytes_pinned, + -head->num_bytes, + BTRFS_TOTAL_BYTES_PINNED_BATCH); + + /* + * We had csum deletions accounted for in our delayed refs rsv, + * we need to drop the csum leaves for this update from our + * delayed_refs_rsv. 
+ */ + if (head->is_data) { + spin_lock(&delayed_refs->lock); + delayed_refs->pending_csums -= head->num_bytes; + spin_unlock(&delayed_refs->lock); + nr_items += btrfs_csum_bytes_to_leaves(fs_info, + head->num_bytes); + } + } + + /* Also free its reserved qgroup space */ + btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, + head->qgroup_reserved); + btrfs_delayed_refs_rsv_release(fs_info, nr_items); +} + static int cleanup_ref_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head) { @@ -2435,7 +2510,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; - ret = cleanup_extent_op(trans, head); + ret = run_and_cleanup_extent_op(trans, head); if (ret < 0) { unselect_delayed_ref_head(delayed_refs, head); btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); @@ -2456,37 +2531,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); return 1; } - delayed_refs->num_heads--; - rb_erase_cached(&head->href_node, &delayed_refs->href_root); - RB_CLEAR_NODE(&head->href_node); + btrfs_delete_ref_head(delayed_refs, head); spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); - atomic_dec(&delayed_refs->num_entries); - - trace_run_delayed_ref_head(fs_info, head, 0); - - if (head->total_ref_mod < 0) { - struct btrfs_space_info *space_info; - u64 flags; - - if (head->is_data) - flags = BTRFS_BLOCK_GROUP_DATA; - else if (head->is_system) - flags = BTRFS_BLOCK_GROUP_SYSTEM; - else - flags = BTRFS_BLOCK_GROUP_METADATA; - space_info = __find_space_info(fs_info, flags); - ASSERT(space_info); - percpu_counter_add_batch(&space_info->total_bytes_pinned, - -head->num_bytes, - BTRFS_TOTAL_BYTES_PINNED_BATCH); - - if (head->is_data) { - spin_lock(&delayed_refs->lock); - delayed_refs->pending_csums -= head->num_bytes; - spin_unlock(&delayed_refs->lock); - } - } if (head->must_insert_reserved) { btrfs_pin_extent(fs_info, head->bytenr, @@ -2497,9 +2544,9 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, } } - /* Also free its reserved qgroup space */ - btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, - head->qgroup_reserved); + cleanup_ref_head_accounting(trans, head); + + trace_run_delayed_ref_head(fs_info, head, 0); btrfs_delayed_ref_unlock(head); btrfs_put_delayed_ref_head(head); return 0; @@ -2792,40 +2839,28 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes) return num_csums; } -int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans) +bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_rsv *global_rsv; - u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; - u64 csum_bytes = trans->transaction->delayed_refs.pending_csums; - unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs; - u64 num_bytes, num_dirty_bgs_bytes; - int ret = 0; + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + bool ret = false; + u64 reserved; - num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - num_heads = heads_to_leaves(fs_info, num_heads); - if (num_heads > 1) - num_bytes += (num_heads - 1) * fs_info->nodesize; - num_bytes <<= 1; - num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) * - fs_info->nodesize; - num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info, - num_dirty_bgs); - global_rsv = 
&fs_info->global_block_rsv; + spin_lock(&global_rsv->lock); + reserved = global_rsv->reserved; + spin_unlock(&global_rsv->lock); /* - * If we can't allocate any more chunks lets make sure we have _lots_ of - * wiggle room since running delayed refs can create more delayed refs. + * Since the global reserve is just kind of magic we don't really want + * to rely on it to save our bacon, so if our size is more than the + * delayed_refs_rsv and the global rsv then it's time to think about + * bailing. */ - if (global_rsv->space_info->full) { - num_dirty_bgs_bytes <<= 1; - num_bytes <<= 1; - } - - spin_lock(&global_rsv->lock); - if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes) - ret = 1; - spin_unlock(&global_rsv->lock); + spin_lock(&delayed_refs_rsv->lock); + reserved += delayed_refs_rsv->reserved; + if (delayed_refs_rsv->size >= reserved) + ret = true; + spin_unlock(&delayed_refs_rsv->lock); return ret; } @@ -2844,7 +2879,7 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) if (val >= NSEC_PER_SEC / 2) return 2; - return btrfs_check_space_for_delayed_refs(trans); + return btrfs_check_space_for_delayed_refs(trans->fs_info); } struct async_delayed_refs { @@ -3588,6 +3623,8 @@ again: */ mutex_lock(&trans->transaction->cache_write_mutex); while (!list_empty(&dirty)) { + bool drop_reserve = true; + cache = list_first_entry(&dirty, struct btrfs_block_group_cache, dirty_list); @@ -3660,6 +3697,7 @@ again: list_add_tail(&cache->dirty_list, &cur_trans->dirty_bgs); btrfs_get_block_group(cache); + drop_reserve = false; } spin_unlock(&cur_trans->dirty_bgs_lock); } else if (ret) { @@ -3667,9 +3705,11 @@ again: } } - /* if its not on the io list, we need to put the block group */ + /* if it's not on the io list, we need to put the block group */ if (should_put) btrfs_put_block_group(cache); + if (drop_reserve) + btrfs_delayed_refs_rsv_release(fs_info, 1); if (ret) break; @@ -3818,6 +3858,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, /* if its not on the io list, we need to put the block group */ if (should_put) btrfs_put_block_group(cache); + btrfs_delayed_refs_rsv_release(fs_info, 1); spin_lock(&cur_trans->dirty_bgs_lock); } spin_unlock(&cur_trans->dirty_bgs_lock); @@ -4256,7 +4297,7 @@ commit_trans: data_sinfo->flags, bytes, 1); return -ENOSPC; } - data_sinfo->bytes_may_use += bytes; + update_bytes_may_use(data_sinfo, bytes); trace_btrfs_space_reservation(fs_info, "space_info", data_sinfo->flags, bytes, 1); spin_unlock(&data_sinfo->lock); @@ -4309,10 +4350,7 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, data_sinfo = fs_info->data_sinfo; spin_lock(&data_sinfo->lock); - if (WARN_ON(data_sinfo->bytes_may_use < len)) - data_sinfo->bytes_may_use = 0; - else - data_sinfo->bytes_may_use -= len; + update_bytes_may_use(data_sinfo, -len); trace_btrfs_space_reservation(fs_info, "space_info", data_sinfo->flags, len, 0); spin_unlock(&data_sinfo->lock); @@ -4637,7 +4675,7 @@ static int can_overcommit(struct btrfs_fs_info *fs_info, /* * If we have dup, raid1 or raid10 then only half of the free - * space is actually useable. For raid56, the space info used + * space is actually usable. 
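btrfs_check_space_for_delayed_refs() above no longer estimates leaf counts from the number of queued heads; it compares reservation sizes directly. Condensed sketch of the new test (illustrative, spin locks omitted):

	u64 reserved = global_rsv->reserved + delayed_refs_rsv->reserved;

	/*
	 * true means the delayed refs rsv wants more space than we currently
	 * hold, so callers such as btrfs_should_throttle_delayed_refs() should
	 * start throttling and flushing delayed refs
	 */
	return delayed_refs_rsv->size >= reserved;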
For raid56, the space info used * doesn't include the parity drive, so we don't have to * change the math */ @@ -4793,8 +4831,10 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, { struct reserve_ticket *ticket = NULL; struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; struct btrfs_trans_handle *trans; - u64 bytes; + u64 bytes_needed; + u64 reclaim_bytes = 0; trans = (struct btrfs_trans_handle *)current->journal_info; if (trans) @@ -4807,15 +4847,15 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, else if (!list_empty(&space_info->tickets)) ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); - bytes = (ticket) ? ticket->bytes : 0; + bytes_needed = (ticket) ? ticket->bytes : 0; spin_unlock(&space_info->lock); - if (!bytes) + if (!bytes_needed) return 0; /* See if there is enough pinned space to make this reservation */ if (__percpu_counter_compare(&space_info->total_bytes_pinned, - bytes, + bytes_needed, BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) goto commit; @@ -4827,14 +4867,18 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, return -ENOSPC; spin_lock(&delayed_rsv->lock); - if (delayed_rsv->size > bytes) - bytes = 0; - else - bytes -= delayed_rsv->size; + reclaim_bytes += delayed_rsv->reserved; spin_unlock(&delayed_rsv->lock); + spin_lock(&delayed_refs_rsv->lock); + reclaim_bytes += delayed_refs_rsv->reserved; + spin_unlock(&delayed_refs_rsv->lock); + if (reclaim_bytes >= bytes_needed) + goto commit; + bytes_needed -= reclaim_bytes; + if (__percpu_counter_compare(&space_info->total_bytes_pinned, - bytes, + bytes_needed, BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) { return -ENOSPC; } @@ -4882,6 +4926,20 @@ static void flush_space(struct btrfs_fs_info *fs_info, shrink_delalloc(fs_info, num_bytes * 2, num_bytes, state == FLUSH_DELALLOC_WAIT); break; + case FLUSH_DELAYED_REFS_NR: + case FLUSH_DELAYED_REFS: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + if (state == FLUSH_DELAYED_REFS_NR) + nr = calc_reclaim_items_nr(fs_info, num_bytes); + else + nr = 0; + btrfs_run_delayed_refs(trans, nr); + btrfs_end_transaction(trans); + break; case ALLOC_CHUNK: trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { @@ -5108,7 +5166,7 @@ static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, list_del_init(&ticket->list); if (ticket->bytes && ticket->bytes < orig_bytes) { u64 num_bytes = orig_bytes - ticket->bytes; - space_info->bytes_may_use -= num_bytes; + update_bytes_may_use(space_info, -num_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, num_bytes, 0); } @@ -5154,13 +5212,13 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, * If not things get more complicated. 
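may_commit_transaction() above now also counts the bytes parked in the delayed refs rsv, in addition to the delayed-inode rsv, before deciding whether committing the transaction is worth it for a waiting ticket. The decision, condensed (illustrative, locking omitted):

	u64 reclaim_bytes = 0;

	reclaim_bytes += delayed_rsv->reserved;		/* delayed inode items */
	reclaim_bytes += delayed_refs_rsv->reserved;	/* delayed ref heads */
	if (reclaim_bytes >= bytes_needed)
		goto commit;				/* the commit alone frees enough */
	bytes_needed -= reclaim_bytes;			/* otherwise check whether pinned
							   bytes cover the remainder */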
*/ if (used + orig_bytes <= space_info->total_bytes) { - space_info->bytes_may_use += orig_bytes; + update_bytes_may_use(space_info, orig_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, orig_bytes, 1); ret = 0; } else if (can_overcommit(fs_info, space_info, orig_bytes, flush, system_chunk)) { - space_info->bytes_may_use += orig_bytes; + update_bytes_may_use(space_info, orig_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, orig_bytes, 1); ret = 0; @@ -5223,7 +5281,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, if (ticket.bytes) { if (ticket.bytes < orig_bytes) { u64 num_bytes = orig_bytes - ticket.bytes; - space_info->bytes_may_use -= num_bytes; + update_bytes_may_use(space_info, -num_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, num_bytes, 0); @@ -5244,7 +5302,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, * @orig_bytes - the number of bytes we want * @flush - whether or not we can flush to make our reservation * - * This will reserve orgi_bytes number of bytes from the space info associated + * This will reserve orig_bytes number of bytes from the space info associated * with the block_rsv. If there is not enough space it will make an attempt to * flush out space to make room. It will do this by flushing delalloc if * possible or committing the transaction. If flush is 0 then no attempts to @@ -5354,6 +5412,90 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, return 0; } +/** + * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv. + * @fs_info - the fs info for our fs. + * @src - the source block rsv to transfer from. + * @num_bytes - the number of bytes to transfer. + * + * This transfers up to the num_bytes amount from the src rsv to the + * delayed_refs_rsv. Any extra bytes are returned to the space info. + */ +void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *src, + u64 num_bytes) +{ + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + u64 to_free = 0; + + spin_lock(&src->lock); + src->reserved -= num_bytes; + src->size -= num_bytes; + spin_unlock(&src->lock); + + spin_lock(&delayed_refs_rsv->lock); + if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { + u64 delta = delayed_refs_rsv->size - + delayed_refs_rsv->reserved; + if (num_bytes > delta) { + to_free = num_bytes - delta; + num_bytes = delta; + } + } else { + to_free = num_bytes; + num_bytes = 0; + } + + if (num_bytes) + delayed_refs_rsv->reserved += num_bytes; + if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) + delayed_refs_rsv->full = 1; + spin_unlock(&delayed_refs_rsv->lock); + + if (num_bytes) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, num_bytes, 1); + if (to_free) + space_info_add_old_bytes(fs_info, delayed_refs_rsv->space_info, + to_free); +} + +/** + * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage. + * @fs_info - the fs_info for our fs. + * @flush - control how we can flush for this reservation. + * + * This will refill the delayed block_rsv up to 1 items size worth of space and + * will return -ENOSPC if we can't make the reservation. 
+ */ +int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; + u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1); + u64 num_bytes = 0; + int ret = -ENOSPC; + + spin_lock(&block_rsv->lock); + if (block_rsv->reserved < block_rsv->size) { + num_bytes = block_rsv->size - block_rsv->reserved; + num_bytes = min(num_bytes, limit); + } + spin_unlock(&block_rsv->lock); + + if (!num_bytes) + return 0; + + ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv, + num_bytes, flush); + if (ret) + return ret; + block_rsv_add_bytes(block_rsv, num_bytes, 0); + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, num_bytes, 1); + return 0; +} + /* * This is for space we already have accounted in space_info->bytes_may_use, so * basically when we're returning space from block_rsv's. @@ -5407,7 +5549,7 @@ again: flush = BTRFS_RESERVE_FLUSH_ALL; goto again; } - space_info->bytes_may_use -= num_bytes; + update_bytes_may_use(space_info, -num_bytes); trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, num_bytes, 0); spin_unlock(&space_info->lock); @@ -5435,7 +5577,7 @@ again: ticket->bytes, 1); list_del_init(&ticket->list); num_bytes -= ticket->bytes; - space_info->bytes_may_use += ticket->bytes; + update_bytes_may_use(space_info, ticket->bytes); ticket->bytes = 0; space_info->tickets_id++; wake_up(&ticket->wait); @@ -5443,7 +5585,7 @@ again: trace_btrfs_space_reservation(fs_info, "space_info", space_info->flags, num_bytes, 1); - space_info->bytes_may_use += num_bytes; + update_bytes_may_use(space_info, num_bytes); ticket->bytes -= num_bytes; num_bytes = 0; } @@ -5629,11 +5771,11 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, /** * btrfs_inode_rsv_refill - refill the inode block rsv. * @inode - the inode we are refilling. - * @flush - the flusing restriction. + * @flush - the flushing restriction. * * Essentially the same as btrfs_block_rsv_refill, except it uses the * block_rsv->size as the minimum size. We'll either refill the missing amount - * or return if we already have enough space. This will also handle the resreve + * or return if we already have enough space. This will also handle the reserve * tracepoint for the reserved amount. */ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, @@ -5674,6 +5816,31 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, return ret; } +static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, u64 *qgroup_to_release) +{ + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_block_rsv *target = delayed_rsv; + + if (target->full || target == block_rsv) + target = global_rsv; + + if (block_rsv->space_info != target->space_info) + target = NULL; + + return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes, + qgroup_to_release); +} + +void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); +} + /** * btrfs_inode_rsv_release - release any excessive reservation. * @inode - the inode we need to release from. 
@@ -5688,7 +5855,6 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode, static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; struct btrfs_block_rsv *block_rsv = &inode->block_rsv; u64 released = 0; u64 qgroup_to_release = 0; @@ -5698,8 +5864,8 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) * are releasing 0 bytes, and then we'll just get the reservation over * the size free'd. */ - released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0, - &qgroup_to_release); + released = __btrfs_block_rsv_release(fs_info, block_rsv, 0, + &qgroup_to_release); if (released > 0) trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), released, 0); @@ -5710,16 +5876,26 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) qgroup_to_release); } -void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) +/** + * btrfs_delayed_refs_rsv_release - release a ref head's reservation. + * @fs_info - the fs_info for our fs. + * @nr - the number of items to drop. + * + * This drops the delayed ref head's count from the delayed refs rsv and frees + * any excess reservation we had. + */ +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) { + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr); + u64 released = 0; - if (global_rsv == block_rsv || - block_rsv->space_info != global_rsv->space_info) - global_rsv = NULL; - block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes, NULL); + released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, + num_bytes, NULL); + if (released) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, released, 0); } static void update_global_block_rsv(struct btrfs_fs_info *fs_info) @@ -5750,14 +5926,14 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) num_bytes = min(num_bytes, block_rsv->size - block_rsv->reserved); block_rsv->reserved += num_bytes; - sinfo->bytes_may_use += num_bytes; + update_bytes_may_use(sinfo, num_bytes); trace_btrfs_space_reservation(fs_info, "space_info", sinfo->flags, num_bytes, 1); } } else if (block_rsv->reserved > block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; - sinfo->bytes_may_use -= num_bytes; + update_bytes_may_use(sinfo, -num_bytes); trace_btrfs_space_reservation(fs_info, "space_info", sinfo->flags, num_bytes, 0); block_rsv->reserved = block_rsv->size; @@ -5784,9 +5960,10 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->trans_block_rsv.space_info = space_info; fs_info->empty_block_rsv.space_info = space_info; fs_info->delayed_block_rsv.space_info = space_info; + fs_info->delayed_refs_rsv.space_info = space_info; - fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; - fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; + fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv; + fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv; fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; if (fs_info->quota_root) @@ -5806,8 +5983,34 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) 
WARN_ON(fs_info->chunk_block_rsv.reserved > 0); WARN_ON(fs_info->delayed_block_rsv.size > 0); WARN_ON(fs_info->delayed_block_rsv.reserved > 0); + WARN_ON(fs_info->delayed_refs_rsv.reserved > 0); + WARN_ON(fs_info->delayed_refs_rsv.size > 0); } +/* + * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv + * @trans - the trans that may have generated delayed refs + * + * This is to be called anytime we may have adjusted trans->delayed_ref_updates, + * it'll calculate the additional size and add it to the delayed_refs_rsv. + */ +void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + u64 num_bytes; + + if (!trans->delayed_ref_updates) + return; + + num_bytes = btrfs_calc_trans_metadata_size(fs_info, + trans->delayed_ref_updates); + spin_lock(&delayed_rsv->lock); + delayed_rsv->size += num_bytes; + delayed_rsv->full = 0; + spin_unlock(&delayed_rsv->lock); + trans->delayed_ref_updates = 0; +} /* * To be called after all the new block groups attached to the transaction @@ -6100,6 +6303,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, u64 old_val; u64 byte_in_group; int factor; + int ret = 0; /* block accounting for super block */ spin_lock(&info->delalloc_root_lock); @@ -6113,8 +6317,10 @@ static int update_block_group(struct btrfs_trans_handle *trans, while (total) { cache = btrfs_lookup_block_group(info, bytenr); - if (!cache) - return -ENOENT; + if (!cache) { + ret = -ENOENT; + break; + } factor = btrfs_bg_type_to_factor(cache->flags); /* @@ -6151,7 +6357,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, old_val -= num_bytes; btrfs_set_block_group_used(&cache->item, old_val); cache->pinned += num_bytes; - cache->space_info->bytes_pinned += num_bytes; + update_bytes_pinned(cache->space_info, num_bytes); cache->space_info->bytes_used -= num_bytes; cache->space_info->disk_used -= num_bytes * factor; spin_unlock(&cache->lock); @@ -6173,6 +6379,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, list_add_tail(&cache->dirty_list, &trans->transaction->dirty_bgs); trans->transaction->num_dirty_bgs++; + trans->delayed_ref_updates++; btrfs_get_block_group(cache); } spin_unlock(&trans->transaction->dirty_bgs_lock); @@ -6190,7 +6397,10 @@ static int update_block_group(struct btrfs_trans_handle *trans, total -= num_bytes; bytenr += num_bytes; } - return 0; + + /* Modified block groups are accounted for in the delayed_refs_rsv. 
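Taken together, the delayed refs rsv added by this patch works in three steps: btrfs_update_delayed_refs_rsv() (defined just above) grows the rsv's size whenever a caller has bumped trans->delayed_ref_updates, reserve paths feed bytes into it via btrfs_migrate_to_delayed_refs_rsv() or btrfs_delayed_refs_rsv_refill(), and btrfs_delayed_refs_rsv_release() hands the space back as ref heads and dirty block groups are processed. The caller-side pattern, as used by update_block_group() in the surrounding hunk (sketch, not a literal excerpt):

	/* one more dirty block group was queued on this transaction */
	trans->delayed_ref_updates++;
	...
	/* fold the pending count into delayed_refs_rsv->size and reset it */
	btrfs_update_delayed_refs_rsv(trans);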
*/ + btrfs_update_delayed_refs_rsv(trans); + return ret; } static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) @@ -6222,7 +6432,7 @@ static int pin_down_extent(struct btrfs_fs_info *fs_info, spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned += num_bytes; - cache->space_info->bytes_pinned += num_bytes; + update_bytes_pinned(cache->space_info, num_bytes); if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; @@ -6431,7 +6641,7 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, } else { cache->reserved += num_bytes; space_info->bytes_reserved += num_bytes; - space_info->bytes_may_use -= ram_bytes; + update_bytes_may_use(space_info, -ram_bytes); if (delalloc) cache->delalloc_bytes += num_bytes; } @@ -6587,7 +6797,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; - space_info->bytes_pinned -= len; + update_bytes_pinned(space_info, -len); trace_btrfs_space_reservation(fs_info, "pinned", space_info->flags, len, 0); @@ -6608,7 +6818,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, to_add = min(len, global_rsv->size - global_rsv->reserved); global_rsv->reserved += to_add; - space_info->bytes_may_use += to_add; + update_bytes_may_use(space_info, to_add); if (global_rsv->reserved >= global_rsv->size) global_rsv->full = 1; trace_btrfs_space_reservation(fs_info, @@ -6647,9 +6857,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) unpin = &fs_info->freed_extents[0]; while (!trans->aborted) { + struct extent_state *cached_state = NULL; + mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY, NULL); + EXTENT_DIRTY, &cached_state); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; @@ -6659,9 +6871,10 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) ret = btrfs_discard_extent(fs_info, start, end + 1 - start, NULL); - clear_extent_dirty(unpin, start, end); + clear_extent_dirty(unpin, start, end, &cached_state); unpin_extent_range(fs_info, start, end, true); mutex_unlock(&fs_info->unused_bg_unpin_mutex); + free_extent_state(cached_state); cond_resched(); } @@ -6955,12 +7168,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root)) goto out; - if (head->extent_op) { - if (!head->must_insert_reserved) - goto out; - btrfs_free_delayed_extent_op(head->extent_op); - head->extent_op = NULL; - } + if (cleanup_extent_op(head) != NULL) + goto out; /* * waiting for the lock here would deadlock. If someone else has it @@ -6969,22 +7178,9 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (!mutex_trylock(&head->mutex)) goto out; - /* - * at this point we have a head with no other entries. Go - * ahead and process it. - */ - rb_erase_cached(&head->href_node, &delayed_refs->href_root); - RB_CLEAR_NODE(&head->href_node); - atomic_dec(&delayed_refs->num_entries); - - /* - * we don't take a ref on the node because we're removing it from the - * tree, so we just steal the ref the tree was holding. 
- */ - delayed_refs->num_heads--; - if (head->processing == 0) - delayed_refs->num_heads_ready--; + btrfs_delete_ref_head(delayed_refs, head); head->processing = 0; + spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); @@ -6992,6 +7188,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (head->must_insert_reserved) ret = 1; + cleanup_ref_head_accounting(trans, head); mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); return ret; @@ -7239,6 +7436,345 @@ btrfs_release_block_group(struct btrfs_block_group_cache *cache, } /* + * Structure used internally for find_free_extent() function. Wraps needed + * parameters. + */ +struct find_free_extent_ctl { + /* Basic allocation info */ + u64 ram_bytes; + u64 num_bytes; + u64 empty_size; + u64 flags; + int delalloc; + + /* Where to start the search inside the bg */ + u64 search_start; + + /* For clustered allocation */ + u64 empty_cluster; + + bool have_caching_bg; + bool orig_have_caching_bg; + + /* RAID index, converted from flags */ + int index; + + /* + * Current loop number, check find_free_extent_update_loop() for details + */ + int loop; + + /* + * Whether we're refilling a cluster, if true we need to re-search + * current block group but don't try to refill the cluster again. + */ + bool retry_clustered; + + /* + * Whether we're updating free space cache, if true we need to re-search + * current block group but don't try updating free space cache again. + */ + bool retry_unclustered; + + /* If current block group is cached */ + int cached; + + /* Max contiguous hole found */ + u64 max_extent_size; + + /* Total free space from free space cache, not always contiguous */ + u64 total_free_space; + + /* Found result */ + u64 found_offset; +}; + + +/* + * Helper function for find_free_extent(). + * + * Return -ENOENT to inform caller that we need fallback to unclustered mode. + * Return -EAGAIN to inform caller that we need to re-search this block group + * Return >0 to inform caller that we find nothing + * Return 0 means we have found a location and set ffe_ctl->found_offset. + */ +static int find_free_extent_clustered(struct btrfs_block_group_cache *bg, + struct btrfs_free_cluster *last_ptr, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group_cache **cluster_bg_ret) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + struct btrfs_block_group_cache *cluster_bg; + u64 aligned_cluster; + u64 offset; + int ret; + + cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc); + if (!cluster_bg) + goto refill_cluster; + if (cluster_bg != bg && (cluster_bg->ro || + !block_group_bits(cluster_bg, ffe_ctl->flags))) + goto release_cluster; + + offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr, + ffe_ctl->num_bytes, cluster_bg->key.objectid, + &ffe_ctl->max_extent_size); + if (offset) { + /* We have a block, we're done */ + spin_unlock(&last_ptr->refill_lock); + trace_btrfs_reserve_extent_cluster(cluster_bg, + ffe_ctl->search_start, ffe_ctl->num_bytes); + *cluster_bg_ret = cluster_bg; + ffe_ctl->found_offset = offset; + return 0; + } + WARN_ON(last_ptr->block_group != cluster_bg); + +release_cluster: + /* + * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so + * lets just skip it and let the allocator find whatever block it can + * find. If we reach this point, we will have tried the cluster + * allocator plenty of times and not have found anything, so we are + * likely way too fragmented for the clustering stuff to find anything. 
+ * + * However, if the cluster is taken from the current block group, + * release the cluster first, so that we stand a better chance of + * succeeding in the unclustered allocation. + */ + if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) { + spin_unlock(&last_ptr->refill_lock); + btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); + return -ENOENT; + } + + /* This cluster didn't work out, free it and start over */ + btrfs_return_cluster_to_free_space(NULL, last_ptr); + + if (cluster_bg != bg) + btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); + +refill_cluster: + if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) { + spin_unlock(&last_ptr->refill_lock); + return -ENOENT; + } + + aligned_cluster = max_t(u64, + ffe_ctl->empty_cluster + ffe_ctl->empty_size, + bg->full_stripe_len); + ret = btrfs_find_space_cluster(fs_info, bg, last_ptr, + ffe_ctl->search_start, ffe_ctl->num_bytes, + aligned_cluster); + if (ret == 0) { + /* Now pull our allocation out of this cluster */ + offset = btrfs_alloc_from_cluster(bg, last_ptr, + ffe_ctl->num_bytes, ffe_ctl->search_start, + &ffe_ctl->max_extent_size); + if (offset) { + /* We found one, proceed */ + spin_unlock(&last_ptr->refill_lock); + trace_btrfs_reserve_extent_cluster(bg, + ffe_ctl->search_start, + ffe_ctl->num_bytes); + ffe_ctl->found_offset = offset; + return 0; + } + } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && + !ffe_ctl->retry_clustered) { + spin_unlock(&last_ptr->refill_lock); + + ffe_ctl->retry_clustered = true; + wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + + ffe_ctl->empty_cluster + ffe_ctl->empty_size); + return -EAGAIN; + } + /* + * At this point we either didn't find a cluster or we weren't able to + * allocate a block from our cluster. Free the cluster we've been + * trying to use, and go to the next block group. + */ + btrfs_return_cluster_to_free_space(NULL, last_ptr); + spin_unlock(&last_ptr->refill_lock); + return 1; +} + +/* + * Return >0 to inform caller that we find nothing + * Return 0 when we found an free extent and set ffe_ctrl->found_offset + * Return -EAGAIN to inform caller that we need to re-search this block group + */ +static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg, + struct btrfs_free_cluster *last_ptr, + struct find_free_extent_ctl *ffe_ctl) +{ + u64 offset; + + /* + * We are doing an unclustered allocation, set the fragmented flag so + * we don't bother trying to setup a cluster again until we get more + * space. + */ + if (unlikely(last_ptr)) { + spin_lock(&last_ptr->lock); + last_ptr->fragmented = 1; + spin_unlock(&last_ptr->lock); + } + if (ffe_ctl->cached) { + struct btrfs_free_space_ctl *free_space_ctl; + + free_space_ctl = bg->free_space_ctl; + spin_lock(&free_space_ctl->tree_lock); + if (free_space_ctl->free_space < + ffe_ctl->num_bytes + ffe_ctl->empty_cluster + + ffe_ctl->empty_size) { + ffe_ctl->total_free_space = max_t(u64, + ffe_ctl->total_free_space, + free_space_ctl->free_space); + spin_unlock(&free_space_ctl->tree_lock); + return 1; + } + spin_unlock(&free_space_ctl->tree_lock); + } + + offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start, + ffe_ctl->num_bytes, ffe_ctl->empty_size, + &ffe_ctl->max_extent_size); + + /* + * If we didn't find a chunk, and we haven't failed on this block group + * before, and this block group is in the middle of caching and we are + * ok with waiting, then go ahead and wait for progress to be made, and + * set @retry_unclustered to true. 
+ * + * If @retry_unclustered is true then we've already waited on this + * block group once and should move on to the next block group. + */ + if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached && + ffe_ctl->loop > LOOP_CACHING_NOWAIT) { + wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + + ffe_ctl->empty_size); + ffe_ctl->retry_unclustered = true; + return -EAGAIN; + } else if (!offset) { + return 1; + } + ffe_ctl->found_offset = offset; + return 0; +} + +/* + * Return >0 means caller needs to re-search for free extent + * Return 0 means we have the needed free extent. + * Return <0 means we failed to locate any free extent. + */ +static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, + struct btrfs_free_cluster *last_ptr, + struct btrfs_key *ins, + struct find_free_extent_ctl *ffe_ctl, + int full_search, bool use_cluster) +{ + struct btrfs_root *root = fs_info->extent_root; + int ret; + + if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) && + ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg) + ffe_ctl->orig_have_caching_bg = true; + + if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT && + ffe_ctl->have_caching_bg) + return 1; + + if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES) + return 1; + + if (ins->objectid) { + if (!use_cluster && last_ptr) { + spin_lock(&last_ptr->lock); + last_ptr->window_start = ins->objectid; + spin_unlock(&last_ptr->lock); + } + return 0; + } + + /* + * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking + * caching kthreads as we move along + * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching + * LOOP_ALLOC_CHUNK, force a chunk allocation and try again + * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try + * again + */ + if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { + ffe_ctl->index = 0; + if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) { + /* + * We want to skip the LOOP_CACHING_WAIT step if we + * don't have any uncached bgs and we've already done a + * full search through. + */ + if (ffe_ctl->orig_have_caching_bg || !full_search) + ffe_ctl->loop = LOOP_CACHING_WAIT; + else + ffe_ctl->loop = LOOP_ALLOC_CHUNK; + } else { + ffe_ctl->loop++; + } + + if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { + struct btrfs_trans_handle *trans; + int exist = 0; + + trans = current->journal_info; + if (trans) + exist = 1; + else + trans = btrfs_join_transaction(root); + + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + return ret; + } + + ret = do_chunk_alloc(trans, ffe_ctl->flags, + CHUNK_ALLOC_FORCE); + + /* + * If we can't allocate a new chunk we've already looped + * through at least once, move on to the NO_EMPTY_SIZE + * case. + */ + if (ret == -ENOSPC) + ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; + + /* Do not bail out on ENOSPC since we can do more. */ + if (ret < 0 && ret != -ENOSPC) + btrfs_abort_transaction(trans, ret); + else + ret = 0; + if (!exist) + btrfs_end_transaction(trans); + if (ret) + return ret; + } + + if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) { + /* + * Don't loop again if we already have no empty_size and + * no empty_cluster. + */ + if (ffe_ctl->empty_size == 0 && + ffe_ctl->empty_cluster == 0) + return -ENOSPC; + ffe_ctl->empty_size = 0; + ffe_ctl->empty_cluster = 0; + } + return 1; + } + return -ENOSPC; +} + +/* * walks the btree of allocated extents and find a hole of a given size. 
* The key ins is changed to record the hole: * ins->objectid == start position @@ -7248,6 +7784,20 @@ btrfs_release_block_group(struct btrfs_block_group_cache *cache, * * If there is no suitable free space, we will record the max size of * the free space extent currently. + * + * The overall logic and call chain: + * + * find_free_extent() + * |- Iterate through all block groups + * | |- Get a valid block group + * | |- Try to do clustered allocation in that block group + * | |- Try to do unclustered allocation in that block group + * | |- Check if the result is valid + * | | |- If valid, then exit + * | |- Jump to next block group + * | + * |- Push harder to find free extents + * |- If not found, re-iterate all block groups */ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, u64 ram_bytes, u64 num_bytes, u64 empty_size, @@ -7255,24 +7805,28 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, u64 flags, int delalloc) { int ret = 0; - struct btrfs_root *root = fs_info->extent_root; struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; - u64 search_start = 0; - u64 max_extent_size = 0; - u64 max_free_space = 0; - u64 empty_cluster = 0; + struct find_free_extent_ctl ffe_ctl = {0}; struct btrfs_space_info *space_info; - int loop = 0; - int index = btrfs_bg_flags_to_raid_index(flags); - bool failed_cluster_refill = false; - bool failed_alloc = false; bool use_cluster = true; - bool have_caching_bg = false; - bool orig_have_caching_bg = false; bool full_search = false; WARN_ON(num_bytes < fs_info->sectorsize); + + ffe_ctl.ram_bytes = ram_bytes; + ffe_ctl.num_bytes = num_bytes; + ffe_ctl.empty_size = empty_size; + ffe_ctl.flags = flags; + ffe_ctl.search_start = 0; + ffe_ctl.retry_clustered = false; + ffe_ctl.retry_unclustered = false; + ffe_ctl.delalloc = delalloc; + ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags); + ffe_ctl.have_caching_bg = false; + ffe_ctl.orig_have_caching_bg = false; + ffe_ctl.found_offset = 0; + ins->type = BTRFS_EXTENT_ITEM_KEY; ins->objectid = 0; ins->offset = 0; @@ -7308,7 +7862,8 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, spin_unlock(&space_info->lock); } - last_ptr = fetch_cluster_info(fs_info, space_info, &empty_cluster); + last_ptr = fetch_cluster_info(fs_info, space_info, + &ffe_ctl.empty_cluster); if (last_ptr) { spin_lock(&last_ptr->lock); if (last_ptr->block_group) @@ -7325,10 +7880,12 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, spin_unlock(&last_ptr->lock); } - search_start = max(search_start, first_logical_byte(fs_info, 0)); - search_start = max(search_start, hint_byte); - if (search_start == hint_byte) { - block_group = btrfs_lookup_block_group(fs_info, search_start); + ffe_ctl.search_start = max(ffe_ctl.search_start, + first_logical_byte(fs_info, 0)); + ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte); + if (ffe_ctl.search_start == hint_byte) { + block_group = btrfs_lookup_block_group(fs_info, + ffe_ctl.search_start); /* * we don't want to use the block group if it doesn't match our * allocation bits, or if its not cached. 
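The hunk above replaces a long list of loose locals in find_free_extent() with a single control structure handed to the helpers. A compilable user-space sketch of that pattern follows; the field names echo the ffe_ctl initializers shown, but the struct layout and helper here are illustrative, not the kernel definition:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified model of a search-control structure passed to helpers. */
struct alloc_ctl {
        uint64_t num_bytes;
        uint64_t empty_size;
        uint64_t search_start;
        uint64_t found_offset;
        int index;
        int loop;
        bool retry_clustered;
        bool retry_unclustered;
        bool have_caching_bg;
};

/* Helpers take one pointer instead of a dozen loose locals. */
static void reset_per_group_state(struct alloc_ctl *ctl)
{
        ctl->retry_clustered = false;
        ctl->retry_unclustered = false;
}

int main(void)
{
        struct alloc_ctl ctl = { .num_bytes = 16384, .index = 0 };

        reset_per_group_state(&ctl);
        printf("searching %llu bytes from index %d\n",
               (unsigned long long)ctl.num_bytes, ctl.index);
        return 0;
}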
@@ -7350,7 +7907,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, btrfs_put_block_group(block_group); up_read(&space_info->groups_sem); } else { - index = btrfs_bg_flags_to_raid_index( + ffe_ctl.index = btrfs_bg_flags_to_raid_index( block_group->flags); btrfs_lock_block_group(block_group, delalloc); goto have_block_group; @@ -7360,21 +7917,19 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, } } search: - have_caching_bg = false; - if (index == 0 || index == btrfs_bg_flags_to_raid_index(flags)) + ffe_ctl.have_caching_bg = false; + if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) || + ffe_ctl.index == 0) full_search = true; down_read(&space_info->groups_sem); - list_for_each_entry(block_group, &space_info->block_groups[index], - list) { - u64 offset; - int cached; - + list_for_each_entry(block_group, + &space_info->block_groups[ffe_ctl.index], list) { /* If the block group is read-only, we can skip it entirely. */ if (unlikely(block_group->ro)) continue; btrfs_grab_block_group(block_group, delalloc); - search_start = block_group->key.objectid; + ffe_ctl.search_start = block_group->key.objectid; /* * this can happen if we end up cycling through all the @@ -7398,9 +7953,9 @@ search: } have_block_group: - cached = block_group_cache_done(block_group); - if (unlikely(!cached)) { - have_caching_bg = true; + ffe_ctl.cached = block_group_cache_done(block_group); + if (unlikely(!ffe_ctl.cached)) { + ffe_ctl.have_caching_bg = true; ret = cache_block_group(block_group, 0); BUG_ON(ret < 0); ret = 0; @@ -7414,322 +7969,92 @@ have_block_group: * lets look there */ if (last_ptr && use_cluster) { - struct btrfs_block_group_cache *used_block_group; - unsigned long aligned_cluster; - /* - * the refill lock keeps out other - * people trying to start a new cluster - */ - used_block_group = btrfs_lock_cluster(block_group, - last_ptr, - delalloc); - if (!used_block_group) - goto refill_cluster; - - if (used_block_group != block_group && - (used_block_group->ro || - !block_group_bits(used_block_group, flags))) - goto release_cluster; - - offset = btrfs_alloc_from_cluster(used_block_group, - last_ptr, - num_bytes, - used_block_group->key.objectid, - &max_extent_size); - if (offset) { - /* we have a block, we're done */ - spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster( - used_block_group, - search_start, num_bytes); - if (used_block_group != block_group) { - btrfs_release_block_group(block_group, - delalloc); - block_group = used_block_group; - } - goto checks; - } - - WARN_ON(last_ptr->block_group != used_block_group); -release_cluster: - /* If we are on LOOP_NO_EMPTY_SIZE, we can't - * set up a new clusters, so lets just skip it - * and let the allocator find whatever block - * it can find. If we reach this point, we - * will have tried the cluster allocator - * plenty of times and not have found - * anything, so we are likely way too - * fragmented for the clustering stuff to find - * anything. - * - * However, if the cluster is taken from the - * current block group, release the cluster - * first, so that we stand a better chance of - * succeeding in the unclustered - * allocation. 
*/ - if (loop >= LOOP_NO_EMPTY_SIZE && - used_block_group != block_group) { - spin_unlock(&last_ptr->refill_lock); - btrfs_release_block_group(used_block_group, - delalloc); - goto unclustered_alloc; - } + struct btrfs_block_group_cache *cluster_bg = NULL; - /* - * this cluster didn't work out, free it and - * start over - */ - btrfs_return_cluster_to_free_space(NULL, last_ptr); - - if (used_block_group != block_group) - btrfs_release_block_group(used_block_group, - delalloc); -refill_cluster: - if (loop >= LOOP_NO_EMPTY_SIZE) { - spin_unlock(&last_ptr->refill_lock); - goto unclustered_alloc; - } - - aligned_cluster = max_t(unsigned long, - empty_cluster + empty_size, - block_group->full_stripe_len); + ret = find_free_extent_clustered(block_group, last_ptr, + &ffe_ctl, &cluster_bg); - /* allocate a cluster in this block group */ - ret = btrfs_find_space_cluster(fs_info, block_group, - last_ptr, search_start, - num_bytes, - aligned_cluster); if (ret == 0) { - /* - * now pull our allocation out of this - * cluster - */ - offset = btrfs_alloc_from_cluster(block_group, - last_ptr, - num_bytes, - search_start, - &max_extent_size); - if (offset) { - /* we found one, proceed */ - spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster( - block_group, search_start, - num_bytes); - goto checks; + if (cluster_bg && cluster_bg != block_group) { + btrfs_release_block_group(block_group, + delalloc); + block_group = cluster_bg; } - } else if (!cached && loop > LOOP_CACHING_NOWAIT - && !failed_cluster_refill) { - spin_unlock(&last_ptr->refill_lock); - - failed_cluster_refill = true; - wait_block_group_cache_progress(block_group, - num_bytes + empty_cluster + empty_size); + goto checks; + } else if (ret == -EAGAIN) { goto have_block_group; - } - - /* - * at this point we either didn't find a cluster - * or we weren't able to allocate a block from our - * cluster. Free the cluster we've been trying - * to use, and go to the next block group - */ - btrfs_return_cluster_to_free_space(NULL, last_ptr); - spin_unlock(&last_ptr->refill_lock); - goto loop; - } - -unclustered_alloc: - /* - * We are doing an unclustered alloc, set the fragmented flag so - * we don't bother trying to setup a cluster again until we get - * more space. - */ - if (unlikely(last_ptr)) { - spin_lock(&last_ptr->lock); - last_ptr->fragmented = 1; - spin_unlock(&last_ptr->lock); - } - if (cached) { - struct btrfs_free_space_ctl *ctl = - block_group->free_space_ctl; - - spin_lock(&ctl->tree_lock); - if (ctl->free_space < - num_bytes + empty_cluster + empty_size) { - max_free_space = max(max_free_space, - ctl->free_space); - spin_unlock(&ctl->tree_lock); + } else if (ret > 0) { goto loop; } - spin_unlock(&ctl->tree_lock); + /* ret == -ENOENT case falls through */ } - offset = btrfs_find_space_for_alloc(block_group, search_start, - num_bytes, empty_size, - &max_extent_size); - /* - * If we didn't find a chunk, and we haven't failed on this - * block group before, and this block group is in the middle of - * caching and we are ok with waiting, then go ahead and wait - * for progress to be made, and set failed_alloc to true. - * - * If failed_alloc is true then we've already waited on this - * block group once and should move on to the next block group. 
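In the rewritten loop, the caller dispatches on find_free_extent_clustered()'s result: 0 proceeds to the reservation checks (possibly switching to the cluster's block group), -EAGAIN retries the same block group, a positive value moves on, and -ENOENT falls through to the unclustered path. A small stand-alone model of that dispatch, with a stub function and illustrative values only:

#include <errno.h>
#include <stdio.h>

/* Outcomes modeled after the dispatch in the hunk above (stub values). */
enum { FOUND = 0, NEXT_GROUP = 1 };

static int clustered_model(int attempt)
{
        switch (attempt) {
        case 0: return -EAGAIN;         /* wait for caching, retry this group */
        case 1: return -ENOENT;         /* cluster unusable, try unclustered */
        default: return NEXT_GROUP;     /* give up on this block group */
        }
}

int main(void)
{
        for (int attempt = 0; attempt < 3; attempt++) {
                int ret = clustered_model(attempt);

                if (ret == FOUND)
                        printf("attempt %d: found, go to checks\n", attempt);
                else if (ret == -EAGAIN)
                        printf("attempt %d: retry same block group\n", attempt);
                else if (ret == -ENOENT)
                        printf("attempt %d: fall back to unclustered\n", attempt);
                else
                        printf("attempt %d: move to next block group\n", attempt);
        }
        return 0;
}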
- */ - if (!offset && !failed_alloc && !cached && - loop > LOOP_CACHING_NOWAIT) { - wait_block_group_cache_progress(block_group, - num_bytes + empty_size); - failed_alloc = true; + ret = find_free_extent_unclustered(block_group, last_ptr, + &ffe_ctl); + if (ret == -EAGAIN) goto have_block_group; - } else if (!offset) { + else if (ret > 0) goto loop; - } + /* ret == 0 case falls through */ checks: - search_start = round_up(offset, fs_info->stripesize); + ffe_ctl.search_start = round_up(ffe_ctl.found_offset, + fs_info->stripesize); /* move on to the next group */ - if (search_start + num_bytes > + if (ffe_ctl.search_start + num_bytes > block_group->key.objectid + block_group->key.offset) { - btrfs_add_free_space(block_group, offset, num_bytes); + btrfs_add_free_space(block_group, ffe_ctl.found_offset, + num_bytes); goto loop; } - if (offset < search_start) - btrfs_add_free_space(block_group, offset, - search_start - offset); + if (ffe_ctl.found_offset < ffe_ctl.search_start) + btrfs_add_free_space(block_group, ffe_ctl.found_offset, + ffe_ctl.search_start - ffe_ctl.found_offset); ret = btrfs_add_reserved_bytes(block_group, ram_bytes, num_bytes, delalloc); if (ret == -EAGAIN) { - btrfs_add_free_space(block_group, offset, num_bytes); + btrfs_add_free_space(block_group, ffe_ctl.found_offset, + num_bytes); goto loop; } btrfs_inc_block_group_reservations(block_group); /* we are all good, lets return */ - ins->objectid = search_start; + ins->objectid = ffe_ctl.search_start; ins->offset = num_bytes; - trace_btrfs_reserve_extent(block_group, search_start, num_bytes); + trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start, + num_bytes); btrfs_release_block_group(block_group, delalloc); break; loop: - failed_cluster_refill = false; - failed_alloc = false; + ffe_ctl.retry_clustered = false; + ffe_ctl.retry_unclustered = false; BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != - index); + ffe_ctl.index); btrfs_release_block_group(block_group, delalloc); cond_resched(); } up_read(&space_info->groups_sem); - if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg - && !orig_have_caching_bg) - orig_have_caching_bg = true; - - if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) - goto search; - - if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) + ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl, + full_search, use_cluster); + if (ret > 0) goto search; - /* - * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking - * caching kthreads as we move along - * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching - * LOOP_ALLOC_CHUNK, force a chunk allocation and try again - * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try - * again - */ - if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { - index = 0; - if (loop == LOOP_CACHING_NOWAIT) { - /* - * We want to skip the LOOP_CACHING_WAIT step if we - * don't have any uncached bgs and we've already done a - * full search through. 
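The comments above describe the loop progression LOOP_CACHING_NOWAIT, then LOOP_CACHING_WAIT (or straight to LOOP_ALLOC_CHUNK when there were no uncached block groups and a full search was already done), and finally LOOP_NO_EMPTY_SIZE. A toy model of that progression, assuming the transition rule quoted in the comment; the enum values and helper are illustrative only:

#include <stdbool.h>
#include <stdio.h>

/* Mirrors the loop stages named in the comments above. */
enum loop_stage {
        LOOP_CACHING_NOWAIT,
        LOOP_CACHING_WAIT,
        LOOP_ALLOC_CHUNK,
        LOOP_NO_EMPTY_SIZE,
};

/* Illustrative model of the progression, not the kernel function. */
static enum loop_stage next_stage(enum loop_stage cur, bool had_caching_bg,
                                  bool full_search)
{
        if (cur == LOOP_CACHING_NOWAIT)
                return (had_caching_bg || !full_search) ? LOOP_CACHING_WAIT
                                                        : LOOP_ALLOC_CHUNK;
        return cur + 1;
}

int main(void)
{
        enum loop_stage s = LOOP_CACHING_NOWAIT;

        while (s < LOOP_NO_EMPTY_SIZE) {
                s = next_stage(s, false, true); /* no uncached bgs, full search */
                printf("advanced to stage %d\n", s);
        }
        return 0;
}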
- */ - if (orig_have_caching_bg || !full_search) - loop = LOOP_CACHING_WAIT; - else - loop = LOOP_ALLOC_CHUNK; - } else { - loop++; - } - - if (loop == LOOP_ALLOC_CHUNK) { - struct btrfs_trans_handle *trans; - int exist = 0; - - trans = current->journal_info; - if (trans) - exist = 1; - else - trans = btrfs_join_transaction(root); - - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out; - } - - ret = do_chunk_alloc(trans, flags, CHUNK_ALLOC_FORCE); - - /* - * If we can't allocate a new chunk we've already looped - * through at least once, move on to the NO_EMPTY_SIZE - * case. - */ - if (ret == -ENOSPC) - loop = LOOP_NO_EMPTY_SIZE; - - /* - * Do not bail out on ENOSPC since we - * can do more things. - */ - if (ret < 0 && ret != -ENOSPC) - btrfs_abort_transaction(trans, ret); - else - ret = 0; - if (!exist) - btrfs_end_transaction(trans); - if (ret) - goto out; - } - - if (loop == LOOP_NO_EMPTY_SIZE) { - /* - * Don't loop again if we already have no empty_size and - * no empty_cluster. - */ - if (empty_size == 0 && - empty_cluster == 0) { - ret = -ENOSPC; - goto out; - } - empty_size = 0; - empty_cluster = 0; - } - - goto search; - } else if (!ins->objectid) { - ret = -ENOSPC; - } else if (ins->objectid) { - if (!use_cluster && last_ptr) { - spin_lock(&last_ptr->lock); - last_ptr->window_start = ins->objectid; - spin_unlock(&last_ptr->lock); - } - ret = 0; - } -out: if (ret == -ENOSPC) { - if (!max_extent_size) - max_extent_size = max_free_space; + /* + * Use ffe_ctl->total_free_space as fallback if we can't find + * any contiguous hole. + */ + if (!ffe_ctl.max_extent_size) + ffe_ctl.max_extent_size = ffe_ctl.total_free_space; spin_lock(&space_info->lock); - space_info->max_extent_size = max_extent_size; + space_info->max_extent_size = ffe_ctl.max_extent_size; spin_unlock(&space_info->lock); - ins->offset = max_extent_size; + ins->offset = ffe_ctl.max_extent_size; } return ret; } @@ -8169,13 +8494,13 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_header_generation(buf, trans->transid); btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(buf, owner); - write_extent_buffer_fsid(buf, fs_info->fsid); + write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid); write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid); if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { buf->log_index = root->log_transid % 2; /* * we allow two log transactions at a time, use different - * EXENT bit to differentiate dirty pages. + * EXTENT bit to differentiate dirty pages. */ if (buf->log_index == 0) set_extent_dirty(&root->dirty_log_pages, buf->start, @@ -8221,7 +8546,12 @@ again: goto again; } - if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { + /* + * The global reserve still exists to save us from ourselves, so don't + * warn_on if we are short on our delayed refs reserve. 
+ */ + if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS && + btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL * 10, /*DEFAULT_RATELIMIT_BURST*/ 1); @@ -8544,7 +8874,6 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, u64 bytenr; u64 generation; u64 parent; - u32 blocksize; struct btrfs_key key; struct btrfs_key first_key; struct extent_buffer *next; @@ -8569,7 +8898,6 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); btrfs_node_key_to_cpu(path->nodes[level], &first_key, path->slots[level]); - blocksize = fs_info->nodesize; next = find_extent_buffer(fs_info, bytenr); if (!next) { @@ -8693,7 +9021,7 @@ skip: ret); } } - ret = btrfs_free_extent(trans, root, bytenr, blocksize, + ret = btrfs_free_extent(trans, root, bytenr, fs_info->nodesize, parent, root->root_key.objectid, level - 1, 0); if (ret) @@ -8944,9 +9272,22 @@ int btrfs_drop_snapshot(struct btrfs_root *root, goto out_free; } + err = btrfs_run_delayed_items(trans); + if (err) + goto out_end_trans; + if (block_rsv) trans->block_rsv = block_rsv; + /* + * This will help us catch people modifying the fs tree while we're + * dropping it. It is unsafe to mess with the fs tree while it's being + * dropped as we unlock the root node and parent nodes as we walk down + * the tree, assuming nothing will change. If something does change + * then we'll have stale information and drop references to blocks we've + * already dropped. + */ + set_bit(BTRFS_ROOT_DELETING, &root->state); if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_header_level(root->node); path->nodes[level] = btrfs_lock_root_node(root); @@ -9421,7 +9762,7 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache) } /* - * checks to see if its even possible to relocate this block group. + * Checks to see if it's even possible to relocate this block group. * * @return - -1 if it's not a good idea to relocate this block group, 0 if its * ok to go ahead and try. @@ -10049,7 +10390,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) * check for two cases, either we are full, and therefore * don't need to bother with the caching work since we won't * find any space, or we are empty, and we can just add all - * the space in and be done with it. This saves us _alot_ of + * the space in and be done with it. This saves us _a_lot_ of * time, particularly in the full case. */ if (found_key.offset == btrfs_block_group_used(&cache->item)) { @@ -10154,6 +10495,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) add_block_group_free_space(trans, block_group); /* already aborted the transaction if it failed. 
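btrfs_drop_snapshot() above sets a "deleting" bit so other paths can notice a root that is being torn down and avoid touching it. A toy user-space model of that guard; the struct and helper names are invented for illustration and do not correspond to kernel symbols:

#include <stdbool.h>
#include <stdio.h>

/* Toy model of the "deleting" flag checked by would-be modifiers. */
struct root_model {
        bool deleting;  /* stands in for the root-state bit set before dropping */
};

static int try_modify(struct root_model *root)
{
        if (root->deleting) {
                fprintf(stderr, "refusing to modify a root being dropped\n");
                return -1;
        }
        return 0;
}

int main(void)
{
        struct root_model root = { .deleting = false };

        try_modify(&root);              /* allowed */
        root.deleting = true;           /* analogous to setting the deleting bit */
        try_modify(&root);              /* now rejected */
        return 0;
}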
*/ next: + btrfs_delayed_refs_rsv_release(fs_info, 1); list_del_init(&block_group->bg_list); } btrfs_trans_release_chunk_metadata(trans); @@ -10231,6 +10573,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, link_block_group(cache); list_add_tail(&cache->bg_list, &trans->new_bgs); + trans->delayed_ref_updates++; + btrfs_update_delayed_refs_rsv(trans); set_avail_alloc_bits(fs_info, type); return 0; @@ -10268,6 +10612,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, int factor; struct btrfs_caching_control *caching_ctl = NULL; bool remove_em; + bool remove_rsv = false; block_group = btrfs_lookup_block_group(fs_info, group_start); BUG_ON(!block_group); @@ -10315,7 +10660,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, mutex_lock(&trans->transaction->cache_write_mutex); /* - * make sure our free spache cache IO is done before remove the + * Make sure our free space cache IO is done before removing the * free space inode */ spin_lock(&trans->transaction->dirty_bgs_lock); @@ -10332,6 +10677,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, if (!list_empty(&block_group->dirty_list)) { list_del_init(&block_group->dirty_list); + remove_rsv = true; btrfs_put_block_group(block_group); } spin_unlock(&trans->transaction->dirty_bgs_lock); @@ -10541,6 +10887,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, ret = btrfs_del_item(trans, root, path); out: + if (remove_rsv) + btrfs_delayed_refs_rsv_release(fs_info, 1); btrfs_free_path(path); return ret; } @@ -10698,7 +11046,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_lock(&space_info->lock); spin_lock(&block_group->lock); - space_info->bytes_pinned -= block_group->pinned; + update_bytes_pinned(space_info, -block_group->pinned); space_info->bytes_readonly += block_group->pinned; percpu_counter_add_batch(&space_info->total_bytes_pinned, -block_group->pinned, @@ -10829,7 +11177,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, if (!blk_queue_discard(bdev_get_queue(device->bdev))) return 0; - /* Not writeable = nothing to do. */ + /* Not writable = nothing to do. 
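The changes above pair every block group queued on the transaction with one unit of the delayed-refs reserve, released again when the group is created or dropped from the dirty list. A toy counter model of that reserve/release pairing, with names invented for the sketch:

#include <assert.h>
#include <stdio.h>

/* Toy reserve counter: every queued item must be released exactly once. */
struct rsv_model {
        long reserved;
};

static void rsv_take(struct rsv_model *r)    { r->reserved++; }
static void rsv_release(struct rsv_model *r) { assert(r->reserved > 0); r->reserved--; }

int main(void)
{
        struct rsv_model rsv = { 0 };

        rsv_take(&rsv);         /* queue a new block group */
        rsv_take(&rsv);         /* queue a dirty block group */

        rsv_release(&rsv);      /* released when the pending group is created */
        rsv_release(&rsv);      /* released when the group leaves the dirty list */

        printf("outstanding reservations: %ld\n", rsv.reserved);
        assert(rsv.reserved == 0);
        return 0;
}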
*/ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) return 0; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d228f706ff3e..fc126b92ea59 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -89,9 +89,18 @@ void btrfs_leak_debug_check(void) static inline void __btrfs_debug_check_extent_io_range(const char *caller, struct extent_io_tree *tree, u64 start, u64 end) { - if (tree->ops && tree->ops->check_extent_io_range) - tree->ops->check_extent_io_range(tree->private_data, caller, - start, end); + struct inode *inode = tree->private_data; + u64 isize; + + if (!inode || !is_data_inode(inode)) + return; + + isize = i_size_read(inode); + if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { + btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, + "%s: ino %llu isize %llu odd range [%llu,%llu]", + caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); + } } #else #define btrfs_leak_debug_add(new, head) do {} while (0) @@ -344,13 +353,6 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree, return tree_search_for_insert(tree, offset, NULL, NULL); } -static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, - struct extent_state *other) -{ - if (tree->ops && tree->ops->merge_extent_hook) - tree->ops->merge_extent_hook(tree->private_data, new, other); -} - /* * utility function to look for merge candidates inside a given range. * Any extents with matching state are merged together into a single @@ -374,7 +376,10 @@ static void merge_state(struct extent_io_tree *tree, other = rb_entry(other_node, struct extent_state, rb_node); if (other->end == state->start - 1 && other->state == state->state) { - merge_cb(tree, state, other); + if (tree->private_data && + is_data_inode(tree->private_data)) + btrfs_merge_delalloc_extent(tree->private_data, + state, other); state->start = other->start; rb_erase(&other->rb_node, &tree->state); RB_CLEAR_NODE(&other->rb_node); @@ -386,7 +391,10 @@ static void merge_state(struct extent_io_tree *tree, other = rb_entry(other_node, struct extent_state, rb_node); if (other->start == state->end + 1 && other->state == state->state) { - merge_cb(tree, state, other); + if (tree->private_data && + is_data_inode(tree->private_data)) + btrfs_merge_delalloc_extent(tree->private_data, + state, other); state->end = other->end; rb_erase(&other->rb_node, &tree->state); RB_CLEAR_NODE(&other->rb_node); @@ -395,20 +403,6 @@ static void merge_state(struct extent_io_tree *tree, } } -static void set_state_cb(struct extent_io_tree *tree, - struct extent_state *state, unsigned *bits) -{ - if (tree->ops && tree->ops->set_bit_hook) - tree->ops->set_bit_hook(tree->private_data, state, bits); -} - -static void clear_state_cb(struct extent_io_tree *tree, - struct extent_state *state, unsigned *bits) -{ - if (tree->ops && tree->ops->clear_bit_hook) - tree->ops->clear_bit_hook(tree->private_data, state, bits); -} - static void set_state_bits(struct extent_io_tree *tree, struct extent_state *state, unsigned *bits, struct extent_changeset *changeset); @@ -451,13 +445,6 @@ static int insert_state(struct extent_io_tree *tree, return 0; } -static void split_cb(struct extent_io_tree *tree, struct extent_state *orig, - u64 split) -{ - if (tree->ops && tree->ops->split_extent_hook) - tree->ops->split_extent_hook(tree->private_data, orig, split); -} - /* * split a given extent state struct in two, inserting the preallocated * struct 'prealloc' as the newly created second half. 
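The extent_io.c hunks below drop the merge/split/set/clear function-pointer hooks and call the delalloc helpers directly, guarded by a cheap "is this a data inode" check on tree->private_data. A compilable toy model of that dispatch style; the types and function names are stand-ins, not the btrfs symbols:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Toy inode; only what the type check needs. */
struct inode_model {
        bool is_data;
};

struct tree_model {
        struct inode_model *private_data;
};

static void merge_delalloc_model(struct inode_model *inode)
{
        (void)inode;
        printf("delalloc accounting merged for data inode\n");
}

/* Direct call behind a cheap check, instead of an ops->merge hook. */
static void merge_state_model(struct tree_model *tree)
{
        if (tree->private_data && tree->private_data->is_data)
                merge_delalloc_model(tree->private_data);
}

int main(void)
{
        struct inode_model data = { .is_data = true };
        struct tree_model data_tree = { .private_data = &data };
        struct tree_model btree_tree = { .private_data = NULL };

        merge_state_model(&data_tree);  /* calls the delalloc helper */
        merge_state_model(&btree_tree); /* no-op for metadata trees */
        return 0;
}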
'split' indicates an @@ -477,7 +464,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, { struct rb_node *node; - split_cb(tree, orig, split); + if (tree->private_data && is_data_inode(tree->private_data)) + btrfs_split_delalloc_extent(tree->private_data, orig, split); prealloc->start = orig->start; prealloc->end = split - 1; @@ -504,7 +492,7 @@ static struct extent_state *next_state(struct extent_state *state) /* * utility function to clear some bits in an extent state struct. - * it will optionally wake up any one waiting on this state (wake == 1). + * it will optionally wake up anyone waiting on this state (wake == 1). * * If no bits are set on the state struct after clearing things, the * struct is freed and removed from the tree @@ -523,7 +511,10 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, WARN_ON(range > tree->dirty_bytes); tree->dirty_bytes -= range; } - clear_state_cb(tree, state, bits); + + if (tree->private_data && is_data_inode(tree->private_data)) + btrfs_clear_delalloc_extent(tree->private_data, state, bits); + ret = add_extent_changeset(state, bits_to_clear, changeset, 0); BUG_ON(ret < 0); state->state &= ~bits_to_clear; @@ -800,7 +791,9 @@ static void set_state_bits(struct extent_io_tree *tree, unsigned bits_to_set = *bits & ~EXTENT_CTLBITS; int ret; - set_state_cb(tree, state, bits); + if (tree->private_data && is_data_inode(tree->private_data)) + btrfs_set_delalloc_extent(tree->private_data, state, bits); + if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; tree->dirty_bytes += range; @@ -1459,16 +1452,16 @@ out: * find a contiguous range of bytes in the file marked as delalloc, not * more than 'max_bytes'. start and end are used to return the range, * - * 1 is returned if we find something, 0 if nothing was in the tree + * true is returned if we find something, false if nothing was in the tree */ -static noinline u64 find_delalloc_range(struct extent_io_tree *tree, +static noinline bool find_delalloc_range(struct extent_io_tree *tree, u64 *start, u64 *end, u64 max_bytes, struct extent_state **cached_state) { struct rb_node *node; struct extent_state *state; u64 cur_start = *start; - u64 found = 0; + bool found = false; u64 total_bytes = 0; spin_lock(&tree->lock); @@ -1479,8 +1472,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, */ node = tree_search(tree, cur_start); if (!node) { - if (!found) - *end = (u64)-1; + *end = (u64)-1; goto out; } @@ -1500,7 +1492,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, *cached_state = state; refcount_inc(&state->refs); } - found++; + found = true; *end = state->end; cur_start = state->end + 1; node = rb_next(node); @@ -1558,19 +1550,22 @@ static noinline int lock_delalloc_pages(struct inode *inode, } /* - * find a contiguous range of bytes in the file marked as delalloc, not - * more than 'max_bytes'. start and end are used to return the range, + * Find and lock a contiguous range of bytes in the file marked as delalloc, no + * more than @max_bytes. 
@Start and @end are used to return the range, * - * 1 is returned if we find something, 0 if nothing was in the tree + * Return: true if we find something + * false if nothing was in the tree */ -static noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode, +EXPORT_FOR_TESTS +noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, struct page *locked_page, u64 *start, - u64 *end, u64 max_bytes) + u64 *end) { + u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; u64 delalloc_start; u64 delalloc_end; - u64 found; + bool found; struct extent_state *cached_state = NULL; int ret; int loops = 0; @@ -1585,7 +1580,7 @@ again: *start = delalloc_start; *end = delalloc_end; free_extent_state(cached_state); - return 0; + return false; } /* @@ -1605,6 +1600,7 @@ again: /* step two, lock all the pages after the page that has start */ ret = lock_delalloc_pages(inode, locked_page, delalloc_start, delalloc_end); + ASSERT(!ret || ret == -EAGAIN); if (ret == -EAGAIN) { /* some of the pages are gone, lets avoid looping by * shortening the size of the delalloc range we're searching @@ -1616,11 +1612,10 @@ again: loops = 1; goto again; } else { - found = 0; + found = false; goto out_failed; } } - BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */ /* step three, lock the state bits for the whole range */ lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state); @@ -1643,17 +1638,6 @@ out_failed: return found; } -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -u64 btrfs_find_lock_delalloc_range(struct inode *inode, - struct extent_io_tree *tree, - struct page *locked_page, u64 *start, - u64 *end, u64 max_bytes) -{ - return find_lock_delalloc_range(inode, tree, locked_page, start, end, - max_bytes); -} -#endif - static int __process_pages_contig(struct address_space *mapping, struct page *locked_page, pgoff_t start_index, pgoff_t end_index, @@ -2349,13 +2333,11 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, } /* - * this is a generic handler for readpage errors (default - * readpage_io_failed_hook). if other copies exist, read those and write back - * good data to the failed position. does not investigate in remapping the - * failed extent elsewhere, hoping the device will be smart enough to do this as - * needed + * This is a generic handler for readpage errors. If other copies exist, read + * those and write back good data to the failed position. 
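find_lock_delalloc_range() now reports success as a bool and caps the search size internally. A small user-space model of consuming such a bool-returning range finder in a loop; the dirty range and helper here are invented for illustration:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy delalloc map: one dirty range, found by a bool-returning helper. */
static bool find_range_model(uint64_t *start, uint64_t *end)
{
        const uint64_t dirty_start = 8192, dirty_end = 16383;

        if (*start > dirty_end)
                return false;           /* nothing left in the tree */
        if (*start < dirty_start)
                *start = dirty_start;
        *end = dirty_end;
        return true;
}

int main(void)
{
        uint64_t start = 0, end = 0;

        while (find_range_model(&start, &end)) {
                printf("delalloc range [%llu, %llu]\n",
                       (unsigned long long)start, (unsigned long long)end);
                start = end + 1;        /* continue past the returned range */
        }
        return 0;
}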
Does not investigate + * in remapping the failed extent elsewhere, hoping the device will be smart + * enough to do this as needed */ - static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int failed_mirror) @@ -2412,14 +2394,9 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, void end_extent_writepage(struct page *page, int err, u64 start, u64 end) { int uptodate = (err == 0); - struct extent_io_tree *tree; int ret = 0; - tree = &BTRFS_I(page->mapping->host)->io_tree; - - if (tree->ops && tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, start, end, NULL, - uptodate); + btrfs_writepage_endio_finish_ordered(page, start, end, uptodate); if (!uptodate) { ClearPageUptodate(page); @@ -2522,6 +2499,8 @@ static void end_bio_extent_readpage(struct bio *bio) struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + bool data_inode = btrfs_ino(BTRFS_I(inode)) + != BTRFS_BTREE_INODE_OBJECTID; btrfs_debug(fs_info, "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", @@ -2551,7 +2530,7 @@ static void end_bio_extent_readpage(struct bio *bio) len = bvec->bv_len; mirror = io_bio->mirror_num; - if (likely(uptodate && tree->ops)) { + if (likely(uptodate)) { ret = tree->ops->readpage_end_io_hook(io_bio, offset, page, start, end, mirror); @@ -2567,38 +2546,37 @@ static void end_bio_extent_readpage(struct bio *bio) if (likely(uptodate)) goto readpage_ok; - if (tree->ops) { - ret = tree->ops->readpage_io_failed_hook(page, mirror); - if (ret == -EAGAIN) { - /* - * Data inode's readpage_io_failed_hook() always - * returns -EAGAIN. - * - * The generic bio_readpage_error handles errors - * the following way: If possible, new read - * requests are created and submitted and will - * end up in end_bio_extent_readpage as well (if - * we're lucky, not in the !uptodate case). In - * that case it returns 0 and we just go on with - * the next page in our bio. If it can't handle - * the error it will return -EIO and we remain - * responsible for that page. - */ - ret = bio_readpage_error(bio, offset, page, - start, end, mirror); - if (ret == 0) { - uptodate = !bio->bi_status; - offset += len; - continue; - } - } + if (data_inode) { /* - * metadata's readpage_io_failed_hook() always returns - * -EIO and fixes nothing. -EIO is also returned if - * data inode error could not be fixed. + * The generic bio_readpage_error handles errors the + * following way: If possible, new read requests are + * created and submitted and will end up in + * end_bio_extent_readpage as well (if we're lucky, + * not in the !uptodate case). In that case it returns + * 0 and we just go on with the next page in our bio. + * If it can't handle the error it will return -EIO and + * we remain responsible for that page. 
*/ - ASSERT(ret == -EIO); + ret = bio_readpage_error(bio, offset, page, start, end, + mirror); + if (ret == 0) { + uptodate = !bio->bi_status; + offset += len; + continue; + } + } else { + struct extent_buffer *eb; + + eb = (struct extent_buffer *)page->private; + set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); + eb->read_mirror = mirror; + atomic_dec(&eb->io_pages); + if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, + &eb->bflags)) + btree_readahead_hook(eb, -EIO); + + ret = -EIO; } readpage_ok: if (likely(uptodate)) { @@ -2607,7 +2585,7 @@ readpage_ok: unsigned off; /* Zero out the end if this page straddles i_size */ - off = i_size & (PAGE_SIZE-1); + off = offset_in_page(i_size); if (page->index == end_index && off) zero_user_segment(page, off, PAGE_SIZE); SetPageUptodate(page); @@ -2644,8 +2622,7 @@ readpage_ok: if (extent_len) endio_readpage_release_extent(tree, extent_start, extent_len, uptodate); - if (io_bio->end_io) - io_bio->end_io(io_bio, blk_status_to_errno(bio->bi_status)); + btrfs_io_bio_free_csum(io_bio); bio_put(bio); } @@ -2782,8 +2759,8 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, else contig = bio_end_sector(bio) == sector; - if (tree->ops && btrfs_merge_bio_hook(page, offset, page_size, - bio, bio_flags)) + ASSERT(tree->ops); + if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags)) can_merge = false; if (prev_bio_flags != bio_flags || !contig || !can_merge || @@ -2911,7 +2888,7 @@ static int __do_readpage(struct extent_io_tree *tree, if (page->index == last_byte >> PAGE_SHIFT) { char *userpage; - size_t zero_offset = last_byte & (PAGE_SIZE - 1); + size_t zero_offset = offset_in_page(last_byte); if (zero_offset) { iosize = PAGE_SIZE - zero_offset; @@ -3205,7 +3182,7 @@ static void update_nr_written(struct writeback_control *wbc, /* * helper for __extent_writepage, doing all of the delayed allocation setup. * - * This returns 1 if our fill_delalloc function did all the work required + * This returns 1 if btrfs_run_delalloc_range function did all the work required * to write the page (copy into inline extent). In this case the IO has * been started and the page is already unlocked. 
* @@ -3213,44 +3190,37 @@ static void update_nr_written(struct writeback_control *wbc, * This returns < 0 if there were errors (page still locked) */ static noinline_for_stack int writepage_delalloc(struct inode *inode, - struct page *page, struct writeback_control *wbc, - struct extent_page_data *epd, - u64 delalloc_start, - unsigned long *nr_written) + struct page *page, struct writeback_control *wbc, + u64 delalloc_start, unsigned long *nr_written) { - struct extent_io_tree *tree = epd->tree; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; u64 page_end = delalloc_start + PAGE_SIZE - 1; - u64 nr_delalloc; + bool found; u64 delalloc_to_write = 0; u64 delalloc_end = 0; int ret; int page_started = 0; - if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc) - return 0; while (delalloc_end < page_end) { - nr_delalloc = find_lock_delalloc_range(inode, tree, + found = find_lock_delalloc_range(inode, tree, page, &delalloc_start, - &delalloc_end, - BTRFS_MAX_EXTENT_SIZE); - if (nr_delalloc == 0) { + &delalloc_end); + if (!found) { delalloc_start = delalloc_end + 1; continue; } - ret = tree->ops->fill_delalloc(inode, page, - delalloc_start, - delalloc_end, - &page_started, - nr_written, wbc); + ret = btrfs_run_delalloc_range(inode, page, delalloc_start, + delalloc_end, &page_started, nr_written, wbc); /* File system has been set read-only */ if (ret) { SetPageError(page); - /* fill_delalloc should be return < 0 for error - * but just in case, we use > 0 here meaning the - * IO is started, so we don't want to return > 0 - * unless things are going well. + /* + * btrfs_run_delalloc_range should return < 0 for error + * but just in case, we use > 0 here meaning the IO is + * started, so we don't want to return > 0 unless + * things are going well. */ ret = ret < 0 ? 
ret : -EIO; goto done; @@ -3323,20 +3293,17 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, int nr = 0; bool compressed; - if (tree->ops && tree->ops->writepage_start_hook) { - ret = tree->ops->writepage_start_hook(page, start, - page_end); - if (ret) { - /* Fixup worker will requeue */ - if (ret == -EBUSY) - wbc->pages_skipped++; - else - redirty_page_for_writepage(wbc, page); + ret = btrfs_writepage_cow_fixup(page, start, page_end); + if (ret) { + /* Fixup worker will requeue */ + if (ret == -EBUSY) + wbc->pages_skipped++; + else + redirty_page_for_writepage(wbc, page); - update_nr_written(wbc, nr_written); - unlock_page(page); - return 1; - } + update_nr_written(wbc, nr_written); + unlock_page(page); + return 1; } /* @@ -3347,9 +3314,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, end = page_end; if (i_size <= start) { - if (tree->ops && tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, start, - page_end, NULL, 1); + btrfs_writepage_endio_finish_ordered(page, start, page_end, 1); goto done; } @@ -3360,9 +3325,8 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, u64 offset; if (cur >= i_size) { - if (tree->ops && tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, cur, - page_end, NULL, 1); + btrfs_writepage_endio_finish_ordered(page, cur, + page_end, 1); break; } em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, cur, @@ -3396,11 +3360,10 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, * end_io notification does not happen here for * compressed extents */ - if (!compressed && tree->ops && - tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, cur, - cur + iosize - 1, - NULL, 1); + if (!compressed) + btrfs_writepage_endio_finish_ordered(page, cur, + cur + iosize - 1, + 1); else if (compressed) { /* we don't want to end_page_writeback on * a compressed extent. this happens @@ -3469,7 +3432,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, ClearPageError(page); - pg_offset = i_size & (PAGE_SIZE - 1); + pg_offset = offset_in_page(i_size); if (page->index > end_index || (page->index == end_index && !pg_offset)) { page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); @@ -3491,11 +3454,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, set_page_extent_mapped(page); - ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written); - if (ret == 1) - goto done_unlocked; - if (ret) - goto done; + if (!epd->extent_locked) { + ret = writepage_delalloc(inode, page, wbc, start, &nr_written); + if (ret == 1) + goto done_unlocked; + if (ret) + goto done; + } ret = __extent_writepage_io(inode, page, wbc, epd, i_size, nr_written, write_flags, &nr); @@ -3934,12 +3899,25 @@ static int extent_write_cache_pages(struct address_space *mapping, range_whole = 1; scanned = 1; } - if (wbc->sync_mode == WB_SYNC_ALL) + + /* + * We do the tagged writepage as long as the snapshot flush bit is set + * and we are the first one who do the filemap_flush() on this inode. + * + * The nr_to_write == LONG_MAX is needed to make sure other flushers do + * not race in and drop the bit. 
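The snapshot-flush change above lets only the first full-range flusher that clears the inode bit switch to tagged writepages, so later flushers cannot race in and steal the tag. A single-threaded toy model of that test-and-clear hand-off (the real kernel helper is atomic; this sketch is not):

#include <stdbool.h>
#include <stdio.h>

/* Toy test-and-clear: returns the old value and clears the flag. */
static bool test_and_clear(bool *flag)
{
        bool old = *flag;

        *flag = false;
        return old;
}

int main(void)
{
        bool snapshot_flush = true;     /* set when a snapshot flush is requested */

        /* Only the first full-range flusher wins and uses tagged writepages. */
        for (int flusher = 0; flusher < 2; flusher++) {
                bool tagged = test_and_clear(&snapshot_flush);

                printf("flusher %d: tagged_writepages=%d\n", flusher, tagged);
        }
        return 0;
}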
+ */ + if (range_whole && wbc->nr_to_write == LONG_MAX && + test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH, + &BTRFS_I(inode)->runtime_flags)) + wbc->tagged_writepages = 1; + + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; retry: - if (wbc->sync_mode == WB_SYNC_ALL) + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && !nr_to_write_done && (index <= end) && @@ -4084,10 +4062,8 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, if (clear_page_dirty_for_io(page)) ret = __extent_writepage(page, &wbc_writepages, &epd); else { - if (tree->ops && tree->ops->writepage_end_io_hook) - tree->ops->writepage_end_io_hook(page, start, - start + PAGE_SIZE - 1, - NULL, 1); + btrfs_writepage_endio_finish_ordered(page, start, + start + PAGE_SIZE - 1, 1); unlock_page(page); } put_page(page); @@ -4118,42 +4094,36 @@ int extent_readpages(struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { struct bio *bio = NULL; - unsigned page_idx; unsigned long bio_flags = 0; struct page *pagepool[16]; - struct page *page; struct extent_map *em_cached = NULL; struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; int nr = 0; u64 prev_em_start = (u64)-1; - for (page_idx = 0; page_idx < nr_pages; page_idx++) { - page = list_entry(pages->prev, struct page, lru); + while (!list_empty(pages)) { + for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) { + struct page *page = list_entry(pages->prev, + struct page, lru); - prefetchw(&page->flags); - list_del(&page->lru); - if (add_to_page_cache_lru(page, mapping, - page->index, - readahead_gfp_mask(mapping))) { - put_page(page); - continue; + prefetchw(&page->flags); + list_del(&page->lru); + if (add_to_page_cache_lru(page, mapping, page->index, + readahead_gfp_mask(mapping))) { + put_page(page); + continue; + } + + pagepool[nr++] = page; } - pagepool[nr++] = page; - if (nr < ARRAY_SIZE(pagepool)) - continue; __extent_readpages(tree, pagepool, nr, &em_cached, &bio, - &bio_flags, &prev_em_start); - nr = 0; + &bio_flags, &prev_em_start); } - if (nr) - __extent_readpages(tree, pagepool, nr, &em_cached, &bio, - &bio_flags, &prev_em_start); if (em_cached) free_extent_map(em_cached); - BUG_ON(!list_empty(pages)); if (bio) return submit_one_bio(bio, 0, bio_flags); return 0; @@ -4342,7 +4312,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, /* * Sanity check, extent_fiemap() should have ensured that new - * fiemap extent won't overlap with cahced one. + * fiemap extent won't overlap with cached one. * Not recoverable. * * NOTE: Physical address can overlap, due to compression @@ -4914,13 +4884,6 @@ again: check_buffer_tree_ref(eb); set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); - /* - * We will free dummy extent buffer's if they come into - * free_extent_buffer with a ref count of 2, but if we are using this we - * want the buffers to stay in memory until we're done with them, so - * bump the ref count again. 
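The extent_readpages() rewrite above drains the page list in fixed batches of 16 through a small pool instead of indexing by a precomputed page count. A stand-alone model of that batching loop, with plain ints standing in for pages:

#include <stdio.h>

#define POOL_SIZE 16

/* Model of draining a work list in fixed-size batches. */
static void submit_batch(const int *batch, int nr)
{
        (void)batch;
        printf("submitting batch of %d\n", nr);
}

int main(void)
{
        int pages[37], remaining = 37, next = 0;

        for (int i = 0; i < 37; i++)
                pages[i] = i;

        while (remaining > 0) {
                int batch[POOL_SIZE], nr = 0;

                while (nr < POOL_SIZE && remaining > 0) {
                        batch[nr++] = pages[next++];
                        remaining--;
                }
                submit_batch(batch, nr);        /* 16, 16, then the final 5 */
        }
        return 0;
}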
- */ - atomic_inc(&eb->refs); return eb; free_eb: btrfs_release_extent_buffer(eb); @@ -5102,7 +5065,9 @@ void free_extent_buffer(struct extent_buffer *eb) while (1) { refs = atomic_read(&eb->refs); - if (refs <= 3) + if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3) + || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && + refs == 1)) break; old = atomic_cmpxchg(&eb->refs, refs, refs - 1); if (old == refs) @@ -5111,10 +5076,6 @@ void free_extent_buffer(struct extent_buffer *eb) spin_lock(&eb->refs_lock); if (atomic_read(&eb->refs) == 2 && - test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)) - atomic_dec(&eb->refs); - - if (atomic_read(&eb->refs) == 2 && test_bit(EXTENT_BUFFER_STALE, &eb->bflags) && !extent_buffer_under_io(eb) && test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) @@ -5340,7 +5301,7 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, struct page *page; char *kaddr; char *dst = (char *)dstv; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(eb->start); unsigned long i = (start_offset + start) >> PAGE_SHIFT; if (start + len > eb->len) { @@ -5350,7 +5311,7 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, return; } - offset = (start_offset + start) & (PAGE_SIZE - 1); + offset = offset_in_page(start_offset + start); while (len > 0) { page = eb->pages[i]; @@ -5375,14 +5336,14 @@ int read_extent_buffer_to_user(const struct extent_buffer *eb, struct page *page; char *kaddr; char __user *dst = (char __user *)dstv; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(eb->start); unsigned long i = (start_offset + start) >> PAGE_SHIFT; int ret = 0; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & (PAGE_SIZE - 1); + offset = offset_in_page(start_offset + start); while (len > 0) { page = eb->pages[i]; @@ -5413,10 +5374,10 @@ int map_private_extent_buffer(const struct extent_buffer *eb, char **map, unsigned long *map_start, unsigned long *map_len) { - size_t offset = start & (PAGE_SIZE - 1); + size_t offset; char *kaddr; struct page *p; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(eb->start); unsigned long i = (start_offset + start) >> PAGE_SHIFT; unsigned long end_i = (start_offset + start + min_len - 1) >> PAGE_SHIFT; @@ -5453,14 +5414,14 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, struct page *page; char *kaddr; char *ptr = (char *)ptrv; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(eb->start); unsigned long i = (start_offset + start) >> PAGE_SHIFT; int ret = 0; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & (PAGE_SIZE - 1); + offset = offset_in_page(start_offset + start); while (len > 0) { page = eb->pages[i]; @@ -5509,13 +5470,13 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv, struct page *page; char *kaddr; char *src = (char *)srcv; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(eb->start); unsigned long i = (start_offset + start) >> PAGE_SHIFT; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & (PAGE_SIZE - 1); + offset = offset_in_page(start_offset + start); while (len > 0) { page = eb->pages[i]; @@ -5539,13 +5500,13 @@ void memzero_extent_buffer(struct 
extent_buffer *eb, unsigned long start, size_t offset; struct page *page; char *kaddr; - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(eb->start); unsigned long i = (start_offset + start) >> PAGE_SHIFT; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & (PAGE_SIZE - 1); + offset = offset_in_page(start_offset + start); while (len > 0) { page = eb->pages[i]; @@ -5584,13 +5545,12 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, size_t offset; struct page *page; char *kaddr; - size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(dst->start); unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT; WARN_ON(src->len != dst_len); - offset = (start_offset + dst_offset) & - (PAGE_SIZE - 1); + offset = offset_in_page(start_offset + dst_offset); while (len > 0) { page = dst->pages[i]; @@ -5626,7 +5586,7 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb, unsigned long *page_index, size_t *page_offset) { - size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(eb->start); size_t byte_offset = BIT_BYTE(nr); size_t offset; @@ -5638,7 +5598,7 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb, offset = start_offset + start + byte_offset; *page_index = offset >> PAGE_SHIFT; - *page_offset = offset & (PAGE_SIZE - 1); + *page_offset = offset_in_page(offset); } /** @@ -5780,7 +5740,7 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, size_t cur; size_t dst_off_in_page; size_t src_off_in_page; - size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(dst->start); unsigned long dst_i; unsigned long src_i; @@ -5798,10 +5758,8 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, } while (len > 0) { - dst_off_in_page = (start_offset + dst_offset) & - (PAGE_SIZE - 1); - src_off_in_page = (start_offset + src_offset) & - (PAGE_SIZE - 1); + dst_off_in_page = offset_in_page(start_offset + dst_offset); + src_off_in_page = offset_in_page(start_offset + src_offset); dst_i = (start_offset + dst_offset) >> PAGE_SHIFT; src_i = (start_offset + src_offset) >> PAGE_SHIFT; @@ -5829,7 +5787,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, size_t src_off_in_page; unsigned long dst_end = dst_offset + len - 1; unsigned long src_end = src_offset + len - 1; - size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1); + size_t start_offset = offset_in_page(dst->start); unsigned long dst_i; unsigned long src_i; @@ -5853,10 +5811,8 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, dst_i = (start_offset + dst_end) >> PAGE_SHIFT; src_i = (start_offset + src_end) >> PAGE_SHIFT; - dst_off_in_page = (start_offset + dst_end) & - (PAGE_SIZE - 1); - src_off_in_page = (start_offset + src_end) & - (PAGE_SIZE - 1); + dst_off_in_page = offset_in_page(start_offset + dst_end); + src_off_in_page = offset_in_page(start_offset + src_end); cur = min_t(unsigned long, len, src_off_in_page + 1); cur = min(cur, dst_off_in_page + 1); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 369daa5d4f73..9673be3f3d1f 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -37,18 +37,22 @@ #define EXTENT_BIO_COMPRESSED 1 #define EXTENT_BIO_FLAG_SHIFT 16 -/* these are bit numbers for test/set bit */ -#define EXTENT_BUFFER_UPTODATE 0 
-#define EXTENT_BUFFER_DIRTY 2 -#define EXTENT_BUFFER_CORRUPT 3 -#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */ -#define EXTENT_BUFFER_TREE_REF 5 -#define EXTENT_BUFFER_STALE 6 -#define EXTENT_BUFFER_WRITEBACK 7 -#define EXTENT_BUFFER_READ_ERR 8 /* read IO error */ -#define EXTENT_BUFFER_UNMAPPED 9 -#define EXTENT_BUFFER_IN_TREE 10 -#define EXTENT_BUFFER_WRITE_ERR 11 /* write IO error */ +enum { + EXTENT_BUFFER_UPTODATE, + EXTENT_BUFFER_DIRTY, + EXTENT_BUFFER_CORRUPT, + /* this got triggered by readahead */ + EXTENT_BUFFER_READAHEAD, + EXTENT_BUFFER_TREE_REF, + EXTENT_BUFFER_STALE, + EXTENT_BUFFER_WRITEBACK, + /* read IO error */ + EXTENT_BUFFER_READ_ERR, + EXTENT_BUFFER_UNMAPPED, + EXTENT_BUFFER_IN_TREE, + /* write IO error */ + EXTENT_BUFFER_WRITE_ERR, +}; /* these are flags for __process_pages_contig */ #define PAGE_UNLOCK (1 << 0) @@ -94,38 +98,13 @@ typedef blk_status_t (extent_submit_bio_start_t)(void *private_data, struct extent_io_ops { /* - * The following callbacks must be allways defined, the function + * The following callbacks must be always defined, the function * pointer will be called unconditionally. */ extent_submit_bio_hook_t *submit_bio_hook; int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int mirror); - int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); - - /* - * Optional hooks, called if the pointer is not NULL - */ - int (*fill_delalloc)(void *private_data, struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, - struct writeback_control *wbc); - - int (*writepage_start_hook)(struct page *page, u64 start, u64 end); - void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, - struct extent_state *state, int uptodate); - void (*set_bit_hook)(void *private_data, struct extent_state *state, - unsigned *bits); - void (*clear_bit_hook)(void *private_data, - struct extent_state *state, - unsigned *bits); - void (*merge_extent_hook)(void *private_data, - struct extent_state *new, - struct extent_state *other); - void (*split_extent_hook)(void *private_data, - struct extent_state *orig, u64 split); - void (*check_extent_io_range)(void *private_data, const char *caller, - u64 start, u64 end); }; struct extent_io_tree { @@ -353,11 +332,11 @@ static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start, } static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, - u64 end) + u64 end, struct extent_state **cached) { return clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, NULL); + EXTENT_DO_ACCOUNTING, 0, 0, cached); } int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, @@ -546,10 +525,9 @@ int free_io_failure(struct extent_io_tree *failure_tree, struct extent_io_tree *io_tree, struct io_failure_record *rec); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -u64 btrfs_find_lock_delalloc_range(struct inode *inode, - struct extent_io_tree *tree, - struct page *locked_page, u64 *start, - u64 *end, u64 max_bytes); +bool find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, + struct page *locked_page, u64 *start, + u64 *end); #endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 7eea8b6e2cd3..a042a193c120 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -475,7 +475,8 @@ static struct 
extent_map *prev_extent_map(struct extent_map *em) return container_of(prev, struct extent_map, rb_node); } -/* helper for btfs_get_extent. Given an existing extent in the tree, +/* + * Helper for btrfs_get_extent. Given an existing extent in the tree, * the existing extent is the nearest extent to map_start, * and an extent that you want to insert, deal with overlap and insert * the best fitted new extent into the tree. diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 31977ffd6190..ef05a0121652 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -11,13 +11,20 @@ #define EXTENT_MAP_INLINE ((u64)-2) #define EXTENT_MAP_DELALLOC ((u64)-1) -/* bits for the flags field */ -#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */ -#define EXTENT_FLAG_COMPRESSED 1 -#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ -#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */ -#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */ -#define EXTENT_FLAG_FS_MAPPING 6 /* filesystem extent mapping type */ +/* bits for the extent_map::flags field */ +enum { + /* this entry not yet on disk, don't free it */ + EXTENT_FLAG_PINNED, + EXTENT_FLAG_COMPRESSED, + /* pre-allocated extent */ + EXTENT_FLAG_PREALLOC, + /* Logging this extent */ + EXTENT_FLAG_LOGGING, + /* Filling in a preallocated extent */ + EXTENT_FLAG_FILLING, + /* filesystem extent mapping type */ + EXTENT_FLAG_FS_MAPPING, +}; struct extent_map { struct rb_node rb_node; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index ba74827beb32..920bf3b4b0ef 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -142,11 +142,6 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, return ret; } -static void btrfs_io_bio_endio_readpage(struct btrfs_io_bio *bio, int err) -{ - kfree(bio->csum_allocated); -} - static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u64 logical_offset, u32 *dst, int dio) { @@ -175,14 +170,12 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; if (!dst) { if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { - btrfs_bio->csum_allocated = kmalloc_array(nblocks, - csum_size, GFP_NOFS); - if (!btrfs_bio->csum_allocated) { + btrfs_bio->csum = kmalloc_array(nblocks, csum_size, + GFP_NOFS); + if (!btrfs_bio->csum) { btrfs_free_path(path); return BLK_STS_RESOURCE; } - btrfs_bio->csum = btrfs_bio->csum_allocated; - btrfs_bio->end_io = btrfs_io_bio_endio_readpage; } else { btrfs_bio->csum = btrfs_bio->csum_inline; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 58e93bce3036..d38dc8c31533 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -399,7 +399,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, size_t copied = 0; size_t total_copied = 0; int pg = 0; - int offset = pos & (PAGE_SIZE - 1); + int offset = offset_in_page(pos); while (write_bytes > 0) { size_t count = min_t(size_t, @@ -1611,7 +1611,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, return -ENOMEM; while (iov_iter_count(i) > 0) { - size_t offset = pos & (PAGE_SIZE - 1); + size_t offset = offset_in_page(pos); size_t sector_offset; size_t write_bytes = min(iov_iter_count(i), nrptrs * (size_t)PAGE_SIZE - @@ -2005,7 +2005,7 @@ int btrfs_release_file(struct inode *inode, struct file *filp) filp->private_data = NULL; /* - * ordered_data_close is set by settattr when we are about to truncate + * 
ordered_data_close is set by setattr when we are about to truncate * a file from a non-zero size to a zero size. This tries to * flush down new bytes that may have been written if the * application were using truncate to replace a file in place. @@ -2114,7 +2114,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) /* * We have to do this here to avoid the priority inversion of waiting on - * IO of a lower priority task while holding a transaciton open. + * IO of a lower priority task while holding a transaction open. */ ret = btrfs_wait_ordered_range(inode, start, len); if (ret) { @@ -2154,7 +2154,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * here we could get into a situation where we're waiting on IO to * happen that is blocked on a transaction trying to commit. With start * we inc the extwriter counter, so we wait for all extwriters to exit - * before we start blocking join'ers. This comment is to keep somebody + * before we start blocking joiners. This comment is to keep somebody * from thinking they are super smart and changing this to * btrfs_join_transaction *cough*Josef*cough*. */ @@ -2186,25 +2186,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) up_write(&BTRFS_I(inode)->dio_sem); inode_unlock(inode); - /* - * If any of the ordered extents had an error, just return it to user - * space, so that the application knows some writes didn't succeed and - * can take proper action (retry for e.g.). Blindly committing the - * transaction in this case, would fool userspace that everything was - * successful. And we also want to make sure our log doesn't contain - * file extent items pointing to extents that weren't fully written to - - * just like in the non fast fsync path, where we check for the ordered - * operation's error flag before writing to the log tree and return -EIO - * if any of them had this flag set (btrfs_wait_ordered_range) - - * therefore we need to check for errors in the ordered operations, - * which are indicated by ctx.io_err. 
- */ - if (ctx.io_err) { - btrfs_end_transaction(trans); - ret = ctx.io_err; - goto out; - } - if (ret != BTRFS_NO_LOG_SYNC) { if (!ret) { ret = btrfs_sync_log(trans, root, &ctx); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index d6736595ec57..e5089087eaa6 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -74,11 +74,11 @@ out: return ret; } -struct btrfs_free_space_info * -search_free_space_info(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *block_group, - struct btrfs_path *path, int cow) +EXPORT_FOR_TESTS +struct btrfs_free_space_info *search_free_space_info( + struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, int cow) { struct btrfs_root *root = fs_info->free_space_root; struct btrfs_key key; @@ -176,6 +176,7 @@ static void le_bitmap_set(unsigned long *map, unsigned int start, int len) } } +EXPORT_FOR_TESTS int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path) @@ -315,6 +316,7 @@ out: return ret; } +EXPORT_FOR_TESTS int convert_free_space_to_extents(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path) @@ -487,6 +489,7 @@ out: return ret; } +EXPORT_FOR_TESTS int free_space_test_bit(struct btrfs_block_group_cache *block_group, struct btrfs_path *path, u64 offset) { @@ -775,6 +778,7 @@ out: return ret; } +EXPORT_FOR_TESTS int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path, u64 start, u64 size) @@ -968,6 +972,7 @@ out: return ret; } +EXPORT_FOR_TESTS int __add_to_free_space_tree(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path, u64 start, u64 size) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9ea4c6f0352f..43eb4535319d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -27,6 +27,7 @@ #include <linux/uio.h> #include <linux/magic.h> #include <linux/iversion.h> +#include <linux/swap.h> #include <asm/unaligned.h> #include "ctree.h" #include "disk-io.h" @@ -103,23 +104,23 @@ static void __endio_write_update_ordered(struct inode *inode, /* * Cleanup all submitted ordered extents in specified range to handle errors - * from the fill_dellaloc() callback. + * from the btrfs_run_delalloc_range() callback. * * NOTE: caller must ensure that when an error happens, it can not call * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata * to be released, which we want to happen only when finishing the ordered - * extent (btrfs_finish_ordered_io()). Also note that the caller of the - * fill_delalloc() callback already does proper cleanup for the first page of - * the range, that is, it invokes the callback writepage_end_io_hook() for the - * range of the first page. + * extent (btrfs_finish_ordered_io()). 
*/ static inline void btrfs_cleanup_ordered_extents(struct inode *inode, - const u64 offset, - const u64 bytes) + struct page *locked_page, + u64 offset, u64 bytes) { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; + u64 page_start = page_offset(locked_page); + u64 page_end = page_start + PAGE_SIZE - 1; + struct page *page; while (index <= end_index) { @@ -130,8 +131,18 @@ static inline void btrfs_cleanup_ordered_extents(struct inode *inode, ClearPagePrivate2(page); put_page(page); } - return __endio_write_update_ordered(inode, offset + PAGE_SIZE, - bytes - PAGE_SIZE, false); + + /* + * In case this page belongs to the delalloc range being instantiated + * then skip it, since the first page of a range is going to be + * properly cleaned up by the caller of run_delalloc_range + */ + if (page_start >= offset && page_end <= (offset + bytes - 1)) { + offset += PAGE_SIZE; + bytes -= PAGE_SIZE; + } + + return __endio_write_update_ordered(inode, offset, bytes, false); } static int btrfs_dirty_inode(struct inode *inode); @@ -229,7 +240,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, start >> PAGE_SHIFT); btrfs_set_file_extent_compression(leaf, ei, 0); kaddr = kmap_atomic(page); - offset = start & (PAGE_SIZE - 1); + offset = offset_in_page(start); write_extent_buffer(leaf, kaddr + offset, ptr, size); kunmap_atomic(kaddr); put_page(page); @@ -357,7 +368,7 @@ struct async_extent { struct async_cow { struct inode *inode; - struct btrfs_root *root; + struct btrfs_fs_info *fs_info; struct page *locked_page; u64 start; u64 end; @@ -538,8 +549,7 @@ again: &total_compressed); if (!ret) { - unsigned long offset = total_compressed & - (PAGE_SIZE - 1); + unsigned long offset = offset_in_page(total_compressed); struct page *page = pages[nr_pages - 1]; char *kaddr; @@ -847,14 +857,13 @@ retry: ins.offset, async_extent->pages, async_extent->nr_pages, async_cow->write_flags)) { - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct page *p = async_extent->pages[0]; const u64 start = async_extent->start; const u64 end = start + async_extent->ram_size - 1; p->mapping = inode->i_mapping; - tree->ops->writepage_end_io_hook(p, start, end, - NULL, 0); + btrfs_writepage_endio_finish_ordered(p, start, end, 0); + p->mapping = NULL; extent_clear_unlock_delalloc(inode, start, end, end, NULL, 0, @@ -1144,13 +1153,11 @@ static noinline void async_cow_submit(struct btrfs_work *work) { struct btrfs_fs_info *fs_info; struct async_cow *async_cow; - struct btrfs_root *root; unsigned long nr_pages; async_cow = container_of(work, struct async_cow, work); - root = async_cow->root; - fs_info = root->fs_info; + fs_info = async_cow->fs_info; nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >> PAGE_SHIFT; @@ -1179,7 +1186,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct async_cow *async_cow; - struct btrfs_root *root = BTRFS_I(inode)->root; unsigned long nr_pages; u64 cur_end; @@ -1189,7 +1195,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); BUG_ON(!async_cow); /* -ENOMEM */ async_cow->inode = igrab(inode); - async_cow->root = root; + async_cow->fs_info = fs_info; async_cow->locked_page = locked_page; async_cow->start = start; async_cow->write_flags = write_flags; @@ -1372,7 +1378,8 @@ next_slot: * Do the same check as in btrfs_cross_ref_exist but * without 
the unnecessary search. */ - if (btrfs_file_extent_generation(leaf, fi) <= + if (!nolock && + btrfs_file_extent_generation(leaf, fi) <= btrfs_root_last_snapshot(&root->root_item)) goto out_check; if (extent_type == BTRFS_FILE_EXTENT_REG && !force) @@ -1576,12 +1583,12 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end) } /* - * extent_io.c call back to do delayed allocation processing + * Function to process delayed allocation (create CoW) for ranges which are + * being touched for the first time. */ -static int run_delalloc_range(void *private_data, struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, - struct writeback_control *wbc) +int btrfs_run_delalloc_range(void *private_data, struct page *locked_page, + u64 start, u64 end, int *page_started, unsigned long *nr_written, + struct writeback_control *wbc) { struct inode *inode = private_data; int ret; @@ -1605,14 +1612,14 @@ static int run_delalloc_range(void *private_data, struct page *locked_page, write_flags); } if (ret) - btrfs_cleanup_ordered_extents(inode, start, end - start + 1); + btrfs_cleanup_ordered_extents(inode, locked_page, start, + end - start + 1); return ret; } -static void btrfs_split_extent_hook(void *private_data, - struct extent_state *orig, u64 split) +void btrfs_split_delalloc_extent(struct inode *inode, + struct extent_state *orig, u64 split) { - struct inode *inode = private_data; u64 size; /* not delalloc, ignore it */ @@ -1625,7 +1632,7 @@ static void btrfs_split_extent_hook(void *private_data, u64 new_size; /* - * See the explanation in btrfs_merge_extent_hook, the same + * See the explanation in btrfs_merge_delalloc_extent, the same * applies here, just in reverse. */ new_size = orig->end - split + 1; @@ -1642,16 +1649,13 @@ static void btrfs_split_extent_hook(void *private_data, } /* - * extent_io.c merge_extent_hook, used to track merged delayed allocation - * extents so we can keep track of new extents that are just merged onto old - * extents, such as when we are doing sequential writes, so we can properly - * account for the metadata space we'll need. + * Handle merged delayed allocation extents so we can keep track of new extents + * that are just merged onto old extents, such as when we are doing sequential + * writes, so we can properly account for the metadata space we'll need. */ -static void btrfs_merge_extent_hook(void *private_data, - struct extent_state *new, - struct extent_state *other) +void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, + struct extent_state *other) { - struct inode *inode = private_data; u64 new_size, old_size; u32 num_extents; @@ -1755,15 +1759,12 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root, } /* - * extent_io.c set_bit_hook, used to track delayed allocation - * bytes in this file, and to maintain the list of inodes that - * have pending delalloc work to be done. + * Properly track delayed allocation bytes in the inode and to maintain the + * list of inodes that have pending delalloc work to be done. 
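As an aside to the delalloc split/merge hooks above (btrfs_merge_delalloc_extent, btrfs_split_delalloc_extent): the metadata reservation they adjust is driven by how many worst-case extents a delalloc range may still turn into. A minimal userspace sketch of that arithmetic, assuming count_max_extents() rounds the range length up to 128 MiB units (the BTRFS_MAX_EXTENT_SIZE cap used in this series) — merging two 64 MiB ranges drops the worst case from two extents to one, which is the adjustment the merge hook makes:

    #include <stdio.h>
    #include <stdint.h>

    #define MAX_EXTENT_SIZE (128ULL * 1024 * 1024)   /* assumed BTRFS_MAX_EXTENT_SIZE */

    /* DIV_ROUND_UP(len, MAX_EXTENT_SIZE): worst-case extent count for a range */
    static uint64_t count_max_extents(uint64_t len)
    {
            return (len + MAX_EXTENT_SIZE - 1) / MAX_EXTENT_SIZE;
    }

    int main(void)
    {
            uint64_t a = 64ULL * 1024 * 1024;   /* existing delalloc range */
            uint64_t b = 64ULL * 1024 * 1024;   /* new range merged onto it */

            uint64_t before = count_max_extents(a) + count_max_extents(b);
            uint64_t after  = count_max_extents(a + b);

            printf("reserved extents before merge: %llu, after: %llu\n",
                   (unsigned long long)before, (unsigned long long)after);
            return 0;
    }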
*/ -static void btrfs_set_bit_hook(void *private_data, - struct extent_state *state, unsigned *bits) +void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, + unsigned *bits) { - struct inode *inode = private_data; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC)) @@ -1809,14 +1810,14 @@ static void btrfs_set_bit_hook(void *private_data, } /* - * extent_io.c clear_bit_hook, see set_bit_hook for why + * Once a range is no longer delalloc this function ensures that proper + * accounting happens. */ -static void btrfs_clear_bit_hook(void *private_data, - struct extent_state *state, - unsigned *bits) +void btrfs_clear_delalloc_extent(struct inode *vfs_inode, + struct extent_state *state, unsigned *bits) { - struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + struct btrfs_inode *inode = BTRFS_I(vfs_inode); + struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb); u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(len); @@ -1841,7 +1842,7 @@ static void btrfs_clear_bit_hook(void *private_data, /* * We don't reserve metadata space for space cache inodes so we - * don't need to call dellalloc_release_metadata if there is an + * don't need to call delalloc_release_metadata if there is an * error. */ if (*bits & EXTENT_CLEAR_META_RESV && @@ -1880,16 +1881,21 @@ static void btrfs_clear_bit_hook(void *private_data, } /* - * Merge bio hook, this must check the chunk tree to make sure we don't create - * bios that span stripes or chunks + * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit + * in a chunk's stripe. This function ensures that bios do not span a + * stripe/chunk * - * return 1 if page cannot be merged to bio - * return 0 if page can be merged to bio + * @page - The page we are about to add to the bio + * @size - size we want to add to the bio + * @bio - bio we want to ensure is smaller than a stripe + * @bio_flags - flags of the bio + * + * return 1 if page cannot be added to the bio + * return 0 if page can be added to the bio * return error otherwise */ -int btrfs_merge_bio_hook(struct page *page, unsigned long offset, - size_t size, struct bio *bio, - unsigned long bio_flags) +int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, + unsigned long bio_flags) { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -1932,29 +1938,6 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio, } /* - * in order to insert checksums into the metadata in large chunks, - * we wait until bio submission time. All the pages in the bio are - * checksummed and sums are attached onto the ordered extent record. - * - * At IO completion time the cums attached on the ordered extent record - * are inserted into the btree - */ -blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, - int mirror_num) -{ - struct inode *inode = private_data; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - blk_status_t ret; - - ret = btrfs_map_bio(fs_info, bio, mirror_num, 1); - if (ret) { - bio->bi_status = ret; - bio_endio(bio); - } - return ret; -} - -/* * extent_io.c submission hook. This does the right thing for csum calculation * on write, or reading the csums from the tree before a read. 
* @@ -2056,7 +2039,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state, int dedupe) { - WARN_ON((end & (PAGE_SIZE - 1)) == 0); + WARN_ON(PAGE_ALIGNED(end)); return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, extra_bits, cached_state); } @@ -2152,7 +2135,7 @@ out_page: * to fix it up. The async helper will wait for ordered extents, set * the delalloc bit and make it safe to write the page. */ -static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) +int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -3159,8 +3142,8 @@ static void finish_ordered_fn(struct btrfs_work *work) btrfs_finish_ordered_io(ordered_extent); } -static void btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, - struct extent_state *state, int uptodate) +void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, + u64 end, int uptodate) { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -3686,6 +3669,21 @@ cache_index: * inode is not a directory, logging its parent unnecessarily. */ BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; + /* + * Similar reasoning for last_link_trans, needs to be set otherwise + * for a case like the following: + * + * mkdir A + * touch foo + * ln foo A/bar + * echo 2 > /proc/sys/vm/drop_caches + * fsync foo + * <power failure> + * + * Would result in link bar and directory A not existing after the power + * failure. + */ + BTRFS_I(inode)->last_link_trans = BTRFS_I(inode)->last_trans; path->slots[0]++; if (inode->i_nlink != 1 || @@ -4444,31 +4442,6 @@ out: return err; } -static int truncate_space_check(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytes_deleted) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - int ret; - - /* - * This is only used to apply pressure to the enospc system, we don't - * intend to use this reservation at all. - */ - bytes_deleted = btrfs_csum_bytes_to_leaves(fs_info, bytes_deleted); - bytes_deleted *= fs_info->nodesize; - ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv, - bytes_deleted, BTRFS_RESERVE_NO_FLUSH); - if (!ret) { - trace_btrfs_space_reservation(fs_info, "transaction", - trans->transid, - bytes_deleted, 1); - trans->bytes_reserved += bytes_deleted; - } - return ret; - -} - /* * Return this if we need to call truncate_block for the last bit of the * truncate. @@ -4513,7 +4486,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u64 bytes_deleted = 0; bool be_nice = false; bool should_throttle = false; - bool should_end = false; BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); @@ -4544,7 +4516,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, /* * This function is also used to drop the items in the log tree before * we relog the inode, so if root != BTRFS_I(inode)->root, it means - * it is used to drop the loged items. So we shouldn't kill the delayed + * it is used to drop the logged items. So we shouldn't kill the delayed * items. 
*/ if (min_type == 0 && root == BTRFS_I(inode)->root) @@ -4726,15 +4698,7 @@ delete: btrfs_abort_transaction(trans, ret); break; } - if (btrfs_should_throttle_delayed_refs(trans)) - btrfs_async_run_delayed_refs(fs_info, - trans->delayed_ref_updates * 2, - trans->transid, 0); if (be_nice) { - if (truncate_space_check(trans, root, - extent_num_bytes)) { - should_end = true; - } if (btrfs_should_throttle_delayed_refs(trans)) should_throttle = true; } @@ -4745,7 +4709,7 @@ delete: if (path->slots[0] == 0 || path->slots[0] != pending_del_slot || - should_throttle || should_end) { + should_throttle) { if (pending_del_nr) { ret = btrfs_del_items(trans, root, path, pending_del_slot, @@ -4757,23 +4721,24 @@ delete: pending_del_nr = 0; } btrfs_release_path(path); - if (should_throttle) { - unsigned long updates = trans->delayed_ref_updates; - if (updates) { - trans->delayed_ref_updates = 0; - ret = btrfs_run_delayed_refs(trans, - updates * 2); - if (ret) - break; - } - } + /* - * if we failed to refill our space rsv, bail out - * and let the transaction restart + * We can generate a lot of delayed refs, so we need to + * throttle every once and a while and make sure we're + * adding enough space to keep up with the work we are + * generating. Since we hold a transaction here we + * can't flush, and we don't want to FLUSH_LIMIT because + * we could have generated too many delayed refs to + * actually allocate, so just bail if we're short and + * let the normal reservation dance happen higher up. */ - if (should_end) { - ret = -EAGAIN; - break; + if (should_throttle) { + ret = btrfs_delayed_refs_rsv_refill(fs_info, + BTRFS_RESERVE_NO_FLUSH); + if (ret) { + ret = -EAGAIN; + break; + } } goto search_again; } else { @@ -4799,18 +4764,6 @@ out: } btrfs_free_path(path); - - if (be_nice && bytes_deleted > SZ_32M && (ret >= 0 || ret == -EAGAIN)) { - unsigned long updates = trans->delayed_ref_updates; - int err; - - if (updates) { - trans->delayed_ref_updates = 0; - err = btrfs_run_delayed_refs(trans, updates * 2); - if (err) - ret = err; - } - } return ret; } @@ -5155,7 +5108,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) truncate_setsize(inode, newsize); - /* Disable nonlocked read DIO to avoid the end less truncate */ + /* Disable nonlocked read DIO to avoid the endless truncate */ btrfs_inode_block_unlocked_dio(BTRFS_I(inode)); inode_dio_wait(inode); btrfs_inode_resume_unlocked_dio(BTRFS_I(inode)); @@ -5333,8 +5286,8 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, * Try to steal from the global reserve if there is space for * it. */ - if (!btrfs_check_space_for_delayed_refs(trans) && - !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, false)) + if (!btrfs_check_space_for_delayed_refs(fs_info) && + !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) return trans; /* If not, commit and try again. 
*/ @@ -6406,14 +6359,19 @@ fail_dir_item: err = btrfs_del_root_ref(trans, key.objectid, root->root_key.objectid, parent_ino, &local_index, name, name_len); - + if (err) + btrfs_abort_transaction(trans, err); } else if (add_backref) { u64 local_index; int err; err = btrfs_del_inode_ref(trans, root, name, name_len, ino, parent_ino, &local_index); + if (err) + btrfs_abort_transaction(trans, err); } + + /* Return the original error code */ return ret; } @@ -6625,6 +6583,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (err) goto fail; } + BTRFS_I(inode)->last_link_trans = trans->transid; d_instantiate(dentry, inode); ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent, true, NULL); @@ -6652,7 +6611,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; int err = 0; - int drop_on_err = 0; u64 objectid = 0; u64 index = 0; @@ -6678,7 +6636,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto out_fail; } - drop_on_err = 1; /* these must be set before we unlock the inode */ inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; @@ -6699,7 +6656,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto out_fail; d_instantiate_new(dentry, inode); - drop_on_err = 0; out_fail: btrfs_end_transaction(trans); @@ -8053,9 +8009,7 @@ static void btrfs_endio_direct_read(struct bio *bio) dio_bio->bi_status = err; dio_end_io(dio_bio); - - if (io_bio->end_io) - io_bio->end_io(io_bio, blk_status_to_errno(err)); + btrfs_io_bio_free_csum(io_bio); bio_put(bio); } @@ -8098,7 +8052,7 @@ static void __endio_write_update_ordered(struct inode *inode, return; /* * Our bio might span multiple ordered extents. In this case - * we keep goin until we have accounted the whole dio. + * we keep going until we have accounted the whole dio. */ if (ordered_offset < offset + bytes) { ordered_bytes = offset + bytes - ordered_offset; @@ -8408,8 +8362,7 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, if (!ret) return; - if (io_bio->end_io) - io_bio->end_io(io_bio, ret); + btrfs_io_bio_free_csum(io_bio); free_ordered: /* @@ -8912,7 +8865,7 @@ again: /* page is wholly or partially inside EOF */ if (page_start + PAGE_SIZE > size) - zero_start = size & ~PAGE_MASK; + zero_start = offset_in_page(size); else zero_start = PAGE_SIZE; @@ -9157,6 +9110,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->index_cnt = (u64)-1; ei->dir_index = 0; ei->last_unlink_trans = 0; + ei->last_link_trans = 0; ei->last_log_commit = 0; spin_lock_init(&ei->lock); @@ -9968,7 +9922,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. 
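Several hunks in this file replace open-coded page masks (pos & (PAGE_SIZE - 1), size & ~PAGE_MASK used as an in-page offset, (end & (PAGE_SIZE - 1)) == 0) with the offset_in_page() and PAGE_ALIGNED() helpers. A small userspace sketch of the equivalence, assuming a 4 KiB page size purely for illustration:

    #include <assert.h>
    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define PAGE_MASK (~(PAGE_SIZE - 1))

    /* offset_in_page(p): byte offset of p within its page */
    #define offset_in_page(p)  ((unsigned long)(p) & (PAGE_SIZE - 1))
    /* PAGE_ALIGNED(addr): true when addr sits exactly on a page boundary */
    #define PAGE_ALIGNED(addr) (offset_in_page(addr) == 0)

    int main(void)
    {
            unsigned long pos = 12345;

            /* the three open-coded forms replaced by this patch */
            assert(offset_in_page(pos) == (pos & (PAGE_SIZE - 1)));
            assert(offset_in_page(pos) == (pos & ~PAGE_MASK));
            assert(PAGE_ALIGNED(pos)   == ((pos & (PAGE_SIZE - 1)) == 0));

            printf("offset_in_page(%lu) = %lu\n", pos, offset_in_page(pos));
            return 0;
    }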
*/ -static int start_delalloc_inodes(struct btrfs_root *root, int nr) +static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot) { struct btrfs_inode *binode; struct inode *inode; @@ -9996,6 +9950,9 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr) } spin_unlock(&root->delalloc_lock); + if (snapshot) + set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, + &binode->runtime_flags); work = btrfs_alloc_delalloc_work(inode); if (!work) { iput(inode); @@ -10029,7 +9986,7 @@ out: return ret; } -int btrfs_start_delalloc_inodes(struct btrfs_root *root) +int btrfs_start_delalloc_snapshot(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; int ret; @@ -10037,7 +9994,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root) if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EROFS; - ret = start_delalloc_inodes(root, -1); + ret = start_delalloc_inodes(root, -1, true); if (ret > 0) ret = 0; return ret; @@ -10066,7 +10023,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr) &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); - ret = start_delalloc_inodes(root, nr); + ret = start_delalloc_inodes(root, nr, false); btrfs_put_fs_root(root); if (ret < 0) goto out; @@ -10445,26 +10402,6 @@ out: return ret; } -__attribute__((const)) -static int btrfs_readpage_io_failed_hook(struct page *page, int failed_mirror) -{ - return -EAGAIN; -} - -static void btrfs_check_extent_io_range(void *private_data, const char *caller, - u64 start, u64 end) -{ - struct inode *inode = private_data; - u64 isize; - - isize = i_size_read(inode); - if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { - btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, - "%s: ino %llu isize %llu odd range [%llu,%llu]", - caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); - } -} - void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) { struct inode *inode = tree->private_data; @@ -10481,6 +10418,343 @@ void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) } } +#ifdef CONFIG_SWAP +/* + * Add an entry indicating a block group or device which is pinned by a + * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a + * negative errno on failure. + */ +static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr, + bool is_block_group) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_swapfile_pin *sp, *entry; + struct rb_node **p; + struct rb_node *parent = NULL; + + sp = kmalloc(sizeof(*sp), GFP_NOFS); + if (!sp) + return -ENOMEM; + sp->ptr = ptr; + sp->inode = inode; + sp->is_block_group = is_block_group; + + spin_lock(&fs_info->swapfile_pins_lock); + p = &fs_info->swapfile_pins.rb_node; + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_swapfile_pin, node); + if (sp->ptr < entry->ptr || + (sp->ptr == entry->ptr && sp->inode < entry->inode)) { + p = &(*p)->rb_left; + } else if (sp->ptr > entry->ptr || + (sp->ptr == entry->ptr && sp->inode > entry->inode)) { + p = &(*p)->rb_right; + } else { + spin_unlock(&fs_info->swapfile_pins_lock); + kfree(sp); + return 1; + } + } + rb_link_node(&sp->node, parent, p); + rb_insert_color(&sp->node, &fs_info->swapfile_pins); + spin_unlock(&fs_info->swapfile_pins_lock); + return 0; +} + +/* Free all of the entries pinned by this swapfile. 
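The swapfile pin tree above keys each btrfs_swapfile_pin on two fields: the pinned object (device or block group) first, then the owning inode, so the same object pinned by two different swapfiles gets two entries and an exact duplicate is reported back as 1. A hedged userspace sketch of that comparison written as an ordinary comparator (the struct and function names here are made up for the example; uintptr_t stands in for the kernel's raw pointer comparison):

    #include <stdio.h>
    #include <stdint.h>

    struct pin_key {
            uintptr_t ptr;     /* pinned block group or device */
            uintptr_t inode;   /* swapfile inode doing the pinning */
    };

    /* Returns <0, 0, >0; mirrors the rbtree walk in btrfs_add_swapfile_pin(). */
    static int pin_key_cmp(const struct pin_key *a, const struct pin_key *b)
    {
            if (a->ptr != b->ptr)
                    return a->ptr < b->ptr ? -1 : 1;
            if (a->inode != b->inode)
                    return a->inode < b->inode ? -1 : 1;
            return 0;   /* duplicate: the kernel function returns 1 here */
    }

    int main(void)
    {
            struct pin_key a = { 0x1000, 0x2000 };
            struct pin_key b = { 0x1008, 0x2000 };

            printf("cmp(a, b) = %d, cmp(a, a) = %d\n",
                   pin_key_cmp(&a, &b), pin_key_cmp(&a, &a));
            return 0;
    }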
*/ +static void btrfs_free_swapfile_pins(struct inode *inode) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_swapfile_pin *sp; + struct rb_node *node, *next; + + spin_lock(&fs_info->swapfile_pins_lock); + node = rb_first(&fs_info->swapfile_pins); + while (node) { + next = rb_next(node); + sp = rb_entry(node, struct btrfs_swapfile_pin, node); + if (sp->inode == inode) { + rb_erase(&sp->node, &fs_info->swapfile_pins); + if (sp->is_block_group) + btrfs_put_block_group(sp->ptr); + kfree(sp); + } + node = next; + } + spin_unlock(&fs_info->swapfile_pins_lock); +} + +struct btrfs_swap_info { + u64 start; + u64 block_start; + u64 block_len; + u64 lowest_ppage; + u64 highest_ppage; + unsigned long nr_pages; + int nr_extents; +}; + +static int btrfs_add_swap_extent(struct swap_info_struct *sis, + struct btrfs_swap_info *bsi) +{ + unsigned long nr_pages; + u64 first_ppage, first_ppage_reported, next_ppage; + int ret; + + first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT; + next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len, + PAGE_SIZE) >> PAGE_SHIFT; + + if (first_ppage >= next_ppage) + return 0; + nr_pages = next_ppage - first_ppage; + + first_ppage_reported = first_ppage; + if (bsi->start == 0) + first_ppage_reported++; + if (bsi->lowest_ppage > first_ppage_reported) + bsi->lowest_ppage = first_ppage_reported; + if (bsi->highest_ppage < (next_ppage - 1)) + bsi->highest_ppage = next_ppage - 1; + + ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage); + if (ret < 0) + return ret; + bsi->nr_extents += ret; + bsi->nr_pages += nr_pages; + return 0; +} + +static void btrfs_swap_deactivate(struct file *file) +{ + struct inode *inode = file_inode(file); + + btrfs_free_swapfile_pins(inode); + atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles); +} + +static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, + sector_t *span) +{ + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_state *cached_state = NULL; + struct extent_map *em = NULL; + struct btrfs_device *device = NULL; + struct btrfs_swap_info bsi = { + .lowest_ppage = (sector_t)-1ULL, + }; + int ret = 0; + u64 isize; + u64 start; + + /* + * If the swap file was just created, make sure delalloc is done. If the + * file changes again after this, the user is doing something stupid and + * we don't really care. + */ + ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); + if (ret) + return ret; + + /* + * The inode is locked, so these flags won't change after we check them. + */ + if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) { + btrfs_warn(fs_info, "swapfile must not be compressed"); + return -EINVAL; + } + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) { + btrfs_warn(fs_info, "swapfile must not be copy-on-write"); + return -EINVAL; + } + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + btrfs_warn(fs_info, "swapfile must not be checksummed"); + return -EINVAL; + } + + /* + * Balance or device remove/replace/resize can move stuff around from + * under us. The EXCL_OP flag makes sure they aren't running/won't run + * concurrently while we are mapping the swap extents, and + * fs_info->swapfile_pins prevents them from running while the swap file + * is active and moving the extents. Note that this also prevents a + * concurrent device add which isn't actually necessary, but it's not + * really worth the trouble to allow it. 
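A purely illustrative, userspace-only sketch of the rounding btrfs_add_swap_extent() performs above (not the kernel function itself): the start of a physical extent is rounded up to a whole page and its end rounded down, because the swap layer only deals in full pages, and an extent that does not cover a whole page contributes nothing; page 0 is additionally held back for the swap header. Assumes a 4 KiB page size:

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SIZE  4096ULL
    #define PAGE_SHIFT 12

    #define ALIGN(x, a)      (((x) + (a) - 1) & ~((a) - 1))
    #define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))

    int main(void)
    {
            /* a physical extent not page aligned on either end (example values) */
            uint64_t block_start = 1000000;   /* bytes */
            uint64_t block_len   = 20000;     /* bytes */

            uint64_t first_ppage = ALIGN(block_start, PAGE_SIZE) >> PAGE_SHIFT;
            uint64_t next_ppage  = ALIGN_DOWN(block_start + block_len,
                                              PAGE_SIZE) >> PAGE_SHIFT;

            if (first_ppage >= next_ppage) {
                    printf("extent too small to contribute a whole page\n");
                    return 0;
            }
            printf("usable swap pages: %llu..%llu (%llu pages)\n",
                   (unsigned long long)first_ppage,
                   (unsigned long long)(next_ppage - 1),
                   (unsigned long long)(next_ppage - first_ppage));
            return 0;
    }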
+ */ + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + btrfs_warn(fs_info, + "cannot activate swapfile while exclusive operation is running"); + return -EBUSY; + } + /* + * Snapshots can create extents which require COW even if NODATACOW is + * set. We use this counter to prevent snapshots. We must increment it + * before walking the extents because we don't want a concurrent + * snapshot to run after we've already checked the extents. + */ + atomic_inc(&BTRFS_I(inode)->root->nr_swapfiles); + + isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); + + lock_extent_bits(io_tree, 0, isize - 1, &cached_state); + start = 0; + while (start < isize) { + u64 logical_block_start, physical_block_start; + struct btrfs_block_group_cache *bg; + u64 len = isize - start; + + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + + if (em->block_start == EXTENT_MAP_HOLE) { + btrfs_warn(fs_info, "swapfile must not have holes"); + ret = -EINVAL; + goto out; + } + if (em->block_start == EXTENT_MAP_INLINE) { + /* + * It's unlikely we'll ever actually find ourselves + * here, as a file small enough to fit inline won't be + * big enough to store more than the swap header, but in + * case something changes in the future, let's catch it + * here rather than later. + */ + btrfs_warn(fs_info, "swapfile must not be inline"); + ret = -EINVAL; + goto out; + } + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + btrfs_warn(fs_info, "swapfile must not be compressed"); + ret = -EINVAL; + goto out; + } + + logical_block_start = em->block_start + (start - em->start); + len = min(len, em->len - (start - em->start)); + free_extent_map(em); + em = NULL; + + ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL); + if (ret < 0) { + goto out; + } else if (ret) { + ret = 0; + } else { + btrfs_warn(fs_info, + "swapfile must not be copy-on-write"); + ret = -EINVAL; + goto out; + } + + em = btrfs_get_chunk_map(fs_info, logical_block_start, len); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + + if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + btrfs_warn(fs_info, + "swapfile must have single data profile"); + ret = -EINVAL; + goto out; + } + + if (device == NULL) { + device = em->map_lookup->stripes[0].dev; + ret = btrfs_add_swapfile_pin(inode, device, false); + if (ret == 1) + ret = 0; + else if (ret) + goto out; + } else if (device != em->map_lookup->stripes[0].dev) { + btrfs_warn(fs_info, "swapfile must be on one device"); + ret = -EINVAL; + goto out; + } + + physical_block_start = (em->map_lookup->stripes[0].physical + + (logical_block_start - em->start)); + len = min(len, em->len - (logical_block_start - em->start)); + free_extent_map(em); + em = NULL; + + bg = btrfs_lookup_block_group(fs_info, logical_block_start); + if (!bg) { + btrfs_warn(fs_info, + "could not find block group containing swapfile"); + ret = -EINVAL; + goto out; + } + + ret = btrfs_add_swapfile_pin(inode, bg, true); + if (ret) { + btrfs_put_block_group(bg); + if (ret == 1) + ret = 0; + else + goto out; + } + + if (bsi.block_len && + bsi.block_start + bsi.block_len == physical_block_start) { + bsi.block_len += len; + } else { + if (bsi.block_len) { + ret = btrfs_add_swap_extent(sis, &bsi); + if (ret) + goto out; + } + bsi.start = start; + bsi.block_start = physical_block_start; + bsi.block_len = len; + } + + start += len; + } + + if (bsi.block_len) + ret = btrfs_add_swap_extent(sis, &bsi); + +out: + if (!IS_ERR_OR_NULL(em)) + free_extent_map(em); 
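The activation loop above translates each file offset twice: first into a logical address through the file's extent map, then into a physical offset on the single backing device through the chunk map, and it is the physical range that ends up in the swap extent list. A minimal sketch of that arithmetic with made-up numbers (plain variables stand in for the extent_map fields, this is not the kernel structures):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            /* file extent map: file offset em_start maps to logical em_block_start */
            uint64_t em_start = 0, em_block_start = 1048576;
            /* chunk map: logical chunk_start maps to stripe_physical on the device */
            uint64_t chunk_start = 1048576, stripe_physical = 268435456;

            uint64_t start = 8192;   /* offset within the swap file */

            uint64_t logical  = em_block_start + (start - em_start);
            uint64_t physical = stripe_physical + (logical - chunk_start);

            printf("file %llu -> logical %llu -> physical %llu\n",
                   (unsigned long long)start,
                   (unsigned long long)logical,
                   (unsigned long long)physical);
            return 0;
    }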
+ + unlock_extent_cached(io_tree, 0, isize - 1, &cached_state); + + if (ret) + btrfs_swap_deactivate(file); + + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + + if (ret) + return ret; + + if (device) + sis->bdev = device->bdev; + *span = bsi.highest_ppage - bsi.lowest_ppage + 1; + sis->max = bsi.nr_pages; + sis->pages = bsi.nr_pages - 1; + sis->highest_bit = bsi.nr_pages - 1; + return bsi.nr_extents; +} +#else +static void btrfs_swap_deactivate(struct file *file) +{ +} + +static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, + sector_t *span) +{ + return -EOPNOTSUPP; +} +#endif + static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, @@ -10523,17 +10797,6 @@ static const struct extent_io_ops btrfs_extent_io_ops = { /* mandatory callbacks */ .submit_bio_hook = btrfs_submit_bio_hook, .readpage_end_io_hook = btrfs_readpage_end_io_hook, - .readpage_io_failed_hook = btrfs_readpage_io_failed_hook, - - /* optional callbacks */ - .fill_delalloc = run_delalloc_range, - .writepage_end_io_hook = btrfs_writepage_end_io_hook, - .writepage_start_hook = btrfs_writepage_start_hook, - .set_bit_hook = btrfs_set_bit_hook, - .clear_bit_hook = btrfs_clear_bit_hook, - .merge_extent_hook = btrfs_merge_extent_hook, - .split_extent_hook = btrfs_split_extent_hook, - .check_extent_io_range = btrfs_check_extent_io_range, }; /* @@ -10558,6 +10821,8 @@ static const struct address_space_operations btrfs_aops = { .releasepage = btrfs_releasepage, .set_page_dirty = btrfs_set_page_dirty, .error_remove_page = generic_error_remove_page, + .swap_activate = btrfs_swap_activate, + .swap_deactivate = btrfs_swap_deactivate, }; static const struct inode_operations btrfs_file_inode_operations = { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 802a628e9f7d..fab9443f6a42 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -290,6 +290,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } else if (fsflags & FS_COMPR_FL) { const char *comp; + if (IS_SWAPFILE(inode)) { + ret = -ETXTBSY; + goto out_unlock; + } + binode->flags |= BTRFS_INODE_COMPRESS; binode->flags &= ~BTRFS_INODE_NOCOMPRESS; @@ -754,6 +759,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return -EINVAL; + if (atomic_read(&root->nr_swapfiles)) { + btrfs_warn(fs_info, + "cannot snapshot subvolume with active swapfile"); + return -ETXTBSY; + } + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_KERNEL); if (!pending_snapshot) return -ENOMEM; @@ -777,7 +788,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, wait_event(root->subv_writers->wait, percpu_counter_sum(&root->subv_writers->counter) == 0); - ret = btrfs_start_delalloc_inodes(root); + ret = btrfs_start_delalloc_snapshot(root); if (ret) goto dec_and_free; @@ -1505,9 +1516,13 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, } inode_lock(inode); - if (do_compress) - BTRFS_I(inode)->defrag_compress = compress_type; - ret = cluster_pages_for_defrag(inode, pages, i, cluster); + if (IS_SWAPFILE(inode)) { + ret = -ETXTBSY; + } else { + if (do_compress) + BTRFS_I(inode)->defrag_compress = compress_type; + ret = cluster_pages_for_defrag(inode, pages, i, cluster); + } if (ret < 0) { inode_unlock(inode); goto out_ra; @@ -3135,7 +3150,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, } rcu_read_unlock(); - memcpy(&fi_args->fsid, fs_info->fsid, 
sizeof(fi_args->fsid)); + memcpy(&fi_args->fsid, fs_devices->fsid, sizeof(fi_args->fsid)); fi_args->nodesize = fs_info->nodesize; fi_args->sectorsize = fs_info->sectorsize; fi_args->clone_alignment = fs_info->sectorsize; @@ -3191,92 +3206,6 @@ out: return ret; } -static struct page *extent_same_get_page(struct inode *inode, pgoff_t index) -{ - struct page *page; - - page = grab_cache_page(inode->i_mapping, index); - if (!page) - return ERR_PTR(-ENOMEM); - - if (!PageUptodate(page)) { - int ret; - - ret = btrfs_readpage(NULL, page); - if (ret) - return ERR_PTR(ret); - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - put_page(page); - return ERR_PTR(-EIO); - } - if (page->mapping != inode->i_mapping) { - unlock_page(page); - put_page(page); - return ERR_PTR(-EAGAIN); - } - } - - return page; -} - -static int gather_extent_pages(struct inode *inode, struct page **pages, - int num_pages, u64 off) -{ - int i; - pgoff_t index = off >> PAGE_SHIFT; - - for (i = 0; i < num_pages; i++) { -again: - pages[i] = extent_same_get_page(inode, index + i); - if (IS_ERR(pages[i])) { - int err = PTR_ERR(pages[i]); - - if (err == -EAGAIN) - goto again; - pages[i] = NULL; - return err; - } - } - return 0; -} - -static int lock_extent_range(struct inode *inode, u64 off, u64 len, - bool retry_range_locking) -{ - /* - * Do any pending delalloc/csum calculations on inode, one way or - * another, and lock file content. - * The locking order is: - * - * 1) pages - * 2) range in the inode's io tree - */ - while (1) { - struct btrfs_ordered_extent *ordered; - lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); - ordered = btrfs_lookup_first_ordered_extent(inode, - off + len - 1); - if ((!ordered || - ordered->file_offset + ordered->len <= off || - ordered->file_offset >= off + len) && - !test_range_bit(&BTRFS_I(inode)->io_tree, off, - off + len - 1, EXTENT_DELALLOC, 0, NULL)) { - if (ordered) - btrfs_put_ordered_extent(ordered); - break; - } - unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); - if (ordered) - btrfs_put_ordered_extent(ordered); - if (!retry_range_locking) - return -EAGAIN; - btrfs_wait_ordered_range(inode, off, len); - } - return 0; -} - static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2) { inode_unlock(inode1); @@ -3292,261 +3221,32 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) inode_lock_nested(inode2, I_MUTEX_CHILD); } -static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) -{ - unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); - unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); -} - -static int btrfs_double_extent_lock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len, - bool retry_range_locking) -{ - int ret; - - if (inode1 < inode2) { - swap(inode1, inode2); - swap(loff1, loff2); - } - ret = lock_extent_range(inode1, loff1, len, retry_range_locking); - if (ret) - return ret; - ret = lock_extent_range(inode2, loff2, len, retry_range_locking); - if (ret) - unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, - loff1 + len - 1); - return ret; -} - -struct cmp_pages { - int num_pages; - struct page **src_pages; - struct page **dst_pages; -}; - -static void btrfs_cmp_data_free(struct cmp_pages *cmp) -{ - int i; - struct page *pg; - - for (i = 0; i < cmp->num_pages; i++) { - pg = cmp->src_pages[i]; - if (pg) { - unlock_page(pg); - put_page(pg); - cmp->src_pages[i] = NULL; - } - pg = 
cmp->dst_pages[i]; - if (pg) { - unlock_page(pg); - put_page(pg); - cmp->dst_pages[i] = NULL; - } - } -} - -static int btrfs_cmp_data_prepare(struct inode *src, u64 loff, - struct inode *dst, u64 dst_loff, - u64 len, struct cmp_pages *cmp) -{ - int ret; - int num_pages = PAGE_ALIGN(len) >> PAGE_SHIFT; - - cmp->num_pages = num_pages; - - ret = gather_extent_pages(src, cmp->src_pages, num_pages, loff); - if (ret) - goto out; - - ret = gather_extent_pages(dst, cmp->dst_pages, num_pages, dst_loff); - -out: - if (ret) - btrfs_cmp_data_free(cmp); - return ret; -} - -static int btrfs_cmp_data(u64 len, struct cmp_pages *cmp) -{ - int ret = 0; - int i; - struct page *src_page, *dst_page; - unsigned int cmp_len = PAGE_SIZE; - void *addr, *dst_addr; - - i = 0; - while (len) { - if (len < PAGE_SIZE) - cmp_len = len; - - BUG_ON(i >= cmp->num_pages); - - src_page = cmp->src_pages[i]; - dst_page = cmp->dst_pages[i]; - ASSERT(PageLocked(src_page)); - ASSERT(PageLocked(dst_page)); - - addr = kmap_atomic(src_page); - dst_addr = kmap_atomic(dst_page); - - flush_dcache_page(src_page); - flush_dcache_page(dst_page); - - if (memcmp(addr, dst_addr, cmp_len)) - ret = -EBADE; - - kunmap_atomic(addr); - kunmap_atomic(dst_addr); - - if (ret) - break; - - len -= cmp_len; - i++; - } - - return ret; -} - -static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen, - u64 olen) -{ - u64 len = *plen; - u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize; - - if (off + olen > inode->i_size || off + olen < off) - return -EINVAL; - - /* if we extend to eof, continue to block boundary */ - if (off + len == inode->i_size) - *plen = len = ALIGN(inode->i_size, bs) - off; - - /* Check that we are block aligned - btrfs_clone() requires this */ - if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs)) - return -EINVAL; - - return 0; -} - static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen, - struct inode *dst, u64 dst_loff, - struct cmp_pages *cmp) + struct inode *dst, u64 dst_loff) { + u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; int ret; u64 len = olen; - bool same_inode = (src == dst); - u64 same_lock_start = 0; - u64 same_lock_len = 0; - - ret = extent_same_check_offsets(src, loff, &len, olen); - if (ret) - return ret; - - ret = extent_same_check_offsets(dst, dst_loff, &len, olen); - if (ret) - return ret; - - if (same_inode) { - /* - * Single inode case wants the same checks, except we - * don't want our length pushed out past i_size as - * comparing that data range makes no sense. - * - * extent_same_check_offsets() will do this for an - * unaligned length at i_size, so catch it here and - * reject the request. - * - * This effectively means we require aligned extents - * for the single-inode case, whereas the other cases - * allow an unaligned length so long as it ends at - * i_size. - */ - if (len != olen) - return -EINVAL; - - /* Check for overlapping ranges */ - if (dst_loff + len > loff && dst_loff < loff + len) - return -EINVAL; - - same_lock_start = min_t(u64, loff, dst_loff); - same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start; - } else { - /* - * If the source and destination inodes are different, the - * source's range end offset matches the source's i_size, that - * i_size is not a multiple of the sector size, and the - * destination range does not go past the destination's i_size, - * we must round down the length to the nearest sector size - * multiple. 
If we don't do this adjustment we end replacing - * with zeroes the bytes in the range that starts at the - * deduplication range's end offset and ends at the next sector - * size multiple. - */ - if (loff + olen == i_size_read(src) && - dst_loff + len < i_size_read(dst)) { - const u64 sz = BTRFS_I(src)->root->fs_info->sectorsize; - - len = round_down(i_size_read(src), sz) - loff; - if (len == 0) - return 0; - olen = len; - } - } - -again: - ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, cmp); - if (ret) - return ret; - if (same_inode) - ret = lock_extent_range(src, same_lock_start, same_lock_len, - false); - else - ret = btrfs_double_extent_lock(src, loff, dst, dst_loff, len, - false); + if (loff + len == src->i_size) + len = ALIGN(src->i_size, bs) - loff; /* - * If one of the inodes has dirty pages in the respective range or - * ordered extents, we need to flush dellaloc and wait for all ordered - * extents in the range. We must unlock the pages and the ranges in the - * io trees to avoid deadlocks when flushing delalloc (requires locking - * pages) and when waiting for ordered extents to complete (they require - * range locking). + * For same inode case we don't want our length pushed out past i_size + * as comparing that data range makes no sense. + * + * This effectively means we require aligned extents for the single + * inode case, whereas the other cases allow an unaligned length so long + * as it ends at i_size. */ - if (ret == -EAGAIN) { - /* - * Ranges in the io trees already unlocked. Now unlock all - * pages before waiting for all IO to complete. - */ - btrfs_cmp_data_free(cmp); - if (same_inode) { - btrfs_wait_ordered_range(src, same_lock_start, - same_lock_len); - } else { - btrfs_wait_ordered_range(src, loff, len); - btrfs_wait_ordered_range(dst, dst_loff, len); - } - goto again; - } - ASSERT(ret == 0); - if (WARN_ON(ret)) { - /* ranges in the io trees already unlocked */ - btrfs_cmp_data_free(cmp); - return ret; - } - - /* pass original length for comparison so we stay within i_size */ - ret = btrfs_cmp_data(olen, cmp); - if (ret == 0) - ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1); - - if (same_inode) - unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start, - same_lock_start + same_lock_len - 1); - else - btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); + if (dst == src && len != olen) + return -EINVAL; - btrfs_cmp_data_free(cmp); + /* + * Lock destination range to serialize with concurrent readpages(). 
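For illustration only (assuming a 4 KiB block size): when the source range ends exactly at i_size, the dedupe/clone length is pushed out to the next block boundary so the final, partially used block is covered too; generic_remap_file_range_prep() has already rejected the cases where that would drop an EOF block into the middle of the destination file, which is why no extra check is needed here.

    #include <stdio.h>
    #include <stdint.h>

    #define ALIGN(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

    int main(void)
    {
            uint64_t bs = 4096;                /* fs block size, illustrative */
            uint64_t i_size = 10000;           /* source file size */
            uint64_t loff = 8192;              /* source offset of the range */
            uint64_t len = i_size - loff;      /* 1808 bytes: range ends at EOF */

            if (loff + len == i_size)
                    len = ALIGN(i_size, bs) - loff;   /* extend to a full block */

            printf("cloned length rounded up to %llu bytes\n",
                   (unsigned long long)len);
            return 0;
    }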
+ */ + lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, dst_loff + len - 1); + ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1); + unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, dst_loff + len - 1); return ret; } @@ -3557,58 +3257,27 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, struct inode *dst, u64 dst_loff) { int ret; - struct cmp_pages cmp; int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT; - bool same_inode = (src == dst); u64 i, tail_len, chunk_count; - if (olen == 0) - return 0; - - if (same_inode) - inode_lock(src); - else - btrfs_double_inode_lock(src, dst); - /* don't make the dst file partly checksummed */ if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != - (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) { - ret = -EINVAL; - goto out_unlock; - } + (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) + return -EINVAL; + + if (IS_SWAPFILE(src) || IS_SWAPFILE(dst)) + return -ETXTBSY; tail_len = olen % BTRFS_MAX_DEDUPE_LEN; chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); if (chunk_count == 0) num_pages = PAGE_ALIGN(tail_len) >> PAGE_SHIFT; - /* - * If deduping ranges in the same inode, locking rules make it - * mandatory to always lock pages in ascending order to avoid deadlocks - * with concurrent tasks (such as starting writeback/delalloc). - */ - if (same_inode && dst_loff < loff) - swap(loff, dst_loff); - - /* - * We must gather up all the pages before we initiate our extent - * locking. We use an array for the page pointers. Size of the array is - * bounded by len, which is in turn bounded by BTRFS_MAX_DEDUPE_LEN. - */ - cmp.src_pages = kvmalloc_array(num_pages, sizeof(struct page *), - GFP_KERNEL | __GFP_ZERO); - cmp.dst_pages = kvmalloc_array(num_pages, sizeof(struct page *), - GFP_KERNEL | __GFP_ZERO); - if (!cmp.src_pages || !cmp.dst_pages) { - ret = -ENOMEM; - goto out_free; - } - for (i = 0; i < chunk_count; i++) { ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, - dst, dst_loff, &cmp); + dst, dst_loff); if (ret) - goto out_free; + return ret; loff += BTRFS_MAX_DEDUPE_LEN; dst_loff += BTRFS_MAX_DEDUPE_LEN; @@ -3616,17 +3285,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, if (tail_len > 0) ret = btrfs_extent_same_range(src, loff, tail_len, dst, - dst_loff, &cmp); - -out_free: - kvfree(cmp.src_pages); - kvfree(cmp.dst_pages); - -out_unlock: - if (same_inode) - inode_unlock(src); - else - btrfs_double_inode_unlock(src, dst); + dst_loff); return ret; } @@ -4213,11 +3872,9 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, struct inode *inode = file_inode(file); struct inode *src = file_inode(file_src); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; int ret; u64 len = olen; u64 bs = fs_info->sb->s_blocksize; - int same_inode = src == inode; /* * TODO: @@ -4230,101 +3887,35 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, * be either compressed or non-compressed. 
*/ - if (btrfs_root_readonly(root)) - return -EROFS; - - if (file_src->f_path.mnt != file->f_path.mnt || - src->i_sb != inode->i_sb) - return -EXDEV; - - if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) - return -EISDIR; - - if (!same_inode) { - btrfs_double_inode_lock(src, inode); - } else { - inode_lock(src); - } - /* don't make the dst file partly checksummed */ if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != - (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - ret = -EINVAL; - goto out_unlock; - } + (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) + return -EINVAL; + + if (IS_SWAPFILE(src) || IS_SWAPFILE(inode)) + return -ETXTBSY; - /* determine range to clone */ - ret = -EINVAL; - if (off + len > src->i_size || off + len < off) - goto out_unlock; - if (len == 0) - olen = len = src->i_size - off; /* - * If we extend to eof, continue to block boundary if and only if the - * destination end offset matches the destination file's size, otherwise - * we would be corrupting data by placing the eof block into the middle - * of a file. + * VFS's generic_remap_file_range_prep() protects us from cloning the + * eof block into the middle of a file, which would result in corruption + * if the file size is not blocksize aligned. So we don't need to check + * for that case here. */ - if (off + len == src->i_size) { - if (!IS_ALIGNED(len, bs) && destoff + len < inode->i_size) - goto out_unlock; + if (off + len == src->i_size) len = ALIGN(src->i_size, bs) - off; - } - - if (len == 0) { - ret = 0; - goto out_unlock; - } - - /* verify the end result is block aligned */ - if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || - !IS_ALIGNED(destoff, bs)) - goto out_unlock; - - /* verify if ranges are overlapped within the same file */ - if (same_inode) { - if (destoff + len > off && destoff < off + len) - goto out_unlock; - } if (destoff > inode->i_size) { ret = btrfs_cont_expand(inode, inode->i_size, destoff); if (ret) - goto out_unlock; + return ret; } /* - * Lock the target range too. Right after we replace the file extent - * items in the fs tree (which now point to the cloned data), we might - * have a worker replace them with extent items relative to a write - * operation that was issued before this clone operation (i.e. confront - * with inode.c:btrfs_finish_ordered_io). + * Lock destination range to serialize with concurrent readpages(). */ - if (same_inode) { - u64 lock_start = min_t(u64, off, destoff); - u64 lock_len = max_t(u64, off, destoff) + len - lock_start; - - ret = lock_extent_range(src, lock_start, lock_len, true); - } else { - ret = btrfs_double_extent_lock(src, off, inode, destoff, len, - true); - } - ASSERT(ret == 0); - if (WARN_ON(ret)) { - /* ranges in the io trees already unlocked */ - goto out_unlock; - } - + lock_extent(&BTRFS_I(inode)->io_tree, destoff, destoff + len - 1); ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); - - if (same_inode) { - u64 lock_start = min_t(u64, off, destoff); - u64 lock_end = max_t(u64, off, destoff) + len - 1; - - unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end); - } else { - btrfs_double_extent_unlock(src, off, inode, destoff, len); - } + unlock_extent(&BTRFS_I(inode)->io_tree, destoff, destoff + len - 1); /* * Truncate page cache pages so that future reads will see the cloned * data immediately and not the previous data. 
@@ -4332,11 +3923,87 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, truncate_inode_pages_range(&inode->i_data, round_down(destoff, PAGE_SIZE), round_up(destoff + len, PAGE_SIZE) - 1); -out_unlock: + + return ret; +} + +static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags) +{ + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize; + bool same_inode = inode_out == inode_in; + u64 wb_len; + int ret; + + if (!(remap_flags & REMAP_FILE_DEDUP)) { + struct btrfs_root *root_out = BTRFS_I(inode_out)->root; + + if (btrfs_root_readonly(root_out)) + return -EROFS; + + if (file_in->f_path.mnt != file_out->f_path.mnt || + inode_in->i_sb != inode_out->i_sb) + return -EXDEV; + } + + if (same_inode) + inode_lock(inode_in); + else + btrfs_double_inode_lock(inode_in, inode_out); + + /* + * Now that the inodes are locked, we need to start writeback ourselves + * and can not rely on the writeback from the VFS's generic helper + * generic_remap_file_range_prep() because: + * + * 1) For compression we must call filemap_fdatawrite_range() range + * twice (btrfs_fdatawrite_range() does it for us), and the generic + * helper only calls it once; + * + * 2) filemap_fdatawrite_range(), called by the generic helper only + * waits for the writeback to complete, i.e. for IO to be done, and + * not for the ordered extents to complete. We need to wait for them + * to complete so that new file extent items are in the fs tree. + */ + if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP)) + wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs); + else + wb_len = ALIGN(*len, bs); + + /* + * Since we don't lock ranges, wait for ongoing lockless dio writes (as + * any in progress could create its ordered extents after we wait for + * existing ordered extents below). + */ + inode_dio_wait(inode_in); if (!same_inode) - btrfs_double_inode_unlock(src, inode); + inode_dio_wait(inode_out); + + ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), + wb_len); + if (ret < 0) + goto out_unlock; + ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), + wb_len); + if (ret < 0) + goto out_unlock; + + ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, + len, remap_flags); + if (ret < 0 || *len == 0) + goto out_unlock; + + return 0; + + out_unlock: + if (same_inode) + inode_unlock(inode_in); else - inode_unlock(src); + btrfs_double_inode_unlock(inode_in, inode_out); + return ret; } @@ -4344,29 +4011,29 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, struct file *dst_file, loff_t destoff, loff_t len, unsigned int remap_flags) { + struct inode *src_inode = file_inode(src_file); + struct inode *dst_inode = file_inode(dst_file); + bool same_inode = dst_inode == src_inode; int ret; if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; - if (remap_flags & REMAP_FILE_DEDUP) { - struct inode *src = file_inode(src_file); - struct inode *dst = file_inode(dst_file); - u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; - - if (WARN_ON_ONCE(bs < PAGE_SIZE)) { - /* - * Btrfs does not support blocksize < page_size. As a - * result, btrfs_cmp_data() won't correctly handle - * this situation without an update. 
- */ - return -EINVAL; - } + ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff, + &len, remap_flags); + if (ret < 0 || len == 0) + return ret; - ret = btrfs_extent_same(src, off, len, dst, destoff); - } else { + if (remap_flags & REMAP_FILE_DEDUP) + ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff); + else ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); - } + + if (same_inode) + inode_unlock(src_inode); + else + btrfs_double_inode_unlock(src_inode, dst_inode); + return ret < 0 ? ret : len; } diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index b6a4cc178bee..90639140439f 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -27,7 +27,7 @@ * Records the total size (including the header) of compressed data. * * 2. Segment(s) - * Variable size. Each segment includes one segment header, followd by data + * Variable size. Each segment includes one segment header, followed by data * payload. * One regular LZO compressed extent can have one or more segments. * For inlined LZO compressed extent, only one segment is allowed. diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 0c4ef208b8b9..6fde2b2741ef 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -460,7 +460,6 @@ void btrfs_remove_ordered_extent(struct inode *inode, struct btrfs_inode *btrfs_inode = BTRFS_I(inode); struct btrfs_root *root = btrfs_inode->root; struct rb_node *node; - bool dec_pending_ordered = false; /* This is paired with btrfs_add_ordered_extent. */ spin_lock(&btrfs_inode->lock); @@ -477,37 +476,8 @@ void btrfs_remove_ordered_extent(struct inode *inode, if (tree->last == node) tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); - if (test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags)) - dec_pending_ordered = true; spin_unlock_irq(&tree->lock); - /* - * The current running transaction is waiting on us, we need to let it - * know that we're complete and wake it up. - */ - if (dec_pending_ordered) { - struct btrfs_transaction *trans; - - /* - * The checks for trans are just a formality, it should be set, - * but if it isn't we don't want to deref/assert under the spin - * lock, so be nice and check if trans is set, but ASSERT() so - * if it isn't set a developer will notice. - */ - spin_lock(&fs_info->trans_lock); - trans = fs_info->running_transaction; - if (trans) - refcount_inc(&trans->use_count); - spin_unlock(&fs_info->trans_lock); - - ASSERT(trans); - if (trans) { - if (atomic_dec_and_test(&trans->pending_ordered)) - wake_up(&trans->pending_wait); - btrfs_put_transaction(trans); - } - } - spin_lock(&root->ordered_extent_lock); list_del_init(&entry->root_extent_list); root->nr_ordered_extents--; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 02d813aaa261..fb9a161f0215 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -37,28 +37,31 @@ struct btrfs_ordered_sum { * rbtree, just before waking any waiters. It is used to indicate the * IO is done and any metadata is inserted into the tree. 
*/ -#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */ - -#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */ - -#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ - -#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */ - -#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to preallocated extent */ - -#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */ - -#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */ - -#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent - * has done its due diligence in updating - * the isize. */ -#define BTRFS_ORDERED_TRUNCATED 8 /* Set when we have to truncate an extent */ - -#define BTRFS_ORDERED_PENDING 9 /* We are waiting for this ordered extent to - * complete in the current transaction. */ -#define BTRFS_ORDERED_REGULAR 10 /* Regular IO for COW */ +enum { + /* set when all the pages are written */ + BTRFS_ORDERED_IO_DONE, + /* set when removed from the tree */ + BTRFS_ORDERED_COMPLETE, + /* set when we want to write in place */ + BTRFS_ORDERED_NOCOW, + /* writing a zlib compressed extent */ + BTRFS_ORDERED_COMPRESSED, + /* set when writing to preallocated extent */ + BTRFS_ORDERED_PREALLOC, + /* set when we're doing DIO with this extent */ + BTRFS_ORDERED_DIRECT, + /* We had an io error when writing this out */ + BTRFS_ORDERED_IOERR, + /* + * indicates whether this ordered extent has done its due diligence in + * updating the isize + */ + BTRFS_ORDERED_UPDATED_ISIZE, + /* Set when we have to truncate an extent */ + BTRFS_ORDERED_TRUNCATED, + /* Regular IO for COW */ + BTRFS_ORDERED_REGULAR, +}; struct btrfs_ordered_extent { /* logical offset in the file */ diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index f70825af6438..4e473a998219 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -30,7 +30,7 @@ * - sync * - copy also limits on subvol creation * - limit - * - caches fuer ulists + * - caches for ulists * - performance benchmarks * - check all ioctl parameters */ @@ -522,7 +522,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) __del_qgroup_rb(qgroup); } /* - * we call btrfs_free_qgroup_config() when umounting + * We call btrfs_free_qgroup_config() when unmounting * filesystem and disabling quota, so we set qgroup_ulist * to be null here to avoid double free. */ @@ -1013,16 +1013,22 @@ out_add_root: btrfs_abort_transaction(trans, ret); goto out_free_path; } - spin_lock(&fs_info->qgroup_lock); - fs_info->quota_root = quota_root; - set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - spin_unlock(&fs_info->qgroup_lock); ret = btrfs_commit_transaction(trans); trans = NULL; if (ret) goto out_free_path; + /* + * Set quota enabled flag after committing the transaction, to avoid + * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot + * creation. + */ + spin_lock(&fs_info->qgroup_lock); + fs_info->quota_root = quota_root; + set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + spin_unlock(&fs_info->qgroup_lock); + ret = qgroup_rescan_init(fs_info, 0, 1); if (!ret) { qgroup_rescan_zero_tracking(fs_info); @@ -1122,7 +1128,7 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info, * The easy accounting, we're updating qgroup relationship whose child qgroup * only has exclusive extents. 
* - * In this case, all exclsuive extents will also be exlusive for parent, so + * In this case, all exclusive extents will also be exclusive for parent, so * excl/rfer just get added/removed. * * So is qgroup reservation space, which should also be added/removed to @@ -1749,14 +1755,14 @@ static int adjust_slots_upwards(struct btrfs_path *path, int root_level) * * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty * NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty. - * They should be marked during preivous (@dst_level = 1) iteration. + * They should be marked during previous (@dst_level = 1) iteration. * * 3) Mark file extents in leaves dirty * We don't have good way to pick out new file extents only. * So we still follow the old method by scanning all file extents in * the leave. * - * This function can free us from keeping two pathes, thus later we only need + * This function can free us from keeping two paths, thus later we only need * to care about how to iterate all new tree blocks in reloc tree. */ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, @@ -1895,7 +1901,7 @@ out: * * We will iterate through tree blocks NN(b), NN(d) and info qgroup to trace * above tree blocks along with their counter parts in file tree. - * While during search, old tree blocsk OO(c) will be skiped as tree block swap + * While during search, old tree blocks OO(c) will be skipped as tree block swap * won't affect OO(c). */ static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, @@ -2020,7 +2026,7 @@ out: * Will go down the tree block pointed by @dst_eb (pointed by @dst_parent and * @dst_slot), and find any tree blocks whose generation is at @last_snapshot, * and then go down @src_eb (pointed by @src_parent and @src_slot) to find - * the conterpart of the tree block, then mark both tree blocks as qgroup dirty, + * the counterpart of the tree block, then mark both tree blocks as qgroup dirty, * and skip all tree blocks whose generation is smaller than last_snapshot. * * This would skip tons of tree blocks of original btrfs_qgroup_trace_subtree(), @@ -3104,9 +3110,6 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, mutex_unlock(&fs_info->qgroup_rescan_lock); goto out; } - extent_buffer_get(scratch_leaf); - btrfs_tree_read_lock(scratch_leaf); - btrfs_set_lock_blocking_rw(scratch_leaf, BTRFS_READ_LOCK); slot = path->slots[0]; btrfs_release_path(path); mutex_unlock(&fs_info->qgroup_rescan_lock); @@ -3132,10 +3135,8 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, goto out; } out: - if (scratch_leaf) { - btrfs_tree_read_unlock_blocking(scratch_leaf); + if (scratch_leaf) free_extent_buffer(scratch_leaf); - } if (done && !ret) { ret = 1; diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index d8f78f5ab854..20c6bd5fa701 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -70,7 +70,7 @@ struct btrfs_qgroup_extent_record { * be converted into META_PERTRANS. */ enum btrfs_qgroup_rsv_type { - BTRFS_QGROUP_RSV_DATA = 0, + BTRFS_QGROUP_RSV_DATA, BTRFS_QGROUP_RSV_META_PERTRANS, BTRFS_QGROUP_RSV_META_PREALLOC, BTRFS_QGROUP_RSV_LAST, @@ -81,10 +81,10 @@ enum btrfs_qgroup_rsv_type { * * Each type should have different reservation behavior. * E.g, data follows its io_tree flag modification, while - * *currently* meta is just reserve-and-clear during transcation. + * *currently* meta is just reserve-and-clear during transaction. * * TODO: Add new type for reservation which can survive transaction commit. 
- * Currect metadata reservation behavior is not suitable for such case. + * Current metadata reservation behavior is not suitable for such case. */ struct btrfs_qgroup_rsv { u64 values[BTRFS_QGROUP_RSV_LAST]; diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index df41d7049936..e74455eb42f9 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1980,7 +1980,7 @@ cleanup_io: * - In case of single failure, where rbio->failb == -1: * * Cache this rbio iff the above read reconstruction is - * excuted without problems. + * executed without problems. */ if (err == BLK_STS_OK && rbio->failb < 0) cache_rbio_pages(rbio); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index dec14b739b10..10d9589001a9 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -376,26 +376,28 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, goto error; } + /* Insert extent in reada tree + all per-device trees, all or nothing */ + down_read(&fs_info->dev_replace.rwsem); ret = radix_tree_preload(GFP_KERNEL); - if (ret) + if (ret) { + up_read(&fs_info->dev_replace.rwsem); goto error; + } - /* insert extent in reada_tree + all per-device trees, all or nothing */ - btrfs_dev_replace_read_lock(&fs_info->dev_replace); spin_lock(&fs_info->reada_lock); ret = radix_tree_insert(&fs_info->reada_tree, index, re); if (ret == -EEXIST) { re_exist = radix_tree_lookup(&fs_info->reada_tree, index); re_exist->refcnt++; spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); radix_tree_preload_end(); + up_read(&fs_info->dev_replace.rwsem); goto error; } if (ret) { spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); radix_tree_preload_end(); + up_read(&fs_info->dev_replace.rwsem); goto error; } radix_tree_preload_end(); @@ -437,13 +439,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, } radix_tree_delete(&fs_info->reada_tree, index); spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + up_read(&fs_info->dev_replace.rwsem); goto error; } have_zone = 1; } spin_unlock(&fs_info->reada_lock); - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + up_read(&fs_info->dev_replace.rwsem); if (!have_zone) goto error; diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index d69fbfb30aa9..c3557c12656b 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -43,7 +43,7 @@ struct ref_entry { * back to the delayed ref action. We hold the ref we are changing in the * action so we can account for the history properly, and we record the root we * were called with since it could be different from ref_root. We also store - * stack traces because thats how I roll. + * stack traces because that's how I roll. */ struct ref_action { int action; @@ -56,7 +56,7 @@ struct ref_action { /* * One of these for every block we reference, it holds the roots and references - * to it as well as all of the ref actions that have occured to it. We never + * to it as well as all of the ref actions that have occurred to it. We never * free it until we unmount the file system in order to make sure re-allocations * are happening properly. */ @@ -859,7 +859,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, * This shouldn't happen because we will add our re * above when we lookup the be with !parent, but just in * case catch this case so we don't panic because I - * didn't thik of some other corner case. + * didn't think of some other corner case. 
*/ btrfs_err(fs_info, "failed to find root %llu for %llu", root->root_key.objectid, be->bytenr); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index a3f75b8926d4..272b287f8cf0 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2631,7 +2631,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, * only one thread can access block_rsv at this point, * so we don't need hold lock to protect block_rsv. * we expand more reservation size here to allow enough - * space for relocation and we will return eailer in + * space for relocation and we will return earlier in * enospc case. */ rc->block_rsv->size = tmp + fs_info->nodesize * @@ -4185,37 +4185,13 @@ static struct reloc_control *alloc_reloc_control(void) static void describe_relocation(struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group) { - char buf[128]; /* prefixed by a '|' that'll be dropped */ - u64 flags = block_group->flags; + char buf[128] = {'\0'}; - /* Shouldn't happen */ - if (!flags) { - strcpy(buf, "|NONE"); - } else { - char *bp = buf; - -#define DESCRIBE_FLAG(f, d) \ - if (flags & BTRFS_BLOCK_GROUP_##f) { \ - bp += snprintf(bp, buf - bp + sizeof(buf), "|%s", d); \ - flags &= ~BTRFS_BLOCK_GROUP_##f; \ - } - DESCRIBE_FLAG(DATA, "data"); - DESCRIBE_FLAG(SYSTEM, "system"); - DESCRIBE_FLAG(METADATA, "metadata"); - DESCRIBE_FLAG(RAID0, "raid0"); - DESCRIBE_FLAG(RAID1, "raid1"); - DESCRIBE_FLAG(DUP, "dup"); - DESCRIBE_FLAG(RAID10, "raid10"); - DESCRIBE_FLAG(RAID5, "raid5"); - DESCRIBE_FLAG(RAID6, "raid6"); - if (flags) - snprintf(bp, buf - bp + sizeof(buf), "|0x%llx", flags); -#undef DESCRIBE_FLAG - } + btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf)); btrfs_info(fs_info, "relocating block group %llu flags %s", - block_group->key.objectid, buf + 1); + block_group->key.objectid, buf); } /* @@ -4223,6 +4199,7 @@ static void describe_relocation(struct btrfs_fs_info *fs_info, */ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) { + struct btrfs_block_group_cache *bg; struct btrfs_root *extent_root = fs_info->extent_root; struct reloc_control *rc; struct inode *inode; @@ -4231,14 +4208,23 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) int rw = 0; int err = 0; + bg = btrfs_lookup_block_group(fs_info, group_start); + if (!bg) + return -ENOENT; + + if (btrfs_pinned_by_swapfile(fs_info, bg)) { + btrfs_put_block_group(bg); + return -ETXTBSY; + } + rc = alloc_reloc_control(); - if (!rc) + if (!rc) { + btrfs_put_block_group(bg); return -ENOMEM; + } rc->extent_root = extent_root; - - rc->block_group = btrfs_lookup_block_group(fs_info, group_start); - BUG_ON(!rc->block_group); + rc->block_group = bg; ret = btrfs_inc_block_group_ro(rc->block_group); if (ret) { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 902819d3cf41..6dcd36d7b849 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -339,7 +339,9 @@ static struct full_stripe_lock *insert_full_stripe_lock( } } - /* Insert new lock */ + /* + * Insert new lock. 
+ */ ret = kmalloc(sizeof(*ret), GFP_KERNEL); if (!ret) return ERR_PTR(-ENOMEM); @@ -568,12 +570,11 @@ static void scrub_put_ctx(struct scrub_ctx *sctx) scrub_free_ctx(sctx); } -static noinline_for_stack -struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) +static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( + struct btrfs_fs_info *fs_info, int is_dev_replace) { struct scrub_ctx *sctx; int i; - struct btrfs_fs_info *fs_info = dev->fs_info; sctx = kzalloc(sizeof(*sctx), GFP_KERNEL); if (!sctx) @@ -582,7 +583,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) sctx->is_dev_replace = is_dev_replace; sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; sctx->curr = -1; - sctx->fs_info = dev->fs_info; + sctx->fs_info = fs_info; for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { struct scrub_bio *sbio; @@ -832,6 +833,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) int page_num; int success; bool full_stripe_locked; + unsigned int nofs_flag; static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -857,6 +859,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) dev = sblock_to_check->pagev[0]->dev; /* + * We must use GFP_NOFS because the scrub task might be waiting for a + * worker task executing this function and in turn a transaction commit + * might be waiting the scrub task to pause (which needs to wait for all + * the worker tasks to complete before pausing). + * We do allocations in the workers through insert_full_stripe_lock() + * and scrub_add_page_to_wr_bio(), which happens down the call chain of + * this function. + */ + nofs_flag = memalloc_nofs_save(); + /* * For RAID5/6, race can happen for a different device scrub thread. * For data corruption, Parity and Data threads will both try * to recovery the data. @@ -865,6 +877,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) */ ret = lock_full_stripe(fs_info, logical, &full_stripe_locked); if (ret < 0) { + memalloc_nofs_restore(nofs_flag); spin_lock(&sctx->stat_lock); if (ret == -ENOMEM) sctx->stat.malloc_errors++; @@ -904,7 +917,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) */ sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS, - sizeof(*sblocks_for_recheck), GFP_NOFS); + sizeof(*sblocks_for_recheck), GFP_KERNEL); if (!sblocks_for_recheck) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -1202,6 +1215,7 @@ out: } ret = unlock_full_stripe(fs_info, logical, full_stripe_locked); + memalloc_nofs_restore(nofs_flag); if (ret < 0) return ret; return 0; @@ -3540,7 +3554,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (!ret && sctx->is_dev_replace) { /* * If we are doing a device replace wait for any tasks - * that started dellaloc right before we set the block + * that started delalloc right before we set the block * group to RO mode, as they might have just allocated * an extent from it or decided they could do a nocow * write. 
And if any such tasks did that, wait for their @@ -3596,11 +3610,12 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, break; } - btrfs_dev_replace_write_lock(&fs_info->dev_replace); + down_write(&fs_info->dev_replace.rwsem); dev_replace->cursor_right = found_key.offset + length; dev_replace->cursor_left = found_key.offset; dev_replace->item_needs_writeback = 1; - btrfs_dev_replace_write_unlock(&fs_info->dev_replace); + up_write(&dev_replace->rwsem); + ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length, found_key.offset, cache); @@ -3636,10 +3651,10 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, scrub_pause_off(fs_info); - btrfs_dev_replace_write_lock(&fs_info->dev_replace); + down_write(&fs_info->dev_replace.rwsem); dev_replace->cursor_left = dev_replace->cursor_right; dev_replace->item_needs_writeback = 1; - btrfs_dev_replace_write_unlock(&fs_info->dev_replace); + up_write(&fs_info->dev_replace.rwsem); if (ro_set) btrfs_dec_block_group_ro(cache); @@ -3772,6 +3787,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, struct scrub_ctx *sctx; int ret; struct btrfs_device *dev; + unsigned int nofs_flag; if (btrfs_fs_closing(fs_info)) return -EINVAL; @@ -3813,13 +3829,18 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return -EINVAL; } + /* Allocate outside of device_list_mutex */ + sctx = scrub_setup_ctx(fs_info, is_dev_replace); + if (IS_ERR(sctx)) + return PTR_ERR(sctx); mutex_lock(&fs_info->fs_devices->device_list_mutex); dev = btrfs_find_device(fs_info, devid, NULL, NULL); if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) && !is_dev_replace)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); - return -ENODEV; + ret = -ENODEV; + goto out_free_ctx; } if (!is_dev_replace && !readonly && @@ -3827,7 +3848,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_unlock(&fs_info->fs_devices->device_list_mutex); btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable", rcu_str_deref(dev->name)); - return -EROFS; + ret = -EROFS; + goto out_free_ctx; } mutex_lock(&fs_info->scrub_lock); @@ -3835,34 +3857,29 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) { mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - return -EIO; + ret = -EIO; + goto out_free_ctx; } - btrfs_dev_replace_read_lock(&fs_info->dev_replace); + down_read(&fs_info->dev_replace.rwsem); if (dev->scrub_ctx || (!is_dev_replace && btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + up_read(&fs_info->dev_replace.rwsem); mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - return -EINPROGRESS; + ret = -EINPROGRESS; + goto out_free_ctx; } - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + up_read(&fs_info->dev_replace.rwsem); ret = scrub_workers_get(fs_info, is_dev_replace); if (ret) { mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); - return ret; + goto out_free_ctx; } - sctx = scrub_setup_ctx(dev, is_dev_replace); - if (IS_ERR(sctx)) { - mutex_unlock(&fs_info->scrub_lock); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); - scrub_workers_put(fs_info); - return PTR_ERR(sctx); - } sctx->readonly = readonly; dev->scrub_ctx = sctx; mutex_unlock(&fs_info->fs_devices->device_list_mutex); @@ -3875,6 +3892,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info 
*fs_info, u64 devid, u64 start, atomic_inc(&fs_info->scrubs_running); mutex_unlock(&fs_info->scrub_lock); + /* + * In order to avoid deadlock with reclaim when there is a transaction + * trying to pause scrub, make sure we use GFP_NOFS for all the + * allocations done at btrfs_scrub_pages() and scrub_pages_for_parity() + * invoked by our callees. The pausing request is done when the + * transaction commit starts, and it blocks the transaction until scrub + * is paused (done at specific points at scrub_stripe() or right above + * before incrementing fs_info->scrubs_running). + */ + nofs_flag = memalloc_nofs_save(); if (!is_dev_replace) { /* * by holding device list mutex, we can @@ -3887,6 +3914,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (!ret) ret = scrub_enumerate_chunks(sctx, dev, start, end); + memalloc_nofs_restore(nofs_flag); wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); atomic_dec(&fs_info->scrubs_running); @@ -3905,6 +3933,11 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, scrub_put_ctx(sctx); return ret; + +out_free_ctx: + scrub_free_ctx(sctx); + + return ret; } void btrfs_scrub_pause(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 5be83b5a1b43..1b15b43905f8 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2238,7 +2238,7 @@ out: * inodes "orphan" name instead of the real name and stop. Same with new inodes * that were not created yet and overwritten inodes/refs. * - * When do we have have orphan inodes: + * When do we have orphan inodes: * 1. When an inode is freshly created and thus no valid refs are available yet * 2. When a directory lost all it's refs (deleted) but still has dir items * inside which were not processed yet (pending for move/delete). If anyone @@ -3854,7 +3854,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) /* * We may have refs where the parent directory does not exist * yet. This happens if the parent directories inum is higher - * the the current inum. To handle this case, we create the + * than the current inum. To handle this case, we create the * parent directory out of order. But we need to check if this * did already happen before due to other refs in the same dir. */ @@ -4775,7 +4775,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) struct btrfs_key key; pgoff_t index = offset >> PAGE_SHIFT; pgoff_t last_index; - unsigned pg_offset = offset & ~PAGE_MASK; + unsigned pg_offset = offset_in_page(offset); ssize_t ret = 0; key.objectid = sctx->cur_ino; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 645fc81e2a94..368a5b9e6c13 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -93,7 +93,7 @@ const char *btrfs_decode_error(int errno) /* * __btrfs_handle_fs_error decodes expected errors from the caller and - * invokes the approciate error response. + * invokes the appropriate error response. */ __cold void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, @@ -151,7 +151,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function * although there is no way to update the progress. It would add the * risk of a deadlock, therefore the canceling is omitted. The only * penalty is that some I/O remains active until the procedure - * completes. The next time when the filesystem is mounted writeable + * completes. The next time when the filesystem is mounted writable * again, the device replace operation continues. 
*/ } @@ -1848,7 +1848,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (!btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, - "too many missing devices, writeable remount is not allowed"); + "too many missing devices, writable remount is not allowed"); ret = -EACCES; goto restore; } @@ -2090,7 +2090,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 total_free_data = 0; u64 total_free_meta = 0; int bits = dentry->d_sb->s_blocksize_bits; - __be32 *fsid = (__be32 *)fs_info->fsid; + __be32 *fsid = (__be32 *)fs_info->fs_devices->fsid; unsigned factor = 1; struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; int ret; @@ -2312,7 +2312,7 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) * device_list_mutex here as we only read the device data and the list * is protected by RCU. Even if a device is deleted during the list * traversals, we'll get valid data, the freeing callback will wait at - * least until until the rcu_read_unlock. + * least until the rcu_read_unlock. */ rcu_read_lock(); cur_devices = fs_info->fs_devices; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 3717c864ba23..5a5930e3d32b 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -191,6 +191,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF); BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56); BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA); BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); +BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID); BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); static struct attribute *btrfs_supported_feature_attrs[] = { @@ -204,6 +205,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(raid56), BTRFS_FEAT_ATTR_PTR(skinny_metadata), BTRFS_FEAT_ATTR_PTR(no_holes), + BTRFS_FEAT_ATTR_PTR(metadata_uuid), BTRFS_FEAT_ATTR_PTR(free_space_tree), NULL }; @@ -505,12 +507,24 @@ static ssize_t quota_override_store(struct kobject *kobj, BTRFS_ATTR_RW(, quota_override, quota_override_show, quota_override_store); +static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%pU\n", + fs_info->fs_devices->metadata_uuid); +} + +BTRFS_ATTR(, metadata_uuid, btrfs_metadata_uuid_show); + static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, label), BTRFS_ATTR_PTR(, nodesize), BTRFS_ATTR_PTR(, sectorsize), BTRFS_ATTR_PTR(, clone_alignment), BTRFS_ATTR_PTR(, quota_override), + BTRFS_ATTR_PTR(, metadata_uuid), NULL, }; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index c6ee600aff89..40716b357c1d 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -9,7 +9,7 @@ extern u64 btrfs_debugfs_test; enum btrfs_feature_set { - FEAT_COMPAT = 0, + FEAT_COMPAT, FEAT_COMPAT_RO, FEAT_INCOMPAT, FEAT_MAX diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index db72b3b6209e..8a59597f1883 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -174,8 +174,10 @@ void btrfs_free_dummy_root(struct btrfs_root *root) /* Will be freed by btrfs_free_fs_roots */ if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state))) return; - if (root->node) + if (root->node) { + /* One for allocate_extent_buffer */ free_extent_buffer(root->node); + } kfree(root); } diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 9e0f4a01be14..3c46d7f23456 100644 --- 
a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -62,10 +62,11 @@ static int test_find_delalloc(u32 sectorsize) struct page *page; struct page *locked_page = NULL; unsigned long index = 0; - u64 total_dirty = SZ_256M; - u64 max_bytes = SZ_128M; + /* In this test we need at least 2 file extents at its maximum size */ + u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; + u64 total_dirty = 2 * max_bytes; u64 start, end, test_start; - u64 found; + bool found; int ret = -EINVAL; test_msg("running find delalloc tests"); @@ -76,7 +77,7 @@ static int test_find_delalloc(u32 sectorsize) return -ENOMEM; } - extent_io_tree_init(&tmp, inode); + extent_io_tree_init(&tmp, NULL); /* * First go through and create and mark all of our pages dirty, we pin @@ -106,8 +107,8 @@ static int test_find_delalloc(u32 sectorsize) set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL); start = 0; end = 0; - found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, - &end, max_bytes); + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end); if (!found) { test_err("should have found at least one delalloc"); goto out_bits; @@ -137,8 +138,8 @@ static int test_find_delalloc(u32 sectorsize) set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL); start = test_start; end = 0; - found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, - &end, max_bytes); + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end); if (!found) { test_err("couldn't find delalloc in our range"); goto out_bits; @@ -171,8 +172,8 @@ static int test_find_delalloc(u32 sectorsize) } start = test_start; end = 0; - found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, - &end, max_bytes); + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end); if (found) { test_err("found range when we shouldn't have"); goto out_bits; @@ -192,8 +193,8 @@ static int test_find_delalloc(u32 sectorsize) set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL); start = test_start; end = 0; - found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, - &end, max_bytes); + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end); if (!found) { test_err("didn't find our range"); goto out_bits; @@ -233,8 +234,8 @@ static int test_find_delalloc(u32 sectorsize) * this changes at any point in the future we will need to fix this * tests expected behavior. */ - found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start, - &end, max_bytes); + found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + &end); if (!found) { test_err("didn't find our range"); goto out_bits; diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 64043f028820..af0c8e30d9e2 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -254,11 +254,6 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } - /* - * We will just free a dummy node if it's ref count is 2 so we need an - * extra ref so our searches don't accidentally release our page. 
- */ - extent_buffer_get(root->node); btrfs_set_header_nritems(root->node, 0); btrfs_set_header_level(root->node, 0); ret = -EINVAL; @@ -860,7 +855,6 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) goto out; } - extent_buffer_get(root->node); btrfs_set_header_nritems(root->node, 0); btrfs_set_header_level(root->node, 0); BTRFS_I(inode)->root = root; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d1eeef9ec5da..127fa1535f58 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -233,14 +233,12 @@ loop: extwriter_counter_init(cur_trans, type); init_waitqueue_head(&cur_trans->writer_wait); init_waitqueue_head(&cur_trans->commit_wait); - init_waitqueue_head(&cur_trans->pending_wait); cur_trans->state = TRANS_STATE_RUNNING; /* * One for this trans handle, one so it will live on until we * commit the transaction. */ refcount_set(&cur_trans->use_count, 2); - atomic_set(&cur_trans->pending_ordered, 0); cur_trans->flags = 0; cur_trans->start_time = ktime_get_seconds(); @@ -456,7 +454,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, bool enforce_qgroups) { struct btrfs_fs_info *fs_info = root->fs_info; - + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; u64 num_bytes = 0; @@ -485,13 +483,28 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, * the appropriate flushing if need be. */ if (num_items && root != fs_info->chunk_root) { + struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv; + u64 delayed_refs_bytes = 0; + qgroup_reserved = num_items * fs_info->nodesize; ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved, enforce_qgroups); if (ret) return ERR_PTR(ret); + /* + * We want to reserve all the bytes we may need all at once, so + * we only do 1 enospc flushing cycle per transaction start. We + * accomplish this by simply assuming we'll do 2 x num_items + * worth of delayed refs updates in this trans handle, and + * refill that amount for whatever is missing in the reserve. + */ num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items); + if (delayed_refs_rsv->full == 0) { + delayed_refs_bytes = num_bytes; + num_bytes <<= 1; + } + /* * Do the reservation for the relocation root creation */ @@ -500,8 +513,24 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, reloc_reserved = true; } - ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv, - num_bytes, flush); + ret = btrfs_block_rsv_add(root, rsv, num_bytes, flush); + if (ret) + goto reserve_fail; + if (delayed_refs_bytes) { + btrfs_migrate_to_delayed_refs_rsv(fs_info, rsv, + delayed_refs_bytes); + num_bytes -= delayed_refs_bytes; + } + } else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL && + !delayed_refs_rsv->full) { + /* + * Some people call with btrfs_start_transaction(root, 0) + * because they can be throttled, but have some other mechanism + * for reserving space. We still want these guys to refill the + * delayed block_rsv so just add 1 items worth of reservation + * here. 
+ */ + ret = btrfs_delayed_refs_rsv_refill(fs_info, flush); if (ret) goto reserve_fail; } @@ -670,7 +699,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) /* * btrfs_attach_transaction_barrier() - catch the running transaction * - * It is similar to the above function, the differentia is this one + * It is similar to the above function, the difference is this one * will wait for all the inactive transactions until they fully * complete. */ @@ -760,7 +789,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - if (btrfs_check_space_for_delayed_refs(trans)) + if (btrfs_check_space_for_delayed_refs(fs_info)) return 1; return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5); @@ -769,22 +798,12 @@ static int should_end_transaction(struct btrfs_trans_handle *trans) int btrfs_should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_transaction *cur_trans = trans->transaction; - int updates; - int err; smp_mb(); if (cur_trans->state >= TRANS_STATE_BLOCKED || cur_trans->delayed_refs.flushing) return 1; - updates = trans->delayed_ref_updates; - trans->delayed_ref_updates = 0; - if (updates) { - err = btrfs_run_delayed_refs(trans, updates * 2); - if (err) /* Error code will also eval true */ - return err; - } - return should_end_transaction(trans); } @@ -814,11 +833,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; - u64 transid = trans->transid; - unsigned long cur = trans->delayed_ref_updates; int lock = (trans->type != TRANS_JOIN_NOLOCK); int err = 0; - int must_run_delayed_refs = 0; if (refcount_read(&trans->use_count) > 1) { refcount_dec(&trans->use_count); @@ -832,27 +848,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans); - trans->delayed_ref_updates = 0; - if (!trans->sync) { - must_run_delayed_refs = - btrfs_should_throttle_delayed_refs(trans); - cur = max_t(unsigned long, cur, 32); - - /* - * don't make the caller wait if they are from a NOLOCK - * or ATTACH transaction, it will deadlock with commit - */ - if (must_run_delayed_refs == 1 && - (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH))) - must_run_delayed_refs = 2; - } - - btrfs_trans_release_metadata(trans); - trans->block_rsv = NULL; - - if (!list_empty(&trans->new_bgs)) - btrfs_create_pending_block_groups(trans); - btrfs_trans_release_chunk_metadata(trans); if (lock && should_end_transaction(trans) && @@ -894,10 +889,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } kmem_cache_free(btrfs_trans_handle_cachep, trans); - if (must_run_delayed_refs) { - btrfs_async_run_delayed_refs(info, cur, transid, - must_run_delayed_refs == 1); - } return err; } @@ -1338,7 +1329,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, return 0; /* - * Ensure dirty @src will be commited. Or, after comming + * Ensure dirty @src will be committed. Or, after coming * commit_fs_roots() and switch_commit_roots(), any dirty but not * recorded root will never be updated again, causing an outdated root * item. 
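The start_transaction() hunk above reserves space for delayed ref updates together with the per-item reservation: when the delayed refs rsv is not full it assumes 2 x num_items worth of updates, doubles the reservation and later migrates half of it. A standalone sketch of that arithmetic follows; the size formula is a made-up stand-in for btrfs_calc_trans_metadata_size(), not the kernel's.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Illustrative stand-in for the per-item metadata size calculation. */
static uint64_t calc_trans_metadata_size(uint64_t nodesize, unsigned num_items)
{
	return 3 * nodesize * num_items;
}

int main(void)
{
	uint64_t nodesize = 16384;
	unsigned num_items = 5;
	bool delayed_refs_rsv_full = false;
	uint64_t delayed_refs_bytes = 0;
	uint64_t num_bytes = calc_trans_metadata_size(nodesize, num_items);

	if (!delayed_refs_rsv_full) {
		delayed_refs_bytes = num_bytes;  /* assume as many delayed ref updates */
		num_bytes <<= 1;                 /* reserve both halves in one flush cycle */
	}

	printf("reserve %llu bytes, then migrate %llu to the delayed refs rsv\n",
	       (unsigned long long)num_bytes,
	       (unsigned long long)delayed_refs_bytes);
	return 0;
}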
@@ -1842,7 +1833,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; - DEFINE_WAIT(wait); WARN_ON(refcount_read(&trans->use_count) > 1); @@ -1911,13 +1901,6 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } -static inline void -btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans) -{ - wait_event(cur_trans->pending_wait, - atomic_read(&cur_trans->pending_ordered) == 0); -} - int btrfs_commit_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -2052,8 +2035,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_wait_delalloc_flush(fs_info); - btrfs_wait_pending_ordered(cur_trans); - btrfs_scrub_pause(fs_info); /* * Ok now we need to make sure to block out any other joins while we diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 4cbb1b55387d..f1ba78949d1b 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -12,13 +12,13 @@ #include "ctree.h" enum btrfs_trans_state { - TRANS_STATE_RUNNING = 0, - TRANS_STATE_BLOCKED = 1, - TRANS_STATE_COMMIT_START = 2, - TRANS_STATE_COMMIT_DOING = 3, - TRANS_STATE_UNBLOCKED = 4, - TRANS_STATE_COMPLETED = 5, - TRANS_STATE_MAX = 6, + TRANS_STATE_RUNNING, + TRANS_STATE_BLOCKED, + TRANS_STATE_COMMIT_START, + TRANS_STATE_COMMIT_DOING, + TRANS_STATE_UNBLOCKED, + TRANS_STATE_COMPLETED, + TRANS_STATE_MAX, }; #define BTRFS_TRANS_HAVE_FREE_BGS 0 @@ -39,7 +39,6 @@ struct btrfs_transaction { */ atomic_t num_writers; refcount_t use_count; - atomic_t pending_ordered; unsigned long flags; @@ -51,7 +50,6 @@ struct btrfs_transaction { time64_t start_time; wait_queue_head_t writer_wait; wait_queue_head_t commit_wait; - wait_queue_head_t pending_wait; struct list_head pending_snapshots; struct list_head pending_chunks; struct list_head switch_commits; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 1a4e2b101ef2..a62e1e837a89 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -27,10 +27,10 @@ * * @type: leaf or node * @identifier: the necessary info to locate the leaf/node. - * It's recommened to decode key.objecitd/offset if it's + * It's recommended to decode key.objecitd/offset if it's * meaningful. * @reason: describe the error - * @bad_value: optional, it's recommened to output bad value and its + * @bad_value: optional, it's recommended to output bad value and its * expected value (range). * * Since comma is used to separate the components, only space is allowed @@ -130,7 +130,7 @@ static int check_extent_data_item(struct btrfs_fs_info *fs_info, } /* - * Support for new compression/encrption must introduce incompat flag, + * Support for new compression/encryption must introduce incompat flag, * and must be caught in open_ctree(). 
*/ if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) { diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a5ce99a6c936..ac232b3d6d7e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1144,7 +1144,7 @@ next: } btrfs_release_path(path); - /* look for a conflicing name */ + /* look for a conflicting name */ di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, namelen, 0); if (di && !IS_ERR(di)) { @@ -3149,7 +3149,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&log_root_tree->log_mutex); /* - * nobody else is going to jump in and write the the ctree + * Nobody else is going to jump in and write the ctree * super here because the log_commit atomic below is protecting * us. We must be called with a transaction handle pinning * the running transaction open, so a full commit can't hop @@ -3201,8 +3201,6 @@ static void free_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *log) { int ret; - u64 start; - u64 end; struct walk_control wc = { .free = 1, .process_func = process_one_buffer @@ -3216,18 +3214,8 @@ static void free_log_tree(struct btrfs_trans_handle *trans, btrfs_handle_fs_error(log->fs_info, ret, NULL); } - while (1) { - ret = find_first_extent_bit(&log->dirty_log_pages, - 0, &start, &end, - EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT, - NULL); - if (ret) - break; - - clear_extent_bits(&log->dirty_log_pages, start, end, - EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); - } - + clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, + EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); free_extent_buffer(log->node); kfree(log); } @@ -4383,7 +4371,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct extent_map *em, *n; struct list_head extents; struct extent_map_tree *tree = &inode->extent_tree; - u64 logged_start, logged_end; u64 test_gen; int ret = 0; int num = 0; @@ -4392,8 +4379,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, write_lock(&tree->lock); test_gen = root->fs_info->last_trans_committed; - logged_start = start; - logged_end = end; list_for_each_entry_safe(em, n, &tree->modified_extents, list) { /* @@ -4434,11 +4419,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, em->start >= i_size_read(&inode->vfs_inode)) continue; - if (em->start < logged_start) - logged_start = em->start; - if ((em->start + em->len - 1) > logged_end) - logged_end = em->start + em->len - 1; - /* Need a ref to keep it from getting evicted from cache */ refcount_inc(&em->refs); set_bit(EXTENT_FLAG_LOGGING, &em->flags); @@ -5778,6 +5758,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_trans; } + /* + * If a new hard link was added to the inode in the current transaction + * and its link count is now greater than 1, we need to fallback to a + * transaction commit, otherwise we can end up not logging all its new + * parents for all the hard links. Here just from the dentry used to + * fsync, we can not visit the ancestor inodes for all the other hard + * links to figure out if any is new, so we fallback to a transaction + * commit (instead of adding a lot of complexity of scanning a btree, + * since this scenario is not a common use case). 
+ */ + if (inode->vfs_inode.i_nlink > 1 && + inode->last_link_trans > last_committed) { + ret = -EMLINK; + goto end_trans; + } + while (1) { if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) break; diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 767765031e59..0fab84a8f670 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -15,7 +15,6 @@ struct btrfs_log_ctx { int log_ret; int log_transid; - int io_err; bool log_new_dentries; struct inode *inode; struct list_head list; @@ -26,7 +25,6 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, { ctx->log_ret = 0; ctx->log_transid = 0; - ctx->io_err = 0; ctx->log_new_dentries = false; ctx->inode = inode; INIT_LIST_HEAD(&ctx->list); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f435d397019e..2576b1a379c9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -37,6 +37,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 1, .devs_increment = 2, .ncopies = 2, + .nparity = 0, .raid_name = "raid10", .bg_flag = BTRFS_BLOCK_GROUP_RAID10, .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, @@ -49,6 +50,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 1, .devs_increment = 2, .ncopies = 2, + .nparity = 0, .raid_name = "raid1", .bg_flag = BTRFS_BLOCK_GROUP_RAID1, .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, @@ -61,6 +63,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 0, .devs_increment = 1, .ncopies = 2, + .nparity = 0, .raid_name = "dup", .bg_flag = BTRFS_BLOCK_GROUP_DUP, .mindev_error = 0, @@ -73,6 +76,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 0, .devs_increment = 1, .ncopies = 1, + .nparity = 0, .raid_name = "raid0", .bg_flag = BTRFS_BLOCK_GROUP_RAID0, .mindev_error = 0, @@ -85,6 +89,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 0, .devs_increment = 1, .ncopies = 1, + .nparity = 0, .raid_name = "single", .bg_flag = 0, .mindev_error = 0, @@ -96,7 +101,8 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .devs_min = 2, .tolerated_failures = 1, .devs_increment = 1, - .ncopies = 2, + .ncopies = 1, + .nparity = 1, .raid_name = "raid5", .bg_flag = BTRFS_BLOCK_GROUP_RAID5, .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, @@ -108,7 +114,8 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .devs_min = 3, .tolerated_failures = 2, .devs_increment = 1, - .ncopies = 3, + .ncopies = 1, + .nparity = 2, .raid_name = "raid6", .bg_flag = BTRFS_BLOCK_GROUP_RAID6, .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, @@ -123,6 +130,60 @@ const char *get_raid_name(enum btrfs_raid_types type) return btrfs_raid_array[type].raid_name; } +/* + * Fill @buf with textual description of @bg_flags, no more than @size_buf + * bytes including terminating null byte. 
+ */ +void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) +{ + int i; + int ret; + char *bp = buf; + u64 flags = bg_flags; + u32 size_bp = size_buf; + + if (!flags) { + strcpy(bp, "NONE"); + return; + } + +#define DESCRIBE_FLAG(flag, desc) \ + do { \ + if (flags & (flag)) { \ + ret = snprintf(bp, size_bp, "%s|", (desc)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + flags &= ~(flag); \ + } \ + } while (0) + + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data"); + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system"); + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata"); + + DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single"); + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag, + btrfs_raid_array[i].raid_name); +#undef DESCRIBE_FLAG + + if (flags) { + ret = snprintf(bp, size_bp, "0x%llx|", flags); + size_bp -= ret; + } + + if (size_bp < size_buf) + buf[size_buf - size_bp - 1] = '\0'; /* remove last | */ + + /* + * The text is trimmed, it's up to the caller to provide sufficiently + * large buffer + */ +out_overflow:; +} + static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); @@ -151,7 +212,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * the mutex can be very coarse and can cover long-running operations * * protects: updates to fs_devices counters like missing devices, rw devices, - * seeding, structure cloning, openning/closing devices at mount/umount time + * seeding, structure cloning, opening/closing devices at mount/umount time * * global::fs_devs - add, remove, updates to the global list * @@ -238,13 +299,15 @@ struct list_head *btrfs_get_fs_uuids(void) /* * alloc_fs_devices - allocate struct btrfs_fs_devices - * @fsid: if not NULL, copy the uuid to fs_devices::fsid + * @fsid: if not NULL, copy the UUID to fs_devices::fsid + * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid * * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR(). * The returned struct is not linked onto any lists and can be destroyed with * kfree() right away. */ -static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) +static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, + const u8 *metadata_fsid) { struct btrfs_fs_devices *fs_devs; @@ -261,6 +324,11 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) if (fsid) memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); + if (metadata_fsid) + memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE); + else if (fsid) + memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE); + return fs_devs; } @@ -368,13 +436,57 @@ static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices, return NULL; } -static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) +static noinline struct btrfs_fs_devices *find_fsid( + const u8 *fsid, const u8 *metadata_fsid) { struct btrfs_fs_devices *fs_devices; + ASSERT(fsid); + + if (metadata_fsid) { + /* + * Handle scanned device having completed its fsid change but + * belonging to a fs_devices that was created by first scanning + * a device which didn't have its fsid/metadata_uuid changed + * at all and the CHANGING_FSID_V2 flag set. 
+ */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (fs_devices->fsid_change && + memcmp(metadata_fsid, fs_devices->fsid, + BTRFS_FSID_SIZE) == 0 && + memcmp(fs_devices->fsid, fs_devices->metadata_uuid, + BTRFS_FSID_SIZE) == 0) { + return fs_devices; + } + } + /* + * Handle scanned device having completed its fsid change but + * belonging to a fs_devices that was created by a device that + * has an outdated pair of fsid/metadata_uuid and + * CHANGING_FSID_V2 flag set. + */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (fs_devices->fsid_change && + memcmp(fs_devices->metadata_uuid, + fs_devices->fsid, BTRFS_FSID_SIZE) != 0 && + memcmp(metadata_fsid, fs_devices->metadata_uuid, + BTRFS_FSID_SIZE) == 0) { + return fs_devices; + } + } + } + + /* Handle non-split brain cases */ list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) - return fs_devices; + if (metadata_fsid) { + if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0 + && memcmp(metadata_fsid, fs_devices->metadata_uuid, + BTRFS_FSID_SIZE) == 0) + return fs_devices; + } else { + if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) + return fs_devices; + } } return NULL; } @@ -709,6 +821,13 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, device->generation = btrfs_super_generation(disk_super); if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { + if (btrfs_super_incompat_flags(disk_super) & + BTRFS_FEATURE_INCOMPAT_METADATA_UUID) { + pr_err( + "BTRFS: Invalid seeding and uuid-changed device detected\n"); + goto error_brelse; + } + clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); fs_devices->seeding = 1; } else { @@ -744,6 +863,51 @@ error_brelse: } /* + * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices + * being created with a disk that has already completed its fsid change. + */ +static struct btrfs_fs_devices *find_fsid_inprogress( + struct btrfs_super_block *disk_super) +{ + struct btrfs_fs_devices *fs_devices; + + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, + BTRFS_FSID_SIZE) != 0 && + memcmp(fs_devices->metadata_uuid, disk_super->fsid, + BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) { + return fs_devices; + } + } + + return NULL; +} + + +static struct btrfs_fs_devices *find_fsid_changed( + struct btrfs_super_block *disk_super) +{ + struct btrfs_fs_devices *fs_devices; + + /* + * Handles the case where scanned device is part of an fs that had + * multiple successful changes of FSID but curently device didn't + * observe it. Meaning our fsid will be different than theirs. 
+ */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, + BTRFS_FSID_SIZE) != 0 && + memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid, + BTRFS_FSID_SIZE) == 0 && + memcmp(fs_devices->fsid, disk_super->fsid, + BTRFS_FSID_SIZE) != 0) { + return fs_devices; + } + } + + return NULL; +} +/* * Add new device to list of registered devices * * Returns: @@ -755,14 +919,46 @@ static noinline struct btrfs_device *device_list_add(const char *path, bool *new_device_added) { struct btrfs_device *device; - struct btrfs_fs_devices *fs_devices; + struct btrfs_fs_devices *fs_devices = NULL; struct rcu_string *name; u64 found_transid = btrfs_super_generation(disk_super); u64 devid = btrfs_stack_device_id(&disk_super->dev_item); + bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & + BTRFS_FEATURE_INCOMPAT_METADATA_UUID); + bool fsid_change_in_progress = (btrfs_super_flags(disk_super) & + BTRFS_SUPER_FLAG_CHANGING_FSID_V2); + + if (fsid_change_in_progress) { + if (!has_metadata_uuid) { + /* + * When we have an image which has CHANGING_FSID_V2 set + * it might belong to either a filesystem which has + * disks with completed fsid change or it might belong + * to fs with no UUID changes in effect, handle both. + */ + fs_devices = find_fsid_inprogress(disk_super); + if (!fs_devices) + fs_devices = find_fsid(disk_super->fsid, NULL); + } else { + fs_devices = find_fsid_changed(disk_super); + } + } else if (has_metadata_uuid) { + fs_devices = find_fsid(disk_super->fsid, + disk_super->metadata_uuid); + } else { + fs_devices = find_fsid(disk_super->fsid, NULL); + } + - fs_devices = find_fsid(disk_super->fsid); if (!fs_devices) { - fs_devices = alloc_fs_devices(disk_super->fsid); + if (has_metadata_uuid) + fs_devices = alloc_fs_devices(disk_super->fsid, + disk_super->metadata_uuid); + else + fs_devices = alloc_fs_devices(disk_super->fsid, NULL); + + fs_devices->fsid_change = fsid_change_in_progress; + if (IS_ERR(fs_devices)) return ERR_CAST(fs_devices); @@ -774,6 +970,21 @@ static noinline struct btrfs_device *device_list_add(const char *path, mutex_lock(&fs_devices->device_list_mutex); device = find_device(fs_devices, devid, disk_super->dev_item.uuid); + + /* + * If this disk has been pulled into an fs devices created by + * a device which had the CHANGING_FSID_V2 flag then replace the + * metadata_uuid/fsid values of the fs_devices. 
+ */ + if (has_metadata_uuid && fs_devices->fsid_change && + found_transid > fs_devices->latest_generation) { + memcpy(fs_devices->fsid, disk_super->fsid, + BTRFS_FSID_SIZE); + memcpy(fs_devices->metadata_uuid, + disk_super->metadata_uuid, BTRFS_FSID_SIZE); + + fs_devices->fsid_change = false; + } } if (!device) { @@ -850,6 +1061,35 @@ static noinline struct btrfs_device *device_list_add(const char *path, return ERR_PTR(-EEXIST); } + /* + * We are going to replace the device path for a given devid, + * make sure it's the same device if the device is mounted + */ + if (device->bdev) { + struct block_device *path_bdev; + + path_bdev = lookup_bdev(path); + if (IS_ERR(path_bdev)) { + mutex_unlock(&fs_devices->device_list_mutex); + return ERR_CAST(path_bdev); + } + + if (device->bdev != path_bdev) { + bdput(path_bdev); + mutex_unlock(&fs_devices->device_list_mutex); + btrfs_warn_in_rcu(device->fs_info, + "duplicate device fsid:devid for %pU:%llu old:%s new:%s", + disk_super->fsid, devid, + rcu_str_deref(device->name), path); + return ERR_PTR(-EEXIST); + } + bdput(path_bdev); + btrfs_info_in_rcu(device->fs_info, + "device fsid %pU devid %llu moved old:%s new:%s", + disk_super->fsid, devid, + rcu_str_deref(device->name), path); + } + name = rcu_string_strdup(path, GFP_NOFS); if (!name) { mutex_unlock(&fs_devices->device_list_mutex); @@ -869,8 +1109,11 @@ static noinline struct btrfs_device *device_list_add(const char *path, * it back. We need it to pick the disk with largest generation * (as above). */ - if (!fs_devices->opened) + if (!fs_devices->opened) { device->generation = found_transid; + fs_devices->latest_generation = max_t(u64, found_transid, + fs_devices->latest_generation); + } fs_devices->total_devices = btrfs_super_num_devices(disk_super); @@ -884,7 +1127,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) struct btrfs_device *device; struct btrfs_device *orig_dev; - fs_devices = alloc_fs_devices(orig->fsid); + fs_devices = alloc_fs_devices(orig->fsid, NULL); if (IS_ERR(fs_devices)) return fs_devices; @@ -1193,7 +1436,7 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, p = kmap(*page); /* align our pointer to the offset of the super block */ - *disk_super = p + (bytenr & ~PAGE_MASK); + *disk_super = p + offset_in_page(bytenr); if (btrfs_super_bytenr(*disk_super) != bytenr || btrfs_super_magic(*disk_super) != BTRFS_MAGIC) { @@ -1709,7 +1952,8 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, ptr = btrfs_device_uuid(dev_item); write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); ptr = btrfs_device_fsid(dev_item); - write_extent_buffer(leaf, trans->fs_info->fsid, ptr, BTRFS_FSID_SIZE); + write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, + ptr, BTRFS_FSID_SIZE); btrfs_mark_buffer_dirty(leaf); ret = 0; @@ -1862,12 +2106,12 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) { u64 num_devices = fs_info->fs_devices->num_devices; - btrfs_dev_replace_read_lock(&fs_info->dev_replace); + down_read(&fs_info->dev_replace.rwsem); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { ASSERT(num_devices > 1); num_devices--; } - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + up_read(&fs_info->dev_replace.rwsem); return num_devices; } @@ -1900,6 +2144,14 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, goto out; } + if (btrfs_pinned_by_swapfile(fs_info, device)) { + btrfs_warn_in_rcu(fs_info, + "cannot remove device %s (devid %llu) due to active 
swapfile", + rcu_str_deref(device->name), device->devid); + ret = -ETXTBSY; + goto out; + } + if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { ret = BTRFS_ERROR_DEV_TGT_REPLACE; goto out; @@ -2132,7 +2384,13 @@ static struct btrfs_device *btrfs_find_device_by_path( disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); dev_uuid = disk_super->dev_item.uuid; - device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid); + if (btrfs_fs_incompat(fs_info, METADATA_UUID)) + device = btrfs_find_device(fs_info, devid, dev_uuid, + disk_super->metadata_uuid); + else + device = btrfs_find_device(fs_info, devid, + dev_uuid, disk_super->fsid); + brelse(bh); if (!device) device = ERR_PTR(-ENOENT); @@ -2202,7 +2460,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) if (!fs_devices->seeding) return -EINVAL; - seed_devices = alloc_fs_devices(NULL); + seed_devices = alloc_fs_devices(NULL, NULL); if (IS_ERR(seed_devices)) return PTR_ERR(seed_devices); @@ -2238,7 +2496,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) fs_devices->seed = seed_devices; generate_random_uuid(fs_devices->fsid); - memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); + memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); mutex_unlock(&fs_devices->device_list_mutex); @@ -2480,7 +2738,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path * so rename the fsid on the sysfs */ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", - fs_info->fsid); + fs_info->fs_devices->fsid); if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf)) btrfs_warn(fs_info, "sysfs: failed to create fsid for sprout"); @@ -2718,8 +2976,15 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) return ret; } -static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info, - u64 logical, u64 length) +/* + * btrfs_get_chunk_map() - Find the mapping containing the given logical extent. + * @logical: Logical block offset in bytes. + * @length: Length of extent in bytes. + * + * Return: Chunk mapping or ERR_PTR. 
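Since btrfs_get_chunk_map() is now exported via volumes.h, out-of-file callers follow the same pattern as the in-file users below: check for ERR_PTR and drop the reference with free_extent_map() when done. A hedged usage sketch; inspect_chunk() is a hypothetical caller, not part of the patch.

/* Hypothetical caller showing the expected error handling and ref drop. */
static int inspect_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct extent_map *em;
	struct map_lookup *map;

	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
	if (IS_ERR(em))
		return PTR_ERR(em);

	map = em->map_lookup;
	pr_info("chunk at %llu has %d stripes\n", em->start, map->num_stripes);

	free_extent_map(em);	/* drop the reference taken by the lookup */
	return 0;
}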
+ */ +struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) { struct extent_map_tree *em_tree; struct extent_map *em; @@ -2756,7 +3021,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) int i, ret = 0; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - em = get_chunk_map(fs_info, chunk_offset, 1); + em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); if (IS_ERR(em)) { /* * This is a logic error, but we don't want to just rely on the @@ -2797,13 +3062,11 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) mutex_unlock(&fs_info->chunk_mutex); } - if (map->stripes[i].dev) { - ret = btrfs_update_device(trans, map->stripes[i].dev); - if (ret) { - mutex_unlock(&fs_devices->device_list_mutex); - btrfs_abort_transaction(trans, ret); - goto out; - } + ret = btrfs_update_device(trans, device); + if (ret) { + mutex_unlock(&fs_devices->device_list_mutex); + btrfs_abort_transaction(trans, ret); + goto out; } } mutex_unlock(&fs_devices->device_list_mutex); @@ -3437,17 +3700,11 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) { struct btrfs_balance_control *bctl = fs_info->balance_ctl; struct btrfs_root *chunk_root = fs_info->chunk_root; - struct btrfs_root *dev_root = fs_info->dev_root; - struct list_head *devices; - struct btrfs_device *device; - u64 old_size; - u64 size_to_free; u64 chunk_type; struct btrfs_chunk *chunk; struct btrfs_path *path = NULL; struct btrfs_key key; struct btrfs_key found_key; - struct btrfs_trans_handle *trans; struct extent_buffer *leaf; int slot; int ret; @@ -3462,53 +3719,6 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) u32 count_sys = 0; int chunk_reserved = 0; - /* step one make some room on all the devices */ - devices = &fs_info->fs_devices->devices; - list_for_each_entry(device, devices, dev_list) { - old_size = btrfs_device_get_total_bytes(device); - size_to_free = div_factor(old_size, 1); - size_to_free = min_t(u64, size_to_free, SZ_1M); - if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) || - btrfs_device_get_total_bytes(device) - - btrfs_device_get_bytes_used(device) > size_to_free || - test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) - continue; - - ret = btrfs_shrink_device(device, old_size - size_to_free); - if (ret == -ENOSPC) - break; - if (ret) { - /* btrfs_shrink_device never returns ret > 0 */ - WARN_ON(ret > 0); - goto error; - } - - trans = btrfs_start_transaction(dev_root, 0); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - btrfs_info_in_rcu(fs_info, - "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu", - rcu_str_deref(device->name), ret, - old_size, old_size - size_to_free); - goto error; - } - - ret = btrfs_grow_device(trans, device, old_size); - if (ret) { - btrfs_end_transaction(trans); - /* btrfs_grow_device never returns ret > 0 */ - WARN_ON(ret > 0); - btrfs_info_in_rcu(fs_info, - "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu", - rcu_str_deref(device->name), ret, - old_size, old_size - size_to_free); - goto error; - } - - btrfs_end_transaction(trans); - } - - /* step two, relocate all the chunks */ path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; @@ -3638,10 +3848,15 @@ again: ret = btrfs_relocate_chunk(fs_info, found_key.offset); mutex_unlock(&fs_info->delete_unused_bgs_mutex); - if (ret && ret != -ENOSPC) - goto error; if (ret == -ENOSPC) { enospc_errors++; + } else if (ret == 
-ETXTBSY) { + btrfs_info(fs_info, + "skipping relocation of block group %llu due to active swapfile", + found_key.offset); + ret = 0; + } else if (ret) { + goto error; } else { spin_lock(&fs_info->balance_lock); bctl->stat.completed++; @@ -3712,6 +3927,162 @@ static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, } /* + * Fill @buf with textual description of balance filter flags @bargs, up to + * @size_buf including the terminating null. The output may be trimmed if it + * does not fit into the provided buffer. + */ +static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, + u32 size_buf) +{ + int ret; + u32 size_bp = size_buf; + char *bp = buf; + u64 flags = bargs->flags; + char tmp_buf[128] = {'\0'}; + + if (!flags) + return; + +#define CHECK_APPEND_NOARG(a) \ + do { \ + ret = snprintf(bp, size_bp, (a)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + +#define CHECK_APPEND_1ARG(a, v1) \ + do { \ + ret = snprintf(bp, size_bp, (a), (v1)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + +#define CHECK_APPEND_2ARG(a, v1, v2) \ + do { \ + ret = snprintf(bp, size_bp, (a), (v1), (v2)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + + if (flags & BTRFS_BALANCE_ARGS_CONVERT) { + int index = btrfs_bg_flags_to_raid_index(bargs->target); + + CHECK_APPEND_1ARG("convert=%s,", get_raid_name(index)); + } + + if (flags & BTRFS_BALANCE_ARGS_SOFT) + CHECK_APPEND_NOARG("soft,"); + + if (flags & BTRFS_BALANCE_ARGS_PROFILES) { + btrfs_describe_block_groups(bargs->profiles, tmp_buf, + sizeof(tmp_buf)); + CHECK_APPEND_1ARG("profiles=%s,", tmp_buf); + } + + if (flags & BTRFS_BALANCE_ARGS_USAGE) + CHECK_APPEND_1ARG("usage=%llu,", bargs->usage); + + if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) + CHECK_APPEND_2ARG("usage=%u..%u,", + bargs->usage_min, bargs->usage_max); + + if (flags & BTRFS_BALANCE_ARGS_DEVID) + CHECK_APPEND_1ARG("devid=%llu,", bargs->devid); + + if (flags & BTRFS_BALANCE_ARGS_DRANGE) + CHECK_APPEND_2ARG("drange=%llu..%llu,", + bargs->pstart, bargs->pend); + + if (flags & BTRFS_BALANCE_ARGS_VRANGE) + CHECK_APPEND_2ARG("vrange=%llu..%llu,", + bargs->vstart, bargs->vend); + + if (flags & BTRFS_BALANCE_ARGS_LIMIT) + CHECK_APPEND_1ARG("limit=%llu,", bargs->limit); + + if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE) + CHECK_APPEND_2ARG("limit=%u..%u,", + bargs->limit_min, bargs->limit_max); + + if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) + CHECK_APPEND_2ARG("stripes=%u..%u,", + bargs->stripes_min, bargs->stripes_max); + +#undef CHECK_APPEND_2ARG +#undef CHECK_APPEND_1ARG +#undef CHECK_APPEND_NOARG + +out_overflow: + + if (size_bp < size_buf) + buf[size_buf - size_bp - 1] = '\0'; /* remove last , */ + else + buf[0] = '\0'; +} + +static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info) +{ + u32 size_buf = 1024; + char tmp_buf[192] = {'\0'}; + char *buf; + char *bp; + u32 size_bp = size_buf; + int ret; + struct btrfs_balance_control *bctl = fs_info->balance_ctl; + + buf = kzalloc(size_buf, GFP_KERNEL); + if (!buf) + return; + + bp = buf; + +#define CHECK_APPEND_1ARG(a, v1) \ + do { \ + ret = snprintf(bp, size_bp, (a), (v1)); \ + if (ret < 0 || ret >= size_bp) \ + goto out_overflow; \ + size_bp -= ret; \ + bp += ret; \ + } while (0) + + if (bctl->flags & BTRFS_BALANCE_FORCE) + CHECK_APPEND_1ARG("%s", "-f "); + + if (bctl->flags & BTRFS_BALANCE_DATA) { + 
describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf)); + CHECK_APPEND_1ARG("-d%s ", tmp_buf); + } + + if (bctl->flags & BTRFS_BALANCE_METADATA) { + describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf)); + CHECK_APPEND_1ARG("-m%s ", tmp_buf); + } + + if (bctl->flags & BTRFS_BALANCE_SYSTEM) { + describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf)); + CHECK_APPEND_1ARG("-s%s ", tmp_buf); + } + +#undef CHECK_APPEND_1ARG + +out_overflow: + + if (size_bp < size_buf) + buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */ + btrfs_info(fs_info, "balance: %s %s", + (bctl->flags & BTRFS_BALANCE_RESUME) ? + "resume" : "start", buf); + + kfree(buf); +} + +/* * Should be called with balance mutexe held */ int btrfs_balance(struct btrfs_fs_info *fs_info, @@ -3724,6 +4095,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, int ret; u64 num_devices; unsigned seq; + bool reducing_integrity; if (btrfs_fs_closing(fs_info) || atomic_read(&fs_info->balance_pause_req) || @@ -3803,24 +4175,30 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, !(bctl->sys.target & allowed)) || ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && (fs_info->avail_metadata_alloc_bits & allowed) && - !(bctl->meta.target & allowed))) { - if (bctl->flags & BTRFS_BALANCE_FORCE) { - btrfs_info(fs_info, - "balance: force reducing metadata integrity"); - } else { - btrfs_err(fs_info, - "balance: reduces metadata integrity, use --force if you want this"); - ret = -EINVAL; - goto out; - } - } + !(bctl->meta.target & allowed))) + reducing_integrity = true; + else + reducing_integrity = false; + + /* if we're not converting, the target field is uninitialized */ + meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? + bctl->meta.target : fs_info->avail_metadata_alloc_bits; + data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? + bctl->data.target : fs_info->avail_data_alloc_bits; } while (read_seqretry(&fs_info->profiles_lock, seq)); - /* if we're not converting, the target field is uninitialized */ - meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? - bctl->meta.target : fs_info->avail_metadata_alloc_bits; - data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 
- bctl->data.target : fs_info->avail_data_alloc_bits; + if (reducing_integrity) { + if (bctl->flags & BTRFS_BALANCE_FORCE) { + btrfs_info(fs_info, + "balance: force reducing metadata integrity"); + } else { + btrfs_err(fs_info, + "balance: reduces metadata integrity, use --force if you want this"); + ret = -EINVAL; + goto out; + } + } + if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { int meta_index = btrfs_bg_flags_to_raid_index(meta_target); @@ -3850,11 +4228,19 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); + describe_balance_start_or_resume(fs_info); mutex_unlock(&fs_info->balance_mutex); ret = __btrfs_balance(fs_info); mutex_lock(&fs_info->balance_mutex); + if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) + btrfs_info(fs_info, "balance: paused"); + else if (ret == -ECANCELED && atomic_read(&fs_info->balance_cancel_req)) + btrfs_info(fs_info, "balance: canceled"); + else + btrfs_info(fs_info, "balance: ended with status: %d", ret); + clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); if (bargs) { @@ -3887,10 +4273,8 @@ static int balance_kthread(void *data) int ret = 0; mutex_lock(&fs_info->balance_mutex); - if (fs_info->balance_ctl) { - btrfs_info(fs_info, "balance: resuming"); + if (fs_info->balance_ctl) ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); - } mutex_unlock(&fs_info->balance_mutex); return ret; @@ -4433,10 +4817,16 @@ again: ret = btrfs_relocate_chunk(fs_info, chunk_offset); mutex_unlock(&fs_info->delete_unused_bgs_mutex); - if (ret && ret != -ENOSPC) - goto done; - if (ret == -ENOSPC) + if (ret == -ENOSPC) { failed++; + } else if (ret) { + if (ret == -ETXTBSY) { + btrfs_warn(fs_info, + "could not shrink block group %llu due to active swapfile", + chunk_offset); + } + goto done; + } } while (key.offset-- > 0); if (failed && !retried) { @@ -4602,11 +4992,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, int devs_min; /* min devs needed */ int devs_increment; /* ndevs has to be a multiple of this */ int ncopies; /* how many copies to data has */ + int nparity; /* number of stripes worth of bytes to + store parity information */ int ret; u64 max_stripe_size; u64 max_chunk_size; u64 stripe_size; - u64 num_bytes; + u64 chunk_size; int ndevs; int i; int j; @@ -4628,6 +5020,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, devs_min = btrfs_raid_array[index].devs_min; devs_increment = btrfs_raid_array[index].devs_increment; ncopies = btrfs_raid_array[index].ncopies; + nparity = btrfs_raid_array[index].nparity; if (type & BTRFS_BLOCK_GROUP_DATA) { max_stripe_size = SZ_1G; @@ -4654,7 +5047,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, BUG_ON(1); } - /* we don't want a chunk larger than 10% of writeable space */ + /* We don't want a chunk larger than 10% of writable space */ max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), max_chunk_size); @@ -4757,30 +5150,22 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, * this will have to be fixed for RAID1 and RAID10 over * more drives */ - data_stripes = num_stripes / ncopies; - - if (type & BTRFS_BLOCK_GROUP_RAID5) - data_stripes = num_stripes - 1; - - if (type & BTRFS_BLOCK_GROUP_RAID6) - data_stripes = num_stripes - 2; + data_stripes = (num_stripes - nparity) / ncopies; /* * Use the number of data stripes to figure out 
how big this chunk * is really going to be in terms of logical address space, - * and compare that answer with the max chunk size + * and compare that answer with the max chunk size. If it's higher, + * we try to reduce stripe_size. */ if (stripe_size * data_stripes > max_chunk_size) { - stripe_size = div_u64(max_chunk_size, data_stripes); - - /* bump the answer up to a 16MB boundary */ - stripe_size = round_up(stripe_size, SZ_16M); - /* - * But don't go higher than the limits we found while searching - * for free extents + * Reduce stripe_size, round it up to a 16MB boundary again and + * then use it, unless it ends up being even bigger than the + * previous value we had already. */ - stripe_size = min(devices_info[ndevs - 1].max_avail, + stripe_size = min(round_up(div_u64(max_chunk_size, + data_stripes), SZ_16M), stripe_size); } @@ -4808,9 +5193,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, map->type = type; map->sub_stripes = sub_stripes; - num_bytes = stripe_size * data_stripes; + chunk_size = stripe_size * data_stripes; - trace_btrfs_chunk_alloc(info, map, start, num_bytes); + trace_btrfs_chunk_alloc(info, map, start, chunk_size); em = alloc_extent_map(); if (!em) { @@ -4821,7 +5206,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); em->map_lookup = map; em->start = start; - em->len = num_bytes; + em->len = chunk_size; em->block_start = 0; em->block_len = em->len; em->orig_block_len = stripe_size; @@ -4839,14 +5224,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, refcount_inc(&em->refs); write_unlock(&em_tree->lock); - ret = btrfs_make_block_group(trans, 0, type, start, num_bytes); + ret = btrfs_make_block_group(trans, 0, type, start, chunk_size); if (ret) goto error_del_extent; - for (i = 0; i < map->num_stripes; i++) { - num_bytes = map->stripes[i].dev->bytes_used + stripe_size; - btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes); - } + for (i = 0; i < map->num_stripes; i++) + btrfs_device_set_bytes_used(map->stripes[i].dev, + map->stripes[i].dev->bytes_used + stripe_size); atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space); @@ -4890,7 +5274,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, int i = 0; int ret = 0; - em = get_chunk_map(fs_info, chunk_offset, chunk_size); + em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); if (IS_ERR(em)) return PTR_ERR(em); @@ -4971,10 +5355,10 @@ out: } /* - * Chunk allocation falls into two parts. The first part does works - * that make the new allocated chunk useable, but not do any operation - * that modifies the chunk tree. The second part does the works that - * require modifying the chunk tree. This division is important for the + * Chunk allocation falls into two parts. The first part does work + * that makes the new allocated chunk usable, but does not do any operation + * that modifies the chunk tree. The second part does the work that + * requires modifying the chunk tree. This division is important for the * bootstrap process of adding storage to a seed btrfs. 
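With the nparity field, the chunk sizing above no longer special-cases RAID5/6: the usable size is stripe_size * data_stripes, where data_stripes = (num_stripes - nparity) / ncopies, clamped so the chunk stays under max_chunk_size. The sketch below restates that arithmetic as a stand-alone helper with a hypothetical name and simplified inputs; for example, six stripes with two parity stripes and a single copy leave four data stripes.

/* Hypothetical restatement of the sizing rule from __btrfs_alloc_chunk(). */
static u64 example_chunk_size(u64 stripe_size, int num_stripes,
			      int nparity, int ncopies, u64 max_chunk_size)
{
	int data_stripes = (num_stripes - nparity) / ncopies;

	if (stripe_size * data_stripes > max_chunk_size)
		stripe_size = min(round_up(div_u64(max_chunk_size, data_stripes),
					   SZ_16M),
				  stripe_size);

	return stripe_size * data_stripes;	/* logical bytes covered by the chunk */
}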
*/ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type) @@ -5032,7 +5416,7 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) int miss_ndevs = 0; int i; - em = get_chunk_map(fs_info, chunk_offset, 1); + em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); if (IS_ERR(em)) return 1; @@ -5092,7 +5476,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) struct map_lookup *map; int ret; - em = get_chunk_map(fs_info, logical, len); + em = btrfs_get_chunk_map(fs_info, logical, len); if (IS_ERR(em)) /* * We could return errors for these cases, but that could get @@ -5122,11 +5506,11 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) ret = 1; free_extent_map(em); - btrfs_dev_replace_read_lock(&fs_info->dev_replace); + down_read(&fs_info->dev_replace.rwsem); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && fs_info->dev_replace.tgtdev) ret++; - btrfs_dev_replace_read_unlock(&fs_info->dev_replace); + up_read(&fs_info->dev_replace.rwsem); return ret; } @@ -5138,7 +5522,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, struct map_lookup *map; unsigned long len = fs_info->sectorsize; - em = get_chunk_map(fs_info, logical, len); + em = btrfs_get_chunk_map(fs_info, logical, len); if (!WARN_ON(IS_ERR(em))) { map = em->map_lookup; @@ -5155,7 +5539,7 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) struct map_lookup *map; int ret = 0; - em = get_chunk_map(fs_info, logical, len); + em = btrfs_get_chunk_map(fs_info, logical, len); if(!WARN_ON(IS_ERR(em))) { map = em->map_lookup; @@ -5314,7 +5698,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, /* discard always return a bbio */ ASSERT(bbio_ret); - em = get_chunk_map(fs_info, logical, length); + em = btrfs_get_chunk_map(fs_info, logical, length); if (IS_ERR(em)) return PTR_ERR(em); @@ -5640,7 +6024,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, return __btrfs_map_block_for_discard(fs_info, logical, *length, bbio_ret); - em = get_chunk_map(fs_info, logical, *length); + em = btrfs_get_chunk_map(fs_info, logical, *length); if (IS_ERR(em)) return PTR_ERR(em); @@ -5699,17 +6083,21 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, *length = em->len - offset; } - /* This is for when we're called from btrfs_merge_bio_hook() and all - it cares about is the length */ + /* + * This is for when we're called from btrfs_bio_fits_in_stripe and all + * it cares about is the length + */ if (!bbio_ret) goto out; - btrfs_dev_replace_read_lock(dev_replace); + down_read(&dev_replace->rwsem); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); + /* + * Hold the semaphore for read during the whole operation, write is + * requested at commit time but must wait. 
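Switching dev_replace to a plain rwsem leaves two reader patterns in this file: a short hold where only the is-ongoing answer is needed (btrfs_num_devices() and btrfs_num_copies() above), and the long hold across the whole mapping described in this comment. A minimal sketch of the short-hold form; replace_running() is a hypothetical wrapper.

/* Hypothetical short-hold reader, mirroring btrfs_num_devices() above. */
static bool replace_running(struct btrfs_fs_info *fs_info)
{
	bool ongoing;

	down_read(&fs_info->dev_replace.rwsem);
	ongoing = btrfs_dev_replace_is_ongoing(&fs_info->dev_replace);
	up_read(&fs_info->dev_replace.rwsem);

	return ongoing;
}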
+ */ if (!dev_replace_is_ongoing) - btrfs_dev_replace_read_unlock(dev_replace); - else - btrfs_dev_replace_set_lock_blocking(dev_replace); + up_read(&dev_replace->rwsem); if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && !need_full_stripe(op) && dev_replace->tgtdev != NULL) { @@ -5904,12 +6292,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } out: if (dev_replace_is_ongoing) { - ASSERT(atomic_read(&dev_replace->blocking_readers) > 0); - btrfs_dev_replace_read_lock(dev_replace); - /* Barrier implied by atomic_dec_and_test */ - if (atomic_dec_and_test(&dev_replace->blocking_readers)) - cond_wake_up_nomb(&dev_replace->read_lock_wq); - btrfs_dev_replace_read_unlock(dev_replace); + lockdep_assert_held(&dev_replace->rwsem); + /* Unlock and let waiting writers proceed */ + up_read(&dev_replace->rwsem); } free_extent_map(em); return ret; @@ -5943,7 +6328,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, u64 rmap_len; int i, j, nr = 0; - em = get_chunk_map(fs_info, chunk_start, 1); + em = btrfs_get_chunk_map(fs_info, chunk_start, 1); if (IS_ERR(em)) return -EIO; @@ -6083,12 +6468,6 @@ static noinline void btrfs_schedule_bio(struct btrfs_device *device, int should_queue = 1; struct btrfs_pending_bios *pending_bios; - if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) || - !device->bdev) { - bio_io_error(bio); - return; - } - /* don't bother with additional async steps for reads, right now */ if (bio_op(bio) == REQ_OP_READ) { btrfsic_submit_bio(bio); @@ -6217,7 +6596,8 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { dev = bbio->stripes[dev_nr].dev; - if (!dev || !dev->bdev || + if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, + &dev->dev_state) || (bio_op(first_bio) == REQ_OP_WRITE && !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { bbio_error(bbio, first_bio, logical); @@ -6245,7 +6625,7 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, cur_devices = fs_info->fs_devices; while (cur_devices) { if (!fsid || - !memcmp(cur_devices->fsid, fsid, BTRFS_FSID_SIZE)) { + !memcmp(cur_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { device = find_device(cur_devices, devid, uuid); if (device) return device; @@ -6574,12 +6954,12 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, fs_devices = fs_devices->seed; } - fs_devices = find_fsid(fsid); + fs_devices = find_fsid(fsid, NULL); if (!fs_devices) { if (!btrfs_test_opt(fs_info, DEGRADED)) return ERR_PTR(-ENOENT); - fs_devices = alloc_fs_devices(fsid); + fs_devices = alloc_fs_devices(fsid, NULL); if (IS_ERR(fs_devices)) return fs_devices; @@ -6629,7 +7009,7 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_FSID_SIZE); - if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) { + if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) { fs_devices = open_seed_devices(fs_info, fs_uuid); if (IS_ERR(fs_devices)) return PTR_ERR(fs_devices); @@ -6876,7 +7256,7 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, if (missing > max_tolerated) { if (!failing_dev) btrfs_warn(fs_info, - "chunk %llu missing %d devices, max tolerance is %d for writeable mount", + "chunk %llu missing %d devices, max tolerance is %d for writable mount", em->start, missing, max_tolerated); free_extent_map(em); ret = false; @@ -7387,6 +7767,7 @@ static int 
verify_one_dev_extent(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; struct extent_map *em; struct map_lookup *map; + struct btrfs_device *dev; u64 stripe_len; bool found = false; int ret = 0; @@ -7436,6 +7817,22 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, physical_offset, devid); ret = -EUCLEAN; } + + /* Make sure no dev extent is beyond device bondary */ + dev = btrfs_find_device(fs_info, devid, NULL, NULL); + if (!dev) { + btrfs_err(fs_info, "failed to find devid %llu", devid); + ret = -EUCLEAN; + goto out; + } + if (physical_offset + physical_len > dev->disk_total_bytes) { + btrfs_err(fs_info, +"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu", + devid, physical_offset, physical_len, + dev->disk_total_bytes); + ret = -EUCLEAN; + goto out; + } out: free_extent_map(em); return ret; @@ -7478,6 +7875,8 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) struct btrfs_path *path; struct btrfs_root *root = fs_info->dev_root; struct btrfs_key key; + u64 prev_devid = 0; + u64 prev_dev_ext_end = 0; int ret = 0; key.objectid = 1; @@ -7522,10 +7921,22 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext); physical_len = btrfs_dev_extent_length(leaf, dext); + /* Check if this dev extent overlaps with the previous one */ + if (devid == prev_devid && physical_offset < prev_dev_ext_end) { + btrfs_err(fs_info, +"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu", + devid, physical_offset, prev_dev_ext_end); + ret = -EUCLEAN; + goto out; + } + ret = verify_one_dev_extent(fs_info, chunk_offset, devid, physical_offset, physical_len); if (ret < 0) goto out; + prev_devid = devid; + prev_dev_ext_end = physical_offset + physical_len; + ret = btrfs_next_item(root, path); if (ret < 0) goto out; @@ -7541,3 +7952,27 @@ out: btrfs_free_path(path); return ret; } + +/* + * Check whether the given block group or device is pinned by any inode being + * used as a swapfile. + */ +bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr) +{ + struct btrfs_swapfile_pin *sp; + struct rb_node *node; + + spin_lock(&fs_info->swapfile_pins_lock); + node = fs_info->swapfile_pins.rb_node; + while (node) { + sp = rb_entry(node, struct btrfs_swapfile_pin, node); + if (ptr < sp->ptr) + node = node->rb_left; + else if (ptr > sp->ptr) + node = node->rb_right; + else + break; + } + spin_unlock(&fs_info->swapfile_pins_lock); + return node != NULL; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index aefce895e994..ed806649a473 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -210,6 +210,8 @@ BTRFS_DEVICE_GETSET_FUNCS(bytes_used); struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + u8 metadata_uuid[BTRFS_FSID_SIZE]; + bool fsid_change; struct list_head fs_list; u64 num_devices; @@ -218,6 +220,10 @@ struct btrfs_fs_devices { u64 missing_devices; u64 total_rw_bytes; u64 total_devices; + + /* Highest generation number of seen devices */ + u64 latest_generation; + struct block_device *latest_bdev; /* all of the devices in the FS, protected by a mutex @@ -261,15 +267,12 @@ struct btrfs_fs_devices { * we allocate are actually btrfs_io_bios. We'll cram as much of * struct btrfs_bio as we can into this over time. 
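With csum_allocated and the end_io hook gone, whether the checksum buffer was heap-allocated is inferred by comparing csum against the inline array, which is what btrfs_io_bio_free_csum() below relies on. A hedged sketch of the matching allocation side; io_bio_setup_csum() is a hypothetical helper, not code from this patch.

/*
 * Hypothetical allocation counterpart to btrfs_io_bio_free_csum():
 * use the inline buffer when it fits, otherwise kmalloc() one.
 */
static int io_bio_setup_csum(struct btrfs_io_bio *io_bio, size_t csum_size)
{
	if (csum_size <= BTRFS_BIO_INLINE_CSUM_SIZE) {
		io_bio->csum = io_bio->csum_inline;
		return 0;
	}

	io_bio->csum = kmalloc(csum_size, GFP_NOFS);
	return io_bio->csum ? 0 : -ENOMEM;
}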
*/ -typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err); struct btrfs_io_bio { unsigned int mirror_num; unsigned int stripe_index; u64 logical; u8 *csum; u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; - u8 *csum_allocated; - btrfs_io_bio_end_io_t *end_io; struct bvec_iter iter; /* * This member must come last, bio_alloc_bioset will allocate enough @@ -283,15 +286,20 @@ static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio) return container_of(bio, struct btrfs_io_bio, bio); } +static inline void btrfs_io_bio_free_csum(struct btrfs_io_bio *io_bio) +{ + if (io_bio->csum != io_bio->csum_inline) { + kfree(io_bio->csum); + io_bio->csum = NULL; + } +} + struct btrfs_bio_stripe { struct btrfs_device *dev; u64 physical; u64 length; /* only used for discard mappings */ }; -struct btrfs_bio; -typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); - struct btrfs_bio { refcount_t refs; atomic_t stripes_pending; @@ -331,6 +339,8 @@ struct btrfs_raid_attr { int tolerated_failures; /* max tolerated fail devs */ int devs_increment; /* ndevs has to be a multiple of this */ int ncopies; /* how many copies to data has */ + int nparity; /* number of stripes worth of bytes to store + * parity information */ int mindev_error; /* error code if min devs requisite is unmet */ const char raid_name[8]; /* name of the raid */ u64 bg_flag; /* block group flag of the raid */ @@ -430,6 +440,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path); int btrfs_balance(struct btrfs_fs_info *fs_info, struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs); +void btrfs_describe_block_groups(u64 flags, char *buf, u32 size_buf); int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); int btrfs_recover_balance(struct btrfs_fs_info *fs_info); int btrfs_pause_balance(struct btrfs_fs_info *fs_info); @@ -462,6 +473,8 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, u64 chunk_offset, u64 chunk_size); int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset); +struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, int index) diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index ea78c3d6dcfc..f141b45ce349 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -11,6 +11,7 @@ #include <linux/security.h> #include <linux/posix_acl_xattr.h> #include <linux/iversion.h> +#include <linux/sched/mm.h> #include "ctree.h" #include "btrfs_inode.h" #include "transaction.h" @@ -422,9 +423,15 @@ static int btrfs_initxattrs(struct inode *inode, { const struct xattr *xattr; struct btrfs_trans_handle *trans = fs_info; + unsigned int nofs_flag; char *name; int err = 0; + /* + * We're holding a transaction handle, so use a NOFS memory allocation + * context to avoid deadlock if reclaim happens. 
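The same scoped-NOFS pattern applies anywhere GFP_KERNEL allocations are made while a transaction handle (or another reclaim-sensitive lock) is held: save the scope, allocate, restore. Minimal generic sketch; the helper name and the allocation inside it are placeholders.

/* Generic scoped-NOFS pattern, as used by the xattr hunk above. */
static void *alloc_under_transaction(size_t len)
{
	unsigned int nofs_flag;
	void *p;

	nofs_flag = memalloc_nofs_save();	/* GFP_KERNEL now acts like GFP_NOFS */
	p = kmalloc(len, GFP_KERNEL);
	memalloc_nofs_restore(nofs_flag);

	return p;
}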
+ */ + nofs_flag = memalloc_nofs_save(); for (xattr = xattr_array; xattr->name != NULL; xattr++) { name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(xattr->name) + 1, GFP_KERNEL); @@ -440,6 +447,7 @@ static int btrfs_initxattrs(struct inode *inode, if (err < 0) break; } + memalloc_nofs_restore(nofs_flag); return err; } diff --git a/fs/buffer.c b/fs/buffer.c index 1286c2b95498..d60d61e8ed7d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3060,11 +3060,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, */ bio = bio_alloc(GFP_NOIO, 1); - if (wbc) { - wbc_init_bio(wbc, bio); - wbc_account_io(wbc, bh->b_page, bh->b_size); - } - bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio_set_dev(bio, bh->b_bdev); bio->bi_write_hint = write_hint; @@ -3084,6 +3079,11 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, op_flags |= REQ_PRIO; bio_set_op_attrs(bio, op, op_flags); + if (wbc) { + wbc_init_bio(wbc, bio); + wbc_account_io(wbc, bh->b_page, bh->b_size); + } + submit_bio(bio); return 0; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 79a265ba9200..dfb64a5211b6 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -810,7 +810,7 @@ static inline int default_congestion_kb(void) * This allows larger machines to have larger/more transfers. * Limit the default to 256M */ - congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); + congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10); if (congestion_kb > 256*1024) congestion_kb = 256*1024; diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 85dadb93c992..f1ddc9d03c10 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -190,8 +190,9 @@ config CIFS_DFS_UPCALL moves to a different server. This feature also enables an upcall mechanism for CIFS which contacts userspace helper utilities to provide server name resolution (host names to - IP addresses) which is needed for implicit mounts of DFS junction - points. If unsure, say Y. + IP addresses) which is needed in order to reconnect to + servers if their addresses change or for implicit mounts of + DFS junction points. If unsure, say Y. 
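In code, the reconnect-time name resolution mentioned in the help text amounts to rebuilding a "\\host" UNC and handing it to the DNS upcall, as the reconnect path does later in this series. A hedged sketch; resolve_server_ip() is a hypothetical wrapper around the existing dns_resolve_server_name_to_ip().

/* Hypothetical wrapper mirroring the resolve step used on reconnect. */
static int resolve_server_ip(const char *hostname, char **ipaddr)
{
	int rc, len = strlen(hostname) + 3;	/* "\\" prefix plus NUL */
	char *unc = kmalloc(len, GFP_KERNEL);

	if (!unc)
		return -ENOMEM;

	snprintf(unc, len, "\\\\%s", hostname);
	rc = dns_resolve_server_name_to_ip(unc, ipaddr);
	kfree(unc);
	return rc;
}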
config CIFS_NFSD_EXPORT bool "Allow nfsd to export CIFS file system" diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 85817991ee68..51af69a1a328 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -17,7 +17,7 @@ cifs-$(CONFIG_CIFS_ACL) += cifsacl.o cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o -cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o +cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o dfs_cache.o cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index ba178b09de0b..593fb422d0f3 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -30,6 +30,9 @@ #include "cifsproto.h" #include "cifs_debug.h" #include "cifsfs.h" +#ifdef CONFIG_CIFS_DFS_UPCALL +#include "dfs_cache.h" +#endif #ifdef CONFIG_CIFS_SMB_DIRECT #include "smbdirect.h" #endif @@ -629,6 +632,11 @@ cifs_proc_init(void) &cifs_security_flags_proc_fops); proc_create("LookupCacheEnabled", 0644, proc_fs_cifs, &cifs_lookup_cache_proc_fops); + +#ifdef CONFIG_CIFS_DFS_UPCALL + proc_create("dfscache", 0644, proc_fs_cifs, &dfscache_proc_fops); +#endif + #ifdef CONFIG_CIFS_SMB_DIRECT proc_create("rdma_readwrite_threshold", 0644, proc_fs_cifs, &cifs_rdma_readwrite_threshold_proc_fops); @@ -663,6 +671,10 @@ cifs_proc_clean(void) remove_proc_entry("SecurityFlags", proc_fs_cifs); remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs); remove_proc_entry("LookupCacheEnabled", proc_fs_cifs); + +#ifdef CONFIG_CIFS_DFS_UPCALL + remove_proc_entry("dfscache", proc_fs_cifs); +#endif #ifdef CONFIG_CIFS_SMB_DIRECT remove_proc_entry("rdma_readwrite_threshold", proc_fs_cifs); remove_proc_entry("smbd_max_frmr_depth", proc_fs_cifs); diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index b97c74efd04a..d9b99abe1243 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -25,6 +25,7 @@ #include "dns_resolve.h" #include "cifs_debug.h" #include "cifs_unicode.h" +#include "dfs_cache.h" static LIST_HEAD(cifs_dfs_automount_list); @@ -126,7 +127,7 @@ cifs_build_devname(char *nodename, const char *prepath) * @sb_mountdata: parent/root DFS mount options (template) * @fullpath: full path in UNC format * @ref: server's referral - * @devname: pointer for saving device name + * @devname: optional pointer for saving device name * * creates mount options for submount based on template options sb_mountdata * and replacing unc,ip,prefixpath options with ones we've got form ref_unc. 
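Because @devname is optional now, a caller that wants to keep the original DFS path as the device name simply passes NULL and derives the name itself, which is what cifs_dfs_do_refmount() does below. A hedged caller-side sketch; dup_dfs_devname() is a hypothetical helper.

/* Hypothetical helper: keep the DFS full path, in '/' form, as devname. */
static char *dup_dfs_devname(const char *fullpath)
{
	char *devname;

	devname = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL);
	if (!devname)
		return ERR_PTR(-ENOMEM);

	convert_delimiter(devname, '/');	/* backslashes become slashes */
	return devname;
}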
@@ -140,6 +141,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata, char **devname) { int rc; + char *name; char *mountdata = NULL; const char *prepath = NULL; int md_len; @@ -158,17 +160,17 @@ char *cifs_compose_mount_options(const char *sb_mountdata, prepath++; } - *devname = cifs_build_devname(ref->node_name, prepath); - if (IS_ERR(*devname)) { - rc = PTR_ERR(*devname); - *devname = NULL; + name = cifs_build_devname(ref->node_name, prepath); + if (IS_ERR(name)) { + rc = PTR_ERR(name); + name = NULL; goto compose_mount_options_err; } - rc = dns_resolve_server_name_to_ip(*devname, &srvIP); + rc = dns_resolve_server_name_to_ip(name, &srvIP); if (rc < 0) { cifs_dbg(FYI, "%s: Failed to resolve server part of %s to IP: %d\n", - __func__, *devname, rc); + __func__, name, rc); goto compose_mount_options_err; } @@ -224,6 +226,9 @@ char *cifs_compose_mount_options(const char *sb_mountdata, strcat(mountdata, "ip="); strcat(mountdata, srvIP); + if (devname) + *devname = name; + /*cifs_dbg(FYI, "%s: parent mountdata: %s\n", __func__, sb_mountdata);*/ /*cifs_dbg(FYI, "%s: submount mountdata: %s\n", __func__, mountdata );*/ @@ -234,8 +239,7 @@ compose_mount_options_out: compose_mount_options_err: kfree(mountdata); mountdata = ERR_PTR(rc); - kfree(*devname); - *devname = NULL; + kfree(name); goto compose_mount_options_out; } @@ -251,20 +255,30 @@ static struct vfsmount *cifs_dfs_do_refmount(struct dentry *mntpt, { struct vfsmount *mnt; char *mountdata; - char *devname = NULL; + char *devname; + + /* + * Always pass down the DFS full path to smb3_do_mount() so we + * can use it later for failover. + */ + devname = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL); + if (!devname) + return ERR_PTR(-ENOMEM); + + convert_delimiter(devname, '/'); /* strip first '\' from fullpath */ mountdata = cifs_compose_mount_options(cifs_sb->mountdata, - fullpath + 1, ref, &devname); - - if (IS_ERR(mountdata)) + fullpath + 1, ref, NULL); + if (IS_ERR(mountdata)) { + kfree(devname); return (struct vfsmount *)mountdata; + } mnt = vfs_submount(mntpt, &cifs_fs_type, devname, mountdata); kfree(mountdata); kfree(devname); return mnt; - } static void dump_referral(const struct dfs_info3_param *ref) @@ -282,16 +296,15 @@ static void dump_referral(const struct dfs_info3_param *ref) */ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) { - struct dfs_info3_param *referrals = NULL; - unsigned int num_referrals = 0; + struct dfs_info3_param referral = {0}; struct cifs_sb_info *cifs_sb; struct cifs_ses *ses; - char *full_path; + struct cifs_tcon *tcon; + char *full_path, *root_path; unsigned int xid; - int i; + int len; int rc; struct vfsmount *mnt; - struct tcon_link *tlink; cifs_dbg(FYI, "in %s\n", __func__); BUG_ON(IS_ROOT(mntpt)); @@ -315,48 +328,69 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) if (full_path == NULL) goto cdda_exit; - tlink = cifs_sb_tlink(cifs_sb); - if (IS_ERR(tlink)) { - mnt = ERR_CAST(tlink); + cifs_dbg(FYI, "%s: full_path: %s\n", __func__, full_path); + + if (!cifs_sb_master_tlink(cifs_sb)) { + cifs_dbg(FYI, "%s: master tlink is NULL\n", __func__); goto free_full_path; } - ses = tlink_tcon(tlink)->ses; + tcon = cifs_sb_master_tcon(cifs_sb); + if (!tcon) { + cifs_dbg(FYI, "%s: master tcon is NULL\n", __func__); + goto free_full_path; + } + + root_path = kstrdup(tcon->treeName, GFP_KERNEL); + if (!root_path) { + mnt = ERR_PTR(-ENOMEM); + goto free_full_path; + } + cifs_dbg(FYI, "%s: root path: %s\n", __func__, root_path); + + ses = tcon->ses; xid = 
get_xid(); - rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls, - &num_referrals, &referrals, - cifs_remap(cifs_sb)); - free_xid(xid); - cifs_put_tlink(tlink); - - mnt = ERR_PTR(-ENOENT); - for (i = 0; i < num_referrals; i++) { - int len; - dump_referral(referrals + i); - /* connect to a node */ - len = strlen(referrals[i].node_name); - if (len < 2) { - cifs_dbg(VFS, "%s: Net Address path too short: %s\n", - __func__, referrals[i].node_name); - mnt = ERR_PTR(-EINVAL); - break; - } - mnt = cifs_dfs_do_refmount(mntpt, cifs_sb, - full_path, referrals + i); - cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n", - __func__, referrals[i].node_name, mnt); - if (!IS_ERR(mnt)) - goto success; + /* + * If DFS root has been expired, then unconditionally fetch it again to + * refresh DFS referral cache. + */ + rc = dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb), + root_path + 1, NULL, NULL); + if (!rc) { + rc = dfs_cache_find(xid, ses, cifs_sb->local_nls, + cifs_remap(cifs_sb), full_path + 1, + &referral, NULL); } - /* no valid submounts were found; return error from get_dfs_path() by - * preference */ - if (rc != 0) + free_xid(xid); + + if (rc) { mnt = ERR_PTR(rc); + goto free_root_path; + } + + dump_referral(&referral); -success: - free_dfs_info_array(referrals, num_referrals); + len = strlen(referral.node_name); + if (len < 2) { + cifs_dbg(VFS, "%s: Net Address path too short: %s\n", + __func__, referral.node_name); + mnt = ERR_PTR(-EINVAL); + goto free_dfs_ref; + } + /* + * cifs_mount() will retry every available node server in case + * of failures. + */ + mnt = cifs_dfs_do_refmount(mntpt, cifs_sb, full_path, &referral); + cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n", __func__, + referral.node_name, mnt); + +free_dfs_ref: + free_dfs_info_param(&referral); +free_root_path: + kfree(root_path); free_full_path: kfree(full_path); cdda_exit: diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 63d7530f2e1d..42f0d67f1054 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -72,6 +72,15 @@ struct cifs_sb_info { char *mountdata; /* options received at mount time or via DFS refs */ struct delayed_work prune_tlinks; struct rcu_head rcu; + + /* only used when CIFS_MOUNT_USE_PREFIX_PATH is set */ char *prepath; + + /* + * Path initially provided by the mount call. We might connect + * to something different via DFS but we want to keep it to do + * failover properly. 
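The saved path is what lets reconnect re-query the referral cache even after the connection has moved to a different target server. A hedged sketch of the lookup it enables, condensed from reconn_setup_dfs_targets() later in this diff; get_failover_targets() is a hypothetical name.

/* Hypothetical condensation: origin_fullpath drives the target lookup. */
static int get_failover_targets(struct cifs_sb_info *cifs_sb,
				struct dfs_cache_tgt_list *tl)
{
	if (!cifs_sb->origin_fullpath)
		return -EOPNOTSUPP;	/* not mounted through DFS */

	/* skip the leading '\' before asking the referral cache */
	return dfs_cache_noreq_find(cifs_sb->origin_fullpath + 1, NULL, tl);
}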
+ */ + char *origin_fullpath; /* \\HOST\SHARE\[OPTIONAL PATH] */ }; #endif /* _CIFS_FS_SB_H */ diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 85b31cfa2f3c..d2a05e46d6f5 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -224,7 +224,7 @@ int cifs_verify_signature(struct smb_rqst *rqst, if (cifs_pdu->Command == SMB_COM_LOCKING_ANDX) { struct smb_com_lock_req *pSMB = (struct smb_com_lock_req *)cifs_pdu; - if (pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE) + if (pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE) return 0; } @@ -304,12 +304,17 @@ int setup_ntlm_response(struct cifs_ses *ses, const struct nls_table *nls_cp) int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, char *lnm_session_key) { - int i; + int i, len; int rc; char password_with_pad[CIFS_ENCPWD_SIZE] = {0}; - if (password) - strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE); + if (password) { + for (len = 0; len < CIFS_ENCPWD_SIZE; len++) + if (!password[len]) + break; + + memcpy(password_with_pad, password, len); + } if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) { memcpy(lnm_session_key, password_with_pad, diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 865706edb307..62d48d486d8f 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -52,6 +52,9 @@ #include "cifs_spnego.h" #include "fscache.h" #include "smb2pdu.h" +#ifdef CONFIG_CIFS_DFS_UPCALL +#include "dfs_cache.h" +#endif int cifsFYI = 0; bool traceSMB; @@ -1494,10 +1497,15 @@ init_cifs(void) if (rc) goto out_destroy_mids; +#ifdef CONFIG_CIFS_DFS_UPCALL + rc = dfs_cache_init(); + if (rc) + goto out_destroy_request_bufs; +#endif /* CONFIG_CIFS_DFS_UPCALL */ #ifdef CONFIG_CIFS_UPCALL rc = init_cifs_spnego(); if (rc) - goto out_destroy_request_bufs; + goto out_destroy_dfs_cache; #endif /* CONFIG_CIFS_UPCALL */ #ifdef CONFIG_CIFS_ACL @@ -1525,6 +1533,10 @@ out_register_key_type: #endif #ifdef CONFIG_CIFS_UPCALL exit_cifs_spnego(); +out_destroy_dfs_cache: +#endif +#ifdef CONFIG_CIFS_DFS_UPCALL + dfs_cache_destroy(); out_destroy_request_bufs: #endif cifs_destroy_request_bufs(); @@ -1556,6 +1568,9 @@ exit_cifs(void) #ifdef CONFIG_CIFS_UPCALL exit_cifs_spnego(); #endif +#ifdef CONFIG_CIFS_DFS_UPCALL + dfs_cache_destroy(); +#endif cifs_destroy_request_bufs(); cifs_destroy_mids(); cifs_destroy_inodecache(); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 4c3b5cfccc49..26776eddd85d 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -150,5 +150,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.14" +#define CIFS_VERSION "2.15" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 38ab0fca49e1..01ded7038b19 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -701,6 +701,13 @@ struct TCP_Server_Info { struct delayed_work reconnect; /* reconnect workqueue job */ struct mutex reconnect_mutex; /* prevent simultaneous reconnects */ unsigned long echo_interval; + + /* + * Number of targets available for reconnect. The more targets + * the more tasks have to wait to let the demultiplex thread + * reconnect. + */ + int nr_targets; }; static inline unsigned int @@ -1014,6 +1021,11 @@ struct cifs_tcon { struct list_head pending_opens; /* list of incomplete opens */ struct cached_fid crfid; /* Cached root fid */ /* BB add field for back pointer to sb struct(s)? 
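The calc_lanman_hash() hunk earlier in these cifs changes replaces strncpy() with an explicit bounded copy, so at most CIFS_ENCPWD_SIZE bytes of the NUL-terminated password are read into the zeroed pad. A minimal sketch of that pattern as a stand-alone helper; copy_password_padded() is a hypothetical name.

/* Hypothetical helper showing the bounded copy into a zeroed pad. */
static void copy_password_padded(char pad[CIFS_ENCPWD_SIZE],
				 const char *password)
{
	int len;

	memset(pad, 0, CIFS_ENCPWD_SIZE);

	if (!password)
		return;

	for (len = 0; len < CIFS_ENCPWD_SIZE; len++)
		if (!password[len])
			break;

	memcpy(pad, password, len);
}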
*/ +#ifdef CONFIG_CIFS_DFS_UPCALL + char *dfs_path; + int remap:2; + struct list_head ulist; /* cache update list */ +#endif }; /* @@ -1508,6 +1520,7 @@ struct dfs_info3_param { int ref_flag; char *path_name; char *node_name; + int ttl; }; /* @@ -1545,7 +1558,6 @@ static inline void free_dfs_info_param(struct dfs_info3_param *param) if (param) { kfree(param->path_name); kfree(param->node_name); - kfree(param); } } @@ -1790,6 +1802,7 @@ extern struct smb_version_values smb3any_values; extern struct smb_version_operations smb30_operations; extern struct smb_version_values smb30_values; #define SMB302_VERSION_STRING "3.02" +#define ALT_SMB302_VERSION_STRING "3.0.2" /*extern struct smb_version_operations smb302_operations;*/ /* not needed yet */ extern struct smb_version_values smb302_values; #define SMB311_VERSION_STRING "3.1.1" diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index fa361bc00602..336c116995d7 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -22,6 +22,9 @@ #define _CIFSPROTO_H #include <linux/nls.h> #include "trace.h" +#ifdef CONFIG_CIFS_DFS_UPCALL +#include "dfs_cache.h" +#endif struct statfs; struct smb_vol; @@ -213,7 +216,7 @@ extern int cifs_match_super(struct super_block *, void *); extern void cifs_cleanup_volume_info(struct smb_vol *pvolume_info); extern struct smb_vol *cifs_get_volume_info(char *mount_data, const char *devname, bool is_smb3); -extern int cifs_mount(struct cifs_sb_info *, struct smb_vol *); +extern int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol); extern void cifs_umount(struct cifs_sb_info *); extern void cifs_mark_open_files_invalid(struct cifs_tcon *tcon); extern void cifs_reopen_persistent_handles(struct cifs_tcon *tcon); @@ -294,11 +297,6 @@ extern int CIFSGetDFSRefer(const unsigned int xid, struct cifs_ses *ses, unsigned int *num_of_nodes, const struct nls_table *nls_codepage, int remap); -extern int get_dfs_path(const unsigned int xid, struct cifs_ses *ses, - const char *old_path, - const struct nls_table *nls_codepage, - unsigned int *num_referrals, - struct dfs_info3_param **referrals, int remap); extern int parse_dfs_referrals(struct get_dfs_referral_rsp *rsp, u32 rsp_size, unsigned int *num_of_nodes, struct dfs_info3_param **target_nodes, @@ -524,6 +522,11 @@ extern int E_md4hash(const unsigned char *passwd, unsigned char *p16, const struct nls_table *codepage); extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24); +extern void +cifs_cleanup_volume_info_contents(struct smb_vol *volume_info); + +extern struct TCP_Server_Info * +cifs_find_tcp_session(struct smb_vol *vol); void cifs_readdata_release(struct kref *refcount); int cifs_async_readv(struct cifs_readdata *rdata); @@ -562,4 +565,17 @@ void cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc); extern void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page, unsigned int *len, unsigned int *offset); +void extract_unc_hostname(const char *unc, const char **h, size_t *len); + +#ifdef CONFIG_CIFS_DFS_UPCALL +static inline int get_dfs_path(const unsigned int xid, struct cifs_ses *ses, + const char *old_path, + const struct nls_table *nls_codepage, + struct dfs_info3_param *referral, int remap) +{ + return dfs_cache_find(xid, ses, nls_codepage, remap, old_path, + referral, NULL); +} +#endif + #endif /* _CIFSPROTO_H */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index f82fd342bca5..b1f49c1c543a 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -44,6 +44,9 @@ #include "cifs_debug.h" 
#include "fscache.h" #include "smbdirect.h" +#ifdef CONFIG_CIFS_DFS_UPCALL +#include "dfs_cache.h" +#endif #ifdef CONFIG_CIFS_POSIX static struct { @@ -118,6 +121,77 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) */ } +#ifdef CONFIG_CIFS_DFS_UPCALL +static int __cifs_reconnect_tcon(const struct nls_table *nlsc, + struct cifs_tcon *tcon) +{ + int rc; + struct dfs_cache_tgt_list tl; + struct dfs_cache_tgt_iterator *it = NULL; + char tree[MAX_TREE_SIZE + 1]; + const char *tcp_host; + size_t tcp_host_len; + const char *dfs_host; + size_t dfs_host_len; + + if (tcon->ipc) { + snprintf(tree, sizeof(tree), "\\\\%s\\IPC$", + tcon->ses->server->hostname); + return CIFSTCon(0, tcon->ses, tree, tcon, nlsc); + } + + if (!tcon->dfs_path) + return CIFSTCon(0, tcon->ses, tcon->treeName, tcon, nlsc); + + rc = dfs_cache_noreq_find(tcon->dfs_path + 1, NULL, &tl); + if (rc) + return rc; + + extract_unc_hostname(tcon->ses->server->hostname, &tcp_host, + &tcp_host_len); + + for (it = dfs_cache_get_tgt_iterator(&tl); it; + it = dfs_cache_get_next_tgt(&tl, it)) { + const char *tgt = dfs_cache_get_tgt_name(it); + + extract_unc_hostname(tgt, &dfs_host, &dfs_host_len); + + if (dfs_host_len != tcp_host_len + || strncasecmp(dfs_host, tcp_host, dfs_host_len) != 0) { + cifs_dbg(FYI, "%s: skipping %.*s, doesn't match %.*s", + __func__, + (int)dfs_host_len, dfs_host, + (int)tcp_host_len, tcp_host); + continue; + } + + snprintf(tree, sizeof(tree), "\\%s", tgt); + + rc = CIFSTCon(0, tcon->ses, tree, tcon, nlsc); + if (!rc) + break; + if (rc == -EREMOTE) + break; + } + + if (!rc) { + if (it) + rc = dfs_cache_noreq_update_tgthint(tcon->dfs_path + 1, + it); + else + rc = -ENOENT; + } + dfs_cache_free_tgts(&tl); + return rc; +} +#else +static inline int __cifs_reconnect_tcon(const struct nls_table *nlsc, + struct cifs_tcon *tcon) +{ + return CIFSTCon(0, tcon->ses, tcon->treeName, tcon, nlsc); +} +#endif + /* reconnect the socket, tcon, and smb session if needed */ static int cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) @@ -126,6 +200,7 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) struct cifs_ses *ses; struct TCP_Server_Info *server; struct nls_table *nls_codepage; + int retries; /* * SMBs NegProt, SessSetup, uLogoff do not have tcon yet so check for @@ -152,9 +227,12 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) } } + retries = server->nr_targets; + /* - * Give demultiplex thread up to 10 seconds to reconnect, should be - * greater than cifs socket timeout which is 7 seconds + * Give demultiplex thread up to 10 seconds to each target available for + * reconnect -- should be greater than cifs socket timeout which is 7 + * seconds. */ while (server->tcpStatus == CifsNeedReconnect) { rc = wait_event_interruptible_timeout(server->response_q, @@ -170,6 +248,9 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) if (server->tcpStatus != CifsNeedReconnect) break; + if (--retries) + continue; + /* * on "soft" mounts we wait once. 
Hard mounts keep * retrying until process is killed or server comes @@ -179,6 +260,7 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n"); return -EHOSTDOWN; } + retries = server->nr_targets; } if (!ses->need_reconnect && !tcon->need_reconnect) @@ -214,7 +296,7 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) } cifs_mark_open_files_invalid(tcon); - rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage); + rc = __cifs_reconnect_tcon(nls_codepage, tcon); mutex_unlock(&ses->session_mutex); cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 6f24f129a751..69b9d5606eba 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -56,6 +56,11 @@ #include "fscache.h" #include "smb2proto.h" #include "smbdirect.h" +#include "dns_resolve.h" +#include "cifsfs.h" +#ifdef CONFIG_CIFS_DFS_UPCALL +#include "dfs_cache.h" +#endif extern mempool_t *cifs_req_poolp; extern bool disable_legacy_dialects; @@ -304,6 +309,7 @@ static const match_table_t cifs_smb_version_tokens = { { Smb_21, SMB21_VERSION_STRING }, { Smb_30, SMB30_VERSION_STRING }, { Smb_302, SMB302_VERSION_STRING }, + { Smb_302, ALT_SMB302_VERSION_STRING }, { Smb_311, SMB311_VERSION_STRING }, { Smb_311, ALT_SMB311_VERSION_STRING }, { Smb_3any, SMB3ANY_VERSION_STRING }, @@ -317,6 +323,131 @@ static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink); static void cifs_prune_tlinks(struct work_struct *work); static int cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data, const char *devname, bool is_smb3); +static char *extract_hostname(const char *unc); + +/* + * Resolve hostname and set ip addr in tcp ses. Useful for hostnames that may + * get their ip addresses changed at some point. + * + * This should be called with server->srv_mutex held. + */ +#ifdef CONFIG_CIFS_DFS_UPCALL +static int reconn_set_ipaddr(struct TCP_Server_Info *server) +{ + int rc; + int len; + char *unc, *ipaddr = NULL; + + if (!server->hostname) + return -EINVAL; + + len = strlen(server->hostname) + 3; + + unc = kmalloc(len, GFP_KERNEL); + if (!unc) { + cifs_dbg(FYI, "%s: failed to create UNC path\n", __func__); + return -ENOMEM; + } + snprintf(unc, len, "\\\\%s", server->hostname); + + rc = dns_resolve_server_name_to_ip(unc, &ipaddr); + kfree(unc); + + if (rc < 0) { + cifs_dbg(FYI, "%s: failed to resolve server part of %s to IP: %d\n", + __func__, server->hostname, rc); + return rc; + } + + rc = cifs_convert_address((struct sockaddr *)&server->dstaddr, ipaddr, + strlen(ipaddr)); + kfree(ipaddr); + + return !rc ? -1 : 0; +} +#else +static inline int reconn_set_ipaddr(struct TCP_Server_Info *server) +{ + return 0; +} +#endif + +#ifdef CONFIG_CIFS_DFS_UPCALL +struct super_cb_data { + struct TCP_Server_Info *server; + struct cifs_sb_info *cifs_sb; +}; + +/* These functions must be called with server->srv_mutex held */ + +static void super_cb(struct super_block *sb, void *arg) +{ + struct super_cb_data *d = arg; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; + + if (d->cifs_sb) + return; + + cifs_sb = CIFS_SB(sb); + tcon = cifs_sb_master_tcon(cifs_sb); + if (tcon->ses->server == d->server) + d->cifs_sb = cifs_sb; +} + +static inline struct cifs_sb_info * +find_super_by_tcp(struct TCP_Server_Info *server) +{ + struct super_cb_data d = { + .server = server, + .cifs_sb = NULL, + }; + + iterate_supers_type(&cifs_fs_type, super_cb, &d); + return d.cifs_sb ? 
d.cifs_sb : ERR_PTR(-ENOENT); +} + +static void reconn_inval_dfs_target(struct TCP_Server_Info *server, + struct cifs_sb_info *cifs_sb, + struct dfs_cache_tgt_list *tgt_list, + struct dfs_cache_tgt_iterator **tgt_it) +{ + const char *name; + + if (!cifs_sb || !cifs_sb->origin_fullpath || !tgt_list || + !server->nr_targets) + return; + + if (!*tgt_it) { + *tgt_it = dfs_cache_get_tgt_iterator(tgt_list); + } else { + *tgt_it = dfs_cache_get_next_tgt(tgt_list, *tgt_it); + if (!*tgt_it) + *tgt_it = dfs_cache_get_tgt_iterator(tgt_list); + } + + cifs_dbg(FYI, "%s: UNC: %s\n", __func__, cifs_sb->origin_fullpath); + + name = dfs_cache_get_tgt_name(*tgt_it); + + kfree(server->hostname); + + server->hostname = extract_hostname(name); + if (!server->hostname) { + cifs_dbg(FYI, "%s: failed to extract hostname from target: %d\n", + __func__, -ENOMEM); + } +} + +static inline int reconn_setup_dfs_targets(struct cifs_sb_info *cifs_sb, + struct dfs_cache_tgt_list *tl, + struct dfs_cache_tgt_iterator **it) +{ + if (!cifs_sb->origin_fullpath) + return -EOPNOTSUPP; + return dfs_cache_noreq_find(cifs_sb->origin_fullpath + 1, NULL, tl); +} +#endif /* * cifs tcp session reconnection @@ -335,8 +466,33 @@ cifs_reconnect(struct TCP_Server_Info *server) struct cifs_tcon *tcon; struct mid_q_entry *mid_entry; struct list_head retry_list; +#ifdef CONFIG_CIFS_DFS_UPCALL + struct cifs_sb_info *cifs_sb = NULL; + struct dfs_cache_tgt_list tgt_list = {0}; + struct dfs_cache_tgt_iterator *tgt_it = NULL; +#endif spin_lock(&GlobalMid_Lock); + server->nr_targets = 1; +#ifdef CONFIG_CIFS_DFS_UPCALL + cifs_sb = find_super_by_tcp(server); + if (IS_ERR(cifs_sb)) { + rc = PTR_ERR(cifs_sb); + cifs_dbg(FYI, "%s: will not do DFS failover: rc = %d\n", + __func__, rc); + cifs_sb = NULL; + } else { + rc = reconn_setup_dfs_targets(cifs_sb, &tgt_list, &tgt_it); + if (rc) { + cifs_dbg(VFS, "%s: no target servers for DFS failover\n", + __func__); + } else { + server->nr_targets = dfs_cache_get_nr_tgts(&tgt_list); + } + } + cifs_dbg(FYI, "%s: will retry %d target(s)\n", __func__, + server->nr_targets); +#endif if (server->tcpStatus == CifsExiting) { /* the demux thread will exit normally next time through the loop */ @@ -410,14 +566,27 @@ cifs_reconnect(struct TCP_Server_Info *server) do { try_to_freeze(); - /* we should try only the port we connected to before */ mutex_lock(&server->srv_mutex); + /* + * Set up next DFS target server (if any) for reconnect. If DFS + * feature is disabled, then we will retry last server we + * connected to before. 
+ */ if (cifs_rdma_enabled(server)) rc = smbd_reconnect(server); else rc = generic_ip_connect(server); if (rc) { cifs_dbg(FYI, "reconnect error %d\n", rc); +#ifdef CONFIG_CIFS_DFS_UPCALL + reconn_inval_dfs_target(server, cifs_sb, &tgt_list, + &tgt_it); +#endif + rc = reconn_set_ipaddr(server); + if (rc) { + cifs_dbg(FYI, "%s: failed to resolve hostname: %d\n", + __func__, rc); + } mutex_unlock(&server->srv_mutex); msleep(3000); } else { @@ -430,6 +599,22 @@ cifs_reconnect(struct TCP_Server_Info *server) } } while (server->tcpStatus == CifsNeedReconnect); +#ifdef CONFIG_CIFS_DFS_UPCALL + if (tgt_it) { + rc = dfs_cache_noreq_update_tgthint(cifs_sb->origin_fullpath + 1, + tgt_it); + if (rc) { + cifs_dbg(VFS, "%s: failed to update DFS target hint: rc = %d\n", + __func__, rc); + } + rc = dfs_cache_update_vol(cifs_sb->origin_fullpath, server); + if (rc) { + cifs_dbg(VFS, "%s: failed to update vol info in DFS cache: rc = %d\n", + __func__, rc); + } + dfs_cache_free_tgts(&tgt_list); + } +#endif if (server->tcpStatus == CifsNeedNegotiate) mod_delayed_work(cifsiod_wq, &server->echo, 0); @@ -1043,7 +1228,12 @@ extract_hostname(const char *unc) /* skip double chars at beginning of string */ /* BB: check validity of these bytes? */ - src = unc + 2; + if (strlen(unc) < 3) + return ERR_PTR(-EINVAL); + for (src = unc; *src && *src == '\\'; src++) + ; + if (!*src) + return ERR_PTR(-EINVAL); /* delimiter between hostname and sharename is always '\\' now */ delim = strchr(src, '\\'); @@ -1827,7 +2017,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->password = NULL; break; } - /* Yes it is. Drop down to Opt_pass below.*/ + /* Fallthrough - to Opt_pass below.*/ case Opt_pass: /* Obtain the value string */ value = strchr(data, '='); @@ -2289,7 +2479,7 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol) return 1; } -static struct TCP_Server_Info * +struct TCP_Server_Info * cifs_find_tcp_session(struct smb_vol *vol) { struct TCP_Server_Info *server; @@ -2461,6 +2651,8 @@ smbd_connected: } tcp_ses->tcpStatus = CifsNeedNegotiate; + tcp_ses->nr_targets = 1; + /* thread spawned, put it on the list */ spin_lock(&cifs_tcp_ses_lock); list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list); @@ -3256,25 +3448,6 @@ out: return rc; } -int -get_dfs_path(const unsigned int xid, struct cifs_ses *ses, const char *old_path, - const struct nls_table *nls_codepage, unsigned int *num_referrals, - struct dfs_info3_param **referrals, int remap) -{ - int rc = 0; - - if (!ses->server->ops->get_dfs_refer) - return -ENOSYS; - - *num_referrals = 0; - *referrals = NULL; - - rc = ses->server->ops->get_dfs_refer(xid, ses, old_path, - referrals, num_referrals, - nls_codepage, remap); - return rc; -} - #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key cifs_key[2]; static struct lock_class_key cifs_slock_key[2]; @@ -3746,8 +3919,8 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info, return 0; } -static void -cleanup_volume_info_contents(struct smb_vol *volume_info) +void +cifs_cleanup_volume_info_contents(struct smb_vol *volume_info) { kfree(volume_info->username); kzfree(volume_info->password); @@ -3762,10 +3935,136 @@ cifs_cleanup_volume_info(struct smb_vol *volume_info) { if (!volume_info) return; - cleanup_volume_info_contents(volume_info); + cifs_cleanup_volume_info_contents(volume_info); kfree(volume_info); } +/* Release all succeed connections */ +static inline void mount_put_conns(struct cifs_sb_info *cifs_sb, + unsigned int xid, + struct TCP_Server_Info *server, + 
struct cifs_ses *ses, struct cifs_tcon *tcon) +{ + int rc = 0; + + if (tcon) + cifs_put_tcon(tcon); + else if (ses) + cifs_put_smb_ses(ses); + else if (server) + cifs_put_tcp_session(server, 0); + cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_POSIX_PATHS; + free_xid(xid); +} + +/* Get connections for tcp, ses and tcon */ +static int mount_get_conns(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, + unsigned int *xid, + struct TCP_Server_Info **nserver, + struct cifs_ses **nses, struct cifs_tcon **ntcon) +{ + int rc = 0; + struct TCP_Server_Info *server; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + + *nserver = NULL; + *nses = NULL; + *ntcon = NULL; + + *xid = get_xid(); + + /* get a reference to a tcp session */ + server = cifs_get_tcp_session(vol); + if (IS_ERR(server)) { + rc = PTR_ERR(server); + return rc; + } + + *nserver = server; + + if ((vol->max_credits < 20) || (vol->max_credits > 60000)) + server->max_credits = SMB2_MAX_CREDITS_AVAILABLE; + else + server->max_credits = vol->max_credits; + + /* get a reference to a SMB session */ + ses = cifs_get_smb_ses(server, vol); + if (IS_ERR(ses)) { + rc = PTR_ERR(ses); + return rc; + } + + *nses = ses; + + if ((vol->persistent == true) && (!(ses->server->capabilities & + SMB2_GLOBAL_CAP_PERSISTENT_HANDLES))) { + cifs_dbg(VFS, "persistent handles not supported by server\n"); + return -EOPNOTSUPP; + } + + /* search for existing tcon to this server share */ + tcon = cifs_get_tcon(ses, vol); + if (IS_ERR(tcon)) { + rc = PTR_ERR(tcon); + return rc; + } + + *ntcon = tcon; + + /* if new SMB3.11 POSIX extensions are supported do not remap / and \ */ + if (tcon->posix_extensions) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS; + + /* tell server which Unix caps we support */ + if (cap_unix(tcon->ses)) { + /* + * reset of caps checks mount to see if unix extensions disabled + * for just this mount. + */ + reset_cifs_unix_caps(*xid, tcon, cifs_sb, vol); + if ((tcon->ses->server->tcpStatus == CifsNeedReconnect) && + (le64_to_cpu(tcon->fsUnixInfo.Capability) & + CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)) + return -EACCES; + } else + tcon->unix_ext = 0; /* server does not support them */ + + /* do not care if a following call succeed - informational */ + if (!tcon->pipe && server->ops->qfs_tcon) + server->ops->qfs_tcon(*xid, tcon); + + cifs_sb->wsize = server->ops->negotiate_wsize(tcon, vol); + cifs_sb->rsize = server->ops->negotiate_rsize(tcon, vol); + + return 0; +} + +static int mount_setup_tlink(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, + struct cifs_tcon *tcon) +{ + struct tcon_link *tlink; + + /* hang the tcon off of the superblock */ + tlink = kzalloc(sizeof(*tlink), GFP_KERNEL); + if (tlink == NULL) + return -ENOMEM; + + tlink->tl_uid = ses->linux_uid; + tlink->tl_tcon = tcon; + tlink->tl_time = jiffies; + set_bit(TCON_LINK_MASTER, &tlink->tl_flags); + set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags); + + cifs_sb->master_tlink = tlink; + spin_lock(&cifs_sb->tlink_tree_lock); + tlink_rb_insert(&cifs_sb->tlink_tree, tlink); + spin_unlock(&cifs_sb->tlink_tree_lock); + + queue_delayed_work(cifsiod_wq, &cifs_sb->prune_tlinks, + TLINK_IDLE_EXPIRE); + return 0; +} #ifdef CONFIG_CIFS_DFS_UPCALL /* @@ -3774,10 +4073,11 @@ cifs_cleanup_volume_info(struct smb_vol *volume_info) */ static char * build_unc_path_to_root(const struct smb_vol *vol, - const struct cifs_sb_info *cifs_sb) + const struct cifs_sb_info *cifs_sb, bool useppath) { char *full_path, *pos; - unsigned int pplen = vol->prepath ? 
strlen(vol->prepath) + 1 : 0; + unsigned int pplen = useppath && vol->prepath ? + strlen(vol->prepath) + 1 : 0; unsigned int unc_len = strnlen(vol->UNC, MAX_TREE_SIZE + 1); full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL); @@ -3799,8 +4099,9 @@ build_unc_path_to_root(const struct smb_vol *vol, return full_path; } -/* - * Perform a dfs referral query for a share and (optionally) prefix +/** + * expand_dfs_referral - Perform a dfs referral query and update the cifs_sb + * * * If a referral is found, cifs_sb->mountdata will be (re-)allocated * to a string containing updated options for the submount. Otherwise it @@ -3815,39 +4116,36 @@ expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses, int check_prefix) { int rc; - unsigned int num_referrals = 0; - struct dfs_info3_param *referrals = NULL; + struct dfs_info3_param referral = {0}; char *full_path = NULL, *ref_path = NULL, *mdata = NULL; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) return -EREMOTE; - full_path = build_unc_path_to_root(volume_info, cifs_sb); + full_path = build_unc_path_to_root(volume_info, cifs_sb, true); if (IS_ERR(full_path)) return PTR_ERR(full_path); /* For DFS paths, skip the first '\' of the UNC */ ref_path = check_prefix ? full_path + 1 : volume_info->UNC + 1; - rc = get_dfs_path(xid, ses, ref_path, cifs_sb->local_nls, - &num_referrals, &referrals, cifs_remap(cifs_sb)); - - if (!rc && num_referrals > 0) { + rc = dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb), + ref_path, &referral, NULL); + if (!rc) { char *fake_devname = NULL; mdata = cifs_compose_mount_options(cifs_sb->mountdata, - full_path + 1, referrals, + full_path + 1, &referral, &fake_devname); - - free_dfs_info_array(referrals, num_referrals); + free_dfs_info_param(&referral); if (IS_ERR(mdata)) { rc = PTR_ERR(mdata); mdata = NULL; } else { - cleanup_volume_info_contents(volume_info); + cifs_cleanup_volume_info_contents(volume_info); rc = cifs_setup_volume_info(volume_info, mdata, - fake_devname, false); + fake_devname, false); } kfree(fake_devname); kfree(cifs_sb->mountdata); @@ -3856,6 +4154,143 @@ expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses, kfree(full_path); return rc; } + +static inline int get_next_dfs_tgt(const char *path, + struct dfs_cache_tgt_list *tgt_list, + struct dfs_cache_tgt_iterator **tgt_it) +{ + if (!*tgt_it) + *tgt_it = dfs_cache_get_tgt_iterator(tgt_list); + else + *tgt_it = dfs_cache_get_next_tgt(tgt_list, *tgt_it); + return !*tgt_it ? 
-EHOSTDOWN : 0; +} + +static int update_vol_info(const struct dfs_cache_tgt_iterator *tgt_it, + struct smb_vol *fake_vol, struct smb_vol *vol) +{ + const char *tgt = dfs_cache_get_tgt_name(tgt_it); + int len = strlen(tgt) + 2; + char *new_unc; + + new_unc = kmalloc(len, GFP_KERNEL); + if (!new_unc) + return -ENOMEM; + snprintf(new_unc, len, "\\%s", tgt); + + kfree(vol->UNC); + vol->UNC = new_unc; + + if (fake_vol->prepath) { + kfree(vol->prepath); + vol->prepath = fake_vol->prepath; + fake_vol->prepath = NULL; + } + memcpy(&vol->dstaddr, &fake_vol->dstaddr, sizeof(vol->dstaddr)); + + return 0; +} + +static int setup_dfs_tgt_conn(const char *path, + const struct dfs_cache_tgt_iterator *tgt_it, + struct cifs_sb_info *cifs_sb, + struct smb_vol *vol, + unsigned int *xid, + struct TCP_Server_Info **server, + struct cifs_ses **ses, + struct cifs_tcon **tcon) +{ + int rc; + struct dfs_info3_param ref = {0}; + char *mdata = NULL, *fake_devname = NULL; + struct smb_vol fake_vol = {0}; + + cifs_dbg(FYI, "%s: dfs path: %s\n", __func__, path); + + rc = dfs_cache_get_tgt_referral(path, tgt_it, &ref); + if (rc) + return rc; + + mdata = cifs_compose_mount_options(cifs_sb->mountdata, path, &ref, + &fake_devname); + free_dfs_info_param(&ref); + + if (IS_ERR(mdata)) { + rc = PTR_ERR(mdata); + mdata = NULL; + } else { + cifs_dbg(FYI, "%s: fake_devname: %s\n", __func__, fake_devname); + rc = cifs_setup_volume_info(&fake_vol, mdata, fake_devname, + false); + } + kfree(mdata); + kfree(fake_devname); + + if (!rc) { + /* + * We use a 'fake_vol' here because we need pass it down to the + * mount_{get,put} functions to test connection against new DFS + * targets. + */ + mount_put_conns(cifs_sb, *xid, *server, *ses, *tcon); + rc = mount_get_conns(&fake_vol, cifs_sb, xid, server, ses, + tcon); + if (!rc) { + /* + * We were able to connect to new target server. + * Update current volume info with new target server. + */ + rc = update_vol_info(tgt_it, &fake_vol, vol); + } + } + cifs_cleanup_volume_info_contents(&fake_vol); + return rc; +} + +static int mount_do_dfs_failover(const char *path, + struct cifs_sb_info *cifs_sb, + struct smb_vol *vol, + struct cifs_ses *root_ses, + unsigned int *xid, + struct TCP_Server_Info **server, + struct cifs_ses **ses, + struct cifs_tcon **tcon) +{ + int rc; + struct dfs_cache_tgt_list tgt_list; + struct dfs_cache_tgt_iterator *tgt_it = NULL; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) + return -EOPNOTSUPP; + + rc = dfs_cache_noreq_find(path, NULL, &tgt_list); + if (rc) + return rc; + + for (;;) { + /* Get next DFS target server - if any */ + rc = get_next_dfs_tgt(path, &tgt_list, &tgt_it); + if (rc) + break; + /* Connect to next DFS target */ + rc = setup_dfs_tgt_conn(path, tgt_it, cifs_sb, vol, xid, server, + ses, tcon); + if (!rc || rc == -EACCES || rc == -EOPNOTSUPP) + break; + } + if (!rc) { + /* + * Update DFS target hint in DFS referral cache with the target + * server we successfully reconnected to. + */ + rc = dfs_cache_update_tgthint(*xid, root_ses ? root_ses : *ses, + cifs_sb->local_nls, + cifs_remap(cifs_sb), path, + tgt_it); + } + dfs_cache_free_tgts(&tgt_list); + return rc; +} #endif static int @@ -3954,107 +4389,108 @@ cifs_are_all_path_components_accessible(struct TCP_Server_Info *server, return rc; } -int -cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) +/* + * Check if path is remote (e.g. a DFS share). Return -EREMOTE if it is, + * otherwise 0. 
+ */ +static int is_path_remote(struct cifs_sb_info *cifs_sb, struct smb_vol *vol, + const unsigned int xid, + struct TCP_Server_Info *server, + struct cifs_tcon *tcon) { int rc; - unsigned int xid; - struct cifs_ses *ses; - struct cifs_tcon *tcon; - struct TCP_Server_Info *server; - char *full_path; - struct tcon_link *tlink; -#ifdef CONFIG_CIFS_DFS_UPCALL - int referral_walks_count = 0; -#endif + char *full_path; -#ifdef CONFIG_CIFS_DFS_UPCALL -try_mount_again: - /* cleanup activities if we're chasing a referral */ - if (referral_walks_count) { - if (tcon) - cifs_put_tcon(tcon); - else if (ses) - cifs_put_smb_ses(ses); + if (!server->ops->is_path_accessible) + return -EOPNOTSUPP; - cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_POSIX_PATHS; - - free_xid(xid); - } -#endif - rc = 0; - tcon = NULL; - ses = NULL; - server = NULL; - full_path = NULL; - tlink = NULL; + /* + * cifs_build_path_to_root works only when we have a valid tcon + */ + full_path = cifs_build_path_to_root(vol, cifs_sb, tcon, + tcon->Flags & SMB_SHARE_IS_IN_DFS); + if (full_path == NULL) + return -ENOMEM; - xid = get_xid(); + cifs_dbg(FYI, "%s: full_path: %s\n", __func__, full_path); - /* get a reference to a tcp session */ - server = cifs_get_tcp_session(volume_info); - if (IS_ERR(server)) { - rc = PTR_ERR(server); - goto out; - } - if ((volume_info->max_credits < 20) || - (volume_info->max_credits > 60000)) - server->max_credits = SMB2_MAX_CREDITS_AVAILABLE; - else - server->max_credits = volume_info->max_credits; - /* get a reference to a SMB session */ - ses = cifs_get_smb_ses(server, volume_info); - if (IS_ERR(ses)) { - rc = PTR_ERR(ses); - ses = NULL; - goto mount_fail_check; + rc = server->ops->is_path_accessible(xid, tcon, cifs_sb, + full_path); + if (rc != 0 && rc != -EREMOTE) { + kfree(full_path); + return rc; } - if ((volume_info->persistent == true) && ((ses->server->capabilities & - SMB2_GLOBAL_CAP_PERSISTENT_HANDLES) == 0)) { - cifs_dbg(VFS, "persistent handles not supported by server\n"); - rc = -EOPNOTSUPP; - goto mount_fail_check; + if (rc != -EREMOTE) { + rc = cifs_are_all_path_components_accessible(server, xid, tcon, + cifs_sb, + full_path); + if (rc != 0) { + cifs_dbg(VFS, "cannot query dirs between root and final path, " + "enabling CIFS_MOUNT_USE_PREFIX_PATH\n"); + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; + rc = 0; + } } - /* search for existing tcon to this server share */ - tcon = cifs_get_tcon(ses, volume_info); - if (IS_ERR(tcon)) { - rc = PTR_ERR(tcon); - tcon = NULL; - if (rc == -EACCES) - goto mount_fail_check; - - goto remote_path_check; - } + kfree(full_path); + return rc; +} - /* if new SMB3.11 POSIX extensions are supported do not remap / and \ */ - if (tcon->posix_extensions) - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS; +#ifdef CONFIG_CIFS_DFS_UPCALL +int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol) +{ + int rc = 0; + unsigned int xid; + struct cifs_ses *ses; + struct cifs_tcon *root_tcon = NULL; + struct cifs_tcon *tcon = NULL; + struct TCP_Server_Info *server; + char *root_path = NULL, *full_path = NULL; + char *old_mountdata; + int count; - /* tell server which Unix caps we support */ - if (cap_unix(tcon->ses)) { - /* reset of caps checks mount to see if unix extensions - disabled for just this mount */ - reset_cifs_unix_caps(xid, tcon, cifs_sb, volume_info); - if ((tcon->ses->server->tcpStatus == CifsNeedReconnect) && - (le64_to_cpu(tcon->fsUnixInfo.Capability) & - CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)) { - rc = -EACCES; - goto mount_fail_check; 
+ rc = mount_get_conns(vol, cifs_sb, &xid, &server, &ses, &tcon); + if (!rc && tcon) { + /* If not a standalone DFS root, then check if path is remote */ + rc = dfs_cache_find(xid, ses, cifs_sb->local_nls, + cifs_remap(cifs_sb), vol->UNC + 1, NULL, + NULL); + if (rc) { + rc = is_path_remote(cifs_sb, vol, xid, server, tcon); + if (!rc) + goto out; + if (rc != -EREMOTE) + goto error; } - } else - tcon->unix_ext = 0; /* server does not support them */ - - /* do not care if a following call succeed - informational */ - if (!tcon->pipe && server->ops->qfs_tcon) - server->ops->qfs_tcon(xid, tcon); - - cifs_sb->wsize = server->ops->negotiate_wsize(tcon, volume_info); - cifs_sb->rsize = server->ops->negotiate_rsize(tcon, volume_info); + } + /* + * If first DFS target server went offline and we failed to connect it, + * server and ses pointers are NULL at this point, though we still have + * chance to get a cached DFS referral in expand_dfs_referral() and + * retry next target available in it. + * + * If a NULL ses ptr is passed to dfs_cache_find(), a lookup will be + * performed against DFS path and *no* requests will be sent to server + * for any new DFS referrals. Hence it's safe to skip checking whether + * server or ses ptr is NULL. + */ + if (rc == -EACCES || rc == -EOPNOTSUPP) + goto error; + + root_path = build_unc_path_to_root(vol, cifs_sb, false); + if (IS_ERR(root_path)) { + rc = PTR_ERR(root_path); + root_path = NULL; + goto error; + } -remote_path_check: -#ifdef CONFIG_CIFS_DFS_UPCALL + full_path = build_unc_path_to_root(vol, cifs_sb, true); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); + full_path = NULL; + goto error; + } /* * Perform an unconditional check for whether there are DFS * referrals for this path without prefix, to provide support @@ -4062,119 +4498,173 @@ remote_path_check: * with PATH_NOT_COVERED to requests that include the prefix. * Chase the referral if found, otherwise continue normally. */ - if (referral_walks_count == 0) { - int refrc = expand_dfs_referral(xid, ses, volume_info, cifs_sb, - false); - if (!refrc) { - referral_walks_count++; - goto try_mount_again; - } + old_mountdata = cifs_sb->mountdata; + (void)expand_dfs_referral(xid, ses, vol, cifs_sb, false); + + if (cifs_sb->mountdata == NULL) { + rc = -ENOENT; + goto error; } -#endif - /* check if a whole path is not remote */ - if (!rc && tcon) { - if (!server->ops->is_path_accessible) { - rc = -ENOSYS; - goto mount_fail_check; + if (cifs_sb->mountdata != old_mountdata) { + /* If we were redirected, reconnect to new target server */ + mount_put_conns(cifs_sb, xid, server, ses, tcon); + rc = mount_get_conns(vol, cifs_sb, &xid, &server, &ses, &tcon); + } + if (rc) { + if (rc == -EACCES || rc == -EOPNOTSUPP) + goto error; + /* Perform DFS failover to any other DFS targets */ + rc = mount_do_dfs_failover(root_path + 1, cifs_sb, vol, NULL, + &xid, &server, &ses, &tcon); + if (rc) + goto error; + } + + kfree(root_path); + root_path = build_unc_path_to_root(vol, cifs_sb, false); + if (IS_ERR(root_path)) { + rc = PTR_ERR(root_path); + root_path = NULL; + goto error; + } + /* Cache out resolved root server */ + (void)dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb), + root_path + 1, NULL, NULL); + /* + * Save root tcon for additional DFS requests to update or create a new + * DFS cache entry, or even perform DFS failover. 
+ */ + spin_lock(&cifs_tcp_ses_lock); + tcon->tc_count++; + tcon->dfs_path = root_path; + root_path = NULL; + tcon->remap = cifs_remap(cifs_sb); + spin_unlock(&cifs_tcp_ses_lock); + + root_tcon = tcon; + + for (count = 1; ;) { + if (!rc && tcon) { + rc = is_path_remote(cifs_sb, vol, xid, server, tcon); + if (!rc || rc != -EREMOTE) + break; } /* - * cifs_build_path_to_root works only when we have a valid tcon + * BB: when we implement proper loop detection, + * we will remove this check. But now we need it + * to prevent an indefinite loop if 'DFS tree' is + * misconfigured (i.e. has loops). */ - full_path = cifs_build_path_to_root(volume_info, cifs_sb, tcon, - tcon->Flags & SMB_SHARE_IS_IN_DFS); - if (full_path == NULL) { - rc = -ENOMEM; - goto mount_fail_check; - } - rc = server->ops->is_path_accessible(xid, tcon, cifs_sb, - full_path); - if (rc != 0 && rc != -EREMOTE) { - kfree(full_path); - goto mount_fail_check; + if (count++ > MAX_NESTED_LINKS) { + rc = -ELOOP; + break; } - if (rc != -EREMOTE) { - rc = cifs_are_all_path_components_accessible(server, - xid, tcon, cifs_sb, - full_path); - if (rc != 0) { - cifs_dbg(VFS, "cannot query dirs between root and final path, " - "enabling CIFS_MOUNT_USE_PREFIX_PATH\n"); - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; - rc = 0; - } - } kfree(full_path); - } - - /* get referral if needed */ - if (rc == -EREMOTE) { -#ifdef CONFIG_CIFS_DFS_UPCALL - if (referral_walks_count > MAX_NESTED_LINKS) { - /* - * BB: when we implement proper loop detection, - * we will remove this check. But now we need it - * to prevent an indefinite loop if 'DFS tree' is - * misconfigured (i.e. has loops). - */ - rc = -ELOOP; - goto mount_fail_check; + full_path = build_unc_path_to_root(vol, cifs_sb, true); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); + full_path = NULL; + break; } - rc = expand_dfs_referral(xid, ses, volume_info, cifs_sb, true); + old_mountdata = cifs_sb->mountdata; + rc = expand_dfs_referral(xid, root_tcon->ses, vol, cifs_sb, + true); + if (rc) + break; - if (!rc) { - referral_walks_count++; - goto try_mount_again; + if (cifs_sb->mountdata != old_mountdata) { + mount_put_conns(cifs_sb, xid, server, ses, tcon); + rc = mount_get_conns(vol, cifs_sb, &xid, &server, &ses, + &tcon); + } + if (rc) { + if (rc == -EACCES || rc == -EOPNOTSUPP) + break; + /* Perform DFS failover to any other DFS targets */ + rc = mount_do_dfs_failover(full_path + 1, cifs_sb, vol, + root_tcon->ses, &xid, + &server, &ses, &tcon); + if (rc == -EACCES || rc == -EOPNOTSUPP || !server || + !ses) + goto error; } - goto mount_fail_check; -#else /* No DFS support, return error on mount */ - rc = -EOPNOTSUPP; -#endif } + cifs_put_tcon(root_tcon); if (rc) - goto mount_fail_check; + goto error; - /* now, hang the tcon off of the superblock */ - tlink = kzalloc(sizeof *tlink, GFP_KERNEL); - if (tlink == NULL) { + spin_lock(&cifs_tcp_ses_lock); + if (!tcon->dfs_path) { + /* Save full path in new tcon to do failover when reconnecting tcons */ + tcon->dfs_path = full_path; + full_path = NULL; + tcon->remap = cifs_remap(cifs_sb); + } + cifs_sb->origin_fullpath = kstrndup(tcon->dfs_path, + strlen(tcon->dfs_path), + GFP_ATOMIC); + if (!cifs_sb->origin_fullpath) { + spin_unlock(&cifs_tcp_ses_lock); rc = -ENOMEM; - goto mount_fail_check; + goto error; } + spin_unlock(&cifs_tcp_ses_lock); - tlink->tl_uid = ses->linux_uid; - tlink->tl_tcon = tcon; - tlink->tl_time = jiffies; - set_bit(TCON_LINK_MASTER, &tlink->tl_flags); - set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags); + rc = 
dfs_cache_add_vol(vol, cifs_sb->origin_fullpath); + if (rc) { + kfree(cifs_sb->origin_fullpath); + goto error; + } + /* + * After reconnecting to a different server, unique ids won't + * match anymore, so we disable serverino. This prevents + * dentry revalidation to think the dentry are stale (ESTALE). + */ + cifs_autodisable_serverino(cifs_sb); +out: + free_xid(xid); + return mount_setup_tlink(cifs_sb, ses, tcon); - cifs_sb->master_tlink = tlink; - spin_lock(&cifs_sb->tlink_tree_lock); - tlink_rb_insert(&cifs_sb->tlink_tree, tlink); - spin_unlock(&cifs_sb->tlink_tree_lock); +error: + kfree(full_path); + kfree(root_path); + mount_put_conns(cifs_sb, xid, server, ses, tcon); + return rc; +} +#else +int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol) +{ + int rc = 0; + unsigned int xid; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + struct TCP_Server_Info *server; - queue_delayed_work(cifsiod_wq, &cifs_sb->prune_tlinks, - TLINK_IDLE_EXPIRE); + rc = mount_get_conns(vol, cifs_sb, &xid, &server, &ses, &tcon); + if (rc) + goto error; -mount_fail_check: - /* on error free sesinfo and tcon struct if needed */ - if (rc) { - /* If find_unc succeeded then rc == 0 so we can not end */ - /* up accidentally freeing someone elses tcon struct */ - if (tcon) - cifs_put_tcon(tcon); - else if (ses) - cifs_put_smb_ses(ses); - else - cifs_put_tcp_session(server, 0); + if (tcon) { + rc = is_path_remote(cifs_sb, vol, xid, server, tcon); + if (rc == -EREMOTE) + rc = -EOPNOTSUPP; + if (rc) + goto error; } -out: free_xid(xid); + + return mount_setup_tlink(cifs_sb, ses, tcon); + +error: + mount_put_conns(cifs_sb, xid, server, ses, tcon); return rc; } +#endif /* * Issue a TREE_CONNECT request. @@ -4370,6 +4860,10 @@ cifs_umount(struct cifs_sb_info *cifs_sb) kfree(cifs_sb->mountdata); kfree(cifs_sb->prepath); +#ifdef CONFIG_CIFS_DFS_UPCALL + dfs_cache_del_vol(cifs_sb->origin_fullpath); + kfree(cifs_sb->origin_fullpath); +#endif call_rcu(&cifs_sb->rcu, delayed_free); } diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c new file mode 100644 index 000000000000..cd63c4a70875 --- /dev/null +++ b/fs/cifs/dfs_cache.c @@ -0,0 +1,1367 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DFS referral cache routines + * + * Copyright (c) 2018 Paulo Alcantara <palcantara@suse.de> + */ + +#include <linux/rcupdate.h> +#include <linux/rculist.h> +#include <linux/jhash.h> +#include <linux/ktime.h> +#include <linux/slab.h> +#include <linux/nls.h> +#include <linux/workqueue.h> +#include "cifsglob.h" +#include "smb2pdu.h" +#include "smb2proto.h" +#include "cifsproto.h" +#include "cifs_debug.h" +#include "cifs_unicode.h" +#include "smb2glob.h" + +#include "dfs_cache.h" + +#define DFS_CACHE_HTABLE_SIZE 32 +#define DFS_CACHE_MAX_ENTRIES 64 + +#define IS_INTERLINK_SET(v) ((v) & (DFSREF_REFERRAL_SERVER | \ + DFSREF_STORAGE_SERVER)) + +struct dfs_cache_tgt { + char *t_name; + struct list_head t_list; +}; + +struct dfs_cache_entry { + struct hlist_node ce_hlist; + const char *ce_path; + int ce_ttl; + int ce_srvtype; + int ce_flags; + struct timespec64 ce_etime; + int ce_path_consumed; + int ce_numtgts; + struct list_head ce_tlist; + struct dfs_cache_tgt *ce_tgthint; + struct rcu_head ce_rcu; +}; + +static struct kmem_cache *dfs_cache_slab __read_mostly; + +struct dfs_cache_vol_info { + char *vi_fullpath; + struct smb_vol vi_vol; + struct list_head vi_list; +}; + +struct dfs_cache { + struct mutex dc_lock; + struct nls_table *dc_nlsc; + struct list_head dc_vol_list; + int dc_ttl; + struct delayed_work dc_refresh; +}; + +static 
struct dfs_cache dfs_cache; + +/* + * Number of entries in the cache + */ +static size_t dfs_cache_count; + +static DEFINE_MUTEX(dfs_cache_list_lock); +static struct hlist_head dfs_cache_htable[DFS_CACHE_HTABLE_SIZE]; + +static void refresh_cache_worker(struct work_struct *work); + +static inline bool is_path_valid(const char *path) +{ + return path && (strchr(path + 1, '\\') || strchr(path + 1, '/')); +} + +static inline int get_normalized_path(const char *path, char **npath) +{ + if (*path == '\\') { + *npath = (char *)path; + } else { + *npath = kstrndup(path, strlen(path), GFP_KERNEL); + if (!*npath) + return -ENOMEM; + convert_delimiter(*npath, '\\'); + } + return 0; +} + +static inline void free_normalized_path(const char *path, char *npath) +{ + if (path != npath) + kfree(npath); +} + +static inline bool cache_entry_expired(const struct dfs_cache_entry *ce) +{ + struct timespec64 ts; + + ktime_get_coarse_real_ts64(&ts); + return timespec64_compare(&ts, &ce->ce_etime) >= 0; +} + +static inline void free_tgts(struct dfs_cache_entry *ce) +{ + struct dfs_cache_tgt *t, *n; + + list_for_each_entry_safe(t, n, &ce->ce_tlist, t_list) { + list_del(&t->t_list); + kfree(t->t_name); + kfree(t); + } +} + +static void free_cache_entry(struct rcu_head *rcu) +{ + struct dfs_cache_entry *ce = container_of(rcu, struct dfs_cache_entry, + ce_rcu); + kmem_cache_free(dfs_cache_slab, ce); +} + +static inline void flush_cache_ent(struct dfs_cache_entry *ce) +{ + if (hlist_unhashed(&ce->ce_hlist)) + return; + + hlist_del_init_rcu(&ce->ce_hlist); + kfree(ce->ce_path); + free_tgts(ce); + dfs_cache_count--; + call_rcu(&ce->ce_rcu, free_cache_entry); +} + +static void flush_cache_ents(void) +{ + int i; + + rcu_read_lock(); + for (i = 0; i < DFS_CACHE_HTABLE_SIZE; i++) { + struct hlist_head *l = &dfs_cache_htable[i]; + struct dfs_cache_entry *ce; + + hlist_for_each_entry_rcu(ce, l, ce_hlist) + flush_cache_ent(ce); + } + rcu_read_unlock(); +} + +/* + * dfs cache /proc file + */ +static int dfscache_proc_show(struct seq_file *m, void *v) +{ + int bucket; + struct dfs_cache_entry *ce; + struct dfs_cache_tgt *t; + + seq_puts(m, "DFS cache\n---------\n"); + + mutex_lock(&dfs_cache_list_lock); + + rcu_read_lock(); + hash_for_each_rcu(dfs_cache_htable, bucket, ce, ce_hlist) { + seq_printf(m, + "cache entry: path=%s,type=%s,ttl=%d,etime=%ld," + "interlink=%s,path_consumed=%d,expired=%s\n", + ce->ce_path, + ce->ce_srvtype == DFS_TYPE_ROOT ? "root" : "link", + ce->ce_ttl, ce->ce_etime.tv_nsec, + IS_INTERLINK_SET(ce->ce_flags) ? "yes" : "no", + ce->ce_path_consumed, + cache_entry_expired(ce) ? "yes" : "no"); + + list_for_each_entry(t, &ce->ce_tlist, t_list) { + seq_printf(m, " %s%s\n", + t->t_name, + ce->ce_tgthint == t ? 
" (target hint)" : ""); + } + + } + rcu_read_unlock(); + + mutex_unlock(&dfs_cache_list_lock); + return 0; +} + +static ssize_t dfscache_proc_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char c; + int rc; + + rc = get_user(c, buffer); + if (rc) + return rc; + + if (c != '0') + return -EINVAL; + + cifs_dbg(FYI, "clearing dfs cache"); + mutex_lock(&dfs_cache_list_lock); + flush_cache_ents(); + mutex_unlock(&dfs_cache_list_lock); + + return count; +} + +static int dfscache_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, dfscache_proc_show, NULL); +} + +const struct file_operations dfscache_proc_fops = { + .open = dfscache_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = dfscache_proc_write, +}; + +#ifdef CONFIG_CIFS_DEBUG2 +static inline void dump_tgts(const struct dfs_cache_entry *ce) +{ + struct dfs_cache_tgt *t; + + cifs_dbg(FYI, "target list:\n"); + list_for_each_entry(t, &ce->ce_tlist, t_list) { + cifs_dbg(FYI, " %s%s\n", t->t_name, + ce->ce_tgthint == t ? " (target hint)" : ""); + } +} + +static inline void dump_ce(const struct dfs_cache_entry *ce) +{ + cifs_dbg(FYI, "cache entry: path=%s,type=%s,ttl=%d,etime=%ld," + "interlink=%s,path_consumed=%d,expired=%s\n", ce->ce_path, + ce->ce_srvtype == DFS_TYPE_ROOT ? "root" : "link", ce->ce_ttl, + ce->ce_etime.tv_nsec, + IS_INTERLINK_SET(ce->ce_flags) ? "yes" : "no", + ce->ce_path_consumed, + cache_entry_expired(ce) ? "yes" : "no"); + dump_tgts(ce); +} + +static inline void dump_refs(const struct dfs_info3_param *refs, int numrefs) +{ + int i; + + cifs_dbg(FYI, "DFS referrals returned by the server:\n"); + for (i = 0; i < numrefs; i++) { + const struct dfs_info3_param *ref = &refs[i]; + + cifs_dbg(FYI, + "\n" + "flags: 0x%x\n" + "path_consumed: %d\n" + "server_type: 0x%x\n" + "ref_flag: 0x%x\n" + "path_name: %s\n" + "node_name: %s\n" + "ttl: %d (%dm)\n", + ref->flags, ref->path_consumed, ref->server_type, + ref->ref_flag, ref->path_name, ref->node_name, + ref->ttl, ref->ttl / 60); + } +} +#else +#define dump_tgts(e) +#define dump_ce(e) +#define dump_refs(r, n) +#endif + +/** + * dfs_cache_init - Initialize DFS referral cache. + * + * Return zero if initialized successfully, otherwise non-zero. 
+ */ +int dfs_cache_init(void) +{ + int i; + + dfs_cache_slab = kmem_cache_create("cifs_dfs_cache", + sizeof(struct dfs_cache_entry), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!dfs_cache_slab) + return -ENOMEM; + + for (i = 0; i < DFS_CACHE_HTABLE_SIZE; i++) + INIT_HLIST_HEAD(&dfs_cache_htable[i]); + + INIT_LIST_HEAD(&dfs_cache.dc_vol_list); + mutex_init(&dfs_cache.dc_lock); + INIT_DELAYED_WORK(&dfs_cache.dc_refresh, refresh_cache_worker); + dfs_cache.dc_ttl = -1; + dfs_cache.dc_nlsc = load_nls_default(); + + cifs_dbg(FYI, "%s: initialized DFS referral cache\n", __func__); + return 0; +} + +static inline unsigned int cache_entry_hash(const void *data, int size) +{ + unsigned int h; + + h = jhash(data, size, 0); + return h & (DFS_CACHE_HTABLE_SIZE - 1); +} + +/* Check whether second path component of @path is SYSVOL or NETLOGON */ +static inline bool is_sysvol_or_netlogon(const char *path) +{ + const char *s; + char sep = path[0]; + + s = strchr(path + 1, sep) + 1; + return !strncasecmp(s, "sysvol", strlen("sysvol")) || + !strncasecmp(s, "netlogon", strlen("netlogon")); +} + +/* Return target hint of a DFS cache entry */ +static inline char *get_tgt_name(const struct dfs_cache_entry *ce) +{ + struct dfs_cache_tgt *t = ce->ce_tgthint; + + return t ? t->t_name : ERR_PTR(-ENOENT); +} + +/* Return expire time out of a new entry's TTL */ +static inline struct timespec64 get_expire_time(int ttl) +{ + struct timespec64 ts = { + .tv_sec = ttl, + .tv_nsec = 0, + }; + struct timespec64 now; + + ktime_get_coarse_real_ts64(&now); + return timespec64_add(now, ts); +} + +/* Allocate a new DFS target */ +static inline struct dfs_cache_tgt *alloc_tgt(const char *name) +{ + struct dfs_cache_tgt *t; + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return ERR_PTR(-ENOMEM); + t->t_name = kstrndup(name, strlen(name), GFP_KERNEL); + if (!t->t_name) { + kfree(t); + return ERR_PTR(-ENOMEM); + } + INIT_LIST_HEAD(&t->t_list); + return t; +} + +/* + * Copy DFS referral information to a cache entry and conditionally update + * target hint. 
+ */ +static int copy_ref_data(const struct dfs_info3_param *refs, int numrefs, + struct dfs_cache_entry *ce, const char *tgthint) +{ + int i; + + ce->ce_ttl = refs[0].ttl; + ce->ce_etime = get_expire_time(ce->ce_ttl); + ce->ce_srvtype = refs[0].server_type; + ce->ce_flags = refs[0].ref_flag; + ce->ce_path_consumed = refs[0].path_consumed; + + for (i = 0; i < numrefs; i++) { + struct dfs_cache_tgt *t; + + t = alloc_tgt(refs[i].node_name); + if (IS_ERR(t)) { + free_tgts(ce); + return PTR_ERR(t); + } + if (tgthint && !strcasecmp(t->t_name, tgthint)) { + list_add(&t->t_list, &ce->ce_tlist); + tgthint = NULL; + } else { + list_add_tail(&t->t_list, &ce->ce_tlist); + } + ce->ce_numtgts++; + } + + ce->ce_tgthint = list_first_entry_or_null(&ce->ce_tlist, + struct dfs_cache_tgt, t_list); + + return 0; +} + +/* Allocate a new cache entry */ +static struct dfs_cache_entry * +alloc_cache_entry(const char *path, const struct dfs_info3_param *refs, + int numrefs) +{ + struct dfs_cache_entry *ce; + int rc; + + ce = kmem_cache_zalloc(dfs_cache_slab, GFP_KERNEL); + if (!ce) + return ERR_PTR(-ENOMEM); + + ce->ce_path = kstrdup_const(path, GFP_KERNEL); + if (!ce->ce_path) { + kmem_cache_free(dfs_cache_slab, ce); + return ERR_PTR(-ENOMEM); + } + INIT_HLIST_NODE(&ce->ce_hlist); + INIT_LIST_HEAD(&ce->ce_tlist); + + rc = copy_ref_data(refs, numrefs, ce, NULL); + if (rc) { + kfree(ce->ce_path); + kmem_cache_free(dfs_cache_slab, ce); + ce = ERR_PTR(rc); + } + return ce; +} + +static void remove_oldest_entry(void) +{ + int bucket; + struct dfs_cache_entry *ce; + struct dfs_cache_entry *to_del = NULL; + + rcu_read_lock(); + hash_for_each_rcu(dfs_cache_htable, bucket, ce, ce_hlist) { + if (!to_del || timespec64_compare(&ce->ce_etime, + &to_del->ce_etime) < 0) + to_del = ce; + } + if (!to_del) { + cifs_dbg(FYI, "%s: no entry to remove", __func__); + goto out; + } + cifs_dbg(FYI, "%s: removing entry", __func__); + dump_ce(to_del); + flush_cache_ent(to_del); +out: + rcu_read_unlock(); +} + +/* Add a new DFS cache entry */ +static inline struct dfs_cache_entry * +add_cache_entry(unsigned int hash, const char *path, + const struct dfs_info3_param *refs, int numrefs) +{ + struct dfs_cache_entry *ce; + + ce = alloc_cache_entry(path, refs, numrefs); + if (IS_ERR(ce)) + return ce; + + hlist_add_head_rcu(&ce->ce_hlist, &dfs_cache_htable[hash]); + + mutex_lock(&dfs_cache.dc_lock); + if (dfs_cache.dc_ttl < 0) { + dfs_cache.dc_ttl = ce->ce_ttl; + queue_delayed_work(cifsiod_wq, &dfs_cache.dc_refresh, + dfs_cache.dc_ttl * HZ); + } else { + dfs_cache.dc_ttl = min_t(int, dfs_cache.dc_ttl, ce->ce_ttl); + mod_delayed_work(cifsiod_wq, &dfs_cache.dc_refresh, + dfs_cache.dc_ttl * HZ); + } + mutex_unlock(&dfs_cache.dc_lock); + + return ce; +} + +static struct dfs_cache_entry *__find_cache_entry(unsigned int hash, + const char *path) +{ + struct dfs_cache_entry *ce; + bool found = false; + + rcu_read_lock(); + hlist_for_each_entry_rcu(ce, &dfs_cache_htable[hash], ce_hlist) { + if (!strcasecmp(path, ce->ce_path)) { +#ifdef CONFIG_CIFS_DEBUG2 + char *name = get_tgt_name(ce); + + if (unlikely(IS_ERR(name))) { + rcu_read_unlock(); + return ERR_CAST(name); + } + cifs_dbg(FYI, "%s: cache hit\n", __func__); + cifs_dbg(FYI, "%s: target hint: %s\n", __func__, name); +#endif + found = true; + break; + } + } + rcu_read_unlock(); + return found ? ce : ERR_PTR(-ENOENT); +} + +/* + * Find a DFS cache entry in hash table and optionally check prefix path against + * @path. + * Use whole path components in the match. 
+ * Return ERR_PTR(-ENOENT) if the entry is not found. + */ +static inline struct dfs_cache_entry *find_cache_entry(const char *path, + unsigned int *hash) +{ + *hash = cache_entry_hash(path, strlen(path)); + return __find_cache_entry(*hash, path); +} + +static inline void destroy_slab_cache(void) +{ + rcu_barrier(); + kmem_cache_destroy(dfs_cache_slab); +} + +static inline void free_vol(struct dfs_cache_vol_info *vi) +{ + list_del(&vi->vi_list); + kfree(vi->vi_fullpath); + cifs_cleanup_volume_info_contents(&vi->vi_vol); + kfree(vi); +} + +static inline void free_vol_list(void) +{ + struct dfs_cache_vol_info *vi, *nvi; + + list_for_each_entry_safe(vi, nvi, &dfs_cache.dc_vol_list, vi_list) + free_vol(vi); +} + +/** + * dfs_cache_destroy - destroy DFS referral cache + */ +void dfs_cache_destroy(void) +{ + cancel_delayed_work_sync(&dfs_cache.dc_refresh); + unload_nls(dfs_cache.dc_nlsc); + free_vol_list(); + mutex_destroy(&dfs_cache.dc_lock); + + flush_cache_ents(); + destroy_slab_cache(); + mutex_destroy(&dfs_cache_list_lock); + + cifs_dbg(FYI, "%s: destroyed DFS referral cache\n", __func__); +} + +static inline struct dfs_cache_entry * +__update_cache_entry(const char *path, const struct dfs_info3_param *refs, + int numrefs) +{ + int rc; + unsigned int h; + struct dfs_cache_entry *ce; + char *s, *th = NULL; + + ce = find_cache_entry(path, &h); + if (IS_ERR(ce)) + return ce; + + if (ce->ce_tgthint) { + s = ce->ce_tgthint->t_name; + th = kstrndup(s, strlen(s), GFP_KERNEL); + if (!th) + return ERR_PTR(-ENOMEM); + } + + free_tgts(ce); + ce->ce_numtgts = 0; + + rc = copy_ref_data(refs, numrefs, ce, th); + kfree(th); + + if (rc) + ce = ERR_PTR(rc); + + return ce; +} + +/* Update an expired cache entry by getting a new DFS referral from server */ +static struct dfs_cache_entry * +update_cache_entry(const unsigned int xid, struct cifs_ses *ses, + const struct nls_table *nls_codepage, int remap, + const char *path, struct dfs_cache_entry *ce) +{ + int rc; + struct dfs_info3_param *refs = NULL; + int numrefs = 0; + + cifs_dbg(FYI, "%s: update expired cache entry\n", __func__); + /* + * Check if caller provided enough parameters to update an expired + * entry. + */ + if (!ses || !ses->server || !ses->server->ops->get_dfs_refer) + return ERR_PTR(-ETIME); + if (unlikely(!nls_codepage)) + return ERR_PTR(-ETIME); + + cifs_dbg(FYI, "%s: DFS referral request for %s\n", __func__, path); + + rc = ses->server->ops->get_dfs_refer(xid, ses, path, &refs, &numrefs, + nls_codepage, remap); + if (rc) + ce = ERR_PTR(rc); + else + ce = __update_cache_entry(path, refs, numrefs); + + dump_refs(refs, numrefs); + free_dfs_info_array(refs, numrefs); + + return ce; +} + +/* + * Find, create or update a DFS cache entry. + * + * If the entry wasn't found, it will create a new one. Or if it was found but + * expired, then it will update the entry accordingly. + * + * For interlinks, __cifs_dfs_mount() and expand_dfs_referral() are supposed to + * handle them properly. 
+ */ +static struct dfs_cache_entry * +do_dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, + const struct nls_table *nls_codepage, int remap, + const char *path, bool noreq) +{ + int rc; + unsigned int h; + struct dfs_cache_entry *ce; + struct dfs_info3_param *nrefs; + int numnrefs; + + cifs_dbg(FYI, "%s: search path: %s\n", __func__, path); + + ce = find_cache_entry(path, &h); + if (IS_ERR(ce)) { + cifs_dbg(FYI, "%s: cache miss\n", __func__); + /* + * If @noreq is set, no requests will be sent to the server for + * either updating or getting a new DFS referral. + */ + if (noreq) + return ce; + /* + * No cache entry was found, so check for valid parameters that + * will be required to get a new DFS referral and then create a + * new cache entry. + */ + if (!ses || !ses->server || !ses->server->ops->get_dfs_refer) { + ce = ERR_PTR(-EOPNOTSUPP); + return ce; + } + if (unlikely(!nls_codepage)) { + ce = ERR_PTR(-EINVAL); + return ce; + } + + nrefs = NULL; + numnrefs = 0; + + cifs_dbg(FYI, "%s: DFS referral request for %s\n", __func__, + path); + + rc = ses->server->ops->get_dfs_refer(xid, ses, path, &nrefs, + &numnrefs, nls_codepage, + remap); + if (rc) { + ce = ERR_PTR(rc); + return ce; + } + + dump_refs(nrefs, numnrefs); + + cifs_dbg(FYI, "%s: new cache entry\n", __func__); + + if (dfs_cache_count >= DFS_CACHE_MAX_ENTRIES) { + cifs_dbg(FYI, "%s: reached max cache size (%d)", + __func__, DFS_CACHE_MAX_ENTRIES); + remove_oldest_entry(); + } + ce = add_cache_entry(h, path, nrefs, numnrefs); + free_dfs_info_array(nrefs, numnrefs); + + if (IS_ERR(ce)) + return ce; + + dfs_cache_count++; + } + + dump_ce(ce); + + /* Just return the found cache entry in case @noreq is set */ + if (noreq) + return ce; + + if (cache_entry_expired(ce)) { + cifs_dbg(FYI, "%s: expired cache entry\n", __func__); + ce = update_cache_entry(xid, ses, nls_codepage, remap, path, + ce); + if (IS_ERR(ce)) { + cifs_dbg(FYI, "%s: failed to update expired entry\n", + __func__); + } + } + return ce; +} + +/* Set up a new DFS referral from a given cache entry */ +static int setup_ref(const char *path, const struct dfs_cache_entry *ce, + struct dfs_info3_param *ref, const char *tgt) +{ + int rc; + + cifs_dbg(FYI, "%s: set up new ref\n", __func__); + + memset(ref, 0, sizeof(*ref)); + + ref->path_name = kstrndup(path, strlen(path), GFP_KERNEL); + if (!ref->path_name) + return -ENOMEM; + + ref->path_consumed = ce->ce_path_consumed; + + ref->node_name = kstrndup(tgt, strlen(tgt), GFP_KERNEL); + if (!ref->node_name) { + rc = -ENOMEM; + goto err_free_path; + } + + ref->ttl = ce->ce_ttl; + ref->server_type = ce->ce_srvtype; + ref->ref_flag = ce->ce_flags; + + return 0; + +err_free_path: + kfree(ref->path_name); + ref->path_name = NULL; + return rc; +} + +/* Return target list of a DFS cache entry */ +static int get_tgt_list(const struct dfs_cache_entry *ce, + struct dfs_cache_tgt_list *tl) +{ + int rc; + struct list_head *head = &tl->tl_list; + struct dfs_cache_tgt *t; + struct dfs_cache_tgt_iterator *it, *nit; + + memset(tl, 0, sizeof(*tl)); + INIT_LIST_HEAD(head); + + list_for_each_entry(t, &ce->ce_tlist, t_list) { + it = kzalloc(sizeof(*it), GFP_KERNEL); + if (!it) { + rc = -ENOMEM; + goto err_free_it; + } + + it->it_name = kstrndup(t->t_name, strlen(t->t_name), + GFP_KERNEL); + if (!it->it_name) { + rc = -ENOMEM; + goto err_free_it; + } + + if (ce->ce_tgthint == t) + list_add(&it->it_list, head); + else + list_add_tail(&it->it_list, head); + } + tl->tl_numtgts = ce->ce_numtgts; + + return 0; + +err_free_it: + 
list_for_each_entry_safe(it, nit, head, it_list) { + kfree(it->it_name); + kfree(it); + } + return rc; +} + +/** + * dfs_cache_find - find a DFS cache entry + * + * If it doesn't find the cache entry, then it will get a DFS referral + * for @path and create a new entry. + * + * In case the cache entry exists but expired, it will get a DFS referral + * for @path and then update the respective cache entry. + * + * These parameters are passed down to the get_dfs_refer() call if it + * needs to be issued: + * @xid: syscall xid + * @ses: smb session to issue the request on + * @nls_codepage: charset conversion + * @remap: path character remapping type + * @path: path to lookup in DFS referral cache. + * + * @ref: when non-NULL, store single DFS referral result in it. + * @tgt_list: when non-NULL, store complete DFS target list in it. + * + * Return zero if the target was found, otherwise non-zero. + */ +int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, + const struct nls_table *nls_codepage, int remap, + const char *path, struct dfs_info3_param *ref, + struct dfs_cache_tgt_list *tgt_list) +{ + int rc; + char *npath; + struct dfs_cache_entry *ce; + + if (unlikely(!is_path_valid(path))) + return -EINVAL; + + rc = get_normalized_path(path, &npath); + if (rc) + return rc; + + mutex_lock(&dfs_cache_list_lock); + ce = do_dfs_cache_find(xid, ses, nls_codepage, remap, npath, false); + if (!IS_ERR(ce)) { + if (ref) + rc = setup_ref(path, ce, ref, get_tgt_name(ce)); + else + rc = 0; + if (!rc && tgt_list) + rc = get_tgt_list(ce, tgt_list); + } else { + rc = PTR_ERR(ce); + } + mutex_unlock(&dfs_cache_list_lock); + free_normalized_path(path, npath); + return rc; +} + +/** + * dfs_cache_noreq_find - find a DFS cache entry without sending any requests to + * the currently connected server. + * + * NOTE: This function will neither update a cache entry in case it was + * expired, nor create a new cache entry if @path hasn't been found. It heavily + * relies on an existing cache entry. + * + * @path: path to lookup in the DFS referral cache. + * @ref: when non-NULL, store single DFS referral result in it. + * @tgt_list: when non-NULL, store complete DFS target list in it. + * + * Return 0 if successful. + * Return -ENOENT if the entry was not found. + * Return non-zero for other errors. + */ +int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref, + struct dfs_cache_tgt_list *tgt_list) +{ + int rc; + char *npath; + struct dfs_cache_entry *ce; + + if (unlikely(!is_path_valid(path))) + return -EINVAL; + + rc = get_normalized_path(path, &npath); + if (rc) + return rc; + + mutex_lock(&dfs_cache_list_lock); + ce = do_dfs_cache_find(0, NULL, NULL, 0, npath, true); + if (IS_ERR(ce)) { + rc = PTR_ERR(ce); + goto out; + } + + if (ref) + rc = setup_ref(path, ce, ref, get_tgt_name(ce)); + else + rc = 0; + if (!rc && tgt_list) + rc = get_tgt_list(ce, tgt_list); +out: + mutex_unlock(&dfs_cache_list_lock); + free_normalized_path(path, npath); + return rc; +} + +/** + * dfs_cache_update_tgthint - update target hint of a DFS cache entry + * + * If it doesn't find the cache entry, then it will get a DFS referral for @path + * and create a new entry. + * + * In case the cache entry exists but expired, it will get a DFS referral + * for @path and then update the respective cache entry. + * + * @xid: syscall id + * @ses: smb session + * @nls_codepage: charset conversion + * @remap: type of character remapping for paths + * @path: path to lookup in DFS referral cache. 
+ * @it: DFS target iterator + * + * Return zero if the target hint was updated successfully, otherwise non-zero. + */ +int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses, + const struct nls_table *nls_codepage, int remap, + const char *path, + const struct dfs_cache_tgt_iterator *it) +{ + int rc; + char *npath; + struct dfs_cache_entry *ce; + struct dfs_cache_tgt *t; + + if (unlikely(!is_path_valid(path))) + return -EINVAL; + + rc = get_normalized_path(path, &npath); + if (rc) + return rc; + + cifs_dbg(FYI, "%s: path: %s\n", __func__, npath); + + mutex_lock(&dfs_cache_list_lock); + ce = do_dfs_cache_find(xid, ses, nls_codepage, remap, npath, false); + if (IS_ERR(ce)) { + rc = PTR_ERR(ce); + goto out; + } + + rc = 0; + + t = ce->ce_tgthint; + + if (likely(!strcasecmp(it->it_name, t->t_name))) + goto out; + + list_for_each_entry(t, &ce->ce_tlist, t_list) { + if (!strcasecmp(t->t_name, it->it_name)) { + ce->ce_tgthint = t; + cifs_dbg(FYI, "%s: new target hint: %s\n", __func__, + it->it_name); + break; + } + } + +out: + mutex_unlock(&dfs_cache_list_lock); + free_normalized_path(path, npath); + return rc; +} + +/** + * dfs_cache_noreq_update_tgthint - update target hint of a DFS cache entry + * without sending any requests to the currently connected server. + * + * NOTE: This function will neither update a cache entry in case it was + * expired, nor create a new cache entry if @path hasn't been found. It heavily + * relies on an existing cache entry. + * + * @path: path to lookup in DFS referral cache. + * @it: target iterator which contains the target hint to update the cache + * entry with. + * + * Return zero if the target hint was updated successfully, otherwise non-zero. + */ +int dfs_cache_noreq_update_tgthint(const char *path, + const struct dfs_cache_tgt_iterator *it) +{ + int rc; + char *npath; + struct dfs_cache_entry *ce; + struct dfs_cache_tgt *t; + + if (unlikely(!is_path_valid(path)) || !it) + return -EINVAL; + + rc = get_normalized_path(path, &npath); + if (rc) + return rc; + + cifs_dbg(FYI, "%s: path: %s\n", __func__, npath); + + mutex_lock(&dfs_cache_list_lock); + + ce = do_dfs_cache_find(0, NULL, NULL, 0, npath, true); + if (IS_ERR(ce)) { + rc = PTR_ERR(ce); + goto out; + } + + rc = 0; + + t = ce->ce_tgthint; + + if (unlikely(!strcasecmp(it->it_name, t->t_name))) + goto out; + + list_for_each_entry(t, &ce->ce_tlist, t_list) { + if (!strcasecmp(t->t_name, it->it_name)) { + ce->ce_tgthint = t; + cifs_dbg(FYI, "%s: new target hint: %s\n", __func__, + it->it_name); + break; + } + } + +out: + mutex_unlock(&dfs_cache_list_lock); + free_normalized_path(path, npath); + return rc; +} + +/** + * dfs_cache_get_tgt_referral - returns a DFS referral (@ref) from a given + * target iterator (@it). + * + * @path: path to lookup in DFS referral cache. + * @it: DFS target iterator. + * @ref: DFS referral pointer to set up the gathered information. + * + * Return zero if the DFS referral was set up correctly, otherwise non-zero. 
+ */ +int dfs_cache_get_tgt_referral(const char *path, + const struct dfs_cache_tgt_iterator *it, + struct dfs_info3_param *ref) +{ + int rc; + char *npath; + struct dfs_cache_entry *ce; + unsigned int h; + + if (!it || !ref) + return -EINVAL; + if (unlikely(!is_path_valid(path))) + return -EINVAL; + + rc = get_normalized_path(path, &npath); + if (rc) + return rc; + + cifs_dbg(FYI, "%s: path: %s\n", __func__, npath); + + mutex_lock(&dfs_cache_list_lock); + + ce = find_cache_entry(npath, &h); + if (IS_ERR(ce)) { + rc = PTR_ERR(ce); + goto out; + } + + cifs_dbg(FYI, "%s: target name: %s\n", __func__, it->it_name); + + rc = setup_ref(path, ce, ref, it->it_name); + +out: + mutex_unlock(&dfs_cache_list_lock); + free_normalized_path(path, npath); + return rc; +} + +static int dup_vol(struct smb_vol *vol, struct smb_vol *new) +{ + memcpy(new, vol, sizeof(*new)); + + if (vol->username) { + new->username = kstrndup(vol->username, strlen(vol->username), + GFP_KERNEL); + if (!new->username) + return -ENOMEM; + } + if (vol->password) { + new->password = kstrndup(vol->password, strlen(vol->password), + GFP_KERNEL); + if (!new->password) + goto err_free_username; + } + if (vol->UNC) { + cifs_dbg(FYI, "%s: vol->UNC: %s\n", __func__, vol->UNC); + new->UNC = kstrndup(vol->UNC, strlen(vol->UNC), GFP_KERNEL); + if (!new->UNC) + goto err_free_password; + } + if (vol->domainname) { + new->domainname = kstrndup(vol->domainname, + strlen(vol->domainname), GFP_KERNEL); + if (!new->domainname) + goto err_free_unc; + } + if (vol->iocharset) { + new->iocharset = kstrndup(vol->iocharset, + strlen(vol->iocharset), GFP_KERNEL); + if (!new->iocharset) + goto err_free_domainname; + } + if (vol->prepath) { + cifs_dbg(FYI, "%s: vol->prepath: %s\n", __func__, vol->prepath); + new->prepath = kstrndup(vol->prepath, strlen(vol->prepath), + GFP_KERNEL); + if (!new->prepath) + goto err_free_iocharset; + } + + return 0; + +err_free_iocharset: + kfree(new->iocharset); +err_free_domainname: + kfree(new->domainname); +err_free_unc: + kfree(new->UNC); +err_free_password: + kzfree(new->password); +err_free_username: + kfree(new->username); + kfree(new); + return -ENOMEM; +} + +/** + * dfs_cache_add_vol - add a cifs volume during mount() that will be handled by + * DFS cache refresh worker. + * + * @vol: cifs volume. + * @fullpath: origin full path. + * + * Return zero if volume was set up correctly, otherwise non-zero. 
+ */ +int dfs_cache_add_vol(struct smb_vol *vol, const char *fullpath) +{ + int rc; + struct dfs_cache_vol_info *vi; + + if (!vol || !fullpath) + return -EINVAL; + + cifs_dbg(FYI, "%s: fullpath: %s\n", __func__, fullpath); + + vi = kzalloc(sizeof(*vi), GFP_KERNEL); + if (!vi) + return -ENOMEM; + + vi->vi_fullpath = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL); + if (!vi->vi_fullpath) { + rc = -ENOMEM; + goto err_free_vi; + } + + rc = dup_vol(vol, &vi->vi_vol); + if (rc) + goto err_free_fullpath; + + mutex_lock(&dfs_cache.dc_lock); + list_add_tail(&vi->vi_list, &dfs_cache.dc_vol_list); + mutex_unlock(&dfs_cache.dc_lock); + return 0; + +err_free_fullpath: + kfree(vi->vi_fullpath); +err_free_vi: + kfree(vi); + return rc; +} + +static inline struct dfs_cache_vol_info *find_vol(const char *fullpath) +{ + struct dfs_cache_vol_info *vi; + + list_for_each_entry(vi, &dfs_cache.dc_vol_list, vi_list) { + cifs_dbg(FYI, "%s: vi->vi_fullpath: %s\n", __func__, + vi->vi_fullpath); + if (!strcasecmp(vi->vi_fullpath, fullpath)) + return vi; + } + return ERR_PTR(-ENOENT); +} + +/** + * dfs_cache_update_vol - update vol info in DFS cache after failover + * + * @fullpath: fullpath to look up in volume list. + * @server: TCP ses pointer. + * + * Return zero if volume was updated, otherwise non-zero. + */ +int dfs_cache_update_vol(const char *fullpath, struct TCP_Server_Info *server) +{ + int rc; + struct dfs_cache_vol_info *vi; + + if (!fullpath || !server) + return -EINVAL; + + cifs_dbg(FYI, "%s: fullpath: %s\n", __func__, fullpath); + + mutex_lock(&dfs_cache.dc_lock); + + vi = find_vol(fullpath); + if (IS_ERR(vi)) { + rc = PTR_ERR(vi); + goto out; + } + + cifs_dbg(FYI, "%s: updating volume info\n", __func__); + memcpy(&vi->vi_vol.dstaddr, &server->dstaddr, + sizeof(vi->vi_vol.dstaddr)); + rc = 0; + +out: + mutex_unlock(&dfs_cache.dc_lock); + return rc; +} + +/** + * dfs_cache_del_vol - remove volume info in DFS cache during umount() + * + * @fullpath: fullpath to look up in volume list. 
+ */ +void dfs_cache_del_vol(const char *fullpath) +{ + struct dfs_cache_vol_info *vi; + + if (!fullpath || !*fullpath) + return; + + cifs_dbg(FYI, "%s: fullpath: %s\n", __func__, fullpath); + + mutex_lock(&dfs_cache.dc_lock); + vi = find_vol(fullpath); + if (!IS_ERR(vi)) + free_vol(vi); + mutex_unlock(&dfs_cache.dc_lock); +} + +/* Get all tcons that are within a DFS namespace and can be refreshed */ +static void get_tcons(struct TCP_Server_Info *server, struct list_head *head) +{ + struct cifs_ses *ses; + struct cifs_tcon *tcon; + + INIT_LIST_HEAD(head); + + spin_lock(&cifs_tcp_ses_lock); + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { + if (!tcon->need_reconnect && !tcon->need_reopen_files && + tcon->dfs_path) { + tcon->tc_count++; + list_add_tail(&tcon->ulist, head); + } + } + if (ses->tcon_ipc && !ses->tcon_ipc->need_reconnect && + ses->tcon_ipc->dfs_path) { + list_add_tail(&ses->tcon_ipc->ulist, head); + } + } + spin_unlock(&cifs_tcp_ses_lock); +} + +/* Refresh DFS cache entry from a given tcon */ +static void do_refresh_tcon(struct dfs_cache *dc, struct cifs_tcon *tcon) +{ + int rc = 0; + unsigned int xid; + char *path, *npath; + unsigned int h; + struct dfs_cache_entry *ce; + struct dfs_info3_param *refs = NULL; + int numrefs = 0; + + xid = get_xid(); + + path = tcon->dfs_path + 1; + + rc = get_normalized_path(path, &npath); + if (rc) + goto out; + + mutex_lock(&dfs_cache_list_lock); + ce = find_cache_entry(npath, &h); + mutex_unlock(&dfs_cache_list_lock); + + if (IS_ERR(ce)) { + rc = PTR_ERR(ce); + goto out; + } + + if (!cache_entry_expired(ce)) + goto out; + + if (unlikely(!tcon->ses->server->ops->get_dfs_refer)) { + rc = -EOPNOTSUPP; + } else { + rc = tcon->ses->server->ops->get_dfs_refer(xid, tcon->ses, path, + &refs, &numrefs, + dc->dc_nlsc, + tcon->remap); + if (!rc) { + mutex_lock(&dfs_cache_list_lock); + ce = __update_cache_entry(npath, refs, numrefs); + mutex_unlock(&dfs_cache_list_lock); + dump_refs(refs, numrefs); + free_dfs_info_array(refs, numrefs); + if (IS_ERR(ce)) + rc = PTR_ERR(ce); + } + } + if (rc) + cifs_dbg(FYI, "%s: failed to update expired entry\n", __func__); +out: + free_xid(xid); + free_normalized_path(path, npath); +} + +/* + * Worker that will refresh DFS cache based on lowest TTL value from a DFS + * referral. + * + * FIXME: ensure that all requests are sent to DFS root for refreshing the + * cache. 
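[Editorial aside, not part of the patch: refresh_cache_worker() below is a self-rearming delayed work item: it walks the registered volumes, refreshes expired cache entries, then queues itself again dc_ttl seconds later. A generic sketch of that self-rearming pattern, with all names hypothetical:]

	#include <linux/workqueue.h>

	struct example_cache {			/* hypothetical */
		int ttl;			/* seconds between refresh runs */
		struct delayed_work refresh;
	};

	static void example_refresh_worker(struct work_struct *work)
	{
		struct example_cache *c = container_of(work, struct example_cache,
						       refresh.work);

		/* ... refresh whatever has expired ... */

		/* re-arm: run again after ttl seconds */
		queue_delayed_work(system_wq, &c->refresh, c->ttl * HZ);
	}

	static void example_cache_start(struct example_cache *c)
	{
		INIT_DELAYED_WORK(&c->refresh, example_refresh_worker);
		queue_delayed_work(system_wq, &c->refresh, c->ttl * HZ);
	}

[End of aside.]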
+ */ +static void refresh_cache_worker(struct work_struct *work) +{ + struct dfs_cache *dc = container_of(work, struct dfs_cache, + dc_refresh.work); + struct dfs_cache_vol_info *vi; + struct TCP_Server_Info *server; + LIST_HEAD(list); + struct cifs_tcon *tcon, *ntcon; + + mutex_lock(&dc->dc_lock); + + list_for_each_entry(vi, &dc->dc_vol_list, vi_list) { + server = cifs_find_tcp_session(&vi->vi_vol); + if (IS_ERR_OR_NULL(server)) + continue; + if (server->tcpStatus != CifsGood) + goto next; + get_tcons(server, &list); + list_for_each_entry_safe(tcon, ntcon, &list, ulist) { + do_refresh_tcon(dc, tcon); + list_del_init(&tcon->ulist); + cifs_put_tcon(tcon); + } +next: + cifs_put_tcp_session(server, 0); + } + queue_delayed_work(cifsiod_wq, &dc->dc_refresh, dc->dc_ttl * HZ); + mutex_unlock(&dc->dc_lock); +} diff --git a/fs/cifs/dfs_cache.h b/fs/cifs/dfs_cache.h new file mode 100644 index 000000000000..22f366514f3a --- /dev/null +++ b/fs/cifs/dfs_cache.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * DFS referral cache routines + * + * Copyright (c) 2018 Paulo Alcantara <palcantara@suse.de> + */ + +#ifndef _CIFS_DFS_CACHE_H +#define _CIFS_DFS_CACHE_H + +#include <linux/nls.h> +#include <linux/list.h> +#include "cifsglob.h" + +struct dfs_cache_tgt_list { + int tl_numtgts; + struct list_head tl_list; +}; + +struct dfs_cache_tgt_iterator { + char *it_name; + struct list_head it_list; +}; + +extern int dfs_cache_init(void); +extern void dfs_cache_destroy(void); +extern const struct file_operations dfscache_proc_fops; + +extern int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, + const struct nls_table *nls_codepage, int remap, + const char *path, struct dfs_info3_param *ref, + struct dfs_cache_tgt_list *tgt_list); +extern int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref, + struct dfs_cache_tgt_list *tgt_list); +extern int dfs_cache_update_tgthint(const unsigned int xid, + struct cifs_ses *ses, + const struct nls_table *nls_codepage, + int remap, const char *path, + const struct dfs_cache_tgt_iterator *it); +extern int +dfs_cache_noreq_update_tgthint(const char *path, + const struct dfs_cache_tgt_iterator *it); +extern int dfs_cache_get_tgt_referral(const char *path, + const struct dfs_cache_tgt_iterator *it, + struct dfs_info3_param *ref); +extern int dfs_cache_add_vol(struct smb_vol *vol, const char *fullpath); +extern int dfs_cache_update_vol(const char *fullpath, + struct TCP_Server_Info *server); +extern void dfs_cache_del_vol(const char *fullpath); + +static inline struct dfs_cache_tgt_iterator * +dfs_cache_get_next_tgt(struct dfs_cache_tgt_list *tl, + struct dfs_cache_tgt_iterator *it) +{ + if (!tl || list_empty(&tl->tl_list) || !it || + list_is_last(&it->it_list, &tl->tl_list)) + return NULL; + return list_next_entry(it, it_list); +} + +static inline struct dfs_cache_tgt_iterator * +dfs_cache_get_tgt_iterator(struct dfs_cache_tgt_list *tl) +{ + if (!tl) + return NULL; + return list_first_entry_or_null(&tl->tl_list, + struct dfs_cache_tgt_iterator, + it_list); +} + +static inline void dfs_cache_free_tgts(struct dfs_cache_tgt_list *tl) +{ + struct dfs_cache_tgt_iterator *it, *nit; + + if (!tl || list_empty(&tl->tl_list)) + return; + list_for_each_entry_safe(it, nit, &tl->tl_list, it_list) { + list_del(&it->it_list); + kfree(it->it_name); + kfree(it); + } + tl->tl_numtgts = 0; +} + +static inline const char * +dfs_cache_get_tgt_name(const struct dfs_cache_tgt_iterator *it) +{ + return it ? 
it->it_name : NULL; +} + +static inline int +dfs_cache_get_nr_tgts(const struct dfs_cache_tgt_list *tl) +{ + return tl ? tl->tl_numtgts : 0; +} + +#endif /* _CIFS_DFS_CACHE_H */ diff --git a/fs/cifs/file.c b/fs/cifs/file.c index c9bc56b1baac..5e405164394a 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1103,10 +1103,10 @@ try_again: rc = posix_lock_file(file, flock, NULL); up_write(&cinode->lock_sem); if (rc == FILE_LOCK_DEFERRED) { - rc = wait_event_interruptible(flock->fl_wait, !flock->fl_next); + rc = wait_event_interruptible(flock->fl_wait, !flock->fl_blocker); if (!rc) goto try_again; - posix_unblock_lock(flock); + locks_delete_block(flock); } return rc; } @@ -2617,11 +2617,13 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, if (rc) break; + cur_len = min_t(const size_t, len, wsize); + if (ctx->direct_io) { ssize_t result; result = iov_iter_get_pages_alloc( - from, &pagevec, wsize, &start); + from, &pagevec, cur_len, &start); if (result < 0) { cifs_dbg(VFS, "direct_writev couldn't get user pages " @@ -2630,6 +2632,9 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, result, from->type, from->iov_offset, from->count); dump_stack(); + + rc = result; + add_credits_and_wake_if(server, credits, 0); break; } cur_len = (size_t)result; @@ -3313,13 +3318,16 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, cur_len, &start); if (result < 0) { cifs_dbg(VFS, - "couldn't get user pages (cur_len=%zd)" + "couldn't get user pages (rc=%zd)" " iter type %d" " iov_offset %zd count %zd\n", result, direct_iov.type, direct_iov.iov_offset, direct_iov.count); dump_stack(); + + rc = result; + add_credits_and_wake_if(server, credits, 0); break; } cur_len = (size_t)result; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a81a9df997c1..13fb59aadebc 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -333,7 +333,7 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb) fattr->cf_mtime = timespec64_trunc(fattr->cf_mtime, sb->s_time_gran); fattr->cf_atime = fattr->cf_ctime = fattr->cf_mtime; fattr->cf_nlink = 2; - fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL; + fattr->cf_flags = CIFS_FATTR_DFS_REFERRAL; } static int @@ -730,7 +730,6 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, FILE_ALL_INFO *data, struct super_block *sb, int xid, const struct cifs_fid *fid) { - bool validinum = false; __u16 srchflgs; int rc = 0, tmprc = ENOSYS; struct cifs_tcon *tcon; @@ -821,7 +820,6 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, (FILE_DIRECTORY_INFO *)data, cifs_sb); fattr.cf_uniqueid = le64_to_cpu( ((SEARCH_ID_FULL_DIR_INFO *)data)->UniqueId); - validinum = true; cifs_buf_release(srchinf->ntwrk_buf_start); } @@ -840,31 +838,29 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, */ if (*inode == NULL) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { - if (validinum == false) { - if (server->ops->get_srv_inum) - tmprc = server->ops->get_srv_inum(xid, - tcon, cifs_sb, full_path, - &fattr.cf_uniqueid, data); - if (tmprc) { - cifs_dbg(FYI, "GetSrvInodeNum rc %d\n", - tmprc); - fattr.cf_uniqueid = iunique(sb, ROOT_I); - cifs_autodisable_serverino(cifs_sb); - } else if ((fattr.cf_uniqueid == 0) && - strlen(full_path) == 0) { - /* some servers ret bad root ino ie 0 */ - cifs_dbg(FYI, "Invalid (0) inodenum\n"); - fattr.cf_flags |= - CIFS_FATTR_FAKE_ROOT_INO; - fattr.cf_uniqueid = - simple_hashstr(tcon->treeName); - } + if (server->ops->get_srv_inum) + tmprc = 
server->ops->get_srv_inum(xid, + tcon, cifs_sb, full_path, + &fattr.cf_uniqueid, data); + if (tmprc) { + cifs_dbg(FYI, "GetSrvInodeNum rc %d\n", + tmprc); + fattr.cf_uniqueid = iunique(sb, ROOT_I); + cifs_autodisable_serverino(cifs_sb); + } else if ((fattr.cf_uniqueid == 0) && + strlen(full_path) == 0) { + /* some servers ret bad root ino ie 0 */ + cifs_dbg(FYI, "Invalid (0) inodenum\n"); + fattr.cf_flags |= + CIFS_FATTR_FAKE_ROOT_INO; + fattr.cf_uniqueid = + simple_hashstr(tcon->treeName); } } else fattr.cf_uniqueid = iunique(sb, ROOT_I); } else { - if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) && - validinum == false && server->ops->get_srv_inum) { + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) + && server->ops->get_srv_inum) { /* * Pass a NULL tcon to ensure we don't make a round * trip to the server. This only works for SMB2+. diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 8a41f4eba726..bee203055b30 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -111,21 +111,27 @@ struct cifs_tcon * tconInfoAlloc(void) { struct cifs_tcon *ret_buf; - ret_buf = kzalloc(sizeof(struct cifs_tcon), GFP_KERNEL); - if (ret_buf) { - atomic_inc(&tconInfoAllocCount); - ret_buf->tidStatus = CifsNew; - ++ret_buf->tc_count; - INIT_LIST_HEAD(&ret_buf->openFileList); - INIT_LIST_HEAD(&ret_buf->tcon_list); - spin_lock_init(&ret_buf->open_file_lock); - mutex_init(&ret_buf->crfid.fid_mutex); - ret_buf->crfid.fid = kzalloc(sizeof(struct cifs_fid), - GFP_KERNEL); - spin_lock_init(&ret_buf->stat_lock); - atomic_set(&ret_buf->num_local_opens, 0); - atomic_set(&ret_buf->num_remote_opens, 0); + + ret_buf = kzalloc(sizeof(*ret_buf), GFP_KERNEL); + if (!ret_buf) + return NULL; + ret_buf->crfid.fid = kzalloc(sizeof(*ret_buf->crfid.fid), GFP_KERNEL); + if (!ret_buf->crfid.fid) { + kfree(ret_buf); + return NULL; } + + atomic_inc(&tconInfoAllocCount); + ret_buf->tidStatus = CifsNew; + ++ret_buf->tc_count; + INIT_LIST_HEAD(&ret_buf->openFileList); + INIT_LIST_HEAD(&ret_buf->tcon_list); + spin_lock_init(&ret_buf->open_file_lock); + mutex_init(&ret_buf->crfid.fid_mutex); + spin_lock_init(&ret_buf->stat_lock); + atomic_set(&ret_buf->num_local_opens, 0); + atomic_set(&ret_buf->num_remote_opens, 0); + return ret_buf; } @@ -140,6 +146,9 @@ tconInfoFree(struct cifs_tcon *buf_to_free) kfree(buf_to_free->nativeFileSystem); kzfree(buf_to_free->password); kfree(buf_to_free->crfid.fid); +#ifdef CONFIG_CIFS_DFS_UPCALL + kfree(buf_to_free->dfs_path); +#endif kfree(buf_to_free); } @@ -525,9 +534,17 @@ void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { + struct cifs_tcon *tcon = NULL; + + if (cifs_sb->master_tlink) + tcon = cifs_sb_master_tcon(cifs_sb); + cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; - cifs_dbg(VFS, "Autodisabling the use of server inode numbers on %s. This server doesn't seem to support them properly. Hardlinks will not be recognized on this mount. Consider mounting with the \"noserverino\" option to silence this message.\n", - cifs_sb_master_tcon(cifs_sb)->treeName); + cifs_dbg(VFS, "Autodisabling the use of server inode numbers on %s.\n", + tcon ? tcon->treeName : "new server"); + cifs_dbg(VFS, "The server doesn't seem to support them properly or the files might be on different servers (DFS).\n"); + cifs_dbg(VFS, "Hardlinks will not be recognized on this mount. 
Consider mounting with the \"noserverino\" option to silence this message.\n"); + } } @@ -732,6 +749,8 @@ parse_dfs_referrals(struct get_dfs_referral_rsp *rsp, u32 rsp_size, goto parse_DFS_referrals_exit; } + node->ttl = le32_to_cpu(ref->TimeToLive); + ref++; } @@ -933,3 +952,20 @@ void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page, else if (page == 0) *len = rqst->rq_pagesz - rqst->rq_offset; } + +void extract_unc_hostname(const char *unc, const char **h, size_t *len) +{ + const char *end; + + /* skip initial slashes */ + while (*unc && (*unc == '\\' || *unc == '/')) + unc++; + + end = unc; + + while (*end && !(*end == '\\' || *end == '/')) + end++; + + *h = unc; + *len = end - unc; +} diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index e169e1a5fd35..3925a7bfc74d 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -655,7 +655,14 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, /* scan and find it */ int i; char *cur_ent; - char *end_of_smb = cfile->srch_inf.ntwrk_buf_start + + char *end_of_smb; + + if (cfile->srch_inf.ntwrk_buf_start == NULL) { + cifs_dbg(VFS, "ntwrk_buf_start is NULL during readdir\n"); + return -EIO; + } + + end_of_smb = cfile->srch_inf.ntwrk_buf_start + server->ops->calc_smb_size( cfile->srch_inf.ntwrk_buf_start, server); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index aa23c00367ec..dcd49ad60c83 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -534,9 +534,9 @@ cifs_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested) if (global_secflags & CIFSSEC_MAY_NTLM) return NTLM; default: - /* Fallthrough to attempt LANMAN authentication next */ break; } + /* Fallthrough - to attempt LANMAN authentication next */ case CIFS_NEGFLAVOR_LANMAN: switch (requested) { case LANMAN: @@ -1154,14 +1154,12 @@ out: static int _sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data) { - struct smb_hdr *smb_buf; SESSION_SETUP_ANDX *pSMB; struct cifs_ses *ses = sess_data->ses; __u32 capabilities; char *bcc_ptr; pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; - smb_buf = (struct smb_hdr *)pSMB; capabilities = cifs_ssetup_hdr(ses, pSMB); if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 378151e09e91..32a6c020478f 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -929,19 +929,18 @@ cifs_unix_dfs_readlink(const unsigned int xid, struct cifs_tcon *tcon, { #ifdef CONFIG_CIFS_DFS_UPCALL int rc; - unsigned int num_referrals = 0; - struct dfs_info3_param *referrals = NULL; + struct dfs_info3_param referral = {0}; - rc = get_dfs_path(xid, tcon->ses, searchName, nls_codepage, - &num_referrals, &referrals, 0); + rc = get_dfs_path(xid, tcon->ses, searchName, nls_codepage, &referral, + 0); - if (!rc && num_referrals > 0) { - *symlinkinfo = kstrndup(referrals->node_name, - strlen(referrals->node_name), + if (!rc) { + *symlinkinfo = kstrndup(referral.node_name, + strlen(referral.node_name), GFP_KERNEL); + free_dfs_info_param(&referral); if (!*symlinkinfo) rc = -ENOMEM; - free_dfs_info_array(referrals, num_referrals); } return rc; #else /* No DFS support */ diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 9e7ef7ec2d70..f14533da3a93 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -49,7 +49,6 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct cifs_fid fid; struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = ses->server; int num_rqst = 0; struct 
smb_rqst rqst[3]; int resp_buftype[3]; @@ -97,7 +96,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, if (rc) goto finished; - smb2_set_next_command(server, &rqst[num_rqst++]); + smb2_set_next_command(tcon, &rqst[num_rqst++]); /* Operation */ switch (command) { @@ -111,7 +110,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_O_INFO_FILE, 0, sizeof(struct smb2_file_all_info) + PATH_MAX * 2, 0, NULL); - smb2_set_next_command(server, &rqst[num_rqst]); + smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); break; case SMB2_OP_DELETE: @@ -127,14 +126,14 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, rqst[num_rqst].rq_iov = si_iov; rqst[num_rqst].rq_nvec = 1; - size[0] = 8; + size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */ data[0] = &delete_pending[0]; rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID, COMPOUND_FID, current->tgid, FILE_DISPOSITION_INFORMATION, SMB2_O_INFO_FILE, 0, data, size); - smb2_set_next_command(server, &rqst[num_rqst]); + smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); break; case SMB2_OP_SET_EOF: @@ -149,7 +148,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, COMPOUND_FID, current->tgid, FILE_END_OF_FILE_INFORMATION, SMB2_O_INFO_FILE, 0, data, size); - smb2_set_next_command(server, &rqst[num_rqst]); + smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); break; case SMB2_OP_SET_INFO: @@ -165,7 +164,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, COMPOUND_FID, current->tgid, FILE_BASIC_INFORMATION, SMB2_O_INFO_FILE, 0, data, size); - smb2_set_next_command(server, &rqst[num_rqst]); + smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); break; case SMB2_OP_RENAME: @@ -189,7 +188,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, COMPOUND_FID, current->tgid, FILE_RENAME_INFORMATION, SMB2_O_INFO_FILE, 0, data, size); - smb2_set_next_command(server, &rqst[num_rqst]); + smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); break; case SMB2_OP_HARDLINK: @@ -213,7 +212,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, COMPOUND_FID, current->tgid, FILE_LINK_INFORMATION, SMB2_O_INFO_FILE, 0, data, size); - smb2_set_next_command(server, &rqst[num_rqst]); + smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); break; default: @@ -388,7 +387,6 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, rc = -ENOMEM; goto smb2_rename_path; } - rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access, FILE_OPEN, 0, smb2_to_name, command); smb2_rename_path: diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index d47b7f5dfa6c..924269cec135 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -379,8 +379,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_NONEXISTENT_EA_ENTRY, -EIO, "STATUS_NONEXISTENT_EA_ENTRY"}, {STATUS_NO_EAS_ON_FILE, -ENODATA, "STATUS_NO_EAS_ON_FILE"}, {STATUS_EA_CORRUPT_ERROR, -EIO, "STATUS_EA_CORRUPT_ERROR"}, - {STATUS_FILE_LOCK_CONFLICT, -EIO, "STATUS_FILE_LOCK_CONFLICT"}, - {STATUS_LOCK_NOT_GRANTED, -EIO, "STATUS_LOCK_NOT_GRANTED"}, + {STATUS_FILE_LOCK_CONFLICT, -EACCES, "STATUS_FILE_LOCK_CONFLICT"}, + {STATUS_LOCK_NOT_GRANTED, -EACCES, "STATUS_LOCK_NOT_GRANTED"}, {STATUS_DELETE_PENDING, -ENOENT, "STATUS_DELETE_PENDING"}, {STATUS_CTL_FILE_NOT_SUPPORTED, 
-ENOSYS, "STATUS_CTL_FILE_NOT_SUPPORTED"}, diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 225fec1cfa67..33100ef74d7f 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -831,72 +831,48 @@ smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, { int rc; __le16 *utf16_path; - __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; - struct cifs_open_parms oparms; - struct cifs_fid fid; - struct smb2_file_full_ea_info *smb2_data; - int ea_buf_size = SMB2_MIN_EA_BUF; + struct kvec rsp_iov = {NULL, 0}; + int buftype = CIFS_NO_BUFFER; + struct smb2_query_info_rsp *rsp; + struct smb2_file_full_ea_info *info = NULL; utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); if (!utf16_path) return -ENOMEM; - oparms.tcon = tcon; - oparms.desired_access = FILE_READ_EA; - oparms.disposition = FILE_OPEN; - if (backup_cred(cifs_sb)) - oparms.create_options = CREATE_OPEN_BACKUP_INTENT; - else - oparms.create_options = 0; - oparms.fid = &fid; - oparms.reconnect = false; - - rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL); - kfree(utf16_path); + rc = smb2_query_info_compound(xid, tcon, utf16_path, + FILE_READ_EA, + FILE_FULL_EA_INFORMATION, + SMB2_O_INFO_FILE, + SMB2_MAX_EA_BUF, + &rsp_iov, &buftype, cifs_sb); if (rc) { - cifs_dbg(FYI, "open failed rc=%d\n", rc); - return rc; - } - - while (1) { - smb2_data = kzalloc(ea_buf_size, GFP_KERNEL); - if (smb2_data == NULL) { - SMB2_close(xid, tcon, fid.persistent_fid, - fid.volatile_fid); - return -ENOMEM; - } - - rc = SMB2_query_eas(xid, tcon, fid.persistent_fid, - fid.volatile_fid, - ea_buf_size, smb2_data); - - if (rc != -E2BIG) - break; - - kfree(smb2_data); - ea_buf_size <<= 1; - - if (ea_buf_size > SMB2_MAX_EA_BUF) { - cifs_dbg(VFS, "EA size is too large\n"); - SMB2_close(xid, tcon, fid.persistent_fid, - fid.volatile_fid); - return -ENOMEM; - } + /* + * If ea_name is NULL (listxattr) and there are no EAs, + * return 0 as it's not an error. Otherwise, the specified + * ea_name was not found. + */ + if (!ea_name && rc == -ENODATA) + rc = 0; + goto qeas_exit; } - SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base; + rc = smb2_validate_iov(le16_to_cpu(rsp->OutputBufferOffset), + le32_to_cpu(rsp->OutputBufferLength), + &rsp_iov, + sizeof(struct smb2_file_full_ea_info)); + if (rc) + goto qeas_exit; - /* - * If ea_name is NULL (listxattr) and there are no EAs, return 0 as it's - * not an error. Otherwise, the specified ea_name was not found. 
- */ - if (!rc) - rc = move_smb2_ea_to_cifs(ea_data, buf_size, smb2_data, - SMB2_MAX_EA_BUF, ea_name); - else if (!ea_name && rc == -ENODATA) - rc = 0; + info = (struct smb2_file_full_ea_info *)( + le16_to_cpu(rsp->OutputBufferOffset) + (char *)rsp); + rc = move_smb2_ea_to_cifs(ea_data, buf_size, info, + le32_to_cpu(rsp->OutputBufferLength), ea_name); - kfree(smb2_data); + qeas_exit: + kfree(utf16_path); + free_rsp_buf(buftype, rsp_iov.iov_base); return rc; } @@ -907,14 +883,27 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, const __u16 ea_value_len, const struct nls_table *nls_codepage, struct cifs_sb_info *cifs_sb) { - int rc; - __le16 *utf16_path; - __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; - struct cifs_open_parms oparms; - struct cifs_fid fid; - struct smb2_file_full_ea_info *ea; + struct cifs_ses *ses = tcon->ses; + __le16 *utf16_path = NULL; int ea_name_len = strlen(ea_name); + int flags = 0; int len; + struct smb_rqst rqst[3]; + int resp_buftype[3]; + struct kvec rsp_iov[3]; + struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; + struct cifs_open_parms oparms; + __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + struct cifs_fid fid; + struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE]; + unsigned int size[1]; + void *data[1]; + struct smb2_file_full_ea_info *ea = NULL; + struct kvec close_iov[1]; + int rc; + + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; if (ea_name_len > 255) return -EINVAL; @@ -923,6 +912,16 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, if (!utf16_path) return -ENOMEM; + memset(rqst, 0, sizeof(rqst)); + resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; + memset(rsp_iov, 0, sizeof(rsp_iov)); + + /* Open */ + memset(&open_iov, 0, sizeof(open_iov)); + rqst[0].rq_iov = open_iov; + rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; + + memset(&oparms, 0, sizeof(oparms)); oparms.tcon = tcon; oparms.desired_access = FILE_WRITE_EA; oparms.disposition = FILE_OPEN; @@ -933,18 +932,22 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = &fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL); - kfree(utf16_path); - if (rc) { - cifs_dbg(FYI, "open failed rc=%d\n", rc); - return rc; - } + rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, utf16_path); + if (rc) + goto sea_exit; + smb2_set_next_command(tcon, &rqst[0]); + + + /* Set Info */ + memset(&si_iov, 0, sizeof(si_iov)); + rqst[1].rq_iov = si_iov; + rqst[1].rq_nvec = 1; len = sizeof(ea) + ea_name_len + ea_value_len + 1; ea = kzalloc(len, GFP_KERNEL); if (ea == NULL) { - SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); - return -ENOMEM; + rc = -ENOMEM; + goto sea_exit; } ea->ea_name_length = ea_name_len; @@ -952,12 +955,36 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, memcpy(ea->ea_data, ea_name, ea_name_len + 1); memcpy(ea->ea_data + ea_name_len + 1, ea_value, ea_value_len); - rc = SMB2_set_ea(xid, tcon, fid.persistent_fid, fid.volatile_fid, ea, - len); - kfree(ea); + size[0] = len; + data[0] = ea; + + rc = SMB2_set_info_init(tcon, &rqst[1], COMPOUND_FID, + COMPOUND_FID, current->tgid, + FILE_FULL_EA_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + smb2_set_next_command(tcon, &rqst[1]); + smb2_set_related(&rqst[1]); - SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + /* Close */ + memset(&close_iov, 0, sizeof(close_iov)); + rqst[2].rq_iov = close_iov; + rqst[2].rq_nvec = 1; + rc = SMB2_close_init(tcon, &rqst[2], COMPOUND_FID, COMPOUND_FID); + smb2_set_related(&rqst[2]); + + 
rc = compound_send_recv(xid, ses, flags, 3, rqst, + resp_buftype, rsp_iov); + + sea_exit: + kfree(ea); + kfree(utf16_path); + SMB2_open_free(&rqst[0]); + SMB2_set_info_free(&rqst[1]); + SMB2_close_free(&rqst[2]); + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); return rc; } #endif @@ -1194,7 +1221,7 @@ smb2_ioctl_query_info(const unsigned int xid, rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, path); if (rc) goto iqinf_exit; - smb2_set_next_command(ses->server, &rqst[0]); + smb2_set_next_command(tcon, &rqst[0]); /* Query */ memset(&qi_iov, 0, sizeof(qi_iov)); @@ -1208,7 +1235,7 @@ smb2_ioctl_query_info(const unsigned int xid, qi.output_buffer_length, buffer); if (rc) goto iqinf_exit; - smb2_set_next_command(ses->server, &rqst[1]); + smb2_set_next_command(tcon, &rqst[1]); smb2_set_related(&rqst[1]); /* Close */ @@ -1761,42 +1788,79 @@ smb2_set_related(struct smb_rqst *rqst) char smb2_padding[7] = {0, 0, 0, 0, 0, 0, 0}; void -smb2_set_next_command(struct TCP_Server_Info *server, struct smb_rqst *rqst) +smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst) { struct smb2_sync_hdr *shdr; + struct cifs_ses *ses = tcon->ses; + struct TCP_Server_Info *server = ses->server; unsigned long len = smb_rqst_len(server, rqst); + int i, num_padding; /* SMB headers in a compound are 8 byte aligned. */ - if (len & 7) { + + /* No padding needed */ + if (!(len & 7)) + goto finished; + + num_padding = 8 - (len & 7); + if (!smb3_encryption_required(tcon)) { + /* + * If we do not have encryption then we can just add an extra + * iov for the padding. + */ rqst->rq_iov[rqst->rq_nvec].iov_base = smb2_padding; - rqst->rq_iov[rqst->rq_nvec].iov_len = 8 - (len & 7); + rqst->rq_iov[rqst->rq_nvec].iov_len = num_padding; rqst->rq_nvec++; - len = smb_rqst_len(server, rqst); + len += num_padding; + } else { + /* + * We can not add a small padding iov for the encryption case + * because the encryption framework can not handle the padding + * iovs. + * We have to flatten this into a single buffer and add + * the padding to it. + */ + for (i = 1; i < rqst->rq_nvec; i++) { + memcpy(rqst->rq_iov[0].iov_base + + rqst->rq_iov[0].iov_len, + rqst->rq_iov[i].iov_base, + rqst->rq_iov[i].iov_len); + rqst->rq_iov[0].iov_len += rqst->rq_iov[i].iov_len; + } + memset(rqst->rq_iov[0].iov_base + rqst->rq_iov[0].iov_len, + 0, num_padding); + rqst->rq_iov[0].iov_len += num_padding; + len += num_padding; + rqst->rq_nvec = 1; } + finished: shdr = (struct smb2_sync_hdr *)(rqst->rq_iov[0].iov_base); shdr->NextCommand = cpu_to_le32(len); } -static int -smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon, - struct kstatfs *buf) +/* + * Passes the query info response back to the caller on success. + * Caller need to free this with free_rsp_buf(). 
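[Editorial aside, not part of the patch: the reworked smb2_set_next_command() above pads each request in a compound up to the next 8-byte boundary, adding 8 - (len & 7) bytes only when len is not already a multiple of 8; the padded length is what ends up in the SMB2 NextCommand field. A tiny stand-alone check of that arithmetic, runnable in user space:]

	#include <assert.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned long lens[] = { 88, 93, 96 };	/* sample request lengths */

		for (int i = 0; i < 3; i++) {
			unsigned long len = lens[i];
			unsigned long padded = len;

			if (len & 7)			/* not 8-byte aligned */
				padded += 8 - (len & 7);

			assert(padded % 8 == 0);
			printf("len=%lu -> padded=%lu\n", len, padded);
		}
		return 0;
	}

[For 93 bytes this yields 3 bytes of padding and a 96-byte aligned length. End of aside.]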
+ */ +int +smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, + __le16 *utf16_path, u32 desired_access, + u32 class, u32 type, u32 output_len, + struct kvec *rsp, int *buftype, + struct cifs_sb_info *cifs_sb) { - struct smb2_query_info_rsp *rsp; - struct smb2_fs_full_size_info *info = NULL; + struct cifs_ses *ses = tcon->ses; + int flags = 0; struct smb_rqst rqst[3]; int resp_buftype[3]; struct kvec rsp_iov[3]; struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; struct kvec qi_iov[1]; struct kvec close_iov[1]; - struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server = ses->server; - __le16 srch_path = 0; /* Null - open root of share */ u8 oplock = SMB2_OPLOCK_LEVEL_NONE; struct cifs_open_parms oparms; struct cifs_fid fid; - int flags = 0; int rc; if (smb3_encryption_required(tcon)) @@ -1811,29 +1875,31 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon, rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; oparms.tcon = tcon; - oparms.desired_access = FILE_READ_ATTRIBUTES; + oparms.desired_access = desired_access; oparms.disposition = FILE_OPEN; - oparms.create_options = 0; + if (cifs_sb && backup_cred(cifs_sb)) + oparms.create_options = CREATE_OPEN_BACKUP_INTENT; + else + oparms.create_options = 0; oparms.fid = &fid; oparms.reconnect = false; - rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, &srch_path); + rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, utf16_path); if (rc) - goto qfs_exit; - smb2_set_next_command(server, &rqst[0]); + goto qic_exit; + smb2_set_next_command(tcon, &rqst[0]); memset(&qi_iov, 0, sizeof(qi_iov)); rqst[1].rq_iov = qi_iov; rqst[1].rq_nvec = 1; rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, COMPOUND_FID, - FS_FULL_SIZE_INFORMATION, - SMB2_O_INFO_FILESYSTEM, 0, - sizeof(struct smb2_fs_full_size_info), 0, + class, type, 0, + output_len, 0, NULL); if (rc) - goto qfs_exit; - smb2_set_next_command(server, &rqst[1]); + goto qic_exit; + smb2_set_next_command(tcon, &rqst[1]); smb2_set_related(&rqst[1]); memset(&close_iov, 0, sizeof(close_iov)); @@ -1842,32 +1908,61 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_close_init(tcon, &rqst[2], COMPOUND_FID, COMPOUND_FID); if (rc) - goto qfs_exit; + goto qic_exit; smb2_set_related(&rqst[2]); rc = compound_send_recv(xid, ses, flags, 3, rqst, resp_buftype, rsp_iov); + if (rc) { + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + goto qic_exit; + } + *rsp = rsp_iov[1]; + *buftype = resp_buftype[1]; + + qic_exit: + SMB2_open_free(&rqst[0]); + SMB2_query_info_free(&rqst[1]); + SMB2_close_free(&rqst[2]); + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); + return rc; +} + +static int +smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon, + struct kstatfs *buf) +{ + struct smb2_query_info_rsp *rsp; + struct smb2_fs_full_size_info *info = NULL; + __le16 utf16_path = 0; /* Null - open root of share */ + struct kvec rsp_iov = {NULL, 0}; + int buftype = CIFS_NO_BUFFER; + int rc; + + + rc = smb2_query_info_compound(xid, tcon, &utf16_path, + FILE_READ_ATTRIBUTES, + FS_FULL_SIZE_INFORMATION, + SMB2_O_INFO_FILESYSTEM, + sizeof(struct smb2_fs_full_size_info), + &rsp_iov, &buftype, NULL); if (rc) goto qfs_exit; - rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; + rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base; buf->f_type = SMB2_MAGIC_NUMBER; info = (struct smb2_fs_full_size_info *)( le16_to_cpu(rsp->OutputBufferOffset) + (char *)rsp); rc = 
smb2_validate_iov(le16_to_cpu(rsp->OutputBufferOffset), le32_to_cpu(rsp->OutputBufferLength), - &rsp_iov[1], + &rsp_iov, sizeof(struct smb2_fs_full_size_info)); if (!rc) smb2_copy_fs_info_to_kstatfs(info, buf); qfs_exit: - SMB2_open_free(&rqst[0]); - SMB2_query_info_free(&rqst[1]); - SMB2_close_free(&rqst[2]); - free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); - free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); - free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); + free_rsp_buf(buftype, rsp_iov.iov_base); return rc; } @@ -2736,7 +2831,7 @@ init_sg(int num_rqst, struct smb_rqst *rqst, u8 *sign) smb2_sg_set_buf(&sg[idx++], rqst[i].rq_iov[j].iov_base + skip, rqst[i].rq_iov[j].iov_len - skip); - } + } for (j = 0; j < rqst[i].rq_npages; j++) { unsigned int len, offset; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 27f86537a5d1..e283590955cd 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -50,6 +50,9 @@ #include "cifs_spnego.h" #include "smbdirect.h" #include "trace.h" +#ifdef CONFIG_CIFS_DFS_UPCALL +#include "dfs_cache.h" +#endif /* * The following table defines the expected "StructureSize" of SMB2 requests @@ -152,6 +155,77 @@ out: return; } +#ifdef CONFIG_CIFS_DFS_UPCALL +static int __smb2_reconnect(const struct nls_table *nlsc, + struct cifs_tcon *tcon) +{ + int rc; + struct dfs_cache_tgt_list tl; + struct dfs_cache_tgt_iterator *it = NULL; + char tree[MAX_TREE_SIZE + 1]; + const char *tcp_host; + size_t tcp_host_len; + const char *dfs_host; + size_t dfs_host_len; + + if (tcon->ipc) { + snprintf(tree, sizeof(tree), "\\\\%s\\IPC$", + tcon->ses->server->hostname); + return SMB2_tcon(0, tcon->ses, tree, tcon, nlsc); + } + + if (!tcon->dfs_path) + return SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, nlsc); + + rc = dfs_cache_noreq_find(tcon->dfs_path + 1, NULL, &tl); + if (rc) + return rc; + + extract_unc_hostname(tcon->ses->server->hostname, &tcp_host, + &tcp_host_len); + + for (it = dfs_cache_get_tgt_iterator(&tl); it; + it = dfs_cache_get_next_tgt(&tl, it)) { + const char *tgt = dfs_cache_get_tgt_name(it); + + extract_unc_hostname(tgt, &dfs_host, &dfs_host_len); + + if (dfs_host_len != tcp_host_len + || strncasecmp(dfs_host, tcp_host, dfs_host_len) != 0) { + cifs_dbg(FYI, "%s: skipping %.*s, doesn't match %.*s", + __func__, + (int)dfs_host_len, dfs_host, + (int)tcp_host_len, tcp_host); + continue; + } + + snprintf(tree, sizeof(tree), "\\%s", tgt); + + rc = SMB2_tcon(0, tcon->ses, tree, tcon, nlsc); + if (!rc) + break; + if (rc == -EREMOTE) + break; + } + + if (!rc) { + if (it) + rc = dfs_cache_noreq_update_tgthint(tcon->dfs_path + 1, + it); + else + rc = -ENOENT; + } + dfs_cache_free_tgts(&tl); + return rc; +} +#else +static inline int __smb2_reconnect(const struct nls_table *nlsc, + struct cifs_tcon *tcon) +{ + return SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, nlsc); +} +#endif + static int smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) { @@ -159,6 +233,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) struct nls_table *nls_codepage; struct cifs_ses *ses; struct TCP_Server_Info *server; + int retries; /* * SMB2s NegProt, SessSetup, Logoff do not have tcon yet so @@ -192,9 +267,12 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) ses = tcon->ses; server = ses->server; + retries = server->nr_targets; + /* - * Give demultiplex thread up to 10 seconds to reconnect, should be - * greater than cifs socket timeout which is 7 seconds + * Give demultiplex thread up to 10 seconds to each target available for + * reconnect -- should 
be greater than cifs socket timeout which is 7 + * seconds. */ while (server->tcpStatus == CifsNeedReconnect) { /* @@ -225,6 +303,9 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) if (server->tcpStatus != CifsNeedReconnect) break; + if (--retries) + continue; + /* * on "soft" mounts we wait once. Hard mounts keep * retrying until process is killed or server comes @@ -234,6 +315,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n"); return -EHOSTDOWN; } + retries = server->nr_targets; } if (!tcon->ses->need_reconnect && !tcon->need_reconnect) @@ -271,7 +353,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) if (tcon->use_persistent) tcon->need_reopen_files = true; - rc = SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, nls_codepage); + rc = __smb2_reconnect(nls_codepage, tcon); mutex_unlock(&tcon->ses->session_mutex); cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); @@ -1955,7 +2037,6 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, struct smb_rqst rqst; struct smb2_create_req *req; struct smb2_create_rsp *rsp = NULL; - struct TCP_Server_Info *server; struct cifs_ses *ses = tcon->ses; struct kvec iov[3]; /* make sure at least one for each open context */ struct kvec rsp_iov = {NULL, 0}; @@ -1978,9 +2059,7 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, if (!utf16_path) return -ENOMEM; - if (ses && (ses->server)) - server = ses->server; - else { + if (!ses || !(ses->server)) { rc = -EIO; goto err_free_path; } @@ -2768,18 +2847,6 @@ qinf_exit: return rc; } -int SMB2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid, - int ea_buf_size, struct smb2_file_full_ea_info *data) -{ - return query_info(xid, tcon, persistent_fid, volatile_fid, - FILE_FULL_EA_INFORMATION, SMB2_O_INFO_FILE, 0, - ea_buf_size, - sizeof(struct smb2_file_full_ea_info), - (void **)&data, - NULL); -} - int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, struct smb2_file_all_info *data) { @@ -3994,7 +4061,6 @@ static int build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level, int outbuf_len, u64 persistent_fid, u64 volatile_fid) { - struct TCP_Server_Info *server; int rc; struct smb2_query_info_req *req; unsigned int total_len; @@ -4004,8 +4070,6 @@ build_qfs_info_req(struct kvec *iov, struct cifs_tcon *tcon, int level, if ((tcon->ses == NULL) || (tcon->ses->server == NULL)) return -EIO; - server = tcon->ses->server; - rc = smb2_plain_req_init(SMB2_QUERY_INFO, tcon, (void **) &req, &total_len); if (rc) diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 5671d5ee7f58..05dea6750c33 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -1398,7 +1398,6 @@ struct smb2_file_link_info { /* encoding of request for level 11 */ char FileName[0]; /* Name to be assigned to new link */ } __packed; /* level 11 Set */ -#define SMB2_MIN_EA_BUF 2048 #define SMB2_MAX_EA_BUF 65536 struct smb2_file_full_ea_info { /* encoding of response for level 15 */ diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 9f4e9ed9ce53..87733b27a65f 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -116,7 +116,7 @@ extern void smb2_reconnect_server(struct work_struct *work); extern int smb3_crypto_aead_allocate(struct TCP_Server_Info *server); extern unsigned long smb_rqst_len(struct TCP_Server_Info *server, struct smb_rqst *rqst); -extern void smb2_set_next_command(struct 
TCP_Server_Info *server, +extern void smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst); extern void smb2_set_related(struct smb_rqst *rqst); @@ -153,10 +153,6 @@ extern int SMB2_close_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, extern void SMB2_close_free(struct smb_rqst *rqst); extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id); -extern int SMB2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_file_id, u64 volatile_file_id, - int ea_buf_size, - struct smb2_file_full_ea_info *data); extern int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id, struct smb2_file_all_info *data); @@ -240,4 +236,10 @@ extern void smb2_copy_fs_info_to_kstatfs( extern int smb311_crypto_shash_allocate(struct TCP_Server_Info *server); extern int smb311_update_preauth_hash(struct cifs_ses *ses, struct kvec *iov, int nvec); +extern int smb2_query_info_compound(const unsigned int xid, + struct cifs_tcon *tcon, + __le16 *utf16_path, u32 desired_access, + u32 class, u32 type, u32 output_len, + struct kvec *rsp, int *buftype, + struct cifs_sb_info *cifs_sb); #endif /* _SMB2PROTO_H */ diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index e94a8d1d08a3..a568dac7b3a1 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -1724,7 +1724,7 @@ static struct smbd_connection *_smbd_get_connection( info->responder_resources); /* Need to send IRD/ORD in private data for iWARP */ - info->id->device->get_port_immutable( + info->id->device->ops.get_port_immutable( info->id->device, info->id->port_num, &port_immutable); if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) { ird_ord_hdr[0] = info->responder_resources; diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 83ff0c25710d..5be7302853b6 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -126,9 +126,11 @@ DeleteMidQEntry(struct mid_q_entry *midEntry) if ((slow_rsp_threshold != 0) && time_after(now, midEntry->when_alloc + (slow_rsp_threshold * HZ)) && (midEntry->command != command)) { - /* smb2slowcmd[NUMBER_OF_SMB2_COMMANDS] counts by command */ - if ((le16_to_cpu(midEntry->command) < NUMBER_OF_SMB2_COMMANDS) && - (le16_to_cpu(midEntry->command) >= 0)) + /* + * smb2slowcmd[NUMBER_OF_SMB2_COMMANDS] counts by command + * NB: le16_to_cpu returns unsigned so can not be negative below + */ + if (le16_to_cpu(midEntry->command) < NUMBER_OF_SMB2_COMMANDS) cifs_stats_inc(&midEntry->server->smb2slowcmd[le16_to_cpu(midEntry->command)]); trace_smb3_slow_rsp(le16_to_cpu(midEntry->command), @@ -246,18 +246,16 @@ static void wait_entry_unlocked(struct xa_state *xas, void *entry) ewait.wait.func = wake_exceptional_entry_func; wq = dax_entry_waitqueue(xas, entry, &ewait.key); - prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); + /* + * Unlike get_unlocked_entry() there is no guarantee that this + * path ever successfully retrieves an unlocked entry before an + * inode dies. Perform a non-exclusive wait in case this path + * never successfully performs its own wake up. + */ + prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); xas_unlock_irq(xas); schedule(); finish_wait(wq, &ewait.wait); - - /* - * Entry lock waits are exclusive. Wake up the next waiter since - * we aren't sure we will acquire the entry lock and thus wake - * the next waiter up on unlock. 
- */ - if (waitqueue_active(wq)) - __wake_up(wq, TASK_NORMAL, 1, &ewait.key); } static void put_unlocked_entry(struct xa_state *xas, void *entry) @@ -779,7 +777,8 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { - unsigned long address, start, end; + struct mmu_notifier_range range; + unsigned long address; cond_resched(); @@ -793,7 +792,8 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, * call mmu_notifier_invalidate_range_start() on our behalf * before taking any lock. */ - if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl)) + if (follow_pte_pmd(vma->vm_mm, address, &range, + &ptep, &pmdp, &ptl)) continue; /* @@ -835,7 +835,7 @@ unlock_pte: pte_unmap_unlock(ptep, ptl); } - mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); + mmu_notifier_invalidate_range_end(&range); } i_mmap_unlock_read(mapping); } diff --git a/fs/direct-io.c b/fs/direct-io.c index 41a0e97252ae..dbc1a1f080ce 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -518,7 +518,7 @@ static struct bio *dio_await_one(struct dio *dio) dio->waiter = current; spin_unlock_irqrestore(&dio->bio_lock, flags); if (!(dio->iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(dio->bio_disk->queue, dio->bio_cookie)) + !blk_poll(dio->bio_disk->queue, dio->bio_cookie, true)) io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); @@ -1265,6 +1265,8 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, } else { dio->op = REQ_OP_READ; } + if (iocb->ki_flags & IOCB_HIPRI) + dio->op_flags |= REQ_HIPRI; /* * For AIO O_(D)SYNC writes we need to defer completions to a workqueue diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 562fa8c3edff..47ee66d70109 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -292,6 +292,8 @@ void dlm_callback_suspend(struct dlm_ls *ls) flush_workqueue(ls->ls_callback_wq); } +#define MAX_CB_QUEUE 25 + void dlm_callback_resume(struct dlm_ls *ls) { struct dlm_lkb *lkb, *safe; @@ -302,15 +304,23 @@ void dlm_callback_resume(struct dlm_ls *ls) if (!ls->ls_callback_wq) return; +more: mutex_lock(&ls->ls_cb_mutex); list_for_each_entry_safe(lkb, safe, &ls->ls_cb_delay, lkb_cb_list) { list_del_init(&lkb->lkb_cb_list); queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work); count++; + if (count == MAX_CB_QUEUE) + break; } mutex_unlock(&ls->ls_cb_mutex); if (count) log_rinfo(ls, "dlm_callback_resume %d", count); + if (count == MAX_CB_QUEUE) { + count = 0; + cond_resched(); + goto more; + } } diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index cc91963683de..a928ba008d7d 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1209,6 +1209,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) if (rv < 0) { log_error(ls, "create_lkb idr error %d", rv); + dlm_free_lkb(lkb); return rv; } @@ -4179,6 +4180,7 @@ static int receive_convert(struct dlm_ls *ls, struct dlm_message *ms) (unsigned long long)lkb->lkb_recover_seq, ms->m_header.h_nodeid, ms->m_lkid); error = -ENOENT; + dlm_put_lkb(lkb); goto fail; } @@ -4232,6 +4234,7 @@ static int receive_unlock(struct dlm_ls *ls, struct dlm_message *ms) lkb->lkb_id, lkb->lkb_remid, ms->m_header.h_nodeid, ms->m_lkid); error = -ENOENT; + dlm_put_lkb(lkb); goto fail; } @@ -5792,20 +5795,20 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, goto out; } } - - /* After ua is attached to lkb it will be freed by dlm_free_lkb(). 
- When DLM_IFL_USER is set, the dlm knows that this is a userspace - lock and that lkb_astparam is the dlm_user_args structure. */ - error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs, fake_astfn, ua, fake_bastfn, &args); - lkb->lkb_flags |= DLM_IFL_USER; - if (error) { + kfree(ua->lksb.sb_lvbptr); + ua->lksb.sb_lvbptr = NULL; + kfree(ua); __put_lkb(ls, lkb); goto out; } + /* After ua is attached to lkb it will be freed by dlm_free_lkb(). + When DLM_IFL_USER is set, the dlm knows that this is a userspace + lock and that lkb_astparam is the dlm_user_args structure. */ + lkb->lkb_flags |= DLM_IFL_USER; error = request_lock(ls, lkb, name, namelen, &args); switch (error) { diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 5ba94be006ee..db43b98c4d64 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -431,7 +431,7 @@ static int new_lockspace(const char *name, const char *cluster, int do_unreg = 0; int namelen = strlen(name); - if (namelen > DLM_LOCKSPACE_LEN) + if (namelen > DLM_LOCKSPACE_LEN || namelen == 0) return -EINVAL; if (!lvblen || (lvblen % 8)) @@ -680,11 +680,9 @@ static int new_lockspace(const char *name, const char *cluster, kfree(ls->ls_recover_buf); out_lkbidr: idr_destroy(&ls->ls_lkbidr); - for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) { - if (ls->ls_remove_names[i]) - kfree(ls->ls_remove_names[i]); - } out_rsbtbl: + for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) + kfree(ls->ls_remove_names[i]); vfree(ls->ls_rsbtbl); out_lsfree: if (do_unreg) @@ -807,6 +805,7 @@ static int release_lockspace(struct dlm_ls *ls, int force) dlm_delete_debug_file(ls); + idr_destroy(&ls->ls_recover_idr); kfree(ls->ls_recover_buf); /* diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 3fda3832cf6a..0bc43b35d2c5 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -671,7 +671,7 @@ int dlm_ls_stop(struct dlm_ls *ls) int dlm_ls_start(struct dlm_ls *ls) { struct dlm_recover *rv, *rv_old; - struct dlm_config_node *nodes; + struct dlm_config_node *nodes = NULL; int error, count; rv = kzalloc(sizeof(*rv), GFP_NOFS); @@ -680,7 +680,7 @@ int dlm_ls_start(struct dlm_ls *ls) error = dlm_config_nodes(ls->ls_name, &nodes, &count); if (error < 0) - goto fail; + goto fail_rv; spin_lock(&ls->ls_recover_lock); @@ -712,8 +712,9 @@ int dlm_ls_start(struct dlm_ls *ls) return 0; fail: - kfree(rv); kfree(nodes); + fail_rv: + kfree(rv); return error; } diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index 7cd24bccd4fe..37be29f21d04 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c @@ -38,10 +38,8 @@ int __init dlm_memory_init(void) void dlm_memory_exit(void) { - if (lkb_cache) - kmem_cache_destroy(lkb_cache); - if (rsb_cache) - kmem_cache_destroy(rsb_cache); + kmem_cache_destroy(lkb_cache); + kmem_cache_destroy(rsb_cache); } char *dlm_allocate_lvb(struct dlm_ls *ls) @@ -86,8 +84,7 @@ void dlm_free_lkb(struct dlm_lkb *lkb) struct dlm_user_args *ua; ua = lkb->lkb_ua; if (ua) { - if (ua->lksb.sb_lvbptr) - kfree(ua->lksb.sb_lvbptr); + kfree(ua->lksb.sb_lvbptr); kfree(ua); } } diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 2a669390cd7f..3c84c62dadb7 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -25,6 +25,7 @@ #include "lvb_table.h" #include "user.h" #include "ast.h" +#include "config.h" static const char name_prefix[] = "dlm"; static const struct file_operations device_fops; @@ -404,7 +405,7 @@ static int device_create_lockspace(struct dlm_lspace_params *params) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - error = dlm_new_lockspace(params->name, NULL, params->flags, + error = 
dlm_new_lockspace(params->name, dlm_config.ci_cluster_name, params->flags, DLM_USER_LVB_LEN, NULL, NULL, NULL, &lockspace); if (error) @@ -702,7 +703,7 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat, result.version[0] = DLM_DEVICE_VERSION_MAJOR; result.version[1] = DLM_DEVICE_VERSION_MINOR; result.version[2] = DLM_DEVICE_VERSION_PATCH; - memcpy(&result.lksb, &ua->lksb, sizeof(struct dlm_lksb)); + memcpy(&result.lksb, &ua->lksb, offsetof(struct dlm_lksb, sb_lvbptr)); result.user_lksb = ua->user_lksb; /* FIXME: dlm1 provides for the user's bastparam/addr to not be updated diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 42bbe6824b4b..8a5a1010886b 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -2223,31 +2223,13 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, * If the caller wants a certain signal mask to be set during the wait, * we apply it here. */ - if (sigmask) { - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask))) - return -EFAULT; - sigsaved = current->blocked; - set_current_blocked(&ksigmask); - } + error = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + if (error) + return error; error = do_epoll_wait(epfd, events, maxevents, timeout); - /* - * If we changed the signal mask, we need to restore the original one. - * In case we've got a signal while waiting, we do not restore the - * signal mask yet, and we allow do_signal() to deliver the signal on - * the way back to userspace, before the signal mask is restored. - */ - if (sigmask) { - if (error == -EINTR) { - memcpy(&current->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } else - set_current_blocked(&sigsaved); - } + restore_user_sigmask(sigmask, &sigsaved); return error; } @@ -2266,31 +2248,13 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, * If the caller wants a certain signal mask to be set during the wait, * we apply it here. */ - if (sigmask) { - if (sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; - if (get_compat_sigset(&ksigmask, sigmask)) - return -EFAULT; - sigsaved = current->blocked; - set_current_blocked(&ksigmask); - } + err = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + if (err) + return err; err = do_epoll_wait(epfd, events, maxevents, timeout); - /* - * If we changed the signal mask, we need to restore the original one. - * In case we've got a signal while waiting, we do not restore the - * signal mask yet, and we allow do_signal() to deliver the signal on - * the way back to userspace, before the signal mask is restored.
- */ - if (sigmask) { - if (err == -EINTR) { - memcpy(&current->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } else - set_current_blocked(&sigsaved); - } + restore_user_sigmask(sigmask, &sigsaved); return err; } diff --git a/fs/ext2/super.c b/fs/ext2/super.c index eb11502e3fcd..73b2d528237f 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -73,7 +73,7 @@ void ext2_error(struct super_block *sb, const char *function, if (test_opt(sb, ERRORS_PANIC)) panic("EXT2-fs: panic from previous error\n"); - if (test_opt(sb, ERRORS_RO)) { + if (!sb_rdonly(sb) && test_opt(sb, ERRORS_RO)) { ext2_msg(sb, KERN_CRIT, "error: remounting filesystem read-only"); sb->s_flags |= SB_RDONLY; @@ -148,10 +148,9 @@ static void ext2_put_super (struct super_block * sb) ext2_quota_off_umount(sb); - if (sbi->s_ea_block_cache) { - ext2_xattr_destroy_cache(sbi->s_ea_block_cache); - sbi->s_ea_block_cache = NULL; - } + ext2_xattr_destroy_cache(sbi->s_ea_block_cache); + sbi->s_ea_block_cache = NULL; + if (!sb_rdonly(sb)) { struct ext2_super_block *es = sbi->s_es; @@ -1198,8 +1197,7 @@ cantfind_ext2: sb->s_id); goto failed_mount; failed_mount3: - if (sbi->s_ea_block_cache) - ext2_xattr_destroy_cache(sbi->s_ea_block_cache); + ext2_xattr_destroy_cache(sbi->s_ea_block_cache); percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index dd8f10db82e9..4f30876ee325 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -835,7 +835,8 @@ ext2_xattr_cache_insert(struct mb_cache *cache, struct buffer_head *bh) __u32 hash = le32_to_cpu(HDR(bh)->h_hash); int error; - error = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr, 1); + error = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr, + true); if (error) { if (error == -EBUSY) { ea_bdebug(bh, "already in cache (%d cache entries)", diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index c1d570ee1d9f..8c7bbf3e566d 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -248,7 +248,8 @@ retry: error = posix_acl_update_mode(inode, &mode, &acl); if (error) goto out_stop; - update_mode = 1; + if (mode != inode->i_mode) + update_mode = 1; } error = __ext4_set_acl(handle, inode, type, acl, 0 /* xattr_flags */); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3f89d0ab08fc..185a05d3257e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2454,8 +2454,19 @@ int do_journal_get_write_access(handle_t *handle, #define FALL_BACK_TO_NONDELALLOC 1 #define CONVERT_INLINE_DATA 2 -extern struct inode *ext4_iget(struct super_block *, unsigned long); -extern struct inode *ext4_iget_normal(struct super_block *, unsigned long); +typedef enum { + EXT4_IGET_NORMAL = 0, + EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */ + EXT4_IGET_HANDLE = 0x0002 /* Inode # is from a handle */ +} ext4_iget_flags; + +extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, + ext4_iget_flags flags, const char *function, + unsigned int line); + +#define ext4_iget(sb, ino, flags) \ + __ext4_iget((sb), (ino), (flags), __func__, __LINE__) + extern int ext4_write_inode(struct inode *, struct writeback_control *); extern int ext4_setattr(struct dentry *, struct iattr *); extern int ext4_getattr(const struct path *, struct kstat *, u32, unsigned int); @@ -2538,6 +2549,8 @@ extern int ext4_group_extend(struct super_block *sb, extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ +extern struct
buffer_head *ext4_sb_bread(struct super_block *sb, + sector_t block, int op_flags); extern int ext4_seq_options_show(struct seq_file *seq, void *offset); extern int ext4_calculate_overhead(struct super_block *sb); extern void ext4_superblock_csum_set(struct super_block *sb); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 014f6a698cb7..7ff14a1adba3 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1225,7 +1225,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) if (!ext4_test_bit(bit, bitmap_bh->b_data)) goto bad_orphan; - inode = ext4_iget(sb, ino); + inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { err = PTR_ERR(inode); ext4_error(sb, "couldn't read orphan inode %lu (err %d)", diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 9c4bac18cc6c..27373d88b5f0 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -705,8 +705,11 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, if (!PageUptodate(page)) { ret = ext4_read_inline_page(inode, page); - if (ret < 0) + if (ret < 0) { + unlock_page(page); + put_page(page); goto out_up_read; + } } ret = 1; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 22a9d8159720..9affabd07682 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4817,7 +4817,9 @@ static inline u64 ext4_inode_peek_iversion(const struct inode *inode) return inode_peek_iversion(inode); } -struct inode *ext4_iget(struct super_block *sb, unsigned long ino) +struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, + ext4_iget_flags flags, const char *function, + unsigned int line) { struct ext4_iloc iloc; struct ext4_inode *raw_inode; @@ -4831,6 +4833,18 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) gid_t i_gid; projid_t i_projid; + if (((flags & EXT4_IGET_NORMAL) && + (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)) || + (ino < EXT4_ROOT_INO) || + (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) { + if (flags & EXT4_IGET_HANDLE) + return ERR_PTR(-ESTALE); + __ext4_error(sb, function, line, + "inode #%lu: comm %s: iget: illegal inode #", + ino, current->comm); + return ERR_PTR(-EFSCORRUPTED); + } + inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); @@ -4846,18 +4860,26 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) raw_inode = ext4_raw_inode(&iloc); if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) { - EXT4_ERROR_INODE(inode, "root inode unallocated"); + ext4_error_inode(inode, function, line, 0, + "iget: root inode unallocated"); ret = -EFSCORRUPTED; goto bad_inode; } + if ((flags & EXT4_IGET_HANDLE) && + (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) { + ret = -ESTALE; + goto bad_inode; + } + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > EXT4_INODE_SIZE(inode->i_sb) || (ei->i_extra_isize & 3)) { - EXT4_ERROR_INODE(inode, - "bad extra_isize %u (inode size %u)", + ext4_error_inode(inode, function, line, 0, + "iget: bad extra_isize %u " + "(inode size %u)", ei->i_extra_isize, EXT4_INODE_SIZE(inode->i_sb)); ret = -EFSCORRUPTED; @@ -4879,7 +4901,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) } if (!ext4_inode_csum_verify(inode, raw_inode, ei)) { - EXT4_ERROR_INODE(inode, "checksum invalid"); + ext4_error_inode(inode, function, line, 0, + "iget: checksum invalid"); ret = -EFSBADCRC; goto bad_inode; } @@ -4936,7 +4959,8 @@ struct inode 
*ext4_iget(struct super_block *sb, unsigned long ino) ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(sb, raw_inode); if ((size = i_size_read(inode)) < 0) { - EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size); + ext4_error_inode(inode, function, line, 0, + "iget: bad i_size value: %lld", size); ret = -EFSCORRUPTED; goto bad_inode; } @@ -5012,7 +5036,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ret = 0; if (ei->i_file_acl && !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { - EXT4_ERROR_INODE(inode, "bad extended attribute block %llu", + ext4_error_inode(inode, function, line, 0, + "iget: bad extended attribute block %llu", ei->i_file_acl); ret = -EFSCORRUPTED; goto bad_inode; @@ -5040,8 +5065,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) } else if (S_ISLNK(inode->i_mode)) { /* VFS does not allow setting these so must be corruption */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { - EXT4_ERROR_INODE(inode, - "immutable or append flags not allowed on symlinks"); + ext4_error_inode(inode, function, line, 0, + "iget: immutable or append flags " + "not allowed on symlinks"); ret = -EFSCORRUPTED; goto bad_inode; } @@ -5071,7 +5097,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) make_bad_inode(inode); } else { ret = -EFSCORRUPTED; - EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); + ext4_error_inode(inode, function, line, 0, + "iget: bogus i_mode (%o)", inode->i_mode); goto bad_inode; } brelse(iloc.bh); @@ -5085,13 +5112,6 @@ bad_inode: return ERR_PTR(ret); } -struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino) -{ - if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) - return ERR_PTR(-EFSCORRUPTED); - return ext4_iget(sb, ino); -} - static int ext4_inode_blocks_set(handle_t *handle, struct ext4_inode *raw_inode, struct ext4_inode_info *ei) @@ -5380,9 +5400,13 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; - if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC) || + sb_rdonly(inode->i_sb)) return 0; + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + return -EIO; + if (EXT4_SB(inode->i_sb)->s_journal) { if (ext4_journal_current_handle()) { jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); @@ -5398,7 +5422,8 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) return 0; - err = ext4_force_commit(inode->i_sb); + err = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal, + EXT4_I(inode)->i_sync_tid); } else { struct ext4_iloc iloc; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 0edee31913d1..d37dafa1d133 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -125,7 +125,7 @@ static long swap_inode_boot_loader(struct super_block *sb, !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) return -EPERM; - inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); + inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL); if (IS_ERR(inode_bl)) return PTR_ERR(inode_bl); ei_bl = EXT4_I(inode_bl); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 61a9d1927817..b1e4d359f73b 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -116,9 +116,9 @@ static int update_ind_extent_range(handle_t *handle, struct inode *inode, int i, retval = 0; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - bh = sb_bread(inode->i_sb, pblock); - if (!bh) - return 
-EIO; + bh = ext4_sb_bread(inode->i_sb, pblock, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); i_data = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { @@ -145,9 +145,9 @@ static int update_dind_extent_range(handle_t *handle, struct inode *inode, int i, retval = 0; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - bh = sb_bread(inode->i_sb, pblock); - if (!bh) - return -EIO; + bh = ext4_sb_bread(inode->i_sb, pblock, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); i_data = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { @@ -175,9 +175,9 @@ static int update_tind_extent_range(handle_t *handle, struct inode *inode, int i, retval = 0; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - bh = sb_bread(inode->i_sb, pblock); - if (!bh) - return -EIO; + bh = ext4_sb_bread(inode->i_sb, pblock, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); i_data = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { @@ -224,9 +224,9 @@ static int free_dind_blocks(handle_t *handle, struct buffer_head *bh; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - bh = sb_bread(inode->i_sb, le32_to_cpu(i_data)); - if (!bh) - return -EIO; + bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); tmp_idata = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { @@ -254,9 +254,9 @@ static int free_tind_blocks(handle_t *handle, struct buffer_head *bh; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - bh = sb_bread(inode->i_sb, le32_to_cpu(i_data)); - if (!bh) - return -EIO; + bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); tmp_idata = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { @@ -382,9 +382,9 @@ static int free_ext_idx(handle_t *handle, struct inode *inode, struct ext4_extent_header *eh; block = ext4_idx_pblock(ix); - bh = sb_bread(inode->i_sb, block); - if (!bh) - return -EIO; + bh = ext4_sb_bread(inode->i_sb, block, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); eh = (struct ext4_extent_header *)bh->b_data; if (eh->eh_depth != 0) { @@ -535,22 +535,22 @@ int ext4_ext_migrate(struct inode *inode) if (i_data[EXT4_IND_BLOCK]) { retval = update_ind_extent_range(handle, tmp_inode, le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb); - if (retval) - goto err_out; + if (retval) + goto err_out; } else lb.curr_block += max_entries; if (i_data[EXT4_DIND_BLOCK]) { retval = update_dind_extent_range(handle, tmp_inode, le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb); - if (retval) - goto err_out; + if (retval) + goto err_out; } else lb.curr_block += max_entries * max_entries; if (i_data[EXT4_TIND_BLOCK]) { retval = update_tind_extent_range(handle, tmp_inode, le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb); - if (retval) - goto err_out; + if (retval) + goto err_out; } /* * Build the last extent diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 437f71fe83ae..2b928eb07fa2 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1571,7 +1571,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi dentry); return ERR_PTR(-EFSCORRUPTED); } - inode = ext4_iget_normal(dir->i_sb, ino); + inode = ext4_iget(dir->i_sb, ino, EXT4_IGET_NORMAL); if (inode == ERR_PTR(-ESTALE)) { EXT4_ERROR_INODE(dir, "deleted inode referenced: %u", @@ -1613,7 +1613,7 @@ struct dentry *ext4_get_parent(struct dentry *child) return ERR_PTR(-EFSCORRUPTED); } - return d_obtain_alias(ext4_iget_normal(child->d_sb, ino)); + return d_obtain_alias(ext4_iget(child->d_sb, ino, EXT4_IGET_NORMAL)); } /* diff --git 
a/fs/ext4/page-io.c b/fs/ext4/page-io.c index db7590178dfc..2aa62d58d8dd 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -374,13 +374,13 @@ static int io_submit_init_bio(struct ext4_io_submit *io, bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); if (!bio) return -ENOMEM; - wbc_init_bio(io->io_wbc, bio); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio_set_dev(bio, bh->b_bdev); bio->bi_end_io = ext4_end_bio; bio->bi_private = ext4_get_io_end(io->io_end); io->io_bio = bio; io->io_next_block = bh->b_blocknr; + wbc_init_bio(io->io_wbc, bio); return 0; } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index a5efee34415f..48421de803b7 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -127,10 +127,12 @@ static int verify_group_input(struct super_block *sb, else if (free_blocks_count < 0) ext4_warning(sb, "Bad blocks count %u", input->blocks_count); - else if (!(bh = sb_bread(sb, end - 1))) + else if (IS_ERR(bh = ext4_sb_bread(sb, end - 1, 0))) { + err = PTR_ERR(bh); + bh = NULL; ext4_warning(sb, "Cannot read last block (%llu)", end - 1); - else if (outside(input->block_bitmap, start, end)) + } else if (outside(input->block_bitmap, start, end)) ext4_warning(sb, "Block bitmap not in group (block %llu)", (unsigned long long)input->block_bitmap); else if (outside(input->inode_bitmap, start, end)) @@ -781,11 +783,11 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, struct ext4_super_block *es = EXT4_SB(sb)->s_es; unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; - struct buffer_head **o_group_desc, **n_group_desc; - struct buffer_head *dind; - struct buffer_head *gdb_bh; + struct buffer_head **o_group_desc, **n_group_desc = NULL; + struct buffer_head *dind = NULL; + struct buffer_head *gdb_bh = NULL; int gdbackups; - struct ext4_iloc iloc; + struct ext4_iloc iloc = { .bh = NULL }; __le32 *data; int err; @@ -794,21 +796,22 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n", gdb_num); - gdb_bh = sb_bread(sb, gdblock); - if (!gdb_bh) - return -EIO; + gdb_bh = ext4_sb_bread(sb, gdblock, 0); + if (IS_ERR(gdb_bh)) + return PTR_ERR(gdb_bh); gdbackups = verify_reserved_gdb(sb, group, gdb_bh); if (gdbackups < 0) { err = gdbackups; - goto exit_bh; + goto errout; } data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK; - dind = sb_bread(sb, le32_to_cpu(*data)); - if (!dind) { - err = -EIO; - goto exit_bh; + dind = ext4_sb_bread(sb, le32_to_cpu(*data), 0); + if (IS_ERR(dind)) { + err = PTR_ERR(dind); + dind = NULL; + goto errout; } data = (__le32 *)dind->b_data; @@ -816,18 +819,18 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, ext4_warning(sb, "new group %u GDT block %llu not reserved", group, gdblock); err = -EINVAL; - goto exit_dind; + goto errout; } BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); if (unlikely(err)) - goto exit_dind; + goto errout; BUFFER_TRACE(gdb_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, gdb_bh); if (unlikely(err)) - goto exit_dind; + goto errout; BUFFER_TRACE(dind, "get_write_access"); err = ext4_journal_get_write_access(handle, dind); @@ -837,7 +840,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, /* ext4_reserve_inode_write() gets a reference on the iloc */ err = ext4_reserve_inode_write(handle, inode, &iloc); if (unlikely(err)) - goto exit_dind; + goto errout; n_group_desc = 
ext4_kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *), @@ -846,7 +849,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, err = -ENOMEM; ext4_warning(sb, "not enough memory for %lu groups", gdb_num + 1); - goto exit_inode; + goto errout; } /* @@ -862,7 +865,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, err = ext4_handle_dirty_metadata(handle, NULL, dind); if (unlikely(err)) { ext4_std_error(sb, err); - goto exit_inode; + goto errout; } inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> (9 - EXT4_SB(sb)->s_cluster_bits); @@ -871,8 +874,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); if (unlikely(err)) { ext4_std_error(sb, err); - iloc.bh = NULL; - goto exit_inode; + goto errout; } brelse(dind); @@ -888,15 +890,11 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, err = ext4_handle_dirty_super(handle, sb); if (err) ext4_std_error(sb, err); - return err; - -exit_inode: +errout: kvfree(n_group_desc); brelse(iloc.bh); -exit_dind: brelse(dind); -exit_bh: brelse(gdb_bh); ext4_debug("leaving with error %d\n", err); @@ -916,9 +914,9 @@ static int add_new_gdb_meta_bg(struct super_block *sb, gdblock = ext4_meta_bg_first_block_no(sb, group) + ext4_bg_has_super(sb, group); - gdb_bh = sb_bread(sb, gdblock); - if (!gdb_bh) - return -EIO; + gdb_bh = ext4_sb_bread(sb, gdblock, 0); + if (IS_ERR(gdb_bh)) + return PTR_ERR(gdb_bh); n_group_desc = ext4_kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *), GFP_NOFS); @@ -975,9 +973,10 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, return -ENOMEM; data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK; - dind = sb_bread(sb, le32_to_cpu(*data)); - if (!dind) { - err = -EIO; + dind = ext4_sb_bread(sb, le32_to_cpu(*data), 0); + if (IS_ERR(dind)) { + err = PTR_ERR(dind); + dind = NULL; goto exit_free; } @@ -996,9 +995,10 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, err = -EINVAL; goto exit_bh; } - primary[res] = sb_bread(sb, blk); - if (!primary[res]) { - err = -EIO; + primary[res] = ext4_sb_bread(sb, blk, 0); + if (IS_ERR(primary[res])) { + err = PTR_ERR(primary[res]); + primary[res] = NULL; goto exit_bh; } gdbackups = verify_reserved_gdb(sb, group, primary[res]); @@ -1631,13 +1631,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) } if (reserved_gdb || gdb_off == 0) { - if (ext4_has_feature_resize_inode(sb) || + if (!ext4_has_feature_resize_inode(sb) || !le16_to_cpu(es->s_reserved_gdt_blocks)) { ext4_warning(sb, "No reserved GDT blocks, can't resize"); return -EPERM; } - inode = ext4_iget(sb, EXT4_RESIZE_INO); + inode = ext4_iget(sb, EXT4_RESIZE_INO, EXT4_IGET_SPECIAL); if (IS_ERR(inode)) { ext4_warning(sb, "Error opening resize inode"); return PTR_ERR(inode); @@ -1965,7 +1965,8 @@ retry: } if (!resize_inode) - resize_inode = ext4_iget(sb, EXT4_RESIZE_INO); + resize_inode = ext4_iget(sb, EXT4_RESIZE_INO, + EXT4_IGET_SPECIAL); if (IS_ERR(resize_inode)) { ext4_warning(sb, "Error opening resize inode"); return PTR_ERR(resize_inode); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 53ff6c2a26ed..d6c142d73d99 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -140,6 +140,29 @@ MODULE_ALIAS_FS("ext3"); MODULE_ALIAS("ext3"); #define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type) +/* + * This works like sb_bread() except it uses ERR_PTR for error + * returns. 
Currently with sb_bread it's impossible to distinguish + * between ENOMEM and EIO situations (since both result in a NULL + * return. + */ +struct buffer_head * +ext4_sb_bread(struct super_block *sb, sector_t block, int op_flags) +{ + struct buffer_head *bh = sb_getblk(sb, block); + + if (bh == NULL) + return ERR_PTR(-ENOMEM); + if (buffer_uptodate(bh)) + return bh; + ll_rw_block(REQ_OP_READ, REQ_META | op_flags, 1, &bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + put_bh(bh); + return ERR_PTR(-EIO); +} + static int ext4_verify_csum_type(struct super_block *sb, struct ext4_super_block *es) { @@ -1000,14 +1023,13 @@ static void ext4_put_super(struct super_block *sb) invalidate_bdev(sbi->journal_bdev); ext4_blkdev_remove(sbi); } - if (sbi->s_ea_inode_cache) { - ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); - sbi->s_ea_inode_cache = NULL; - } - if (sbi->s_ea_block_cache) { - ext4_xattr_destroy_cache(sbi->s_ea_block_cache); - sbi->s_ea_block_cache = NULL; - } + + ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); + sbi->s_ea_inode_cache = NULL; + + ext4_xattr_destroy_cache(sbi->s_ea_block_cache); + sbi->s_ea_block_cache = NULL; + if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); brelse(sbi->s_sbh); @@ -1151,20 +1173,11 @@ static struct inode *ext4_nfs_get_inode(struct super_block *sb, { struct inode *inode; - if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) - return ERR_PTR(-ESTALE); - if (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) - return ERR_PTR(-ESTALE); - - /* iget isn't really right if the inode is currently unallocated!! - * - * ext4_read_inode will return a bad_inode if the inode had been - * deleted, so we should be safe. - * + /* * Currently we don't know the generation for parent directory, so * a generation of 0 means "accept any" */ - inode = ext4_iget_normal(sb, ino); + inode = ext4_iget(sb, ino, EXT4_IGET_HANDLE); if (IS_ERR(inode)) return ERR_CAST(inode); if (generation && inode->i_generation != generation) { @@ -1189,6 +1202,16 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, ext4_nfs_get_inode); } +static int ext4_nfs_commit_metadata(struct inode *inode) +{ + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL + }; + + trace_ext4_nfs_commit_metadata(inode); + return ext4_write_inode(inode, &wbc); +} + /* * Try to release metadata pages (indirect blocks, directories) which are * mapped via the block device. Since these pages could have journal heads @@ -1393,6 +1416,7 @@ static const struct export_operations ext4_export_ops = { .fh_to_dentry = ext4_fh_to_dentry, .fh_to_parent = ext4_fh_to_parent, .get_parent = ext4_get_parent, + .commit_metadata = ext4_nfs_commit_metadata, }; enum { @@ -1939,7 +1963,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, #ifdef CONFIG_FS_DAX ext4_msg(sb, KERN_WARNING, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - sbi->s_mount_opt |= m->mount_opt; + sbi->s_mount_opt |= m->mount_opt; #else ext4_msg(sb, KERN_INFO, "dax option not supported"); return -1; @@ -3842,12 +3866,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (ext4_has_feature_inline_data(sb)) { ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" " that may contain inline data"); - sbi->s_mount_opt &= ~EXT4_MOUNT_DAX; + goto failed_mount; } if (!bdev_dax_supported(sb->s_bdev, blocksize)) { ext4_msg(sb, KERN_ERR, - "DAX unsupported by block device. 
Turning off DAX."); - sbi->s_mount_opt &= ~EXT4_MOUNT_DAX; + "DAX unsupported by block device."); + goto failed_mount; } } @@ -4328,7 +4352,7 @@ no_journal: * so we can safely mount the rest of the filesystem now. */ - root = ext4_iget(sb, EXT4_ROOT_INO); + root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL); if (IS_ERR(root)) { ext4_msg(sb, KERN_ERR, "get root inode failed"); ret = PTR_ERR(root); @@ -4522,14 +4546,12 @@ failed_mount4: if (EXT4_SB(sb)->rsv_conversion_wq) destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); failed_mount_wq: - if (sbi->s_ea_inode_cache) { - ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); - sbi->s_ea_inode_cache = NULL; - } - if (sbi->s_ea_block_cache) { - ext4_xattr_destroy_cache(sbi->s_ea_block_cache); - sbi->s_ea_block_cache = NULL; - } + ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); + sbi->s_ea_inode_cache = NULL; + + ext4_xattr_destroy_cache(sbi->s_ea_block_cache); + sbi->s_ea_block_cache = NULL; + if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; @@ -4598,7 +4620,7 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, * happen if we iget() an unused inode, as the subsequent iput() * will try to delete it. */ - journal_inode = ext4_iget(sb, journal_inum); + journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL); if (IS_ERR(journal_inode)) { ext4_msg(sb, KERN_ERR, "no journal found"); return NULL; @@ -5680,7 +5702,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, if (!qf_inums[type]) return -EPERM; - qf_inode = ext4_iget(sb, qf_inums[type]); + qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL); if (IS_ERR(qf_inode)) { ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]); return PTR_ERR(qf_inode); @@ -5690,9 +5712,9 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, qf_inode->i_flags |= S_NOQUOTA; lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA); err = dquot_enable(qf_inode, type, format_id, flags); - iput(qf_inode); if (err) lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL); + iput(qf_inode); return err; } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 7643d52c776c..86ed9c686249 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -384,7 +384,7 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, struct inode *inode; int err; - inode = ext4_iget(parent->i_sb, ea_ino); + inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { err = PTR_ERR(inode); ext4_error(parent->i_sb, @@ -522,14 +522,13 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); - error = -ENODATA; if (!EXT4_I(inode)->i_file_acl) - goto cleanup; + return -ENODATA; ea_idebug(inode, "reading block %llu", (unsigned long long)EXT4_I(inode)->i_file_acl); - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - if (!bh) - goto cleanup; + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) + return PTR_ERR(bh); ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); error = ext4_xattr_check_block(inode, bh); @@ -696,26 +695,23 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); - error = 0; if (!EXT4_I(inode)->i_file_acl) - goto cleanup; + return 0; ea_idebug(inode, "reading block 
%llu", (unsigned long long)EXT4_I(inode)->i_file_acl); - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - error = -EIO; - if (!bh) - goto cleanup; + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) + return PTR_ERR(bh); ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); error = ext4_xattr_check_block(inode, bh); if (error) goto cleanup; ext4_xattr_block_cache_insert(EA_BLOCK_CACHE(inode), bh); - error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); - + error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, + buffer_size); cleanup: brelse(bh); - return error; } @@ -830,9 +826,9 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) } if (EXT4_I(inode)->i_file_acl) { - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - if (!bh) { - ret = -EIO; + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) { + ret = PTR_ERR(bh); goto out; } @@ -1486,7 +1482,8 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value, } while (ce) { - ea_inode = ext4_iget(inode->i_sb, ce->e_value); + ea_inode = ext4_iget(inode->i_sb, ce->e_value, + EXT4_IGET_NORMAL); if (!IS_ERR(ea_inode) && !is_bad_inode(ea_inode) && (EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) && @@ -1821,16 +1818,15 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, if (EXT4_I(inode)->i_file_acl) { /* The inode already has an extended attribute block. */ - bs->bh = sb_bread(sb, EXT4_I(inode)->i_file_acl); - error = -EIO; - if (!bs->bh) - goto cleanup; + bs->bh = ext4_sb_bread(sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bs->bh)) + return PTR_ERR(bs->bh); ea_bdebug(bs->bh, "b_count=%d, refcount=%d", atomic_read(&(bs->bh->b_count)), le32_to_cpu(BHDR(bs->bh)->h_refcount)); error = ext4_xattr_check_block(inode, bs->bh); if (error) - goto cleanup; + return error; /* Find the named attribute. 
*/ bs->s.base = BHDR(bs->bh); bs->s.first = BFIRST(bs->bh); @@ -1839,13 +1835,10 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, error = xattr_find_entry(inode, &bs->s.here, bs->s.end, i->name_index, i->name, 1); if (error && error != -ENODATA) - goto cleanup; + return error; bs->s.not_found = error; } - error = 0; - -cleanup: - return error; + return 0; } static int @@ -2274,9 +2267,9 @@ static struct buffer_head *ext4_xattr_get_block(struct inode *inode) if (!EXT4_I(inode)->i_file_acl) return NULL; - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - if (!bh) - return ERR_PTR(-EIO); + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) + return bh; error = ext4_xattr_check_block(inode, bh); if (error) { brelse(bh); @@ -2729,7 +2722,7 @@ retry: base = IFIRST(header); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; min_offs = end - base; - total_ino = sizeof(struct ext4_xattr_ibody_header); + total_ino = sizeof(struct ext4_xattr_ibody_header) + sizeof(u32); error = xattr_check_inode(inode, header, end); if (error) @@ -2746,10 +2739,11 @@ retry: if (EXT4_I(inode)->i_file_acl) { struct buffer_head *bh; - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - error = -EIO; - if (!bh) + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) { + error = PTR_ERR(bh); goto cleanup; + } error = ext4_xattr_check_block(inode, bh); if (error) { brelse(bh); @@ -2903,11 +2897,12 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, } if (EXT4_I(inode)->i_file_acl) { - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - if (!bh) { - EXT4_ERROR_INODE(inode, "block %llu read error", - EXT4_I(inode)->i_file_acl); - error = -EIO; + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) { + error = PTR_ERR(bh); + if (error == -EIO) + EXT4_ERROR_INODE(inode, "block %llu read error", + EXT4_I(inode)->i_file_acl); goto cleanup; } error = ext4_xattr_check_block(inode, bh); @@ -3060,8 +3055,10 @@ ext4_xattr_block_cache_find(struct inode *inode, while (ce) { struct buffer_head *bh; - bh = sb_bread(inode->i_sb, ce->e_value); - if (!bh) { + bh = ext4_sb_bread(inode->i_sb, ce->e_value, REQ_PRIO); + if (IS_ERR(bh)) { + if (PTR_ERR(bh) == -ENOMEM) + return NULL; EXT4_ERROR_INODE(inode, "block %lu read error", (unsigned long)ce->e_value); } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index fa707cdd4120..63e599524085 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -160,7 +160,7 @@ static void *f2fs_acl_to_disk(struct f2fs_sb_info *sbi, return (void *)f2fs_acl; fail: - kfree(f2fs_acl); + kvfree(f2fs_acl); return ERR_PTR(-EINVAL); } @@ -190,7 +190,7 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type, acl = NULL; else acl = ERR_PTR(retval); - kfree(value); + kvfree(value); return acl; } @@ -240,7 +240,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, error = f2fs_setxattr(inode, name_index, "", value, size, ipage, 0); - kfree(value); + kvfree(value); if (!error) set_cached_acl(inode, type, acl); @@ -352,12 +352,14 @@ static int f2fs_acl_create(struct inode *dir, umode_t *mode, return PTR_ERR(p); clone = f2fs_acl_clone(p, GFP_NOFS); - if (!clone) - goto no_mem; + if (!clone) { + ret = -ENOMEM; + goto release_acl; + } ret = f2fs_acl_create_masq(clone, mode); if (ret < 0) - goto no_mem_clone; + goto release_clone; if (ret == 0) posix_acl_release(clone); @@ -371,11 +373,11 @@ 
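The kfree() -> kvfree() conversions in fs/f2fs/acl.c (and throughout the f2fs changes below) pair with the f2fs_kmalloc() change later in this diff: once that helper may fall back to kvmalloc(), its return can be either slab- or vmalloc-backed, and kvfree() is the release that handles both. A sketch of the intended allocate/free pairing, with a made-up function name used only for illustration:

static int example_alloc_pattern(struct f2fs_sb_info *sbi, size_t size)
{
	void *buf;

	buf = f2fs_kmalloc(sbi, size, GFP_NOFS);	/* kmalloc(), else kvmalloc() */
	if (!buf)
		return -ENOMEM;
	/* ... fill and consume buf ... */
	kvfree(buf);	/* safe for both slab- and vmalloc-backed memory */
	return 0;
}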
static int f2fs_acl_create(struct inode *dir, umode_t *mode, return 0; -no_mem_clone: +release_clone: posix_acl_release(clone); -no_mem: +release_acl: posix_acl_release(p); - return -ENOMEM; + return ret; } int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 9c28ea439e0b..f955cd3e0677 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -44,7 +44,7 @@ repeat: cond_resched(); goto repeat; } - f2fs_wait_on_page_writeback(page, META, true); + f2fs_wait_on_page_writeback(page, META, true, true); if (!PageUptodate(page)) SetPageUptodate(page); return page; @@ -370,9 +370,8 @@ continue_unlock: goto continue_unlock; } - f2fs_wait_on_page_writeback(page, META, true); + f2fs_wait_on_page_writeback(page, META, true, true); - BUG_ON(PageWriteback(page)); if (!clear_page_dirty_for_io(page)) goto continue_unlock; @@ -911,7 +910,7 @@ free_fail_no_cp: f2fs_put_page(cp1, 1); f2fs_put_page(cp2, 1); fail_no_cp: - kfree(sbi->ckpt); + kvfree(sbi->ckpt); return -EINVAL; } @@ -1290,11 +1289,11 @@ static void commit_checkpoint(struct f2fs_sb_info *sbi, struct page *page = f2fs_grab_meta_page(sbi, blk_addr); int err; + f2fs_wait_on_page_writeback(page, META, true, true); + memcpy(page_address(page), src, PAGE_SIZE); - set_page_dirty(page); - f2fs_wait_on_page_writeback(page, META, true); - f2fs_bug_on(sbi, PageWriteback(page)); + set_page_dirty(page); if (unlikely(!clear_page_dirty_for_io(page))) f2fs_bug_on(sbi, 1); @@ -1328,11 +1327,9 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) int err; /* Flush all the NAT/SIT pages */ - while (get_pages(sbi, F2FS_DIRTY_META)) { - f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); - if (unlikely(f2fs_cp_error(sbi))) - break; - } + f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_META) && + !f2fs_cp_error(sbi)); /* * modify checkpoint @@ -1405,14 +1402,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) for (i = 0; i < nm_i->nat_bits_blocks; i++) f2fs_update_meta_page(sbi, nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS), blk + i); - - /* Flush all the NAT BITS pages */ - while (get_pages(sbi, F2FS_DIRTY_META)) { - f2fs_sync_meta_pages(sbi, META, LONG_MAX, - FS_CP_META_IO); - if (unlikely(f2fs_cp_error(sbi))) - break; - } } /* write out checkpoint buffer at block 0 */ @@ -1448,6 +1437,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Here, we have one bio having CP pack except cp pack 2 page */ f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_META) && + !f2fs_cp_error(sbi)); /* wait for previous submitted meta pages writeback */ f2fs_wait_on_all_pages_writeback(sbi); @@ -1465,7 +1456,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) * invalidate intermediate page cache borrowed from meta inode * which are used for migration of encrypted inode's blocks. 
*/ - if (f2fs_sb_has_encrypt(sbi->sb)) + if (f2fs_sb_has_encrypt(sbi)) invalidate_mapping_pages(META_MAPPING(sbi), MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b293cb3e27a2..f91d8630c9a2 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -372,29 +372,6 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, return false; } -static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode, - struct page *page, nid_t ino, - enum page_type type) -{ - enum page_type btype = PAGE_TYPE_OF_BIO(type); - enum temp_type temp; - struct f2fs_bio_info *io; - bool ret = false; - - for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { - io = sbi->write_io[btype] + temp; - - down_read(&io->io_rwsem); - ret = __has_merged_page(io, inode, page, ino); - up_read(&io->io_rwsem); - - /* TODO: use HOT temp only for meta pages now. */ - if (ret || btype == META) - break; - } - return ret; -} - static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type, enum temp_type temp) { @@ -420,13 +397,19 @@ static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, nid_t ino, enum page_type type, bool force) { enum temp_type temp; - - if (!force && !has_merged_page(sbi, inode, page, ino, type)) - return; + bool ret = true; for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { + if (!force) { + enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct f2fs_bio_info *io = sbi->write_io[btype] + temp; - __f2fs_submit_merged_write(sbi, type, temp); + down_read(&io->io_rwsem); + ret = __has_merged_page(io, inode, page, ino); + up_read(&io->io_rwsem); + } + if (ret) + __f2fs_submit_merged_write(sbi, type, temp); /* TODO: use HOT temp only for meta pages now. */ if (type >= META) @@ -643,7 +626,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn) */ void f2fs_set_data_blkaddr(struct dnode_of_data *dn) { - f2fs_wait_on_page_writeback(dn->node_page, NODE, true); + f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true); __set_data_blkaddr(dn); if (set_page_dirty(dn->node_page)) dn->node_changed = true; @@ -673,7 +656,7 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) trace_f2fs_reserve_new_blocks(dn->inode, dn->nid, dn->ofs_in_node, count); - f2fs_wait_on_page_writeback(dn->node_page, NODE, true); + f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true); for (; count > 0; dn->ofs_in_node++) { block_t blkaddr = datablock_addr(dn->inode, @@ -957,6 +940,9 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) return err; } + if (direct_io && allow_outplace_dio(inode, iocb, from)) + return 0; + if (is_inode_flag_set(inode, FI_NO_PREALLOC)) return 0; @@ -970,6 +956,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_next_pgofs = NULL; map.m_next_extent = NULL; map.m_seg_type = NO_CHECK_TYPE; + map.m_may_create = true; if (direct_io) { map.m_seg_type = f2fs_rw_hint_to_seg_type(iocb->ki_hint); @@ -1028,7 +1015,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, unsigned int maxblocks = map->m_len; struct dnode_of_data dn; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - int mode = create ? ALLOC_NODE : LOOKUP_NODE; + int mode = map->m_may_create ? 
ALLOC_NODE : LOOKUP_NODE; pgoff_t pgofs, end_offset, end; int err = 0, ofs = 1; unsigned int ofs_in_node, last_ofs_in_node; @@ -1048,6 +1035,10 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, end = pgofs + maxblocks; if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) { + if (test_opt(sbi, LFS) && flag == F2FS_GET_BLOCK_DIO && + map->m_may_create) + goto next_dnode; + map->m_pblk = ei.blk + pgofs - ei.fofs; map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs); map->m_flags = F2FS_MAP_MAPPED; @@ -1062,7 +1053,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, } next_dnode: - if (create) + if (map->m_may_create) __do_map_lock(sbi, flag, true); /* When reading holes, we need its node page */ @@ -1099,11 +1090,13 @@ next_block: if (is_valid_data_blkaddr(sbi, blkaddr)) { /* use out-place-update for driect IO under LFS mode */ - if (test_opt(sbi, LFS) && create && - flag == F2FS_GET_BLOCK_DIO) { + if (test_opt(sbi, LFS) && flag == F2FS_GET_BLOCK_DIO && + map->m_may_create) { err = __allocate_data_block(&dn, map->m_seg_type); - if (!err) + if (!err) { + blkaddr = dn.data_blkaddr; set_inode_flag(inode, FI_APPEND_WRITE); + } } } else { if (create) { @@ -1209,7 +1202,7 @@ skip: f2fs_put_dnode(&dn); - if (create) { + if (map->m_may_create) { __do_map_lock(sbi, flag, false); f2fs_balance_fs(sbi, dn.node_changed); } @@ -1235,7 +1228,7 @@ sync_out: } f2fs_put_dnode(&dn); unlock_out: - if (create) { + if (map->m_may_create) { __do_map_lock(sbi, flag, false); f2fs_balance_fs(sbi, dn.node_changed); } @@ -1257,6 +1250,7 @@ bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len) map.m_next_pgofs = NULL; map.m_next_extent = NULL; map.m_seg_type = NO_CHECK_TYPE; + map.m_may_create = false; last_lblk = F2FS_BLK_ALIGN(pos + len); while (map.m_lblk < last_lblk) { @@ -1271,7 +1265,7 @@ bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len) static int __get_data_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create, int flag, - pgoff_t *next_pgofs, int seg_type) + pgoff_t *next_pgofs, int seg_type, bool may_write) { struct f2fs_map_blocks map; int err; @@ -1281,6 +1275,7 @@ static int __get_data_block(struct inode *inode, sector_t iblock, map.m_next_pgofs = next_pgofs; map.m_next_extent = NULL; map.m_seg_type = seg_type; + map.m_may_create = may_write; err = f2fs_map_blocks(inode, &map, create, flag); if (!err) { @@ -1297,16 +1292,25 @@ static int get_data_block(struct inode *inode, sector_t iblock, { return __get_data_block(inode, iblock, bh_result, create, flag, next_pgofs, - NO_CHECK_TYPE); + NO_CHECK_TYPE, create); +} + +static int get_data_block_dio_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + return __get_data_block(inode, iblock, bh_result, create, + F2FS_GET_BLOCK_DIO, NULL, + f2fs_rw_hint_to_seg_type(inode->i_write_hint), + true); } static int get_data_block_dio(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DIO, NULL, - f2fs_rw_hint_to_seg_type( - inode->i_write_hint)); + F2FS_GET_BLOCK_DIO, NULL, + f2fs_rw_hint_to_seg_type(inode->i_write_hint), + false); } static int get_data_block_bmap(struct inode *inode, sector_t iblock, @@ -1318,7 +1322,7 @@ static int get_data_block_bmap(struct inode *inode, sector_t iblock, return __get_data_block(inode, iblock, bh_result, create, F2FS_GET_BLOCK_BMAP, NULL, - NO_CHECK_TYPE); + NO_CHECK_TYPE, 
create); } static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) @@ -1525,6 +1529,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, map.m_next_pgofs = NULL; map.m_next_extent = NULL; map.m_seg_type = NO_CHECK_TYPE; + map.m_may_create = false; for (; nr_pages; nr_pages--) { if (pages) { @@ -1855,6 +1860,8 @@ got_it: if (fio->need_lock == LOCK_REQ) f2fs_unlock_op(fio->sbi); err = f2fs_inplace_write_data(fio); + if (err && PageWriteback(page)) + end_page_writeback(page); trace_f2fs_do_write_data_page(fio->page, IPU); set_inode_flag(inode, FI_UPDATE_WRITE); return err; @@ -2142,12 +2149,11 @@ continue_unlock: if (PageWriteback(page)) { if (wbc->sync_mode != WB_SYNC_NONE) f2fs_wait_on_page_writeback(page, - DATA, true); + DATA, true, true); else goto continue_unlock; } - BUG_ON(PageWriteback(page)); if (!clear_page_dirty_for_io(page)) goto continue_unlock; @@ -2324,6 +2330,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, bool locked = false; struct extent_info ei = {0,0,0}; int err = 0; + int flag; /* * we already allocated all the blocks, so we don't need to get @@ -2333,9 +2340,15 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, !is_inode_flag_set(inode, FI_NO_PREALLOC)) return 0; + /* f2fs_lock_op avoids race between write CP and convert_inline_page */ + if (f2fs_has_inline_data(inode) && pos + len > MAX_INLINE_DATA(inode)) + flag = F2FS_GET_BLOCK_DEFAULT; + else + flag = F2FS_GET_BLOCK_PRE_AIO; + if (f2fs_has_inline_data(inode) || (pos & PAGE_MASK) >= i_size_read(inode)) { - __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + __do_map_lock(sbi, flag, true); locked = true; } restart: @@ -2373,6 +2386,7 @@ restart: f2fs_put_dnode(&dn); __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + WARN_ON(flag != F2FS_GET_BLOCK_PRE_AIO); locked = true; goto restart; } @@ -2386,7 +2400,7 @@ out: f2fs_put_dnode(&dn); unlock_out: if (locked) - __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); + __do_map_lock(sbi, flag, false); return err; } @@ -2457,7 +2471,7 @@ repeat: } } - f2fs_wait_on_page_writeback(page, DATA, false); + f2fs_wait_on_page_writeback(page, DATA, false, true); if (len == PAGE_SIZE || PageUptodate(page)) return 0; @@ -2548,6 +2562,53 @@ static int check_direct_IO(struct inode *inode, struct iov_iter *iter, return 0; } +static void f2fs_dio_end_io(struct bio *bio) +{ + struct f2fs_private_dio *dio = bio->bi_private; + + dec_page_count(F2FS_I_SB(dio->inode), + dio->write ? F2FS_DIO_WRITE : F2FS_DIO_READ); + + bio->bi_private = dio->orig_private; + bio->bi_end_io = dio->orig_end_io; + + kvfree(dio); + + bio_endio(bio); +} + +static void f2fs_dio_submit_bio(struct bio *bio, struct inode *inode, + loff_t file_offset) +{ + struct f2fs_private_dio *dio; + bool write = (bio_op(bio) == REQ_OP_WRITE); + int err; + + dio = f2fs_kzalloc(F2FS_I_SB(inode), + sizeof(struct f2fs_private_dio), GFP_NOFS); + if (!dio) { + err = -ENOMEM; + goto out; + } + + dio->inode = inode; + dio->orig_end_io = bio->bi_end_io; + dio->orig_private = bio->bi_private; + dio->write = write; + + bio->bi_end_io = f2fs_dio_end_io; + bio->bi_private = dio; + + inc_page_count(F2FS_I_SB(inode), + write ? 
F2FS_DIO_WRITE : F2FS_DIO_READ); + + submit_bio(bio); + return; +out: + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); +} + static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct address_space *mapping = iocb->ki_filp->f_mapping; @@ -2594,7 +2655,10 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) down_read(&fi->i_gc_rwsem[READ]); } - err = blockdev_direct_IO(iocb, inode, iter, get_data_block_dio); + err = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, + iter, rw == WRITE ? get_data_block_dio_write : + get_data_block_dio, NULL, f2fs_dio_submit_bio, + DIO_LOCKING | DIO_SKIP_HOLES); if (do_opu) up_read(&fi->i_gc_rwsem[READ]); @@ -2738,7 +2802,7 @@ int f2fs_migrate_page(struct address_space *mapping, */ extra_count = (atomic_written ? 1 : 0) - page_has_private(page); rc = migrate_page_move_mapping(mapping, newpage, - page, NULL, mode, extra_count); + page, mode, extra_count); if (rc != MIGRATEPAGE_SUCCESS) { if (atomic_written) mutex_unlock(&fi->inmem_lock); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 139b4d5c83d5..ebcc121920ba 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -53,6 +53,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->vw_cnt = atomic_read(&sbi->vw_cnt); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt); + si->nr_dio_read = get_pages(sbi, F2FS_DIO_READ); + si->nr_dio_write = get_pages(sbi, F2FS_DIO_WRITE); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); si->nr_rd_data = get_pages(sbi, F2FS_RD_DATA); @@ -62,7 +64,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->nr_flushed = atomic_read(&SM_I(sbi)->fcc_info->issued_flush); si->nr_flushing = - atomic_read(&SM_I(sbi)->fcc_info->issing_flush); + atomic_read(&SM_I(sbi)->fcc_info->queued_flush); si->flush_list_empty = llist_empty(&SM_I(sbi)->fcc_info->issue_list); } @@ -70,7 +72,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->nr_discarded = atomic_read(&SM_I(sbi)->dcc_info->issued_discard); si->nr_discarding = - atomic_read(&SM_I(sbi)->dcc_info->issing_discard); + atomic_read(&SM_I(sbi)->dcc_info->queued_discard); si->nr_discard_cmd = atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks; @@ -197,7 +199,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); si->base_mem += SIT_VBLOCK_MAP_SIZE; - if (sbi->segs_per_sec > 1) + if (__is_large_section(sbi)) si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry); si->base_mem += __bitmap_size(sbi, SIT_BITMAP); @@ -374,6 +376,8 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); + seq_printf(s, " - DIO (R: %4d, W: %4d)\n", + si->nr_dio_read, si->nr_dio_write); seq_printf(s, " - IO_R (Data: %4d, Node: %4d, Meta: %4d\n", si->nr_rd_data, si->nr_rd_node, si->nr_rd_meta); seq_printf(s, " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), " @@ -444,18 +448,7 @@ static int stat_show(struct seq_file *s, void *v) return 0; } -static int stat_open(struct inode *inode, struct file *file) -{ - return single_open(file, stat_show, inode->i_private); -} - -static const struct file_operations stat_fops = { - .owner = THIS_MODULE, - .open = stat_open, - 
.read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(stat); int f2fs_build_stats(struct f2fs_sb_info *sbi) { @@ -510,7 +503,7 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) list_del(&si->stat_list); mutex_unlock(&f2fs_stat_mutex); - kfree(si); + kvfree(si); } int __init f2fs_create_root_stats(void) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index bacc667950b6..50d0d36280fa 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -293,7 +293,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, { enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA; lock_page(page); - f2fs_wait_on_page_writeback(page, type, true); + f2fs_wait_on_page_writeback(page, type, true, true); de->ino = cpu_to_le32(inode->i_ino); set_de_type(de, inode->i_mode); set_page_dirty(page); @@ -307,7 +307,7 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage) { struct f2fs_inode *ri; - f2fs_wait_on_page_writeback(ipage, NODE, true); + f2fs_wait_on_page_writeback(ipage, NODE, true, true); /* copy name info. to this inode page */ ri = F2FS_INODE(ipage); @@ -550,7 +550,7 @@ start: ++level; goto start; add_dentry: - f2fs_wait_on_page_writeback(dentry_page, DATA, true); + f2fs_wait_on_page_writeback(dentry_page, DATA, true, true); if (inode) { down_write(&F2FS_I(inode)->i_sem); @@ -705,7 +705,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, return f2fs_delete_inline_entry(dentry, page, dir, inode); lock_page(page); - f2fs_wait_on_page_writeback(page, DATA, true); + f2fs_wait_on_page_writeback(page, DATA, true, true); dentry_blk = page_address(page); bit_pos = dentry - dentry_blk->dentry; @@ -808,6 +808,17 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); + /* check memory boundary before moving forward */ + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); + if (unlikely(bit_pos > d->max)) { + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: corrupted namelen=%d, run fsck to fix.", + __func__, le16_to_cpu(de->name_len)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + err = -EINVAL; + goto out; + } + if (f2fs_encrypted_inode(d->inode)) { int save_len = fstr->len; @@ -830,7 +841,6 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, if (readdir_ra) f2fs_ra_node_page(sbi, le32_to_cpu(de->ino)); - bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); ctx->pos = start_pos + bit_pos; } out: diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1e031971a466..12fabd6735dd 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -67,7 +67,7 @@ struct f2fs_fault_info { unsigned int inject_type; }; -extern char *f2fs_fault_name[FAULT_MAX]; +extern const char *f2fs_fault_name[FAULT_MAX]; #define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type))) #endif @@ -152,12 +152,13 @@ struct f2fs_mount_info { #define F2FS_FEATURE_VERITY 0x0400 /* reserved */ #define F2FS_FEATURE_SB_CHKSUM 0x0800 -#define F2FS_HAS_FEATURE(sb, mask) \ - ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) -#define F2FS_SET_FEATURE(sb, mask) \ - (F2FS_SB(sb)->raw_super->feature |= cpu_to_le32(mask)) -#define F2FS_CLEAR_FEATURE(sb, mask) \ - (F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask)) +#define __F2FS_HAS_FEATURE(raw_super, mask) \ + ((raw_super->feature & cpu_to_le32(mask)) != 0) +#define F2FS_HAS_FEATURE(sbi, mask) __F2FS_HAS_FEATURE(sbi->raw_super, mask) +#define F2FS_SET_FEATURE(sbi, mask) \ + 
(sbi->raw_super->feature |= cpu_to_le32(mask)) +#define F2FS_CLEAR_FEATURE(sbi, mask) \ + (sbi->raw_super->feature &= ~cpu_to_le32(mask)) /* * Default values for user and/or group using reserved blocks @@ -284,7 +285,7 @@ struct discard_cmd { struct block_device *bdev; /* bdev */ unsigned short ref; /* reference count */ unsigned char state; /* state */ - unsigned char issuing; /* issuing discard */ + unsigned char queued; /* queued discard */ int error; /* bio error */ spinlock_t lock; /* for state/bio_ref updating */ unsigned short bio_ref; /* bio reference count */ @@ -326,7 +327,7 @@ struct discard_cmd_control { unsigned int undiscard_blks; /* # of undiscard blocks */ unsigned int next_pos; /* next discard position */ atomic_t issued_discard; /* # of issued discard */ - atomic_t issing_discard; /* # of issing discard */ + atomic_t queued_discard; /* # of queued discard */ atomic_t discard_cmd_cnt; /* # of cached cmd count */ struct rb_root_cached root; /* root of discard rb-tree */ bool rbtree_check; /* config for consistence check */ @@ -416,6 +417,7 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_GOING_DOWN_METASYNC 0x1 /* going down with metadata */ #define F2FS_GOING_DOWN_NOSYNC 0x2 /* going down */ #define F2FS_GOING_DOWN_METAFLUSH 0x3 /* going down with meta flush */ +#define F2FS_GOING_DOWN_NEED_FSCK 0x4 /* going down to trigger fsck */ #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* @@ -557,16 +559,8 @@ struct extent_info { }; struct extent_node { - struct rb_node rb_node; - union { - struct { - unsigned int fofs; - unsigned int len; - u32 blk; - }; - struct extent_info ei; /* extent info */ - - }; + struct rb_node rb_node; /* rb node located in rb-tree */ + struct extent_info ei; /* extent info */ struct list_head list; /* node in global extent list of sbi */ struct extent_tree *et; /* extent tree pointer */ }; @@ -601,6 +595,7 @@ struct f2fs_map_blocks { pgoff_t *m_next_pgofs; /* point next possible non-hole pgofs */ pgoff_t *m_next_extent; /* point to next possible extent */ int m_seg_type; + bool m_may_create; /* indicate it is from write path */ }; /* for flag in get_data_block */ @@ -889,7 +884,7 @@ struct flush_cmd_control { struct task_struct *f2fs_issue_flush; /* flush thread */ wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ atomic_t issued_flush; /* # of issued flushes */ - atomic_t issing_flush; /* # of issing flushes */ + atomic_t queued_flush; /* # of queued flushes */ struct llist_head issue_list; /* list for command issue */ struct llist_node *dispatch_list; /* list for command dispatch */ }; @@ -956,6 +951,8 @@ enum count_type { F2FS_RD_DATA, F2FS_RD_NODE, F2FS_RD_META, + F2FS_DIO_WRITE, + F2FS_DIO_READ, NR_COUNT_TYPE, }; @@ -1170,8 +1167,6 @@ struct f2fs_sb_info { /* for bio operations */ struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */ - struct mutex wio_mutex[NR_PAGE_TYPE - 1][NR_TEMP_TYPE]; - /* bio ordering for NODE/DATA */ /* keep migration IO order for LFS mode */ struct rw_semaphore io_order_lock; mempool_t *write_io_dummy; /* Dummy pages */ @@ -1263,6 +1258,7 @@ struct f2fs_sb_info { struct f2fs_gc_kthread *gc_thread; /* GC thread */ unsigned int cur_victim_sec; /* current victim section num */ unsigned int gc_mode; /* current GC state */ + unsigned int next_victim_seg[2]; /* next segment in victim section */ /* for skip statistic */ unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ unsigned long long skipped_gc_rwsem; /* FG_GC only */ @@ -1272,6 +1268,8 @@ 
struct f2fs_sb_info { /* maximum # of trials to find a victim segment for SSR and GC */ unsigned int max_victim_search; + /* migration granularity of garbage collection, unit: segment */ + unsigned int migration_granularity; /* * for stat information. @@ -1330,6 +1328,13 @@ struct f2fs_sb_info { __u32 s_chksum_seed; }; +struct f2fs_private_dio { + struct inode *inode; + void *orig_private; + bio_end_io_t *orig_end_io; + bool write; +}; + #ifdef CONFIG_F2FS_FAULT_INJECTION #define f2fs_show_injection_info(type) \ printk_ratelimited("%sF2FS-fs : inject %s in %s of %pF\n", \ @@ -1608,12 +1613,16 @@ static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) { unsigned long flags; - set_sbi_flag(sbi, SBI_NEED_FSCK); + /* + * In order to re-enable nat_bits we need to call fsck.f2fs by + * set_sbi_flag(sbi, SBI_NEED_FSCK). But it may give huge cost, + * so let's rely on regular fsck or unclean shutdown. + */ if (lock) spin_lock_irqsave(&sbi->cp_lock, flags); __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG); - kfree(NM_I(sbi)->nat_bits); + kvfree(NM_I(sbi)->nat_bits); NM_I(sbi)->nat_bits = NULL; if (lock) spin_unlock_irqrestore(&sbi->cp_lock, flags); @@ -2146,7 +2155,11 @@ static inline bool is_idle(struct f2fs_sb_info *sbi, int type) { if (get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_RD_NODE) || get_pages(sbi, F2FS_RD_META) || get_pages(sbi, F2FS_WB_DATA) || - get_pages(sbi, F2FS_WB_CP_DATA)) + get_pages(sbi, F2FS_WB_CP_DATA) || + get_pages(sbi, F2FS_DIO_READ) || + get_pages(sbi, F2FS_DIO_WRITE) || + atomic_read(&SM_I(sbi)->dcc_info->queued_discard) || + atomic_read(&SM_I(sbi)->fcc_info->queued_flush)) return false; return f2fs_time_over(sbi, type); } @@ -2370,6 +2383,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, case FI_NEW_INODE: if (set) return; + /* fall through */ case FI_DATA_EXIST: case FI_INLINE_DOTS: case FI_PIN_FILE: @@ -2672,22 +2686,37 @@ static inline bool is_dot_dotdot(const struct qstr *str) static inline bool f2fs_may_extent_tree(struct inode *inode) { - if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE) || + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!test_opt(sbi, EXTENT_CACHE) || is_inode_flag_set(inode, FI_NO_EXTENT)) return false; + /* + * for recovered files during mount do not create extents + * if shrinker is not registered. 
+ */ + if (list_empty(&sbi->s_list)) + return false; + return S_ISREG(inode->i_mode); } static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { + void *ret; + if (time_to_inject(sbi, FAULT_KMALLOC)) { f2fs_show_injection_info(FAULT_KMALLOC); return NULL; } - return kmalloc(size, flags); + ret = kmalloc(size, flags); + if (ret) + return ret; + + return kvmalloc(size, flags); } static inline void *f2fs_kzalloc(struct f2fs_sb_info *sbi, @@ -2762,6 +2791,8 @@ static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, spin_unlock(&sbi->iostat_lock); } +#define __is_large_section(sbi) ((sbi)->segs_per_sec > 1) + #define __is_meta_io(fio) (PAGE_TYPE_OF_BIO(fio->type) == META && \ (!is_read_io(fio->op) || fio->is_meta)) @@ -3007,7 +3038,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_summary *sum, int type, struct f2fs_io_info *fio, bool add_list); void f2fs_wait_on_page_writeback(struct page *page, - enum page_type type, bool ordered); + enum page_type type, bool ordered, bool locked); void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr); void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, block_t len); @@ -3147,6 +3178,7 @@ struct f2fs_stat_info { int total_count, utilization; int bg_gc, nr_wb_cp_data, nr_wb_data; int nr_rd_data, nr_rd_node, nr_rd_meta; + int nr_dio_read, nr_dio_write; unsigned int io_skip_bggc, other_skip_bggc; int nr_flushing, nr_flushed, flush_list_empty; int nr_discarding, nr_discarded; @@ -3459,9 +3491,9 @@ static inline bool f2fs_post_read_required(struct inode *inode) } #define F2FS_FEATURE_FUNCS(name, flagname) \ -static inline int f2fs_sb_has_##name(struct super_block *sb) \ +static inline int f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \ { \ - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_##flagname); \ + return F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_##flagname); \ } F2FS_FEATURE_FUNCS(encrypt, ENCRYPT); @@ -3491,7 +3523,7 @@ static inline int get_blkz_type(struct f2fs_sb_info *sbi, static inline bool f2fs_hw_should_discard(struct f2fs_sb_info *sbi) { - return f2fs_sb_has_blkzoned(sbi->sb); + return f2fs_sb_has_blkzoned(sbi); } static inline bool f2fs_hw_support_discard(struct f2fs_sb_info *sbi) @@ -3566,7 +3598,7 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, * for blkzoned device, fallback direct IO to buffered IO, so * all IOs can be serialized by log-structured write. 
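The f2fs_sb_has_*() feature tests are generated by F2FS_FEATURE_FUNCS() and now take the f2fs_sb_info directly, which is why call sites throughout this diff drop the ->sb / i_sb indirection. Hand-expanded for one feature, for illustration only; the real definitions remain macro-generated:

/* F2FS_FEATURE_FUNCS(encrypt, ENCRYPT) now expands to roughly: */
static inline int f2fs_sb_has_encrypt(struct f2fs_sb_info *sbi)
{
	return F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_ENCRYPT);
	/* i.e. __F2FS_HAS_FEATURE(sbi->raw_super, F2FS_FEATURE_ENCRYPT) */
}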
*/ - if (f2fs_sb_has_blkzoned(sbi->sb)) + if (f2fs_sb_has_blkzoned(sbi)) return true; if (test_opt(sbi, LFS) && (rw == WRITE) && block_unaligned_IO(inode, iocb, iter)) @@ -3589,7 +3621,7 @@ extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) { #ifdef CONFIG_QUOTA - if (f2fs_sb_has_quota_ino(sbi->sb)) + if (f2fs_sb_has_quota_ino(sbi)) return true; if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA] || F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] || diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 88b124677189..bba56b39dcc5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -82,7 +82,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) } /* fill the page */ - f2fs_wait_on_page_writeback(page, DATA, false); + f2fs_wait_on_page_writeback(page, DATA, false, true); /* wait for GCed page writeback via META_MAPPING */ f2fs_wait_on_block_writeback(inode, dn.data_blkaddr); @@ -216,6 +216,9 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, trace_f2fs_sync_file_enter(inode); + if (S_ISDIR(inode->i_mode)) + goto go_write; + /* if fdatasync is triggered, let's do in-place-update */ if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) set_inode_flag(inode, FI_NEED_IPU); @@ -575,7 +578,7 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, if (IS_ERR(page)) return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page); truncate_out: - f2fs_wait_on_page_writeback(page, DATA, true); + f2fs_wait_on_page_writeback(page, DATA, true, true); zero_user(page, offset, PAGE_SIZE - offset); /* An encrypted inode should have a key and truncate the last page. */ @@ -696,7 +699,7 @@ int f2fs_getattr(const struct path *path, struct kstat *stat, unsigned int flags; if (f2fs_has_extra_attr(inode) && - f2fs_sb_has_inode_crtime(inode->i_sb) && + f2fs_sb_has_inode_crtime(F2FS_I_SB(inode)) && F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_crtime)) { stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = fi->i_crtime.tv_sec; @@ -892,7 +895,7 @@ static int fill_zero(struct inode *inode, pgoff_t index, if (IS_ERR(page)) return PTR_ERR(page); - f2fs_wait_on_page_writeback(page, DATA, true); + f2fs_wait_on_page_writeback(page, DATA, true, true); zero_user(page, start, len); set_page_dirty(page); f2fs_put_page(page, 1); @@ -1496,7 +1499,8 @@ static int expand_inode_data(struct inode *inode, loff_t offset, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_map_blocks map = { .m_next_pgofs = NULL, - .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE }; + .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE, + .m_may_create = true }; pgoff_t pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; @@ -1681,7 +1685,7 @@ static int __f2fs_ioc_setflags(struct inode *inode, unsigned int flags) inode->i_ctime = current_time(inode); f2fs_set_inode_flags(inode); - f2fs_mark_inode_dirty_sync(inode, false); + f2fs_mark_inode_dirty_sync(inode, true); return 0; } @@ -1962,6 +1966,13 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) f2fs_stop_checkpoint(sbi, false); set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; + case F2FS_GOING_DOWN_NEED_FSCK: + set_sbi_flag(sbi, SBI_NEED_FSCK); + /* do checkpoint only */ + ret = f2fs_sync_fs(sb, 1); + if (ret) + goto out; + break; default: ret = -EINVAL; goto out; @@ -2030,7 +2041,7 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); - if (!f2fs_sb_has_encrypt(inode->i_sb)) 
+ if (!f2fs_sb_has_encrypt(F2FS_I_SB(inode))) return -EOPNOTSUPP; f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); @@ -2040,7 +2051,7 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) { - if (!f2fs_sb_has_encrypt(file_inode(filp)->i_sb)) + if (!f2fs_sb_has_encrypt(F2FS_I_SB(file_inode(filp)))) return -EOPNOTSUPP; return fscrypt_ioctl_get_policy(filp, (void __user *)arg); } @@ -2051,7 +2062,7 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int err; - if (!f2fs_sb_has_encrypt(inode->i_sb)) + if (!f2fs_sb_has_encrypt(sbi)) return -EOPNOTSUPP; err = mnt_want_write_file(filp); @@ -2155,7 +2166,7 @@ do_more: } ret = f2fs_gc(sbi, range.sync, true, GET_SEGNO(sbi, range.start)); - range.start += sbi->blocks_per_seg; + range.start += BLKS_PER_SEC(sbi); if (range.start <= end) goto do_more; out: @@ -2197,7 +2208,8 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, { struct inode *inode = file_inode(filp); struct f2fs_map_blocks map = { .m_next_extent = NULL, - .m_seg_type = NO_CHECK_TYPE }; + .m_seg_type = NO_CHECK_TYPE , + .m_may_create = false }; struct extent_info ei = {0, 0, 0}; pgoff_t pg_start, pg_end, next_pgofs; unsigned int blk_per_seg = sbi->blocks_per_seg; @@ -2560,7 +2572,7 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) return -EFAULT; if (sbi->s_ndevs <= 1 || sbi->s_ndevs - 1 <= range.dev_num || - sbi->segs_per_sec != 1) { + __is_large_section(sbi)) { f2fs_msg(sbi->sb, KERN_WARNING, "Can't flush %u in %d for segs_per_sec %u != 1\n", range.dev_num, sbi->s_ndevs, @@ -2635,12 +2647,11 @@ static int f2fs_ioc_setproject(struct file *filp, __u32 projid) struct inode *inode = file_inode(filp); struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct super_block *sb = sbi->sb; struct page *ipage; kprojid_t kprojid; int err; - if (!f2fs_sb_has_project_quota(sb)) { + if (!f2fs_sb_has_project_quota(sbi)) { if (projid != F2FS_DEF_PROJID) return -EOPNOTSUPP; else @@ -2757,7 +2768,7 @@ static int f2fs_ioc_fsgetxattr(struct file *filp, unsigned long arg) fa.fsx_xflags = f2fs_iflags_to_xflags(fi->i_flags & F2FS_FL_USER_VISIBLE); - if (f2fs_sb_has_project_quota(inode->i_sb)) + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode))) fa.fsx_projid = (__u32)from_kprojid(&init_user_ns, fi->i_projid); @@ -2932,6 +2943,7 @@ int f2fs_precache_extents(struct inode *inode) map.m_next_pgofs = NULL; map.m_next_extent = &m_next_extent; map.m_seg_type = NO_CHECK_TYPE; + map.m_may_create = false; end = F2FS_I_SB(inode)->max_file_blocks; while (map.m_lblk < end) { diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index a07241fb8537..195cf0f9d9ef 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -142,7 +142,7 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(gc_th->f2fs_gc_task)) { err = PTR_ERR(gc_th->f2fs_gc_task); - kfree(gc_th); + kvfree(gc_th); sbi->gc_thread = NULL; } out: @@ -155,7 +155,7 @@ void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) if (!gc_th) return; kthread_stop(gc_th->f2fs_gc_task); - kfree(gc_th); + kvfree(gc_th); sbi->gc_thread = NULL; } @@ -323,8 +323,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, p.min_cost = get_max_cost(sbi, &p); if (*result != NULL_SEGNO) { - if (IS_DATASEG(get_seg_entry(sbi, *result)->type) && - get_valid_blocks(sbi, *result, false) && + if 
(get_valid_blocks(sbi, *result, false) && !sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result))) p.min_segno = *result; goto out; @@ -333,6 +332,22 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, if (p.max_search == 0) goto out; + if (__is_large_section(sbi) && p.alloc_mode == LFS) { + if (sbi->next_victim_seg[BG_GC] != NULL_SEGNO) { + p.min_segno = sbi->next_victim_seg[BG_GC]; + *result = p.min_segno; + sbi->next_victim_seg[BG_GC] = NULL_SEGNO; + goto got_result; + } + if (gc_type == FG_GC && + sbi->next_victim_seg[FG_GC] != NULL_SEGNO) { + p.min_segno = sbi->next_victim_seg[FG_GC]; + *result = p.min_segno; + sbi->next_victim_seg[FG_GC] = NULL_SEGNO; + goto got_result; + } + } + last_victim = sm->last_victim[p.gc_mode]; if (p.alloc_mode == LFS && gc_type == FG_GC) { p.min_segno = check_bg_victims(sbi); @@ -395,6 +410,8 @@ next: } if (p.min_segno != NULL_SEGNO) { got_it: + *result = (p.min_segno / p.ofs_unit) * p.ofs_unit; +got_result: if (p.alloc_mode == LFS) { secno = GET_SEC_FROM_SEG(sbi, p.min_segno); if (gc_type == FG_GC) @@ -402,13 +419,13 @@ got_it: else set_bit(secno, dirty_i->victim_secmap); } - *result = (p.min_segno / p.ofs_unit) * p.ofs_unit; + } +out: + if (p.min_segno != NULL_SEGNO) trace_f2fs_get_victim(sbi->sb, type, gc_type, &p, sbi->cur_victim_sec, prefree_segments(sbi), free_segments(sbi)); - } -out: mutex_unlock(&dirty_i->seglist_lock); return (p.min_segno == NULL_SEGNO) ? 0 : 1; @@ -658,6 +675,14 @@ got_it: fio.page = page; fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; + /* + * don't cache encrypted data into meta inode until previous dirty + * data were writebacked to avoid racing between GC and flush. + */ + f2fs_wait_on_page_writeback(page, DATA, true, true); + + f2fs_wait_on_block_writeback(inode, dn.data_blkaddr); + fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(sbi), dn.data_blkaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); @@ -743,7 +768,9 @@ static int move_data_block(struct inode *inode, block_t bidx, * don't cache encrypted data into meta inode until previous dirty * data were writebacked to avoid racing between GC and flush. 
*/ - f2fs_wait_on_page_writeback(page, DATA, true); + f2fs_wait_on_page_writeback(page, DATA, true, true); + + f2fs_wait_on_block_writeback(inode, dn.data_blkaddr); err = f2fs_get_node_info(fio.sbi, dn.nid, &ni); if (err) @@ -802,8 +829,8 @@ static int move_data_block(struct inode *inode, block_t bidx, } write_page: + f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true, true); set_page_dirty(fio.encrypted_page); - f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true); if (clear_page_dirty_for_io(fio.encrypted_page)) dec_page_count(fio.sbi, F2FS_DIRTY_META); @@ -811,7 +838,7 @@ write_page: ClearPageError(page); /* allocate block address */ - f2fs_wait_on_page_writeback(dn.node_page, NODE, true); + f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true); fio.op = REQ_OP_WRITE; fio.op_flags = REQ_SYNC; @@ -897,8 +924,9 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, bool is_dirty = PageDirty(page); retry: + f2fs_wait_on_page_writeback(page, DATA, true, true); + set_page_dirty(page); - f2fs_wait_on_page_writeback(page, DATA, true); if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); f2fs_remove_dirty_inode(inode); @@ -1093,15 +1121,18 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, struct blk_plug plug; unsigned int segno = start_segno; unsigned int end_segno = start_segno + sbi->segs_per_sec; - int seg_freed = 0; + int seg_freed = 0, migrated = 0; unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? SUM_TYPE_DATA : SUM_TYPE_NODE; int submitted = 0; + if (__is_large_section(sbi)) + end_segno = rounddown(end_segno, sbi->segs_per_sec); + /* readahead multi ssa blocks those have contiguous address */ - if (sbi->segs_per_sec > 1) + if (__is_large_section(sbi)) f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), - sbi->segs_per_sec, META_SSA, true); + end_segno - segno, META_SSA, true); /* reference all summary page */ while (segno < end_segno) { @@ -1130,10 +1161,13 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, GET_SUM_BLOCK(sbi, segno)); f2fs_put_page(sum_page, 0); - if (get_valid_blocks(sbi, segno, false) == 0 || - !PageUptodate(sum_page) || - unlikely(f2fs_cp_error(sbi))) - goto next; + if (get_valid_blocks(sbi, segno, false) == 0) + goto freed; + if (__is_large_section(sbi) && + migrated >= sbi->migration_granularity) + goto skip; + if (!PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi))) + goto skip; sum = page_address(sum_page); if (type != GET_SUM_TYPE((&sum->footer))) { @@ -1141,7 +1175,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, "type [%d, %d] in SSA and SIT", segno, type, GET_SUM_TYPE((&sum->footer))); set_sbi_flag(sbi, SBI_NEED_FSCK); - goto next; + goto skip; } /* @@ -1160,10 +1194,15 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, stat_inc_seg_count(sbi, type, gc_type); +freed: if (gc_type == FG_GC && get_valid_blocks(sbi, segno, false) == 0) seg_freed++; -next: + migrated++; + + if (__is_large_section(sbi) && segno + 1 < end_segno) + sbi->next_victim_seg[gc_type] = segno + 1; +skip: f2fs_put_page(sum_page, 0); } @@ -1307,7 +1346,7 @@ void f2fs_build_gc_manager(struct f2fs_sb_info *sbi) sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES; /* give warm/cold data area from slower device */ - if (sbi->s_ndevs && sbi->segs_per_sec == 1) + if (sbi->s_ndevs && !__is_large_section(sbi)) SIT_I(sbi)->last_victim[ALLOC_NEXT] = GET_SEGNO(sbi, FDEV(0).end_blk) + 1; } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 7b0cff7e6051..d636cbcf68f2 100644 --- 
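Two idioms recur in the gc.c hunks above and through the rest of this diff: f2fs_wait_on_page_writeback() grows a fourth argument (named "locked" in the segment.c hunk further down, where it lets the helper assert the page really left writeback when the caller holds the page lock), and callers now wait for writeback before redirtying a page rather than after. A minimal sketch of the resulting caller pattern, assuming f2fs internals; the wrapper name is illustrative only, not taken from the patch:

/* hypothetical helper, for illustration: redirty a page the caller has locked */
static void example_redirty_page(struct page *page)
{
	/* ordered == true: flush merged writes and wait; locked == true: caller holds the page lock */
	f2fs_wait_on_page_writeback(page, DATA, true, true);
	set_page_dirty(page);
}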
a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -72,7 +72,7 @@ void f2fs_truncate_inline_inode(struct inode *inode, addr = inline_data_addr(inode, ipage); - f2fs_wait_on_page_writeback(ipage, NODE, true); + f2fs_wait_on_page_writeback(ipage, NODE, true, true); memset(addr + from, 0, MAX_INLINE_DATA(inode) - from); set_page_dirty(ipage); @@ -161,7 +161,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) fio.old_blkaddr = dn->data_blkaddr; set_inode_flag(dn->inode, FI_HOT_DATA); f2fs_outplace_write_data(dn, &fio); - f2fs_wait_on_page_writeback(page, DATA, true); + f2fs_wait_on_page_writeback(page, DATA, true, true); if (dirty) { inode_dec_dirty_pages(dn->inode); f2fs_remove_dirty_inode(dn->inode); @@ -236,7 +236,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) f2fs_bug_on(F2FS_I_SB(inode), page->index); - f2fs_wait_on_page_writeback(dn.inode_page, NODE, true); + f2fs_wait_on_page_writeback(dn.inode_page, NODE, true, true); src_addr = kmap_atomic(page); dst_addr = inline_data_addr(inode, dn.inode_page); memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); @@ -277,7 +277,7 @@ process_inline: ipage = f2fs_get_node_page(sbi, inode->i_ino); f2fs_bug_on(sbi, IS_ERR(ipage)); - f2fs_wait_on_page_writeback(ipage, NODE, true); + f2fs_wait_on_page_writeback(ipage, NODE, true, true); src_addr = inline_data_addr(inode, npage); dst_addr = inline_data_addr(inode, ipage); @@ -391,7 +391,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, goto out; } - f2fs_wait_on_page_writeback(page, DATA, true); + f2fs_wait_on_page_writeback(page, DATA, true, true); dentry_blk = page_address(page); @@ -501,18 +501,18 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, stat_dec_inline_dir(dir); clear_inode_flag(dir, FI_INLINE_DENTRY); - kfree(backup_dentry); + kvfree(backup_dentry); return 0; recover: lock_page(ipage); - f2fs_wait_on_page_writeback(ipage, NODE, true); + f2fs_wait_on_page_writeback(ipage, NODE, true, true); memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA(dir)); f2fs_i_depth_write(dir, 0); f2fs_i_size_write(dir, MAX_INLINE_DATA(dir)); set_page_dirty(ipage); f2fs_put_page(ipage, 1); - kfree(backup_dentry); + kvfree(backup_dentry); return err; } @@ -565,7 +565,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, } } - f2fs_wait_on_page_writeback(ipage, NODE, true); + f2fs_wait_on_page_writeback(ipage, NODE, true, true); name_hash = f2fs_dentry_hash(new_name, NULL); f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); @@ -597,7 +597,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, int i; lock_page(page); - f2fs_wait_on_page_writeback(page, NODE, true); + f2fs_wait_on_page_writeback(page, NODE, true, true); inline_dentry = inline_data_addr(dir, page); make_dentry_ptr_inline(dir, &d, inline_dentry); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 91ceee0ed4c4..bec52961630b 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -103,7 +103,7 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage) while (start < end) { if (*start++) { - f2fs_wait_on_page_writeback(ipage, NODE, true); + f2fs_wait_on_page_writeback(ipage, NODE, true, true); set_inode_flag(inode, FI_DATA_EXIST); set_raw_inline(inode, F2FS_INODE(ipage)); @@ -118,7 +118,7 @@ static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page { struct f2fs_inode *ri = &F2FS_NODE(page)->i; - if (!f2fs_sb_has_inode_chksum(sbi->sb)) + if 
(!f2fs_sb_has_inode_chksum(sbi)) return false; if (!IS_INODE(page) || !(ri->i_inline & F2FS_EXTRA_ATTR)) @@ -218,7 +218,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } - if (f2fs_sb_has_flexible_inline_xattr(sbi->sb) + if (f2fs_sb_has_flexible_inline_xattr(sbi) && !f2fs_has_extra_attr(inode)) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_msg(sbi->sb, KERN_WARNING, @@ -228,7 +228,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) } if (f2fs_has_extra_attr(inode) && - !f2fs_sb_has_extra_attr(sbi->sb)) { + !f2fs_sb_has_extra_attr(sbi)) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_msg(sbi->sb, KERN_WARNING, "%s: inode (ino=%lx) is with extra_attr, " @@ -340,7 +340,7 @@ static int do_read_inode(struct inode *inode) fi->i_extra_isize = f2fs_has_extra_attr(inode) ? le16_to_cpu(ri->i_extra_isize) : 0; - if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) { + if (f2fs_sb_has_flexible_inline_xattr(sbi)) { fi->i_inline_xattr_size = le16_to_cpu(ri->i_inline_xattr_size); } else if (f2fs_has_inline_xattr(inode) || f2fs_has_inline_dentry(inode)) { @@ -390,14 +390,14 @@ static int do_read_inode(struct inode *inode) if (fi->i_flags & F2FS_PROJINHERIT_FL) set_inode_flag(inode, FI_PROJ_INHERIT); - if (f2fs_has_extra_attr(inode) && f2fs_sb_has_project_quota(sbi->sb) && + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_project_quota(sbi) && F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid)) i_projid = (projid_t)le32_to_cpu(ri->i_projid); else i_projid = F2FS_DEF_PROJID; fi->i_projid = make_kprojid(&init_user_ns, i_projid); - if (f2fs_has_extra_attr(inode) && f2fs_sb_has_inode_crtime(sbi->sb) && + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_inode_crtime(sbi) && F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_crtime)) { fi->i_crtime.tv_sec = le64_to_cpu(ri->i_crtime); fi->i_crtime.tv_nsec = le32_to_cpu(ri->i_crtime_nsec); @@ -497,7 +497,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) struct f2fs_inode *ri; struct extent_tree *et = F2FS_I(inode)->extent_tree; - f2fs_wait_on_page_writeback(node_page, NODE, true); + f2fs_wait_on_page_writeback(node_page, NODE, true, true); set_page_dirty(node_page); f2fs_inode_synced(inode); @@ -542,11 +542,11 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) if (f2fs_has_extra_attr(inode)) { ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize); - if (f2fs_sb_has_flexible_inline_xattr(F2FS_I_SB(inode)->sb)) + if (f2fs_sb_has_flexible_inline_xattr(F2FS_I_SB(inode))) ri->i_inline_xattr_size = cpu_to_le16(F2FS_I(inode)->i_inline_xattr_size); - if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) && + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)) && F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, i_projid)) { projid_t i_projid; @@ -556,7 +556,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) ri->i_projid = cpu_to_le32(i_projid); } - if (f2fs_sb_has_inode_crtime(F2FS_I_SB(inode)->sb) && + if (f2fs_sb_has_inode_crtime(F2FS_I_SB(inode)) && F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, i_crtime)) { ri->i_crtime = diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 99299ede7429..62d9829f3a6a 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -61,7 +61,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) goto fail; } - if (f2fs_sb_has_project_quota(sbi->sb) && + if (f2fs_sb_has_project_quota(sbi) && (F2FS_I(dir)->i_flags & F2FS_PROJINHERIT_FL)) F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid; else @@ -79,7 +79,7 @@ 
static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); - if (f2fs_sb_has_extra_attr(sbi->sb)) { + if (f2fs_sb_has_extra_attr(sbi)) { set_inode_flag(inode, FI_EXTRA_ATTR); F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE; } @@ -92,7 +92,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (f2fs_may_inline_dentry(inode)) set_inode_flag(inode, FI_INLINE_DENTRY); - if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) { + if (f2fs_sb_has_flexible_inline_xattr(sbi)) { f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode)); if (f2fs_has_inline_xattr(inode)) xattr_size = F2FS_OPTION(sbi).inline_xattr_size; @@ -635,7 +635,7 @@ out_f2fs_handle_failed_inode: f2fs_handle_failed_inode(inode); out_free_encrypted_link: if (disk_link.name != (unsigned char *)symname) - kfree(disk_link.name); + kvfree(disk_link.name); return err; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d338740d0fda..4f450e573312 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -826,6 +826,7 @@ static int truncate_node(struct dnode_of_data *dn) struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct node_info ni; int err; + pgoff_t index; err = f2fs_get_node_info(sbi, dn->nid, &ni); if (err) @@ -845,10 +846,11 @@ static int truncate_node(struct dnode_of_data *dn) clear_node_page_dirty(dn->node_page); set_sbi_flag(sbi, SBI_IS_DIRTY); + index = dn->node_page->index; f2fs_put_page(dn->node_page, 1); invalidate_mapping_pages(NODE_MAPPING(sbi), - dn->node_page->index, dn->node_page->index); + index, index); dn->node_page = NULL; trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr); @@ -1104,7 +1106,7 @@ skip_partial: ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) { lock_page(page); BUG_ON(page->mapping != NODE_MAPPING(sbi)); - f2fs_wait_on_page_writeback(page, NODE, true); + f2fs_wait_on_page_writeback(page, NODE, true, true); ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; set_page_dirty(page); unlock_page(page); @@ -1232,7 +1234,7 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) new_ni.version = 0; set_node_addr(sbi, &new_ni, NEW_ADDR, false); - f2fs_wait_on_page_writeback(page, NODE, true); + f2fs_wait_on_page_writeback(page, NODE, true, true); fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); set_cold_node(page, S_ISDIR(dn->inode->i_mode)); if (!PageUptodate(page)) @@ -1596,10 +1598,10 @@ int f2fs_move_node_page(struct page *node_page, int gc_type) .for_reclaim = 0, }; + f2fs_wait_on_page_writeback(node_page, NODE, true, true); + set_page_dirty(node_page); - f2fs_wait_on_page_writeback(node_page, NODE, true); - f2fs_bug_on(F2FS_P_SB(node_page), PageWriteback(node_page)); if (!clear_page_dirty_for_io(node_page)) { err = -EAGAIN; goto out_page; @@ -1687,8 +1689,7 @@ continue_unlock: goto continue_unlock; } - f2fs_wait_on_page_writeback(page, NODE, true); - BUG_ON(PageWriteback(page)); + f2fs_wait_on_page_writeback(page, NODE, true, true); set_fsync_mark(page, 0); set_dentry_mark(page, 0); @@ -1739,7 +1740,7 @@ continue_unlock: "Retry to write fsync mark: ino=%u, idx=%lx", ino, last_page->index); lock_page(last_page); - f2fs_wait_on_page_writeback(last_page, NODE, true); + f2fs_wait_on_page_writeback(last_page, NODE, true, true); set_page_dirty(last_page); unlock_page(last_page); goto retry; @@ -1820,9 +1821,8 @@ continue_unlock: goto lock_node; } - f2fs_wait_on_page_writeback(page, NODE, true); + f2fs_wait_on_page_writeback(page, NODE, true, true); - BUG_ON(PageWriteback(page)); if 
(!clear_page_dirty_for_io(page)) goto continue_unlock; @@ -1889,7 +1889,7 @@ int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, get_page(page); spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); - f2fs_wait_on_page_writeback(page, NODE, true); + f2fs_wait_on_page_writeback(page, NODE, true, false); if (TestClearPageError(page)) ret = -EIO; @@ -2467,7 +2467,7 @@ void f2fs_recover_inline_xattr(struct inode *inode, struct page *page) src_addr = inline_xattr_addr(inode, page); inline_size = inline_xattr_size(inode); - f2fs_wait_on_page_writeback(ipage, NODE, true); + f2fs_wait_on_page_writeback(ipage, NODE, true, true); memcpy(dst_addr, src_addr, inline_size); update_inode: f2fs_update_inode(inode, ipage); @@ -2561,17 +2561,17 @@ retry: if (dst->i_inline & F2FS_EXTRA_ATTR) { dst->i_extra_isize = src->i_extra_isize; - if (f2fs_sb_has_flexible_inline_xattr(sbi->sb) && + if (f2fs_sb_has_flexible_inline_xattr(sbi) && F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), i_inline_xattr_size)) dst->i_inline_xattr_size = src->i_inline_xattr_size; - if (f2fs_sb_has_project_quota(sbi->sb) && + if (f2fs_sb_has_project_quota(sbi) && F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), i_projid)) dst->i_projid = src->i_projid; - if (f2fs_sb_has_inode_crtime(sbi->sb) && + if (f2fs_sb_has_inode_crtime(sbi) && F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), i_crtime_nsec)) { dst->i_crtime = src->i_crtime; @@ -3113,17 +3113,17 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) for (i = 0; i < nm_i->nat_blocks; i++) kvfree(nm_i->free_nid_bitmap[i]); - kfree(nm_i->free_nid_bitmap); + kvfree(nm_i->free_nid_bitmap); } kvfree(nm_i->free_nid_count); - kfree(nm_i->nat_bitmap); - kfree(nm_i->nat_bits); + kvfree(nm_i->nat_bitmap); + kvfree(nm_i->nat_bits); #ifdef CONFIG_F2FS_CHECK_FS - kfree(nm_i->nat_bitmap_mir); + kvfree(nm_i->nat_bitmap_mir); #endif sbi->nm_info = NULL; - kfree(nm_i); + kvfree(nm_i); } int __init f2fs_create_node_manager_caches(void) diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 1c73d879a9bc..e05af5df5648 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -361,7 +361,7 @@ static inline int set_nid(struct page *p, int off, nid_t nid, bool i) { struct f2fs_node *rn = F2FS_NODE(p); - f2fs_wait_on_page_writeback(p, NODE, true); + f2fs_wait_on_page_writeback(p, NODE, true, true); if (i) rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 1dfb17f9f9ff..e3883db868d8 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -250,7 +250,7 @@ static int recover_inode(struct inode *inode, struct page *page) i_gid_write(inode, le32_to_cpu(raw->i_gid)); if (raw->i_inline & F2FS_EXTRA_ATTR) { - if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) && + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)) && F2FS_FITS_IN_INODE(raw, le16_to_cpu(raw->i_extra_isize), i_projid)) { projid_t i_projid; @@ -539,7 +539,7 @@ retry_dn: goto out; } - f2fs_wait_on_page_writeback(dn.node_page, NODE, true); + f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true); err = f2fs_get_node_info(sbi, dn.nid, &ni); if (err) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6edcf8391dd3..9b79056d705d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -229,7 +229,7 @@ static int __revoke_inmem_pages(struct inode *inode, lock_page(page); - f2fs_wait_on_page_writeback(page, DATA, true); + f2fs_wait_on_page_writeback(page, DATA, true, true); if (recover) { struct dnode_of_data dn; @@ -387,8 +387,9 @@ static int 
__f2fs_commit_inmem_pages(struct inode *inode) if (page->mapping == inode->i_mapping) { trace_f2fs_commit_inmem_page(page, INMEM); + f2fs_wait_on_page_writeback(page, DATA, true, true); + set_page_dirty(page); - f2fs_wait_on_page_writeback(page, DATA, true); if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); f2fs_remove_dirty_inode(inode); @@ -620,14 +621,16 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino) return 0; if (!test_opt(sbi, FLUSH_MERGE)) { + atomic_inc(&fcc->queued_flush); ret = submit_flush_wait(sbi, ino); + atomic_dec(&fcc->queued_flush); atomic_inc(&fcc->issued_flush); return ret; } - if (atomic_inc_return(&fcc->issing_flush) == 1 || sbi->s_ndevs > 1) { + if (atomic_inc_return(&fcc->queued_flush) == 1 || sbi->s_ndevs > 1) { ret = submit_flush_wait(sbi, ino); - atomic_dec(&fcc->issing_flush); + atomic_dec(&fcc->queued_flush); atomic_inc(&fcc->issued_flush); return ret; @@ -646,14 +649,14 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino) if (fcc->f2fs_issue_flush) { wait_for_completion(&cmd.wait); - atomic_dec(&fcc->issing_flush); + atomic_dec(&fcc->queued_flush); } else { struct llist_node *list; list = llist_del_all(&fcc->issue_list); if (!list) { wait_for_completion(&cmd.wait); - atomic_dec(&fcc->issing_flush); + atomic_dec(&fcc->queued_flush); } else { struct flush_cmd *tmp, *next; @@ -662,7 +665,7 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino) llist_for_each_entry_safe(tmp, next, list, llnode) { if (tmp == &cmd) { cmd.ret = ret; - atomic_dec(&fcc->issing_flush); + atomic_dec(&fcc->queued_flush); continue; } tmp->ret = ret; @@ -691,7 +694,7 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) if (!fcc) return -ENOMEM; atomic_set(&fcc->issued_flush, 0); - atomic_set(&fcc->issing_flush, 0); + atomic_set(&fcc->queued_flush, 0); init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->fcc_info = fcc; @@ -703,7 +706,7 @@ init_thread: "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { err = PTR_ERR(fcc->f2fs_issue_flush); - kfree(fcc); + kvfree(fcc); SM_I(sbi)->fcc_info = NULL; return err; } @@ -722,7 +725,7 @@ void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) kthread_stop(flush_thread); } if (free) { - kfree(fcc); + kvfree(fcc); SM_I(sbi)->fcc_info = NULL; } } @@ -907,7 +910,7 @@ static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, dc->len = len; dc->ref = 0; dc->state = D_PREP; - dc->issuing = 0; + dc->queued = 0; dc->error = 0; init_completion(&dc->wait); list_add_tail(&dc->list, pend_list); @@ -940,7 +943,7 @@ static void __detach_discard_cmd(struct discard_cmd_control *dcc, struct discard_cmd *dc) { if (dc->state == D_DONE) - atomic_sub(dc->issuing, &dcc->issing_discard); + atomic_sub(dc->queued, &dcc->queued_discard); list_del(&dc->list); rb_erase_cached(&dc->rb_node, &dcc->root); @@ -1143,12 +1146,12 @@ submit: dc->bio_ref++; spin_unlock_irqrestore(&dc->lock, flags); - atomic_inc(&dcc->issing_discard); - dc->issuing++; + atomic_inc(&dcc->queued_discard); + dc->queued++; list_move_tail(&dc->list, wait_list); /* sanity check on discard range */ - __check_sit_bitmap(sbi, start, start + len); + __check_sit_bitmap(sbi, lstart, lstart + len); bio->bi_private = dc; bio->bi_end_io = f2fs_submit_discard_endio; @@ -1649,6 +1652,10 @@ static int issue_discard_thread(void *data) if (dcc->discard_wake) dcc->discard_wake = 0; + /* clean up pending candidates before going to sleep */ + if 
(atomic_read(&dcc->queued_discard)) + __wait_all_discard_cmd(sbi, NULL); + if (try_to_freeze()) continue; if (f2fs_readonly(sbi->sb)) @@ -1734,7 +1741,7 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { #ifdef CONFIG_BLK_DEV_ZONED - if (f2fs_sb_has_blkzoned(sbi->sb) && + if (f2fs_sb_has_blkzoned(sbi) && bdev_zoned_model(bdev) != BLK_ZONED_NONE) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif @@ -1882,7 +1889,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, unsigned int start = 0, end = -1; unsigned int secno, start_segno; bool force = (cpc->reason & CP_DISCARD); - bool need_align = test_opt(sbi, LFS) && sbi->segs_per_sec > 1; + bool need_align = test_opt(sbi, LFS) && __is_large_section(sbi); mutex_lock(&dirty_i->seglist_lock); @@ -1914,7 +1921,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, (end - 1) <= cpc->trim_end) continue; - if (!test_opt(sbi, LFS) || sbi->segs_per_sec == 1) { + if (!test_opt(sbi, LFS) || !__is_large_section(sbi)) { f2fs_issue_discard(sbi, START_BLOCK(sbi, start), (end - start) << sbi->log_blocks_per_seg); continue; @@ -1946,7 +1953,7 @@ find_next: sbi->blocks_per_seg, cur_pos); len = next_pos - cur_pos; - if (f2fs_sb_has_blkzoned(sbi->sb) || + if (f2fs_sb_has_blkzoned(sbi) || (force && len < cpc->trim_minlen)) goto skip; @@ -1994,7 +2001,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&dcc->fstrim_list); mutex_init(&dcc->cmd_lock); atomic_set(&dcc->issued_discard, 0); - atomic_set(&dcc->issing_discard, 0); + atomic_set(&dcc->queued_discard, 0); atomic_set(&dcc->discard_cmd_cnt, 0); dcc->nr_discards = 0; dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg; @@ -2010,7 +2017,7 @@ init_thread: "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(dcc->f2fs_issue_discard)) { err = PTR_ERR(dcc->f2fs_issue_discard); - kfree(dcc); + kvfree(dcc); SM_I(sbi)->dcc_info = NULL; return err; } @@ -2027,7 +2034,7 @@ static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) f2fs_stop_discard_thread(sbi); - kfree(dcc); + kvfree(dcc); SM_I(sbi)->dcc_info = NULL; } @@ -2146,7 +2153,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) /* update total number of valid blocks to be written in ckpt area */ SIT_I(sbi)->written_valid_blocks += del; - if (sbi->segs_per_sec > 1) + if (__is_large_section(sbi)) get_sec_entry(sbi, segno)->valid_blocks += del; } @@ -2412,7 +2419,7 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) { /* if segs_per_sec is large than 1, we need to keep original policy. 
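The segs_per_sec != 1 test converted just below is one of many sites in this diff that switch to __is_large_section(); judging by the open-coded checks it replaces, the helper is presumably nothing more than:

/* assumed definition, inferred from the "segs_per_sec > 1" sites it replaces */
#define __is_large_section(sbi)		((sbi)->segs_per_sec > 1)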
*/ - if (sbi->segs_per_sec != 1) + if (__is_large_section(sbi)) return CURSEG_I(sbi, type)->segno; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) @@ -2722,7 +2729,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) struct discard_policy dpolicy; unsigned long long trimmed = 0; int err = 0; - bool need_align = test_opt(sbi, LFS) && sbi->segs_per_sec > 1; + bool need_align = test_opt(sbi, LFS) && __is_large_section(sbi); if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; @@ -3272,16 +3279,18 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, } void f2fs_wait_on_page_writeback(struct page *page, - enum page_type type, bool ordered) + enum page_type type, bool ordered, bool locked) { if (PageWriteback(page)) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); f2fs_submit_merged_write_cond(sbi, NULL, page, 0, type); - if (ordered) + if (ordered) { wait_on_page_writeback(page); - else + f2fs_bug_on(sbi, locked && PageWriteback(page)); + } else { wait_for_stable_page(page); + } } } @@ -3298,7 +3307,7 @@ void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr) cpage = find_lock_page(META_MAPPING(sbi), blkaddr); if (cpage) { - f2fs_wait_on_page_writeback(cpage, DATA, true); + f2fs_wait_on_page_writeback(cpage, DATA, true, true); f2fs_put_page(cpage, 1); } } @@ -3880,7 +3889,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) if (!sit_i->tmp_map) return -ENOMEM; - if (sbi->segs_per_sec > 1) { + if (__is_large_section(sbi)) { sit_i->sec_entries = f2fs_kvzalloc(sbi, array_size(sizeof(struct sec_entry), MAIN_SECS(sbi)), @@ -4035,7 +4044,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) se->valid_blocks; } - if (sbi->segs_per_sec > 1) + if (__is_large_section(sbi)) get_sec_entry(sbi, start)->valid_blocks += se->valid_blocks; } @@ -4079,7 +4088,7 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) sbi->discard_blks -= se->valid_blocks; } - if (sbi->segs_per_sec > 1) { + if (__is_large_section(sbi)) { get_sec_entry(sbi, start)->valid_blocks += se->valid_blocks; get_sec_entry(sbi, start)->valid_blocks -= @@ -4314,7 +4323,7 @@ static void destroy_dirty_segmap(struct f2fs_sb_info *sbi) destroy_victim_secmap(sbi); SM_I(sbi)->dirty_info = NULL; - kfree(dirty_i); + kvfree(dirty_i); } static void destroy_curseg(struct f2fs_sb_info *sbi) @@ -4326,10 +4335,10 @@ static void destroy_curseg(struct f2fs_sb_info *sbi) return; SM_I(sbi)->curseg_array = NULL; for (i = 0; i < NR_CURSEG_TYPE; i++) { - kfree(array[i].sum_blk); - kfree(array[i].journal); + kvfree(array[i].sum_blk); + kvfree(array[i].journal); } - kfree(array); + kvfree(array); } static void destroy_free_segmap(struct f2fs_sb_info *sbi) @@ -4340,7 +4349,7 @@ static void destroy_free_segmap(struct f2fs_sb_info *sbi) SM_I(sbi)->free_info = NULL; kvfree(free_i->free_segmap); kvfree(free_i->free_secmap); - kfree(free_i); + kvfree(free_i); } static void destroy_sit_info(struct f2fs_sb_info *sbi) @@ -4353,26 +4362,26 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) if (sit_i->sentries) { for (start = 0; start < MAIN_SEGS(sbi); start++) { - kfree(sit_i->sentries[start].cur_valid_map); + kvfree(sit_i->sentries[start].cur_valid_map); #ifdef CONFIG_F2FS_CHECK_FS - kfree(sit_i->sentries[start].cur_valid_map_mir); + kvfree(sit_i->sentries[start].cur_valid_map_mir); #endif - kfree(sit_i->sentries[start].ckpt_valid_map); - kfree(sit_i->sentries[start].discard_map); + kvfree(sit_i->sentries[start].ckpt_valid_map); + 
kvfree(sit_i->sentries[start].discard_map); } } - kfree(sit_i->tmp_map); + kvfree(sit_i->tmp_map); kvfree(sit_i->sentries); kvfree(sit_i->sec_entries); kvfree(sit_i->dirty_sentries_bitmap); SM_I(sbi)->sit_info = NULL; - kfree(sit_i->sit_bitmap); + kvfree(sit_i->sit_bitmap); #ifdef CONFIG_F2FS_CHECK_FS - kfree(sit_i->sit_bitmap_mir); + kvfree(sit_i->sit_bitmap_mir); #endif - kfree(sit_i); + kvfree(sit_i); } void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi) @@ -4388,7 +4397,7 @@ void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi) destroy_free_segmap(sbi); destroy_sit_info(sbi); sbi->sm_info = NULL; - kfree(sm_info); + kvfree(sm_info); } int __init f2fs_create_segment_manager_caches(void) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index ab3465faddf1..a77f76f528b6 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -333,7 +333,7 @@ static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, * In order to get # of valid blocks in a section instantly from many * segments, f2fs manages two counting structures separately. */ - if (use_section && sbi->segs_per_sec > 1) + if (use_section && __is_large_section(sbi)) return get_sec_entry(sbi, segno)->valid_blocks; else return get_seg_entry(sbi, segno)->valid_blocks; diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 9e13db994fdf..a467aca29cfe 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -135,6 +135,6 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi) f2fs_shrink_extent_tree(sbi, __count_extent_cache(sbi)); spin_lock(&f2fs_list_lock); - list_del(&sbi->s_list); + list_del_init(&sbi->s_list); spin_unlock(&f2fs_list_lock); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index af58b2cc21b8..c46a1d4318d4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -38,7 +38,7 @@ static struct kmem_cache *f2fs_inode_cachep; #ifdef CONFIG_F2FS_FAULT_INJECTION -char *f2fs_fault_name[FAULT_MAX] = { +const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_KMALLOC] = "kmalloc", [FAULT_KVMALLOC] = "kvmalloc", [FAULT_PAGE_ALLOC] = "page alloc", @@ -259,7 +259,7 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, "quota options when quota turned on"); return -EINVAL; } - if (f2fs_sb_has_quota_ino(sb)) { + if (f2fs_sb_has_quota_ino(sbi)) { f2fs_msg(sb, KERN_INFO, "QUOTA feature is enabled, so ignore qf_name"); return 0; @@ -289,7 +289,7 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, set_opt(sbi, QUOTA); return 0; errout: - kfree(qname); + kvfree(qname); return ret; } @@ -302,7 +302,7 @@ static int f2fs_clear_qf_name(struct super_block *sb, int qtype) " when quota turned on"); return -EINVAL; } - kfree(F2FS_OPTION(sbi).s_qf_names[qtype]); + kvfree(F2FS_OPTION(sbi).s_qf_names[qtype]); F2FS_OPTION(sbi).s_qf_names[qtype] = NULL; return 0; } @@ -314,7 +314,7 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) * 'grpquota' mount options are allowed even without quota feature * to support legacy quotas in quota files. */ - if (test_opt(sbi, PRJQUOTA) && !f2fs_sb_has_project_quota(sbi->sb)) { + if (test_opt(sbi, PRJQUOTA) && !f2fs_sb_has_project_quota(sbi)) { f2fs_msg(sbi->sb, KERN_ERR, "Project quota feature not enabled. 
" "Cannot enable project quota enforcement."); return -1; @@ -348,7 +348,7 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) } } - if (f2fs_sb_has_quota_ino(sbi->sb) && F2FS_OPTION(sbi).s_jquota_fmt) { + if (f2fs_sb_has_quota_ino(sbi) && F2FS_OPTION(sbi).s_jquota_fmt) { f2fs_msg(sbi->sb, KERN_INFO, "QUOTA feature is enabled, so ignore jquota_fmt"); F2FS_OPTION(sbi).s_jquota_fmt = 0; @@ -399,10 +399,10 @@ static int parse_options(struct super_block *sb, char *options) set_opt(sbi, BG_GC); set_opt(sbi, FORCE_FG_GC); } else { - kfree(name); + kvfree(name); return -EINVAL; } - kfree(name); + kvfree(name); break; case Opt_disable_roll_forward: set_opt(sbi, DISABLE_ROLL_FORWARD); @@ -417,7 +417,7 @@ static int parse_options(struct super_block *sb, char *options) set_opt(sbi, DISCARD); break; case Opt_nodiscard: - if (f2fs_sb_has_blkzoned(sb)) { + if (f2fs_sb_has_blkzoned(sbi)) { f2fs_msg(sb, KERN_WARNING, "discard is required for zoned block devices"); return -EINVAL; @@ -566,11 +566,11 @@ static int parse_options(struct super_block *sb, char *options) return -ENOMEM; if (strlen(name) == 8 && !strncmp(name, "adaptive", 8)) { - if (f2fs_sb_has_blkzoned(sb)) { + if (f2fs_sb_has_blkzoned(sbi)) { f2fs_msg(sb, KERN_WARNING, "adaptive mode is not allowed with " "zoned block device feature"); - kfree(name); + kvfree(name); return -EINVAL; } set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); @@ -578,10 +578,10 @@ static int parse_options(struct super_block *sb, char *options) !strncmp(name, "lfs", 3)) { set_opt_mode(sbi, F2FS_MOUNT_LFS); } else { - kfree(name); + kvfree(name); return -EINVAL; } - kfree(name); + kvfree(name); break; case Opt_io_size_bits: if (args->from && match_int(args, &arg)) @@ -714,10 +714,10 @@ static int parse_options(struct super_block *sb, char *options) !strncmp(name, "fs-based", 8)) { F2FS_OPTION(sbi).whint_mode = WHINT_MODE_FS; } else { - kfree(name); + kvfree(name); return -EINVAL; } - kfree(name); + kvfree(name); break; case Opt_alloc: name = match_strdup(&args[0]); @@ -731,10 +731,10 @@ static int parse_options(struct super_block *sb, char *options) !strncmp(name, "reuse", 5)) { F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; } else { - kfree(name); + kvfree(name); return -EINVAL; } - kfree(name); + kvfree(name); break; case Opt_fsync: name = match_strdup(&args[0]); @@ -751,14 +751,14 @@ static int parse_options(struct super_block *sb, char *options) F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_NOBARRIER; } else { - kfree(name); + kvfree(name); return -EINVAL; } - kfree(name); + kvfree(name); break; case Opt_test_dummy_encryption: #ifdef CONFIG_F2FS_FS_ENCRYPTION - if (!f2fs_sb_has_encrypt(sb)) { + if (!f2fs_sb_has_encrypt(sbi)) { f2fs_msg(sb, KERN_ERR, "Encrypt feature is off"); return -EINVAL; } @@ -783,10 +783,10 @@ static int parse_options(struct super_block *sb, char *options) !strncmp(name, "disable", 7)) { set_opt(sbi, DISABLE_CHECKPOINT); } else { - kfree(name); + kvfree(name); return -EINVAL; } - kfree(name); + kvfree(name); break; default: f2fs_msg(sb, KERN_ERR, @@ -799,13 +799,13 @@ static int parse_options(struct super_block *sb, char *options) if (f2fs_check_quota_options(sbi)) return -EINVAL; #else - if (f2fs_sb_has_quota_ino(sbi->sb) && !f2fs_readonly(sbi->sb)) { + if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sbi->sb)) { f2fs_msg(sbi->sb, KERN_INFO, "Filesystem with quota feature cannot be mounted RDWR " "without CONFIG_QUOTA"); return -EINVAL; } - if (f2fs_sb_has_project_quota(sbi->sb) && !f2fs_readonly(sbi->sb)) { + if (f2fs_sb_has_project_quota(sbi) 
&& !f2fs_readonly(sbi->sb)) { f2fs_msg(sb, KERN_ERR, "Filesystem with project quota feature cannot be " "mounted RDWR without CONFIG_QUOTA"); @@ -821,8 +821,8 @@ static int parse_options(struct super_block *sb, char *options) } if (test_opt(sbi, INLINE_XATTR_SIZE)) { - if (!f2fs_sb_has_extra_attr(sb) || - !f2fs_sb_has_flexible_inline_xattr(sb)) { + if (!f2fs_sb_has_extra_attr(sbi) || + !f2fs_sb_has_flexible_inline_xattr(sbi)) { f2fs_msg(sb, KERN_ERR, "extra_attr or flexible_inline_xattr " "feature is off"); @@ -1017,10 +1017,10 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) for (i = 0; i < sbi->s_ndevs; i++) { blkdev_put(FDEV(i).bdev, FMODE_EXCL); #ifdef CONFIG_BLK_DEV_ZONED - kfree(FDEV(i).blkz_type); + kvfree(FDEV(i).blkz_type); #endif } - kfree(sbi->devs); + kvfree(sbi->devs); } static void f2fs_put_super(struct super_block *sb) @@ -1058,9 +1058,6 @@ static void f2fs_put_super(struct super_block *sb) f2fs_write_checkpoint(sbi, &cpc); } - /* f2fs_write_checkpoint can update stat informaion */ - f2fs_destroy_stats(sbi); - /* * normally superblock is clean, so we need to release this. * In addition, EIO will skip do checkpoint, we need this as well. @@ -1080,29 +1077,35 @@ static void f2fs_put_super(struct super_block *sb) iput(sbi->node_inode); iput(sbi->meta_inode); + /* + * iput() can update stat information, if f2fs_write_checkpoint() + * above failed with error. + */ + f2fs_destroy_stats(sbi); + /* destroy f2fs internal modules */ f2fs_destroy_node_manager(sbi); f2fs_destroy_segment_manager(sbi); - kfree(sbi->ckpt); + kvfree(sbi->ckpt); f2fs_unregister_sysfs(sbi); sb->s_fs_info = NULL; if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); - kfree(sbi->raw_super); + kvfree(sbi->raw_super); destroy_device_list(sbi); mempool_destroy(sbi->write_io_dummy); #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) - kfree(F2FS_OPTION(sbi).s_qf_names[i]); + kvfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif destroy_percpu_info(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) - kfree(sbi->write_io[i]); - kfree(sbi); + kvfree(sbi->write_io[i]); + kvfree(sbi); } int f2fs_sync_fs(struct super_block *sb, int sync) @@ -1431,7 +1434,7 @@ static void default_options(struct f2fs_sb_info *sbi) sbi->sb->s_flags |= SB_LAZYTIME; set_opt(sbi, FLUSH_MERGE); set_opt(sbi, DISCARD); - if (f2fs_sb_has_blkzoned(sbi->sb)) + if (f2fs_sb_has_blkzoned(sbi)) set_opt_mode(sbi, F2FS_MOUNT_LFS); else set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); @@ -1457,19 +1460,16 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) sbi->sb->s_flags |= SB_ACTIVE; - mutex_lock(&sbi->gc_mutex); f2fs_update_time(sbi, DISABLE_TIME); while (!f2fs_time_over(sbi, DISABLE_TIME)) { + mutex_lock(&sbi->gc_mutex); err = f2fs_gc(sbi, true, false, NULL_SEGNO); if (err == -ENODATA) break; - if (err && err != -EAGAIN) { - mutex_unlock(&sbi->gc_mutex); + if (err && err != -EAGAIN) return err; - } } - mutex_unlock(&sbi->gc_mutex); err = sync_filesystem(sbi->sb); if (err) @@ -1531,7 +1531,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) GFP_KERNEL); if (!org_mount_opt.s_qf_names[i]) { for (j = 0; j < i; j++) - kfree(org_mount_opt.s_qf_names[j]); + kvfree(org_mount_opt.s_qf_names[j]); return -ENOMEM; } } else { @@ -1575,7 +1575,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) sb->s_flags &= ~SB_RDONLY; if (sb_any_quota_suspended(sb)) { dquot_resume(sb, -1); - } else if (f2fs_sb_has_quota_ino(sb)) { + } else if (f2fs_sb_has_quota_ino(sbi)) { err = f2fs_enable_quotas(sb); if (err) goto 
restore_opts; @@ -1651,7 +1651,7 @@ skip: #ifdef CONFIG_QUOTA /* Release old quota file names */ for (i = 0; i < MAXQUOTAS; i++) - kfree(org_mount_opt.s_qf_names[i]); + kvfree(org_mount_opt.s_qf_names[i]); #endif /* Update the POSIXACL Flag */ sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | @@ -1672,7 +1672,7 @@ restore_opts: #ifdef CONFIG_QUOTA F2FS_OPTION(sbi).s_jquota_fmt = org_mount_opt.s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { - kfree(F2FS_OPTION(sbi).s_qf_names[i]); + kvfree(F2FS_OPTION(sbi).s_qf_names[i]); F2FS_OPTION(sbi).s_qf_names[i] = org_mount_opt.s_qf_names[i]; } #endif @@ -1817,7 +1817,7 @@ int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly) int enabled = 0; int i, err; - if (f2fs_sb_has_quota_ino(sbi->sb) && rdonly) { + if (f2fs_sb_has_quota_ino(sbi) && rdonly) { err = f2fs_enable_quotas(sbi->sb); if (err) { f2fs_msg(sbi->sb, KERN_ERR, @@ -1848,7 +1848,7 @@ static int f2fs_quota_enable(struct super_block *sb, int type, int format_id, unsigned long qf_inum; int err; - BUG_ON(!f2fs_sb_has_quota_ino(sb)); + BUG_ON(!f2fs_sb_has_quota_ino(F2FS_SB(sb))); qf_inum = f2fs_qf_ino(sb, type); if (!qf_inum) @@ -1993,7 +1993,7 @@ static int f2fs_quota_off(struct super_block *sb, int type) goto out_put; err = dquot_quota_off(sb, type); - if (err || f2fs_sb_has_quota_ino(sb)) + if (err || f2fs_sb_has_quota_ino(F2FS_SB(sb))) goto out_put; inode_lock(inode); @@ -2173,7 +2173,7 @@ static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, * if LOST_FOUND feature is enabled. * */ - if (f2fs_sb_has_lost_found(sbi->sb) && + if (f2fs_sb_has_lost_found(sbi) && inode->i_ino == F2FS_ROOT_INO(sbi)) return -EPERM; @@ -2396,7 +2396,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, __u32 crc = 0; /* Check checksum_offset and crc in superblock */ - if (le32_to_cpu(raw_super->feature) & F2FS_FEATURE_SB_CHKSUM) { + if (__F2FS_HAS_FEATURE(raw_super, F2FS_FEATURE_SB_CHKSUM)) { crc_offset = le32_to_cpu(raw_super->checksum_offset); if (crc_offset != offsetof(struct f2fs_super_block, crc)) { @@ -2496,10 +2496,10 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return 1; } - if (segment_count > (le32_to_cpu(raw_super->block_count) >> 9)) { + if (segment_count > (le64_to_cpu(raw_super->block_count) >> 9)) { f2fs_msg(sb, KERN_INFO, - "Wrong segment_count / block_count (%u > %u)", - segment_count, le32_to_cpu(raw_super->block_count)); + "Wrong segment_count / block_count (%u > %llu)", + segment_count, le64_to_cpu(raw_super->block_count)); return 1; } @@ -2674,7 +2674,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) static void init_sb_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = sbi->raw_super; - int i, j; + int i; sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); @@ -2692,7 +2692,10 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->node_ino_num = le32_to_cpu(raw_super->node_ino); sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino); sbi->cur_victim_sec = NULL_SECNO; + sbi->next_victim_seg[BG_GC] = NULL_SEGNO; + sbi->next_victim_seg[FG_GC] = NULL_SEGNO; sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; + sbi->migration_granularity = sbi->segs_per_sec; sbi->dir_level = DEF_DIR_LEVEL; sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; @@ -2710,9 +2713,6 @@ static void init_sb_info(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); - for (i = 0; i < NR_PAGE_TYPE - 1; i++) - for (j = HOT; j < NR_TEMP_TYPE; j++) - mutex_init(&sbi->wio_mutex[i][j]); 
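Throughout this diff the f2fs_sb_has_*() feature tests switch from taking a super_block to taking the f2fs_sb_info, and sanity_check_raw_super() above starts using a raw-superblock variant, __F2FS_HAS_FEATURE(). The generating macros presumably end up roughly as follows (a sketch inferred from the call sites, not the verbatim f2fs.h change):

#define __F2FS_HAS_FEATURE(raw_super, mask)				\
	(((raw_super)->feature & cpu_to_le32(mask)) != 0)
#define F2FS_HAS_FEATURE(sbi, mask)	__F2FS_HAS_FEATURE((sbi)->raw_super, mask)

#define F2FS_FEATURE_FUNCS(name, flagname)				\
static inline int f2fs_sb_has_##name(struct f2fs_sb_info *sbi)		\
{									\
	return F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_##flagname);		\
}

F2FS_FEATURE_FUNCS(encrypt, ENCRYPT);
F2FS_FEATURE_FUNCS(blkzoned, BLKZONED);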
init_rwsem(&sbi->io_order_lock); spin_lock_init(&sbi->cp_lock); @@ -2749,7 +2749,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) unsigned int n = 0; int err = -EIO; - if (!f2fs_sb_has_blkzoned(sbi->sb)) + if (!f2fs_sb_has_blkzoned(sbi)) return 0; if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != @@ -2800,7 +2800,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) } } - kfree(zones); + kvfree(zones); return err; } @@ -2860,7 +2860,7 @@ static int read_raw_super_block(struct f2fs_sb_info *sbi, /* No valid superblock */ if (!*raw_super) - kfree(super); + kvfree(super); else err = 0; @@ -2880,7 +2880,7 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) } /* we should update superblock crc here */ - if (!recover && f2fs_sb_has_sb_chksum(sbi->sb)) { + if (!recover && f2fs_sb_has_sb_chksum(sbi)) { crc = f2fs_crc32(sbi, F2FS_RAW_SUPER(sbi), offsetof(struct f2fs_super_block, crc)); F2FS_RAW_SUPER(sbi)->crc = cpu_to_le32(crc); @@ -2968,7 +2968,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) #ifdef CONFIG_BLK_DEV_ZONED if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM && - !f2fs_sb_has_blkzoned(sbi->sb)) { + !f2fs_sb_has_blkzoned(sbi)) { f2fs_msg(sbi->sb, KERN_ERR, "Zoned block device feature not enabled\n"); return -EINVAL; @@ -3064,7 +3064,7 @@ try_onemore: sbi->raw_super = raw_super; /* precompute checksum seed for metadata */ - if (f2fs_sb_has_inode_chksum(sb)) + if (f2fs_sb_has_inode_chksum(sbi)) sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, sizeof(raw_super->uuid)); @@ -3074,7 +3074,7 @@ try_onemore: * devices, but mandatory for host-managed zoned block devices. */ #ifndef CONFIG_BLK_DEV_ZONED - if (f2fs_sb_has_blkzoned(sb)) { + if (f2fs_sb_has_blkzoned(sbi)) { f2fs_msg(sb, KERN_ERR, "Zoned block device support is not enabled\n"); err = -EOPNOTSUPP; @@ -3101,13 +3101,13 @@ try_onemore: #ifdef CONFIG_QUOTA sb->dq_op = &f2fs_quota_operations; - if (f2fs_sb_has_quota_ino(sb)) + if (f2fs_sb_has_quota_ino(sbi)) sb->s_qcop = &dquot_quotactl_sysfile_ops; else sb->s_qcop = &f2fs_quotactl_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; - if (f2fs_sb_has_quota_ino(sbi->sb)) { + if (f2fs_sb_has_quota_ino(sbi)) { for (i = 0; i < MAXQUOTAS; i++) { if (f2fs_qf_ino(sbi->sb, i)) sbi->nquota_files++; @@ -3259,30 +3259,30 @@ try_onemore: f2fs_build_gc_manager(sbi); + err = f2fs_build_stats(sbi); + if (err) + goto free_nm; + /* get an inode for node space */ sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi)); if (IS_ERR(sbi->node_inode)) { f2fs_msg(sb, KERN_ERR, "Failed to read node inode"); err = PTR_ERR(sbi->node_inode); - goto free_nm; + goto free_stats; } - err = f2fs_build_stats(sbi); - if (err) - goto free_node_inode; - /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); if (IS_ERR(root)) { f2fs_msg(sb, KERN_ERR, "Failed to read root inode"); err = PTR_ERR(root); - goto free_stats; + goto free_node_inode; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size || !root->i_nlink) { iput(root); err = -EINVAL; - goto free_stats; + goto free_node_inode; } sb->s_root = d_make_root(root); /* allocate root dentry */ @@ -3297,7 +3297,7 @@ try_onemore: #ifdef CONFIG_QUOTA /* Enable quota usage during mount */ - if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb)) { + if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sb)) { err = f2fs_enable_quotas(sb); if (err) f2fs_msg(sb, KERN_ERR, @@ -3369,7 +3369,7 @@ skip_recovery: if (err) goto free_meta; } - kfree(options); + kvfree(options); 
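The many kfree() to kvfree() conversions above (and earlier in node.c, segment.c and gc.c) cover buffers whose allocation helpers may hand back either slab or vmalloc memory, presumably because f2fs's allocation wrappers can fall back to kvmalloc; kvfree() copes with both, so the free side no longer needs to know which allocator was used. A short sketch of the pairing (the function name is hypothetical):

static int example_alloc(struct f2fs_sb_info *sbi, size_t size)
{
	void *buf = f2fs_kzalloc(sbi, size, GFP_KERNEL);	/* may be slab- or vmalloc-backed */

	if (!buf)
		return -ENOMEM;
	/* ... use buf ... */
	kvfree(buf);	/* correct for either backing allocator */
	return 0;
}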
/* recover broken superblock */ if (recovery) { @@ -3392,7 +3392,7 @@ skip_recovery: free_meta: #ifdef CONFIG_QUOTA f2fs_truncate_quota_inode_pages(sb); - if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb)) + if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sb)) f2fs_quota_off_umount(sbi->sb); #endif /* @@ -3406,19 +3406,19 @@ free_meta: free_root_inode: dput(sb->s_root); sb->s_root = NULL; -free_stats: - f2fs_destroy_stats(sbi); free_node_inode: f2fs_release_ino_entry(sbi, true); truncate_inode_pages_final(NODE_MAPPING(sbi)); iput(sbi->node_inode); +free_stats: + f2fs_destroy_stats(sbi); free_nm: f2fs_destroy_node_manager(sbi); free_sm: f2fs_destroy_segment_manager(sbi); free_devices: destroy_device_list(sbi); - kfree(sbi->ckpt); + kvfree(sbi->ckpt); free_meta_inode: make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); @@ -3428,19 +3428,19 @@ free_percpu: destroy_percpu_info(sbi); free_bio_info: for (i = 0; i < NR_PAGE_TYPE; i++) - kfree(sbi->write_io[i]); + kvfree(sbi->write_io[i]); free_options: #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) - kfree(F2FS_OPTION(sbi).s_qf_names[i]); + kvfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif - kfree(options); + kvfree(options); free_sb_buf: - kfree(raw_super); + kvfree(raw_super); free_sbi: if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); - kfree(sbi); + kvfree(sbi); /* give only one another chance */ if (retry) { diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index b777cbdd796b..0575edbe3ed6 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -90,34 +90,34 @@ static ssize_t features_show(struct f2fs_attr *a, if (!sb->s_bdev->bd_part) return snprintf(buf, PAGE_SIZE, "0\n"); - if (f2fs_sb_has_encrypt(sb)) + if (f2fs_sb_has_encrypt(sbi)) len += snprintf(buf, PAGE_SIZE - len, "%s", "encryption"); - if (f2fs_sb_has_blkzoned(sb)) + if (f2fs_sb_has_blkzoned(sbi)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "blkzoned"); - if (f2fs_sb_has_extra_attr(sb)) + if (f2fs_sb_has_extra_attr(sbi)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "extra_attr"); - if (f2fs_sb_has_project_quota(sb)) + if (f2fs_sb_has_project_quota(sbi)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "projquota"); - if (f2fs_sb_has_inode_chksum(sb)) + if (f2fs_sb_has_inode_chksum(sbi)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "inode_checksum"); - if (f2fs_sb_has_flexible_inline_xattr(sb)) + if (f2fs_sb_has_flexible_inline_xattr(sbi)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "flexible_inline_xattr"); - if (f2fs_sb_has_quota_ino(sb)) + if (f2fs_sb_has_quota_ino(sbi)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "quota_ino"); - if (f2fs_sb_has_inode_crtime(sb)) + if (f2fs_sb_has_inode_crtime(sbi)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "inode_crtime"); - if (f2fs_sb_has_lost_found(sb)) + if (f2fs_sb_has_lost_found(sbi)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "lost_found"); - if (f2fs_sb_has_sb_chksum(sb)) + if (f2fs_sb_has_sb_chksum(sbi)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? 
", " : "", "sb_checksum"); len += snprintf(buf + len, PAGE_SIZE - len, "\n"); @@ -246,6 +246,11 @@ out: return count; } + if (!strcmp(a->attr.name, "migration_granularity")) { + if (t == 0 || t > sbi->segs_per_sec) + return -EINVAL; + } + if (!strcmp(a->attr.name, "trim_sections")) return -EINVAL; @@ -406,6 +411,7 @@ F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, migration_granularity, migration_granularity); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); @@ -460,6 +466,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(min_hot_blocks), ATTR_LIST(min_ssr_sections), ATTR_LIST(max_victim_search), + ATTR_LIST(migration_granularity), ATTR_LIST(dir_level), ATTR_LIST(ram_thresh), ATTR_LIST(ra_nid_pages), diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 7261245c208d..18d5ffbc5e8c 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -288,7 +288,7 @@ static int read_xattr_block(struct inode *inode, void *txattr_addr) static int lookup_all_xattrs(struct inode *inode, struct page *ipage, unsigned int index, unsigned int len, const char *name, struct f2fs_xattr_entry **xe, - void **base_addr) + void **base_addr, int *base_size) { void *cur_addr, *txattr_addr, *last_addr = NULL; nid_t xnid = F2FS_I(inode)->i_xattr_nid; @@ -299,8 +299,8 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, if (!size && !inline_size) return -ENODATA; - txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode), - inline_size + size + XATTR_PADDING_SIZE, GFP_NOFS); + *base_size = inline_size + size + XATTR_PADDING_SIZE; + txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode), *base_size, GFP_NOFS); if (!txattr_addr) return -ENOMEM; @@ -312,8 +312,10 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, *xe = __find_inline_xattr(inode, txattr_addr, &last_addr, index, len, name); - if (*xe) + if (*xe) { + *base_size = inline_size; goto check; + } } /* read from xattr node block */ @@ -415,7 +417,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, } f2fs_wait_on_page_writeback(ipage ? 
ipage : in_page, - NODE, true); + NODE, true, true); /* no need to use xattr node block */ if (hsize <= inline_size) { err = f2fs_truncate_xattr_node(inode); @@ -439,7 +441,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, goto in_page_out; } f2fs_bug_on(sbi, new_nid); - f2fs_wait_on_page_writeback(xpage, NODE, true); + f2fs_wait_on_page_writeback(xpage, NODE, true, true); } else { struct dnode_of_data dn; set_new_dnode(&dn, inode, NULL, NULL, new_nid); @@ -474,6 +476,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, int error = 0; unsigned int size, len; void *base_addr = NULL; + int base_size; if (name == NULL) return -EINVAL; @@ -484,7 +487,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, down_read(&F2FS_I(inode)->i_xattr_sem); error = lookup_all_xattrs(inode, ipage, index, len, name, - &entry, &base_addr); + &entry, &base_addr, &base_size); up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; @@ -498,6 +501,11 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (buffer) { char *pval = entry->e_name + entry->e_name_len; + + if (base_size - (pval - (char *)base_addr) < size) { + error = -ERANGE; + goto out; + } memcpy(buffer, pval, size); } error = size; diff --git a/fs/file.c b/fs/file.c index 7ffd6e9d103d..3209ee271c41 100644 --- a/fs/file.c +++ b/fs/file.c @@ -158,7 +158,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr) * or have finished their rcu_read_lock_sched() section. */ if (atomic_read(&files->count) > 1) - synchronize_sched(); + synchronize_rcu(); spin_lock(&files->file_lock); if (!new_fdt) @@ -640,6 +640,35 @@ out_unlock: } EXPORT_SYMBOL(__close_fd); /* for ksys_close() */ +/* + * variant of __close_fd that gets a ref on the file for later fput + */ +int __close_fd_get_file(unsigned int fd, struct file **res) +{ + struct files_struct *files = current->files; + struct file *file; + struct fdtable *fdt; + + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + if (fd >= fdt->max_fds) + goto out_unlock; + file = fdt->fd[fd]; + if (!file) + goto out_unlock; + rcu_assign_pointer(fdt->fd[fd], NULL); + __put_unused_fd(files, fd); + spin_unlock(&files->file_lock); + get_file(file); + *res = file; + return filp_close(file, files); + +out_unlock: + spin_unlock(&files->file_lock); + *res = NULL; + return -ENOENT; +} + void do_close_on_exec(struct files_struct *files) { unsigned i; diff --git a/fs/file_table.c b/fs/file_table.c index e49af4caf15d..5679e7fcb6b0 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -380,10 +380,11 @@ void __init files_init(void) void __init files_maxfiles_init(void) { unsigned long n; - unsigned long memreserve = (totalram_pages - nr_free_pages()) * 3/2; + unsigned long nr_pages = totalram_pages(); + unsigned long memreserve = (nr_pages - nr_free_pages()) * 3/2; - memreserve = min(memreserve, totalram_pages - 1); - n = ((totalram_pages - memreserve) * (PAGE_SIZE / 1024)) / 10; + memreserve = min(memreserve, nr_pages - 1); + n = ((nr_pages - memreserve) * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 568abed20eb2..76baaa6be393 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -824,7 +824,7 @@ static const struct super_operations fuse_super_operations = { static void sanitize_global_limit(unsigned *limit) { if (*limit == 0) - *limit = ((totalram_pages << PAGE_SHIFT) >> 13) / + *limit = ((totalram_pages() << PAGE_SHIFT) >> 13) / 
sizeof(struct fuse_req); if (*limit >= 1 << 16) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 8afbb35559b9..05dd78f4b2b3 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -820,10 +820,10 @@ out: * @page: the page that's being released * @gfp_mask: passed from Linux VFS, ignored by us * - * Call try_to_free_buffers() if the buffers in this page can be - * released. + * Calls try_to_free_buffers() to free the buffers and put the page if the + * buffers can be released. * - * Returns: 0 + * Returns: 1 if the page was put or else 0 */ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) @@ -930,14 +930,14 @@ static const struct address_space_operations gfs2_jdata_aops = { void gfs2_set_aops(struct inode *inode) { struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); - if (gfs2_is_writeback(ip)) + if (gfs2_is_jdata(ip)) + inode->i_mapping->a_ops = &gfs2_jdata_aops; + else if (gfs2_is_writeback(sdp)) inode->i_mapping->a_ops = &gfs2_writeback_aops; - else if (gfs2_is_ordered(ip)) + else if (gfs2_is_ordered(sdp)) inode->i_mapping->a_ops = &gfs2_ordered_aops; - else if (gfs2_is_jdata(ip)) - inode->i_mapping->a_ops = &gfs2_jdata_aops; else BUG(); } - diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 9a4a15d646eb..02b2646d84b3 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -14,6 +14,7 @@ #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> #include <linux/iomap.h> +#include <linux/ktime.h> #include "gfs2.h" #include "incore.h" @@ -2083,6 +2084,8 @@ static int do_grow(struct inode *inode, u64 size) } error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT + + (unstuff && + gfs2_is_jdata(ip) ? RES_JDATA : 0) + (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ? 0 : RES_QUOTA), 0); if (error) @@ -2248,7 +2251,9 @@ int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) unsigned int shift = sdp->sd_sb.sb_bsize_shift; u64 size; int rc; + ktime_t start, end; + start = ktime_get(); lblock_stop = i_size_read(jd->jd_inode) >> shift; size = (lblock_stop - lblock) << shift; jd->nr_extents = 0; @@ -2268,8 +2273,9 @@ int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) lblock += (bh.b_size >> ip->i_inode.i_blkbits); } while(size > 0); - fs_info(sdp, "journal %d mapped with %u extents\n", jd->jd_jid, - jd->nr_extents); + end = ktime_get(); + fs_info(sdp, "journal %d mapped with %u extents in %lldms\n", jd->jd_jid, + jd->nr_extents, ktime_ms_delta(end, start)); return 0; fail: diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 45a17b770d97..a2dea5bc0427 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1199,13 +1199,13 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) mutex_lock(&fp->f_fl_mutex); if (gfs2_holder_initialized(fl_gh)) { + struct file_lock request; if (fl_gh->gh_state == state) goto out; - locks_lock_file_wait(file, - &(struct file_lock) { - .fl_type = F_UNLCK, - .fl_flags = FL_FLOCK - }); + locks_init_lock(&request); + request.fl_type = F_UNLCK; + request.fl_flags = FL_FLOCK; + locks_lock_file_wait(file, &request); gfs2_glock_dq(fl_gh); gfs2_holder_reinit(state, flags, fl_gh); } else { diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 05431324b262..b92740edc416 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1777,7 +1777,7 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) * */ -void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) +void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl) { const struct gfs2_glock_operations *glops = 
gl->gl_ops; unsigned long long dtime; diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 5e12220cc0c2..8949bf28b249 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -202,7 +202,7 @@ extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, struct gfs2_holder *gh); extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); -extern void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); +extern void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl); #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0) extern __printf(2, 3) void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index c63bee9adb6a..f15b4c57c4bd 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -28,6 +28,7 @@ #include "util.h" #include "trans.h" #include "dir.h" +#include "lops.h" struct workqueue_struct *gfs2_freeze_wq; @@ -466,17 +467,25 @@ static int inode_go_lock(struct gfs2_holder *gh) * */ -static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) +static void inode_go_dump(struct seq_file *seq, struct gfs2_glock *gl) { - const struct gfs2_inode *ip = gl->gl_object; + struct gfs2_inode *ip = gl->gl_object; + struct inode *inode = &ip->i_inode; + unsigned long nrpages; + if (ip == NULL) return; - gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n", + + xa_lock_irq(&inode->i_data.i_pages); + nrpages = inode->i_data.nrpages; + xa_unlock_irq(&inode->i_data.i_pages); + + gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu p:%lu\n", (unsigned long long)ip->i_no_formal_ino, (unsigned long long)ip->i_no_addr, IF2DT(ip->i_inode.i_mode), ip->i_flags, (unsigned int)ip->i_diskflags, - (unsigned long long)i_size_read(&ip->i_inode)); + (unsigned long long)i_size_read(inode), nrpages); } /** diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 888b62cfd6d1..e10e0b0a7cd5 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -165,7 +165,6 @@ struct gfs2_bufdata { u64 bd_blkno; struct list_head bd_list; - const struct gfs2_log_operations *bd_ops; struct gfs2_trans *bd_tr; struct list_head bd_ail_st_list; @@ -244,7 +243,7 @@ struct gfs2_glock_operations { int (*go_demote_ok) (const struct gfs2_glock *gl); int (*go_lock) (struct gfs2_holder *gh); void (*go_unlock) (struct gfs2_holder *gh); - void (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); + void (*go_dump)(struct seq_file *seq, struct gfs2_glock *gl); void (*go_callback)(struct gfs2_glock *gl, bool remote); const int go_type; const unsigned long go_flags; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 648f0ca1ad57..998051c4aea7 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -744,17 +744,19 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, the gfs2 structures. 
*/ if (default_acl) { error = __gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + if (error) + goto fail_gunlock3; posix_acl_release(default_acl); + default_acl = NULL; } if (acl) { - if (!error) - error = __gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS); + error = __gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS); + if (error) + goto fail_gunlock3; posix_acl_release(acl); + acl = NULL; } - if (error) - goto fail_gunlock3; - error = security_inode_init_security(&ip->i_inode, &dip->i_inode, name, &gfs2_initxattrs, NULL); if (error) @@ -789,10 +791,8 @@ fail_free_inode: } gfs2_rsqa_delete(ip, NULL); fail_free_acls: - if (default_acl) - posix_acl_release(default_acl); - if (acl) - posix_acl_release(acl); + posix_acl_release(default_acl); + posix_acl_release(acl); fail_gunlock: gfs2_dir_no_add(&da); gfs2_glock_dq_uninit(ghs); diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index b5b6341a4f5c..793808263c6d 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -30,16 +30,14 @@ static inline int gfs2_is_jdata(const struct gfs2_inode *ip) return ip->i_diskflags & GFS2_DIF_JDATA; } -static inline int gfs2_is_writeback(const struct gfs2_inode *ip) +static inline bool gfs2_is_ordered(const struct gfs2_sbd *sdp) { - const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - return (sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK) && !gfs2_is_jdata(ip); + return sdp->sd_args.ar_data == GFS2_DATA_ORDERED; } -static inline int gfs2_is_ordered(const struct gfs2_inode *ip) +static inline bool gfs2_is_writeback(const struct gfs2_sbd *sdp) { - const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - return (sdp->sd_args.ar_data == GFS2_DATA_ORDERED) && !gfs2_is_jdata(ip); + return sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK; } static inline int gfs2_is_dir(const struct gfs2_inode *ip) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 99dd58694ba1..5bfaf381921a 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -605,7 +605,6 @@ void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) bd->bd_blkno = bh->b_blocknr; gfs2_remove_from_ail(bd); /* drops ref on bh */ bd->bd_bh = NULL; - bd->bd_ops = &gfs2_revoke_lops; sdp->sd_log_num_revoke++; atomic_inc(&gl->gl_revokes); set_bit(GLF_LFLUSH, &gl->gl_flags); @@ -734,7 +733,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, lh->lh_crc = cpu_to_be32(crc); gfs2_log_write(sdp, page, sb->s_blocksize, 0, addr); - gfs2_log_flush_bio(sdp, REQ_OP_WRITE, op_flags); + gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE | op_flags); log_flush_wait(sdp); } @@ -811,7 +810,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) gfs2_ordered_write(sdp); lops_before_commit(sdp, tr); - gfs2_log_flush_bio(sdp, REQ_OP_WRITE, 0); + gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE); if (sdp->sd_log_head != sdp->sd_log_flush_head) { log_flush_wait(sdp); diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 20241436126d..1bc9bd444b28 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -51,12 +51,11 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp, static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) { - struct gfs2_sbd *sdp; + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - if (!gfs2_is_ordered(ip)) + if (gfs2_is_jdata(ip) || !gfs2_is_ordered(sdp)) return; - sdp = GFS2_SB(&ip->i_inode); if (!test_bit(GIF_ORDERED, &ip->i_flags)) { spin_lock(&sdp->sd_ordered_lock); if (!test_and_set_bit(GIF_ORDERED, &ip->i_flags)) diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 4c7069b8f3c1..94dcab655bc0 100644 --- a/fs/gfs2/lops.c +++ 
b/fs/gfs2/lops.c @@ -17,7 +17,9 @@ #include <linux/bio.h> #include <linux/fs.h> #include <linux/list_sort.h> +#include <linux/blkdev.h> +#include "bmap.h" #include "dir.h" #include "gfs2.h" #include "incore.h" @@ -193,7 +195,6 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec, /** * gfs2_end_log_write - end of i/o to the log * @bio: The bio - * @error: Status of i/o request * * Each bio_vec contains either data from the pagecache or data * relating to the log itself. Here we iterate over the bio_vec @@ -228,83 +229,86 @@ static void gfs2_end_log_write(struct bio *bio) } /** - * gfs2_log_flush_bio - Submit any pending log bio - * @sdp: The superblock - * @op: REQ_OP - * @op_flags: req_flag_bits + * gfs2_log_submit_bio - Submit any pending log bio + * @biop: Address of the bio pointer + * @opf: REQ_OP | op_flags * * Submit any pending part-built or full bio to the block device. If * there is no pending bio, then this is a no-op. */ -void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int op, int op_flags) +void gfs2_log_submit_bio(struct bio **biop, int opf) { - if (sdp->sd_log_bio) { + struct bio *bio = *biop; + if (bio) { + struct gfs2_sbd *sdp = bio->bi_private; atomic_inc(&sdp->sd_log_in_flight); - bio_set_op_attrs(sdp->sd_log_bio, op, op_flags); - submit_bio(sdp->sd_log_bio); - sdp->sd_log_bio = NULL; + bio->bi_opf = opf; + submit_bio(bio); + *biop = NULL; } } /** - * gfs2_log_alloc_bio - Allocate a new bio for log writing - * @sdp: The superblock - * @blkno: The next device block number we want to write to + * gfs2_log_alloc_bio - Allocate a bio + * @sdp: The super block + * @blkno: The device block number we want to write to + * @end_io: The bi_end_io callback * - * This should never be called when there is a cached bio in the - * super block. When it returns, there will be a cached bio in the - * super block which will have as many bio_vecs as the device is - * happy to handle. + * Allocate a new bio, initialize it with the given parameters and return it. * - * Returns: Newly allocated bio + * Returns: The newly allocated bio */ -static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno) +static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno, + bio_end_io_t *end_io) { struct super_block *sb = sdp->sd_vfs; - struct bio *bio; - - BUG_ON(sdp->sd_log_bio); + struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); - bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9); bio_set_dev(bio, sb->s_bdev); - bio->bi_end_io = gfs2_end_log_write; + bio->bi_end_io = end_io; bio->bi_private = sdp; - sdp->sd_log_bio = bio; - return bio; } /** * gfs2_log_get_bio - Get cached log bio, or allocate a new one - * @sdp: The superblock + * @sdp: The super block * @blkno: The device block number we want to write to + * @bio: The bio to get or allocate + * @op: REQ_OP + * @end_io: The bi_end_io callback + * @flush: Always flush the current bio and allocate a new one? * * If there is a cached bio, then if the next block number is sequential * with the previous one, return it, otherwise flush the bio to the - * device. If there is not a cached bio, or we just flushed it, then + * device. If there is no cached bio, or we just flushed it, then * allocate a new one. 
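A minimal userspace sketch of the append-or-flush idea this helper implements: blocks that extend the current run are added to the pending batch, anything discontiguous (or a full batch) forces a submit and a fresh batch. The struct and names below are invented for illustration and are not the kernel's bio machinery.

#include <stdio.h>

#define BATCH_MAX 4

/* Toy stand-in for a partially built bio: a run of contiguous blocks. */
struct batch {
	unsigned long long start;	/* first block in the run */
	int count;			/* blocks accumulated so far */
};

static void submit(struct batch *b)
{
	if (b->count)
		printf("submit blocks %llu..%llu\n",
		       b->start, b->start + b->count - 1);
	b->count = 0;
}

/* Append blkno if it extends the current run; otherwise flush and restart. */
static void log_write(struct batch *b, unsigned long long blkno)
{
	if (b->count && (blkno != b->start + b->count || b->count == BATCH_MAX))
		submit(b);
	if (!b->count)
		b->start = blkno;
	b->count++;
}

int main(void)
{
	struct batch b = { 0, 0 };
	unsigned long long blocks[] = { 10, 11, 12, 40, 41, 7 };

	for (unsigned i = 0; i < sizeof(blocks) / sizeof(blocks[0]); i++)
		log_write(&b, blocks[i]);
	submit(&b);	/* final flush, as the caller does before waiting */
	return 0;
}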
* * Returns: The bio to use for log writes */ -static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno) +static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno, + struct bio **biop, int op, + bio_end_io_t *end_io, bool flush) { - struct bio *bio = sdp->sd_log_bio; - u64 nblk; + struct bio *bio = *biop; if (bio) { + u64 nblk; + nblk = bio_end_sector(bio); nblk >>= sdp->sd_fsb2bb_shift; - if (blkno == nblk) + if (blkno == nblk && !flush) return bio; - gfs2_log_flush_bio(sdp, REQ_OP_WRITE, 0); + gfs2_log_submit_bio(biop, op); } - return gfs2_log_alloc_bio(sdp, blkno); + *biop = gfs2_log_alloc_bio(sdp, blkno, end_io); + return *biop; } /** @@ -326,11 +330,12 @@ void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page, struct bio *bio; int ret; - bio = gfs2_log_get_bio(sdp, blkno); + bio = gfs2_log_get_bio(sdp, blkno, &sdp->sd_log_bio, REQ_OP_WRITE, + gfs2_end_log_write, false); ret = bio_add_page(bio, page, size, offset); if (ret == 0) { - gfs2_log_flush_bio(sdp, REQ_OP_WRITE, 0); - bio = gfs2_log_alloc_bio(sdp, blkno); + bio = gfs2_log_get_bio(sdp, blkno, &sdp->sd_log_bio, + REQ_OP_WRITE, gfs2_end_log_write, true); ret = bio_add_page(bio, page, size, offset); WARN_ON(ret == 0); } @@ -370,6 +375,184 @@ void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page) gfs2_log_bmap(sdp)); } +/** + * gfs2_end_log_read - end I/O callback for reads from the log + * @bio: The bio + * + * Simply unlock the pages in the bio. The main thread will wait on them and + * process them in order as necessary. + */ + +static void gfs2_end_log_read(struct bio *bio) +{ + struct page *page; + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) { + page = bvec->bv_page; + if (bio->bi_status) { + int err = blk_status_to_errno(bio->bi_status); + + SetPageError(page); + mapping_set_error(page->mapping, err); + } + unlock_page(page); + } + + bio_put(bio); +} + +/** + * gfs2_jhead_pg_srch - Look for the journal head in a given page. + * @jd: The journal descriptor + * @page: The page to look in + * + * Returns: 1 if found, 0 otherwise. + */ + +static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, + struct page *page) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_log_header_host uninitialized_var(lh); + void *kaddr = kmap_atomic(page); + unsigned int offset; + bool ret = false; + + for (offset = 0; offset < PAGE_SIZE; offset += sdp->sd_sb.sb_bsize) { + if (!__get_log_header(sdp, kaddr + offset, 0, &lh)) { + if (lh.lh_sequence > head->lh_sequence) + *head = lh; + else { + ret = true; + break; + } + } + } + kunmap_atomic(kaddr); + return ret; +} + +/** + * gfs2_jhead_process_page - Search/cleanup a page + * @jd: The journal descriptor + * @index: Index of the page to look into + * @done: If set, perform only cleanup, else search and set if found. + * + * Find the page with 'index' in the journal's mapping. Search the page for + * the journal head if requested (cleanup == false). Release refs on the + * page so the page cache can reclaim it (put_page() twice). We grabbed a + * reference on this page two times, first when we did a find_or_create_page() + * to obtain the page to add it to the bio and second when we do a + * find_get_page() here to get the page to wait on while I/O on it is being + * completed. + * This function is also used to free up a page we might've grabbed but not + * used. Maybe we added it to a bio, but not submitted it for I/O. 
Or we + * submitted the I/O, but we already found the jhead so we only need to drop + * our references to the page. + */ + +static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index, + struct gfs2_log_header_host *head, + bool *done) +{ + struct page *page; + + page = find_get_page(jd->jd_inode->i_mapping, index); + wait_on_page_locked(page); + + if (PageError(page)) + *done = true; + + if (!*done) + *done = gfs2_jhead_pg_srch(jd, head, page); + + put_page(page); /* Once for find_get_page */ + put_page(page); /* Once more for find_or_create_page */ +} + +/** + * gfs2_find_jhead - find the head of a log + * @jd: The journal descriptor + * @head: The log descriptor for the head of the log is returned here + * + * Do a search of a journal by reading it in large chunks using bios and find + * the valid log entry with the highest sequence number. (i.e. the log head) + * + * Returns: 0 on success, errno otherwise + */ + +int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct address_space *mapping = jd->jd_inode->i_mapping; + struct gfs2_journal_extent *je; + u32 block, read_idx = 0, submit_idx = 0, index = 0; + int shift = PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift; + int blocks_per_page = 1 << shift, sz, ret = 0; + struct bio *bio = NULL; + struct page *page; + bool done = false; + errseq_t since; + + memset(head, 0, sizeof(*head)); + if (list_empty(&jd->extent_list)) + gfs2_map_journal_extents(sdp, jd); + + since = filemap_sample_wb_err(mapping); + list_for_each_entry(je, &jd->extent_list, list) { + for (block = 0; block < je->blocks; block += blocks_per_page) { + index = (je->lblock + block) >> shift; + + page = find_or_create_page(mapping, index, GFP_NOFS); + if (!page) { + ret = -ENOMEM; + done = true; + goto out; + } + + if (bio) { + sz = bio_add_page(bio, page, PAGE_SIZE, 0); + if (sz == PAGE_SIZE) + goto page_added; + submit_idx = index; + submit_bio(bio); + bio = NULL; + } + + bio = gfs2_log_alloc_bio(sdp, + je->dblock + (index << shift), + gfs2_end_log_read); + bio->bi_opf = REQ_OP_READ; + sz = bio_add_page(bio, page, PAGE_SIZE, 0); + gfs2_assert_warn(sdp, sz == PAGE_SIZE); + +page_added: + if (submit_idx <= read_idx + BIO_MAX_PAGES) { + /* Keep at least one bio in flight */ + continue; + } + + gfs2_jhead_process_page(jd, read_idx++, head, &done); + if (done) + goto out; /* found */ + } + } + +out: + if (bio) + submit_bio(bio); + while (read_idx <= index) + gfs2_jhead_process_page(jd, read_idx++, head, &done); + + if (!ret) + ret = filemap_check_wb_err(mapping, since); + + return ret; +} + static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type, u32 ld_length, u32 ld_data1) { diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index e4949394f054..331160fc568b 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -30,8 +30,10 @@ extern u64 gfs2_log_bmap(struct gfs2_sbd *sdp); extern void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page, unsigned size, unsigned offset, u64 blkno); extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page); -extern void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int op, int op_flags); +extern void gfs2_log_submit_bio(struct bio **biop, int opf); extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); +extern int gfs2_find_jhead(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head); static inline unsigned int buf_limit(struct gfs2_sbd *sdp) { diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 
b041cb8ae383..1179763f6370 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -41,6 +41,7 @@ #include "dir.h" #include "meta_io.h" #include "trace_gfs2.h" +#include "lops.h" #define DO 0 #define UNDO 1 diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 0f501f938d1c..7389e445a7a7 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -120,6 +120,35 @@ void gfs2_revoke_clean(struct gfs2_jdesc *jd) } } +int __get_log_header(struct gfs2_sbd *sdp, const struct gfs2_log_header *lh, + unsigned int blkno, struct gfs2_log_header_host *head) +{ + u32 hash, crc; + + if (lh->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) || + lh->lh_header.mh_type != cpu_to_be32(GFS2_METATYPE_LH) || + (blkno && be32_to_cpu(lh->lh_blkno) != blkno)) + return 1; + + hash = crc32(~0, lh, LH_V1_SIZE - 4); + hash = ~crc32_le_shift(hash, 4); /* assume lh_hash is zero */ + + if (be32_to_cpu(lh->lh_hash) != hash) + return 1; + + crc = crc32c(~0, (void *)lh + LH_V1_SIZE + 4, + sdp->sd_sb.sb_bsize - LH_V1_SIZE - 4); + + if ((lh->lh_crc != 0 && be32_to_cpu(lh->lh_crc) != crc)) + return 1; + + head->lh_sequence = be64_to_cpu(lh->lh_sequence); + head->lh_flags = be32_to_cpu(lh->lh_flags); + head->lh_tail = be32_to_cpu(lh->lh_tail); + head->lh_blkno = be32_to_cpu(lh->lh_blkno); + + return 0; +} /** * get_log_header - read the log header for a given segment * @jd: the journal @@ -137,159 +166,18 @@ void gfs2_revoke_clean(struct gfs2_jdesc *jd) static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk, struct gfs2_log_header_host *head) { - struct gfs2_log_header *lh; + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct buffer_head *bh; - u32 hash, crc; int error; error = gfs2_replay_read_block(jd, blk, &bh); if (error) return error; - lh = (void *)bh->b_data; - - hash = crc32(~0, lh, LH_V1_SIZE - 4); - hash = ~crc32_le_shift(hash, 4); /* assume lh_hash is zero */ - - crc = crc32c(~0, (void *)lh + LH_V1_SIZE + 4, - bh->b_size - LH_V1_SIZE - 4); - - error = lh->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) || - lh->lh_header.mh_type != cpu_to_be32(GFS2_METATYPE_LH) || - be32_to_cpu(lh->lh_blkno) != blk || - be32_to_cpu(lh->lh_hash) != hash || - (lh->lh_crc != 0 && be32_to_cpu(lh->lh_crc) != crc); + error = __get_log_header(sdp, (const struct gfs2_log_header *)bh->b_data, + blk, head); brelse(bh); - if (!error) { - head->lh_sequence = be64_to_cpu(lh->lh_sequence); - head->lh_flags = be32_to_cpu(lh->lh_flags); - head->lh_tail = be32_to_cpu(lh->lh_tail); - head->lh_blkno = be32_to_cpu(lh->lh_blkno); - } - return error; -} - -/** - * find_good_lh - find a good log header - * @jd: the journal - * @blk: the segment to start searching from - * @lh: the log header to fill in - * @forward: if true search forward in the log, else search backward - * - * Call get_log_header() to get a log header for a segment, but if the - * segment is bad, either scan forward or backward until we find a good one. 
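The rewritten lookup above boils down to "read the whole journal in large chunks and keep the valid header with the largest sequence number". A toy, runnable model of that selection step, assuming the journal is just an array of sequence numbers with 0 standing for an invalid header (purely illustrative, not the on-disk layout):

#include <stdio.h>

/* Return the index of the valid header with the highest sequence, or -1. */
static int find_head(const unsigned long long *seq, int nblocks)
{
	unsigned long long best_seq = 0;
	int head = -1;

	for (int i = 0; i < nblocks; i++) {
		if (seq[i] && seq[i] > best_seq) {	/* 0 == invalid header */
			best_seq = seq[i];
			head = i;
		}
	}
	return head;
}

int main(void)
{
	/* A wrapped log: sequences grow, then wrap back to older entries. */
	unsigned long long journal[] = { 104, 105, 106, 0, 101, 102, 103 };

	printf("log head at block %d\n", find_head(journal, 7));
	return 0;
}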
- * - * Returns: errno - */ - -static int find_good_lh(struct gfs2_jdesc *jd, unsigned int *blk, - struct gfs2_log_header_host *head) -{ - unsigned int orig_blk = *blk; - int error; - - for (;;) { - error = get_log_header(jd, *blk, head); - if (error <= 0) - return error; - - if (++*blk == jd->jd_blocks) - *blk = 0; - - if (*blk == orig_blk) { - gfs2_consist_inode(GFS2_I(jd->jd_inode)); - return -EIO; - } - } -} - -/** - * jhead_scan - make sure we've found the head of the log - * @jd: the journal - * @head: this is filled in with the log descriptor of the head - * - * At this point, seg and lh should be either the head of the log or just - * before. Scan forward until we find the head. - * - * Returns: errno - */ - -static int jhead_scan(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head) -{ - unsigned int blk = head->lh_blkno; - struct gfs2_log_header_host lh; - int error; - - for (;;) { - if (++blk == jd->jd_blocks) - blk = 0; - - error = get_log_header(jd, blk, &lh); - if (error < 0) - return error; - if (error == 1) - continue; - - if (lh.lh_sequence == head->lh_sequence) { - gfs2_consist_inode(GFS2_I(jd->jd_inode)); - return -EIO; - } - if (lh.lh_sequence < head->lh_sequence) - break; - - *head = lh; - } - - return 0; -} - -/** - * gfs2_find_jhead - find the head of a log - * @jd: the journal - * @head: the log descriptor for the head of the log is returned here - * - * Do a binary search of a journal and find the valid log entry with the - * highest sequence number. (i.e. the log head) - * - * Returns: errno - */ - -int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head) -{ - struct gfs2_log_header_host lh_1, lh_m; - u32 blk_1, blk_2, blk_m; - int error; - - blk_1 = 0; - blk_2 = jd->jd_blocks - 1; - - for (;;) { - blk_m = (blk_1 + blk_2) / 2; - - error = find_good_lh(jd, &blk_1, &lh_1); - if (error) - return error; - - error = find_good_lh(jd, &blk_m, &lh_m); - if (error) - return error; - - if (blk_1 == blk_m || blk_m == blk_2) - break; - - if (lh_1.lh_sequence <= lh_m.lh_sequence) - blk_1 = blk_m; - else - blk_2 = blk_m; - } - - error = jhead_scan(jd, &lh_1); - if (error) - return error; - - *head = lh_1; - return error; } @@ -460,6 +348,8 @@ void gfs2_recover_func(struct work_struct *work) if (error) goto fail_gunlock_ji; t_jhd = ktime_get(); + fs_info(sdp, "jid=%u: Journal head lookup took %lldms\n", jd->jd_jid, + ktime_ms_delta(t_jhd, t_jlck)); if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n", diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h index 11fdfab4bf99..99575ab81202 100644 --- a/fs/gfs2/recovery.h +++ b/fs/gfs2/recovery.h @@ -27,10 +27,11 @@ extern int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where) extern int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where); extern void gfs2_revoke_clean(struct gfs2_jdesc *jd); -extern int gfs2_find_jhead(struct gfs2_jdesc *jd, - struct gfs2_log_header_host *head); extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait); extern void gfs2_recover_func(struct work_struct *work); +extern int __get_log_header(struct gfs2_sbd *sdp, + const struct gfs2_log_header *lh, unsigned int blkno, + struct gfs2_log_header_host *head); #endif /* __RECOVERY_DOT_H__ */ diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index b08a530433ad..831d7cb5a49c 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1780,9 +1780,9 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, goto next_iter; } if 
(ret == -E2BIG) { + n += rbm->bii - initial_bii; rbm->bii = 0; rbm->offset = 0; - n += (rbm->bii - initial_bii); goto res_covered_end_of_rgrp; } return ret; @@ -2256,7 +2256,7 @@ static void rgblk_free(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd, * */ -void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) +void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_glock *gl) { struct gfs2_rgrpd *rgd = gl->gl_object; struct gfs2_blkreserv *trs; diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index b596c3d17988..499079a9dbbe 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -72,7 +72,7 @@ extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist); extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); -extern void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); +extern void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_glock *gl); extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, struct buffer_head *bh, const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index ca71163ff7cf..d4b11c903971 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -45,6 +45,7 @@ #include "util.h" #include "sys.h" #include "xattr.h" +#include "lops.h" #define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 423bc2d03dd8..cd9a94a6b5bb 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -124,15 +124,13 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) } static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl, - struct buffer_head *bh, - const struct gfs2_log_operations *lops) + struct buffer_head *bh) { struct gfs2_bufdata *bd; bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL); bd->bd_bh = bh; bd->bd_gl = gl; - bd->bd_ops = lops; INIT_LIST_HEAD(&bd->bd_list); bh->b_private = bd; return bd; @@ -169,7 +167,7 @@ void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh) gfs2_log_unlock(sdp); unlock_buffer(bh); if (bh->b_private == NULL) - bd = gfs2_alloc_bufdata(gl, bh, &gfs2_databuf_lops); + bd = gfs2_alloc_bufdata(gl, bh); else bd = bh->b_private; lock_buffer(bh); @@ -210,7 +208,7 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) unlock_buffer(bh); lock_page(bh->b_page); if (bh->b_private == NULL) - bd = gfs2_alloc_bufdata(gl, bh, &gfs2_buf_lops); + bd = gfs2_alloc_bufdata(gl, bh); else bd = bh->b_private; unlock_page(bh->b_page); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 32920a10100e..a2fcea5f8225 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -383,17 +383,16 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end) * truncation is indicated by end of range being LLONG_MAX * In this case, we first scan the range and release found pages. * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv - * maps and global counts. Page faults can not race with truncation - * in this routine. hugetlb_no_page() prevents page faults in the - * truncated range. It checks i_size before allocation, and again after - * with the page table lock for the page held. The same lock must be - * acquired to unmap a page. + * maps and global counts. * hole punch is indicated if end is not LLONG_MAX * In the hole punch case we scan the range and release found pages. 
* Only when releasing a page is the associated region/reserv map * deleted. The region/reserv map for ranges without associated - * pages are not modified. Page faults can race with hole punch. - * This is indicated if we find a mapped page. + * pages are not modified. + * + * Callers of this routine must hold the i_mmap_rwsem in write mode to prevent + * races with page faults. + * * Note: If the passed end of range value is beyond the end of file, but * not LLONG_MAX this routine still performs a hole punch operation. */ @@ -423,32 +422,14 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, for (i = 0; i < pagevec_count(&pvec); ++i) { struct page *page = pvec.pages[i]; - u32 hash; index = page->index; - hash = hugetlb_fault_mutex_hash(h, current->mm, - &pseudo_vma, - mapping, index, 0); - mutex_lock(&hugetlb_fault_mutex_table[hash]); - /* - * If page is mapped, it was faulted in after being - * unmapped in caller. Unmap (again) now after taking - * the fault mutex. The mutex will prevent faults - * until we finish removing the page. - * - * This race can only happen in the hole punch case. - * Getting here in a truncate operation is a bug. + * A mapped page is impossible as callers should unmap + * all references before calling. And, i_mmap_rwsem + * prevents the creation of additional mappings. */ - if (unlikely(page_mapped(page))) { - BUG_ON(truncate_op); - - i_mmap_lock_write(mapping); - hugetlb_vmdelete_list(&mapping->i_mmap, - index * pages_per_huge_page(h), - (index + 1) * pages_per_huge_page(h)); - i_mmap_unlock_write(mapping); - } + VM_BUG_ON(page_mapped(page)); lock_page(page); /* @@ -470,7 +451,6 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, } unlock_page(page); - mutex_unlock(&hugetlb_fault_mutex_table[hash]); } huge_pagevec_release(&pvec); cond_resched(); @@ -482,9 +462,20 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, static void hugetlbfs_evict_inode(struct inode *inode) { + struct address_space *mapping = inode->i_mapping; struct resv_map *resv_map; + /* + * The vfs layer guarantees that there are no other users of this + * inode. Therefore, it would be safe to call remove_inode_hugepages + * without holding i_mmap_rwsem. We acquire and hold here to be + * consistent with other callers. Since there will be no contention + * on the semaphore, overhead is negligible. 
+ */ + i_mmap_lock_write(mapping); remove_inode_hugepages(inode, 0, LLONG_MAX); + i_mmap_unlock_write(mapping); + resv_map = (struct resv_map *)inode->i_mapping->private_data; /* root inode doesn't have the resv_map, so we should check it */ if (resv_map) @@ -505,8 +496,8 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0); - i_mmap_unlock_write(mapping); remove_inode_hugepages(inode, offset, LLONG_MAX); + i_mmap_unlock_write(mapping); return 0; } @@ -540,8 +531,8 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) hugetlb_vmdelete_list(&mapping->i_mmap, hole_start >> PAGE_SHIFT, hole_end >> PAGE_SHIFT); - i_mmap_unlock_write(mapping); remove_inode_hugepages(inode, hole_start, hole_end); + i_mmap_unlock_write(mapping); inode_unlock(inode); } @@ -624,7 +615,11 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, /* addr is the offset within the file (zero based) */ addr = index * hpage_size; - /* mutex taken here, fault path and hole punch */ + /* + * fault mutex taken here, protects against fault path + * and hole punch. inode_lock previously taken protects + * against truncation. + */ hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping, index, addr); mutex_lock(&hugetlb_fault_mutex_table[hash]); diff --git a/fs/inode.c b/fs/inode.c index 35d2108d567c..0cd47fe0dbe5 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2149,7 +2149,9 @@ EXPORT_SYMBOL(timespec64_trunc); */ struct timespec64 current_time(struct inode *inode) { - struct timespec64 now = current_kernel_time64(); + struct timespec64 now; + + ktime_get_coarse_real_ts64(&now); if (unlikely(!inode->i_sb)) { WARN(1, "current_time() called with uninitialized super_block in the inode"); diff --git a/fs/iomap.c b/fs/iomap.c index 5bc172f3dfe8..a3088fae567b 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -116,12 +116,6 @@ iomap_page_create(struct inode *inode, struct page *page) atomic_set(&iop->read_count, 0); atomic_set(&iop->write_count, 0); bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE); - - /* - * migrate_page_move_mapping() assumes that pages with private data have - * their count elevated by 1. - */ - get_page(page); set_page_private(page, (unsigned long)iop); SetPagePrivate(page); return iop; @@ -138,7 +132,6 @@ iomap_page_release(struct page *page) WARN_ON_ONCE(atomic_read(&iop->write_count)); ClearPagePrivate(page); set_page_private(page, 0); - put_page(page); kfree(iop); } @@ -499,16 +492,29 @@ done: } EXPORT_SYMBOL_GPL(iomap_readpages); +/* + * iomap_is_partially_uptodate checks whether blocks within a page are + * uptodate or not. + * + * Returns true if all blocks which correspond to a file portion + * we want to read within the page are uptodate. 
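A worked example of the clamping this adds, assuming a 4096-byte page and 1024-byte blocks (illustrative values only):

#include <stdio.h>

#define PAGE_SIZE	4096u
#define BLKBITS		10	/* 1024-byte blocks, i.e. i_blkbits == 10 */

int main(void)
{
	unsigned from = 1536, count = 8192;	/* read spills past this page */
	unsigned len, first, last;

	/* Limit range to one page, as the patched function now does. */
	len = PAGE_SIZE - from < count ? PAGE_SIZE - from : count;

	/* First and last blocks in range within page. */
	first = from >> BLKBITS;
	last = (from + len - 1) >> BLKBITS;

	printf("len=%u first=%u last=%u\n", len, first, last);
	/* -> len=2560 first=1 last=3: only blocks 1..3 of this page are
	 * consulted; without the clamp, last would be 9, past the page's
	 * own uptodate bitmap. */
	return 0;
}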
+ */ int iomap_is_partially_uptodate(struct page *page, unsigned long from, unsigned long count) { struct iomap_page *iop = to_iomap_page(page); struct inode *inode = page->mapping->host; - unsigned first = from >> inode->i_blkbits; - unsigned last = (from + count - 1) >> inode->i_blkbits; + unsigned len, first, last; unsigned i; + /* Limit range to one page */ + len = min_t(unsigned, PAGE_SIZE - from, count); + + /* First and last blocks in range within page */ + first = from >> inode->i_blkbits; + last = (from + len - 1) >> inode->i_blkbits; + if (iop) { for (i = first; i <= last; i++) if (!test_bit(i, iop->uptodate)) @@ -557,7 +563,7 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage, { int ret; - ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); + ret = migrate_page_move_mapping(mapping, newpage, page, mode, 0); if (ret != MIGRATEPAGE_SUCCESS) return ret; @@ -1537,7 +1543,7 @@ static void iomap_dio_bio_end_io(struct bio *bio) if (dio->wait_for_completion) { struct task_struct *waiter = dio->submit.waiter; WRITE_ONCE(dio->submit.waiter, NULL); - wake_up_process(waiter); + blk_wake_io_task(waiter); } else if (dio->flags & IOMAP_DIO_WRITE) { struct inode *inode = file_inode(dio->iocb->ki_filp); @@ -1565,6 +1571,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, unsigned len) { struct page *page = ZERO_PAGE(0); + int flags = REQ_SYNC | REQ_IDLE; struct bio *bio; bio = bio_alloc(GFP_KERNEL, 1); @@ -1573,9 +1580,12 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; + if (dio->iocb->ki_flags & IOCB_HIPRI) + flags |= REQ_HIPRI; + get_page(page); __bio_add_page(bio, page, len, 0); - bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE); + bio_set_op_attrs(bio, REQ_OP_WRITE, flags); atomic_inc(&dio->ref); return submit_bio(bio); @@ -1681,6 +1691,9 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, bio_set_pages_dirty(bio); } + if (dio->iocb->ki_flags & IOCB_HIPRI) + bio->bi_opf |= REQ_HIPRI; + iov_iter_advance(dio->submit.iter, n); dio->size += n; @@ -1915,7 +1928,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (!(iocb->ki_flags & IOCB_HIPRI) || !dio->submit.last_queue || !blk_poll(dio->submit.last_queue, - dio->submit.cookie)) + dio->submit.cookie, true)) io_schedule(); } __set_current_state(TASK_RUNNING); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 150cc030b4d7..2eb55c3361a8 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -439,6 +439,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) finish_wait(&journal->j_wait_updates, &wait); } spin_unlock(&commit_transaction->t_handle_lock); + commit_transaction->t_state = T_SWITCH; + write_unlock(&journal->j_state_lock); J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <= journal->j_max_transaction_buffers); @@ -505,6 +507,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) atomic_sub(atomic_read(&journal->j_reserved_credits), &commit_transaction->t_outstanding_credits); + write_lock(&journal->j_state_lock); trace_jbd2_commit_flushing(journal, commit_transaction); stats.run.rs_flushing = jiffies; stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked, diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index c0b66a7a795b..cc35537232f2 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -138,9 +138,9 @@ static inline void update_t_max_wait(transaction_t *transaction, } /* - * Wait until 
running transaction passes T_LOCKED state. Also starts the commit - * if needed. The function expects running transaction to exist and releases - * j_state_lock. + * Wait until running transaction passes to T_FLUSH state and new transaction + * can thus be started. Also starts the commit if needed. The function expects + * running transaction to exist and releases j_state_lock. */ static void wait_transaction_locked(journal_t *journal) __releases(journal->j_state_lock) @@ -160,6 +160,32 @@ static void wait_transaction_locked(journal_t *journal) finish_wait(&journal->j_wait_transaction_locked, &wait); } +/* + * Wait until running transaction transitions from T_SWITCH to T_FLUSH + * state and new transaction can thus be started. The function releases + * j_state_lock. + */ +static void wait_transaction_switching(journal_t *journal) + __releases(journal->j_state_lock) +{ + DEFINE_WAIT(wait); + + if (WARN_ON(!journal->j_running_transaction || + journal->j_running_transaction->t_state != T_SWITCH)) + return; + prepare_to_wait(&journal->j_wait_transaction_locked, &wait, + TASK_UNINTERRUPTIBLE); + read_unlock(&journal->j_state_lock); + /* + * We don't call jbd2_might_wait_for_commit() here as there's no + * waiting for outstanding handles happening anymore in T_SWITCH state + * and handling of reserved handles actually relies on that for + * correctness. + */ + schedule(); + finish_wait(&journal->j_wait_transaction_locked, &wait); +} + static void sub_reserved_credits(journal_t *journal, int blocks) { atomic_sub(blocks, &journal->j_reserved_credits); @@ -183,7 +209,8 @@ static int add_transaction_credits(journal_t *journal, int blocks, * If the current transaction is locked down for commit, wait * for the lock to be released. */ - if (t->t_state == T_LOCKED) { + if (t->t_state != T_RUNNING) { + WARN_ON_ONCE(t->t_state >= T_FLUSH); wait_transaction_locked(journal); return 1; } @@ -360,8 +387,14 @@ repeat: /* * We have handle reserved so we are allowed to join T_LOCKED * transaction and we don't have to check for transaction size - * and journal space. + * and journal space. But we still have to wait while running + * transaction is being switched to a committing one as it + * won't wait for any handles anymore. 
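A small, self-contained sketch of the resulting wait rules, using a simplified state enum. The names mirror the states discussed above, but the code is illustrative, not jbd2's; per the WARN added in this patch, a handle joining the running transaction should never observe it past T_SWITCH.

#include <stdbool.h>
#include <stdio.h>

/* Pre-commit states a joining handle can see on the running transaction. */
enum t_state { T_RUNNING, T_LOCKED, T_SWITCH };

/*
 * Normal handles must wait as soon as the transaction is no longer
 * T_RUNNING.  Handles with reserved credits may still join a T_LOCKED
 * transaction, but have to wait out the brief T_SWITCH window because
 * the commit code stops waiting for handles once it reaches that state.
 */
static bool must_wait(enum t_state state, bool reserved)
{
	if (!reserved)
		return state != T_RUNNING;
	return state == T_SWITCH;
}

int main(void)
{
	const char *names[] = { "T_RUNNING", "T_LOCKED", "T_SWITCH" };

	for (int s = T_RUNNING; s <= T_SWITCH; s++)
		printf("%-9s  normal: %s   reserved: %s\n", names[s],
		       must_wait((enum t_state)s, false) ? "wait" : "join",
		       must_wait((enum t_state)s, true) ? "wait" : "join");
	return 0;
}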
*/ + if (transaction->t_state == T_SWITCH) { + wait_transaction_switching(journal); + goto repeat; + } sub_reserved_credits(journal, blocks); handle->h_reserved = 0; } @@ -910,7 +943,7 @@ repeat: * this is the first time this transaction is touching this buffer, * reset the modified flag */ - jh->b_modified = 0; + jh->b_modified = 0; /* * If the buffer is not journaled right now, we need to make sure it diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 902a7dd10e5c..bb6ae387469f 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -101,7 +101,8 @@ static int jffs2_sync_fs(struct super_block *sb, int wait) struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); #ifdef CONFIG_JFFS2_FS_WRITEBUFFER - cancel_delayed_work_sync(&c->wbuf_dwork); + if (jffs2_is_writebuffered(c)) + cancel_delayed_work_sync(&c->wbuf_dwork); #endif mutex_lock(&c->alloc_sem); diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index dbf5bc250bfd..f8d5021a652e 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -857,7 +857,6 @@ static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) static void kernfs_notify_workfn(struct work_struct *work) { struct kernfs_node *kn; - struct kernfs_open_node *on; struct kernfs_super_info *info; repeat: /* pop one off the notify_list */ @@ -871,17 +870,6 @@ repeat: kn->attr.notify_next = NULL; spin_unlock_irq(&kernfs_notify_lock); - /* kick poll */ - spin_lock_irq(&kernfs_open_node_lock); - - on = kn->attr.open; - if (on) { - atomic_inc(&on->event); - wake_up_interruptible(&on->poll); - } - - spin_unlock_irq(&kernfs_open_node_lock); - /* kick fsnotify */ mutex_lock(&kernfs_mutex); @@ -934,10 +922,21 @@ void kernfs_notify(struct kernfs_node *kn) { static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn); unsigned long flags; + struct kernfs_open_node *on; if (WARN_ON(kernfs_type(kn) != KERNFS_FILE)) return; + /* kick poll immediately */ + spin_lock_irqsave(&kernfs_open_node_lock, flags); + on = kn->attr.open; + if (on) { + atomic_inc(&on->event); + wake_up_interruptible(&on->poll); + } + spin_unlock_irqrestore(&kernfs_open_node_lock, flags); + + /* schedule work to kick fsnotify */ spin_lock_irqsave(&kernfs_notify_lock, flags); if (!kn->attr.notify_next) { kernfs_get(kn); diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c index 00d5ef5f99f7..214a2fa1f1e3 100644 --- a/fs/lockd/clnt4xdr.c +++ b/fs/lockd/clnt4xdr.c @@ -128,24 +128,14 @@ static void encode_netobj(struct xdr_stream *xdr, static int decode_netobj(struct xdr_stream *xdr, struct xdr_netobj *obj) { - u32 length; - __be32 *p; + ssize_t ret; - p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; - length = be32_to_cpup(p++); - if (unlikely(length > XDR_MAX_NETOBJ)) - goto out_size; - obj->len = length; - obj->data = (u8 *)p; + ret = xdr_stream_decode_opaque_inline(xdr, (void *)&obj->data, + XDR_MAX_NETOBJ); + if (unlikely(ret < 0)) + return -EIO; + obj->len = ret; return 0; -out_size: - dprintk("NFS: returned netobj was too long: %u\n", length); - return -EIO; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 7c80c28df971..e8a004097d18 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -442,7 +442,7 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl) fl->fl_start = req->a_res.lock.fl.fl_start; fl->fl_end = req->a_res.lock.fl.fl_end; fl->fl_type = req->a_res.lock.fl.fl_type; - fl->fl_pid = 0; + fl->fl_pid = -req->a_res.lock.fl.fl_pid; break; default: status = 
nlm_stat_to_errno(req->a_res.status); diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c index 2c6176387143..747b9c8c940a 100644 --- a/fs/lockd/clntxdr.c +++ b/fs/lockd/clntxdr.c @@ -125,24 +125,14 @@ static void encode_netobj(struct xdr_stream *xdr, static int decode_netobj(struct xdr_stream *xdr, struct xdr_netobj *obj) { - u32 length; - __be32 *p; + ssize_t ret; - p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; - length = be32_to_cpup(p++); - if (unlikely(length > XDR_MAX_NETOBJ)) - goto out_size; - obj->len = length; - obj->data = (u8 *)p; + ret = xdr_stream_decode_opaque_inline(xdr, (void *)&obj->data, + XDR_MAX_NETOBJ); + if (unlikely(ret < 0)) + return -EIO; + obj->len = ret; return 0; -out_size: - dprintk("NFS: returned netobj was too long: %u\n", length); - return -EIO; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 74330daeab71..ea719cdd6a36 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -276,7 +276,7 @@ static int nlmsvc_unlink_block(struct nlm_block *block) dprintk("lockd: unlinking block %p...\n", block); /* Remove block from list */ - status = posix_unblock_lock(&block->b_call->a_args.lock.fl); + status = locks_delete_block(&block->b_call->a_args.lock.fl); nlmsvc_remove_block(block); return status; } diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 7147e4aebecc..9846f7e95282 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -127,7 +127,7 @@ nlm_decode_lock(__be32 *p, struct nlm_lock *lock) locks_init_lock(fl); fl->fl_owner = current->files; - fl->fl_pid = (pid_t)lock->svid; + fl->fl_pid = current->tgid; fl->fl_flags = FL_POSIX; fl->fl_type = F_RDLCK; /* as good as anything else */ start = ntohl(*p++); @@ -269,7 +269,7 @@ nlmsvc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p) memset(lock, 0, sizeof(*lock)); locks_init_lock(&lock->fl); lock->svid = ~(u32) 0; - lock->fl.fl_pid = (pid_t)lock->svid; + lock->fl.fl_pid = current->tgid; if (!(p = nlm_decode_cookie(p, &argp->cookie)) || !(p = xdr_decode_string_inplace(p, &lock->caller, diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index 7ed9edf9aed4..70154f376695 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -119,7 +119,7 @@ nlm4_decode_lock(__be32 *p, struct nlm_lock *lock) locks_init_lock(fl); fl->fl_owner = current->files; - fl->fl_pid = (pid_t)lock->svid; + fl->fl_pid = current->tgid; fl->fl_flags = FL_POSIX; fl->fl_type = F_RDLCK; /* as good as anything else */ p = xdr_decode_hyper(p, &start); @@ -266,7 +266,7 @@ nlm4svc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p) memset(lock, 0, sizeof(*lock)); locks_init_lock(&lock->fl); lock->svid = ~(u32) 0; - lock->fl.fl_pid = (pid_t)lock->svid; + lock->fl.fl_pid = current->tgid; if (!(p = nlm4_decode_cookie(p, &argp->cookie)) || !(p = xdr_decode_string_inplace(p, &lock->caller, diff --git a/fs/locks.c b/fs/locks.c index 2ecb4db8c840..f0b24d98f36b 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -11,11 +11,11 @@ * * Miscellaneous edits, and a total rewrite of posix_lock_file() code. * Kai Petzke (wpp@marie.physik.tu-berlin.de), 1994 - * + * * Converted file_lock_table to a linked list from an array, which eliminates * the limits on how many active file locks are open. * Chad Page (pageone@netcom.com), November 27, 1994 - * + * * Removed dependency on file descriptors. 
dup()'ed file descriptors now * get the same locks as the original file descriptors, and a close() on * any file descriptor removes ALL the locks on the file for the current @@ -41,7 +41,7 @@ * with a file pointer (filp). As a result they can be shared by a parent * process and its children after a fork(). They are removed when the last * file descriptor referring to the file pointer is closed (unless explicitly - * unlocked). + * unlocked). * * FL_FLOCK locks never deadlock, an existing lock is always removed before * upgrading from shared to exclusive (or vice versa). When this happens @@ -50,7 +50,7 @@ * Andy Walker (andy@lysaker.kvaerner.no), June 09, 1995 * * Removed some race conditions in flock_lock_file(), marked other possible - * races. Just grep for FIXME to see them. + * races. Just grep for FIXME to see them. * Dmitry Gorodchanin (pgmdsg@ibi.com), February 09, 1996. * * Addressed Dmitry's concerns. Deadlock checking no longer recursive. @@ -112,6 +112,46 @@ * Leases and LOCK_MAND * Matthew Wilcox <willy@debian.org>, June, 2000. * Stephen Rothwell <sfr@canb.auug.org.au>, June, 2000. + * + * Locking conflicts and dependencies: + * If multiple threads attempt to lock the same byte (or flock the same file) + * only one can be granted the lock, and other must wait their turn. + * The first lock has been "applied" or "granted", the others are "waiting" + * and are "blocked" by the "applied" lock.. + * + * Waiting and applied locks are all kept in trees whose properties are: + * + * - the root of a tree may be an applied or waiting lock. + * - every other node in the tree is a waiting lock that + * conflicts with every ancestor of that node. + * + * Every such tree begins life as a waiting singleton which obviously + * satisfies the above properties. + * + * The only ways we modify trees preserve these properties: + * + * 1. We may add a new leaf node, but only after first verifying that it + * conflicts with all of its ancestors. + * 2. We may remove the root of a tree, creating a new singleton + * tree from the root and N new trees rooted in the immediate + * children. + * 3. If the root of a tree is not currently an applied lock, we may + * apply it (if possible). + * 4. We may upgrade the root of the tree (either extend its range, + * or upgrade its entire range from read to write). + * + * When an applied lock is modified in a way that reduces or downgrades any + * part of its range, we remove all its children (2 above). This particularly + * happens when a lock is unlocked. + * + * For each of those child trees we "wake up" the thread which is + * waiting for the lock so it can continue handling as follows: if the + * root of the tree applies, we do so (3). If it doesn't, it must + * conflict with some applied lock. We remove (wake up) all of its children + * (2), and add it is a new leaf to the tree rooted in the applied + * lock (1). We then repeat the process recursively with those + * children. + * */ #include <linux/capability.h> @@ -189,9 +229,9 @@ static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS); * This lock protects the blocked_hash. Generally, if you're accessing it, you * want to be holding this lock. * - * In addition, it also protects the fl->fl_block list, and the fl->fl_next - * pointer for file_lock structures that are acting as lock requests (in - * contrast to those that are acting as records of acquired locks). 
+ * In addition, it also protects the fl->fl_blocked_requests list, and the + * fl->fl_blocker pointer for file_lock structures that are acting as lock + * requests (in contrast to those that are acting as records of acquired locks). * * Note that when we acquire this lock in order to change the above fields, * we often hold the flc_lock as well. In certain cases, when reading the fields @@ -293,7 +333,8 @@ static void locks_init_lock_heads(struct file_lock *fl) { INIT_HLIST_NODE(&fl->fl_link); INIT_LIST_HEAD(&fl->fl_list); - INIT_LIST_HEAD(&fl->fl_block); + INIT_LIST_HEAD(&fl->fl_blocked_requests); + INIT_LIST_HEAD(&fl->fl_blocked_member); init_waitqueue_head(&fl->fl_wait); } @@ -332,7 +373,8 @@ void locks_free_lock(struct file_lock *fl) { BUG_ON(waitqueue_active(&fl->fl_wait)); BUG_ON(!list_empty(&fl->fl_list)); - BUG_ON(!list_empty(&fl->fl_block)); + BUG_ON(!list_empty(&fl->fl_blocked_requests)); + BUG_ON(!list_empty(&fl->fl_blocked_member)); BUG_ON(!hlist_unhashed(&fl->fl_link)); locks_release_private(fl); @@ -357,7 +399,6 @@ void locks_init_lock(struct file_lock *fl) memset(fl, 0, sizeof(struct file_lock)); locks_init_lock_heads(fl); } - EXPORT_SYMBOL(locks_init_lock); /* @@ -397,9 +438,26 @@ void locks_copy_lock(struct file_lock *new, struct file_lock *fl) fl->fl_ops->fl_copy_lock(new, fl); } } - EXPORT_SYMBOL(locks_copy_lock); +static void locks_move_blocks(struct file_lock *new, struct file_lock *fl) +{ + struct file_lock *f; + + /* + * As ctx->flc_lock is held, new requests cannot be added to + * ->fl_blocked_requests, so we don't need a lock to check if it + * is empty. + */ + if (list_empty(&fl->fl_blocked_requests)) + return; + spin_lock(&blocked_lock_lock); + list_splice_init(&fl->fl_blocked_requests, &new->fl_blocked_requests); + list_for_each_entry(f, &fl->fl_blocked_requests, fl_blocked_member) + f->fl_blocker = new; + spin_unlock(&blocked_lock_lock); +} + static inline int flock_translate_cmd(int cmd) { if (cmd & LOCK_MAND) return cmd & (LOCK_MAND | LOCK_RW); @@ -416,17 +474,20 @@ static inline int flock_translate_cmd(int cmd) { /* Fill in a file_lock structure with an appropriate FLOCK lock. 
*/ static struct file_lock * -flock_make_lock(struct file *filp, unsigned int cmd) +flock_make_lock(struct file *filp, unsigned int cmd, struct file_lock *fl) { - struct file_lock *fl; int type = flock_translate_cmd(cmd); if (type < 0) return ERR_PTR(type); - - fl = locks_alloc_lock(); - if (fl == NULL) - return ERR_PTR(-ENOMEM); + + if (fl == NULL) { + fl = locks_alloc_lock(); + if (fl == NULL) + return ERR_PTR(-ENOMEM); + } else { + locks_init_lock(fl); + } fl->fl_file = filp; fl->fl_owner = filp; @@ -434,7 +495,7 @@ flock_make_lock(struct file *filp, unsigned int cmd) fl->fl_flags = FL_FLOCK; fl->fl_type = type; fl->fl_end = OFFSET_MAX; - + return fl; } @@ -666,16 +727,58 @@ static void locks_delete_global_blocked(struct file_lock *waiter) static void __locks_delete_block(struct file_lock *waiter) { locks_delete_global_blocked(waiter); - list_del_init(&waiter->fl_block); - waiter->fl_next = NULL; + list_del_init(&waiter->fl_blocked_member); + waiter->fl_blocker = NULL; } -static void locks_delete_block(struct file_lock *waiter) +static void __locks_wake_up_blocks(struct file_lock *blocker) { + while (!list_empty(&blocker->fl_blocked_requests)) { + struct file_lock *waiter; + + waiter = list_first_entry(&blocker->fl_blocked_requests, + struct file_lock, fl_blocked_member); + __locks_delete_block(waiter); + if (waiter->fl_lmops && waiter->fl_lmops->lm_notify) + waiter->fl_lmops->lm_notify(waiter); + else + wake_up(&waiter->fl_wait); + } +} + +/** + * locks_delete_lock - stop waiting for a file lock + * @waiter: the lock which was waiting + * + * lockd/nfsd need to disconnect the lock while working on it. + */ +int locks_delete_block(struct file_lock *waiter) +{ + int status = -ENOENT; + + /* + * If fl_blocker is NULL, it won't be set again as this thread + * "owns" the lock and is the only one that might try to claim + * the lock. So it is safe to test fl_blocker locklessly. + * Also if fl_blocker is NULL, this waiter is not listed on + * fl_blocked_requests for some lock, so no other request can + * be added to the list of fl_blocked_requests for this + * request. So if fl_blocker is NULL, it is safe to + * locklessly check if fl_blocked_requests is empty. If both + * of these checks succeed, there is no need to take the lock. + */ + if (waiter->fl_blocker == NULL && + list_empty(&waiter->fl_blocked_requests)) + return status; spin_lock(&blocked_lock_lock); + if (waiter->fl_blocker) + status = 0; + __locks_wake_up_blocks(waiter); __locks_delete_block(waiter); spin_unlock(&blocked_lock_lock); + return status; } +EXPORT_SYMBOL(locks_delete_block); /* Insert waiter into blocker's block list. * We use a circular list so that processes can be easily woken up in @@ -683,26 +786,49 @@ static void locks_delete_block(struct file_lock *waiter) * it seems like the reasonable thing to do. * * Must be called with both the flc_lock and blocked_lock_lock held. The - * fl_block list itself is protected by the blocked_lock_lock, but by ensuring - * that the flc_lock is also held on insertions we can avoid taking the - * blocked_lock_lock in some cases when we see that the fl_block list is empty. + * fl_blocked_requests list itself is protected by the blocked_lock_lock, + * but by ensuring that the flc_lock is also held on insertions we can avoid + * taking the blocked_lock_lock in some cases when we see that the + * fl_blocked_requests list is empty. + * + * Rather than just adding to the list, we check for conflicts with any existing + * waiters, and add beneath any waiter that blocks the new waiter. 
+ * Thus wakeups don't happen until needed. */ static void __locks_insert_block(struct file_lock *blocker, - struct file_lock *waiter) + struct file_lock *waiter, + bool conflict(struct file_lock *, + struct file_lock *)) { - BUG_ON(!list_empty(&waiter->fl_block)); - waiter->fl_next = blocker; - list_add_tail(&waiter->fl_block, &blocker->fl_block); + struct file_lock *fl; + BUG_ON(!list_empty(&waiter->fl_blocked_member)); + +new_blocker: + list_for_each_entry(fl, &blocker->fl_blocked_requests, fl_blocked_member) + if (conflict(fl, waiter)) { + blocker = fl; + goto new_blocker; + } + waiter->fl_blocker = blocker; + list_add_tail(&waiter->fl_blocked_member, &blocker->fl_blocked_requests); if (IS_POSIX(blocker) && !IS_OFDLCK(blocker)) locks_insert_global_blocked(waiter); + + /* The requests in waiter->fl_blocked are known to conflict with + * waiter, but might not conflict with blocker, or the requests + * and lock which block it. So they all need to be woken. + */ + __locks_wake_up_blocks(waiter); } /* Must be called with flc_lock held. */ static void locks_insert_block(struct file_lock *blocker, - struct file_lock *waiter) + struct file_lock *waiter, + bool conflict(struct file_lock *, + struct file_lock *)) { spin_lock(&blocked_lock_lock); - __locks_insert_block(blocker, waiter); + __locks_insert_block(blocker, waiter, conflict); spin_unlock(&blocked_lock_lock); } @@ -716,25 +842,15 @@ static void locks_wake_up_blocks(struct file_lock *blocker) /* * Avoid taking global lock if list is empty. This is safe since new * blocked requests are only added to the list under the flc_lock, and - * the flc_lock is always held here. Note that removal from the fl_block - * list does not require the flc_lock, so we must recheck list_empty() - * after acquiring the blocked_lock_lock. + * the flc_lock is always held here. Note that removal from the + * fl_blocked_requests list does not require the flc_lock, so we must + * recheck list_empty() after acquiring the blocked_lock_lock. */ - if (list_empty(&blocker->fl_block)) + if (list_empty(&blocker->fl_blocked_requests)) return; spin_lock(&blocked_lock_lock); - while (!list_empty(&blocker->fl_block)) { - struct file_lock *waiter; - - waiter = list_first_entry(&blocker->fl_block, - struct file_lock, fl_block); - __locks_delete_block(waiter); - if (waiter->fl_lmops && waiter->fl_lmops->lm_notify) - waiter->fl_lmops->lm_notify(waiter); - else - wake_up(&waiter->fl_wait); - } + __locks_wake_up_blocks(blocker); spin_unlock(&blocked_lock_lock); } @@ -766,47 +882,50 @@ locks_delete_lock_ctx(struct file_lock *fl, struct list_head *dispose) /* Determine if lock sys_fl blocks lock caller_fl. Common functionality * checks for shared/exclusive status of overlapping locks. */ -static int locks_conflict(struct file_lock *caller_fl, struct file_lock *sys_fl) +static bool locks_conflict(struct file_lock *caller_fl, + struct file_lock *sys_fl) { if (sys_fl->fl_type == F_WRLCK) - return 1; + return true; if (caller_fl->fl_type == F_WRLCK) - return 1; - return 0; + return true; + return false; } /* Determine if lock sys_fl blocks lock caller_fl. POSIX specific * checking before calling the locks_conflict(). */ -static int posix_locks_conflict(struct file_lock *caller_fl, struct file_lock *sys_fl) +static bool posix_locks_conflict(struct file_lock *caller_fl, + struct file_lock *sys_fl) { /* POSIX locks owned by the same process do not conflict with * each other. 
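The conflict test is now supplied by the caller, so each lock type decides where a new waiter hangs in the tree. A sketch of the three call sites, as they appear later in this patch:

/* flock_lock_inode() */
locks_insert_block(fl, request, flock_locks_conflict);

/* posix_lock_inode(), already under blocked_lock_lock for deadlock detection */
__locks_insert_block(fl, request, posix_locks_conflict);

/* __break_lease() */
locks_insert_block(fl, new_fl, leases_conflict);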
*/ if (posix_same_owner(caller_fl, sys_fl)) - return (0); + return false; /* Check whether they overlap */ if (!locks_overlap(caller_fl, sys_fl)) - return 0; + return false; - return (locks_conflict(caller_fl, sys_fl)); + return locks_conflict(caller_fl, sys_fl); } /* Determine if lock sys_fl blocks lock caller_fl. FLOCK specific * checking before calling the locks_conflict(). */ -static int flock_locks_conflict(struct file_lock *caller_fl, struct file_lock *sys_fl) +static bool flock_locks_conflict(struct file_lock *caller_fl, + struct file_lock *sys_fl) { /* FLOCK locks referring to the same filp do not conflict with * each other. */ if (caller_fl->fl_file == sys_fl->fl_file) - return (0); + return false; if ((caller_fl->fl_type & LOCK_MAND) || (sys_fl->fl_type & LOCK_MAND)) - return 0; + return false; - return (locks_conflict(caller_fl, sys_fl)); + return locks_conflict(caller_fl, sys_fl); } void @@ -877,8 +996,11 @@ static struct file_lock *what_owner_is_waiting_for(struct file_lock *block_fl) struct file_lock *fl; hash_for_each_possible(blocked_hash, fl, fl_link, posix_owner_key(block_fl)) { - if (posix_same_owner(fl, block_fl)) - return fl->fl_next; + if (posix_same_owner(fl, block_fl)) { + while (fl->fl_blocker) + fl = fl->fl_blocker; + return fl; + } } return NULL; } @@ -965,12 +1087,13 @@ find_conflict: if (!(request->fl_flags & FL_SLEEP)) goto out; error = FILE_LOCK_DEFERRED; - locks_insert_block(fl, request); + locks_insert_block(fl, request, flock_locks_conflict); goto out; } if (request->fl_flags & FL_ACCESS) goto out; locks_copy_lock(new_fl, request); + locks_move_blocks(new_fl, request); locks_insert_lock_ctx(new_fl, &ctx->flc_flock); new_fl = NULL; error = 0; @@ -1039,12 +1162,13 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, spin_lock(&blocked_lock_lock); if (likely(!posix_locks_deadlock(request, fl))) { error = FILE_LOCK_DEFERRED; - __locks_insert_block(fl, request); + __locks_insert_block(fl, request, + posix_locks_conflict); } spin_unlock(&blocked_lock_lock); goto out; - } - } + } + } /* If we're just looking for a conflict, we're done. 
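Because a pending request can now itself act as a blocker, deadlock detection has to walk up to the lock that is actually held. Reassembled from the hunk above (the surrounding, unchanged lines are assumed):

static struct file_lock *what_owner_is_waiting_for(struct file_lock *block_fl)
{
        struct file_lock *fl;

        hash_for_each_possible(blocked_hash, fl, fl_link, posix_owner_key(block_fl)) {
                if (posix_same_owner(fl, block_fl)) {
                        while (fl->fl_blocker)
                                fl = fl->fl_blocker;
                        return fl;
                }
        }
        return NULL;
}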
*/ error = 0; @@ -1164,6 +1288,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, goto out; } locks_copy_lock(new_fl, request); + locks_move_blocks(new_fl, request); locks_insert_lock_ctx(new_fl, &fl->fl_list); fl = new_fl; new_fl = NULL; @@ -1237,13 +1362,11 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) error = posix_lock_inode(inode, fl, NULL); if (error != FILE_LOCK_DEFERRED) break; - error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); - if (!error) - continue; - - locks_delete_block(fl); - break; + error = wait_event_interruptible(fl->fl_wait, !fl->fl_blocker); + if (error) + break; } + locks_delete_block(fl); return error; } @@ -1324,7 +1447,7 @@ int locks_mandatory_area(struct inode *inode, struct file *filp, loff_t start, error = posix_lock_inode(inode, &fl, NULL); if (error != FILE_LOCK_DEFERRED) break; - error = wait_event_interruptible(fl.fl_wait, !fl.fl_next); + error = wait_event_interruptible(fl.fl_wait, !fl.fl_blocker); if (!error) { /* * If we've been sleeping someone might have @@ -1334,13 +1457,12 @@ int locks_mandatory_area(struct inode *inode, struct file *filp, loff_t start, continue; } - locks_delete_block(&fl); break; } + locks_delete_block(&fl); return error; } - EXPORT_SYMBOL(locks_mandatory_area); #endif /* CONFIG_MANDATORY_FILE_LOCKING */ @@ -1511,14 +1633,14 @@ restart: break_time -= jiffies; if (break_time == 0) break_time++; - locks_insert_block(fl, new_fl); + locks_insert_block(fl, new_fl, leases_conflict); trace_break_lease_block(inode, new_fl); spin_unlock(&ctx->flc_lock); percpu_up_read_preempt_enable(&file_rwsem); locks_dispose_list(&dispose); error = wait_event_interruptible_timeout(new_fl->fl_wait, - !new_fl->fl_next, break_time); + !new_fl->fl_blocker, break_time); percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); @@ -1542,7 +1664,6 @@ out: locks_free_lock(new_fl); return error; } - EXPORT_SYMBOL(__break_lease); /** @@ -1573,7 +1694,6 @@ void lease_get_mtime(struct inode *inode, struct timespec64 *time) if (has_lease) *time = current_time(inode); } - EXPORT_SYMBOL(lease_get_mtime); /** @@ -1628,8 +1748,8 @@ int fcntl_getlease(struct file *filp) /** * check_conflicting_open - see if the given dentry points to a file that has - * an existing open that would conflict with the - * desired lease. + * an existing open that would conflict with the + * desired lease. * @dentry: dentry to check * @arg: type of lease that we're trying to acquire * @flags: current lock flags @@ -1646,7 +1766,7 @@ check_conflicting_open(const struct dentry *dentry, const long arg, int flags) if (flags & FL_LAYOUT) return 0; - if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) + if ((arg == F_RDLCK) && inode_is_open_for_write(inode)) return -EAGAIN; if ((arg == F_WRLCK) && ((d_count(dentry) > 1) || @@ -1853,7 +1973,7 @@ EXPORT_SYMBOL(generic_setlease); * @arg: type of lease to obtain * @lease: file_lock to use when adding a lease * @priv: private info for lm_setup when adding a lease (may be - * NULL if lm_setup doesn't require it) + * NULL if lm_setup doesn't require it) * * Call this to establish a lease on the file. The "lease" argument is not * used for F_UNLCK requests and may be NULL. 
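All of the blocking callers now follow the same pattern: wait on fl_blocker becoming NULL and call locks_delete_block() exactly once after the loop, whether or not the wait was interrupted. A consolidated view of posix_lock_inode_wait() after this change (reassembled from the hunk above; the unchanged surrounding lines are assumed):

static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
        int error;

        might_sleep();
        for (;;) {
                error = posix_lock_inode(inode, fl, NULL);
                if (error != FILE_LOCK_DEFERRED)
                        break;
                error = wait_event_interruptible(fl->fl_wait, !fl->fl_blocker);
                if (error)
                        break;
        }
        locks_delete_block(fl);
        return error;
}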
For commands that set or alter @@ -1931,13 +2051,11 @@ static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl) error = flock_lock_inode(inode, fl); if (error != FILE_LOCK_DEFERRED) break; - error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); - if (!error) - continue; - - locks_delete_block(fl); - break; + error = wait_event_interruptible(fl->fl_wait, !fl->fl_blocker); + if (error) + break; } + locks_delete_block(fl); return error; } @@ -2001,7 +2119,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) !(f.file->f_mode & (FMODE_READ|FMODE_WRITE))) goto out_putf; - lock = flock_make_lock(f.file, cmd); + lock = flock_make_lock(f.file, cmd, NULL); if (IS_ERR(lock)) { error = PTR_ERR(lock); goto out_putf; @@ -2143,7 +2261,7 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock) error = vfs_test_lock(filp, fl); if (error) goto out; - + flock->l_type = fl->fl_type; if (fl->fl_type != F_UNLCK) { error = posix_lock_to_flock(flock, fl); @@ -2210,13 +2328,11 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd, error = vfs_lock_file(filp, cmd, fl, NULL); if (error != FILE_LOCK_DEFERRED) break; - error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); - if (!error) - continue; - - locks_delete_block(fl); - break; + error = wait_event_interruptible(fl->fl_wait, !fl->fl_blocker); + if (error) + break; } + locks_delete_block(fl); return error; } @@ -2476,6 +2592,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner) if (!ctx || list_empty(&ctx->flc_posix)) return; + locks_init_lock(&lock); lock.fl_type = F_UNLCK; lock.fl_flags = FL_POSIX | FL_CLOSE; lock.fl_start = 0; @@ -2492,26 +2609,21 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner) lock.fl_ops->fl_release_private(&lock); trace_locks_remove_posix(inode, &lock, error); } - EXPORT_SYMBOL(locks_remove_posix); /* The i_flctx must be valid when calling into here */ static void locks_remove_flock(struct file *filp, struct file_lock_context *flctx) { - struct file_lock fl = { - .fl_owner = filp, - .fl_pid = current->tgid, - .fl_file = filp, - .fl_flags = FL_FLOCK | FL_CLOSE, - .fl_type = F_UNLCK, - .fl_end = OFFSET_MAX, - }; + struct file_lock fl; struct inode *inode = locks_inode(filp); if (list_empty(&flctx->flc_flock)) return; + flock_make_lock(filp, LOCK_UN, &fl); + fl.fl_flags |= FL_CLOSE; + if (filp->f_op->flock) filp->f_op->flock(filp, F_SETLKW, &fl); else @@ -2570,27 +2682,6 @@ void locks_remove_file(struct file *filp) } /** - * posix_unblock_lock - stop waiting for a file lock - * @waiter: the lock which was waiting - * - * lockd needs to block waiting for locks. 
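flock_make_lock() now optionally fills caller-provided storage, which is what lets locks_remove_flock() below build its unlock request on the stack. The two call styles, taken from the hunks in this file:

struct file_lock *lock;
struct file_lock fl;

/* sys_flock(): heap-allocated request, as before */
lock = flock_make_lock(f.file, cmd, NULL);
if (IS_ERR(lock))
        return PTR_ERR(lock);

/* locks_remove_flock(): fill an on-stack request instead */
flock_make_lock(filp, LOCK_UN, &fl);
fl.fl_flags |= FL_CLOSE;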
- */ -int -posix_unblock_lock(struct file_lock *waiter) -{ - int status = 0; - - spin_lock(&blocked_lock_lock); - if (waiter->fl_next) - __locks_delete_block(waiter); - else - status = -ENOENT; - spin_unlock(&blocked_lock_lock); - return status; -} -EXPORT_SYMBOL(posix_unblock_lock); - -/** * vfs_cancel_lock - file byte range unblock lock * @filp: The file to apply the unblock to * @fl: The lock to be unblocked @@ -2603,7 +2694,6 @@ int vfs_cancel_lock(struct file *filp, struct file_lock *fl) return filp->f_op->lock(filp, F_CANCELLK, fl); return 0; } - EXPORT_SYMBOL_GPL(vfs_cancel_lock); #ifdef CONFIG_PROC_FS @@ -2707,7 +2797,7 @@ static int locks_show(struct seq_file *f, void *v) lock_get_status(f, fl, iter->li_pos, ""); - list_for_each_entry(bfl, &fl->fl_block, fl_block) + list_for_each_entry(bfl, &fl->fl_blocked_requests, fl_blocked_member) lock_get_status(f, bfl, iter->li_pos, " ->"); return 0; @@ -2803,7 +2893,6 @@ static int __init filelock_init(void) filelock_cache = kmem_cache_create("file_lock_cache", sizeof(struct file_lock), 0, SLAB_PANIC, NULL); - for_each_possible_cpu(i) { struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i); @@ -2813,5 +2902,4 @@ static int __init filelock_init(void) return 0; } - core_initcall(filelock_init); diff --git a/fs/namei.c b/fs/namei.c index 0cab6494978c..914178cdbe94 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3701,8 +3701,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) if (error) return error; - if ((S_ISCHR(mode) || S_ISBLK(mode)) && - !ns_capable(dentry->d_sb->s_user_ns, CAP_MKNOD)) + if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) return -EPERM; if (!dir->i_op->mknod) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 509dc5adeb8f..0b602a39dd71 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -56,7 +56,7 @@ static int nfs4_callback_up_net(struct svc_serv *serv, struct net *net) nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); if (ret > 0) { nn->nfs_callback_tcpport6 = ret; - dprintk("NFS: Callback listener port = %u (af %u, net %x\n", + dprintk("NFS: Callback listener port = %u (af %u, net %x)\n", nn->nfs_callback_tcpport6, PF_INET6, net->ns.inum); } else if (ret != -EAFNOSUPPORT) goto out_err; @@ -206,11 +206,13 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, goto err_bind; } - ret = -EPROTONOSUPPORT; + ret = 0; if (!IS_ENABLED(CONFIG_NFS_V4_1) || minorversion == 0) ret = nfs4_callback_up_net(serv, net); - else if (xprt->ops->bc_up) - ret = xprt->ops->bc_up(serv, net); + else if (xprt->ops->bc_setup) + set_bc_enabled(serv); + else + ret = -EPROTONOSUPPORT; if (ret < 0) { printk(KERN_ERR "NFS: callback service start failed\n"); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 72961b5f6993..557a5d636183 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6306,7 +6306,8 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, /* Ensure we don't close file until we're done freeing locks! 
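struct file_lock now carries list heads that must stay self-referential, so the NFS hunks below stop memcpy()ing lock requests and go through the locks API instead. The pattern in isolation (p->fl is the file_lock embedded in the NFSv4 unlock/lock data):

/* was: memcpy(&p->fl, fl, sizeof(p->fl)); */
locks_init_lock(&p->fl);
locks_copy_lock(&p->fl, fl);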
*/ p->ctx = get_nfs_open_context(ctx); p->l_ctx = nfs_get_lock_context(ctx); - memcpy(&p->fl, fl, sizeof(p->fl)); + locks_init_lock(&p->fl); + locks_copy_lock(&p->fl, fl); p->server = NFS_SERVER(inode); return p; } @@ -6528,7 +6529,8 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, p->server = server; refcount_inc(&lsp->ls_count); p->ctx = get_nfs_open_context(ctx); - memcpy(&p->fl, fl, sizeof(p->fl)); + locks_init_lock(&p->fl); + locks_copy_lock(&p->fl, fl); return p; out_free_seqid: nfs_free_seqid(p->arg.open_seqid); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 76f33df51fbb..5a0bbf917a32 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -2139,7 +2139,7 @@ int __init nfs_init_writepagecache(void) * This allows larger machines to have larger/more transfers. * Limit the default to 256M */ - nfs_congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); + nfs_congestion_kb = (16*int_sqrt(totalram_pages())) << (PAGE_SHIFT-10); if (nfs_congestion_kb > 256*1024) nfs_congestion_kb = 256*1024; diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 2b36aa037ce0..44517fb5c0de 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -656,7 +656,6 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) struct nfsd_net *nn; ktime_t now, cutoff; const struct nfsd4_layout_ops *ops; - LIST_HEAD(reaplist); switch (task->tk_status) { diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index d505990dac7c..0cfd257ffdaf 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -863,8 +863,7 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_rename *rename = &u->rename; __be32 status; - if (opens_in_grace(SVC_NET(rqstp)) && - !(cstate->save_fh.fh_export->ex_flags & NFSEXP_NOSUBTREECHECK)) + if (opens_in_grace(SVC_NET(rqstp))) return nfserr_grace; status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname, rename->rn_snamelen, &cstate->current_fh, @@ -1016,8 +1015,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nvecs = svc_fill_write_vector(rqstp, write->wr_pagelist, &write->wr_head, write->wr_buflen); - if (!nvecs) - return nfserr_io; WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec)); status = nfsd_vfs_write(rqstp, &cstate->current_fh, filp, @@ -1348,7 +1345,7 @@ static __be32 nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_fallocate *fallocate, int flags) { - __be32 status = nfserr_notsupp; + __be32 status; struct file *file; status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, @@ -2682,25 +2679,25 @@ static const struct nfsd4_operation nfsd4_ops[] = { /* NFSv4.2 operations */ [OP_ALLOCATE] = { .op_func = nfsd4_allocate, - .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_ALLOCATE", .op_rsize_bop = nfsd4_only_status_rsize, }, [OP_DEALLOCATE] = { .op_func = nfsd4_deallocate, - .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_DEALLOCATE", .op_rsize_bop = nfsd4_only_status_rsize, }, [OP_CLONE] = { .op_func = nfsd4_clone, - .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_CLONE", .op_rsize_bop = nfsd4_only_status_rsize, }, [OP_COPY] = { .op_func = nfsd4_copy, - .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_COPY", .op_rsize_bop = nfsd4_copy_rsize, }, diff --git a/fs/nfsd/nfs4recover.c 
b/fs/nfsd/nfs4recover.c index 9c247fa1e959..5188f9f70c78 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -662,7 +662,7 @@ struct cld_net { struct cld_upcall { struct list_head cu_list; struct cld_net *cu_net; - struct task_struct *cu_task; + struct completion cu_done; struct cld_msg cu_msg; }; @@ -671,23 +671,18 @@ __cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg) { int ret; struct rpc_pipe_msg msg; + struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, cu_msg); memset(&msg, 0, sizeof(msg)); msg.data = cmsg; msg.len = sizeof(*cmsg); - /* - * Set task state before we queue the upcall. That prevents - * wake_up_process in the downcall from racing with schedule. - */ - set_current_state(TASK_UNINTERRUPTIBLE); ret = rpc_queue_upcall(pipe, &msg); if (ret < 0) { - set_current_state(TASK_RUNNING); goto out; } - schedule(); + wait_for_completion(&cup->cu_done); if (msg.errno < 0) ret = msg.errno; @@ -754,7 +749,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) if (copy_from_user(&cup->cu_msg, src, mlen) != 0) return -EFAULT; - wake_up_process(cup->cu_task); + complete(&cup->cu_done); return mlen; } @@ -769,7 +764,7 @@ cld_pipe_destroy_msg(struct rpc_pipe_msg *msg) if (msg->errno >= 0) return; - wake_up_process(cup->cu_task); + complete(&cup->cu_done); } static const struct rpc_pipe_ops cld_upcall_ops = { @@ -900,7 +895,7 @@ restart_search: goto restart_search; } } - new->cu_task = current; + init_completion(&new->cu_done); new->cu_msg.cm_vers = CLD_UPCALL_VERSION; put_unaligned(cn->cn_xid++, &new->cu_msg.cm_xid); new->cu_net = cn; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index f093fbe47133..fb3c9844c82a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -238,7 +238,7 @@ find_blocked_lock(struct nfs4_lockowner *lo, struct knfsd_fh *fh, } spin_unlock(&nn->blocked_locks_lock); if (found) - posix_unblock_lock(&found->nbl_lock); + locks_delete_block(&found->nbl_lock); return found; } @@ -293,7 +293,7 @@ remove_blocked_locks(struct nfs4_lockowner *lo) nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock, nbl_lru); list_del_init(&nbl->nbl_lru); - posix_unblock_lock(&nbl->nbl_lock); + locks_delete_block(&nbl->nbl_lock); free_blocked_lock(nbl); } } @@ -4863,7 +4863,7 @@ nfs4_laundromat(struct nfsd_net *nn) nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock, nbl_lru); list_del_init(&nbl->nbl_lru); - posix_unblock_lock(&nbl->nbl_lock); + locks_delete_block(&nbl->nbl_lock); free_blocked_lock(nbl); } out: @@ -5112,7 +5112,7 @@ nfs4_find_file(struct nfs4_stid *s, int flags) } static __be32 -nfs4_check_olstateid(struct svc_fh *fhp, struct nfs4_ol_stateid *ols, int flags) +nfs4_check_olstateid(struct nfs4_ol_stateid *ols, int flags) { __be32 status; @@ -5195,7 +5195,7 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, break; case NFS4_OPEN_STID: case NFS4_LOCK_STID: - status = nfs4_check_olstateid(fhp, openlockstateid(s), flags); + status = nfs4_check_olstateid(openlockstateid(s), flags); break; default: status = nfserr_bad_stateid; @@ -6230,15 +6230,15 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, case NFS4_READ_LT: case NFS4_READW_LT: file_lock->fl_type = F_RDLCK; - break; + break; case NFS4_WRITE_LT: case NFS4_WRITEW_LT: file_lock->fl_type = F_WRLCK; - break; + break; default: dprintk("NFSD: nfs4_lockt: bad lock type!\n"); status = nfserr_inval; - goto out; + goto out; } lo = find_lockowner_str(cstate->clp, &lockt->lt_owner); diff --git a/fs/nfsd/nfscache.c 
b/fs/nfsd/nfscache.c index e2fe0e9ce0df..da52b594362a 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -99,7 +99,7 @@ static unsigned int nfsd_cache_size_limit(void) { unsigned int limit; - unsigned long low_pages = totalram_pages - totalhigh_pages; + unsigned long low_pages = totalram_pages() - totalhigh_pages(); limit = (16 * int_sqrt(low_pages)) << (PAGE_SHIFT-10); return min_t(unsigned int, limit, 256*1024); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 6384c9b94898..b33f9785b756 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1126,6 +1126,8 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size) case 'Y': case 'y': case '1': + if (nn->nfsd_serv) + return -EBUSY; nfsd4_end_grace(nn); break; default: diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index eb67098117b4..9824e32b2f23 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -396,10 +396,23 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, bool get_write_count; bool size_change = (iap->ia_valid & ATTR_SIZE); - if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) + if (iap->ia_valid & ATTR_SIZE) { accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE; - if (iap->ia_valid & ATTR_SIZE) ftype = S_IFREG; + } + + /* + * If utimes(2) and friends are called with times not NULL, we should + * not set NFSD_MAY_WRITE bit. Otherwise fh_verify->nfsd_permission + * will return EACCESS, when the caller's effective UID does not match + * the owner of the file, and the caller is not privileged. In this + * situation, we should return EPERM(notify_change will return this). + */ + if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME)) { + accmode |= NFSD_MAY_OWNER_OVERRIDE; + if (!(iap->ia_valid & (ATTR_ATIME_SET | ATTR_MTIME_SET))) + accmode |= NFSD_MAY_WRITE; + } /* Callers that do fh_verify should do the fh_want_write: */ get_write_count = !fhp->fh_dentry; diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index e08a6647267b..3723f3d18d20 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -89,7 +89,13 @@ static int fanotify_get_response(struct fsnotify_group *group, return ret; } -static bool fanotify_should_send_event(struct fsnotify_iter_info *iter_info, +/* + * This function returns a mask for an event that only contains the flags + * that have been specifically requested by the user. Flags that may have + * been included within the event mask, but have not been explicitly + * requested by the user, will not be present in the returned mask. 
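The vfs.c change above is easiest to see from user space: a caller with write access to an exported file, but who neither owns it nor is privileged, now gets the same results as on a local filesystem. A small sketch of the expected outcomes ("file" is a placeholder path on the export):

#include <fcntl.h>
#include <sys/stat.h>
#include <time.h>

int main(void)
{
        /* explicit timestamps, i.e. not UTIME_NOW */
        struct timespec ts[2] = { { .tv_sec = 1 }, { .tv_sec = 1 } };

        utimensat(AT_FDCWD, "file", NULL, 0); /* succeeds: write access suffices     */
        utimensat(AT_FDCWD, "file", ts, 0);   /* fails with EPERM rather than EACCES */
        return 0;
}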
+ */ +static u32 fanotify_group_event_mask(struct fsnotify_iter_info *iter_info, u32 event_mask, const void *data, int data_type) { @@ -101,14 +107,14 @@ static bool fanotify_should_send_event(struct fsnotify_iter_info *iter_info, pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n", __func__, iter_info->report_mask, event_mask, data, data_type); - /* if we don't have enough info to send an event to userspace say no */ + /* If we don't have enough info to send an event to userspace say no */ if (data_type != FSNOTIFY_EVENT_PATH) - return false; + return 0; - /* sorry, fanotify only gives a damn about files and dirs */ + /* Sorry, fanotify only gives a damn about files and dirs */ if (!d_is_reg(path->dentry) && !d_can_lookup(path->dentry)) - return false; + return 0; fsnotify_foreach_obj_type(type) { if (!fsnotify_iter_should_report_type(iter_info, type)) @@ -129,13 +135,10 @@ static bool fanotify_should_send_event(struct fsnotify_iter_info *iter_info, if (d_is_dir(path->dentry) && !(marks_mask & FS_ISDIR & ~marks_ignored_mask)) - return false; - - if (event_mask & FANOTIFY_OUTGOING_EVENTS & - marks_mask & ~marks_ignored_mask) - return true; + return 0; - return false; + return event_mask & FANOTIFY_OUTGOING_EVENTS & marks_mask & + ~marks_ignored_mask; } struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group, @@ -207,10 +210,13 @@ static int fanotify_handle_event(struct fsnotify_group *group, BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); + BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC); + BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM); - BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 10); + BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 12); - if (!fanotify_should_send_event(iter_info, mask, data, data_type)) + mask = fanotify_group_event_mask(iter_info, mask, data, data_type); + if (!mask) return 0; pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index e03be5071362..9c870b0d2b56 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -206,7 +206,7 @@ static int process_access_response(struct fsnotify_group *group, static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fsnotify_event *event, - char __user *buf) + char __user *buf, size_t count) { struct fanotify_event_metadata fanotify_event_metadata; struct file *f; @@ -220,6 +220,12 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, fd = fanotify_event_metadata.fd; ret = -EFAULT; + /* + * Sanity check copy size in case get_one_event() and + * fill_event_metadata() event_len sizes ever get out of sync. + */ + if (WARN_ON_ONCE(fanotify_event_metadata.event_len > count)) + goto out_close_fd; if (copy_to_user(buf, &fanotify_event_metadata, fanotify_event_metadata.event_len)) goto out_close_fd; @@ -295,7 +301,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, continue; } - ret = copy_event_to_user(group, kevent, buf); + ret = copy_event_to_user(group, kevent, buf, count); if (unlikely(ret == -EOPENSTALE)) { /* * We cannot report events with stale fd so drop it. 
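Both check_conflicting_open() earlier in this patch and the fanotify hunk just below replace an open-coded i_writecount test with inode_is_open_for_write(). The helper is not part of this diff; it is assumed to be the one in include/linux/fs.h, essentially:

static inline bool inode_is_open_for_write(const struct inode *inode)
{
        return atomic_read(&inode->i_writecount) > 0;
}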
@@ -669,7 +675,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, */ if ((flags & FAN_MARK_IGNORED_MASK) && !(flags & FAN_MARK_IGNORED_SURV_MODIFY) && - (atomic_read(&inode->i_writecount) > 0)) + inode_is_open_for_write(inode)) return 0; return fanotify_add_mark(group, &inode->i_fsnotify_marks, diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 348a184bcdda..1e2bfd26b352 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -15,6 +15,7 @@ #include <linux/exportfs.h> #include "inotify/inotify.h" +#include "fdinfo.h" #include "fsnotify.h" #if defined(CONFIG_PROC_FS) diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index d2c34900ae05..ecf09b6243d9 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -401,7 +401,7 @@ static __init int fsnotify_init(void) { int ret; - BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 23); + BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 25); ret = init_srcu_struct(&fsnotify_mark_srcu); if (ret) diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h index ab172e5f51d9..5becc8acc8f4 100644 --- a/fs/ntfs/malloc.h +++ b/fs/ntfs/malloc.h @@ -47,7 +47,7 @@ static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask) return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM); /* return (void *)__get_free_page(gfp_mask); */ } - if (likely((size >> PAGE_SHIFT) < totalram_pages)) + if (likely((size >> PAGE_SHIFT) < totalram_pages())) return __vmalloc(size, gfp_mask, PAGE_KERNEL); return NULL; } diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index 99ee093182cb..cc9b32b9db7c 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -ccflags-y := -Ifs/ocfs2 +ccflags-y := -I$(src) obj-$(CONFIG_OCFS2_FS) += \ ocfs2.o \ diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 4ebbd57cbf84..f9b84f7a3e4b 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -161,7 +161,6 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, #endif } - clear_buffer_uptodate(bh); get_bh(bh); /* for end_buffer_read_sync() */ bh->b_end_io = end_buffer_read_sync; submit_bh(REQ_OP_READ, 0, bh); @@ -341,7 +340,6 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, continue; } - clear_buffer_uptodate(bh); get_bh(bh); /* for end_buffer_read_sync() */ if (validate) set_buffer_needs_validate(bh); diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 9b2ed62dd638..f3c20b279eb2 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -582,9 +582,10 @@ bail: } static int o2hb_read_slots(struct o2hb_region *reg, + unsigned int begin_slot, unsigned int max_slots) { - unsigned int current_slot=0; + unsigned int current_slot = begin_slot; int status; struct o2hb_bio_wait_ctxt wc; struct bio *bio; @@ -1093,9 +1094,14 @@ static int o2hb_highest_node(unsigned long *nodes, int numbits) return find_last_bit(nodes, numbits); } +static int o2hb_lowest_node(unsigned long *nodes, int numbits) +{ + return find_first_bit(nodes, numbits); +} + static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) { - int i, ret, highest_node; + int i, ret, highest_node, lowest_node; int membership_change = 0, own_slot_ok = 0; unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; @@ -1120,7 +1126,8 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) } highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); - if (highest_node >= 
O2NM_MAX_NODES) { + lowest_node = o2hb_lowest_node(configured_nodes, O2NM_MAX_NODES); + if (highest_node >= O2NM_MAX_NODES || lowest_node >= O2NM_MAX_NODES) { mlog(ML_NOTICE, "o2hb: No configured nodes found!\n"); ret = -EINVAL; goto bail; @@ -1130,7 +1137,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) * yet. Of course, if the node definitions have holes in them * then we're reading an empty slot anyway... Consider this * best-effort. */ - ret = o2hb_read_slots(reg, highest_node + 1); + ret = o2hb_read_slots(reg, lowest_node, highest_node + 1); if (ret < 0) { mlog_errno(ret); goto bail; @@ -1801,7 +1808,7 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg) struct o2hb_disk_slot *slot; struct o2hb_disk_heartbeat_block *hb_block; - ret = o2hb_read_slots(reg, reg->hr_blocks); + ret = o2hb_read_slots(reg, 0, reg->hr_blocks); if (ret) goto out; diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile index bd1aab1f49a4..ef2854422a6e 100644 --- a/fs/ocfs2/dlm/Makefile +++ b/fs/ocfs2/dlm/Makefile @@ -1,4 +1,4 @@ -ccflags-y := -Ifs/ocfs2 +ccflags-y := -I$(src)/.. obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile index eed3db8c5b49..33431a0296a3 100644 --- a/fs/ocfs2/dlmfs/Makefile +++ b/fs/ocfs2/dlmfs/Makefile @@ -1,4 +1,4 @@ -ccflags-y := -Ifs/ocfs2 +ccflags-y := -I$(src)/.. obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 602c71f32740..b8fa1487cd85 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -179,7 +179,7 @@ bail: static int dlmfs_file_release(struct inode *inode, struct file *file) { - int level, status; + int level; struct dlmfs_inode_private *ip = DLMFS_I(inode); struct dlmfs_filp_private *fp = file->private_data; @@ -188,7 +188,6 @@ static int dlmfs_file_release(struct inode *inode, mlog(0, "close called on inode %lu\n", inode->i_ino); - status = 0; if (fp) { level = fp->fp_lock_level; if (level != DLM_LOCK_IV) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index b63c97f4318e..46fd3ef2cf21 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1017,7 +1017,8 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) mlog_errno(status); } - if (status == 0) { + /* Shutdown the kernel journal system */ + if (!jbd2_journal_destroy(journal->j_journal) && !status) { /* * Do not toggle if flush was unsuccessful otherwise * will leave dirty metadata in a "clean" journal @@ -1026,9 +1027,6 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) if (status < 0) mlog_errno(status); } - - /* Shutdown the kernel journal system */ - jbd2_journal_destroy(journal->j_journal); journal->j_journal = NULL; OCFS2_I(inode)->ip_open_count--; diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 7642b6712c39..58973e4d2471 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -345,13 +345,18 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) if (num_used || alloc->id1.bitmap1.i_used || alloc->id1.bitmap1.i_total - || la->la_bm_off) - mlog(ML_ERROR, "Local alloc hasn't been recovered!\n" + || la->la_bm_off) { + mlog(ML_ERROR, "inconsistent detected, clean journal with" + " unrecovered local alloc, please run fsck.ocfs2!\n" "found = %u, set = %u, taken = %u, off = %u\n", num_used, le32_to_cpu(alloc->id1.bitmap1.i_used), le32_to_cpu(alloc->id1.bitmap1.i_total), OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); + status = -EINVAL; + goto bail; + } + osb->local_alloc_bh = alloc_bh; osb->local_alloc_state = OCFS2_LA_ENABLED; @@ 
-835,7 +840,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, u32 *numbits, struct ocfs2_alloc_reservation *resv) { - int numfound = 0, bitoff, left, startoff, lastzero; + int numfound = 0, bitoff, left, startoff; int local_resv = 0; struct ocfs2_alloc_reservation r; void *bitmap = NULL; @@ -873,7 +878,6 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap; numfound = bitoff = startoff = 0; - lastzero = -1; left = le32_to_cpu(alloc->id1.bitmap1.i_total); while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) { if (bitoff == left) { diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c index d56f0079b858..b11acd34001a 100644 --- a/fs/ocfs2/locks.c +++ b/fs/ocfs2/locks.c @@ -52,6 +52,7 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode, if (lockres->l_flags & OCFS2_LOCK_ATTACHED && lockres->l_level > LKM_NLMODE) { int old_level = 0; + struct file_lock request; if (lockres->l_level == LKM_EXMODE) old_level = 1; @@ -66,11 +67,10 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode, * level. */ - locks_lock_file_wait(file, - &(struct file_lock) { - .fl_type = F_UNLCK, - .fl_flags = FL_FLOCK - }); + locks_init_lock(&request); + request.fl_type = F_UNLCK; + request.fl_flags = FL_FLOCK; + locks_lock_file_wait(file, &request); ocfs2_file_unlock(file); } diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 607092f367ad..1b2d0d2fe2ee 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -199,10 +199,11 @@ static struct dentry *openpromfs_lookup(struct inode *dir, struct dentry *dentry child = dp->child; while (child) { - int n = strlen(child->path_component_name); + const char *node_name = kbasename(child->full_name); + int n = strlen(node_name); if (len == n && - !strncmp(child->path_component_name, name, len)) { + !strncmp(node_name, name, len)) { ent_type = op_inode_node; ent_data.node = child; ino = child->unique_id; @@ -245,7 +246,7 @@ found: set_nlink(inode, 2); break; case op_inode_prop: - if (!strcmp(dp->name, "options") && (len == 17) && + if (of_node_name_eq(dp, "options") && (len == 17) && !strncmp (name, "security-password", 17)) inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR; else @@ -293,8 +294,8 @@ static int openpromfs_readdir(struct file *file, struct dir_context *ctx) } while (child) { if (!dir_emit(ctx, - child->path_component_name, - strlen(child->path_component_name), + kbasename(child->full_name), + strlen(kbasename(child->full_name)), child->unique_id, DT_DIR)) goto out; diff --git a/fs/proc/array.c b/fs/proc/array.c index 0ceb3b6b37e7..9d428d5a0ac8 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -392,6 +392,15 @@ static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm) seq_putc(m, '\n'); } +static inline void task_thp_status(struct seq_file *m, struct mm_struct *mm) +{ + bool thp_enabled = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE); + + if (thp_enabled) + thp_enabled = !test_bit(MMF_DISABLE_THP, &mm->flags); + seq_printf(m, "THP_enabled:\t%d\n", thp_enabled); +} + int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -406,6 +415,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, if (mm) { task_mem(m, mm); task_core_dumping(m, mm); + task_thp_status(m, mm); mmput(mm); } task_sig(m, task); diff --git a/fs/proc/base.c b/fs/proc/base.c index ce3465479447..d7fd1ca807d2 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ 
-530,7 +530,7 @@ static const struct file_operations proc_lstats_operations = { static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - unsigned long totalpages = totalram_pages + total_swap_pages; + unsigned long totalpages = totalram_pages() + total_swap_pages; unsigned long points = 0; points = oom_badness(task, NULL, NULL, totalpages) * diff --git a/fs/proc/page.c b/fs/proc/page.c index 6c517b11acf8..40b05e0d4274 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -46,7 +46,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, ppage = pfn_to_page(pfn); else ppage = NULL; - if (!ppage || PageSlab(ppage)) + if (!ppage || PageSlab(ppage) || page_has_type(ppage)) pcount = 0; else pcount = page_mapcount(ppage); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 89921a0d2ebb..4d598a399bbf 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -464,7 +464,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, inode = new_inode(sb); if (!inode) - goto out; + return ERR_PTR(-ENOMEM); inode->i_ino = get_next_ino(); @@ -474,8 +474,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, if (unlikely(head->unregistering)) { spin_unlock(&sysctl_lock); iput(inode); - inode = NULL; - goto out; + return ERR_PTR(-ENOENT); } ei->sysctl = head; ei->sysctl_entry = table; @@ -500,7 +499,6 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, if (root->set_ownership) root->set_ownership(head, table, &inode->i_uid, &inode->i_gid); -out: return inode; } @@ -549,10 +547,11 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, goto out; } - err = ERR_PTR(-ENOMEM); inode = proc_sys_make_inode(dir->i_sb, h ? 
h : head, p); - if (!inode) + if (IS_ERR(inode)) { + err = ERR_CAST(inode); goto out; + } d_set_d_op(dentry, &proc_sys_dentry_operations); err = d_splice_alias(inode, dentry); @@ -685,7 +684,7 @@ static bool proc_sys_fill_cache(struct file *file, if (d_in_lookup(child)) { struct dentry *res; inode = proc_sys_make_inode(dir->d_sb, head, table); - if (!inode) { + if (IS_ERR(inode)) { d_lookup_done(child); dput(child); return false; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 47c3764c469b..f0ec9edab2f3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -790,6 +790,8 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss); + seq_printf(m, "THPeligible: %d\n", transparent_hugepage_enabled(vma)); + if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); show_smap_vma_flags(m, vma); @@ -1096,6 +1098,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, return -ESRCH; mm = get_task_mm(task); if (mm) { + struct mmu_notifier_range range; struct clear_refs_private cp = { .type = type, }; @@ -1139,11 +1142,13 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, downgrade_write(&mm->mmap_sem); break; } - mmu_notifier_invalidate_range_start(mm, 0, -1); + + mmu_notifier_range_init(&range, mm, 0, -1UL); + mmu_notifier_invalidate_range_start(&range); } walk_page_range(0, mm->highest_vm_end, &clear_refs_walk); if (type == CLEAR_REFS_SOFT_DIRTY) - mmu_notifier_invalidate_range_end(mm, 0, -1); + mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb, 0, -1); up_read(&mm->mmap_sem); out_mm: diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index 06aab07b6bb7..b8a0931568f8 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -148,7 +148,7 @@ void pstore_unregister_ftrace(void) mutex_lock(&pstore_ftrace_lock); if (pstore_ftrace_enabled) { unregister_ftrace_function(&pstore_ftrace_ops); - pstore_ftrace_enabled = 0; + pstore_ftrace_enabled = false; } mutex_unlock(&pstore_ftrace_lock); diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 8cf2218b46a7..c60ee46f3e39 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -335,53 +335,10 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) goto fail_alloc; private->record = record; - switch (record->type) { - case PSTORE_TYPE_DMESG: - scnprintf(name, sizeof(name), "dmesg-%s-%llu%s", - record->psi->name, record->id, - record->compressed ? 
".enc.z" : ""); - break; - case PSTORE_TYPE_CONSOLE: - scnprintf(name, sizeof(name), "console-%s-%llu", - record->psi->name, record->id); - break; - case PSTORE_TYPE_FTRACE: - scnprintf(name, sizeof(name), "ftrace-%s-%llu", - record->psi->name, record->id); - break; - case PSTORE_TYPE_MCE: - scnprintf(name, sizeof(name), "mce-%s-%llu", - record->psi->name, record->id); - break; - case PSTORE_TYPE_PPC_RTAS: - scnprintf(name, sizeof(name), "rtas-%s-%llu", - record->psi->name, record->id); - break; - case PSTORE_TYPE_PPC_OF: - scnprintf(name, sizeof(name), "powerpc-ofw-%s-%llu", - record->psi->name, record->id); - break; - case PSTORE_TYPE_PPC_COMMON: - scnprintf(name, sizeof(name), "powerpc-common-%s-%llu", - record->psi->name, record->id); - break; - case PSTORE_TYPE_PMSG: - scnprintf(name, sizeof(name), "pmsg-%s-%llu", - record->psi->name, record->id); - break; - case PSTORE_TYPE_PPC_OPAL: - scnprintf(name, sizeof(name), "powerpc-opal-%s-%llu", - record->psi->name, record->id); - break; - case PSTORE_TYPE_UNKNOWN: - scnprintf(name, sizeof(name), "unknown-%s-%llu", - record->psi->name, record->id); - break; - default: - scnprintf(name, sizeof(name), "type%d-%s-%llu", - record->type, record->psi->name, record->id); - break; - } + scnprintf(name, sizeof(name), "%s-%s-%llu%s", + pstore_type_to_name(record->type), + record->psi->name, record->id, + record->compressed ? ".enc.z" : ""); dentry = d_alloc_name(root, name); if (!dentry) diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index b821054ca3ed..2d1066ed3c28 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -59,6 +59,19 @@ MODULE_PARM_DESC(update_ms, "milliseconds before pstore updates its content " "enabling this option is not safe, it may lead to further " "corruption on Oopses)"); +/* Names should be in the same order as the enum pstore_type_id */ +static const char * const pstore_type_names[] = { + "dmesg", + "mce", + "console", + "ftrace", + "rtas", + "powerpc-ofw", + "powerpc-common", + "pmsg", + "powerpc-opal", +}; + static int pstore_new_entry; static void pstore_timefunc(struct timer_list *); @@ -104,6 +117,30 @@ void pstore_set_kmsg_bytes(int bytes) /* Tag each group of saved records with a sequence number */ static int oopscount; +const char *pstore_type_to_name(enum pstore_type_id type) +{ + BUILD_BUG_ON(ARRAY_SIZE(pstore_type_names) != PSTORE_TYPE_MAX); + + if (WARN_ON_ONCE(type >= PSTORE_TYPE_MAX)) + return "unknown"; + + return pstore_type_names[type]; +} +EXPORT_SYMBOL_GPL(pstore_type_to_name); + +enum pstore_type_id pstore_name_to_type(const char *name) +{ + int i; + + for (i = 0; i < PSTORE_TYPE_MAX; i++) { + if (!strcmp(pstore_type_names[i], name)) + return i; + } + + return PSTORE_TYPE_MAX; +} +EXPORT_SYMBOL_GPL(pstore_name_to_type); + static const char *get_reason_str(enum kmsg_dump_reason reason) { switch (reason) { @@ -124,26 +161,27 @@ static const char *get_reason_str(enum kmsg_dump_reason reason) } } -bool pstore_cannot_block_path(enum kmsg_dump_reason reason) +/* + * Should pstore_dump() wait for a concurrent pstore_dump()? If + * not, the current pstore_dump() will report a failure to dump + * and return. + */ +static bool pstore_cannot_wait(enum kmsg_dump_reason reason) { - /* - * In case of NMI path, pstore shouldn't be blocked - * regardless of reason. - */ + /* In NMI path, pstore shouldn't block regardless of reason. */ if (in_nmi()) return true; switch (reason) { /* In panic case, other cpus are stopped by smp_send_stop(). 
*/ case KMSG_DUMP_PANIC: - /* Emergency restart shouldn't be blocked by spin lock. */ + /* Emergency restart shouldn't be blocked. */ case KMSG_DUMP_EMERG: return true; default: return false; } } -EXPORT_SYMBOL_GPL(pstore_cannot_block_path); #if IS_ENABLED(CONFIG_PSTORE_DEFLATE_COMPRESS) static int zbufsize_deflate(size_t size) @@ -258,20 +296,6 @@ static int pstore_compress(const void *in, void *out, return outlen; } -static int pstore_decompress(void *in, void *out, - unsigned int inlen, unsigned int outlen) -{ - int ret; - - ret = crypto_comp_decompress(tfm, in, inlen, out, &outlen); - if (ret) { - pr_err("crypto_comp_decompress failed, ret = %d!\n", ret); - return ret; - } - - return outlen; -} - static void allocate_buf_for_compression(void) { struct crypto_comp *ctx; @@ -318,7 +342,7 @@ static void allocate_buf_for_compression(void) big_oops_buf_sz = size; big_oops_buf = buf; - pr_info("Using compression: %s\n", zbackend->name); + pr_info("Using crash dump compression: %s\n", zbackend->name); } static void free_buf_for_compression(void) @@ -368,9 +392,8 @@ void pstore_record_init(struct pstore_record *record, } /* - * callback from kmsg_dump. (s2,l2) has the most recently - * written bytes, older bytes are in (s1,l1). Save as much - * as we can from the end of the buffer. + * callback from kmsg_dump. Save as much as we can (up to kmsg_bytes) from the + * end of the buffer. */ static void pstore_dump(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason) @@ -378,23 +401,23 @@ static void pstore_dump(struct kmsg_dumper *dumper, unsigned long total = 0; const char *why; unsigned int part = 1; - unsigned long flags = 0; - int is_locked; int ret; why = get_reason_str(reason); - if (pstore_cannot_block_path(reason)) { - is_locked = spin_trylock_irqsave(&psinfo->buf_lock, flags); - if (!is_locked) { - pr_err("pstore dump routine blocked in %s path, may corrupt error record\n" - , in_nmi() ? "NMI" : why); + if (down_trylock(&psinfo->buf_lock)) { + /* Failed to acquire lock: give up if we cannot wait. */ + if (pstore_cannot_wait(reason)) { + pr_err("dump skipped in %s path: may corrupt error record\n", + in_nmi() ? 
"NMI" : why); + return; + } + if (down_interruptible(&psinfo->buf_lock)) { + pr_err("could not grab semaphore?!\n"); return; } - } else { - spin_lock_irqsave(&psinfo->buf_lock, flags); - is_locked = 1; } + oopscount++; while (total < kmsg_bytes) { char *dst; @@ -411,7 +434,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, record.part = part; record.buf = psinfo->buf; - if (big_oops_buf && is_locked) { + if (big_oops_buf) { dst = big_oops_buf; dst_size = big_oops_buf_sz; } else { @@ -429,7 +452,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, dst_size, &dump_size)) break; - if (big_oops_buf && is_locked) { + if (big_oops_buf) { zipped_len = pstore_compress(dst, psinfo->buf, header_size + dump_size, psinfo->bufsize); @@ -452,8 +475,8 @@ static void pstore_dump(struct kmsg_dumper *dumper, total += record.size; part++; } - if (is_locked) - spin_unlock_irqrestore(&psinfo->buf_lock, flags); + + up(&psinfo->buf_lock); } static struct kmsg_dumper pstore_dumper = { @@ -476,31 +499,14 @@ static void pstore_unregister_kmsg(void) #ifdef CONFIG_PSTORE_CONSOLE static void pstore_console_write(struct console *con, const char *s, unsigned c) { - const char *e = s + c; - - while (s < e) { - struct pstore_record record; - unsigned long flags; + struct pstore_record record; - pstore_record_init(&record, psinfo); - record.type = PSTORE_TYPE_CONSOLE; - - if (c > psinfo->bufsize) - c = psinfo->bufsize; + pstore_record_init(&record, psinfo); + record.type = PSTORE_TYPE_CONSOLE; - if (oops_in_progress) { - if (!spin_trylock_irqsave(&psinfo->buf_lock, flags)) - break; - } else { - spin_lock_irqsave(&psinfo->buf_lock, flags); - } - record.buf = (char *)s; - record.size = c; - psinfo->write(&record); - spin_unlock_irqrestore(&psinfo->buf_lock, flags); - s += c; - c = e - s; - } + record.buf = (char *)s; + record.size = c; + psinfo->write(&record); } static struct console pstore_console = { @@ -589,6 +595,7 @@ int pstore_register(struct pstore_info *psi) psi->write_user = pstore_write_user_compat; psinfo = psi; mutex_init(&psinfo->read_mutex); + sema_init(&psinfo->buf_lock, 1); spin_unlock(&pstore_lock); if (owner && !try_module_get(owner)) { @@ -656,8 +663,9 @@ EXPORT_SYMBOL_GPL(pstore_unregister); static void decompress_record(struct pstore_record *record) { + int ret; int unzipped_len; - char *decompressed; + char *unzipped, *workspace; if (!record->compressed) return; @@ -668,35 +676,42 @@ static void decompress_record(struct pstore_record *record) return; } - /* No compression method has created the common buffer. */ + /* Missing compression buffer means compression was not initialized. */ if (!big_oops_buf) { - pr_warn("no decompression buffer allocated\n"); + pr_warn("no decompression method initialized!\n"); return; } - unzipped_len = pstore_decompress(record->buf, big_oops_buf, - record->size, big_oops_buf_sz); - if (unzipped_len <= 0) { - pr_err("decompression failed: %d\n", unzipped_len); + /* Allocate enough space to hold max decompression and ECC. */ + unzipped_len = big_oops_buf_sz; + workspace = kmalloc(unzipped_len + record->ecc_notice_size, + GFP_KERNEL); + if (!workspace) return; - } - /* Build new buffer for decompressed contents. */ - decompressed = kmalloc(unzipped_len + record->ecc_notice_size, - GFP_KERNEL); - if (!decompressed) { - pr_err("decompression ran out of memory\n"); + /* After decompression "unzipped_len" is almost certainly smaller. 
*/ + ret = crypto_comp_decompress(tfm, record->buf, record->size, + workspace, &unzipped_len); + if (ret) { + pr_err("crypto_comp_decompress failed, ret = %d!\n", ret); + kfree(workspace); return; } - memcpy(decompressed, big_oops_buf, unzipped_len); /* Append ECC notice to decompressed buffer. */ - memcpy(decompressed + unzipped_len, record->buf + record->size, + memcpy(workspace + unzipped_len, record->buf + record->size, record->ecc_notice_size); - /* Swap out compresed contents with decompressed contents. */ + /* Copy decompressed contents into an minimum-sized allocation. */ + unzipped = kmemdup(workspace, unzipped_len + record->ecc_notice_size, + GFP_KERNEL); + kfree(workspace); + if (!unzipped) + return; + + /* Swap out compressed contents with decompressed contents. */ kfree(record->buf); - record->buf = decompressed; + record->buf = unzipped; record->size = unzipped_len; record->compressed = false; } diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index e02a9039b5ea..96f7d32cd184 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -124,19 +124,17 @@ static int ramoops_pstore_open(struct pstore_info *psi) } static struct persistent_ram_zone * -ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max, - u64 *id, - enum pstore_type_id *typep, enum pstore_type_id type, - bool update) +ramoops_get_next_prz(struct persistent_ram_zone *przs[], int id, + struct pstore_record *record) { struct persistent_ram_zone *prz; - int i = (*c)++; + bool update = (record->type == PSTORE_TYPE_DMESG); /* Give up if we never existed or have hit the end. */ - if (!przs || i >= max) + if (!przs) return NULL; - prz = przs[i]; + prz = przs[id]; if (!prz) return NULL; @@ -147,8 +145,8 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max, if (!persistent_ram_old_size(prz)) return NULL; - *typep = type; - *id = i; + record->type = prz->type; + record->id = id; return prz; } @@ -255,10 +253,8 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record) /* Find the next valid persistent_ram_zone for DMESG */ while (cxt->dump_read_cnt < cxt->max_dump_cnt && !prz) { - prz = ramoops_get_next_prz(cxt->dprzs, &cxt->dump_read_cnt, - cxt->max_dump_cnt, &record->id, - &record->type, - PSTORE_TYPE_DMESG, 1); + prz = ramoops_get_next_prz(cxt->dprzs, cxt->dump_read_cnt++, + record); if (!prz_ok(prz)) continue; header_length = ramoops_read_kmsg_hdr(persistent_ram_old(prz), @@ -272,22 +268,18 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record) } } - if (!prz_ok(prz)) - prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt, - 1, &record->id, &record->type, - PSTORE_TYPE_CONSOLE, 0); + if (!prz_ok(prz) && !cxt->console_read_cnt++) + prz = ramoops_get_next_prz(&cxt->cprz, 0 /* single */, record); - if (!prz_ok(prz)) - prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt, - 1, &record->id, &record->type, - PSTORE_TYPE_PMSG, 0); + if (!prz_ok(prz) && !cxt->pmsg_read_cnt++) + prz = ramoops_get_next_prz(&cxt->mprz, 0 /* single */, record); /* ftrace is last since it may want to dynamically allocate memory. 
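The ramoops read-side helper now takes an explicit zone index and fills the pstore_record itself (the type comes from prz->type), so the long argument lists disappear. A sketch of the new shape, taken from the hunks around this point:

static struct persistent_ram_zone *
ramoops_get_next_prz(struct persistent_ram_zone *przs[], int id,
                     struct pstore_record *record);

/* dmesg: walk the zone array */
prz = ramoops_get_next_prz(cxt->dprzs, cxt->dump_read_cnt++, record);

/* console and pmsg: single zones, read at most once */
if (!prz_ok(prz) && !cxt->console_read_cnt++)
        prz = ramoops_get_next_prz(&cxt->cprz, 0 /* single */, record);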
*/ if (!prz_ok(prz)) { - if (!(cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)) { - prz = ramoops_get_next_prz(cxt->fprzs, - &cxt->ftrace_read_cnt, 1, &record->id, - &record->type, PSTORE_TYPE_FTRACE, 0); + if (!(cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU) && + !cxt->ftrace_read_cnt++) { + prz = ramoops_get_next_prz(cxt->fprzs, 0 /* single */, + record); } else { /* * Build a new dummy record which combines all the @@ -299,15 +291,12 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record) GFP_KERNEL); if (!tmp_prz) return -ENOMEM; + prz = tmp_prz; free_prz = true; while (cxt->ftrace_read_cnt < cxt->max_ftrace_cnt) { prz_next = ramoops_get_next_prz(cxt->fprzs, - &cxt->ftrace_read_cnt, - cxt->max_ftrace_cnt, - &record->id, - &record->type, - PSTORE_TYPE_FTRACE, 0); + cxt->ftrace_read_cnt++, record); if (!prz_ok(prz_next)) continue; @@ -321,7 +310,6 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record) goto out; } record->id = 0; - prz = tmp_prz; } } @@ -611,6 +599,7 @@ static int ramoops_init_przs(const char *name, goto fail; } *paddr += zone_sz; + prz_ar[i]->type = pstore_name_to_type(name); } *przs = prz_ar; @@ -640,7 +629,7 @@ static int ramoops_init_prz(const char *name, label = kasprintf(GFP_KERNEL, "ramoops:%s", name); *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info, - cxt->memtype, 0, label); + cxt->memtype, PRZ_FLAG_ZAP_OLD, label); if (IS_ERR(*prz)) { int err = PTR_ERR(*prz); @@ -649,9 +638,8 @@ static int ramoops_init_prz(const char *name, return err; } - persistent_ram_zap(*prz); - *paddr += sz; + (*prz)->type = pstore_name_to_type(name); return 0; } @@ -787,7 +775,7 @@ static int ramoops_probe(struct platform_device *pdev) dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size - cxt->pmsg_size; - err = ramoops_init_przs("dump", dev, cxt, &cxt->dprzs, &paddr, + err = ramoops_init_przs("dmesg", dev, cxt, &cxt->dprzs, &paddr, dump_mem_sz, cxt->record_size, &cxt->max_dump_cnt, 0, 0); if (err) @@ -827,7 +815,6 @@ static int ramoops_probe(struct platform_device *pdev) err = -ENOMEM; goto fail_clear; } - spin_lock_init(&cxt->pstore.buf_lock); cxt->pstore.flags = PSTORE_FLAGS_DMESG; if (cxt->console_size) @@ -855,9 +842,9 @@ static int ramoops_probe(struct platform_device *pdev) ramoops_pmsg_size = pdata->pmsg_size; ramoops_ftrace_size = pdata->ftrace_size; - pr_info("attached 0x%lx@0x%llx, ecc: %d/%d\n", + pr_info("using 0x%lx@0x%llx, ecc: %d\n", cxt->size, (unsigned long long)cxt->phys_addr, - cxt->ecc_info.ecc_size, cxt->ecc_info.block_size); + cxt->ecc_info.ecc_size); return 0; diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 12e21f789194..c11711c2cc83 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -12,7 +12,7 @@ * */ -#define pr_fmt(fmt) "persistent_ram: " fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/device.h> #include <linux/err.h> @@ -29,6 +29,16 @@ #include <linux/vmalloc.h> #include <asm/page.h> +/** + * struct persistent_ram_buffer - persistent circular RAM buffer + * + * @sig: + * signature to indicate header (PERSISTENT_RAM_SIG xor PRZ-type value) + * @start: + * offset into @data where the beginning of the stored bytes begin + * @size: + * number of valid bytes stored in @data + */ struct persistent_ram_buffer { uint32_t sig; atomic_t start; @@ -443,7 +453,8 @@ static void *persistent_ram_iomap(phys_addr_t start, size_t size, void *va; if (!request_mem_region(start, size, label ?: "ramoops")) { - pr_err("request mem region (0x%llx@0x%llx) failed\n", + pr_err("request mem region (%s 
0x%llx@0x%llx) failed\n", + label ?: "ramoops", (unsigned long long)size, (unsigned long long)start); return NULL; } @@ -489,32 +500,42 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, struct persistent_ram_ecc_info *ecc_info) { int ret; + bool zap = !!(prz->flags & PRZ_FLAG_ZAP_OLD); ret = persistent_ram_init_ecc(prz, ecc_info); - if (ret) + if (ret) { + pr_warn("ECC failed %s\n", prz->label); return ret; + } sig ^= PERSISTENT_RAM_SIG; if (prz->buffer->sig == sig) { + if (buffer_size(prz) == 0) { + pr_debug("found existing empty buffer\n"); + return 0; + } + if (buffer_size(prz) > prz->buffer_size || - buffer_start(prz) > buffer_size(prz)) + buffer_start(prz) > buffer_size(prz)) { pr_info("found existing invalid buffer, size %zu, start %zu\n", buffer_size(prz), buffer_start(prz)); - else { + zap = true; + } else { pr_debug("found existing buffer, size %zu, start %zu\n", buffer_size(prz), buffer_start(prz)); persistent_ram_save_old(prz); - return 0; } } else { pr_debug("no valid data in buffer (sig = 0x%08x)\n", prz->buffer->sig); + prz->buffer->sig = sig; + zap = true; } - /* Rewind missing or invalid memory area. */ - prz->buffer->sig = sig; - persistent_ram_zap(prz); + /* Reset missing, invalid, or single-use memory area. */ + if (zap) + persistent_ram_zap(prz); return 0; } @@ -572,6 +593,12 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, if (ret) goto err; + pr_debug("attached %s 0x%zx@0x%llx: %zu header, %zu data, %zu ecc (%d/%d)\n", + prz->label, prz->size, (unsigned long long)prz->paddr, + sizeof(*prz->buffer), prz->buffer_size, + prz->size - sizeof(*prz->buffer) - prz->buffer_size, + prz->ecc_info.ecc_size, prz->ecc_info.block_size); + return prz; err: persistent_ram_free(prz); diff --git a/fs/quota/quota.c b/fs/quota/quota.c index f0cbf58ad4da..fd5dd806f1b9 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -791,7 +791,8 @@ static int quotactl_cmd_write(int cmd) /* Return true if quotactl command is manipulating quota on/off state */ static bool quotactl_cmd_onoff(int cmd) { - return (cmd == Q_QUOTAON) || (cmd == Q_QUOTAOFF); + return (cmd == Q_QUOTAON) || (cmd == Q_QUOTAOFF) || + (cmd == Q_XQUOTAON) || (cmd == Q_XQUOTAOFF); } /* diff --git a/fs/select.c b/fs/select.c index 22b3bf89f051..4c8652390c94 100644 --- a/fs/select.c +++ b/fs/select.c @@ -287,12 +287,18 @@ int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec) return 0; } +enum poll_time_type { + PT_TIMEVAL = 0, + PT_OLD_TIMEVAL = 1, + PT_TIMESPEC = 2, + PT_OLD_TIMESPEC = 3, +}; + static int poll_select_copy_remaining(struct timespec64 *end_time, void __user *p, - int timeval, int ret) + enum poll_time_type pt_type, int ret) { struct timespec64 rts; - struct timeval rtv; if (!p) return ret; @@ -310,18 +316,40 @@ static int poll_select_copy_remaining(struct timespec64 *end_time, rts.tv_sec = rts.tv_nsec = 0; - if (timeval) { - if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec)) - memset(&rtv, 0, sizeof(rtv)); - rtv.tv_sec = rts.tv_sec; - rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; + switch (pt_type) { + case PT_TIMEVAL: + { + struct timeval rtv; - if (!copy_to_user(p, &rtv, sizeof(rtv))) + if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec)) + memset(&rtv, 0, sizeof(rtv)); + rtv.tv_sec = rts.tv_sec; + rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; + if (!copy_to_user(p, &rtv, sizeof(rtv))) + return ret; + } + break; + case PT_OLD_TIMEVAL: + { + struct old_timeval32 rtv; + + rtv.tv_sec = rts.tv_sec; + rtv.tv_usec = 
rts.tv_nsec / NSEC_PER_USEC; + if (!copy_to_user(p, &rtv, sizeof(rtv))) + return ret; + } + break; + case PT_TIMESPEC: + if (!put_timespec64(&rts, p)) return ret; - - } else if (!put_timespec64(&rts, p)) - return ret; - + break; + case PT_OLD_TIMESPEC: + if (!put_old_timespec32(&rts, p)) + return ret; + break; + default: + BUG(); + } /* * If an application puts its timeval in read-only memory, we * don't want the Linux-specific update to the timeval to @@ -689,7 +717,7 @@ static int kern_select(int n, fd_set __user *inp, fd_set __user *outp, } ret = core_sys_select(n, inp, outp, exp, to); - ret = poll_select_copy_remaining(&end_time, tvp, 1, ret); + ret = poll_select_copy_remaining(&end_time, tvp, PT_TIMEVAL, ret); return ret; } @@ -701,49 +729,41 @@ SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, } static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, - fd_set __user *exp, struct timespec __user *tsp, - const sigset_t __user *sigmask, size_t sigsetsize) + fd_set __user *exp, void __user *tsp, + const sigset_t __user *sigmask, size_t sigsetsize, + enum poll_time_type type) { sigset_t ksigmask, sigsaved; struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { - if (get_timespec64(&ts, tsp)) - return -EFAULT; + switch (type) { + case PT_TIMESPEC: + if (get_timespec64(&ts, tsp)) + return -EFAULT; + break; + case PT_OLD_TIMESPEC: + if (get_old_timespec32(&ts, tsp)) + return -EFAULT; + break; + default: + BUG(); + } to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } - if (sigmask) { - /* XXX: Don't preclude handling different sized sigset_t's. */ - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask))) - return -EFAULT; - - sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); - } + ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + if (ret) + return ret; ret = core_sys_select(n, inp, outp, exp, to); - ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); + ret = poll_select_copy_remaining(&end_time, tsp, type, ret); - if (ret == -ERESTARTNOHAND) { - /* - * Don't restore the signal mask yet. Let do_signal() deliver - * the signal on the way back to userspace, before the signal - * mask is restored. - */ - if (sigmask) { - memcpy(&current->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } - } else if (sigmask) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); + restore_user_sigmask(sigmask, &sigsaved); return ret; } @@ -755,7 +775,7 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, * the sigset size.
*/ SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp, - fd_set __user *, exp, struct timespec __user *, tsp, + fd_set __user *, exp, struct __kernel_timespec __user *, tsp, void __user *, sig) { size_t sigsetsize = 0; @@ -769,9 +789,31 @@ SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp, return -EFAULT; } - return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize); + return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize, PT_TIMESPEC); } +#if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT) + +SYSCALL_DEFINE6(pselect6_time32, int, n, fd_set __user *, inp, fd_set __user *, outp, + fd_set __user *, exp, struct old_timespec32 __user *, tsp, + void __user *, sig) +{ + size_t sigsetsize = 0; + sigset_t __user *up = NULL; + + if (sig) { + if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t)) + || __get_user(up, (sigset_t __user * __user *)sig) + || __get_user(sigsetsize, + (size_t __user *)(sig+sizeof(void *)))) + return -EFAULT; + } + + return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize, PT_OLD_TIMESPEC); +} + +#endif + #ifdef __ARCH_WANT_SYS_OLD_SELECT struct sel_arg_struct { unsigned long n; @@ -1045,7 +1087,7 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, } SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, - struct timespec __user *, tsp, const sigset_t __user *, sigmask, + struct __kernel_timespec __user *, tsp, const sigset_t __user *, sigmask, size_t, sigsetsize) { sigset_t ksigmask, sigsaved; @@ -1061,89 +1103,62 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, return -EINVAL; } - if (sigmask) { - /* XXX: Don't preclude handling different sized sigset_t's. */ - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask))) - return -EFAULT; - - sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); - } + ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + if (ret) + return ret; ret = do_sys_poll(ufds, nfds, to); + restore_user_sigmask(sigmask, &sigsaved); + /* We can restart this syscall, usually */ - if (ret == -EINTR) { - /* - * Don't restore the signal mask yet. Let do_signal() deliver - * the signal on the way back to userspace, before the signal - * mask is restored. 
- */ - if (sigmask) { - memcpy(&current->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } + if (ret == -EINTR) ret = -ERESTARTNOHAND; - } else if (sigmask) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); - ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); + ret = poll_select_copy_remaining(&end_time, tsp, PT_TIMESPEC, ret); return ret; } -#ifdef CONFIG_COMPAT -#define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) +#if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT) -static -int compat_poll_select_copy_remaining(struct timespec64 *end_time, void __user *p, - int timeval, int ret) +SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, + struct old_timespec32 __user *, tsp, const sigset_t __user *, sigmask, + size_t, sigsetsize) { - struct timespec64 ts; + sigset_t ksigmask, sigsaved; + struct timespec64 ts, end_time, *to = NULL; + int ret; - if (!p) - return ret; + if (tsp) { + if (get_old_timespec32(&ts, tsp)) + return -EFAULT; - if (current->personality & STICKY_TIMEOUTS) - goto sticky; + to = &end_time; + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + return -EINVAL; + } - /* No update for zero timeout */ - if (!end_time->tv_sec && !end_time->tv_nsec) + ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + if (ret) return ret; - ktime_get_ts64(&ts); - ts = timespec64_sub(*end_time, ts); - if (ts.tv_sec < 0) - ts.tv_sec = ts.tv_nsec = 0; + ret = do_sys_poll(ufds, nfds, to); - if (timeval) { - struct old_timeval32 rtv; + restore_user_sigmask(sigmask, &sigsaved); - rtv.tv_sec = ts.tv_sec; - rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC; + /* We can restart this syscall, usually */ + if (ret == -EINTR) + ret = -ERESTARTNOHAND; - if (!copy_to_user(p, &rtv, sizeof(rtv))) - return ret; - } else { - if (!put_old_timespec32(&ts, p)) - return ret; - } - /* - * If an application puts its timeval in read-only memory, we - * don't want the Linux-specific update to the timeval to - * cause a fault after the select has completed - * successfully. However, because we're not updating the - * timeval, we can't restart the system call. - */ + ret = poll_select_copy_remaining(&end_time, tsp, PT_OLD_TIMESPEC, ret); -sticky: - if (ret == -ERESTARTNOHAND) - ret = -EINTR; return ret; } +#endif + +#ifdef CONFIG_COMPAT +#define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) /* * Ooo, nasty.
We need here to frob 32-bit unsigned longs to @@ -1275,7 +1290,7 @@ static int do_compat_select(int n, compat_ulong_t __user *inp, } ret = compat_core_sys_select(n, inp, outp, exp, to); - ret = compat_poll_select_copy_remaining(&end_time, tvp, 1, ret); + ret = poll_select_copy_remaining(&end_time, tvp, PT_OLD_TIMEVAL, ret); return ret; } @@ -1307,52 +1322,66 @@ COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg) static long do_compat_pselect(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, - struct old_timespec32 __user *tsp, compat_sigset_t __user *sigmask, - compat_size_t sigsetsize) + void __user *tsp, compat_sigset_t __user *sigmask, + compat_size_t sigsetsize, enum poll_time_type type) { sigset_t ksigmask, sigsaved; struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { - if (get_old_timespec32(&ts, tsp)) - return -EFAULT; + switch (type) { + case PT_OLD_TIMESPEC: + if (get_old_timespec32(&ts, tsp)) + return -EFAULT; + break; + case PT_TIMESPEC: + if (get_timespec64(&ts, tsp)) + return -EFAULT; + break; + default: + BUG(); + } to = &end_time; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } - if (sigmask) { - if (sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; - if (get_compat_sigset(&ksigmask, sigmask)) - return -EFAULT; - - sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); - } + ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + if (ret) + return ret; ret = compat_core_sys_select(n, inp, outp, exp, to); - ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret); + ret = poll_select_copy_remaining(&end_time, tsp, type, ret); - if (ret == -ERESTARTNOHAND) { - /* - * Don't restore the signal mask yet. Let do_signal() deliver - * the signal on the way back to userspace, before the signal - * mask is restored. 
- */ - if (sigmask) { - memcpy(&current->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } - } else if (sigmask) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); + restore_user_sigmask(sigmask, &sigsaved); return ret; } +COMPAT_SYSCALL_DEFINE6(pselect6_time64, int, n, compat_ulong_t __user *, inp, + compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, + struct __kernel_timespec __user *, tsp, void __user *, sig) +{ + compat_size_t sigsetsize = 0; + compat_uptr_t up = 0; + + if (sig) { + if (!access_ok(VERIFY_READ, sig, + sizeof(compat_uptr_t)+sizeof(compat_size_t)) || + __get_user(up, (compat_uptr_t __user *)sig) || + __get_user(sigsetsize, + (compat_size_t __user *)(sig+sizeof(up)))) + return -EFAULT; + } + + return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up), + sigsetsize, PT_TIMESPEC); +} + +#if defined(CONFIG_COMPAT_32BIT_TIME) + COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct old_timespec32 __user *, tsp, void __user *, sig) @@ -1368,10 +1397,14 @@ COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp, (compat_size_t __user *)(sig+sizeof(up)))) return -EFAULT; } + return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up), - sigsetsize); + sigsetsize, PT_OLD_TIMESPEC); } +#endif + +#if defined(CONFIG_COMPAT_32BIT_TIME) COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, struct old_timespec32 __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) @@ -1389,36 +1422,57 @@ COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, return -EINVAL; } - if (sigmask) { - if (sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; - if (get_compat_sigset(&ksigmask, sigmask)) + ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + if (ret) + return ret; + + ret = do_sys_poll(ufds, nfds, to); + + restore_user_sigmask(sigmask, &sigsaved); + + /* We can restart this syscall, usually */ + if (ret == -EINTR) + ret = -ERESTARTNOHAND; + + ret = poll_select_copy_remaining(&end_time, tsp, PT_OLD_TIMESPEC, ret); + + return ret; +} +#endif + +/* New compat syscall for 64 bit time_t*/ +COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds, + unsigned int, nfds, struct __kernel_timespec __user *, tsp, + const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) +{ + sigset_t ksigmask, sigsaved; + struct timespec64 ts, end_time, *to = NULL; + int ret; + + if (tsp) { + if (get_timespec64(&ts, tsp)) return -EFAULT; - sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + to = &end_time; + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + return -EINVAL; } + ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + if (ret) + return ret; + ret = do_sys_poll(ufds, nfds, to); + restore_user_sigmask(sigmask, &sigsaved); + /* We can restart this syscall, usually */ - if (ret == -EINTR) { - /* - * Don't restore the signal mask yet. Let do_signal() deliver - * the signal on the way back to userspace, before the signal - * mask is restored.
- */ - if (sigmask) { - memcpy(&current->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } + if (ret == -EINTR) ret = -ERESTARTNOHAND; - } else if (sigmask) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); - ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret); + ret = poll_select_copy_remaining(&end_time, tsp, PT_TIMESPEC, ret); return ret; } + #endif diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 0a7252aecfa5..bb71db63c99c 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -334,7 +334,7 @@ int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, } EXPORT_SYMBOL_GPL(sysfs_create_file_ns); -int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr) +int sysfs_create_files(struct kobject *kobj, const struct attribute * const *ptr) { int err = 0; int i; @@ -493,7 +493,7 @@ bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr) return ret; } -void sysfs_remove_files(struct kobject *kobj, const struct attribute **ptr) +void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *ptr) { int i; for (i = 0; ptr[i]; i++) diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index 529856fbccd0..bc1e082d921d 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig @@ -12,9 +12,10 @@ config UBIFS_FS help UBIFS is a file system for flash devices which works on top of UBI. +if UBIFS_FS + config UBIFS_FS_ADVANCED_COMPR bool "Advanced compression options" - depends on UBIFS_FS help This option allows to explicitly choose which compressions, if any, are enabled in UBIFS. Removing compressors means inability to read @@ -24,7 +25,6 @@ config UBIFS_FS_ADVANCED_COMPR config UBIFS_FS_LZO bool "LZO compression support" if UBIFS_FS_ADVANCED_COMPR - depends on UBIFS_FS default y help LZO compressor is generally faster than zlib but compresses worse. @@ -32,14 +32,12 @@ config UBIFS_FS_LZO config UBIFS_FS_ZLIB bool "ZLIB compression support" if UBIFS_FS_ADVANCED_COMPR - depends on UBIFS_FS default y help Zlib compresses better than LZO but it is slower. Say 'Y' if unsure. config UBIFS_ATIME_SUPPORT - bool "Access time support" if UBIFS_FS - depends on UBIFS_FS + bool "Access time support" default n help Originally UBIFS did not support atime, because it looked like a bad idea due @@ -54,7 +52,6 @@ config UBIFS_ATIME_SUPPORT config UBIFS_FS_XATTR bool "UBIFS XATTR support" - depends on UBIFS_FS default y help Saying Y here includes support for extended attributes (xattrs). @@ -65,7 +62,7 @@ config UBIFS_FS_XATTR config UBIFS_FS_ENCRYPTION bool "UBIFS Encryption" - depends on UBIFS_FS && UBIFS_FS_XATTR && BLOCK + depends on UBIFS_FS_XATTR && BLOCK select FS_ENCRYPTION default n help @@ -76,7 +73,7 @@ config UBIFS_FS_ENCRYPTION config UBIFS_FS_SECURITY bool "UBIFS Security Labels" - depends on UBIFS_FS && UBIFS_FS_XATTR + depends on UBIFS_FS_XATTR default y help Security labels provide an access control facility to support Linux @@ -89,6 +86,7 @@ config UBIFS_FS_SECURITY config UBIFS_FS_AUTHENTICATION bool "UBIFS authentication support" + depends on KEYS select CRYPTO_HMAC help Enable authentication support for UBIFS. This feature offers protection @@ -96,3 +94,5 @@ config UBIFS_FS_AUTHENTICATION If you say yes here you should also select a hashing algorithm such as sha256, these are not selected automatically since there are many different options.
+ +endif # UBIFS_FS diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c index 124e965a28b3..5bf5fd08879e 100644 --- a/fs/ubifs/auth.c +++ b/fs/ubifs/auth.c @@ -269,8 +269,7 @@ int ubifs_init_authentication(struct ubifs_info *c) goto out; } - c->hash_tfm = crypto_alloc_shash(c->auth_hash_name, 0, - CRYPTO_ALG_ASYNC); + c->hash_tfm = crypto_alloc_shash(c->auth_hash_name, 0, 0); if (IS_ERR(c->hash_tfm)) { err = PTR_ERR(c->hash_tfm); ubifs_err(c, "Can not allocate %s: %d", @@ -286,7 +285,7 @@ int ubifs_init_authentication(struct ubifs_info *c) goto out_free_hash; } - c->hmac_tfm = crypto_alloc_shash(hmac_name, 0, CRYPTO_ALG_ASYNC); + c->hmac_tfm = crypto_alloc_shash(hmac_name, 0, 0); if (IS_ERR(c->hmac_tfm)) { err = PTR_ERR(c->hmac_tfm); ubifs_err(c, "Can not allocate %s: %d", hmac_name, err); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 1b78f2e09218..5d2ffb1a45fc 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1481,7 +1481,7 @@ static int ubifs_migrate_page(struct address_space *mapping, { int rc; - rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); + rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0); if (rc != MIGRATEPAGE_SUCCESS) return rc; diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c index d1d5e96350dd..b0c5f06128b5 100644 --- a/fs/ubifs/lpt.c +++ b/fs/ubifs/lpt.c @@ -1675,6 +1675,12 @@ int ubifs_lpt_calc_hash(struct ubifs_info *c, u8 *hash) if (!ubifs_authenticated(c)) return 0; + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return err; + } + desc = ubifs_hash_get_desc(c); if (IS_ERR(desc)) return PTR_ERR(desc); @@ -1685,12 +1691,6 @@ int ubifs_lpt_calc_hash(struct ubifs_info *c, u8 *hash) goto out; } - if (!c->nroot) { - err = ubifs_read_nnode(c, NULL, 0); - if (err) - return err; - } - cnode = (struct ubifs_cnode *)c->nroot; while (cnode) { diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 75f961c4c044..0a0e65c07c6d 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -213,6 +213,38 @@ static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r) } /** + * inode_still_linked - check whether inode in question will be re-linked. + * @c: UBIFS file-system description object + * @rino: replay entry to test + * + * O_TMPFILE files can be re-linked, this means link count goes from 0 to 1. + * This case needs special care, otherwise all references to the inode will + * be removed upon the first replay entry of an inode with link count 0 + * is found. + */ +static bool inode_still_linked(struct ubifs_info *c, struct replay_entry *rino) +{ + struct replay_entry *r; + + ubifs_assert(c, rino->deletion); + ubifs_assert(c, key_type(c, &rino->key) == UBIFS_INO_KEY); + + /* + * Find the most recent entry for the inode behind @rino and check + * whether it is a deletion. + */ + list_for_each_entry_reverse(r, &c->replay_list, list) { + ubifs_assert(c, r->sqnum >= rino->sqnum); + if (key_inum(c, &r->key) == key_inum(c, &rino->key)) + return r->deletion == 0; + + } + + ubifs_assert(c, 0); + return false; +} + +/** * apply_replay_entry - apply a replay entry to the TNC. 
* @c: UBIFS file-system description object * @r: replay entry to apply @@ -239,6 +271,11 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) { ino_t inum = key_inum(c, &r->key); + if (inode_still_linked(c, r)) { + err = 0; + break; + } + err = ubifs_tnc_remove_ino(c, inum); break; } @@ -533,6 +570,28 @@ static int is_last_bud(struct ubifs_info *c, struct ubifs_bud *bud) return data == 0xFFFFFFFF; } +/* authenticate_sleb_hash and authenticate_sleb_hmac are split out for stack usage */ +static int authenticate_sleb_hash(struct ubifs_info *c, struct shash_desc *log_hash, u8 *hash) +{ + SHASH_DESC_ON_STACK(hash_desc, c->hash_tfm); + + hash_desc->tfm = c->hash_tfm; + hash_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + ubifs_shash_copy_state(c, log_hash, hash_desc); + return crypto_shash_final(hash_desc, hash); +} + +static int authenticate_sleb_hmac(struct ubifs_info *c, u8 *hash, u8 *hmac) +{ + SHASH_DESC_ON_STACK(hmac_desc, c->hmac_tfm); + + hmac_desc->tfm = c->hmac_tfm; + hmac_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + return crypto_shash_digest(hmac_desc, hash, c->hash_len, hmac); +} + /** * authenticate_sleb - authenticate one scan LEB * @c: UBIFS file-system description object @@ -574,21 +633,12 @@ static int authenticate_sleb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, if (snod->type == UBIFS_AUTH_NODE) { struct ubifs_auth_node *auth = snod->node; - SHASH_DESC_ON_STACK(hash_desc, c->hash_tfm); - SHASH_DESC_ON_STACK(hmac_desc, c->hmac_tfm); - - hash_desc->tfm = c->hash_tfm; - hash_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; - ubifs_shash_copy_state(c, log_hash, hash_desc); - err = crypto_shash_final(hash_desc, hash); + err = authenticate_sleb_hash(c, log_hash, hash); if (err) goto out; - hmac_desc->tfm = c->hmac_tfm; - hmac_desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; - err = crypto_shash_digest(hmac_desc, hash, c->hash_len, - hmac); + err = authenticate_sleb_hmac(c, hash, hmac); if (err) goto out; diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 75a69dd26d6e..3da90c951c23 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -63,6 +63,17 @@ /* Default time granularity in nanoseconds */ #define DEFAULT_TIME_GRAN 1000000000 +static int get_default_compressor(struct ubifs_info *c) +{ + if (ubifs_compr_present(c, UBIFS_COMPR_LZO)) + return UBIFS_COMPR_LZO; + + if (ubifs_compr_present(c, UBIFS_COMPR_ZLIB)) + return UBIFS_COMPR_ZLIB; + + return UBIFS_COMPR_NONE; +} + /** * create_default_filesystem - format empty UBI volume. 
* @c: UBIFS file-system description object @@ -207,7 +218,7 @@ static int create_default_filesystem(struct ubifs_info *c) if (c->mount_opts.override_compr) sup->default_compr = cpu_to_le16(c->mount_opts.compr_type); else - sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO); + sup->default_compr = cpu_to_le16(get_default_compressor(c)); generate_random_uuid(sup->uuid); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 5df554a9f9c9..ae796e10f68b 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1357,6 +1357,12 @@ reread: iinfo->i_alloc_type = le16_to_cpu(fe->icbTag.flags) & ICBTAG_FLAG_AD_MASK; + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_SHORT && + iinfo->i_alloc_type != ICBTAG_FLAG_AD_LONG && + iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + ret = -EIO; + goto out; + } iinfo->i_unique = 0; iinfo->i_lenEAttr = 0; iinfo->i_lenExtents = 0; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 7a85e609fc27..89800fc7dc9d 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -53,7 +53,7 @@ struct userfaultfd_ctx { /* a refile sequence protected by fault_pending_wqh lock */ struct seqcount refile_seq; /* pseudo fd refcounting */ - atomic_t refcount; + refcount_t refcount; /* userfaultfd syscall flags */ unsigned int flags; /* features requested from the userspace */ @@ -140,8 +140,7 @@ out: */ static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx) { - if (!atomic_inc_not_zero(&ctx->refcount)) - BUG(); + refcount_inc(&ctx->refcount); } /** @@ -154,7 +153,7 @@ static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx) */ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) { - if (atomic_dec_and_test(&ctx->refcount)) { + if (refcount_dec_and_test(&ctx->refcount)) { VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock)); VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh)); VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock)); @@ -686,7 +685,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) return -ENOMEM; } - atomic_set(&ctx->refcount, 1); + refcount_set(&ctx->refcount, 1); ctx->flags = octx->flags; ctx->state = UFFD_STATE_RUNNING; ctx->features = octx->features; @@ -736,10 +735,18 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, struct userfaultfd_ctx *ctx; ctx = vma->vm_userfaultfd_ctx.ctx; - if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) { + + if (!ctx) + return; + + if (ctx->features & UFFD_FEATURE_EVENT_REMAP) { vm_ctx->ctx = ctx; userfaultfd_ctx_get(ctx); WRITE_ONCE(ctx->mmap_changing, true); + } else { + /* Drop uffd context if remap feature not enabled */ + vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING); } } @@ -926,7 +933,7 @@ static inline struct userfaultfd_wait_queue *find_userfault_in( wait_queue_entry_t *wq; struct userfaultfd_wait_queue *uwq; - VM_BUG_ON(!spin_is_locked(&wqh->lock)); + lockdep_assert_held(&wqh->lock); uwq = NULL; if (!waitqueue_active(wqh)) @@ -1927,7 +1934,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) if (!ctx) return -ENOMEM; - atomic_set(&ctx->refcount, 1); + refcount_set(&ctx->refcount, 1); ctx->flags = flags; ctx->features = 0; ctx->state = UFFD_STATE_WAIT_API; diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 9345802c99f7..999ad8d00d43 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -414,7 +414,6 @@ xfs_ag_extend_space( struct aghdr_init_data *id, xfs_extlen_t len) { - struct xfs_owner_info oinfo; struct xfs_buf *bp; struct xfs_agi *agi; struct xfs_agf *agf; @@ -448,17 +447,17 @@ xfs_ag_extend_space( /* * Free the 
new space. * - * XFS_RMAP_OWN_NULL is used here to tell the rmap btree that + * XFS_RMAP_OINFO_SKIP_UPDATE is used here to tell the rmap btree that * this doesn't actually exist in the rmap btree. */ - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_NULL); error = xfs_rmap_free(tp, bp, id->agno, be32_to_cpu(agf->agf_length) - len, - len, &oinfo); + len, &XFS_RMAP_OINFO_SKIP_UPDATE); if (error) return error; return xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, id->agno, be32_to_cpu(agf->agf_length) - len), - len, &oinfo, XFS_AG_RESV_NONE); + len, &XFS_RMAP_OINFO_SKIP_UPDATE, + XFS_AG_RESV_NONE); } diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index e1c0c0d2f1b0..b715668886a4 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1594,7 +1594,6 @@ xfs_alloc_ag_vextent_small( xfs_extlen_t *flenp, /* result length */ int *stat) /* status: 0-freelist, 1-normal/none */ { - struct xfs_owner_info oinfo; int error; xfs_agblock_t fbno; xfs_extlen_t flen; @@ -1648,9 +1647,8 @@ xfs_alloc_ag_vextent_small( * doesn't live in the free space, we need to clear * out the OWN_AG rmap. */ - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); error = xfs_rmap_free(args->tp, args->agbp, args->agno, - fbno, 1, &oinfo); + fbno, 1, &XFS_RMAP_OINFO_AG); if (error) goto error0; @@ -1694,28 +1692,28 @@ error0: */ STATIC int xfs_free_ag_extent( - xfs_trans_t *tp, - xfs_buf_t *agbp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len, - struct xfs_owner_info *oinfo, - enum xfs_ag_resv_type type) + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len, + const struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type type) { - xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */ - xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */ - int error; /* error return value */ - xfs_agblock_t gtbno; /* start of right neighbor block */ - xfs_extlen_t gtlen; /* length of right neighbor block */ - int haveleft; /* have a left neighbor block */ - int haveright; /* have a right neighbor block */ - int i; /* temp, result code */ - xfs_agblock_t ltbno; /* start of left neighbor block */ - xfs_extlen_t ltlen; /* length of left neighbor block */ - xfs_mount_t *mp; /* mount point struct for filesystem */ - xfs_agblock_t nbno; /* new starting block of freespace */ - xfs_extlen_t nlen; /* new length of freespace */ - xfs_perag_t *pag; /* per allocation group data */ + struct xfs_mount *mp; + struct xfs_perag *pag; + struct xfs_btree_cur *bno_cur; + struct xfs_btree_cur *cnt_cur; + xfs_agblock_t gtbno; /* start of right neighbor */ + xfs_extlen_t gtlen; /* length of right neighbor */ + xfs_agblock_t ltbno; /* start of left neighbor */ + xfs_extlen_t ltlen; /* length of left neighbor */ + xfs_agblock_t nbno; /* new starting block of freesp */ + xfs_extlen_t nlen; /* new length of freespace */ + int haveleft; /* have a left neighbor */ + int haveright; /* have a right neighbor */ + int i; + int error; bno_cur = cnt_cur = NULL; mp = tp->t_mountp; @@ -2314,10 +2312,11 @@ xfs_alloc_fix_freelist( * repair/rmap.c in xfsprogs for details. 
*/ memset(&targs, 0, sizeof(targs)); + /* struct copy below */ if (flags & XFS_ALLOC_FLAG_NORMAP) - xfs_rmap_skip_owner_update(&targs.oinfo); + targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; else - xfs_rmap_ag_owner(&targs.oinfo, XFS_RMAP_OWN_AG); + targs.oinfo = XFS_RMAP_OINFO_AG; while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) { error = xfs_alloc_get_freelist(tp, agbp, &bno, 0); if (error) @@ -2435,7 +2434,6 @@ xfs_alloc_get_freelist( be32_add_cpu(&agf->agf_flcount, -1); xfs_trans_agflist_delta(tp, -1); pag->pagf_flcount--; - xfs_perag_put(pag); logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT; if (btreeblk) { @@ -2443,6 +2441,7 @@ xfs_alloc_get_freelist( pag->pagf_btreeblks++; logflags |= XFS_AGF_BTREEBLKS; } + xfs_perag_put(pag); xfs_alloc_log_agf(tp, agbp, logflags); *bnop = bno; @@ -3008,21 +3007,21 @@ out: * Just break up the extent address and hand off to xfs_free_ag_extent * after fixing up the freelist. */ -int /* error */ +int __xfs_free_extent( - struct xfs_trans *tp, /* transaction pointer */ - xfs_fsblock_t bno, /* starting block number of extent */ - xfs_extlen_t len, /* length of extent */ - struct xfs_owner_info *oinfo, /* extent owner */ - enum xfs_ag_resv_type type, /* block reservation type */ - bool skip_discard) + struct xfs_trans *tp, + xfs_fsblock_t bno, + xfs_extlen_t len, + const struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type type, + bool skip_discard) { - struct xfs_mount *mp = tp->t_mountp; - struct xfs_buf *agbp; - xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno); - xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno); - int error; - unsigned int busy_flags = 0; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_buf *agbp; + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno); + xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno); + int error; + unsigned int busy_flags = 0; ASSERT(len != 0); ASSERT(type != XFS_AG_RESV_AGFL); diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 00cd5ec4cb6b..d6ed5d2c07c2 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -182,7 +182,7 @@ __xfs_free_extent( struct xfs_trans *tp, /* transaction pointer */ xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len, /* length of extent */ - struct xfs_owner_info *oinfo, /* extent owner */ + const struct xfs_owner_info *oinfo, /* extent owner */ enum xfs_ag_resv_type type, /* block reservation type */ bool skip_discard); @@ -191,7 +191,7 @@ xfs_free_extent( struct xfs_trans *tp, xfs_fsblock_t bno, xfs_extlen_t len, - struct xfs_owner_info *oinfo, + const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type) { return __xfs_free_extent(tp, bno, len, oinfo, type, false); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 19e921d1586f..332eefa2700b 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -536,7 +536,7 @@ __xfs_bmap_add_free( struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, - struct xfs_owner_info *oinfo, + const struct xfs_owner_info *oinfo, bool skip_discard) { struct xfs_extent_free_item *new; /* new element */ @@ -564,7 +564,7 @@ __xfs_bmap_add_free( if (oinfo) new->xefi_oinfo = *oinfo; else - xfs_rmap_skip_owner_update(&new->xefi_oinfo); + new->xefi_oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; new->xefi_skip_discard = skip_discard; trace_xfs_bmap_free_defer(tp->t_mountp, XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0, @@ -3453,7 +3453,7 @@ xfs_bmap_btalloc( args.tp = ap->tp; args.mp = mp; args.fsbno = ap->blkno; - xfs_rmap_skip_owner_update(&args.oinfo); + args.oinfo = 
XFS_RMAP_OINFO_SKIP_UPDATE; /* Trim the allocation back to the maximum an AG can fit. */ args.maxlen = min(ap->length, mp->m_ag_max_usable); diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 488dc8860fd7..09d3ea97cc15 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -186,7 +186,7 @@ int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version); void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); void __xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno, - xfs_filblks_t len, struct xfs_owner_info *oinfo, + xfs_filblks_t len, const struct xfs_owner_info *oinfo, bool skip_discard); void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, @@ -234,7 +234,7 @@ xfs_bmap_add_free( struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, - struct xfs_owner_info *oinfo) + const struct xfs_owner_info *oinfo) { __xfs_bmap_add_free(tp, bno, len, oinfo, false); } diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index e792b167150a..94f00427de98 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -172,7 +172,13 @@ * reoccur. */ -static const struct xfs_defer_op_type *defer_op_types[XFS_DEFER_OPS_TYPE_MAX]; +static const struct xfs_defer_op_type *defer_op_types[] = { + [XFS_DEFER_OPS_TYPE_BMAP] = &xfs_bmap_update_defer_type, + [XFS_DEFER_OPS_TYPE_REFCOUNT] = &xfs_refcount_update_defer_type, + [XFS_DEFER_OPS_TYPE_RMAP] = &xfs_rmap_update_defer_type, + [XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type, + [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type, +}; /* * For each pending item in the intake list, log its intent item and the @@ -185,15 +191,15 @@ xfs_defer_create_intents( { struct list_head *li; struct xfs_defer_pending *dfp; + const struct xfs_defer_op_type *ops; list_for_each_entry(dfp, &tp->t_dfops, dfp_list) { - dfp->dfp_intent = dfp->dfp_type->create_intent(tp, - dfp->dfp_count); + ops = defer_op_types[dfp->dfp_type]; + dfp->dfp_intent = ops->create_intent(tp, dfp->dfp_count); trace_xfs_defer_create_intent(tp->t_mountp, dfp); - list_sort(tp->t_mountp, &dfp->dfp_work, - dfp->dfp_type->diff_items); + list_sort(tp->t_mountp, &dfp->dfp_work, ops->diff_items); list_for_each(li, &dfp->dfp_work) - dfp->dfp_type->log_item(tp, dfp->dfp_intent, li); + ops->log_item(tp, dfp->dfp_intent, li); } } @@ -204,14 +210,16 @@ xfs_defer_trans_abort( struct list_head *dop_pending) { struct xfs_defer_pending *dfp; + const struct xfs_defer_op_type *ops; trace_xfs_defer_trans_abort(tp, _RET_IP_); /* Abort intent items that don't have a done item. */ list_for_each_entry(dfp, dop_pending, dfp_list) { + ops = defer_op_types[dfp->dfp_type]; trace_xfs_defer_pending_abort(tp->t_mountp, dfp); if (dfp->dfp_intent && !dfp->dfp_done) { - dfp->dfp_type->abort_intent(dfp->dfp_intent); + ops->abort_intent(dfp->dfp_intent); dfp->dfp_intent = NULL; } } @@ -315,18 +323,20 @@ xfs_defer_cancel_list( struct xfs_defer_pending *pli; struct list_head *pwi; struct list_head *n; + const struct xfs_defer_op_type *ops; /* * Free the pending items. Caller should already have arranged * for the intent items to be released. 
*/ list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) { + ops = defer_op_types[dfp->dfp_type]; trace_xfs_defer_cancel_list(mp, dfp); list_del(&dfp->dfp_list); list_for_each_safe(pwi, n, &dfp->dfp_work) { list_del(pwi); dfp->dfp_count--; - dfp->dfp_type->cancel_item(pwi); + ops->cancel_item(pwi); } ASSERT(dfp->dfp_count == 0); kmem_free(dfp); @@ -350,7 +360,7 @@ xfs_defer_finish_noroll( struct list_head *n; void *state; int error = 0; - void (*cleanup_fn)(struct xfs_trans *, void *, int); + const struct xfs_defer_op_type *ops; LIST_HEAD(dop_pending); ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); @@ -373,18 +383,18 @@ xfs_defer_finish_noroll( /* Log an intent-done item for the first pending item. */ dfp = list_first_entry(&dop_pending, struct xfs_defer_pending, dfp_list); + ops = defer_op_types[dfp->dfp_type]; trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp); - dfp->dfp_done = dfp->dfp_type->create_done(*tp, dfp->dfp_intent, + dfp->dfp_done = ops->create_done(*tp, dfp->dfp_intent, dfp->dfp_count); - cleanup_fn = dfp->dfp_type->finish_cleanup; /* Finish the work items. */ state = NULL; list_for_each_safe(li, n, &dfp->dfp_work) { list_del(li); dfp->dfp_count--; - error = dfp->dfp_type->finish_item(*tp, li, - dfp->dfp_done, &state); + error = ops->finish_item(*tp, li, dfp->dfp_done, + &state); if (error == -EAGAIN) { /* * Caller wants a fresh transaction; @@ -400,8 +410,8 @@ xfs_defer_finish_noroll( * xfs_defer_cancel will take care of freeing * all these lists and stuff. */ - if (cleanup_fn) - cleanup_fn(*tp, state, error); + if (ops->finish_cleanup) + ops->finish_cleanup(*tp, state, error); goto out; } } @@ -413,20 +423,19 @@ xfs_defer_finish_noroll( * a Fresh Transaction while Finishing * Deferred Work" above. */ - dfp->dfp_intent = dfp->dfp_type->create_intent(*tp, + dfp->dfp_intent = ops->create_intent(*tp, dfp->dfp_count); dfp->dfp_done = NULL; list_for_each(li, &dfp->dfp_work) - dfp->dfp_type->log_item(*tp, dfp->dfp_intent, - li); + ops->log_item(*tp, dfp->dfp_intent, li); } else { /* Done with the dfp, free it. */ list_del(&dfp->dfp_list); kmem_free(dfp); } - if (cleanup_fn) - cleanup_fn(*tp, state, error); + if (ops->finish_cleanup) + ops->finish_cleanup(*tp, state, error); } out: @@ -486,8 +495,10 @@ xfs_defer_add( struct list_head *li) { struct xfs_defer_pending *dfp = NULL; + const struct xfs_defer_op_type *ops; ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX); /* * Add the item to a pending item at the end of the intake list. @@ -497,15 +508,15 @@ xfs_defer_add( if (!list_empty(&tp->t_dfops)) { dfp = list_last_entry(&tp->t_dfops, struct xfs_defer_pending, dfp_list); - if (dfp->dfp_type->type != type || - (dfp->dfp_type->max_items && - dfp->dfp_count >= dfp->dfp_type->max_items)) + ops = defer_op_types[dfp->dfp_type]; + if (dfp->dfp_type != type || + (ops->max_items && dfp->dfp_count >= ops->max_items)) dfp = NULL; } if (!dfp) { dfp = kmem_alloc(sizeof(struct xfs_defer_pending), KM_SLEEP | KM_NOFS); - dfp->dfp_type = defer_op_types[type]; + dfp->dfp_type = type; dfp->dfp_intent = NULL; dfp->dfp_done = NULL; dfp->dfp_count = 0; @@ -517,14 +528,6 @@ xfs_defer_add( dfp->dfp_count++; } -/* Initialize a deferred operation list. */ -void -xfs_defer_init_op_type( - const struct xfs_defer_op_type *type) -{ - defer_op_types[type->type] = type; -} - /* * Move deferred ops from one transaction to another and reset the source to * initial state. 
This is primarily used to carry state forward across diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 2584a5b95b0d..7c28d7608ac6 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -9,20 +9,6 @@ struct xfs_defer_op_type; /* - * Save a log intent item and a list of extents, so that we can replay - * whatever action had to happen to the extent list and file the log done - * item. - */ -struct xfs_defer_pending { - const struct xfs_defer_op_type *dfp_type; /* function pointers */ - struct list_head dfp_list; /* pending items */ - void *dfp_intent; /* log intent item */ - void *dfp_done; /* log done item */ - struct list_head dfp_work; /* work items */ - unsigned int dfp_count; /* # extent items */ -}; - -/* * Header for deferred operation list. */ enum xfs_defer_ops_type { @@ -34,6 +20,20 @@ enum xfs_defer_ops_type { XFS_DEFER_OPS_TYPE_MAX, }; +/* + * Save a log intent item and a list of extents, so that we can replay + * whatever action had to happen to the extent list and file the log done + * item. + */ +struct xfs_defer_pending { + struct list_head dfp_list; /* pending items */ + struct list_head dfp_work; /* work items */ + void *dfp_intent; /* log intent item */ + void *dfp_done; /* log done item */ + unsigned int dfp_count; /* # extent items */ + enum xfs_defer_ops_type dfp_type; +}; + void xfs_defer_add(struct xfs_trans *tp, enum xfs_defer_ops_type type, struct list_head *h); int xfs_defer_finish_noroll(struct xfs_trans **tp); @@ -43,8 +43,6 @@ void xfs_defer_move(struct xfs_trans *dtp, struct xfs_trans *stp); /* Description of a deferred type. */ struct xfs_defer_op_type { - enum xfs_defer_ops_type type; - unsigned int max_items; void (*abort_intent)(void *); void *(*create_done)(struct xfs_trans *, void *, unsigned int); int (*finish_item)(struct xfs_trans *, struct list_head *, void *, @@ -54,8 +52,13 @@ struct xfs_defer_op_type { int (*diff_items)(void *, struct list_head *, struct list_head *); void *(*create_intent)(struct xfs_trans *, uint); void (*log_item)(struct xfs_trans *, void *, struct list_head *); + unsigned int max_items; }; -void xfs_defer_init_op_type(const struct xfs_defer_op_type *type); +extern const struct xfs_defer_op_type xfs_bmap_update_defer_type; +extern const struct xfs_defer_op_type xfs_refcount_update_defer_type; +extern const struct xfs_defer_op_type xfs_rmap_update_defer_type; +extern const struct xfs_defer_op_type xfs_extent_free_defer_type; +extern const struct xfs_defer_op_type xfs_agfl_free_defer_type; #endif /* __XFS_DEFER_H__ */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 9995d5ae380b..9bb3c48843ec 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -916,6 +916,9 @@ static inline uint xfs_dinode_size(int version) /* * Values for di_format + * + * This enum is used in string mapping in xfs_trace.h; please keep the + * TRACE_DEFINE_ENUMs for it up to date. */ typedef enum xfs_dinode_fmt { XFS_DINODE_FMT_DEV, /* xfs_dev_t */ @@ -925,6 +928,13 @@ typedef enum xfs_dinode_fmt { XFS_DINODE_FMT_UUID /* added long ago, but never used */ } xfs_dinode_fmt_t; +#define XFS_INODE_FORMAT_STR \ + { XFS_DINODE_FMT_DEV, "dev" }, \ + { XFS_DINODE_FMT_LOCAL, "local" }, \ + { XFS_DINODE_FMT_EXTENTS, "extent" }, \ + { XFS_DINODE_FMT_BTREE, "btree" }, \ + { XFS_DINODE_FMT_UUID, "uuid" } + /* * Inode minimum and maximum sizes. 
*/ @@ -1083,6 +1093,8 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) ((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp))) #define XFS_OFFBNO_TO_AGINO(mp,b,o) \ ((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o))) +#define XFS_FSB_TO_INO(mp, b) ((xfs_ino_t)((b) << XFS_INO_OFFSET_BITS(mp))) +#define XFS_AGB_TO_AGINO(mp, b) ((xfs_agino_t)((b) << XFS_INO_OFFSET_BITS(mp))) #define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL)) #define XFS_MAXINUMBER_32 ((xfs_ino_t)((1ULL << 32) - 1ULL)) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index a8f6db735d5d..d32152fc8a6c 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -288,7 +288,7 @@ xfs_ialloc_inode_init( { struct xfs_buf *fbuf; struct xfs_dinode *free; - int nbufs, blks_per_cluster, inodes_per_cluster; + int nbufs; int version; int i, j; xfs_daddr_t d; @@ -299,9 +299,7 @@ xfs_ialloc_inode_init( * sizes, manipulate the inodes in buffers which are multiples of the * blocks size. */ - blks_per_cluster = xfs_icluster_size_fsb(mp); - inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; - nbufs = length / blks_per_cluster; + nbufs = length / mp->m_blocks_per_cluster; /* * Figure out what version number to use in the inodes we create. If @@ -312,7 +310,7 @@ xfs_ialloc_inode_init( * * For v3 inodes, we also need to write the inode number into the inode, * so calculate the first inode number of the chunk here as - * XFS_OFFBNO_TO_AGINO() only works within a filesystem block, not + * XFS_AGB_TO_AGINO() only works within a filesystem block, not * across multiple filesystem blocks (such as a cluster) and so cannot * be used in the cluster buffer loop below. * @@ -324,8 +322,7 @@ xfs_ialloc_inode_init( */ if (xfs_sb_version_hascrc(&mp->m_sb)) { version = 3; - ino = XFS_AGINO_TO_INO(mp, agno, - XFS_OFFBNO_TO_AGINO(mp, agbno, 0)); + ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno)); /* * log the initialisation that is about to take place as an @@ -345,9 +342,10 @@ xfs_ialloc_inode_init( /* * Get the block. */ - d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); + d = XFS_AGB_TO_DADDR(mp, agno, agbno + + (j * mp->m_blocks_per_cluster)); fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - mp->m_bsize * blks_per_cluster, + mp->m_bsize * mp->m_blocks_per_cluster, XBF_UNMAPPED); if (!fbuf) return -ENOMEM; @@ -355,7 +353,7 @@ xfs_ialloc_inode_init( /* Initialize the inode buffers and log them appropriately. */ fbuf->b_ops = &xfs_inode_buf_ops; xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); - for (i = 0; i < inodes_per_cluster; i++) { + for (i = 0; i < mp->m_inodes_per_cluster; i++) { int ioffset = i << mp->m_sb.sb_inodelog; uint isize = xfs_dinode_size(version); @@ -445,7 +443,7 @@ xfs_align_sparse_ino( return; /* calculate the inode offset and align startino */ - offset = mod << mp->m_sb.sb_inopblog; + offset = XFS_AGB_TO_AGINO(mp, mod); *startino -= offset; /* @@ -641,7 +639,7 @@ xfs_ialloc_ag_alloc( args.tp = tp; args.mp = tp->t_mountp; args.fsbno = NULLFSBLOCK; - xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_INODES); + args.oinfo = XFS_RMAP_OINFO_INODES; #ifdef DEBUG /* randomly do sparse inode allocations */ @@ -692,7 +690,7 @@ xfs_ialloc_ag_alloc( * but not to use them in the actual exact allocation. */ args.alignment = 1; - args.minalignslop = xfs_ialloc_cluster_alignment(args.mp) - 1; + args.minalignslop = args.mp->m_cluster_align - 1; /* Allow space for the inode btree to split. 
*/ args.minleft = args.mp->m_in_maxlevels - 1; @@ -727,7 +725,7 @@ xfs_ialloc_ag_alloc( args.alignment = args.mp->m_dalign; isaligned = 1; } else - args.alignment = xfs_ialloc_cluster_alignment(args.mp); + args.alignment = args.mp->m_cluster_align; /* * Need to figure out where to allocate the inode blocks. * Ideally they should be spaced out through the a.g. @@ -756,7 +754,7 @@ xfs_ialloc_ag_alloc( args.type = XFS_ALLOCTYPE_NEAR_BNO; args.agbno = be32_to_cpu(agi->agi_root); args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); - args.alignment = xfs_ialloc_cluster_alignment(args.mp); + args.alignment = args.mp->m_cluster_align; if ((error = xfs_alloc_vextent(&args))) return error; } @@ -797,7 +795,7 @@ sparse_alloc: if (error) return error; - newlen = args.len << args.mp->m_sb.sb_inopblog; + newlen = XFS_AGB_TO_AGINO(args.mp, args.len); ASSERT(newlen <= XFS_INODES_PER_CHUNK); allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1; } @@ -825,7 +823,7 @@ sparse_alloc: /* * Convert the results. */ - newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); + newino = XFS_AGB_TO_AGINO(args.mp, args.agbno); if (xfs_inobt_issparse(~allocmask)) { /* @@ -1019,7 +1017,7 @@ xfs_ialloc_ag_select( */ ineed = mp->m_ialloc_min_blks; if (flags && ineed > 1) - ineed += xfs_ialloc_cluster_alignment(mp); + ineed += mp->m_cluster_align; longest = pag->pagf_longest; if (!longest) longest = pag->pagf_flcount > 0; @@ -1849,14 +1847,12 @@ xfs_difree_inode_chunk( int nextbit; xfs_agblock_t agbno; int contigblk; - struct xfs_owner_info oinfo; DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS); - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES); if (!xfs_inobt_issparse(rec->ir_holemask)) { /* not sparse, calculate extent info directly */ xfs_bmap_add_free(tp, XFS_AGB_TO_FSB(mp, agno, sagbno), - mp->m_ialloc_blks, &oinfo); + mp->m_ialloc_blks, &XFS_RMAP_OINFO_INODES); return; } @@ -1900,7 +1896,7 @@ xfs_difree_inode_chunk( ASSERT(agbno % mp->m_sb.sb_spino_align == 0); ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); xfs_bmap_add_free(tp, XFS_AGB_TO_FSB(mp, agno, agbno), - contigblk, &oinfo); + contigblk, &XFS_RMAP_OINFO_INODES); /* reset range to current bit and carry on... */ startidx = endidx = nextbit; @@ -2292,7 +2288,6 @@ xfs_imap( xfs_agblock_t agbno; /* block number of inode in the alloc group */ xfs_agino_t agino; /* inode number within alloc group */ xfs_agnumber_t agno; /* allocation group number */ - int blks_per_cluster; /* num blocks per inode cluster */ xfs_agblock_t chunk_agbno; /* first block in inode chunk */ xfs_agblock_t cluster_agbno; /* first block in inode cluster */ int error; /* error code */ @@ -2338,8 +2333,6 @@ xfs_imap( return -EINVAL; } - blks_per_cluster = xfs_icluster_size_fsb(mp); - /* * For bulkstat and handle lookups, we have an untrusted inode number * that we have to verify is valid. We cannot do this just by reading @@ -2359,7 +2352,7 @@ xfs_imap( * If the inode cluster size is the same as the blocksize or * smaller we get to the buffer by simple arithmetics. 
*/ - if (blks_per_cluster == 1) { + if (mp->m_blocks_per_cluster == 1) { offset = XFS_INO_TO_OFFSET(mp, ino); ASSERT(offset < mp->m_sb.sb_inopblock); @@ -2388,12 +2381,13 @@ xfs_imap( out_map: ASSERT(agbno >= chunk_agbno); cluster_agbno = chunk_agbno + - ((offset_agbno / blks_per_cluster) * blks_per_cluster); + ((offset_agbno / mp->m_blocks_per_cluster) * + mp->m_blocks_per_cluster); offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + XFS_INO_TO_OFFSET(mp, ino); imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno); - imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster); + imap->im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster); imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); /* @@ -2726,8 +2720,8 @@ xfs_ialloc_has_inodes_at_extent( xfs_agino_t low; xfs_agino_t high; - low = XFS_OFFBNO_TO_AGINO(cur->bc_mp, bno, 0); - high = XFS_OFFBNO_TO_AGINO(cur->bc_mp, bno + len, 0) - 1; + low = XFS_AGB_TO_AGINO(cur->bc_mp, bno); + high = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1; return xfs_ialloc_has_inode_record(cur, low, high, exists); } diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 7fbf8af0b159..9b25e7a0df47 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -84,7 +84,7 @@ __xfs_inobt_alloc_block( memset(&args, 0, sizeof(args)); args.tp = cur->bc_tp; args.mp = cur->bc_mp; - xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_INOBT); + args.oinfo = XFS_RMAP_OINFO_INOBT; args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno); args.minlen = 1; args.maxlen = 1; @@ -136,12 +136,9 @@ __xfs_inobt_free_block( struct xfs_buf *bp, enum xfs_ag_resv_type resv) { - struct xfs_owner_info oinfo; - - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT); return xfs_free_extent(cur->bc_tp, XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1, - &oinfo, resv); + &XFS_RMAP_OINFO_INOBT, resv); } STATIC int diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 1aaa01c97517..d9eab657b63e 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -70,7 +70,7 @@ xfs_refcountbt_alloc_block( args.type = XFS_ALLOCTYPE_NEAR_BNO; args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, xfs_refc_block(args.mp)); - xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_REFC); + args.oinfo = XFS_RMAP_OINFO_REFC; args.minlen = args.maxlen = args.prod = 1; args.resv = XFS_AG_RESV_METADATA; @@ -106,15 +106,13 @@ xfs_refcountbt_free_block( struct xfs_buf *agbp = cur->bc_private.a.agbp; struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); - struct xfs_owner_info oinfo; int error; trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_private.a.agno, XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1); - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_REFC); be32_add_cpu(&agf->agf_refcount_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); - error = xfs_free_extent(cur->bc_tp, fsbno, 1, &oinfo, + error = xfs_free_extent(cur->bc_tp, fsbno, 1, &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 245af452840e..8ed885507dd8 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -458,21 +458,21 @@ out: */ STATIC int xfs_rmap_unmap( - struct xfs_btree_cur *cur, - xfs_agblock_t bno, - xfs_extlen_t len, - bool unwritten, - struct xfs_owner_info *oinfo) + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t 
len, + bool unwritten, + const struct xfs_owner_info *oinfo) { - struct xfs_mount *mp = cur->bc_mp; - struct xfs_rmap_irec ltrec; - uint64_t ltoff; - int error = 0; - int i; - uint64_t owner; - uint64_t offset; - unsigned int flags; - bool ignore_off; + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec ltrec; + uint64_t ltoff; + int error = 0; + int i; + uint64_t owner; + uint64_t offset; + unsigned int flags; + bool ignore_off; xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); ignore_off = XFS_RMAP_NON_INODE_OWNER(owner) || @@ -653,16 +653,16 @@ out_error: */ int xfs_rmap_free( - struct xfs_trans *tp, - struct xfs_buf *agbp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len, - struct xfs_owner_info *oinfo) + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len, + const struct xfs_owner_info *oinfo) { - struct xfs_mount *mp = tp->t_mountp; - struct xfs_btree_cur *cur; - int error; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_btree_cur *cur; + int error; if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) return 0; @@ -710,23 +710,23 @@ xfs_rmap_is_mergeable( */ STATIC int xfs_rmap_map( - struct xfs_btree_cur *cur, - xfs_agblock_t bno, - xfs_extlen_t len, - bool unwritten, - struct xfs_owner_info *oinfo) + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool unwritten, + const struct xfs_owner_info *oinfo) { - struct xfs_mount *mp = cur->bc_mp; - struct xfs_rmap_irec ltrec; - struct xfs_rmap_irec gtrec; - int have_gt; - int have_lt; - int error = 0; - int i; - uint64_t owner; - uint64_t offset; - unsigned int flags = 0; - bool ignore_off; + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec ltrec; + struct xfs_rmap_irec gtrec; + int have_gt; + int have_lt; + int error = 0; + int i; + uint64_t owner; + uint64_t offset; + unsigned int flags = 0; + bool ignore_off; xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); ASSERT(owner != 0); @@ -890,16 +890,16 @@ out_error: */ int xfs_rmap_alloc( - struct xfs_trans *tp, - struct xfs_buf *agbp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len, - struct xfs_owner_info *oinfo) + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len, + const struct xfs_owner_info *oinfo) { - struct xfs_mount *mp = tp->t_mountp; - struct xfs_btree_cur *cur; - int error; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_btree_cur *cur; + int error; if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) return 0; @@ -929,16 +929,16 @@ xfs_rmap_alloc( */ STATIC int xfs_rmap_convert( - struct xfs_btree_cur *cur, - xfs_agblock_t bno, - xfs_extlen_t len, - bool unwritten, - struct xfs_owner_info *oinfo) + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool unwritten, + const struct xfs_owner_info *oinfo) { - struct xfs_mount *mp = cur->bc_mp; - struct xfs_rmap_irec r[4]; /* neighbor extent entries */ - /* left is 0, right is 1, prev is 2 */ - /* new is 3 */ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec r[4]; /* neighbor extent entries */ + /* left is 0, right is 1, */ + /* prev is 2, new is 3 */ uint64_t owner; uint64_t offset; uint64_t new_endoff; @@ -1354,16 +1354,16 @@ done: */ STATIC int xfs_rmap_convert_shared( - struct xfs_btree_cur *cur, - xfs_agblock_t bno, - xfs_extlen_t len, - bool unwritten, - struct xfs_owner_info *oinfo) + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool unwritten, + const struct xfs_owner_info *oinfo) { - struct 
xfs_mount *mp = cur->bc_mp; - struct xfs_rmap_irec r[4]; /* neighbor extent entries */ - /* left is 0, right is 1, prev is 2 */ - /* new is 3 */ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec r[4]; /* neighbor extent entries */ + /* left is 0, right is 1, */ + /* prev is 2, new is 3 */ uint64_t owner; uint64_t offset; uint64_t new_endoff; @@ -1743,20 +1743,20 @@ done: */ STATIC int xfs_rmap_unmap_shared( - struct xfs_btree_cur *cur, - xfs_agblock_t bno, - xfs_extlen_t len, - bool unwritten, - struct xfs_owner_info *oinfo) + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool unwritten, + const struct xfs_owner_info *oinfo) { - struct xfs_mount *mp = cur->bc_mp; - struct xfs_rmap_irec ltrec; - uint64_t ltoff; - int error = 0; - int i; - uint64_t owner; - uint64_t offset; - unsigned int flags; + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec ltrec; + uint64_t ltoff; + int error = 0; + int i; + uint64_t owner; + uint64_t offset; + unsigned int flags; xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); if (unwritten) @@ -1905,22 +1905,22 @@ out_error: */ STATIC int xfs_rmap_map_shared( - struct xfs_btree_cur *cur, - xfs_agblock_t bno, - xfs_extlen_t len, - bool unwritten, - struct xfs_owner_info *oinfo) + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool unwritten, + const struct xfs_owner_info *oinfo) { - struct xfs_mount *mp = cur->bc_mp; - struct xfs_rmap_irec ltrec; - struct xfs_rmap_irec gtrec; - int have_gt; - int have_lt; - int error = 0; - int i; - uint64_t owner; - uint64_t offset; - unsigned int flags = 0; + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec ltrec; + struct xfs_rmap_irec gtrec; + int have_gt; + int have_lt; + int error = 0; + int i; + uint64_t owner; + uint64_t offset; + unsigned int flags = 0; xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); if (unwritten) @@ -2459,18 +2459,18 @@ xfs_rmap_has_record( */ int xfs_rmap_record_exists( - struct xfs_btree_cur *cur, - xfs_agblock_t bno, - xfs_extlen_t len, - struct xfs_owner_info *oinfo, - bool *has_rmap) + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + const struct xfs_owner_info *oinfo, + bool *has_rmap) { - uint64_t owner; - uint64_t offset; - unsigned int flags; - int has_record; - struct xfs_rmap_irec irec; - int error; + uint64_t owner; + uint64_t offset; + unsigned int flags; + int has_record; + struct xfs_rmap_irec irec; + int error; xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); ASSERT(XFS_RMAP_NON_INODE_OWNER(owner) || @@ -2530,7 +2530,7 @@ xfs_rmap_has_other_keys( struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, - struct xfs_owner_info *oinfo, + const struct xfs_owner_info *oinfo, bool *has_rmap) { struct xfs_rmap_irec low = {0}; @@ -2550,3 +2550,31 @@ xfs_rmap_has_other_keys( *has_rmap = rks.has_rmap; return error; } + +const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = { + .oi_owner = XFS_RMAP_OWN_NULL, +}; +const struct xfs_owner_info XFS_RMAP_OINFO_ANY_OWNER = { + .oi_owner = XFS_RMAP_OWN_UNKNOWN, +}; +const struct xfs_owner_info XFS_RMAP_OINFO_FS = { + .oi_owner = XFS_RMAP_OWN_FS, +}; +const struct xfs_owner_info XFS_RMAP_OINFO_LOG = { + .oi_owner = XFS_RMAP_OWN_LOG, +}; +const struct xfs_owner_info XFS_RMAP_OINFO_AG = { + .oi_owner = XFS_RMAP_OWN_AG, +}; +const struct xfs_owner_info XFS_RMAP_OINFO_INOBT = { + .oi_owner = XFS_RMAP_OWN_INOBT, +}; +const struct xfs_owner_info XFS_RMAP_OINFO_INODES = { + .oi_owner = XFS_RMAP_OWN_INODES, +}; +const struct xfs_owner_info 
XFS_RMAP_OINFO_REFC = { + .oi_owner = XFS_RMAP_OWN_REFC, +}; +const struct xfs_owner_info XFS_RMAP_OINFO_COW = { + .oi_owner = XFS_RMAP_OWN_COW, +}; diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 157dc722ad35..e21ed0294e5c 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -7,16 +7,6 @@ #define __XFS_RMAP_H__ static inline void -xfs_rmap_ag_owner( - struct xfs_owner_info *oi, - uint64_t owner) -{ - oi->oi_owner = owner; - oi->oi_offset = 0; - oi->oi_flags = 0; -} - -static inline void xfs_rmap_ino_bmbt_owner( struct xfs_owner_info *oi, xfs_ino_t ino, @@ -43,27 +33,13 @@ xfs_rmap_ino_owner( oi->oi_flags |= XFS_OWNER_INFO_ATTR_FORK; } -static inline void -xfs_rmap_skip_owner_update( - struct xfs_owner_info *oi) -{ - xfs_rmap_ag_owner(oi, XFS_RMAP_OWN_NULL); -} - static inline bool xfs_rmap_should_skip_owner_update( - struct xfs_owner_info *oi) + const struct xfs_owner_info *oi) { return oi->oi_owner == XFS_RMAP_OWN_NULL; } -static inline void -xfs_rmap_any_owner_update( - struct xfs_owner_info *oi) -{ - xfs_rmap_ag_owner(oi, XFS_RMAP_OWN_UNKNOWN); -} - /* Reverse mapping functions. */ struct xfs_buf; @@ -103,12 +79,12 @@ xfs_rmap_irec_offset_unpack( static inline void xfs_owner_info_unpack( - struct xfs_owner_info *oinfo, - uint64_t *owner, - uint64_t *offset, - unsigned int *flags) + const struct xfs_owner_info *oinfo, + uint64_t *owner, + uint64_t *offset, + unsigned int *flags) { - unsigned int r = 0; + unsigned int r = 0; *owner = oinfo->oi_owner; *offset = oinfo->oi_offset; @@ -137,10 +113,10 @@ xfs_owner_info_pack( int xfs_rmap_alloc(struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, - struct xfs_owner_info *oinfo); + const struct xfs_owner_info *oinfo); int xfs_rmap_free(struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, - struct xfs_owner_info *oinfo); + const struct xfs_owner_info *oinfo); int xfs_rmap_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, uint64_t owner, uint64_t offset, @@ -218,11 +194,21 @@ int xfs_rmap_btrec_to_irec(union xfs_btree_rec *rec, int xfs_rmap_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, bool *exists); int xfs_rmap_record_exists(struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len, struct xfs_owner_info *oinfo, + xfs_extlen_t len, const struct xfs_owner_info *oinfo, bool *has_rmap); int xfs_rmap_has_other_keys(struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len, struct xfs_owner_info *oinfo, + xfs_extlen_t len, const struct xfs_owner_info *oinfo, bool *has_rmap); int xfs_rmap_map_raw(struct xfs_btree_cur *cur, struct xfs_rmap_irec *rmap); +extern const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE; +extern const struct xfs_owner_info XFS_RMAP_OINFO_ANY_OWNER; +extern const struct xfs_owner_info XFS_RMAP_OINFO_FS; +extern const struct xfs_owner_info XFS_RMAP_OINFO_LOG; +extern const struct xfs_owner_info XFS_RMAP_OINFO_AG; +extern const struct xfs_owner_info XFS_RMAP_OINFO_INOBT; +extern const struct xfs_owner_info XFS_RMAP_OINFO_INODES; +extern const struct xfs_owner_info XFS_RMAP_OINFO_REFC; +extern const struct xfs_owner_info XFS_RMAP_OINFO_COW; + #endif /* __XFS_RMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index b228c821bae6..eaaff67e9626 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -505,6 +505,12 @@ xfs_rtmodify_summary_int( uint first = (uint)((char 
*)sp - (char *)bp->b_addr); *sp += delta; + if (mp->m_rsum_cache) { + if (*sp == 0 && log == mp->m_rsum_cache[bbno]) + mp->m_rsum_cache[bbno]++; + if (*sp != 0 && log < mp->m_rsum_cache[bbno]) + mp->m_rsum_cache[bbno] = log; + } xfs_trans_log_buf(tp, bp, first, first + sizeof(*sp) - 1); } if (sum) diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index 95374ab2dee7..77d80106f989 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -199,7 +199,10 @@ xfs_symlink_local_to_remote( ifp->if_bytes - 1); } -/* Verify the consistency of an inline symlink. */ +/* + * Verify the in-memory consistency of an inline symlink data fork. This + * does not do on-disk format checks. + */ xfs_failaddr_t xfs_symlink_shortform_verify( struct xfs_inode *ip) @@ -215,9 +218,12 @@ xfs_symlink_shortform_verify( size = ifp->if_bytes; endp = sfp + size; - /* Zero length symlinks can exist while we're deleting a remote one. */ - if (size == 0) - return NULL; + /* + * Zero length symlinks should never occur in memory as they are + * never allowed to exist on disk. + */ + if (!size) + return __this_address; /* No negative sizes or overly long symlink targets. */ if (size < 0 || size > XFS_SYMLINK_MAXLEN) diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c index 33a5ca346baf..3306fc42cfad 100644 --- a/fs/xfs/libxfs/xfs_types.c +++ b/fs/xfs/libxfs/xfs_types.c @@ -87,16 +87,15 @@ xfs_agino_range( * Calculate the first inode, which will be in the first * cluster-aligned block after the AGFL. */ - bno = round_up(XFS_AGFL_BLOCK(mp) + 1, - xfs_ialloc_cluster_alignment(mp)); - *first = XFS_OFFBNO_TO_AGINO(mp, bno, 0); + bno = round_up(XFS_AGFL_BLOCK(mp) + 1, mp->m_cluster_align); + *first = XFS_AGB_TO_AGINO(mp, bno); /* * Calculate the last inode, which will be at the end of the * last (aligned) cluster that can be allocated in the AG. */ - bno = round_down(eoag, xfs_ialloc_cluster_alignment(mp)); - *last = XFS_OFFBNO_TO_AGINO(mp, bno, 0) - 1; + bno = round_down(eoag, mp->m_cluster_align); + *last = XFS_AGB_TO_AGINO(mp, bno) - 1; } /* diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index b9e6c89284c3..8f02855a019a 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -100,15 +100,37 @@ typedef void * xfs_failaddr_t; */ #define MAXNAMELEN 256 +/* + * This enum is used in string mapping in xfs_trace.h; please keep the + * TRACE_DEFINE_ENUMs for it up to date. + */ typedef enum { XFS_LOOKUP_EQi, XFS_LOOKUP_LEi, XFS_LOOKUP_GEi } xfs_lookup_t; +#define XFS_AG_BTREE_CMP_FORMAT_STR \ + { XFS_LOOKUP_EQi, "eq" }, \ + { XFS_LOOKUP_LEi, "le" }, \ + { XFS_LOOKUP_GEi, "ge" } + +/* + * This enum is used in string mapping in xfs_trace.h and scrub/trace.h; + * please keep the TRACE_DEFINE_ENUMs for it up to date. 
+ */ typedef enum { XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi, XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_REFCi, XFS_BTNUM_MAX } xfs_btnum_t; +#define XFS_BTNUM_STRINGS \ + { XFS_BTNUM_BNOi, "bnobt" }, \ + { XFS_BTNUM_CNTi, "cntbt" }, \ + { XFS_BTNUM_RMAPi, "rmapbt" }, \ + { XFS_BTNUM_BMAPi, "bmbt" }, \ + { XFS_BTNUM_INOi, "inobt" }, \ + { XFS_BTNUM_FINOi, "finobt" }, \ + { XFS_BTNUM_REFCi, "refcbt" } + struct xfs_name { const unsigned char *name; int len; diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 3068a9382feb..90955ab1e895 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -32,7 +32,6 @@ xchk_superblock_xref( struct xfs_scrub *sc, struct xfs_buf *bp) { - struct xfs_owner_info oinfo; struct xfs_mount *mp = sc->mp; xfs_agnumber_t agno = sc->sm->sm_agno; xfs_agblock_t agbno; @@ -49,8 +48,7 @@ xchk_superblock_xref( xchk_xref_is_used_space(sc, agbno, 1); xchk_xref_is_not_inode_chunk(sc, agbno, 1); - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS); - xchk_xref_is_owned_by(sc, agbno, 1, &oinfo); + xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); xchk_xref_is_not_shared(sc, agbno, 1); /* scrub teardown will take care of sc->sa for us */ @@ -484,7 +482,6 @@ STATIC void xchk_agf_xref( struct xfs_scrub *sc) { - struct xfs_owner_info oinfo; struct xfs_mount *mp = sc->mp; xfs_agblock_t agbno; int error; @@ -502,8 +499,7 @@ xchk_agf_xref( xchk_agf_xref_freeblks(sc); xchk_agf_xref_cntbt(sc); xchk_xref_is_not_inode_chunk(sc, agbno, 1); - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS); - xchk_xref_is_owned_by(sc, agbno, 1, &oinfo); + xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); xchk_agf_xref_btreeblks(sc); xchk_xref_is_not_shared(sc, agbno, 1); xchk_agf_xref_refcblks(sc); @@ -598,7 +594,6 @@ out: /* AGFL */ struct xchk_agfl_info { - struct xfs_owner_info oinfo; unsigned int sz_entries; unsigned int nr_entries; xfs_agblock_t *entries; @@ -609,15 +604,14 @@ struct xchk_agfl_info { STATIC void xchk_agfl_block_xref( struct xfs_scrub *sc, - xfs_agblock_t agbno, - struct xfs_owner_info *oinfo) + xfs_agblock_t agbno) { if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return; xchk_xref_is_used_space(sc, agbno, 1); xchk_xref_is_not_inode_chunk(sc, agbno, 1); - xchk_xref_is_owned_by(sc, agbno, 1, oinfo); + xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_AG); xchk_xref_is_not_shared(sc, agbno, 1); } @@ -638,7 +632,7 @@ xchk_agfl_block( else xchk_block_set_corrupt(sc, sc->sa.agfl_bp); - xchk_agfl_block_xref(sc, agbno, priv); + xchk_agfl_block_xref(sc, agbno); if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return XFS_BTREE_QUERY_RANGE_ABORT; @@ -662,7 +656,6 @@ STATIC void xchk_agfl_xref( struct xfs_scrub *sc) { - struct xfs_owner_info oinfo; struct xfs_mount *mp = sc->mp; xfs_agblock_t agbno; int error; @@ -678,8 +671,7 @@ xchk_agfl_xref( xchk_xref_is_used_space(sc, agbno, 1); xchk_xref_is_not_inode_chunk(sc, agbno, 1); - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS); - xchk_xref_is_owned_by(sc, agbno, 1, &oinfo); + xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); xchk_xref_is_not_shared(sc, agbno, 1); /* @@ -732,7 +724,6 @@ xchk_agfl( } /* Check the blocks in the AGFL. 
*/ - xfs_rmap_ag_owner(&sai.oinfo, XFS_RMAP_OWN_AG); error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp), sc->sa.agfl_bp, xchk_agfl_block, &sai); if (error == XFS_BTREE_QUERY_RANGE_ABORT) { @@ -791,7 +782,6 @@ STATIC void xchk_agi_xref( struct xfs_scrub *sc) { - struct xfs_owner_info oinfo; struct xfs_mount *mp = sc->mp; xfs_agblock_t agbno; int error; @@ -808,8 +798,7 @@ xchk_agi_xref( xchk_xref_is_used_space(sc, agbno, 1); xchk_xref_is_not_inode_chunk(sc, agbno, 1); xchk_agi_xref_icounts(sc); - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_FS); - xchk_xref_is_owned_by(sc, agbno, 1, &oinfo); + xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_FS); xchk_xref_is_not_shared(sc, agbno, 1); /* scrub teardown will take care of sc->sa for us */ diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index f7568a4b5fe5..03d1e15cceba 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -646,7 +646,6 @@ int xrep_agfl( struct xfs_scrub *sc) { - struct xfs_owner_info oinfo; struct xfs_bitmap agfl_extents; struct xfs_mount *mp = sc->mp; struct xfs_buf *agf_bp; @@ -708,8 +707,8 @@ xrep_agfl( goto err; /* Dump any AGFL overflow. */ - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); - return xrep_reap_extents(sc, &agfl_extents, &oinfo, XFS_AG_RESV_AGFL); + return xrep_reap_extents(sc, &agfl_extents, &XFS_RMAP_OINFO_AG, + XFS_AG_RESV_AGFL); err: xfs_bitmap_destroy(&agfl_extents); return error; diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 376bcb585ae6..44883e9112ad 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -125,12 +125,10 @@ xchk_allocbt( struct xfs_scrub *sc, xfs_btnum_t which) { - struct xfs_owner_info oinfo; struct xfs_btree_cur *cur; - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); cur = which == XFS_BTNUM_BNO ? 
sc->sa.bno_cur : sc->sa.cnt_cur; - return xchk_btree(sc, cur, xchk_allocbt_rec, &oinfo, NULL); + return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, NULL); } int diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 4ae959f7ad2c..6f94d1f7322d 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -583,31 +583,32 @@ xchk_btree_block_keys( */ int xchk_btree( - struct xfs_scrub *sc, - struct xfs_btree_cur *cur, - xchk_btree_rec_fn scrub_fn, - struct xfs_owner_info *oinfo, - void *private) + struct xfs_scrub *sc, + struct xfs_btree_cur *cur, + xchk_btree_rec_fn scrub_fn, + const struct xfs_owner_info *oinfo, + void *private) { - struct xchk_btree bs = { NULL }; - union xfs_btree_ptr ptr; - union xfs_btree_ptr *pp; - union xfs_btree_rec *recp; - struct xfs_btree_block *block; - int level; - struct xfs_buf *bp; - struct check_owner *co; - struct check_owner *n; - int i; - int error = 0; + struct xchk_btree bs = { + .cur = cur, + .scrub_rec = scrub_fn, + .oinfo = oinfo, + .firstrec = true, + .private = private, + .sc = sc, + }; + union xfs_btree_ptr ptr; + union xfs_btree_ptr *pp; + union xfs_btree_rec *recp; + struct xfs_btree_block *block; + int level; + struct xfs_buf *bp; + struct check_owner *co; + struct check_owner *n; + int i; + int error = 0; /* Initialize scrub state */ - bs.cur = cur; - bs.scrub_rec = scrub_fn; - bs.oinfo = oinfo; - bs.firstrec = true; - bs.private = private; - bs.sc = sc; for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) bs.firstkey[i] = true; INIT_LIST_HEAD(&bs.to_check); diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h index aada763cd006..5572e475f8ed 100644 --- a/fs/xfs/scrub/btree.h +++ b/fs/xfs/scrub/btree.h @@ -31,21 +31,21 @@ typedef int (*xchk_btree_rec_fn)( struct xchk_btree { /* caller-provided scrub state */ - struct xfs_scrub *sc; - struct xfs_btree_cur *cur; - xchk_btree_rec_fn scrub_rec; - struct xfs_owner_info *oinfo; - void *private; + struct xfs_scrub *sc; + struct xfs_btree_cur *cur; + xchk_btree_rec_fn scrub_rec; + const struct xfs_owner_info *oinfo; + void *private; /* internal scrub state */ - union xfs_btree_rec lastrec; - bool firstrec; - union xfs_btree_key lastkey[XFS_BTREE_MAXLEVELS]; - bool firstkey[XFS_BTREE_MAXLEVELS]; - struct list_head to_check; + union xfs_btree_rec lastrec; + bool firstrec; + union xfs_btree_key lastkey[XFS_BTREE_MAXLEVELS]; + bool firstkey[XFS_BTREE_MAXLEVELS]; + struct list_head to_check; }; int xchk_btree(struct xfs_scrub *sc, struct xfs_btree_cur *cur, - xchk_btree_rec_fn scrub_fn, struct xfs_owner_info *oinfo, + xchk_btree_rec_fn scrub_fn, const struct xfs_owner_info *oinfo, void *private); #endif /* __XFS_SCRUB_BTREE_H__ */ diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 346b02abccf7..0c54ff55b901 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -313,8 +313,8 @@ xchk_set_incomplete( */ struct xchk_rmap_ownedby_info { - struct xfs_owner_info *oinfo; - xfs_filblks_t *blocks; + const struct xfs_owner_info *oinfo; + xfs_filblks_t *blocks; }; STATIC int @@ -347,15 +347,15 @@ int xchk_count_rmap_ownedby_ag( struct xfs_scrub *sc, struct xfs_btree_cur *cur, - struct xfs_owner_info *oinfo, + const struct xfs_owner_info *oinfo, xfs_filblks_t *blocks) { - struct xchk_rmap_ownedby_info sroi; + struct xchk_rmap_ownedby_info sroi = { + .oinfo = oinfo, + .blocks = blocks, + }; - sroi.oinfo = oinfo; *blocks = 0; - sroi.blocks = blocks; - return xfs_rmap_query_all(cur, xchk_count_rmap_ownedby_irec, &sroi); } diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h 
index 2d4324d12f9a..e26a430bd466 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -116,7 +116,7 @@ int xchk_ag_read_headers(struct xfs_scrub *sc, xfs_agnumber_t agno, void xchk_ag_btcur_free(struct xchk_ag *sa); int xchk_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa); int xchk_count_rmap_ownedby_ag(struct xfs_scrub *sc, struct xfs_btree_cur *cur, - struct xfs_owner_info *oinfo, xfs_filblks_t *blocks); + const struct xfs_owner_info *oinfo, xfs_filblks_t *blocks); int xchk_setup_ag_btree(struct xfs_scrub *sc, struct xfs_inode *ip, bool force_log); diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 224dba937492..882dc56c5c21 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -44,6 +44,11 @@ xchk_setup_ag_iallocbt( /* Inode btree scrubber. */ +struct xchk_iallocbt { + /* Number of inodes we see while scanning inobt. */ + unsigned long long inodes; +}; + /* * If we're checking the finobt, cross-reference with the inobt. * Otherwise we're checking the inobt; if there is an finobt, make sure @@ -82,15 +87,12 @@ xchk_iallocbt_chunk_xref( xfs_agblock_t agbno, xfs_extlen_t len) { - struct xfs_owner_info oinfo; - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return; xchk_xref_is_used_space(sc, agbno, len); xchk_iallocbt_chunk_xref_other(sc, irec, agino); - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES); - xchk_xref_is_owned_by(sc, agbno, len, &oinfo); + xchk_xref_is_owned_by(sc, agbno, len, &XFS_RMAP_OINFO_INODES); xchk_xref_is_not_shared(sc, agbno, len); } @@ -186,7 +188,6 @@ xchk_iallocbt_check_freemask( struct xchk_btree *bs, struct xfs_inobt_rec_incore *irec) { - struct xfs_owner_info oinfo; struct xfs_imap imap; struct xfs_mount *mp = bs->cur->bc_mp; struct xfs_dinode *dip; @@ -197,19 +198,16 @@ xchk_iallocbt_check_freemask( xfs_agino_t chunkino; xfs_agino_t clusterino; xfs_agblock_t agbno; - int blks_per_cluster; uint16_t holemask; uint16_t ir_holemask; int error = 0; /* Make sure the freemask matches the inode records. */ - blks_per_cluster = xfs_icluster_size_fsb(mp); - nr_inodes = XFS_OFFBNO_TO_AGINO(mp, blks_per_cluster, 0); - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES); + nr_inodes = mp->m_inodes_per_cluster; for (agino = irec->ir_startino; agino < irec->ir_startino + XFS_INODES_PER_CHUNK; - agino += blks_per_cluster * mp->m_sb.sb_inopblock) { + agino += mp->m_inodes_per_cluster) { fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino); chunkino = agino - irec->ir_startino; agbno = XFS_AGINO_TO_AGBNO(mp, agino); @@ -230,17 +228,18 @@ xchk_iallocbt_check_freemask( /* If any part of this is a hole, skip it. */ if (ir_holemask) { xchk_xref_is_not_owned_by(bs->sc, agbno, - blks_per_cluster, &oinfo); + mp->m_blocks_per_cluster, + &XFS_RMAP_OINFO_INODES); continue; } - xchk_xref_is_owned_by(bs->sc, agbno, blks_per_cluster, - &oinfo); + xchk_xref_is_owned_by(bs->sc, agbno, mp->m_blocks_per_cluster, + &XFS_RMAP_OINFO_INODES); /* Grab the inode cluster buffer. 
*/ imap.im_blkno = XFS_AGB_TO_DADDR(mp, bs->cur->bc_private.a.agno, agbno); - imap.im_len = XFS_FSB_TO_BB(mp, blks_per_cluster); + imap.im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster); imap.im_boffset = 0; error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, @@ -272,7 +271,7 @@ xchk_iallocbt_rec( union xfs_btree_rec *rec) { struct xfs_mount *mp = bs->cur->bc_mp; - xfs_filblks_t *inode_blocks = bs->private; + struct xchk_iallocbt *iabt = bs->private; struct xfs_inobt_rec_incore irec; uint64_t holes; xfs_agnumber_t agno = bs->cur->bc_private.a.agno; @@ -306,12 +305,11 @@ xchk_iallocbt_rec( /* Make sure this record is aligned to cluster and inoalignmnt size. */ agbno = XFS_AGINO_TO_AGBNO(mp, irec.ir_startino); - if ((agbno & (xfs_ialloc_cluster_alignment(mp) - 1)) || - (agbno & (xfs_icluster_size_fsb(mp) - 1))) + if ((agbno & (mp->m_cluster_align - 1)) || + (agbno & (mp->m_blocks_per_cluster - 1))) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - *inode_blocks += XFS_B_TO_FSB(mp, - irec.ir_count * mp->m_sb.sb_inodesize); + iabt->inodes += irec.ir_count; /* Handle non-sparse inodes */ if (!xfs_inobt_issparse(irec.ir_holemask)) { @@ -366,7 +364,6 @@ xchk_iallocbt_xref_rmap_btreeblks( struct xfs_scrub *sc, int which) { - struct xfs_owner_info oinfo; xfs_filblks_t blocks; xfs_extlen_t inobt_blocks = 0; xfs_extlen_t finobt_blocks = 0; @@ -388,9 +385,8 @@ xchk_iallocbt_xref_rmap_btreeblks( return; } - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT); - error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, &oinfo, - &blocks); + error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, + &XFS_RMAP_OINFO_INOBT, &blocks); if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) return; if (blocks != inobt_blocks + finobt_blocks) @@ -405,21 +401,21 @@ STATIC void xchk_iallocbt_xref_rmap_inodes( struct xfs_scrub *sc, int which, - xfs_filblks_t inode_blocks) + unsigned long long inodes) { - struct xfs_owner_info oinfo; xfs_filblks_t blocks; + xfs_filblks_t inode_blocks; int error; if (!sc->sa.rmap_cur || xchk_skip_xref(sc->sm)) return; /* Check that we saw as many inode blocks as the rmap knows about. */ - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES); - error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, &oinfo, - &blocks); + error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, + &XFS_RMAP_OINFO_INODES, &blocks); if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) return; + inode_blocks = XFS_B_TO_FSB(sc->mp, inodes * sc->mp->m_sb.sb_inodesize); if (blocks != inode_blocks) xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); } @@ -431,14 +427,14 @@ xchk_iallocbt( xfs_btnum_t which) { struct xfs_btree_cur *cur; - struct xfs_owner_info oinfo; - xfs_filblks_t inode_blocks = 0; + struct xchk_iallocbt iabt = { + .inodes = 0, + }; int error; - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT); cur = which == XFS_BTNUM_INO ? sc->sa.ino_cur : sc->sa.fino_cur; - error = xchk_btree(sc, cur, xchk_iallocbt_rec, &oinfo, - &inode_blocks); + error = xchk_btree(sc, cur, xchk_iallocbt_rec, &XFS_RMAP_OINFO_INOBT, + &iabt); if (error) return error; @@ -452,7 +448,7 @@ xchk_iallocbt( * to inode chunks with free inodes. 
*/ if (which == XFS_BTNUM_INO) - xchk_iallocbt_xref_rmap_inodes(sc, which, inode_blocks); + xchk_iallocbt_xref_rmap_inodes(sc, which, iabt.inodes); return error; } diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index e386c9b0b4ab..e213efc194a1 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -509,7 +509,6 @@ xchk_inode_xref( xfs_ino_t ino, struct xfs_dinode *dip) { - struct xfs_owner_info oinfo; xfs_agnumber_t agno; xfs_agblock_t agbno; int error; @@ -526,8 +525,7 @@ xchk_inode_xref( xchk_xref_is_used_space(sc, agbno, 1); xchk_inode_xref_finobt(sc, ino); - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES); - xchk_xref_is_owned_by(sc, agbno, 1, &oinfo); + xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_INODES); xchk_xref_is_not_shared(sc, agbno, 1); xchk_inode_xref_bmap(sc, dip); diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index e8c82b026083..708b4158eb90 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -383,7 +383,6 @@ xchk_refcountbt_rec( STATIC void xchk_refcount_xref_rmap( struct xfs_scrub *sc, - struct xfs_owner_info *oinfo, xfs_filblks_t cow_blocks) { xfs_extlen_t refcbt_blocks = 0; @@ -397,17 +396,16 @@ xchk_refcount_xref_rmap( error = xfs_btree_count_blocks(sc->sa.refc_cur, &refcbt_blocks); if (!xchk_btree_process_error(sc, sc->sa.refc_cur, 0, &error)) return; - error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, oinfo, - &blocks); + error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, + &XFS_RMAP_OINFO_REFC, &blocks); if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) return; if (blocks != refcbt_blocks) xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); /* Check that we saw as many cow blocks as the rmap knows about. */ - xfs_rmap_ag_owner(oinfo, XFS_RMAP_OWN_COW); - error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, oinfo, - &blocks); + error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur, + &XFS_RMAP_OINFO_COW, &blocks); if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur)) return; if (blocks != cow_blocks) @@ -419,17 +417,15 @@ int xchk_refcountbt( struct xfs_scrub *sc) { - struct xfs_owner_info oinfo; xfs_agblock_t cow_blocks = 0; int error; - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_REFC); error = xchk_btree(sc, sc->sa.refc_cur, xchk_refcountbt_rec, - &oinfo, &cow_blocks); + &XFS_RMAP_OINFO_REFC, &cow_blocks); if (error) return error; - xchk_refcount_xref_rmap(sc, &oinfo, cow_blocks); + xchk_refcount_xref_rmap(sc, cow_blocks); return 0; } diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 4fc0a5ea7673..1c8eecfe52b8 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -299,14 +299,14 @@ xrep_calc_ag_resblks( /* Allocate a block in an AG. */ int xrep_alloc_ag_block( - struct xfs_scrub *sc, - struct xfs_owner_info *oinfo, - xfs_fsblock_t *fsbno, - enum xfs_ag_resv_type resv) + struct xfs_scrub *sc, + const struct xfs_owner_info *oinfo, + xfs_fsblock_t *fsbno, + enum xfs_ag_resv_type resv) { - struct xfs_alloc_arg args = {0}; - xfs_agblock_t bno; - int error; + struct xfs_alloc_arg args = {0}; + xfs_agblock_t bno; + int error; switch (resv) { case XFS_AG_RESV_AGFL: @@ -505,7 +505,6 @@ xrep_put_freelist( struct xfs_scrub *sc, xfs_agblock_t agbno) { - struct xfs_owner_info oinfo; int error; /* Make sure there's space on the freelist. */ @@ -518,9 +517,8 @@ xrep_put_freelist( * create an rmap for the block prior to merging it or else other * parts will break. 
*/ - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, agbno, 1, - &oinfo); + &XFS_RMAP_OINFO_AG); if (error) return error; @@ -538,17 +536,17 @@ xrep_put_freelist( /* Dispose of a single block. */ STATIC int xrep_reap_block( - struct xfs_scrub *sc, - xfs_fsblock_t fsbno, - struct xfs_owner_info *oinfo, - enum xfs_ag_resv_type resv) + struct xfs_scrub *sc, + xfs_fsblock_t fsbno, + const struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type resv) { - struct xfs_btree_cur *cur; - struct xfs_buf *agf_bp = NULL; - xfs_agnumber_t agno; - xfs_agblock_t agbno; - bool has_other_rmap; - int error; + struct xfs_btree_cur *cur; + struct xfs_buf *agf_bp = NULL; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + bool has_other_rmap; + int error; agno = XFS_FSB_TO_AGNO(sc->mp, fsbno); agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); @@ -612,15 +610,15 @@ out_free: /* Dispose of every block of every extent in the bitmap. */ int xrep_reap_extents( - struct xfs_scrub *sc, - struct xfs_bitmap *bitmap, - struct xfs_owner_info *oinfo, - enum xfs_ag_resv_type type) + struct xfs_scrub *sc, + struct xfs_bitmap *bitmap, + const struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type type) { - struct xfs_bitmap_range *bmr; - struct xfs_bitmap_range *n; - xfs_fsblock_t fsbno; - int error = 0; + struct xfs_bitmap_range *bmr; + struct xfs_bitmap_range *n; + xfs_fsblock_t fsbno; + int error = 0; ASSERT(xfs_sb_version_hasrmapbt(&sc->mp->m_sb)); diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 9de321eee4ab..f2fc18bb7605 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -21,8 +21,9 @@ int xrep_roll_ag_trans(struct xfs_scrub *sc); bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks, enum xfs_ag_resv_type type); xfs_extlen_t xrep_calc_ag_resblks(struct xfs_scrub *sc); -int xrep_alloc_ag_block(struct xfs_scrub *sc, struct xfs_owner_info *oinfo, - xfs_fsblock_t *fsbno, enum xfs_ag_resv_type resv); +int xrep_alloc_ag_block(struct xfs_scrub *sc, + const struct xfs_owner_info *oinfo, xfs_fsblock_t *fsbno, + enum xfs_ag_resv_type resv); int xrep_init_btblock(struct xfs_scrub *sc, xfs_fsblock_t fsb, struct xfs_buf **bpp, xfs_btnum_t btnum, const struct xfs_buf_ops *ops); @@ -32,7 +33,7 @@ struct xfs_bitmap; int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink); int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xfs_bitmap *btlist); int xrep_reap_extents(struct xfs_scrub *sc, struct xfs_bitmap *exlist, - struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); + const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); struct xrep_find_ag_btree { /* in: rmap owner of the btree we're looking for */ diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index 5e293c129813..92a140c5b55e 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -174,24 +174,21 @@ int xchk_rmapbt( struct xfs_scrub *sc) { - struct xfs_owner_info oinfo; - - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); return xchk_btree(sc, sc->sa.rmap_cur, xchk_rmapbt_rec, - &oinfo, NULL); + &XFS_RMAP_OINFO_AG, NULL); } /* xref check that the extent is owned by a given owner */ static inline void xchk_xref_check_owner( - struct xfs_scrub *sc, - xfs_agblock_t bno, - xfs_extlen_t len, - struct xfs_owner_info *oinfo, - bool should_have_rmap) + struct xfs_scrub *sc, + xfs_agblock_t bno, + xfs_extlen_t len, + const struct xfs_owner_info *oinfo, + bool should_have_rmap) { - bool has_rmap; - int error; + bool has_rmap; + int error; if (!sc->sa.rmap_cur || 
xchk_skip_xref(sc->sm)) return; @@ -207,10 +204,10 @@ xchk_xref_check_owner( /* xref check that the extent is owned by a given owner */ void xchk_xref_is_owned_by( - struct xfs_scrub *sc, - xfs_agblock_t bno, - xfs_extlen_t len, - struct xfs_owner_info *oinfo) + struct xfs_scrub *sc, + xfs_agblock_t bno, + xfs_extlen_t len, + const struct xfs_owner_info *oinfo) { xchk_xref_check_owner(sc, bno, len, oinfo, true); } @@ -218,10 +215,10 @@ xchk_xref_is_owned_by( /* xref check that the extent is not owned by a given owner */ void xchk_xref_is_not_owned_by( - struct xfs_scrub *sc, - xfs_agblock_t bno, - xfs_extlen_t len, - struct xfs_owner_info *oinfo) + struct xfs_scrub *sc, + xfs_agblock_t bno, + xfs_extlen_t len, + const struct xfs_owner_info *oinfo) { xchk_xref_check_owner(sc, bno, len, oinfo, false); } diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index af323b229c4b..22f754fba8e5 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -122,9 +122,9 @@ void xchk_xref_is_not_inode_chunk(struct xfs_scrub *sc, xfs_agblock_t agbno, void xchk_xref_is_inode_chunk(struct xfs_scrub *sc, xfs_agblock_t agbno, xfs_extlen_t len); void xchk_xref_is_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno, - xfs_extlen_t len, struct xfs_owner_info *oinfo); + xfs_extlen_t len, const struct xfs_owner_info *oinfo); void xchk_xref_is_not_owned_by(struct xfs_scrub *sc, xfs_agblock_t agbno, - xfs_extlen_t len, struct xfs_owner_info *oinfo); + xfs_extlen_t len, const struct xfs_owner_info *oinfo); void xchk_xref_has_no_owner(struct xfs_scrub *sc, xfs_agblock_t agbno, xfs_extlen_t len); void xchk_xref_is_cow_staging(struct xfs_scrub *sc, xfs_agblock_t bno, diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 4e20f0e48232..8344b14031ef 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -12,6 +12,71 @@ #include <linux/tracepoint.h> #include "xfs_bit.h" +/* + * ftrace's __print_symbolic requires that all enum values be wrapped in the + * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace + * ring buffer. Somehow this was only worth mentioning in the ftrace sample + * code. 
+ */ +TRACE_DEFINE_ENUM(XFS_BTNUM_BNOi); +TRACE_DEFINE_ENUM(XFS_BTNUM_CNTi); +TRACE_DEFINE_ENUM(XFS_BTNUM_BMAPi); +TRACE_DEFINE_ENUM(XFS_BTNUM_INOi); +TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi); +TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi); +TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi); + +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PROBE); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_SB); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGF); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGFL); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_AGI); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BNOBT); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_CNTBT); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_INOBT); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FINOBT); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RMAPBT); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_REFCNTBT); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_INODE); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BMBTD); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BMBTA); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_BMBTC); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_DIR); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_XATTR); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_SYMLINK); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PARENT); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTBITMAP); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTSUM); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_UQUOTA); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_GQUOTA); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PQUOTA); + +#define XFS_SCRUB_TYPE_STRINGS \ + { XFS_SCRUB_TYPE_PROBE, "probe" }, \ + { XFS_SCRUB_TYPE_SB, "sb" }, \ + { XFS_SCRUB_TYPE_AGF, "agf" }, \ + { XFS_SCRUB_TYPE_AGFL, "agfl" }, \ + { XFS_SCRUB_TYPE_AGI, "agi" }, \ + { XFS_SCRUB_TYPE_BNOBT, "bnobt" }, \ + { XFS_SCRUB_TYPE_CNTBT, "cntbt" }, \ + { XFS_SCRUB_TYPE_INOBT, "inobt" }, \ + { XFS_SCRUB_TYPE_FINOBT, "finobt" }, \ + { XFS_SCRUB_TYPE_RMAPBT, "rmapbt" }, \ + { XFS_SCRUB_TYPE_REFCNTBT, "refcountbt" }, \ + { XFS_SCRUB_TYPE_INODE, "inode" }, \ + { XFS_SCRUB_TYPE_BMBTD, "bmapbtd" }, \ + { XFS_SCRUB_TYPE_BMBTA, "bmapbta" }, \ + { XFS_SCRUB_TYPE_BMBTC, "bmapbtc" }, \ + { XFS_SCRUB_TYPE_DIR, "directory" }, \ + { XFS_SCRUB_TYPE_XATTR, "xattr" }, \ + { XFS_SCRUB_TYPE_SYMLINK, "symlink" }, \ + { XFS_SCRUB_TYPE_PARENT, "parent" }, \ + { XFS_SCRUB_TYPE_RTBITMAP, "rtbitmap" }, \ + { XFS_SCRUB_TYPE_RTSUM, "rtsummary" }, \ + { XFS_SCRUB_TYPE_UQUOTA, "usrquota" }, \ + { XFS_SCRUB_TYPE_GQUOTA, "grpquota" }, \ + { XFS_SCRUB_TYPE_PQUOTA, "prjquota" } + DECLARE_EVENT_CLASS(xchk_class, TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm, int error), @@ -36,10 +101,10 @@ DECLARE_EVENT_CLASS(xchk_class, __entry->flags = sm->sm_flags; __entry->error = error; ), - TP_printk("dev %d:%d ino 0x%llx type %u agno %u inum %llu gen %u flags 0x%x error %d", + TP_printk("dev %d:%d ino 0x%llx type %s agno %u inum %llu gen %u flags 0x%x error %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, - __entry->type, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __entry->agno, __entry->inum, __entry->gen, @@ -78,9 +143,9 @@ TRACE_EVENT(xchk_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %u agno %u agbno %u error %d ret_ip %pS", + TP_printk("dev %d:%d type %s agno %u agbno %u error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __entry->agno, __entry->bno, __entry->error, @@ -109,11 +174,11 @@ TRACE_EVENT(xchk_file_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino 0x%llx fork %d type %u offset %llu error %d ret_ip %pS", + TP_printk("dev %d:%d ino 0x%llx fork %d type %s offset %llu error %d ret_ip %pS", MAJOR(__entry->dev), 
MINOR(__entry->dev), __entry->ino, __entry->whichfork, - __entry->type, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __entry->offset, __entry->error, __entry->ret_ip) @@ -144,9 +209,9 @@ DECLARE_EVENT_CLASS(xchk_block_error_class, __entry->bno = bno; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %u agno %u agbno %u ret_ip %pS", + TP_printk("dev %d:%d type %s agno %u agbno %u ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __entry->agno, __entry->bno, __entry->ret_ip) @@ -176,10 +241,10 @@ DECLARE_EVENT_CLASS(xchk_ino_error_class, __entry->type = sc->sm->sm_type; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino 0x%llx type %u ret_ip %pS", + TP_printk("dev %d:%d ino 0x%llx type %s ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, - __entry->type, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __entry->ret_ip) ) @@ -213,11 +278,11 @@ DECLARE_EVENT_CLASS(xchk_fblock_error_class, __entry->offset = offset; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino 0x%llx fork %d type %u offset %llu ret_ip %pS", + TP_printk("dev %d:%d ino 0x%llx fork %d type %s offset %llu ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->whichfork, - __entry->type, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __entry->offset, __entry->ret_ip) ); @@ -244,9 +309,9 @@ TRACE_EVENT(xchk_incomplete, __entry->type = sc->sm->sm_type; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %u ret_ip %pS", + TP_printk("dev %d:%d type %s ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __entry->ret_ip) ); @@ -278,10 +343,10 @@ TRACE_EVENT(xchk_btree_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %u btnum %d level %d ptr %d agno %u agbno %u error %d ret_ip %pS", + TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno %u agbno %u error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, - __entry->btnum, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), __entry->level, __entry->ptr, __entry->agno, @@ -321,12 +386,12 @@ TRACE_EVENT(xchk_ifork_btree_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino 0x%llx fork %d type %u btnum %d level %d ptr %d agno %u agbno %u error %d ret_ip %pS", + TP_printk("dev %d:%d ino 0x%llx fork %d type %s btree %s level %d ptr %d agno %u agbno %u error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->whichfork, - __entry->type, - __entry->btnum, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), __entry->level, __entry->ptr, __entry->agno, @@ -360,10 +425,10 @@ TRACE_EVENT(xchk_btree_error, __entry->ptr = cur->bc_ptrs[level]; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %u btnum %d level %d ptr %d agno %u agbno %u ret_ip %pS", + TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno %u agbno %u ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, - __entry->btnum, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), __entry->level, __entry->ptr, __entry->agno, @@ -400,12 +465,12 @@ TRACE_EVENT(xchk_ifork_btree_error, __entry->ptr = cur->bc_ptrs[level]; __entry->ret_ip = 
ret_ip; ), - TP_printk("dev %d:%d ino 0x%llx fork %d type %u btnum %d level %d ptr %d agno %u agbno %u ret_ip %pS", + TP_printk("dev %d:%d ino 0x%llx fork %d type %s btree %s level %d ptr %d agno %u agbno %u ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->whichfork, - __entry->type, - __entry->btnum, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), __entry->level, __entry->ptr, __entry->agno, @@ -439,10 +504,10 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class, __entry->nlevels = cur->bc_nlevels; __entry->ptr = cur->bc_ptrs[level]; ), - TP_printk("dev %d:%d type %u btnum %d agno %u agbno %u level %d nlevels %d ptr %d", + TP_printk("dev %d:%d type %s btree %s agno %u agbno %u level %d nlevels %d ptr %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, - __entry->btnum, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), __entry->agno, __entry->bno, __entry->level, @@ -473,9 +538,9 @@ TRACE_EVENT(xchk_xref_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %u xref error %d ret_ip %pF", + TP_printk("dev %d:%d type %s xref error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __entry->error, __entry->ret_ip) ); @@ -598,11 +663,11 @@ TRACE_EVENT(xrep_init_btblock, __entry->agbno = agbno; __entry->btnum = btnum; ), - TP_printk("dev %d:%d agno %u agbno %u btnum %d", + TP_printk("dev %d:%d agno %u agbno %u btree %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, - __entry->btnum) + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS)) ) TRACE_EVENT(xrep_findroot_block, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 494b4338446e..e5c23948a8ab 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -10,6 +10,9 @@ extern struct bio_set xfs_ioend_bioset; /* * Types of I/O for bmap clustering and I/O completion tracking. + * + * This enum is used in string mapping in xfs_trace.h; please keep the + * TRACE_DEFINE_ENUMs for it up to date. 
*/ enum { XFS_IO_HOLE, /* covers region without any block allocation */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index d9da66c718bb..74ddf66f4cfe 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -494,7 +494,6 @@ xfs_efi_recover( int error = 0; xfs_extent_t *extp; xfs_fsblock_t startblock_fsb; - struct xfs_owner_info oinfo; ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)); @@ -526,11 +525,11 @@ xfs_efi_recover( return error; efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); - xfs_rmap_any_owner_update(&oinfo); for (i = 0; i < efip->efi_format.efi_nextents; i++) { extp = &efip->efi_format.efi_extents[i]; error = xfs_trans_free_extent(tp, efdp, extp->ext_start, - extp->ext_len, &oinfo, false); + extp->ext_len, + &XFS_RMAP_OINFO_ANY_OWNER, false); if (error) goto abort_error; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 093c2b8d7e20..ec2e63a7963b 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -252,7 +252,7 @@ xfs_growfs_data( if (mp->m_sb.sb_imax_pct) { uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct; do_div(icount, 100); - mp->m_maxicount = icount << mp->m_sb.sb_inopblog; + mp->m_maxicount = XFS_FSB_TO_INO(mp, icount); } else mp->m_maxicount = 0; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 05db9540e459..ae667ba74a1c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2184,8 +2184,6 @@ xfs_ifree_cluster( struct xfs_icluster *xic) { xfs_mount_t *mp = free_ip->i_mount; - int blks_per_cluster; - int inodes_per_cluster; int nbufs; int i, j; int ioffset; @@ -2199,11 +2197,9 @@ xfs_ifree_cluster( inum = xic->first_ino; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); - blks_per_cluster = xfs_icluster_size_fsb(mp); - inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; - nbufs = mp->m_ialloc_blks / blks_per_cluster; + nbufs = mp->m_ialloc_blks / mp->m_blocks_per_cluster; - for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) { + for (j = 0; j < nbufs; j++, inum += mp->m_inodes_per_cluster) { /* * The allocation bitmap tells us which inodes of the chunk were * physically allocated. Skip the cluster if an inode falls into @@ -2211,7 +2207,7 @@ xfs_ifree_cluster( */ ioffset = inum - xic->first_ino; if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) { - ASSERT(ioffset % inodes_per_cluster == 0); + ASSERT(ioffset % mp->m_inodes_per_cluster == 0); continue; } @@ -2227,7 +2223,7 @@ xfs_ifree_cluster( * to mark all the active inodes on the buffer stale. */ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, - mp->m_bsize * blks_per_cluster, + mp->m_bsize * mp->m_blocks_per_cluster, XBF_UNMAPPED); if (!bp) @@ -2242,7 +2238,7 @@ xfs_ifree_cluster( * want it to fail. We can acheive this by adding a write * verifier to the buffer. */ - bp->b_ops = &xfs_inode_buf_ops; + bp->b_ops = &xfs_inode_buf_ops; /* * Walk the inodes already attached to the buffer and mark them @@ -2274,7 +2270,7 @@ xfs_ifree_cluster( * transaction stale above, which means there is no point in * even trying to lock them. */ - for (i = 0; i < inodes_per_cluster; i++) { + for (i = 0; i < mp->m_inodes_per_cluster; i++) { retry: rcu_read_lock(); ip = radix_tree_lookup(&pag->pag_ici_root, diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index fba115f4103a..5001dca361e9 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -241,6 +241,32 @@ xfs_compat_ioc_bulkstat( int done; int error; + /* + * Output structure handling functions. 
Depending on the command, + * either the xfs_bstat or xfs_inogrp structures are written out + * to userspace memory via bulkreq.ubuffer. Normally the compat + * functions and structure size are the correct ones to use ... + */ + inumbers_fmt_pf inumbers_func = xfs_inumbers_fmt_compat; + bulkstat_one_pf bs_one_func = xfs_bulkstat_one_compat; + size_t bs_one_size = sizeof(struct compat_xfs_bstat); + +#ifdef CONFIG_X86_X32 + if (in_x32_syscall()) { + /* + * ... but on x32 the input xfs_fsop_bulkreq has pointers + * which must be handled in the "compat" (32-bit) way, while + * the xfs_bstat and xfs_inogrp structures follow native 64- + * bit layout convention. So adjust accordingly, otherwise + * the data written out in compat layout will not match what + * x32 userspace expects. + */ + inumbers_func = xfs_inumbers_fmt; + bs_one_func = xfs_bulkstat_one; + bs_one_size = sizeof(struct xfs_bstat); + } +#endif + /* done = 1 if there are more stats to get and if bulkstat */ /* should be called again (unused here, but used in dmapi) */ @@ -272,15 +298,15 @@ xfs_compat_ioc_bulkstat( if (cmd == XFS_IOC_FSINUMBERS_32) { error = xfs_inumbers(mp, &inlast, &count, - bulkreq.ubuffer, xfs_inumbers_fmt_compat); + bulkreq.ubuffer, inumbers_func); } else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE_32) { int res; - error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer, - sizeof(compat_xfs_bstat_t), NULL, &res); + error = bs_one_func(mp, inlast, bulkreq.ubuffer, + bs_one_size, NULL, &res); } else if (cmd == XFS_IOC_FSBULKSTAT_32) { error = xfs_bulkstat(mp, &inlast, &count, - xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t), + bs_one_func, bs_one_size, bulkreq.ubuffer, &done); } else error = -EINVAL; @@ -336,6 +362,7 @@ xfs_compat_attrlist_by_handle( { int error; attrlist_cursor_kern_t *cursor; + compat_xfs_fsop_attrlist_handlereq_t __user *p = arg; compat_xfs_fsop_attrlist_handlereq_t al_hreq; struct dentry *dentry; char *kbuf; @@ -370,6 +397,11 @@ xfs_compat_attrlist_by_handle( if (error) goto out_kfree; + if (copy_to_user(&p->pos, cursor, sizeof(attrlist_cursor_kern_t))) { + error = -EFAULT; + goto out_kfree; + } + if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen)) error = -EFAULT; @@ -547,8 +579,12 @@ xfs_file_compat_ioctl( case FS_IOC_GETFSMAP: case XFS_IOC_SCRUB_METADATA: return xfs_file_ioctl(filp, cmd, p); -#ifndef BROKEN_X86_ALIGNMENT - /* These are handled fine if no alignment issues */ +#if !defined(BROKEN_X86_ALIGNMENT) || defined(CONFIG_X86_X32) + /* + * These are handled fine if no alignment issues. To support x32 + * which uses native 64-bit alignment we must emit these cases in + * addition to the ia-32 compat set below. + */ case XFS_IOC_ALLOCSP: case XFS_IOC_FREESP: case XFS_IOC_RESVSP: @@ -561,8 +597,16 @@ xfs_file_compat_ioctl( case XFS_IOC_FSGROWFSDATA: case XFS_IOC_FSGROWFSRT: case XFS_IOC_ZERO_RANGE: +#ifdef CONFIG_X86_X32 + /* + * x32 special: this gets a different cmd number from the ia-32 compat + * case below; the associated data will match native 64-bit alignment. 
+ */ + case XFS_IOC_SWAPEXT: +#endif return xfs_file_ioctl(filp, cmd, p); -#else +#endif +#if defined(BROKEN_X86_ALIGNMENT) case XFS_IOC_ALLOCSP_32: case XFS_IOC_FREESP_32: case XFS_IOC_ALLOCSP64_32: diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index e9508ba01ed1..942e4aa5e729 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -167,20 +167,18 @@ xfs_bulkstat_ichunk_ra( { xfs_agblock_t agbno; struct blk_plug plug; - int blks_per_cluster; - int inodes_per_cluster; int i; /* inode chunk index */ agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino); - blks_per_cluster = xfs_icluster_size_fsb(mp); - inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; blk_start_plug(&plug); for (i = 0; i < XFS_INODES_PER_CHUNK; - i += inodes_per_cluster, agbno += blks_per_cluster) { - if (xfs_inobt_maskn(i, inodes_per_cluster) & ~irec->ir_free) { - xfs_btree_reada_bufs(mp, agno, agbno, blks_per_cluster, - &xfs_inode_buf_ops); + i += mp->m_inodes_per_cluster, agbno += mp->m_blocks_per_cluster) { + if (xfs_inobt_maskn(i, mp->m_inodes_per_cluster) & + ~irec->ir_free) { + xfs_btree_reada_bufs(mp, agno, agbno, + mp->m_blocks_per_cluster, + &xfs_inode_buf_ops); } } blk_finish_plug(&plug); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 1fc9e9042e0e..9fe88d125f0a 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3850,7 +3850,6 @@ xlog_recover_do_icreate_pass2( unsigned int count; unsigned int isize; xfs_agblock_t length; - int blks_per_cluster; int bb_per_cluster; int cancel_count; int nbufs; @@ -3918,14 +3917,13 @@ xlog_recover_do_icreate_pass2( * buffers for cancellation so we don't overwrite anything written after * a cancellation. */ - blks_per_cluster = xfs_icluster_size_fsb(mp); - bb_per_cluster = XFS_FSB_TO_BB(mp, blks_per_cluster); - nbufs = length / blks_per_cluster; + bb_per_cluster = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster); + nbufs = length / mp->m_blocks_per_cluster; for (i = 0, cancel_count = 0; i < nbufs; i++) { xfs_daddr_t daddr; daddr = XFS_AGB_TO_DADDR(mp, agno, - agbno + i * blks_per_cluster); + agbno + i * mp->m_blocks_per_cluster); if (xlog_check_buffer_cancelled(log, daddr, bb_per_cluster, 0)) cancel_count++; } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 02d15098dbee..b4d8c318be3c 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -798,6 +798,10 @@ xfs_mountfs( if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size)) mp->m_inode_cluster_size = new_size; } + mp->m_blocks_per_cluster = xfs_icluster_size_fsb(mp); + mp->m_inodes_per_cluster = XFS_FSB_TO_INO(mp, mp->m_blocks_per_cluster); + mp->m_cluster_align = xfs_ialloc_cluster_alignment(mp); + mp->m_cluster_align_inodes = XFS_FSB_TO_INO(mp, mp->m_cluster_align); /* * If enabled, sparse inode chunk alignment is expected to match the diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 7964513c3128..7daafe064af8 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -89,6 +89,13 @@ typedef struct xfs_mount { int m_logbsize; /* size of each log buffer */ uint m_rsumlevels; /* rt summary levels */ uint m_rsumsize; /* size of rt summary, bytes */ + /* + * Optional cache of rt summary level per bitmap block with the + * invariant that m_rsum_cache[bbno] <= the minimum i for which + * rsum[i][bbno] != 0. Reads and writes are serialized by the rsumip + * inode lock. 
+ */ + uint8_t *m_rsum_cache; struct xfs_inode *m_rbmip; /* pointer to bitmap inode */ struct xfs_inode *m_rsumip; /* pointer to summary inode */ struct xfs_inode *m_rootip; /* pointer to root directory */ @@ -101,6 +108,10 @@ typedef struct xfs_mount { uint8_t m_agno_log; /* log #ag's */ uint8_t m_agino_log; /* #bits for agino in inum */ uint m_inode_cluster_size;/* min inode buf size */ + unsigned int m_inodes_per_cluster; + unsigned int m_blocks_per_cluster; + unsigned int m_cluster_align; + unsigned int m_cluster_align_inodes; uint m_blockmask; /* sb_blocksize-1 */ uint m_blockwsize; /* sb_blocksize in words */ uint m_blockwmask; /* blockwsize-1 */ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 322a852ce284..c5b4fa004ca4 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -623,54 +623,47 @@ out: } /* - * Remap parts of a file's data fork after a successful CoW. + * Remap part of the CoW fork into the data fork. + * + * We aim to remap the range starting at @offset_fsb and ending at @end_fsb + * into the data fork; this function will remap what it can (at the end of the + * range) and update @end_fsb appropriately. Each remap gets its own + * transaction because we can end up merging and splitting bmbt blocks for + * every remap operation and we'd like to keep the block reservation + * requirements as low as possible. */ -int -xfs_reflink_end_cow( - struct xfs_inode *ip, - xfs_off_t offset, - xfs_off_t count) +STATIC int +xfs_reflink_end_cow_extent( + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, + xfs_fileoff_t *end_fsb) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - struct xfs_bmbt_irec got, del; - struct xfs_trans *tp; - xfs_fileoff_t offset_fsb; - xfs_fileoff_t end_fsb; - int error; - unsigned int resblks; - xfs_filblks_t rlen; - struct xfs_iext_cursor icur; - - trace_xfs_reflink_end_cow(ip, offset, count); + struct xfs_bmbt_irec got, del; + struct xfs_iext_cursor icur; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + xfs_filblks_t rlen; + unsigned int resblks; + int error; /* No COW extents? That's easy! */ - if (ifp->if_bytes == 0) + if (ifp->if_bytes == 0) { + *end_fsb = offset_fsb; return 0; + } - offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); - end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); + resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, + XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp); + if (error) + return error; /* - * Start a rolling transaction to switch the mappings. We're - * unlikely ever to have to remap 16T worth of single-block - * extents, so just cap the worst case extent count to 2^32-1. - * Stick a warning in just in case, and avoid 64-bit division. + * Lock the inode. We have to ijoin without automatic unlock because + * the lead transaction is the refcountbt record deletion; the data + * fork update follows as a deferred log item. 
*/ - BUILD_BUG_ON(MAX_RW_COUNT > UINT_MAX); - if (end_fsb - offset_fsb > UINT_MAX) { - error = -EFSCORRUPTED; - xfs_force_shutdown(ip->i_mount, SHUTDOWN_CORRUPT_INCORE); - ASSERT(0); - goto out; - } - resblks = XFS_NEXTENTADD_SPACE_RES(ip->i_mount, - (unsigned int)(end_fsb - offset_fsb), - XFS_DATA_FORK); - error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, - resblks, 0, XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp); - if (error) - goto out; - xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); @@ -679,80 +672,131 @@ xfs_reflink_end_cow( * left by the time I/O completes for the loser of the race. In that * case we are done. */ - if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got)) + if (!xfs_iext_lookup_extent_before(ip, ifp, end_fsb, &icur, &got) || + got.br_startoff + got.br_blockcount <= offset_fsb) { + *end_fsb = offset_fsb; goto out_cancel; + } - /* Walk backwards until we're out of the I/O range... */ - while (got.br_startoff + got.br_blockcount > offset_fsb) { - del = got; - xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb); - - /* Extent delete may have bumped ext forward */ - if (!del.br_blockcount) - goto prev_extent; + /* + * Structure copy @got into @del, then trim @del to the range that we + * were asked to remap. We preserve @got for the eventual CoW fork + * deletion; from now on @del represents the mapping that we're + * actually remapping. + */ + del = got; + xfs_trim_extent(&del, offset_fsb, *end_fsb - offset_fsb); - /* - * Only remap real extent that contain data. With AIO - * speculatively preallocations can leak into the range we - * are called upon, and we need to skip them. - */ - if (!xfs_bmap_is_real_extent(&got)) - goto prev_extent; + ASSERT(del.br_blockcount > 0); - /* Unmap the old blocks in the data fork. */ - ASSERT(tp->t_firstblock == NULLFSBLOCK); - rlen = del.br_blockcount; - error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1); - if (error) - goto out_cancel; + /* + * Only remap real extents that contain data. With AIO, speculative + * preallocations can leak into the range we are called upon, and we + * need to skip them. + */ + if (!xfs_bmap_is_real_extent(&got)) { + *end_fsb = del.br_startoff; + goto out_cancel; + } - /* Trim the extent to whatever got unmapped. */ - if (rlen) { - xfs_trim_extent(&del, del.br_startoff + rlen, - del.br_blockcount - rlen); - } - trace_xfs_reflink_cow_remap(ip, &del); + /* Unmap the old blocks in the data fork. */ + rlen = del.br_blockcount; + error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1); + if (error) + goto out_cancel; - /* Free the CoW orphan record. */ - error = xfs_refcount_free_cow_extent(tp, del.br_startblock, - del.br_blockcount); - if (error) - goto out_cancel; + /* Trim the extent to whatever got unmapped. */ + xfs_trim_extent(&del, del.br_startoff + rlen, del.br_blockcount - rlen); + trace_xfs_reflink_cow_remap(ip, &del); - /* Map the new blocks into the data fork. */ - error = xfs_bmap_map_extent(tp, ip, &del); - if (error) - goto out_cancel; + /* Free the CoW orphan record. */ + error = xfs_refcount_free_cow_extent(tp, del.br_startblock, + del.br_blockcount); + if (error) + goto out_cancel; - /* Charge this new data fork mapping to the on-disk quota. */ - xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT, - (long)del.br_blockcount); + /* Map the new blocks into the data fork. */ + error = xfs_bmap_map_extent(tp, ip, &del); + if (error) + goto out_cancel; - /* Remove the mapping from the CoW fork. 
*/ - xfs_bmap_del_extent_cow(ip, &icur, &got, &del); + /* Charge this new data fork mapping to the on-disk quota. */ + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT, + (long)del.br_blockcount); - error = xfs_defer_finish(&tp); - if (error) - goto out_cancel; - if (!xfs_iext_get_extent(ifp, &icur, &got)) - break; - continue; -prev_extent: - if (!xfs_iext_prev_extent(ifp, &icur, &got)) - break; - } + /* Remove the mapping from the CoW fork. */ + xfs_bmap_del_extent_cow(ip, &icur, &got, &del); error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) - goto out; + return error; + + /* Update the caller about how much progress we made. */ + *end_fsb = del.br_startoff; return 0; out_cancel: xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); -out: - trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_); + return error; +} + +/* + * Remap parts of a file's data fork after a successful CoW. + */ +int +xfs_reflink_end_cow( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count) +{ + xfs_fileoff_t offset_fsb; + xfs_fileoff_t end_fsb; + int error = 0; + + trace_xfs_reflink_end_cow(ip, offset, count); + + offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); + + /* + * Walk backwards until we're out of the I/O range. The loop function + * repeatedly cycles the ILOCK to allocate one transaction per remapped + * extent. + * + * If we're being called by writeback then the pages will still + * have PageWriteback set, which prevents races with reflink remapping + * and truncate. Reflink remapping prevents races with writeback by + * taking the iolock and mmaplock before flushing the pages and + * remapping, which means there won't be any further writeback or page + * cache dirtying until the reflink completes. + * + * We should never have two threads issuing writeback for the same file + * region. There are also post-eof checks in the writeback + * preparation code so that we don't bother writing out pages that are + * about to be truncated. + * + * If we're being called as part of directio write completion, the dio + * count is still elevated, which reflink and truncate will wait for. + * Reflink remapping takes the iolock and mmaplock and waits for + * pending dio to finish, which should prevent any directio until the + * remap completes. Multiple concurrent directio writes to the same + * region are handled by end_cow processing only occurring for the + * threads which succeed; the outcome of multiple overlapping direct + * writes is not well defined anyway. + * + * It's possible that a buffered write and a direct write could collide + * here (the buffered write stumbles in after the dio flushes and + * invalidates the page cache and immediately queues writeback), but we + * have never supported this 100%. If either disk write succeeds the + * blocks will be remapped. + */ + while (end_fsb > offset_fsb && !error) + error = xfs_reflink_end_cow_extent(ip, offset_fsb, &end_fsb); + + if (error) + trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_); return error; } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 926ed314ffba..ac0fcdad0c4e 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -64,8 +64,12 @@ xfs_rtany_summary( int log; /* loop counter, log2 of ext. size */ xfs_suminfo_t sum; /* summary data */ + /* There are no extents at levels < m_rsum_cache[bbno].
*/ + if (mp->m_rsum_cache && low < mp->m_rsum_cache[bbno]) + low = mp->m_rsum_cache[bbno]; + /* - * Loop over logs of extent sizes. Order is irrelevant. + * Loop over logs of extent sizes. */ for (log = low; log <= high; log++) { /* @@ -80,13 +84,17 @@ xfs_rtany_summary( */ if (sum) { *stat = 1; - return 0; + goto out; } } /* * Found nothing, return failure. */ *stat = 0; +out: + /* There were no extents at levels < log. */ + if (mp->m_rsum_cache && log > mp->m_rsum_cache[bbno]) + mp->m_rsum_cache[bbno] = log; return 0; } @@ -853,6 +861,21 @@ out_trans_cancel: return error; } +static void +xfs_alloc_rsum_cache( + xfs_mount_t *mp, /* file system mount structure */ + xfs_extlen_t rbmblocks) /* number of rt bitmap blocks */ +{ + /* + * The rsum cache is initialized to all zeroes, which is trivially a + * lower bound on the minimum level with any free extents. We can + * continue without the cache if it couldn't be allocated. + */ + mp->m_rsum_cache = kmem_zalloc_large(rbmblocks, KM_SLEEP); + if (!mp->m_rsum_cache) + xfs_warn(mp, "could not allocate realtime summary cache"); +} + /* * Visible (exported) functions. */ @@ -881,6 +904,7 @@ xfs_growfs_rt( xfs_extlen_t rsumblocks; /* current number of rt summary blks */ xfs_sb_t *sbp; /* old superblock */ xfs_fsblock_t sumbno; /* summary block number */ + uint8_t *rsum_cache; /* old summary cache */ sbp = &mp->m_sb; /* @@ -937,6 +961,11 @@ xfs_growfs_rt( error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_rsumip); if (error) return error; + + rsum_cache = mp->m_rsum_cache; + if (nrbmblocks != sbp->sb_rbmblocks) + xfs_alloc_rsum_cache(mp, nrbmblocks); + /* * Allocate a new (fake) mount/sb. */ @@ -1062,6 +1091,20 @@ error_cancel: */ kmem_free(nmp); + /* + * If we had to allocate a new rsum_cache, we either need to free the + * old one (if we succeeded) or free the new one and restore the old one + * (if there was an error). + */ + if (rsum_cache != mp->m_rsum_cache) { + if (error) { + kmem_free(mp->m_rsum_cache); + mp->m_rsum_cache = rsum_cache; + } else { + kmem_free(rsum_cache); + } + } + return error; } @@ -1187,8 +1230,8 @@ xfs_rtmount_init( } /* - * Get the bitmap and summary inodes into the mount structure - * at mount time. + * Get the bitmap and summary inodes and the summary cache into the mount + * structure at mount time. 
*/ int /* error */ xfs_rtmount_inodes( @@ -1198,19 +1241,18 @@ xfs_rtmount_inodes( xfs_sb_t *sbp; sbp = &mp->m_sb; - if (sbp->sb_rbmino == NULLFSINO) - return 0; error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip); if (error) return error; ASSERT(mp->m_rbmip != NULL); - ASSERT(sbp->sb_rsumino != NULLFSINO); + error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip); if (error) { xfs_irele(mp->m_rbmip); return error; } ASSERT(mp->m_rsumip != NULL); + xfs_alloc_rsum_cache(mp, sbp->sb_rbmblocks); return 0; } @@ -1218,6 +1260,7 @@ void xfs_rtunmount_inodes( struct xfs_mount *mp) { + kmem_free(mp->m_rsum_cache); if (mp->m_rbmip) xfs_irele(mp->m_rbmip); if (mp->m_rsumip) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index d3e6cd063688..c9097cb0b955 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -38,6 +38,7 @@ #include "xfs_refcount_item.h" #include "xfs_bmap_item.h" #include "xfs_reflink.h" +#include "xfs_defer.h" #include <linux/namei.h> #include <linux/dax.h> @@ -607,7 +608,7 @@ xfs_set_inode_alloc( } /* Get the last possible inode in the filesystem */ - agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0); + agino = XFS_AGB_TO_AGINO(mp, sbp->sb_agblocks - 1); ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino); /* @@ -1149,7 +1150,7 @@ xfs_fs_statfs( statp->f_bfree = fdblocks - mp->m_alloc_set_aside; statp->f_bavail = statp->f_bfree; - fakeinos = statp->f_bfree << sbp->sb_inopblog; + fakeinos = XFS_FSB_TO_INO(mp, statp->f_bfree); statp->f_files = min(icount + fakeinos, (uint64_t)XFS_MAXINUMBER); if (mp->m_maxicount) statp->f_files = min_t(typeof(statp->f_files), @@ -2085,11 +2086,6 @@ init_xfs_fs(void) printk(KERN_INFO XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n"); - xfs_extent_free_init_defer_op(); - xfs_rmap_update_init_defer_op(); - xfs_refcount_update_init_defer_op(); - xfs_bmap_update_init_defer_op(); - xfs_dir_startup(); error = xfs_init_zones(); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index a3e98c64b6e3..b2c1177c717f 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -192,6 +192,7 @@ xfs_symlink( pathlen = strlen(target_path); if (pathlen >= XFS_SYMLINK_MAXLEN) /* total string too long */ return -ENAMETOOLONG; + ASSERT(pathlen > 0); udqp = gdqp = NULL; prid = xfs_get_initial_prid(dp); @@ -378,6 +379,12 @@ out_release_inode: /* * Free a symlink that has blocks associated with it. + * + * Note: zero length symlinks are not allowed to exist. When we set the size to + * zero, also change it to a regular file so that it does not get written to + * disk as a zero length symlink. The inode is on the unlinked list already, so + * userspace cannot find this inode anymore, so this change is not user visible + * but allows us to catch corrupt zero-length symlinks in the verifiers. */ STATIC int xfs_inactive_symlink_rmt( @@ -412,13 +419,14 @@ xfs_inactive_symlink_rmt( xfs_trans_ijoin(tp, ip, 0); /* - * Lock the inode, fix the size, and join it to the transaction. - * Hold it so in the normal path, we still have it locked for - * the second transaction. In the error paths we need it + * Lock the inode, fix the size, turn it into a regular file and join it + * to the transaction. Hold it so in the normal path, we still have it + * locked for the second transaction. In the error paths we need it * held so the cancel won't rele it, see below. 
*/ size = (int)ip->i_d.di_size; ip->i_d.di_size = 0; + VFS_I(ip)->i_mode = (VFS_I(ip)->i_mode & ~S_IFMT) | S_IFREG; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); /* * Find the block(s) so we can inval and unmap them. @@ -494,17 +502,10 @@ xfs_inactive_symlink( return -EIO; xfs_ilock(ip, XFS_ILOCK_EXCL); - - /* - * Zero length symlinks _can_ exist. - */ pathlen = (int)ip->i_d.di_size; - if (!pathlen) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return 0; - } + ASSERT(pathlen); - if (pathlen < 0 || pathlen > XFS_SYMLINK_MAXLEN) { + if (pathlen <= 0 || pathlen > XFS_SYMLINK_MAXLEN) { xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)", __func__, (unsigned long long)ip->i_ino, pathlen); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -512,12 +513,12 @@ xfs_inactive_symlink( return -EFSCORRUPTED; } + /* + * Inline fork state gets removed by xfs_difree() so we have nothing to + * do here in that case. + */ if (ip->i_df.if_flags & XFS_IFINLINE) { - if (ip->i_df.if_bytes > 0) - xfs_idata_realloc(ip, -(ip->i_df.if_bytes), - XFS_DATA_FORK); xfs_iunlock(ip, XFS_ILOCK_EXCL); - ASSERT(ip->i_df.if_bytes == 0); return 0; } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 8a6532aae779..6fcc893dfc91 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -640,6 +640,16 @@ DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid); +/* + * ftrace's __print_symbolic requires that all enum values be wrapped in the + * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace + * ring buffer. Somehow this was only worth mentioning in the ftrace sample + * code. + */ +TRACE_DEFINE_ENUM(PE_SIZE_PTE); +TRACE_DEFINE_ENUM(PE_SIZE_PMD); +TRACE_DEFINE_ENUM(PE_SIZE_PUD); + TRACE_EVENT(xfs_filemap_fault, TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size, bool write_fault), @@ -1208,6 +1218,12 @@ DEFINE_EVENT(xfs_readpage_class, name, \ DEFINE_READPAGE_EVENT(xfs_vm_readpage); DEFINE_READPAGE_EVENT(xfs_vm_readpages); +TRACE_DEFINE_ENUM(XFS_IO_HOLE); +TRACE_DEFINE_ENUM(XFS_IO_DELALLOC); +TRACE_DEFINE_ENUM(XFS_IO_UNWRITTEN); +TRACE_DEFINE_ENUM(XFS_IO_OVERWRITE); +TRACE_DEFINE_ENUM(XFS_IO_COW); + DECLARE_EVENT_CLASS(xfs_imap_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, int type, struct xfs_bmbt_irec *irec), @@ -1885,11 +1901,11 @@ TRACE_EVENT(xfs_dir2_leafn_moveents, { 0, "target" }, \ { 1, "temp" } -#define XFS_INODE_FORMAT_STR \ - { 0, "invalid" }, \ - { 1, "local" }, \ - { 2, "extent" }, \ - { 3, "btree" } +TRACE_DEFINE_ENUM(XFS_DINODE_FMT_DEV); +TRACE_DEFINE_ENUM(XFS_DINODE_FMT_LOCAL); +TRACE_DEFINE_ENUM(XFS_DINODE_FMT_EXTENTS); +TRACE_DEFINE_ENUM(XFS_DINODE_FMT_BTREE); +TRACE_DEFINE_ENUM(XFS_DINODE_FMT_UUID); DECLARE_EVENT_CLASS(xfs_swap_extent_class, TP_PROTO(struct xfs_inode *ip, int which), @@ -2178,6 +2194,14 @@ DEFINE_DISCARD_EVENT(xfs_discard_exclude); DEFINE_DISCARD_EVENT(xfs_discard_busy); /* btree cursor events */ +TRACE_DEFINE_ENUM(XFS_BTNUM_BNOi); +TRACE_DEFINE_ENUM(XFS_BTNUM_CNTi); +TRACE_DEFINE_ENUM(XFS_BTNUM_BMAPi); +TRACE_DEFINE_ENUM(XFS_BTNUM_INOi); +TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi); +TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi); +TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi); + DECLARE_EVENT_CLASS(xfs_btree_cur_class, TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp), TP_ARGS(cur, level, bp), @@ -2197,9 +2221,9 @@ DECLARE_EVENT_CLASS(xfs_btree_cur_class, __entry->ptr = cur->bc_ptrs[level]; __entry->daddr = bp ? 
bp->b_bn : -1; ), - TP_printk("dev %d:%d btnum %d level %d/%d ptr %d daddr 0x%llx", + TP_printk("dev %d:%d btree %s level %d/%d ptr %d daddr 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->btnum, + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), __entry->level, __entry->nlevels, __entry->ptr, @@ -2276,7 +2300,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class, ), TP_fast_assign( __entry->dev = mp ? mp->m_super->s_dev : 0; - __entry->type = dfp->dfp_type->type; + __entry->type = dfp->dfp_type; __entry->intent = dfp->dfp_intent; __entry->committed = dfp->dfp_done != NULL; __entry->nr = dfp->dfp_count; @@ -2405,7 +2429,7 @@ DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_deferred); DECLARE_EVENT_CLASS(xfs_rmap_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten, - struct xfs_owner_info *oinfo), + const struct xfs_owner_info *oinfo), TP_ARGS(mp, agno, agbno, len, unwritten, oinfo), TP_STRUCT__entry( __field(dev_t, dev) @@ -2440,7 +2464,7 @@ DECLARE_EVENT_CLASS(xfs_rmap_class, DEFINE_EVENT(xfs_rmap_class, name, \ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ xfs_agblock_t agbno, xfs_extlen_t len, bool unwritten, \ - struct xfs_owner_info *oinfo), \ + const struct xfs_owner_info *oinfo), \ TP_ARGS(mp, agno, agbno, len, unwritten, oinfo)) /* simple AG-based error/%ip tracepoint class */ @@ -2610,10 +2634,9 @@ DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error); #define DEFINE_AG_EXTENT_EVENT(name) DEFINE_DISCARD_EVENT(name) /* ag btree lookup tracepoint class */ -#define XFS_AG_BTREE_CMP_FORMAT_STR \ - { XFS_LOOKUP_EQ, "eq" }, \ - { XFS_LOOKUP_LE, "le" }, \ - { XFS_LOOKUP_GE, "ge" } +TRACE_DEFINE_ENUM(XFS_LOOKUP_EQi); +TRACE_DEFINE_ENUM(XFS_LOOKUP_LEi); +TRACE_DEFINE_ENUM(XFS_LOOKUP_GEi); DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_lookup_t dir), diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index a0c5dbda18aa..c6e1c5704a8c 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -223,13 +223,13 @@ void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *); bool xfs_trans_buf_is_dirty(struct xfs_buf *bp); void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); -void xfs_extent_free_init_defer_op(void); struct xfs_efd_log_item *xfs_trans_get_efd(struct xfs_trans *, struct xfs_efi_log_item *, uint); int xfs_trans_free_extent(struct xfs_trans *, struct xfs_efd_log_item *, xfs_fsblock_t, - xfs_extlen_t, struct xfs_owner_info *, + xfs_extlen_t, + const struct xfs_owner_info *, bool); int xfs_trans_commit(struct xfs_trans *); int xfs_trans_roll(struct xfs_trans **); @@ -248,7 +248,6 @@ extern kmem_zone_t *xfs_trans_zone; /* rmap updates */ enum xfs_rmap_intent_type; -void xfs_rmap_update_init_defer_op(void); struct xfs_rud_log_item *xfs_trans_get_rud(struct xfs_trans *tp, struct xfs_rui_log_item *ruip); int xfs_trans_log_finish_rmap_update(struct xfs_trans *tp, @@ -260,7 +259,6 @@ int xfs_trans_log_finish_rmap_update(struct xfs_trans *tp, /* refcount updates */ enum xfs_refcount_intent_type; -void xfs_refcount_update_init_defer_op(void); struct xfs_cud_log_item *xfs_trans_get_cud(struct xfs_trans *tp, struct xfs_cui_log_item *cuip); int xfs_trans_log_finish_refcount_update(struct xfs_trans *tp, @@ -272,7 +270,6 @@ int xfs_trans_log_finish_refcount_update(struct xfs_trans *tp, /* mapping updates */ enum xfs_bmap_intent_type; -void xfs_bmap_update_init_defer_op(void); struct xfs_bud_log_item *xfs_trans_get_bud(struct 
xfs_trans *tp, struct xfs_bui_log_item *buip); int xfs_trans_log_finish_bmap_update(struct xfs_trans *tp, diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c index 741c558b2179..11cff449d055 100644 --- a/fs/xfs/xfs_trans_bmap.c +++ b/fs/xfs/xfs_trans_bmap.c @@ -17,6 +17,7 @@ #include "xfs_alloc.h" #include "xfs_bmap.h" #include "xfs_inode.h" +#include "xfs_defer.h" /* * This routine is called to allocate a "bmap update done" @@ -220,8 +221,7 @@ xfs_bmap_update_cancel_item( kmem_free(bmap); } -static const struct xfs_defer_op_type xfs_bmap_update_defer_type = { - .type = XFS_DEFER_OPS_TYPE_BMAP, +const struct xfs_defer_op_type xfs_bmap_update_defer_type = { .max_items = XFS_BUI_MAX_FAST_EXTENTS, .diff_items = xfs_bmap_update_diff_items, .create_intent = xfs_bmap_update_create_intent, @@ -231,10 +231,3 @@ static const struct xfs_defer_op_type xfs_bmap_update_defer_type = { .finish_item = xfs_bmap_update_finish_item, .cancel_item = xfs_bmap_update_cancel_item, }; - -/* Register the deferred op type. */ -void -xfs_bmap_update_init_defer_op(void) -{ - xfs_defer_init_op_type(&xfs_bmap_update_defer_type); -} diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index 855c0b651fd4..0710434eb240 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -18,6 +18,7 @@ #include "xfs_alloc.h" #include "xfs_bmap.h" #include "xfs_trace.h" +#include "xfs_defer.h" /* * This routine is called to allocate an "extent free done" @@ -52,19 +53,20 @@ xfs_trans_get_efd(struct xfs_trans *tp, */ int xfs_trans_free_extent( - struct xfs_trans *tp, - struct xfs_efd_log_item *efdp, - xfs_fsblock_t start_block, - xfs_extlen_t ext_len, - struct xfs_owner_info *oinfo, - bool skip_discard) + struct xfs_trans *tp, + struct xfs_efd_log_item *efdp, + xfs_fsblock_t start_block, + xfs_extlen_t ext_len, + const struct xfs_owner_info *oinfo, + bool skip_discard) { - struct xfs_mount *mp = tp->t_mountp; - uint next_extent; - xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, start_block); - xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, start_block); - struct xfs_extent *extp; - int error; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_extent *extp; + uint next_extent; + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, start_block); + xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, + start_block); + int error; trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len); @@ -206,8 +208,7 @@ xfs_extent_free_cancel_item( kmem_free(free); } -static const struct xfs_defer_op_type xfs_extent_free_defer_type = { - .type = XFS_DEFER_OPS_TYPE_FREE, +const struct xfs_defer_op_type xfs_extent_free_defer_type = { .max_items = XFS_EFI_MAX_FAST_EXTENTS, .diff_items = xfs_extent_free_diff_items, .create_intent = xfs_extent_free_create_intent, @@ -274,8 +275,7 @@ xfs_agfl_free_finish_item( /* sub-type with special handling for AGFL deferred frees */ -static const struct xfs_defer_op_type xfs_agfl_free_defer_type = { - .type = XFS_DEFER_OPS_TYPE_AGFL_FREE, +const struct xfs_defer_op_type xfs_agfl_free_defer_type = { .max_items = XFS_EFI_MAX_FAST_EXTENTS, .diff_items = xfs_extent_free_diff_items, .create_intent = xfs_extent_free_create_intent, @@ -285,11 +285,3 @@ static const struct xfs_defer_op_type xfs_agfl_free_defer_type = { .finish_item = xfs_agfl_free_finish_item, .cancel_item = xfs_extent_free_cancel_item, }; - -/* Register the deferred op type. 
*/ -void -xfs_extent_free_init_defer_op(void) -{ - xfs_defer_init_op_type(&xfs_extent_free_defer_type); - xfs_defer_init_op_type(&xfs_agfl_free_defer_type); -} diff --git a/fs/xfs/xfs_trans_refcount.c b/fs/xfs/xfs_trans_refcount.c index 523c55663954..6c947ff4faf6 100644 --- a/fs/xfs/xfs_trans_refcount.c +++ b/fs/xfs/xfs_trans_refcount.c @@ -16,6 +16,7 @@ #include "xfs_refcount_item.h" #include "xfs_alloc.h" #include "xfs_refcount.h" +#include "xfs_defer.h" /* * This routine is called to allocate a "refcount update done" @@ -227,8 +228,7 @@ xfs_refcount_update_cancel_item( kmem_free(refc); } -static const struct xfs_defer_op_type xfs_refcount_update_defer_type = { - .type = XFS_DEFER_OPS_TYPE_REFCOUNT, +const struct xfs_defer_op_type xfs_refcount_update_defer_type = { .max_items = XFS_CUI_MAX_FAST_EXTENTS, .diff_items = xfs_refcount_update_diff_items, .create_intent = xfs_refcount_update_create_intent, @@ -239,10 +239,3 @@ static const struct xfs_defer_op_type xfs_refcount_update_defer_type = { .finish_cleanup = xfs_refcount_update_finish_cleanup, .cancel_item = xfs_refcount_update_cancel_item, }; - -/* Register the deferred op type. */ -void -xfs_refcount_update_init_defer_op(void) -{ - xfs_defer_init_op_type(&xfs_refcount_update_defer_type); -} diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c index 05b00e40251f..a42890931ecd 100644 --- a/fs/xfs/xfs_trans_rmap.c +++ b/fs/xfs/xfs_trans_rmap.c @@ -16,6 +16,7 @@ #include "xfs_rmap_item.h" #include "xfs_alloc.h" #include "xfs_rmap.h" +#include "xfs_defer.h" /* Set the map extent flags for this reverse mapping. */ static void @@ -244,8 +245,7 @@ xfs_rmap_update_cancel_item( kmem_free(rmap); } -static const struct xfs_defer_op_type xfs_rmap_update_defer_type = { - .type = XFS_DEFER_OPS_TYPE_RMAP, +const struct xfs_defer_op_type xfs_rmap_update_defer_type = { .max_items = XFS_RUI_MAX_FAST_EXTENTS, .diff_items = xfs_rmap_update_diff_items, .create_intent = xfs_rmap_update_create_intent, @@ -256,10 +256,3 @@ static const struct xfs_defer_op_type xfs_rmap_update_defer_type = { .finish_cleanup = xfs_rmap_update_finish_cleanup, .cancel_item = xfs_rmap_update_cancel_item, }; - -/* Register the deferred op type. */ -void -xfs_rmap_update_init_defer_op(void) -{ - xfs_defer_init_op_type(&xfs_rmap_update_defer_type); -}
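The xfs_reflink.c hunks above split xfs_reflink_end_cow() into a caller loop and a helper, xfs_reflink_end_cow_extent(), that remaps exactly one extent per transaction and reports progress back through *end_fsb. The userspace sketch below is only a model of that control flow; the struct, function, and variable names are invented for illustration and are not kernel code:

#include <stdio.h>
#include <stdbool.h>

/* Illustrative stand-in for a file-offset extent mapping; not kernel code. */
struct extent {
	unsigned long long start;	/* first block of the mapping */
	unsigned long long count;	/* number of blocks */
	bool real;			/* written data vs. speculative prealloc */
};

/*
 * Model of the single-extent helper: handle the last extent overlapping
 * [start, *end), then pull *end back so the caller makes progress.
 */
static int end_cow_extent(struct extent *map, int nr,
			  unsigned long long start, unsigned long long *end)
{
	for (int i = nr - 1; i >= 0; i--) {
		struct extent *ex = &map[i];

		if (ex->start >= *end || ex->start + ex->count <= start)
			continue;

		if (ex->real)
			printf("remap [%llu, %llu)\n", ex->start,
			       ex->start + ex->count);
		/* Skip speculative preallocations, but still move the cursor. */
		*end = ex->start;
		return 0;
	}
	*end = start;		/* nothing left to do */
	return 0;
}

int main(void)
{
	struct extent cow[] = {
		{ .start = 0,  .count = 4,  .real = true  },
		{ .start = 8,  .count = 2,  .real = false },
		{ .start = 12, .count = 4,  .real = true  },
	};
	unsigned long long start = 0, end = 16;
	int error = 0;

	/* Mirrors the new caller: one helper invocation per extent. */
	while (end > start && !error)
		error = end_cow_extent(cow, 3, start, &end);
	return error;
}

The point of the shape, per the comment in the diff, is that each remap gets its own small transaction, so the block reservation only has to cover one extent's worth of bmbt merging and splitting rather than the whole range.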
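The xfs_rtalloc.c changes add m_rsum_cache, a per-bitmap-block lower bound on the smallest summary level that can contain a free extent, so xfs_rtany_summary() can skip levels it has already proven empty and xfs_alloc_rsum_cache() can start it at all zeroes. A rough userspace model of the same idea follows; array sizes, names, and the summary layout are illustrative assumptions, not the on-disk format:

#include <string.h>

#define MAX_LEVELS	32

/*
 * sum[bbno][log] != 0 means there is at least one free extent of size
 * 2^log blocks in bitmap block bbno.  cache[bbno] is a lower bound on
 * the smallest level that can be non-zero; zero (the initial value) is
 * trivially correct.
 */
static unsigned char cache[16];
static int sum[16][MAX_LEVELS];

/* Model of xfs_rtany_summary(): any free extent in levels [low, high]? */
static int any_summary(int bbno, int low, int high, int *stat)
{
	int log;

	/* There are no extents at levels < cache[bbno]. */
	if (low < cache[bbno])
		low = cache[bbno];

	for (log = low; log <= high; log++) {
		if (sum[bbno][log]) {
			*stat = 1;
			goto out;
		}
	}
	*stat = 0;
out:
	/* Raise the lower bound: nothing was found below this level. */
	if (log > cache[bbno])
		cache[bbno] = (unsigned char)log;
	return 0;
}

int main(void)
{
	int stat;

	memset(sum, 0, sizeof(sum));
	sum[3][5] = 1;			/* one free 32-block extent in bbno 3 */
	any_summary(3, 0, 7, &stat);	/* finds it; cache[3] becomes 5 */
	any_summary(3, 0, 4, &stat);	/* low is bumped to 5 > high: quick miss */
	return stat;			/* 0 */
}

Because the cache only ever stores a proven lower bound, a stale or zeroed entry is always safe: the worst case is scanning a few extra levels, never missing a free extent.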
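The xfs_trans_bmap.c, xfs_trans_extfree.c, xfs_trans_refcount.c, and xfs_trans_rmap.c hunks drop the xfs_*_init_defer_op() registration helpers and the .type member, and instead export each xfs_defer_op_type table as a non-static const so the defer code can reference the ops tables directly. A minimal sketch of that pattern, using invented names rather than the kernel API:

#include <stdio.h>

/* A tiny stand-in for struct xfs_defer_op_type. */
struct defer_op_type {
	const char *name;
	void (*finish_item)(void);
};

static void finish_bmap(void) { puts("finish bmap update"); }
static void finish_rmap(void) { puts("finish rmap update"); }

/* New style: const tables exported directly, no init-time registration. */
const struct defer_op_type bmap_update_defer_type = {
	.name = "bmap", .finish_item = finish_bmap,
};
const struct defer_op_type rmap_update_defer_type = {
	.name = "rmap", .finish_item = finish_rmap,
};

enum defer_ops_type { DEFER_OPS_TYPE_BMAP, DEFER_OPS_TYPE_RMAP, DEFER_OPS_TYPE_MAX };

/*
 * The consumer indexes a static array of pointers instead of consulting a
 * table that registration functions filled in at module init time.
 */
static const struct defer_op_type *defer_op_types[DEFER_OPS_TYPE_MAX] = {
	[DEFER_OPS_TYPE_BMAP] = &bmap_update_defer_type,
	[DEFER_OPS_TYPE_RMAP] = &rmap_update_defer_type,
};

int main(void)
{
	for (int i = 0; i < DEFER_OPS_TYPE_MAX; i++)
		defer_op_types[i]->finish_item();
	return 0;
}

This is why init_xfs_fs() in xfs_super.c above no longer needs the four registration calls: the ops tables are link-time constants rather than runtime state.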
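The xfs_trace.h hunks wrap the btree, lookup, and fork-format enums in TRACE_DEFINE_ENUM and switch the printk formats to __print_symbolic(), so the raw enum value goes into the ftrace ring buffer and is only decoded to a name when the trace is printed. The following userspace analogue shows just that decode step; it is not the ftrace API, and the numeric values in the table are illustrative:

#include <stdio.h>

/* Value -> name pairs, in the spirit of the { XFS_BTNUM_BNOi, "bno" } tables. */
struct sym { int value; const char *name; };

static const struct sym btnum_strings[] = {
	{ 0, "bno" }, { 1, "cnt" }, { 2, "rmap" }, { 3, "bmap" },
	{ 4, "ino" }, { 5, "fino" }, { 6, "refc" }, { -1, NULL },
};

/* Decode a recorded integer the way __print_symbolic() would at print time. */
static const char *print_symbolic(int value, const struct sym *syms)
{
	for (; syms->name; syms++)
		if (syms->value == value)
			return syms->name;
	return "unknown";
}

int main(void)
{
	int recorded_btnum = 5;	/* raw value as stored in the ring buffer */

	printf("btree %s\n", print_symbolic(recorded_btnum, btnum_strings));
	return 0;
}

Recording the integer and deferring the string lookup keeps the fast path cheap; TRACE_DEFINE_ENUM exists so the tooling can resolve the enum symbols used in those tables when it parses the event format.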