diff options
Diffstat (limited to 'fs')
133 files changed, 5471 insertions, 3956 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 0ad61c6a65a5..055562c580b4 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -33,6 +33,7 @@ #include <linux/pagemap.h> #include <linux/idr.h> #include <linux/sched.h> +#include <linux/aio.h> #include <net/9p/9p.h> #include <net/9p/client.h> diff --git a/fs/afs/write.c b/fs/afs/write.c index 7e03eadb40c0..a890db4b9898 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -14,6 +14,7 @@ #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/pagevec.h> +#include <linux/aio.h> #include "internal.h" static int afs_write_back_from_locked_page(struct afs_writeback *wb, @@ -8,6 +8,8 @@ * * See ../COPYING for licensing terms. */ +#define pr_fmt(fmt) "%s: " fmt, __func__ + #include <linux/kernel.h> #include <linux/init.h> #include <linux/errno.h> @@ -18,8 +20,6 @@ #include <linux/backing-dev.h> #include <linux/uio.h> -#define DEBUG 0 - #include <linux/sched.h> #include <linux/fs.h> #include <linux/file.h> @@ -39,11 +39,76 @@ #include <asm/kmap_types.h> #include <asm/uaccess.h> -#if DEBUG > 1 -#define dprintk printk -#else -#define dprintk(x...) do { ; } while (0) -#endif +#define AIO_RING_MAGIC 0xa10a10a1 +#define AIO_RING_COMPAT_FEATURES 1 +#define AIO_RING_INCOMPAT_FEATURES 0 +struct aio_ring { + unsigned id; /* kernel internal index number */ + unsigned nr; /* number of io_events */ + unsigned head; + unsigned tail; + + unsigned magic; + unsigned compat_features; + unsigned incompat_features; + unsigned header_length; /* size of aio_ring */ + + + struct io_event io_events[0]; +}; /* 128 bytes + ring size */ + +#define AIO_RING_PAGES 8 + +struct kioctx { + atomic_t users; + atomic_t dead; + + /* This needs improving */ + unsigned long user_id; + struct hlist_node list; + + /* + * This is what userspace passed to io_setup(), it's not used for + * anything but counting against the global max_reqs quota. + * + * The real limit is nr_events - 1, which will be larger (see + * aio_setup_ring()) + */ + unsigned max_reqs; + + /* Size of ringbuffer, in units of struct io_event */ + unsigned nr_events; + + unsigned long mmap_base; + unsigned long mmap_size; + + struct page **ring_pages; + long nr_pages; + + struct rcu_head rcu_head; + struct work_struct rcu_work; + + struct { + atomic_t reqs_active; + } ____cacheline_aligned_in_smp; + + struct { + spinlock_t ctx_lock; + struct list_head active_reqs; /* used for cancellation */ + } ____cacheline_aligned_in_smp; + + struct { + struct mutex ring_lock; + wait_queue_head_t wait; + } ____cacheline_aligned_in_smp; + + struct { + unsigned tail; + spinlock_t completion_lock; + } ____cacheline_aligned_in_smp; + + struct page *internal_pages[AIO_RING_PAGES]; +}; /*------ sysctl variables----*/ static DEFINE_SPINLOCK(aio_nr_lock); @@ -54,11 +119,6 @@ unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio request static struct kmem_cache *kiocb_cachep; static struct kmem_cache *kioctx_cachep; -static struct workqueue_struct *aio_wq; - -static void aio_kick_handler(struct work_struct *); -static void aio_queue_work(struct kioctx *); - /* aio_setup * Creates the slab caches used by the aio routines, panic on * failure as this is done early during the boot sequence. @@ -68,10 +128,7 @@ static int __init aio_setup(void) kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); - aio_wq = alloc_workqueue("aio", 0, 1); /* used to limit concurrency */ - BUG_ON(!aio_wq); - - pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); + pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page)); return 0; } @@ -79,28 +136,23 @@ __initcall(aio_setup); static void aio_free_ring(struct kioctx *ctx) { - struct aio_ring_info *info = &ctx->ring_info; long i; - for (i=0; i<info->nr_pages; i++) - put_page(info->ring_pages[i]); + for (i = 0; i < ctx->nr_pages; i++) + put_page(ctx->ring_pages[i]); - if (info->mmap_size) { - BUG_ON(ctx->mm != current->mm); - vm_munmap(info->mmap_base, info->mmap_size); - } + if (ctx->mmap_size) + vm_munmap(ctx->mmap_base, ctx->mmap_size); - if (info->ring_pages && info->ring_pages != info->internal_pages) - kfree(info->ring_pages); - info->ring_pages = NULL; - info->nr = 0; + if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) + kfree(ctx->ring_pages); } static int aio_setup_ring(struct kioctx *ctx) { struct aio_ring *ring; - struct aio_ring_info *info = &ctx->ring_info; unsigned nr_events = ctx->max_reqs; + struct mm_struct *mm = current->mm; unsigned long size, populate; int nr_pages; @@ -116,46 +168,44 @@ static int aio_setup_ring(struct kioctx *ctx) nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); - info->nr = 0; - info->ring_pages = info->internal_pages; + ctx->nr_events = 0; + ctx->ring_pages = ctx->internal_pages; if (nr_pages > AIO_RING_PAGES) { - info->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); - if (!info->ring_pages) + ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), + GFP_KERNEL); + if (!ctx->ring_pages) return -ENOMEM; } - info->mmap_size = nr_pages * PAGE_SIZE; - dprintk("attempting mmap of %lu bytes\n", info->mmap_size); - down_write(&ctx->mm->mmap_sem); - info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, - PROT_READ|PROT_WRITE, - MAP_ANONYMOUS|MAP_PRIVATE, 0, - &populate); - if (IS_ERR((void *)info->mmap_base)) { - up_write(&ctx->mm->mmap_sem); - info->mmap_size = 0; + ctx->mmap_size = nr_pages * PAGE_SIZE; + pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size); + down_write(&mm->mmap_sem); + ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size, + PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate); + if (IS_ERR((void *)ctx->mmap_base)) { + up_write(&mm->mmap_sem); + ctx->mmap_size = 0; aio_free_ring(ctx); return -EAGAIN; } - dprintk("mmap address: 0x%08lx\n", info->mmap_base); - info->nr_pages = get_user_pages(current, ctx->mm, - info->mmap_base, nr_pages, - 1, 0, info->ring_pages, NULL); - up_write(&ctx->mm->mmap_sem); + pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); + ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages, + 1, 0, ctx->ring_pages, NULL); + up_write(&mm->mmap_sem); - if (unlikely(info->nr_pages != nr_pages)) { + if (unlikely(ctx->nr_pages != nr_pages)) { aio_free_ring(ctx); return -EAGAIN; } if (populate) - mm_populate(info->mmap_base, populate); + mm_populate(ctx->mmap_base, populate); - ctx->user_id = info->mmap_base; + ctx->user_id = ctx->mmap_base; + ctx->nr_events = nr_events; /* trusted copy */ - info->nr = nr_events; /* trusted copy */ - - ring = kmap_atomic(info->ring_pages[0]); + ring = kmap_atomic(ctx->ring_pages[0]); ring->nr = nr_events; /* user copy */ ring->id = ctx->user_id; ring->head = ring->tail = 0; @@ -164,72 +214,133 @@ static int aio_setup_ring(struct kioctx *ctx) ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; ring->header_length = sizeof(struct aio_ring); kunmap_atomic(ring); + flush_dcache_page(ctx->ring_pages[0]); return 0; } - -/* aio_ring_event: returns a pointer to the event at the given index from - * kmap_atomic(). Release the pointer with put_aio_ring_event(); - */ #define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) -#define aio_ring_event(info, nr) ({ \ - unsigned pos = (nr) + AIO_EVENTS_OFFSET; \ - struct io_event *__event; \ - __event = kmap_atomic( \ - (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE]); \ - __event += pos % AIO_EVENTS_PER_PAGE; \ - __event; \ -}) - -#define put_aio_ring_event(event) do { \ - struct io_event *__event = (event); \ - (void)__event; \ - kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK)); \ -} while(0) - -static void ctx_rcu_free(struct rcu_head *head) +void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) +{ + struct kioctx *ctx = req->ki_ctx; + unsigned long flags; + + spin_lock_irqsave(&ctx->ctx_lock, flags); + + if (!req->ki_list.next) + list_add(&req->ki_list, &ctx->active_reqs); + + req->ki_cancel = cancel; + + spin_unlock_irqrestore(&ctx->ctx_lock, flags); +} +EXPORT_SYMBOL(kiocb_set_cancel_fn); + +static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb, + struct io_event *res) +{ + kiocb_cancel_fn *old, *cancel; + int ret = -EINVAL; + + /* + * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it + * actually has a cancel function, hence the cmpxchg() + */ + + cancel = ACCESS_ONCE(kiocb->ki_cancel); + do { + if (!cancel || cancel == KIOCB_CANCELLED) + return ret; + + old = cancel; + cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED); + } while (cancel != old); + + atomic_inc(&kiocb->ki_users); + spin_unlock_irq(&ctx->ctx_lock); + + memset(res, 0, sizeof(*res)); + res->obj = (u64)(unsigned long)kiocb->ki_obj.user; + res->data = kiocb->ki_user_data; + ret = cancel(kiocb, res); + + spin_lock_irq(&ctx->ctx_lock); + + return ret; +} + +static void free_ioctx_rcu(struct rcu_head *head) { struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); kmem_cache_free(kioctx_cachep, ctx); } -/* __put_ioctx - * Called when the last user of an aio context has gone away, - * and the struct needs to be freed. +/* + * When this function runs, the kioctx has been removed from the "hash table" + * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted - + * now it's safe to cancel any that need to be. */ -static void __put_ioctx(struct kioctx *ctx) +static void free_ioctx(struct kioctx *ctx) { - unsigned nr_events = ctx->max_reqs; - BUG_ON(ctx->reqs_active); + struct aio_ring *ring; + struct io_event res; + struct kiocb *req; + unsigned head, avail; - cancel_delayed_work_sync(&ctx->wq); - aio_free_ring(ctx); - mmdrop(ctx->mm); - ctx->mm = NULL; - if (nr_events) { - spin_lock(&aio_nr_lock); - BUG_ON(aio_nr - nr_events > aio_nr); - aio_nr -= nr_events; - spin_unlock(&aio_nr_lock); + spin_lock_irq(&ctx->ctx_lock); + + while (!list_empty(&ctx->active_reqs)) { + req = list_first_entry(&ctx->active_reqs, + struct kiocb, ki_list); + + list_del_init(&req->ki_list); + kiocb_cancel(ctx, req, &res); } - pr_debug("__put_ioctx: freeing %p\n", ctx); - call_rcu(&ctx->rcu_head, ctx_rcu_free); -} -static inline int try_get_ioctx(struct kioctx *kioctx) -{ - return atomic_inc_not_zero(&kioctx->users); + spin_unlock_irq(&ctx->ctx_lock); + + ring = kmap_atomic(ctx->ring_pages[0]); + head = ring->head; + kunmap_atomic(ring); + + while (atomic_read(&ctx->reqs_active) > 0) { + wait_event(ctx->wait, head != ctx->tail); + + avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; + + atomic_sub(avail, &ctx->reqs_active); + head += avail; + head %= ctx->nr_events; + } + + WARN_ON(atomic_read(&ctx->reqs_active) < 0); + + aio_free_ring(ctx); + + spin_lock(&aio_nr_lock); + BUG_ON(aio_nr - ctx->max_reqs > aio_nr); + aio_nr -= ctx->max_reqs; + spin_unlock(&aio_nr_lock); + + pr_debug("freeing %p\n", ctx); + + /* + * Here the call_rcu() is between the wait_event() for reqs_active to + * hit 0, and freeing the ioctx. + * + * aio_complete() decrements reqs_active, but it has to touch the ioctx + * after to issue a wakeup so we use rcu. + */ + call_rcu(&ctx->rcu_head, free_ioctx_rcu); } -static inline void put_ioctx(struct kioctx *kioctx) +static void put_ioctx(struct kioctx *ctx) { - BUG_ON(atomic_read(&kioctx->users) <= 0); - if (unlikely(atomic_dec_and_test(&kioctx->users))) - __put_ioctx(kioctx); + if (unlikely(atomic_dec_and_test(&ctx->users))) + free_ioctx(ctx); } /* ioctx_alloc @@ -237,7 +348,7 @@ static inline void put_ioctx(struct kioctx *kioctx) */ static struct kioctx *ioctx_alloc(unsigned nr_events) { - struct mm_struct *mm; + struct mm_struct *mm = current->mm; struct kioctx *ctx; int err = -ENOMEM; @@ -256,17 +367,15 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) return ERR_PTR(-ENOMEM); ctx->max_reqs = nr_events; - mm = ctx->mm = current->mm; - atomic_inc(&mm->mm_count); atomic_set(&ctx->users, 2); + atomic_set(&ctx->dead, 0); spin_lock_init(&ctx->ctx_lock); - spin_lock_init(&ctx->ring_info.ring_lock); + spin_lock_init(&ctx->completion_lock); + mutex_init(&ctx->ring_lock); init_waitqueue_head(&ctx->wait); INIT_LIST_HEAD(&ctx->active_reqs); - INIT_LIST_HEAD(&ctx->run_list); - INIT_DELAYED_WORK(&ctx->wq, aio_kick_handler); if (aio_setup_ring(ctx) < 0) goto out_freectx; @@ -286,64 +395,56 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) hlist_add_head_rcu(&ctx->list, &mm->ioctx_list); spin_unlock(&mm->ioctx_lock); - dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", - ctx, ctx->user_id, current->mm, ctx->ring_info.nr); + pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", + ctx, ctx->user_id, mm, ctx->nr_events); return ctx; out_cleanup: err = -EAGAIN; aio_free_ring(ctx); out_freectx: - mmdrop(mm); kmem_cache_free(kioctx_cachep, ctx); - dprintk("aio: error allocating ioctx %d\n", err); + pr_debug("error allocating ioctx %d\n", err); return ERR_PTR(err); } -/* kill_ctx - * Cancels all outstanding aio requests on an aio context. Used - * when the processes owning a context have all exited to encourage - * the rapid destruction of the kioctx. - */ -static void kill_ctx(struct kioctx *ctx) +static void kill_ioctx_work(struct work_struct *work) { - int (*cancel)(struct kiocb *, struct io_event *); - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - struct io_event res; + struct kioctx *ctx = container_of(work, struct kioctx, rcu_work); - spin_lock_irq(&ctx->ctx_lock); - ctx->dead = 1; - while (!list_empty(&ctx->active_reqs)) { - struct list_head *pos = ctx->active_reqs.next; - struct kiocb *iocb = list_kiocb(pos); - list_del_init(&iocb->ki_list); - cancel = iocb->ki_cancel; - kiocbSetCancelled(iocb); - if (cancel) { - iocb->ki_users++; - spin_unlock_irq(&ctx->ctx_lock); - cancel(iocb, &res); - spin_lock_irq(&ctx->ctx_lock); - } - } + wake_up_all(&ctx->wait); + put_ioctx(ctx); +} - if (!ctx->reqs_active) - goto out; +static void kill_ioctx_rcu(struct rcu_head *head) +{ + struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); - add_wait_queue(&ctx->wait, &wait); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - while (ctx->reqs_active) { - spin_unlock_irq(&ctx->ctx_lock); - io_schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - spin_lock_irq(&ctx->ctx_lock); - } - __set_task_state(tsk, TASK_RUNNING); - remove_wait_queue(&ctx->wait, &wait); + INIT_WORK(&ctx->rcu_work, kill_ioctx_work); + schedule_work(&ctx->rcu_work); +} -out: - spin_unlock_irq(&ctx->ctx_lock); +/* kill_ioctx + * Cancels all outstanding aio requests on an aio context. Used + * when the processes owning a context have all exited to encourage + * the rapid destruction of the kioctx. + */ +static void kill_ioctx(struct kioctx *ctx) +{ + if (!atomic_xchg(&ctx->dead, 1)) { + hlist_del_rcu(&ctx->list); + /* Between hlist_del_rcu() and dropping the initial ref */ + synchronize_rcu(); + + /* + * We can't punt to workqueue here because put_ioctx() -> + * free_ioctx() will unmap the ringbuffer, and that has to be + * done in the original process's context. kill_ioctx_rcu/work() + * exist for exit_aio(), as in that path free_ioctx() won't do + * the unmap. + */ + kill_ioctx_work(&ctx->rcu_work); + } } /* wait_on_sync_kiocb: @@ -351,9 +452,9 @@ out: */ ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { - while (iocb->ki_users) { + while (atomic_read(&iocb->ki_users)) { set_current_state(TASK_UNINTERRUPTIBLE); - if (!iocb->ki_users) + if (!atomic_read(&iocb->ki_users)) break; io_schedule(); } @@ -362,28 +463,26 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb) } EXPORT_SYMBOL(wait_on_sync_kiocb); -/* exit_aio: called when the last user of mm goes away. At this point, - * there is no way for any new requests to be submited or any of the - * io_* syscalls to be called on the context. However, there may be - * outstanding requests which hold references to the context; as they - * go away, they will call put_ioctx and release any pinned memory - * associated with the request (held via struct page * references). +/* + * exit_aio: called when the last user of mm goes away. At this point, there is + * no way for any new requests to be submited or any of the io_* syscalls to be + * called on the context. + * + * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on + * them. */ void exit_aio(struct mm_struct *mm) { struct kioctx *ctx; + struct hlist_node *n; - while (!hlist_empty(&mm->ioctx_list)) { - ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list); - hlist_del_rcu(&ctx->list); - - kill_ctx(ctx); - + hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) { if (1 != atomic_read(&ctx->users)) printk(KERN_DEBUG "exit_aio:ioctx still alive: %d %d %d\n", - atomic_read(&ctx->users), ctx->dead, - ctx->reqs_active); + atomic_read(&ctx->users), + atomic_read(&ctx->dead), + atomic_read(&ctx->reqs_active)); /* * We don't need to bother with munmap() here - * exit_mmap(mm) is coming and it'll unmap everything. @@ -391,150 +490,53 @@ void exit_aio(struct mm_struct *mm) * as indicator that it needs to unmap the area, * just set it to 0; aio_free_ring() is the only * place that uses ->mmap_size, so it's safe. - * That way we get all munmap done to current->mm - - * all other callers have ctx->mm == current->mm. */ - ctx->ring_info.mmap_size = 0; - put_ioctx(ctx); + ctx->mmap_size = 0; + + if (!atomic_xchg(&ctx->dead, 1)) { + hlist_del_rcu(&ctx->list); + call_rcu(&ctx->rcu_head, kill_ioctx_rcu); + } } } /* aio_get_req - * Allocate a slot for an aio request. Increments the users count + * Allocate a slot for an aio request. Increments the ki_users count * of the kioctx so that the kioctx stays around until all requests are * complete. Returns NULL if no requests are free. * - * Returns with kiocb->users set to 2. The io submit code path holds + * Returns with kiocb->ki_users set to 2. The io submit code path holds * an extra reference while submitting the i/o. * This prevents races between the aio code path referencing the * req (after submitting it) and aio_complete() freeing the req. */ -static struct kiocb *__aio_get_req(struct kioctx *ctx) +static inline struct kiocb *aio_get_req(struct kioctx *ctx) { - struct kiocb *req = NULL; + struct kiocb *req; - req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); - if (unlikely(!req)) + if (atomic_read(&ctx->reqs_active) >= ctx->nr_events) return NULL; - req->ki_flags = 0; - req->ki_users = 2; - req->ki_key = 0; - req->ki_ctx = ctx; - req->ki_cancel = NULL; - req->ki_retry = NULL; - req->ki_dtor = NULL; - req->private = NULL; - req->ki_iovec = NULL; - INIT_LIST_HEAD(&req->ki_run_list); - req->ki_eventfd = NULL; - - return req; -} - -/* - * struct kiocb's are allocated in batches to reduce the number of - * times the ctx lock is acquired and released. - */ -#define KIOCB_BATCH_SIZE 32L -struct kiocb_batch { - struct list_head head; - long count; /* number of requests left to allocate */ -}; - -static void kiocb_batch_init(struct kiocb_batch *batch, long total) -{ - INIT_LIST_HEAD(&batch->head); - batch->count = total; -} - -static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch) -{ - struct kiocb *req, *n; - - if (list_empty(&batch->head)) - return; - - spin_lock_irq(&ctx->ctx_lock); - list_for_each_entry_safe(req, n, &batch->head, ki_batch) { - list_del(&req->ki_batch); - list_del(&req->ki_list); - kmem_cache_free(kiocb_cachep, req); - ctx->reqs_active--; - } - if (unlikely(!ctx->reqs_active && ctx->dead)) - wake_up_all(&ctx->wait); - spin_unlock_irq(&ctx->ctx_lock); -} - -/* - * Allocate a batch of kiocbs. This avoids taking and dropping the - * context lock a lot during setup. - */ -static int kiocb_batch_refill(struct kioctx *ctx, struct kiocb_batch *batch) -{ - unsigned short allocated, to_alloc; - long avail; - struct kiocb *req, *n; - struct aio_ring *ring; - - to_alloc = min(batch->count, KIOCB_BATCH_SIZE); - for (allocated = 0; allocated < to_alloc; allocated++) { - req = __aio_get_req(ctx); - if (!req) - /* allocation failed, go with what we've got */ - break; - list_add(&req->ki_batch, &batch->head); - } - - if (allocated == 0) - goto out; - - spin_lock_irq(&ctx->ctx_lock); - ring = kmap_atomic(ctx->ring_info.ring_pages[0]); - - avail = aio_ring_avail(&ctx->ring_info, ring) - ctx->reqs_active; - BUG_ON(avail < 0); - if (avail < allocated) { - /* Trim back the number of requests. */ - list_for_each_entry_safe(req, n, &batch->head, ki_batch) { - list_del(&req->ki_batch); - kmem_cache_free(kiocb_cachep, req); - if (--allocated <= avail) - break; - } - } - - batch->count -= allocated; - list_for_each_entry(req, &batch->head, ki_batch) { - list_add(&req->ki_list, &ctx->active_reqs); - ctx->reqs_active++; - } + if (atomic_inc_return(&ctx->reqs_active) > ctx->nr_events - 1) + goto out_put; - kunmap_atomic(ring); - spin_unlock_irq(&ctx->ctx_lock); - -out: - return allocated; -} + req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); + if (unlikely(!req)) + goto out_put; -static inline struct kiocb *aio_get_req(struct kioctx *ctx, - struct kiocb_batch *batch) -{ - struct kiocb *req; + atomic_set(&req->ki_users, 2); + req->ki_ctx = ctx; - if (list_empty(&batch->head)) - if (kiocb_batch_refill(ctx, batch) == 0) - return NULL; - req = list_first_entry(&batch->head, struct kiocb, ki_batch); - list_del(&req->ki_batch); return req; +out_put: + atomic_dec(&ctx->reqs_active); + return NULL; } -static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) +static void kiocb_free(struct kiocb *req) { - assert_spin_locked(&ctx->ctx_lock); - + if (req->ki_filp) + fput(req->ki_filp); if (req->ki_eventfd != NULL) eventfd_ctx_put(req->ki_eventfd); if (req->ki_dtor) @@ -542,48 +544,12 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) if (req->ki_iovec != &req->ki_inline_vec) kfree(req->ki_iovec); kmem_cache_free(kiocb_cachep, req); - ctx->reqs_active--; - - if (unlikely(!ctx->reqs_active && ctx->dead)) - wake_up_all(&ctx->wait); } -/* __aio_put_req - * Returns true if this put was the last user of the request. - */ -static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) +void aio_put_req(struct kiocb *req) { - dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n", - req, atomic_long_read(&req->ki_filp->f_count)); - - assert_spin_locked(&ctx->ctx_lock); - - req->ki_users--; - BUG_ON(req->ki_users < 0); - if (likely(req->ki_users)) - return 0; - list_del(&req->ki_list); /* remove from active_reqs */ - req->ki_cancel = NULL; - req->ki_retry = NULL; - - fput(req->ki_filp); - req->ki_filp = NULL; - really_put_req(ctx, req); - return 1; -} - -/* aio_put_req - * Returns true if this put was the last user of the kiocb, - * false if the request is still in use. - */ -int aio_put_req(struct kiocb *req) -{ - struct kioctx *ctx = req->ki_ctx; - int ret; - spin_lock_irq(&ctx->ctx_lock); - ret = __aio_put_req(ctx, req); - spin_unlock_irq(&ctx->ctx_lock); - return ret; + if (atomic_dec_and_test(&req->ki_users)) + kiocb_free(req); } EXPORT_SYMBOL(aio_put_req); @@ -595,13 +561,8 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) rcu_read_lock(); hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) { - /* - * RCU protects us against accessing freed memory but - * we have to be careful not to get a reference when the - * reference count already dropped to 0 (ctx->dead test - * is unreliable because of races). - */ - if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){ + if (ctx->user_id == ctx_id) { + atomic_inc(&ctx->users); ret = ctx; break; } @@ -611,295 +572,16 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) return ret; } -/* - * Queue up a kiocb to be retried. Assumes that the kiocb - * has already been marked as kicked, and places it on - * the retry run list for the corresponding ioctx, if it - * isn't already queued. Returns 1 if it actually queued - * the kiocb (to tell the caller to activate the work - * queue to process it), or 0, if it found that it was - * already queued. - */ -static inline int __queue_kicked_iocb(struct kiocb *iocb) -{ - struct kioctx *ctx = iocb->ki_ctx; - - assert_spin_locked(&ctx->ctx_lock); - - if (list_empty(&iocb->ki_run_list)) { - list_add_tail(&iocb->ki_run_list, - &ctx->run_list); - return 1; - } - return 0; -} - -/* aio_run_iocb - * This is the core aio execution routine. It is - * invoked both for initial i/o submission and - * subsequent retries via the aio_kick_handler. - * Expects to be invoked with iocb->ki_ctx->lock - * already held. The lock is released and reacquired - * as needed during processing. - * - * Calls the iocb retry method (already setup for the - * iocb on initial submission) for operation specific - * handling, but takes care of most of common retry - * execution details for a given iocb. The retry method - * needs to be non-blocking as far as possible, to avoid - * holding up other iocbs waiting to be serviced by the - * retry kernel thread. - * - * The trickier parts in this code have to do with - * ensuring that only one retry instance is in progress - * for a given iocb at any time. Providing that guarantee - * simplifies the coding of individual aio operations as - * it avoids various potential races. - */ -static ssize_t aio_run_iocb(struct kiocb *iocb) -{ - struct kioctx *ctx = iocb->ki_ctx; - ssize_t (*retry)(struct kiocb *); - ssize_t ret; - - if (!(retry = iocb->ki_retry)) { - printk("aio_run_iocb: iocb->ki_retry = NULL\n"); - return 0; - } - - /* - * We don't want the next retry iteration for this - * operation to start until this one has returned and - * updated the iocb state. However, wait_queue functions - * can trigger a kick_iocb from interrupt context in the - * meantime, indicating that data is available for the next - * iteration. We want to remember that and enable the - * next retry iteration _after_ we are through with - * this one. - * - * So, in order to be able to register a "kick", but - * prevent it from being queued now, we clear the kick - * flag, but make the kick code *think* that the iocb is - * still on the run list until we are actually done. - * When we are done with this iteration, we check if - * the iocb was kicked in the meantime and if so, queue - * it up afresh. - */ - - kiocbClearKicked(iocb); - - /* - * This is so that aio_complete knows it doesn't need to - * pull the iocb off the run list (We can't just call - * INIT_LIST_HEAD because we don't want a kick_iocb to - * queue this on the run list yet) - */ - iocb->ki_run_list.next = iocb->ki_run_list.prev = NULL; - spin_unlock_irq(&ctx->ctx_lock); - - /* Quit retrying if the i/o has been cancelled */ - if (kiocbIsCancelled(iocb)) { - ret = -EINTR; - aio_complete(iocb, ret, 0); - /* must not access the iocb after this */ - goto out; - } - - /* - * Now we are all set to call the retry method in async - * context. - */ - ret = retry(iocb); - - if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) { - /* - * There's no easy way to restart the syscall since other AIO's - * may be already running. Just fail this IO with EINTR. - */ - if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR || - ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK)) - ret = -EINTR; - aio_complete(iocb, ret, 0); - } -out: - spin_lock_irq(&ctx->ctx_lock); - - if (-EIOCBRETRY == ret) { - /* - * OK, now that we are done with this iteration - * and know that there is more left to go, - * this is where we let go so that a subsequent - * "kick" can start the next iteration - */ - - /* will make __queue_kicked_iocb succeed from here on */ - INIT_LIST_HEAD(&iocb->ki_run_list); - /* we must queue the next iteration ourselves, if it - * has already been kicked */ - if (kiocbIsKicked(iocb)) { - __queue_kicked_iocb(iocb); - - /* - * __queue_kicked_iocb will always return 1 here, because - * iocb->ki_run_list is empty at this point so it should - * be safe to unconditionally queue the context into the - * work queue. - */ - aio_queue_work(ctx); - } - } - return ret; -} - -/* - * __aio_run_iocbs: - * Process all pending retries queued on the ioctx - * run list. - * Assumes it is operating within the aio issuer's mm - * context. - */ -static int __aio_run_iocbs(struct kioctx *ctx) -{ - struct kiocb *iocb; - struct list_head run_list; - - assert_spin_locked(&ctx->ctx_lock); - - list_replace_init(&ctx->run_list, &run_list); - while (!list_empty(&run_list)) { - iocb = list_entry(run_list.next, struct kiocb, - ki_run_list); - list_del(&iocb->ki_run_list); - /* - * Hold an extra reference while retrying i/o. - */ - iocb->ki_users++; /* grab extra reference */ - aio_run_iocb(iocb); - __aio_put_req(ctx, iocb); - } - if (!list_empty(&ctx->run_list)) - return 1; - return 0; -} - -static void aio_queue_work(struct kioctx * ctx) -{ - unsigned long timeout; - /* - * if someone is waiting, get the work started right - * away, otherwise, use a longer delay - */ - smp_mb(); - if (waitqueue_active(&ctx->wait)) - timeout = 1; - else - timeout = HZ/10; - queue_delayed_work(aio_wq, &ctx->wq, timeout); -} - -/* - * aio_run_all_iocbs: - * Process all pending retries queued on the ioctx - * run list, and keep running them until the list - * stays empty. - * Assumes it is operating within the aio issuer's mm context. - */ -static inline void aio_run_all_iocbs(struct kioctx *ctx) -{ - spin_lock_irq(&ctx->ctx_lock); - while (__aio_run_iocbs(ctx)) - ; - spin_unlock_irq(&ctx->ctx_lock); -} - -/* - * aio_kick_handler: - * Work queue handler triggered to process pending - * retries on an ioctx. Takes on the aio issuer's - * mm context before running the iocbs, so that - * copy_xxx_user operates on the issuer's address - * space. - * Run on aiod's context. - */ -static void aio_kick_handler(struct work_struct *work) -{ - struct kioctx *ctx = container_of(work, struct kioctx, wq.work); - mm_segment_t oldfs = get_fs(); - struct mm_struct *mm; - int requeue; - - set_fs(USER_DS); - use_mm(ctx->mm); - spin_lock_irq(&ctx->ctx_lock); - requeue =__aio_run_iocbs(ctx); - mm = ctx->mm; - spin_unlock_irq(&ctx->ctx_lock); - unuse_mm(mm); - set_fs(oldfs); - /* - * we're in a worker thread already; no point using non-zero delay - */ - if (requeue) - queue_delayed_work(aio_wq, &ctx->wq, 0); -} - - -/* - * Called by kick_iocb to queue the kiocb for retry - * and if required activate the aio work queue to process - * it - */ -static void try_queue_kicked_iocb(struct kiocb *iocb) -{ - struct kioctx *ctx = iocb->ki_ctx; - unsigned long flags; - int run = 0; - - spin_lock_irqsave(&ctx->ctx_lock, flags); - /* set this inside the lock so that we can't race with aio_run_iocb() - * testing it and putting the iocb on the run list under the lock */ - if (!kiocbTryKick(iocb)) - run = __queue_kicked_iocb(iocb); - spin_unlock_irqrestore(&ctx->ctx_lock, flags); - if (run) - aio_queue_work(ctx); -} - -/* - * kick_iocb: - * Called typically from a wait queue callback context - * to trigger a retry of the iocb. - * The retry is usually executed by aio workqueue - * threads (See aio_kick_handler). - */ -void kick_iocb(struct kiocb *iocb) -{ - /* sync iocbs are easy: they can only ever be executing from a - * single context. */ - if (is_sync_kiocb(iocb)) { - kiocbSetKicked(iocb); - wake_up_process(iocb->ki_obj.tsk); - return; - } - - try_queue_kicked_iocb(iocb); -} -EXPORT_SYMBOL(kick_iocb); - /* aio_complete * Called when the io request on the given iocb is complete. - * Returns true if this is the last user of the request. The - * only other user of the request can be the cancellation code. */ -int aio_complete(struct kiocb *iocb, long res, long res2) +void aio_complete(struct kiocb *iocb, long res, long res2) { struct kioctx *ctx = iocb->ki_ctx; - struct aio_ring_info *info; struct aio_ring *ring; - struct io_event *event; + struct io_event *ev_page, *event; unsigned long flags; - unsigned long tail; - int ret; + unsigned tail, pos; /* * Special case handling for sync iocbs: @@ -909,61 +591,81 @@ int aio_complete(struct kiocb *iocb, long res, long res2) * - the sync task helpfully left a reference to itself in the iocb */ if (is_sync_kiocb(iocb)) { - BUG_ON(iocb->ki_users != 1); + BUG_ON(atomic_read(&iocb->ki_users) != 1); iocb->ki_user_data = res; - iocb->ki_users = 0; + atomic_set(&iocb->ki_users, 0); wake_up_process(iocb->ki_obj.tsk); - return 1; + return; } - info = &ctx->ring_info; - - /* add a completion event to the ring buffer. - * must be done holding ctx->ctx_lock to prevent - * other code from messing with the tail - * pointer since we might be called from irq - * context. + /* + * Take rcu_read_lock() in case the kioctx is being destroyed, as we + * need to issue a wakeup after decrementing reqs_active. */ - spin_lock_irqsave(&ctx->ctx_lock, flags); + rcu_read_lock(); - if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list)) - list_del_init(&iocb->ki_run_list); + if (iocb->ki_list.next) { + unsigned long flags; + + spin_lock_irqsave(&ctx->ctx_lock, flags); + list_del(&iocb->ki_list); + spin_unlock_irqrestore(&ctx->ctx_lock, flags); + } /* * cancelled requests don't get events, userland was given one * when the event got cancelled. */ - if (kiocbIsCancelled(iocb)) + if (unlikely(xchg(&iocb->ki_cancel, + KIOCB_CANCELLED) == KIOCB_CANCELLED)) { + atomic_dec(&ctx->reqs_active); + /* Still need the wake_up in case free_ioctx is waiting */ goto put_rq; + } - ring = kmap_atomic(info->ring_pages[0]); + /* + * Add a completion event to the ring buffer. Must be done holding + * ctx->ctx_lock to prevent other code from messing with the tail + * pointer since we might be called from irq context. + */ + spin_lock_irqsave(&ctx->completion_lock, flags); - tail = info->tail; - event = aio_ring_event(info, tail); - if (++tail >= info->nr) + tail = ctx->tail; + pos = tail + AIO_EVENTS_OFFSET; + + if (++tail >= ctx->nr_events) tail = 0; + ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); + event = ev_page + pos % AIO_EVENTS_PER_PAGE; + event->obj = (u64)(unsigned long)iocb->ki_obj.user; event->data = iocb->ki_user_data; event->res = res; event->res2 = res2; - dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n", - ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, - res, res2); + kunmap_atomic(ev_page); + flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); + + pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n", + ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, + res, res2); /* after flagging the request as done, we * must never even look at it again */ smp_wmb(); /* make event visible before updating tail */ - info->tail = tail; - ring->tail = tail; + ctx->tail = tail; - put_aio_ring_event(event); + ring = kmap_atomic(ctx->ring_pages[0]); + ring->tail = tail; kunmap_atomic(ring); + flush_dcache_page(ctx->ring_pages[0]); + + spin_unlock_irqrestore(&ctx->completion_lock, flags); - pr_debug("added to ring %p at [%lu]\n", iocb, tail); + pr_debug("added to ring %p at [%u]\n", iocb, tail); /* * Check if the user asked us to deliver the result through an @@ -975,7 +677,7 @@ int aio_complete(struct kiocb *iocb, long res, long res2) put_rq: /* everything turned out well, dispose of the aiocb. */ - ret = __aio_put_req(ctx, iocb); + aio_put_req(iocb); /* * We have to order our ring_info tail store above and test @@ -988,233 +690,133 @@ put_rq: if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); - spin_unlock_irqrestore(&ctx->ctx_lock, flags); - return ret; + rcu_read_unlock(); } EXPORT_SYMBOL(aio_complete); -/* aio_read_evt - * Pull an event off of the ioctx's event ring. Returns the number of - * events fetched (0 or 1 ;-) - * FIXME: make this use cmpxchg. - * TODO: make the ringbuffer user mmap()able (requires FIXME). +/* aio_read_events + * Pull an event off of the ioctx's event ring. Returns the number of + * events fetched */ -static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) +static long aio_read_events_ring(struct kioctx *ctx, + struct io_event __user *event, long nr) { - struct aio_ring_info *info = &ioctx->ring_info; struct aio_ring *ring; - unsigned long head; - int ret = 0; - - ring = kmap_atomic(info->ring_pages[0]); - dprintk("in aio_read_evt h%lu t%lu m%lu\n", - (unsigned long)ring->head, (unsigned long)ring->tail, - (unsigned long)ring->nr); - - if (ring->head == ring->tail) - goto out; + unsigned head, pos; + long ret = 0; + int copy_ret; - spin_lock(&info->ring_lock); - - head = ring->head % info->nr; - if (head != ring->tail) { - struct io_event *evp = aio_ring_event(info, head); - *ent = *evp; - head = (head + 1) % info->nr; - smp_mb(); /* finish reading the event before updatng the head */ - ring->head = head; - ret = 1; - put_aio_ring_event(evp); - } - spin_unlock(&info->ring_lock); + mutex_lock(&ctx->ring_lock); -out: - dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret, - (unsigned long)ring->head, (unsigned long)ring->tail); + ring = kmap_atomic(ctx->ring_pages[0]); + head = ring->head; kunmap_atomic(ring); - return ret; -} -struct aio_timeout { - struct timer_list timer; - int timed_out; - struct task_struct *p; -}; + pr_debug("h%u t%u m%u\n", head, ctx->tail, ctx->nr_events); -static void timeout_func(unsigned long data) -{ - struct aio_timeout *to = (struct aio_timeout *)data; + if (head == ctx->tail) + goto out; - to->timed_out = 1; - wake_up_process(to->p); -} + while (ret < nr) { + long avail; + struct io_event *ev; + struct page *page; -static inline void init_timeout(struct aio_timeout *to) -{ - setup_timer_on_stack(&to->timer, timeout_func, (unsigned long) to); - to->timed_out = 0; - to->p = current; -} + avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; + if (head == ctx->tail) + break; -static inline void set_timeout(long start_jiffies, struct aio_timeout *to, - const struct timespec *ts) -{ - to->timer.expires = start_jiffies + timespec_to_jiffies(ts); - if (time_after(to->timer.expires, jiffies)) - add_timer(&to->timer); - else - to->timed_out = 1; -} + avail = min(avail, nr - ret); + avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - + ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE)); -static inline void clear_timeout(struct aio_timeout *to) -{ - del_singleshot_timer_sync(&to->timer); -} + pos = head + AIO_EVENTS_OFFSET; + page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]; + pos %= AIO_EVENTS_PER_PAGE; -static int read_events(struct kioctx *ctx, - long min_nr, long nr, - struct io_event __user *event, - struct timespec __user *timeout) -{ - long start_jiffies = jiffies; - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); - int ret; - int i = 0; - struct io_event ent; - struct aio_timeout to; - int retry = 0; - - /* needed to zero any padding within an entry (there shouldn't be - * any, but C is fun! - */ - memset(&ent, 0, sizeof(ent)); -retry: - ret = 0; - while (likely(i < nr)) { - ret = aio_read_evt(ctx, &ent); - if (unlikely(ret <= 0)) - break; - - dprintk("read event: %Lx %Lx %Lx %Lx\n", - ent.data, ent.obj, ent.res, ent.res2); + ev = kmap(page); + copy_ret = copy_to_user(event + ret, ev + pos, + sizeof(*ev) * avail); + kunmap(page); - /* Could we split the check in two? */ - ret = -EFAULT; - if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { - dprintk("aio: lost an event due to EFAULT.\n"); - break; + if (unlikely(copy_ret)) { + ret = -EFAULT; + goto out; } - ret = 0; - /* Good, event copied to userland, update counts. */ - event ++; - i ++; + ret += avail; + head += avail; + head %= ctx->nr_events; } - if (min_nr <= i) - return i; - if (ret) - return ret; + ring = kmap_atomic(ctx->ring_pages[0]); + ring->head = head; + kunmap_atomic(ring); + flush_dcache_page(ctx->ring_pages[0]); - /* End fast path */ + pr_debug("%li h%u t%u\n", ret, head, ctx->tail); - /* racey check, but it gets redone */ - if (!retry && unlikely(!list_empty(&ctx->run_list))) { - retry = 1; - aio_run_all_iocbs(ctx); - goto retry; - } + atomic_sub(ret, &ctx->reqs_active); +out: + mutex_unlock(&ctx->ring_lock); - init_timeout(&to); - if (timeout) { - struct timespec ts; - ret = -EFAULT; - if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) - goto out; + return ret; +} - set_timeout(start_jiffies, &to, &ts); - } +static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr, + struct io_event __user *event, long *i) +{ + long ret = aio_read_events_ring(ctx, event + *i, nr - *i); - while (likely(i < nr)) { - add_wait_queue_exclusive(&ctx->wait, &wait); - do { - set_task_state(tsk, TASK_INTERRUPTIBLE); - ret = aio_read_evt(ctx, &ent); - if (ret) - break; - if (min_nr <= i) - break; - if (unlikely(ctx->dead)) { - ret = -EINVAL; - break; - } - if (to.timed_out) /* Only check after read evt */ - break; - /* Try to only show up in io wait if there are ops - * in flight */ - if (ctx->reqs_active) - io_schedule(); - else - schedule(); - if (signal_pending(tsk)) { - ret = -EINTR; - break; - } - /*ret = aio_read_evt(ctx, &ent);*/ - } while (1) ; - - set_task_state(tsk, TASK_RUNNING); - remove_wait_queue(&ctx->wait, &wait); - - if (unlikely(ret <= 0)) - break; + if (ret > 0) + *i += ret; - ret = -EFAULT; - if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { - dprintk("aio: lost an event due to EFAULT.\n"); - break; - } + if (unlikely(atomic_read(&ctx->dead))) + ret = -EINVAL; - /* Good, event copied to userland, update counts. */ - event ++; - i ++; - } + if (!*i) + *i = ret; - if (timeout) - clear_timeout(&to); -out: - destroy_timer_on_stack(&to.timer); - return i ? i : ret; + return ret < 0 || *i >= min_nr; } -/* Take an ioctx and remove it from the list of ioctx's. Protects - * against races with itself via ->dead. - */ -static void io_destroy(struct kioctx *ioctx) +static long read_events(struct kioctx *ctx, long min_nr, long nr, + struct io_event __user *event, + struct timespec __user *timeout) { - struct mm_struct *mm = current->mm; - int was_dead; + ktime_t until = { .tv64 = KTIME_MAX }; + long ret = 0; - /* delete the entry from the list is someone else hasn't already */ - spin_lock(&mm->ioctx_lock); - was_dead = ioctx->dead; - ioctx->dead = 1; - hlist_del_rcu(&ioctx->list); - spin_unlock(&mm->ioctx_lock); + if (timeout) { + struct timespec ts; - dprintk("aio_release(%p)\n", ioctx); - if (likely(!was_dead)) - put_ioctx(ioctx); /* twice for the list */ + if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) + return -EFAULT; - kill_ctx(ioctx); + until = timespec_to_ktime(ts); + } /* - * Wake up any waiters. The setting of ctx->dead must be seen - * by other CPUs at this point. Right now, we rely on the - * locking done by the above calls to ensure this consistency. + * Note that aio_read_events() is being called as the conditional - i.e. + * we're calling it after prepare_to_wait() has set task state to + * TASK_INTERRUPTIBLE. + * + * But aio_read_events() can block, and if it blocks it's going to flip + * the task state back to TASK_RUNNING. + * + * This should be ok, provided it doesn't flip the state back to + * TASK_RUNNING and return 0 too much - that causes us to spin. That + * will only happen if the mutex_lock() call blocks, and we then find + * the ringbuffer empty. So in practice we should be ok, but it's + * something to be aware of when touching this code. */ - wake_up_all(&ioctx->wait); + wait_event_interruptible_hrtimeout(ctx->wait, + aio_read_events(ctx, min_nr, nr, event, &ret), until); + + if (!ret && signal_pending(current)) + ret = -EINTR; + + return ret; } /* sys_io_setup: @@ -1252,7 +854,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); if (ret) - io_destroy(ioctx); + kill_ioctx(ioctx); put_ioctx(ioctx); } @@ -1270,7 +872,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) { struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { - io_destroy(ioctx); + kill_ioctx(ioctx); put_ioctx(ioctx); return 0; } @@ -1301,30 +903,21 @@ static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret) BUG_ON(ret > 0 && iocb->ki_left == 0); } -static ssize_t aio_rw_vect_retry(struct kiocb *iocb) +typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *, + unsigned long, loff_t); + +static ssize_t aio_rw_vect_retry(struct kiocb *iocb, int rw, aio_rw_op *rw_op) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - ssize_t (*rw_op)(struct kiocb *, const struct iovec *, - unsigned long, loff_t); ssize_t ret = 0; - unsigned short opcode; - - if ((iocb->ki_opcode == IOCB_CMD_PREADV) || - (iocb->ki_opcode == IOCB_CMD_PREAD)) { - rw_op = file->f_op->aio_read; - opcode = IOCB_CMD_PREADV; - } else { - rw_op = file->f_op->aio_write; - opcode = IOCB_CMD_PWRITEV; - } /* This matches the pread()/pwrite() logic */ if (iocb->ki_pos < 0) return -EINVAL; - if (opcode == IOCB_CMD_PWRITEV) + if (rw == WRITE) file_start_write(file); do { ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg], @@ -1336,9 +929,9 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb) /* retry all partial writes. retry partial reads as long as its a * regular file. */ } while (ret > 0 && iocb->ki_left > 0 && - (opcode == IOCB_CMD_PWRITEV || + (rw == WRITE || (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode)))); - if (opcode == IOCB_CMD_PWRITEV) + if (rw == WRITE) file_end_write(file); /* This means we must have transferred all that we could */ @@ -1348,81 +941,49 @@ static ssize_t aio_rw_vect_retry(struct kiocb *iocb) /* If we managed to write some out we return that, rather than * the eventual error. */ - if (opcode == IOCB_CMD_PWRITEV - && ret < 0 && ret != -EIOCBQUEUED && ret != -EIOCBRETRY + if (rw == WRITE + && ret < 0 && ret != -EIOCBQUEUED && iocb->ki_nbytes - iocb->ki_left) ret = iocb->ki_nbytes - iocb->ki_left; return ret; } -static ssize_t aio_fdsync(struct kiocb *iocb) -{ - struct file *file = iocb->ki_filp; - ssize_t ret = -EINVAL; - - if (file->f_op->aio_fsync) - ret = file->f_op->aio_fsync(iocb, 1); - return ret; -} - -static ssize_t aio_fsync(struct kiocb *iocb) -{ - struct file *file = iocb->ki_filp; - ssize_t ret = -EINVAL; - - if (file->f_op->aio_fsync) - ret = file->f_op->aio_fsync(iocb, 0); - return ret; -} - -static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat) +static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat) { ssize_t ret; + kiocb->ki_nr_segs = kiocb->ki_nbytes; + #ifdef CONFIG_COMPAT if (compat) - ret = compat_rw_copy_check_uvector(type, + ret = compat_rw_copy_check_uvector(rw, (struct compat_iovec __user *)kiocb->ki_buf, - kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, + kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, &kiocb->ki_iovec); else #endif - ret = rw_copy_check_uvector(type, + ret = rw_copy_check_uvector(rw, (struct iovec __user *)kiocb->ki_buf, - kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, + kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, &kiocb->ki_iovec); if (ret < 0) - goto out; - - ret = rw_verify_area(type, kiocb->ki_filp, &kiocb->ki_pos, ret); - if (ret < 0) - goto out; + return ret; - kiocb->ki_nr_segs = kiocb->ki_nbytes; - kiocb->ki_cur_seg = 0; - /* ki_nbytes/left now reflect bytes instead of segs */ + /* ki_nbytes now reflect bytes instead of segs */ kiocb->ki_nbytes = ret; - kiocb->ki_left = ret; - - ret = 0; -out: - return ret; + return 0; } -static ssize_t aio_setup_single_vector(int type, struct file * file, struct kiocb *kiocb) +static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb) { - int bytes; - - bytes = rw_verify_area(type, file, &kiocb->ki_pos, kiocb->ki_left); - if (bytes < 0) - return bytes; + if (unlikely(!access_ok(!rw, kiocb->ki_buf, kiocb->ki_nbytes))) + return -EFAULT; kiocb->ki_iovec = &kiocb->ki_inline_vec; kiocb->ki_iovec->iov_base = kiocb->ki_buf; - kiocb->ki_iovec->iov_len = bytes; + kiocb->ki_iovec->iov_len = kiocb->ki_nbytes; kiocb->ki_nr_segs = 1; - kiocb->ki_cur_seg = 0; return 0; } @@ -1431,96 +992,95 @@ static ssize_t aio_setup_single_vector(int type, struct file * file, struct kioc * Performs the initial checks and aio retry method * setup for the kiocb at the time of io submission. */ -static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) +static ssize_t aio_run_iocb(struct kiocb *req, bool compat) { - struct file *file = kiocb->ki_filp; - ssize_t ret = 0; + struct file *file = req->ki_filp; + ssize_t ret; + int rw; + fmode_t mode; + aio_rw_op *rw_op; - switch (kiocb->ki_opcode) { + switch (req->ki_opcode) { case IOCB_CMD_PREAD: - ret = -EBADF; - if (unlikely(!(file->f_mode & FMODE_READ))) - break; - ret = -EFAULT; - if (unlikely(!access_ok(VERIFY_WRITE, kiocb->ki_buf, - kiocb->ki_left))) - break; - ret = aio_setup_single_vector(READ, file, kiocb); - if (ret) - break; - ret = -EINVAL; - if (file->f_op->aio_read) - kiocb->ki_retry = aio_rw_vect_retry; - break; - case IOCB_CMD_PWRITE: - ret = -EBADF; - if (unlikely(!(file->f_mode & FMODE_WRITE))) - break; - ret = -EFAULT; - if (unlikely(!access_ok(VERIFY_READ, kiocb->ki_buf, - kiocb->ki_left))) - break; - ret = aio_setup_single_vector(WRITE, file, kiocb); - if (ret) - break; - ret = -EINVAL; - if (file->f_op->aio_write) - kiocb->ki_retry = aio_rw_vect_retry; - break; case IOCB_CMD_PREADV: - ret = -EBADF; - if (unlikely(!(file->f_mode & FMODE_READ))) - break; - ret = aio_setup_vectored_rw(READ, kiocb, compat); - if (ret) - break; - ret = -EINVAL; - if (file->f_op->aio_read) - kiocb->ki_retry = aio_rw_vect_retry; - break; + mode = FMODE_READ; + rw = READ; + rw_op = file->f_op->aio_read; + goto rw_common; + + case IOCB_CMD_PWRITE: case IOCB_CMD_PWRITEV: - ret = -EBADF; - if (unlikely(!(file->f_mode & FMODE_WRITE))) - break; - ret = aio_setup_vectored_rw(WRITE, kiocb, compat); + mode = FMODE_WRITE; + rw = WRITE; + rw_op = file->f_op->aio_write; + goto rw_common; +rw_common: + if (unlikely(!(file->f_mode & mode))) + return -EBADF; + + if (!rw_op) + return -EINVAL; + + ret = (req->ki_opcode == IOCB_CMD_PREADV || + req->ki_opcode == IOCB_CMD_PWRITEV) + ? aio_setup_vectored_rw(rw, req, compat) + : aio_setup_single_vector(rw, req); if (ret) - break; - ret = -EINVAL; - if (file->f_op->aio_write) - kiocb->ki_retry = aio_rw_vect_retry; + return ret; + + ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); + if (ret < 0) + return ret; + + req->ki_nbytes = ret; + req->ki_left = ret; + + ret = aio_rw_vect_retry(req, rw, rw_op); break; + case IOCB_CMD_FDSYNC: - ret = -EINVAL; - if (file->f_op->aio_fsync) - kiocb->ki_retry = aio_fdsync; + if (!file->f_op->aio_fsync) + return -EINVAL; + + ret = file->f_op->aio_fsync(req, 1); break; + case IOCB_CMD_FSYNC: - ret = -EINVAL; - if (file->f_op->aio_fsync) - kiocb->ki_retry = aio_fsync; + if (!file->f_op->aio_fsync) + return -EINVAL; + + ret = file->f_op->aio_fsync(req, 0); break; + default: - dprintk("EINVAL: io_submit: no operation provided\n"); - ret = -EINVAL; + pr_debug("EINVAL: no operation provided\n"); + return -EINVAL; } - if (!kiocb->ki_retry) - return ret; + if (ret != -EIOCBQUEUED) { + /* + * There's no easy way to restart the syscall since other AIO's + * may be already running. Just fail this IO with EINTR. + */ + if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR || + ret == -ERESTARTNOHAND || + ret == -ERESTART_RESTARTBLOCK)) + ret = -EINTR; + aio_complete(req, ret, 0); + } return 0; } static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, - struct iocb *iocb, struct kiocb_batch *batch, - bool compat) + struct iocb *iocb, bool compat) { struct kiocb *req; - struct file *file; ssize_t ret; /* enforce forwards compatibility on users */ if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) { - pr_debug("EINVAL: io_submit: reserve field set\n"); + pr_debug("EINVAL: reserve field set\n"); return -EINVAL; } @@ -1534,16 +1094,16 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, return -EINVAL; } - file = fget(iocb->aio_fildes); - if (unlikely(!file)) - return -EBADF; - - req = aio_get_req(ctx, batch); /* returns with 2 references to req */ - if (unlikely(!req)) { - fput(file); + req = aio_get_req(ctx); + if (unlikely(!req)) return -EAGAIN; + + req->ki_filp = fget(iocb->aio_fildes); + if (unlikely(!req->ki_filp)) { + ret = -EBADF; + goto out_put_req; } - req->ki_filp = file; + if (iocb->aio_flags & IOCB_FLAG_RESFD) { /* * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an @@ -1559,9 +1119,9 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, } } - ret = put_user(req->ki_key, &user_iocb->aio_key); + ret = put_user(KIOCB_KEY, &user_iocb->aio_key); if (unlikely(ret)) { - dprintk("EFAULT: aio_key\n"); + pr_debug("EFAULT: aio_key\n"); goto out_put_req; } @@ -1573,41 +1133,14 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->ki_left = req->ki_nbytes = iocb->aio_nbytes; req->ki_opcode = iocb->aio_lio_opcode; - ret = aio_setup_iocb(req, compat); - + ret = aio_run_iocb(req, compat); if (ret) goto out_put_req; - spin_lock_irq(&ctx->ctx_lock); - /* - * We could have raced with io_destroy() and are currently holding a - * reference to ctx which should be destroyed. We cannot submit IO - * since ctx gets freed as soon as io_submit() puts its reference. The - * check here is reliable: io_destroy() sets ctx->dead before waiting - * for outstanding IO and the barrier between these two is realized by - * unlock of mm->ioctx_lock and lock of ctx->ctx_lock. Analogously we - * increment ctx->reqs_active before checking for ctx->dead and the - * barrier is realized by unlock and lock of ctx->ctx_lock. Thus if we - * don't see ctx->dead set here, io_destroy() waits for our IO to - * finish. - */ - if (ctx->dead) { - spin_unlock_irq(&ctx->ctx_lock); - ret = -EINVAL; - goto out_put_req; - } - aio_run_iocb(req); - if (!list_empty(&ctx->run_list)) { - /* drain the run list */ - while (__aio_run_iocbs(ctx)) - ; - } - spin_unlock_irq(&ctx->ctx_lock); - aio_put_req(req); /* drop extra ref to req */ return 0; - out_put_req: + atomic_dec(&ctx->reqs_active); aio_put_req(req); /* drop extra ref to req */ aio_put_req(req); /* drop i/o ref to req */ return ret; @@ -1620,7 +1153,6 @@ long do_io_submit(aio_context_t ctx_id, long nr, long ret = 0; int i = 0; struct blk_plug plug; - struct kiocb_batch batch; if (unlikely(nr < 0)) return -EINVAL; @@ -1633,12 +1165,10 @@ long do_io_submit(aio_context_t ctx_id, long nr, ctx = lookup_ioctx(ctx_id); if (unlikely(!ctx)) { - pr_debug("EINVAL: io_submit: invalid context id\n"); + pr_debug("EINVAL: invalid context id\n"); return -EINVAL; } - kiocb_batch_init(&batch, nr); - blk_start_plug(&plug); /* @@ -1659,13 +1189,12 @@ long do_io_submit(aio_context_t ctx_id, long nr, break; } - ret = io_submit_one(ctx, user_iocb, &tmp, &batch, compat); + ret = io_submit_one(ctx, user_iocb, &tmp, compat); if (ret) break; } blk_finish_plug(&plug); - kiocb_batch_free(ctx, &batch); put_ioctx(ctx); return i ? i : ret; } @@ -1698,10 +1227,13 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, assert_spin_locked(&ctx->ctx_lock); + if (key != KIOCB_KEY) + return NULL; + /* TODO: use a hash or array, this sucks. */ list_for_each(pos, &ctx->active_reqs) { struct kiocb *kiocb = list_kiocb(pos); - if (kiocb->ki_obj.user == iocb && kiocb->ki_key == key) + if (kiocb->ki_obj.user == iocb) return kiocb; } return NULL; @@ -1720,7 +1252,7 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, struct io_event __user *, result) { - int (*cancel)(struct kiocb *iocb, struct io_event *res); + struct io_event res; struct kioctx *ctx; struct kiocb *kiocb; u32 key; @@ -1735,32 +1267,22 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, return -EINVAL; spin_lock_irq(&ctx->ctx_lock); - ret = -EAGAIN; + kiocb = lookup_kiocb(ctx, iocb, key); - if (kiocb && kiocb->ki_cancel) { - cancel = kiocb->ki_cancel; - kiocb->ki_users ++; - kiocbSetCancelled(kiocb); - } else - cancel = NULL; + if (kiocb) + ret = kiocb_cancel(ctx, kiocb, &res); + else + ret = -EINVAL; + spin_unlock_irq(&ctx->ctx_lock); - if (NULL != cancel) { - struct io_event tmp; - pr_debug("calling cancel\n"); - memset(&tmp, 0, sizeof(tmp)); - tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user; - tmp.data = kiocb->ki_user_data; - ret = cancel(kiocb, &tmp); - if (!ret) { - /* Cancellation succeeded -- copy the result - * into the user's buffer. - */ - if (copy_to_user(result, &tmp, sizeof(tmp))) - ret = -EFAULT; - } - } else - ret = -EINVAL; + if (!ret) { + /* Cancellation succeeded -- copy the result + * into the user's buffer. + */ + if (copy_to_user(result, &res, sizeof(res))) + ret = -EFAULT; + } put_ioctx(ctx); diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index a3f28f331b2b..8fb42916d8a2 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -27,48 +27,11 @@ #include <linux/workqueue.h> #include <linux/slab.h> -struct integrity_slab { - struct kmem_cache *slab; - unsigned short nr_vecs; - char name[8]; -}; - -#define IS(x) { .nr_vecs = x, .name = "bip-"__stringify(x) } -struct integrity_slab bip_slab[BIOVEC_NR_POOLS] __read_mostly = { - IS(1), IS(4), IS(16), IS(64), IS(128), IS(BIO_MAX_PAGES), -}; -#undef IS +#define BIP_INLINE_VECS 4 +static struct kmem_cache *bip_slab; static struct workqueue_struct *kintegrityd_wq; -static inline unsigned int vecs_to_idx(unsigned int nr) -{ - switch (nr) { - case 1: - return 0; - case 2 ... 4: - return 1; - case 5 ... 16: - return 2; - case 17 ... 64: - return 3; - case 65 ... 128: - return 4; - case 129 ... BIO_MAX_PAGES: - return 5; - default: - BUG(); - } -} - -static inline int use_bip_pool(unsigned int idx) -{ - if (idx == BIOVEC_MAX_IDX) - return 1; - - return 0; -} - /** * bio_integrity_alloc - Allocate integrity payload and attach it to bio * @bio: bio to attach integrity metadata to @@ -84,37 +47,41 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, unsigned int nr_vecs) { struct bio_integrity_payload *bip; - unsigned int idx = vecs_to_idx(nr_vecs); struct bio_set *bs = bio->bi_pool; - - if (!bs) - bs = fs_bio_set; - - BUG_ON(bio == NULL); - bip = NULL; - - /* Lower order allocations come straight from slab */ - if (!use_bip_pool(idx)) - bip = kmem_cache_alloc(bip_slab[idx].slab, gfp_mask); - - /* Use mempool if lower order alloc failed or max vecs were requested */ - if (bip == NULL) { - idx = BIOVEC_MAX_IDX; /* so we free the payload properly later */ + unsigned long idx = BIO_POOL_NONE; + unsigned inline_vecs; + + if (!bs) { + bip = kmalloc(sizeof(struct bio_integrity_payload) + + sizeof(struct bio_vec) * nr_vecs, gfp_mask); + inline_vecs = nr_vecs; + } else { bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); - - if (unlikely(bip == NULL)) { - printk(KERN_ERR "%s: could not alloc bip\n", __func__); - return NULL; - } + inline_vecs = BIP_INLINE_VECS; } + if (unlikely(!bip)) + return NULL; + memset(bip, 0, sizeof(*bip)); + if (nr_vecs > inline_vecs) { + bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx, + bs->bvec_integrity_pool); + if (!bip->bip_vec) + goto err; + } else { + bip->bip_vec = bip->bip_inline_vecs; + } + bip->bip_slab = idx; bip->bip_bio = bio; bio->bi_integrity = bip; return bip; +err: + mempool_free(bip, bs->bio_integrity_pool); + return NULL; } EXPORT_SYMBOL(bio_integrity_alloc); @@ -130,20 +97,18 @@ void bio_integrity_free(struct bio *bio) struct bio_integrity_payload *bip = bio->bi_integrity; struct bio_set *bs = bio->bi_pool; - if (!bs) - bs = fs_bio_set; - - BUG_ON(bip == NULL); - - /* A cloned bio doesn't own the integrity metadata */ - if (!bio_flagged(bio, BIO_CLONED) && !bio_flagged(bio, BIO_FS_INTEGRITY) - && bip->bip_buf != NULL) + if (bip->bip_owns_buf) kfree(bip->bip_buf); - if (use_bip_pool(bip->bip_slab)) + if (bs) { + if (bip->bip_slab != BIO_POOL_NONE) + bvec_free(bs->bvec_integrity_pool, bip->bip_vec, + bip->bip_slab); + mempool_free(bip, bs->bio_integrity_pool); - else - kmem_cache_free(bip_slab[bip->bip_slab].slab, bip); + } else { + kfree(bip); + } bio->bi_integrity = NULL; } @@ -419,6 +384,7 @@ int bio_integrity_prep(struct bio *bio) return -EIO; } + bip->bip_owns_buf = 1; bip->bip_buf = buf; bip->bip_size = len; bip->bip_sector = bio->bi_sector; @@ -694,11 +660,11 @@ void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors) bp->bio1.bi_integrity = &bp->bip1; bp->bio2.bi_integrity = &bp->bip2; - bp->iv1 = bip->bip_vec[0]; - bp->iv2 = bip->bip_vec[0]; + bp->iv1 = bip->bip_vec[bip->bip_idx]; + bp->iv2 = bip->bip_vec[bip->bip_idx]; - bp->bip1.bip_vec[0] = bp->iv1; - bp->bip2.bip_vec[0] = bp->iv2; + bp->bip1.bip_vec = &bp->iv1; + bp->bip2.bip_vec = &bp->iv2; bp->iv1.bv_len = sectors * bi->tuple_size; bp->iv2.bv_offset += sectors * bi->tuple_size; @@ -746,13 +712,14 @@ EXPORT_SYMBOL(bio_integrity_clone); int bioset_integrity_create(struct bio_set *bs, int pool_size) { - unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES); - if (bs->bio_integrity_pool) return 0; - bs->bio_integrity_pool = - mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab); + bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab); + + bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size); + if (!bs->bvec_integrity_pool) + return -1; if (!bs->bio_integrity_pool) return -1; @@ -765,13 +732,14 @@ void bioset_integrity_free(struct bio_set *bs) { if (bs->bio_integrity_pool) mempool_destroy(bs->bio_integrity_pool); + + if (bs->bvec_integrity_pool) + mempool_destroy(bs->bio_integrity_pool); } EXPORT_SYMBOL(bioset_integrity_free); void __init bio_integrity_init(void) { - unsigned int i; - /* * kintegrityd won't block much but may burn a lot of CPU cycles. * Make it highpri CPU intensive wq with max concurrency of 1. @@ -781,14 +749,10 @@ void __init bio_integrity_init(void) if (!kintegrityd_wq) panic("Failed to create kintegrityd\n"); - for (i = 0 ; i < BIOVEC_NR_POOLS ; i++) { - unsigned int size; - - size = sizeof(struct bio_integrity_payload) - + bip_slab[i].nr_vecs * sizeof(struct bio_vec); - - bip_slab[i].slab = - kmem_cache_create(bip_slab[i].name, size, 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - } + bip_slab = kmem_cache_create("bio_integrity_payload", + sizeof(struct bio_integrity_payload) + + sizeof(struct bio_vec) * BIP_INLINE_VECS, + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + if (!bip_slab) + panic("Failed to create slab\n"); } @@ -19,6 +19,7 @@ #include <linux/swap.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/uio.h> #include <linux/iocontext.h> #include <linux/slab.h> #include <linux/init.h> @@ -160,12 +161,12 @@ unsigned int bvec_nr_vecs(unsigned short idx) return bvec_slabs[idx].nr_vecs; } -void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx) +void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) { BIO_BUG_ON(idx >= BIOVEC_NR_POOLS); if (idx == BIOVEC_MAX_IDX) - mempool_free(bv, bs->bvec_pool); + mempool_free(bv, pool); else { struct biovec_slab *bvs = bvec_slabs + idx; @@ -173,8 +174,8 @@ void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx) } } -struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, - struct bio_set *bs) +struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, + mempool_t *pool) { struct bio_vec *bvl; @@ -210,7 +211,7 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, */ if (*idx == BIOVEC_MAX_IDX) { fallback: - bvl = mempool_alloc(bs->bvec_pool, gfp_mask); + bvl = mempool_alloc(pool, gfp_mask); } else { struct biovec_slab *bvs = bvec_slabs + *idx; gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO); @@ -252,8 +253,8 @@ static void bio_free(struct bio *bio) __bio_free(bio); if (bs) { - if (bio_has_allocated_vec(bio)) - bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio)); + if (bio_flagged(bio, BIO_OWNS_VEC)) + bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio)); /* * If we have front padding, adjust the bio pointer before freeing @@ -297,6 +298,54 @@ void bio_reset(struct bio *bio) } EXPORT_SYMBOL(bio_reset); +static void bio_alloc_rescue(struct work_struct *work) +{ + struct bio_set *bs = container_of(work, struct bio_set, rescue_work); + struct bio *bio; + + while (1) { + spin_lock(&bs->rescue_lock); + bio = bio_list_pop(&bs->rescue_list); + spin_unlock(&bs->rescue_lock); + + if (!bio) + break; + + generic_make_request(bio); + } +} + +static void punt_bios_to_rescuer(struct bio_set *bs) +{ + struct bio_list punt, nopunt; + struct bio *bio; + + /* + * In order to guarantee forward progress we must punt only bios that + * were allocated from this bio_set; otherwise, if there was a bio on + * there for a stacking driver higher up in the stack, processing it + * could require allocating bios from this bio_set, and doing that from + * our own rescuer would be bad. + * + * Since bio lists are singly linked, pop them all instead of trying to + * remove from the middle of the list: + */ + + bio_list_init(&punt); + bio_list_init(&nopunt); + + while ((bio = bio_list_pop(current->bio_list))) + bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); + + *current->bio_list = nopunt; + + spin_lock(&bs->rescue_lock); + bio_list_merge(&bs->rescue_list, &punt); + spin_unlock(&bs->rescue_lock); + + queue_work(bs->rescue_workqueue, &bs->rescue_work); +} + /** * bio_alloc_bioset - allocate a bio for I/O * @gfp_mask: the GFP_ mask given to the slab allocator @@ -314,11 +363,27 @@ EXPORT_SYMBOL(bio_reset); * previously allocated bio for IO before attempting to allocate a new one. * Failure to do so can cause deadlocks under memory pressure. * + * Note that when running under generic_make_request() (i.e. any block + * driver), bios are not submitted until after you return - see the code in + * generic_make_request() that converts recursion into iteration, to prevent + * stack overflows. + * + * This would normally mean allocating multiple bios under + * generic_make_request() would be susceptible to deadlocks, but we have + * deadlock avoidance code that resubmits any blocked bios from a rescuer + * thread. + * + * However, we do not guarantee forward progress for allocations from other + * mempools. Doing multiple allocations from the same mempool under + * generic_make_request() should be avoided - instead, use bio_set's front_pad + * for per bio allocations. + * * RETURNS: * Pointer to new bio on success, NULL on failure. */ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) { + gfp_t saved_gfp = gfp_mask; unsigned front_pad; unsigned inline_vecs; unsigned long idx = BIO_POOL_NONE; @@ -336,7 +401,37 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) front_pad = 0; inline_vecs = nr_iovecs; } else { + /* + * generic_make_request() converts recursion to iteration; this + * means if we're running beneath it, any bios we allocate and + * submit will not be submitted (and thus freed) until after we + * return. + * + * This exposes us to a potential deadlock if we allocate + * multiple bios from the same bio_set() while running + * underneath generic_make_request(). If we were to allocate + * multiple bios (say a stacking block driver that was splitting + * bios), we would deadlock if we exhausted the mempool's + * reserve. + * + * We solve this, and guarantee forward progress, with a rescuer + * workqueue per bio_set. If we go to allocate and there are + * bios on current->bio_list, we first try the allocation + * without __GFP_WAIT; if that fails, we punt those bios we + * would be blocking to the rescuer workqueue before we retry + * with the original gfp_flags. + */ + + if (current->bio_list && !bio_list_empty(current->bio_list)) + gfp_mask &= ~__GFP_WAIT; + p = mempool_alloc(bs->bio_pool, gfp_mask); + if (!p && gfp_mask != saved_gfp) { + punt_bios_to_rescuer(bs); + gfp_mask = saved_gfp; + p = mempool_alloc(bs->bio_pool, gfp_mask); + } + front_pad = bs->front_pad; inline_vecs = BIO_INLINE_VECS; } @@ -348,9 +443,17 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) bio_init(bio); if (nr_iovecs > inline_vecs) { - bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); + bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); + if (!bvl && gfp_mask != saved_gfp) { + punt_bios_to_rescuer(bs); + gfp_mask = saved_gfp; + bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); + } + if (unlikely(!bvl)) goto err_free; + + bio->bi_flags |= 1 << BIO_OWNS_VEC; } else if (nr_iovecs) { bvl = bio->bi_inline_vecs; } @@ -652,6 +755,181 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, } EXPORT_SYMBOL(bio_add_page); +struct submit_bio_ret { + struct completion event; + int error; +}; + +static void submit_bio_wait_endio(struct bio *bio, int error) +{ + struct submit_bio_ret *ret = bio->bi_private; + + ret->error = error; + complete(&ret->event); +} + +/** + * submit_bio_wait - submit a bio, and wait until it completes + * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) + * @bio: The &struct bio which describes the I/O + * + * Simple wrapper around submit_bio(). Returns 0 on success, or the error from + * bio_endio() on failure. + */ +int submit_bio_wait(int rw, struct bio *bio) +{ + struct submit_bio_ret ret; + + rw |= REQ_SYNC; + init_completion(&ret.event); + bio->bi_private = &ret; + bio->bi_end_io = submit_bio_wait_endio; + submit_bio(rw, bio); + wait_for_completion(&ret.event); + + return ret.error; +} +EXPORT_SYMBOL(submit_bio_wait); + +/** + * bio_advance - increment/complete a bio by some number of bytes + * @bio: bio to advance + * @bytes: number of bytes to complete + * + * This updates bi_sector, bi_size and bi_idx; if the number of bytes to + * complete doesn't align with a bvec boundary, then bv_len and bv_offset will + * be updated on the last bvec as well. + * + * @bio will then represent the remaining, uncompleted portion of the io. + */ +void bio_advance(struct bio *bio, unsigned bytes) +{ + if (bio_integrity(bio)) + bio_integrity_advance(bio, bytes); + + bio->bi_sector += bytes >> 9; + bio->bi_size -= bytes; + + if (bio->bi_rw & BIO_NO_ADVANCE_ITER_MASK) + return; + + while (bytes) { + if (unlikely(bio->bi_idx >= bio->bi_vcnt)) { + WARN_ONCE(1, "bio idx %d >= vcnt %d\n", + bio->bi_idx, bio->bi_vcnt); + break; + } + + if (bytes >= bio_iovec(bio)->bv_len) { + bytes -= bio_iovec(bio)->bv_len; + bio->bi_idx++; + } else { + bio_iovec(bio)->bv_len -= bytes; + bio_iovec(bio)->bv_offset += bytes; + bytes = 0; + } + } +} +EXPORT_SYMBOL(bio_advance); + +/** + * bio_alloc_pages - allocates a single page for each bvec in a bio + * @bio: bio to allocate pages for + * @gfp_mask: flags for allocation + * + * Allocates pages up to @bio->bi_vcnt. + * + * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are + * freed. + */ +int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) +{ + int i; + struct bio_vec *bv; + + bio_for_each_segment_all(bv, bio, i) { + bv->bv_page = alloc_page(gfp_mask); + if (!bv->bv_page) { + while (--bv >= bio->bi_io_vec) + __free_page(bv->bv_page); + return -ENOMEM; + } + } + + return 0; +} +EXPORT_SYMBOL(bio_alloc_pages); + +/** + * bio_copy_data - copy contents of data buffers from one chain of bios to + * another + * @src: source bio list + * @dst: destination bio list + * + * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats + * @src and @dst as linked lists of bios. + * + * Stops when it reaches the end of either @src or @dst - that is, copies + * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). + */ +void bio_copy_data(struct bio *dst, struct bio *src) +{ + struct bio_vec *src_bv, *dst_bv; + unsigned src_offset, dst_offset, bytes; + void *src_p, *dst_p; + + src_bv = bio_iovec(src); + dst_bv = bio_iovec(dst); + + src_offset = src_bv->bv_offset; + dst_offset = dst_bv->bv_offset; + + while (1) { + if (src_offset == src_bv->bv_offset + src_bv->bv_len) { + src_bv++; + if (src_bv == bio_iovec_idx(src, src->bi_vcnt)) { + src = src->bi_next; + if (!src) + break; + + src_bv = bio_iovec(src); + } + + src_offset = src_bv->bv_offset; + } + + if (dst_offset == dst_bv->bv_offset + dst_bv->bv_len) { + dst_bv++; + if (dst_bv == bio_iovec_idx(dst, dst->bi_vcnt)) { + dst = dst->bi_next; + if (!dst) + break; + + dst_bv = bio_iovec(dst); + } + + dst_offset = dst_bv->bv_offset; + } + + bytes = min(dst_bv->bv_offset + dst_bv->bv_len - dst_offset, + src_bv->bv_offset + src_bv->bv_len - src_offset); + + src_p = kmap_atomic(src_bv->bv_page); + dst_p = kmap_atomic(dst_bv->bv_page); + + memcpy(dst_p + dst_bv->bv_offset, + src_p + src_bv->bv_offset, + bytes); + + kunmap_atomic(dst_p); + kunmap_atomic(src_p); + + src_offset += bytes; + dst_offset += bytes; + } +} +EXPORT_SYMBOL(bio_copy_data); + struct bio_map_data { struct bio_vec *iovecs; struct sg_iovec *sgvecs; @@ -714,7 +992,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, int iov_idx = 0; unsigned int iov_off = 0; - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment_all(bvec, bio, i) { char *bv_addr = page_address(bvec->bv_page); unsigned int bv_len = iovecs[i].bv_len; @@ -896,7 +1174,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, return bio; cleanup: if (!map_data) - bio_for_each_segment(bvec, bio, i) + bio_for_each_segment_all(bvec, bio, i) __free_page(bvec->bv_page); bio_put(bio); @@ -1110,7 +1388,7 @@ static void __bio_unmap_user(struct bio *bio) /* * make sure we dirty pages we wrote to */ - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment_all(bvec, bio, i) { if (bio_data_dir(bio) == READ) set_page_dirty_lock(bvec->bv_page); @@ -1216,7 +1494,7 @@ static void bio_copy_kern_endio(struct bio *bio, int err) int i; char *p = bmd->sgvecs[0].iov_base; - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment_all(bvec, bio, i) { char *addr = page_address(bvec->bv_page); int len = bmd->iovecs[i].bv_len; @@ -1256,7 +1534,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, if (!reading) { void *p = data; - bio_for_each_segment(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i) { char *addr = page_address(bvec->bv_page); memcpy(addr, p, bvec->bv_len); @@ -1301,11 +1579,11 @@ EXPORT_SYMBOL(bio_copy_kern); */ void bio_set_pages_dirty(struct bio *bio) { - struct bio_vec *bvec = bio->bi_io_vec; + struct bio_vec *bvec; int i; - for (i = 0; i < bio->bi_vcnt; i++) { - struct page *page = bvec[i].bv_page; + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; if (page && !PageCompound(page)) set_page_dirty_lock(page); @@ -1314,11 +1592,11 @@ void bio_set_pages_dirty(struct bio *bio) static void bio_release_pages(struct bio *bio) { - struct bio_vec *bvec = bio->bi_io_vec; + struct bio_vec *bvec; int i; - for (i = 0; i < bio->bi_vcnt; i++) { - struct page *page = bvec[i].bv_page; + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; if (page) put_page(page); @@ -1367,16 +1645,16 @@ static void bio_dirty_fn(struct work_struct *work) void bio_check_pages_dirty(struct bio *bio) { - struct bio_vec *bvec = bio->bi_io_vec; + struct bio_vec *bvec; int nr_clean_pages = 0; int i; - for (i = 0; i < bio->bi_vcnt; i++) { - struct page *page = bvec[i].bv_page; + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; if (PageDirty(page) || PageCompound(page)) { page_cache_release(page); - bvec[i].bv_page = NULL; + bvec->bv_page = NULL; } else { nr_clean_pages++; } @@ -1477,8 +1755,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) trace_block_split(bdev_get_queue(bi->bi_bdev), bi, bi->bi_sector + first_sectors); - BUG_ON(bi->bi_vcnt != 1 && bi->bi_vcnt != 0); - BUG_ON(bi->bi_idx != 0); + BUG_ON(bio_segments(bi) > 1); atomic_set(&bp->cnt, 3); bp->error = 0; bp->bio1 = *bi; @@ -1488,8 +1765,8 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) bp->bio1.bi_size = first_sectors << 9; if (bi->bi_vcnt != 0) { - bp->bv1 = bi->bi_io_vec[0]; - bp->bv2 = bi->bi_io_vec[0]; + bp->bv1 = *bio_iovec(bi); + bp->bv2 = *bio_iovec(bi); if (bio_is_rw(bi)) { bp->bv2.bv_offset += first_sectors << 9; @@ -1541,7 +1818,7 @@ sector_t bio_sector_offset(struct bio *bio, unsigned short index, if (index >= bio->bi_idx) index = bio->bi_vcnt - 1; - __bio_for_each_segment(bv, bio, i, 0) { + bio_for_each_segment_all(bv, bio, i) { if (i == index) { if (offset > bv->bv_offset) sectors += (offset - bv->bv_offset) / sector_sz; @@ -1559,29 +1836,25 @@ EXPORT_SYMBOL(bio_sector_offset); * create memory pools for biovec's in a bio_set. * use the global biovec slabs created for general use. */ -static int biovec_create_pools(struct bio_set *bs, int pool_entries) +mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries) { struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX; - bs->bvec_pool = mempool_create_slab_pool(pool_entries, bp->slab); - if (!bs->bvec_pool) - return -ENOMEM; - - return 0; -} - -static void biovec_free_pools(struct bio_set *bs) -{ - mempool_destroy(bs->bvec_pool); + return mempool_create_slab_pool(pool_entries, bp->slab); } void bioset_free(struct bio_set *bs) { + if (bs->rescue_workqueue) + destroy_workqueue(bs->rescue_workqueue); + if (bs->bio_pool) mempool_destroy(bs->bio_pool); + if (bs->bvec_pool) + mempool_destroy(bs->bvec_pool); + bioset_integrity_free(bs); - biovec_free_pools(bs); bio_put_slab(bs); kfree(bs); @@ -1612,6 +1885,10 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) bs->front_pad = front_pad; + spin_lock_init(&bs->rescue_lock); + bio_list_init(&bs->rescue_list); + INIT_WORK(&bs->rescue_work, bio_alloc_rescue); + bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad); if (!bs->bio_slab) { kfree(bs); @@ -1622,9 +1899,15 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) if (!bs->bio_pool) goto bad; - if (!biovec_create_pools(bs, pool_size)) - return bs; + bs->bvec_pool = biovec_create_pool(bs, pool_size); + if (!bs->bvec_pool) + goto bad; + + bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); + if (!bs->rescue_workqueue) + goto bad; + return bs; bad: bioset_free(bs); return NULL; diff --git a/fs/block_dev.c b/fs/block_dev.c index 3823d3ffb760..2091db8cdd78 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -27,6 +27,7 @@ #include <linux/namei.h> #include <linux/log2.h> #include <linux/cleancache.h> +#include <linux/aio.h> #include <asm/uaccess.h> #include "internal.h" @@ -1555,7 +1556,7 @@ static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, return 0; size -= pos; - if (size < INT_MAX) + if (size < iocb->ki_left) nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size); return generic_file_aio_read(iocb, iov, nr_segs, pos); } diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 9a8622a5b867..2b3b83296977 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -1,5 +1,5 @@ config BTRFS_FS - tristate "Btrfs filesystem Unstable disk format" + tristate "Btrfs filesystem support" select LIBCRC32C select ZLIB_INFLATE select ZLIB_DEFLATE @@ -52,3 +52,23 @@ config BTRFS_FS_CHECK_INTEGRITY In most cases, unless you are a btrfs developer who needs to verify the integrity of (super)-block write requests during the run of a regression test, say N + +config BTRFS_FS_RUN_SANITY_TESTS + bool "Btrfs will run sanity tests upon loading" + depends on BTRFS_FS + help + This will run some basic sanity tests on the free space cache + code to make sure it is acting as it should. These are mostly + regression tests and are only really interesting to btrfs devlopers. + + If unsure, say N. + +config BTRFS_DEBUG + bool "Btrfs debugging support" + depends on BTRFS_FS + help + Enable run-time debugging support for the btrfs filesystem. This may + enable additional and expensive checks with negative impact on + performance, or export extra information via sysfs. + + If unsure, say N. diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index bd605c87adfd..b4fb41558111 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -352,6 +352,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, err = __resolve_indirect_ref(fs_info, search_commit_root, time_seq, ref, parents, extent_item_pos); + if (err == -ENOMEM) + goto out; if (err) continue; @@ -367,7 +369,7 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS); if (!new_ref) { ret = -ENOMEM; - break; + goto out; } memcpy(new_ref, ref, sizeof(*ref)); new_ref->parent = node->val; @@ -377,7 +379,7 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, } ulist_reinit(parents); } - +out: ulist_free(parents); return ret; } @@ -421,7 +423,10 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, BUG_ON(!ref->wanted_disk_byte); eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte, fs_info->tree_root->leafsize, 0); - BUG_ON(!eb); + if (!eb || !extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + return -EIO; + } btrfs_tree_read_lock(eb); if (btrfs_header_level(eb) == 0) btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0); @@ -443,7 +448,7 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, * having a parent). * mode = 2: merge identical parents */ -static int __merge_refs(struct list_head *head, int mode) +static void __merge_refs(struct list_head *head, int mode) { struct list_head *pos1; @@ -489,7 +494,6 @@ static int __merge_refs(struct list_head *head, int mode) } } - return 0; } /* @@ -582,7 +586,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, default: WARN_ON(1); } - BUG_ON(ret); + if (ret) + return ret; } return 0; @@ -680,7 +685,8 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, default: WARN_ON(1); } - BUG_ON(ret); + if (ret) + return ret; ptr += btrfs_extent_inline_ref_size(type); } @@ -762,7 +768,9 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, default: WARN_ON(1); } - BUG_ON(ret); + if (ret) + return ret; + } return ret; @@ -880,18 +888,14 @@ again: if (ret) goto out; - ret = __merge_refs(&prefs, 1); - if (ret) - goto out; + __merge_refs(&prefs, 1); ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq, &prefs, extent_item_pos); if (ret) goto out; - ret = __merge_refs(&prefs, 2); - if (ret) - goto out; + __merge_refs(&prefs, 2); while (!list_empty(&prefs)) { ref = list_first_entry(&prefs, struct __prelim_ref, list); @@ -900,7 +904,8 @@ again: if (ref->count && ref->root_id && ref->parent == 0) { /* no parent == root of tree */ ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); - BUG_ON(ret < 0); + if (ret < 0) + goto out; } if (ref->count && ref->parent) { struct extent_inode_elem *eie = NULL; @@ -911,7 +916,10 @@ again: info_level); eb = read_tree_block(fs_info->extent_root, ref->parent, bsz, 0); - BUG_ON(!eb); + if (!eb || !extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + return -EIO; + } ret = find_extent_in_eb(eb, bytenr, *extent_item_pos, &eie); ref->inode_list = eie; @@ -920,6 +928,8 @@ again: ret = ulist_add_merge(refs, ref->parent, (uintptr_t)ref->inode_list, (u64 *)&eie, GFP_NOFS); + if (ret < 0) + goto out; if (!ret && extent_item_pos) { /* * we've recorded that parent, so we must extend @@ -930,7 +940,6 @@ again: eie = eie->next; eie->next = ref->inode_list; } - BUG_ON(ret < 0); } kfree(ref); } @@ -1180,6 +1189,20 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, return ret; } +/* + * this iterates to turn a name (from iref/extref) into a full filesystem path. + * Elements of the path are separated by '/' and the path is guaranteed to be + * 0-terminated. the path is only given within the current file system. + * Therefore, it never starts with a '/'. the caller is responsible to provide + * "size" bytes in "dest". the dest buffer will be filled backwards. finally, + * the start point of the resulting string is returned. this pointer is within + * dest, normally. + * in case the path buffer would overflow, the pointer is decremented further + * as if output was written to the buffer, though no more output is actually + * generated. that way, the caller can determine how much space would be + * required for the path to fit into the buffer. in that case, the returned + * value will be smaller than dest. callers must check this! + */ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, u32 name_len, unsigned long name_off, struct extent_buffer *eb_in, u64 parent, @@ -1249,32 +1272,6 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, } /* - * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements - * of the path are separated by '/' and the path is guaranteed to be - * 0-terminated. the path is only given within the current file system. - * Therefore, it never starts with a '/'. the caller is responsible to provide - * "size" bytes in "dest". the dest buffer will be filled backwards. finally, - * the start point of the resulting string is returned. this pointer is within - * dest, normally. - * in case the path buffer would overflow, the pointer is decremented further - * as if output was written to the buffer, though no more output is actually - * generated. that way, the caller can determine how much space would be - * required for the path to fit into the buffer. in that case, the returned - * value will be smaller than dest. callers must check this! - */ -char *btrfs_iref_to_path(struct btrfs_root *fs_root, - struct btrfs_path *path, - struct btrfs_inode_ref *iref, - struct extent_buffer *eb_in, u64 parent, - char *dest, u32 size) -{ - return btrfs_ref_to_path(fs_root, path, - btrfs_inode_ref_name_len(eb_in, iref), - (unsigned long)(iref + 1), - eb_in, parent, dest, size); -} - -/* * this makes the path point to (logical EXTENT_ITEM *) * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for * tree blocks and <0 on error. @@ -1461,8 +1458,6 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, iterate_extent_inodes_t *iterate, void *ctx) { int ret; - struct list_head data_refs = LIST_HEAD_INIT(data_refs); - struct list_head shared_refs = LIST_HEAD_INIT(shared_refs); struct btrfs_trans_handle *trans; struct ulist *refs = NULL; struct ulist *roots = NULL; @@ -1508,11 +1503,9 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, iterate, ctx); } ulist_free(roots); - roots = NULL; } free_leaf_list(refs); - ulist_free(roots); out: if (!search_commit_root) { btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 310a7f6d09b1..0f446d7ca2c0 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -59,9 +59,6 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist **roots); -char *btrfs_iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, - struct btrfs_inode_ref *iref, struct extent_buffer *eb, - u64 parent, char *dest, u32 size); char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, u32 name_len, unsigned long name_off, struct extent_buffer *eb_in, u64 parent, diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index d9b97d4960e6..08b286b2a2c5 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -93,7 +93,7 @@ struct btrfs_inode { unsigned long runtime_flags; - /* Keep track of who's O_SYNC/fsycing currently */ + /* Keep track of who's O_SYNC/fsyncing currently */ atomic_t sync_writers; /* full 64 bit generation number, struct vfs_inode doesn't have a big diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 15b94089abc4..b189bd1e7a3e 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -82,6 +82,10 @@ struct compressed_bio { u32 sums; }; +static int btrfs_decompress_biovec(int type, struct page **pages_in, + u64 disk_start, struct bio_vec *bvec, + int vcnt, size_t srclen); + static inline int compressed_bio_size(struct btrfs_root *root, unsigned long disk_size) { @@ -106,7 +110,6 @@ static int check_compressed_csum(struct inode *inode, u64 disk_start) { int ret; - struct btrfs_root *root = BTRFS_I(inode)->root; struct page *page; unsigned long i; char *kaddr; @@ -121,7 +124,7 @@ static int check_compressed_csum(struct inode *inode, csum = ~(u32)0; kaddr = kmap_atomic(page); - csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE); + csum = btrfs_csum_data(kaddr, csum, PAGE_CACHE_SIZE); btrfs_csum_final(csum, (char *)&csum); kunmap_atomic(kaddr); @@ -739,7 +742,7 @@ static int comp_num_workspace[BTRFS_COMPRESS_TYPES]; static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES]; static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES]; -struct btrfs_compress_op *btrfs_compress_op[] = { +static struct btrfs_compress_op *btrfs_compress_op[] = { &btrfs_zlib_compress, &btrfs_lzo_compress, }; @@ -910,8 +913,9 @@ int btrfs_compress_pages(int type, struct address_space *mapping, * be contiguous. They all correspond to the range of bytes covered by * the compressed extent. */ -int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start, - struct bio_vec *bvec, int vcnt, size_t srclen) +static int btrfs_decompress_biovec(int type, struct page **pages_in, + u64 disk_start, struct bio_vec *bvec, + int vcnt, size_t srclen) { struct list_head *workspace; int ret; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 9afb0a62ae82..0c803b4fbf93 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -30,8 +30,6 @@ int btrfs_compress_pages(int type, struct address_space *mapping, unsigned long *total_in, unsigned long *total_out, unsigned long max_out); -int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start, - struct bio_vec *bvec, int vcnt, size_t srclen); int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index ca9d8f1a3bb6..de6de8e60b46 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -37,16 +37,11 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); -static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int slot); +static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, + int level, int slot); static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb); -struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr, - u32 blocksize, u64 parent_transid, - u64 time_seq); -struct extent_buffer *btrfs_find_old_tree_block(struct btrfs_root *root, - u64 bytenr, u32 blocksize, - u64 time_seq); +static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); struct btrfs_path *btrfs_alloc_path(void) { @@ -208,7 +203,7 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) * tree until you end up with a lock on the root. A locked buffer * is returned, with a reference held. */ -struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) +static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) { struct extent_buffer *eb; @@ -361,6 +356,44 @@ static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info) } /* + * Increment the upper half of tree_mod_seq, set lower half zero. + * + * Must be called with fs_info->tree_mod_seq_lock held. + */ +static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info) +{ + u64 seq = atomic64_read(&fs_info->tree_mod_seq); + seq &= 0xffffffff00000000ull; + seq += 1ull << 32; + atomic64_set(&fs_info->tree_mod_seq, seq); + return seq; +} + +/* + * Increment the lower half of tree_mod_seq. + * + * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers + * are generated should not technically require a spin lock here. (Rationale: + * incrementing the minor while incrementing the major seq number is between its + * atomic64_read and atomic64_set calls doesn't duplicate sequence numbers, it + * just returns a unique sequence number as usual.) We have decided to leave + * that requirement in here and rethink it once we notice it really imposes a + * problem on some workload. + */ +static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info) +{ + return atomic64_inc_return(&fs_info->tree_mod_seq); +} + +/* + * return the last minor in the previous major tree_mod_seq number + */ +u64 btrfs_tree_mod_seq_prev(u64 seq) +{ + return (seq & 0xffffffff00000000ull) - 1ull; +} + +/* * This adds a new blocker to the tree mod log's blocker list if the @elem * passed does not already have a sequence number set. So when a caller expects * to record tree modifications, it should ensure to set elem->seq to zero @@ -376,10 +409,10 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, tree_mod_log_write_lock(fs_info); spin_lock(&fs_info->tree_mod_seq_lock); if (!elem->seq) { - elem->seq = btrfs_inc_tree_mod_seq(fs_info); + elem->seq = btrfs_inc_tree_mod_seq_major(fs_info); list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); } - seq = btrfs_inc_tree_mod_seq(fs_info); + seq = btrfs_inc_tree_mod_seq_minor(fs_info); spin_unlock(&fs_info->tree_mod_seq_lock); tree_mod_log_write_unlock(fs_info); @@ -524,7 +557,10 @@ static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, if (!tm) return -ENOMEM; - tm->seq = btrfs_inc_tree_mod_seq(fs_info); + spin_lock(&fs_info->tree_mod_seq_lock); + tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info); + spin_unlock(&fs_info->tree_mod_seq_lock); + return tm->seq; } @@ -643,7 +679,8 @@ __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) static noinline int tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, struct extent_buffer *old_root, - struct extent_buffer *new_root, gfp_t flags) + struct extent_buffer *new_root, gfp_t flags, + int log_removal) { struct tree_mod_elem *tm; int ret; @@ -651,7 +688,8 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, if (tree_mod_dont_log(fs_info, NULL)) return 0; - __tree_mod_log_free_eb(fs_info, old_root); + if (log_removal) + __tree_mod_log_free_eb(fs_info, old_root); ret = tree_mod_alloc(fs_info, flags, &tm); if (ret < 0) @@ -738,7 +776,7 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) static noinline void tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, struct extent_buffer *src, unsigned long dst_offset, - unsigned long src_offset, int nr_items, int log_removal) + unsigned long src_offset, int nr_items) { int ret; int i; @@ -752,12 +790,10 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, } for (i = 0; i < nr_items; i++) { - if (log_removal) { - ret = tree_mod_log_insert_key_locked(fs_info, src, - i + src_offset, - MOD_LOG_KEY_REMOVE); - BUG_ON(ret < 0); - } + ret = tree_mod_log_insert_key_locked(fs_info, src, + i + src_offset, + MOD_LOG_KEY_REMOVE); + BUG_ON(ret < 0); ret = tree_mod_log_insert_key_locked(fs_info, dst, i + dst_offset, MOD_LOG_KEY_ADD); @@ -802,11 +838,12 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) static noinline void tree_mod_log_set_root_pointer(struct btrfs_root *root, - struct extent_buffer *new_root_node) + struct extent_buffer *new_root_node, + int log_removal) { int ret; ret = tree_mod_log_insert_root(root->fs_info, root->node, - new_root_node, GFP_NOFS); + new_root_node, GFP_NOFS, log_removal); BUG_ON(ret < 0); } @@ -867,7 +904,8 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (btrfs_block_can_be_shared(root, buf)) { ret = btrfs_lookup_extent_info(trans, root, buf->start, - buf->len, &refs, &flags); + btrfs_header_level(buf), 1, + &refs, &flags); if (ret) return ret; if (refs == 0) { @@ -1028,7 +1066,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, parent_start = 0; extent_buffer_get(cow); - tree_mod_log_set_root_pointer(root, cow); + tree_mod_log_set_root_pointer(root, cow, 1); rcu_assign_pointer(root->node, cow); btrfs_free_tree_block(trans, root, buf, parent_start, @@ -1067,11 +1105,11 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, */ static struct tree_mod_elem * __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, - struct btrfs_root *root, u64 time_seq) + struct extent_buffer *eb_root, u64 time_seq) { struct tree_mod_elem *tm; struct tree_mod_elem *found = NULL; - u64 root_logical = root->node->start; + u64 root_logical = eb_root->start; int looped = 0; if (!time_seq) @@ -1105,7 +1143,6 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, found = tm; root_logical = tm->old_root.logical; - BUG_ON(root_logical == root->node->start); looped = 1; } @@ -1190,6 +1227,13 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq, btrfs_set_header_nritems(eb, n); } +/* + * Called with eb read locked. If the buffer cannot be rewinded, the same buffer + * is returned. If rewind operations happen, a fresh buffer is returned. The + * returned buffer is always read-locked. If the returned buffer is not the + * input buffer, the lock on the input buffer is released and the input buffer + * is freed (its refcount is decremented). + */ static struct extent_buffer * tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, u64 time_seq) @@ -1223,8 +1267,11 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, } extent_buffer_get(eb_rewin); + btrfs_tree_read_unlock(eb); free_extent_buffer(eb); + extent_buffer_get(eb_rewin); + btrfs_tree_read_lock(eb_rewin); __tree_mod_log_rewind(eb_rewin, time_seq, tm); WARN_ON(btrfs_header_nritems(eb_rewin) > BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root)); @@ -1243,33 +1290,35 @@ static inline struct extent_buffer * get_old_root(struct btrfs_root *root, u64 time_seq) { struct tree_mod_elem *tm; - struct extent_buffer *eb; + struct extent_buffer *eb = NULL; + struct extent_buffer *eb_root; struct extent_buffer *old; struct tree_mod_root *old_root = NULL; u64 old_generation = 0; u64 logical; u32 blocksize; - eb = btrfs_read_lock_root_node(root); - tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq); + eb_root = btrfs_read_lock_root_node(root); + tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq); if (!tm) - return root->node; + return eb_root; if (tm->op == MOD_LOG_ROOT_REPLACE) { old_root = &tm->old_root; old_generation = tm->generation; logical = old_root->logical; } else { - logical = root->node->start; + logical = eb_root->start; } tm = tree_mod_log_search(root->fs_info, logical, time_seq); if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) { - btrfs_tree_read_unlock(root->node); - free_extent_buffer(root->node); + btrfs_tree_read_unlock(eb_root); + free_extent_buffer(eb_root); blocksize = btrfs_level_size(root, old_root->level); old = read_tree_block(root, logical, blocksize, 0); - if (!old) { + if (!old || !extent_buffer_uptodate(old)) { + free_extent_buffer(old); pr_warn("btrfs: failed to read tree block %llu from get_old_root\n", logical); WARN_ON(1); @@ -1278,13 +1327,13 @@ get_old_root(struct btrfs_root *root, u64 time_seq) free_extent_buffer(old); } } else if (old_root) { - btrfs_tree_read_unlock(root->node); - free_extent_buffer(root->node); + btrfs_tree_read_unlock(eb_root); + free_extent_buffer(eb_root); eb = alloc_dummy_extent_buffer(logical, root->nodesize); } else { - eb = btrfs_clone_extent_buffer(root->node); - btrfs_tree_read_unlock(root->node); - free_extent_buffer(root->node); + eb = btrfs_clone_extent_buffer(eb_root); + btrfs_tree_read_unlock(eb_root); + free_extent_buffer(eb_root); } if (!eb) @@ -1294,7 +1343,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) if (old_root) { btrfs_set_header_bytenr(eb, eb->start); btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); - btrfs_set_header_owner(eb, root->root_key.objectid); + btrfs_set_header_owner(eb, btrfs_header_owner(eb_root)); btrfs_set_header_level(eb, old_root->level); btrfs_set_header_generation(eb, old_generation); } @@ -1311,15 +1360,15 @@ int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq) { struct tree_mod_elem *tm; int level; + struct extent_buffer *eb_root = btrfs_root_node(root); - tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq); + tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq); if (tm && tm->op == MOD_LOG_ROOT_REPLACE) { level = tm->old_root.level; } else { - rcu_read_lock(); - level = btrfs_header_level(root->node); - rcu_read_unlock(); + level = btrfs_header_level(eb_root); } + free_extent_buffer(eb_root); return level; } @@ -1514,8 +1563,10 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, if (!cur) { cur = read_tree_block(root, blocknr, blocksize, gen); - if (!cur) + if (!cur || !extent_buffer_uptodate(cur)) { + free_extent_buffer(cur); return -EIO; + } } else if (!uptodate) { err = btrfs_read_buffer(cur, gen); if (err) { @@ -1680,6 +1731,8 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, struct extent_buffer *parent, int slot) { int level = btrfs_header_level(parent); + struct extent_buffer *eb; + if (slot < 0) return NULL; if (slot >= btrfs_header_nritems(parent)) @@ -1687,9 +1740,15 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, BUG_ON(level == 0); - return read_tree_block(root, btrfs_node_blockptr(parent, slot), - btrfs_level_size(root, level - 1), - btrfs_node_ptr_generation(parent, slot)); + eb = read_tree_block(root, btrfs_node_blockptr(parent, slot), + btrfs_level_size(root, level - 1), + btrfs_node_ptr_generation(parent, slot)); + if (eb && !extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + eb = NULL; + } + + return eb; } /* @@ -1754,7 +1813,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto enospc; } - tree_mod_log_set_root_pointer(root, child); + tree_mod_log_set_root_pointer(root, child, 1); rcu_assign_pointer(root->node, child); add_root_to_dirty_list(root); @@ -1818,7 +1877,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(right) == 0) { clean_tree_block(trans, root, right); btrfs_tree_unlock(right); - del_ptr(trans, root, path, level + 1, pslot + 1); + del_ptr(root, path, level + 1, pslot + 1); root_sub_used(root, right->len); btrfs_free_tree_block(trans, root, right, 0, 1); free_extent_buffer_stale(right); @@ -1862,7 +1921,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(mid) == 0) { clean_tree_block(trans, root, mid); btrfs_tree_unlock(mid); - del_ptr(trans, root, path, level + 1, pslot); + del_ptr(root, path, level + 1, pslot); root_sub_used(root, mid->len); btrfs_free_tree_block(trans, root, mid, 0, 1); free_extent_buffer_stale(mid); @@ -2210,9 +2269,6 @@ static noinline void unlock_up(struct btrfs_path *path, int level, int no_skips = 0; struct extent_buffer *t; - if (path->really_keep_locks) - return; - for (i = level; i < BTRFS_MAX_LEVEL; i++) { if (!path->nodes[i]) break; @@ -2260,7 +2316,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) { int i; - if (path->keep_locks || path->really_keep_locks) + if (path->keep_locks) return; for (i = level; i < BTRFS_MAX_LEVEL; i++) { @@ -2493,7 +2549,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root if (!cow) write_lock_level = -1; - if (cow && (p->really_keep_locks || p->keep_locks || p->lowest_level)) + if (cow && (p->keep_locks || p->lowest_level)) write_lock_level = BTRFS_MAX_LEVEL; min_write_lock_level = write_lock_level; @@ -2795,15 +2851,9 @@ again: btrfs_clear_path_blocking(p, b, BTRFS_READ_LOCK); } + b = tree_mod_log_rewind(root->fs_info, b, time_seq); p->locks[level] = BTRFS_READ_LOCK; p->nodes[level] = b; - b = tree_mod_log_rewind(root->fs_info, b, time_seq); - if (b != p->nodes[level]) { - btrfs_tree_unlock_rw(p->nodes[level], - p->locks[level]); - p->locks[level] = 0; - p->nodes[level] = b; - } } else { p->slots[level] = slot; unlock_up(p, level, lowest_unlock, 0, NULL); @@ -2902,8 +2952,7 @@ again: * higher levels * */ -static void fixup_low_keys(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, +static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_disk_key *key, int level) { int i; @@ -2928,8 +2977,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans, * This function isn't completely safe. It's the caller's responsibility * that the new key won't break the order */ -void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, +void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *new_key) { struct btrfs_disk_key disk_key; @@ -2951,7 +2999,7 @@ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, btrfs_set_item_key(eb, &disk_key, slot); btrfs_mark_buffer_dirty(eb); if (slot == 0) - fixup_low_keys(trans, root, path, &disk_key, 1); + fixup_low_keys(root, path, &disk_key, 1); } /* @@ -2998,7 +3046,7 @@ static int push_node_left(struct btrfs_trans_handle *trans, push_items = min(src_nritems - 8, push_items); tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, - push_items, 1); + push_items); copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(dst_nritems), btrfs_node_key_ptr_offset(0), @@ -3069,7 +3117,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, sizeof(struct btrfs_key_ptr)); tree_mod_log_eb_copy(root->fs_info, dst, src, 0, - src_nritems - push_items, push_items, 1); + src_nritems - push_items, push_items); copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(src_nritems - push_items), @@ -3093,7 +3141,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, */ static noinline int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level) + struct btrfs_path *path, int level, int log_removal) { u64 lower_gen; struct extent_buffer *lower; @@ -3144,7 +3192,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); old = root->node; - tree_mod_log_set_root_pointer(root, c); + tree_mod_log_set_root_pointer(root, c, log_removal); rcu_assign_pointer(root->node, c); /* the super has an extra ref to root->node */ @@ -3221,18 +3269,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans, int mid; int ret; u32 c_nritems; - int tree_mod_log_removal = 1; c = path->nodes[level]; WARN_ON(btrfs_header_generation(c) != trans->transid); if (c == root->node) { - /* trying to split the root, lets make a new one */ - ret = insert_new_root(trans, root, path, level + 1); /* - * removal of root nodes has been logged by - * tree_mod_log_set_root_pointer due to locking + * trying to split the root, lets make a new one + * + * tree mod log: We pass 0 as log_removal parameter to + * insert_new_root, because that root buffer will be kept as a + * normal node. We are going to log removal of half of the + * elements below with tree_mod_log_eb_copy. We're holding a + * tree lock on the buffer, which is why we cannot race with + * other tree_mod_log users. */ - tree_mod_log_removal = 0; + ret = insert_new_root(trans, root, path, level + 1, 0); if (ret) return ret; } else { @@ -3270,8 +3321,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, (unsigned long)btrfs_header_chunk_tree_uuid(split), BTRFS_UUID_SIZE); - tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid, - tree_mod_log_removal); + tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); copy_extent_buffer(split, c, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(mid), @@ -3687,7 +3737,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, clean_tree_block(trans, root, right); btrfs_item_key(right, &disk_key, 0); - fixup_low_keys(trans, root, path, &disk_key, 1); + fixup_low_keys(root, path, &disk_key, 1); /* then fixup the leaf pointer in the path */ if (path->slots[0] < push_items) { @@ -3953,7 +4003,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, } if (!path->nodes[1]) { - ret = insert_new_root(trans, root, path, 1); + ret = insert_new_root(trans, root, path, 1, 1); if (ret) return ret; } @@ -4047,8 +4097,7 @@ again: path->nodes[0] = right; path->slots[0] = 0; if (path->slots[1] == 0) - fixup_low_keys(trans, root, path, - &disk_key, 1); + fixup_low_keys(root, path, &disk_key, 1); } btrfs_mark_buffer_dirty(right); return ret; @@ -4264,7 +4313,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, return ret; path->slots[0]++; - setup_items_for_insert(trans, root, path, new_key, &item_size, + setup_items_for_insert(root, path, new_key, &item_size, item_size, item_size + sizeof(struct btrfs_item), 1); leaf = path->nodes[0]; @@ -4281,9 +4330,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, * off the end of the item or if we shift the item to chop bytes off * the front. */ -void btrfs_truncate_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, +void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, u32 new_size, int from_end) { int slot; @@ -4367,7 +4414,7 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans, btrfs_set_disk_key_offset(&disk_key, offset + size_diff); btrfs_set_item_key(leaf, &disk_key, slot); if (slot == 0) - fixup_low_keys(trans, root, path, &disk_key, 1); + fixup_low_keys(root, path, &disk_key, 1); } item = btrfs_item_nr(leaf, slot); @@ -4383,8 +4430,7 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans, /* * make the item pointed to by the path bigger, data_size is the new size. */ -void btrfs_extend_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, +void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, u32 data_size) { int slot; @@ -4454,8 +4500,7 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans, * to save stack depth by doing the bulk of the work in a function * that doesn't call btrfs_search_slot */ -void setup_items_for_insert(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, +void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *cpu_key, u32 *data_size, u32 total_data, u32 total_size, int nr) { @@ -4531,7 +4576,7 @@ void setup_items_for_insert(struct btrfs_trans_handle *trans, if (slot == 0) { btrfs_cpu_key_to_disk(&disk_key, cpu_key); - fixup_low_keys(trans, root, path, &disk_key, 1); + fixup_low_keys(root, path, &disk_key, 1); } btrfs_unlock_up_safe(path, 1); btrfs_mark_buffer_dirty(leaf); @@ -4571,7 +4616,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, slot = path->slots[0]; BUG_ON(slot < 0); - setup_items_for_insert(trans, root, path, cpu_key, data_size, + setup_items_for_insert(root, path, cpu_key, data_size, total_data, total_size, nr); return 0; } @@ -4609,8 +4654,8 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root * the tree should have been previously balanced so the deletion does not * empty a node. */ -static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int level, int slot) +static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, + int level, int slot) { struct extent_buffer *parent = path->nodes[level]; u32 nritems; @@ -4642,7 +4687,7 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_disk_key disk_key; btrfs_node_key(parent, &disk_key, 0); - fixup_low_keys(trans, root, path, &disk_key, level + 1); + fixup_low_keys(root, path, &disk_key, level + 1); } btrfs_mark_buffer_dirty(parent); } @@ -4663,7 +4708,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, struct extent_buffer *leaf) { WARN_ON(btrfs_header_generation(leaf) != trans->transid); - del_ptr(trans, root, path, 1, path->slots[1]); + del_ptr(root, path, 1, path->slots[1]); /* * btrfs_free_extent is expensive, we want to make sure we @@ -4744,7 +4789,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_disk_key disk_key; btrfs_item_key(leaf, &disk_key, 0); - fixup_low_keys(trans, root, path, &disk_key, 1); + fixup_low_keys(root, path, &disk_key, 1); } /* delete the leaf if it is mostly empty */ @@ -5464,139 +5509,6 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) return btrfs_next_old_leaf(root, path, 0); } -/* Release the path up to but not including the given level */ -static void btrfs_release_level(struct btrfs_path *path, int level) -{ - int i; - - for (i = 0; i < level; i++) { - path->slots[i] = 0; - if (!path->nodes[i]) - continue; - if (path->locks[i]) { - btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); - path->locks[i] = 0; - } - free_extent_buffer(path->nodes[i]); - path->nodes[i] = NULL; - } -} - -/* - * This function assumes 2 things - * - * 1) You are using path->keep_locks - * 2) You are not inserting items. - * - * If either of these are not true do not use this function. If you need a next - * leaf with either of these not being true then this function can be easily - * adapted to do that, but at the moment these are the limitations. - */ -int btrfs_next_leaf_write(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - int del) -{ - struct extent_buffer *b; - struct btrfs_key key; - u32 nritems; - int level = 1; - int slot; - int ret = 1; - int write_lock_level = BTRFS_MAX_LEVEL; - int ins_len = del ? -1 : 0; - - WARN_ON(!(path->keep_locks || path->really_keep_locks)); - - nritems = btrfs_header_nritems(path->nodes[0]); - btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); - - while (path->nodes[level]) { - nritems = btrfs_header_nritems(path->nodes[level]); - if (!(path->locks[level] & BTRFS_WRITE_LOCK)) { -search: - btrfs_release_path(path); - ret = btrfs_search_slot(trans, root, &key, path, - ins_len, 1); - if (ret < 0) - goto out; - level = 1; - continue; - } - - if (path->slots[level] >= nritems - 1) { - level++; - continue; - } - - btrfs_release_level(path, level); - break; - } - - if (!path->nodes[level]) { - ret = 1; - goto out; - } - - path->slots[level]++; - b = path->nodes[level]; - - while (b) { - level = btrfs_header_level(b); - - if (!should_cow_block(trans, root, b)) - goto cow_done; - - btrfs_set_path_blocking(path); - ret = btrfs_cow_block(trans, root, b, - path->nodes[level + 1], - path->slots[level + 1], &b); - if (ret) - goto out; -cow_done: - path->nodes[level] = b; - btrfs_clear_path_blocking(path, NULL, 0); - if (level != 0) { - ret = setup_nodes_for_search(trans, root, path, b, - level, ins_len, - &write_lock_level); - if (ret == -EAGAIN) - goto search; - if (ret) - goto out; - - b = path->nodes[level]; - slot = path->slots[level]; - - ret = read_block_for_search(trans, root, path, - &b, level, slot, &key, 0); - if (ret == -EAGAIN) - goto search; - if (ret) - goto out; - level = btrfs_header_level(b); - if (!btrfs_try_tree_write_lock(b)) { - btrfs_set_path_blocking(path); - btrfs_tree_lock(b); - btrfs_clear_path_blocking(path, b, - BTRFS_WRITE_LOCK); - } - path->locks[level] = BTRFS_WRITE_LOCK; - path->nodes[level] = b; - path->slots[level] = 0; - } else { - path->slots[level] = 0; - ret = 0; - break; - } - } - -out: - if (ret) - btrfs_release_path(path); - - return ret; -} - int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq) { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0d82922179db..63c328a9ce95 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -340,6 +340,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) */ #define BTRFS_FS_STATE_ERROR 0 #define BTRFS_FS_STATE_REMOUNTING 1 +#define BTRFS_FS_STATE_TRANS_ABORTED 2 /* Super block flags */ /* Errors detected */ @@ -508,6 +509,7 @@ struct btrfs_super_block { #define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6) #define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7) +#define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8) #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL @@ -518,7 +520,8 @@ struct btrfs_super_block { BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ BTRFS_FEATURE_INCOMPAT_RAID56 | \ - BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) + BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ + BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) /* * A leaf is full of items. offset and size tell us where to find @@ -583,7 +586,6 @@ struct btrfs_path { unsigned int skip_locking:1; unsigned int leave_spinning:1; unsigned int search_commit_root:1; - unsigned int really_keep_locks:1; }; /* @@ -1019,9 +1021,9 @@ struct btrfs_block_group_item { */ #define BTRFS_QGROUP_STATUS_FLAG_ON (1ULL << 0) /* - * SCANNING is set during the initialization phase + * RESCAN is set during the initialization phase */ -#define BTRFS_QGROUP_STATUS_FLAG_SCANNING (1ULL << 1) +#define BTRFS_QGROUP_STATUS_FLAG_RESCAN (1ULL << 1) /* * Some qgroup entries are known to be out of date, * either because the configuration has changed in a way that @@ -1050,7 +1052,7 @@ struct btrfs_qgroup_status_item { * only used during scanning to record the progress * of the scan. It contains a logical address */ - __le64 scan; + __le64 rescan; } __attribute__ ((__packed__)); struct btrfs_qgroup_info_item { @@ -1360,6 +1362,17 @@ struct btrfs_fs_info { wait_queue_head_t transaction_blocked_wait; wait_queue_head_t async_submit_wait; + /* + * Used to protect the incompat_flags, compat_flags, compat_ro_flags + * when they are updated. + * + * Because we do not clear the flags for ever, so we needn't use + * the lock on the read side. + * + * We also needn't use the lock when we mount the fs, because + * there is no other task which will update the flag. + */ + spinlock_t super_lock; struct btrfs_super_block *super_copy; struct btrfs_super_block *super_for_commit; struct block_device *__bdev; @@ -1409,7 +1422,7 @@ struct btrfs_fs_info { /* this protects tree_mod_seq_list */ spinlock_t tree_mod_seq_lock; - atomic_t tree_mod_seq; + atomic64_t tree_mod_seq; struct list_head tree_mod_seq_list; struct seq_list tree_mod_seq_elem; @@ -1581,12 +1594,20 @@ struct btrfs_fs_info { struct rb_root qgroup_tree; spinlock_t qgroup_lock; + /* protect user change for quota operations */ + struct mutex qgroup_ioctl_lock; + /* list of dirty qgroups to be written at next commit */ struct list_head dirty_qgroups; /* used by btrfs_qgroup_record_ref for an efficient tree traversal */ u64 qgroup_seq; + /* qgroup rescan items */ + struct mutex qgroup_rescan_lock; /* protects the progress item */ + struct btrfs_key qgroup_rescan_progress; + struct btrfs_workers qgroup_rescan_workers; + /* filesystem state */ unsigned long fs_state; @@ -1808,6 +1829,12 @@ struct btrfs_ioctl_defrag_range_args { */ #define BTRFS_EXTENT_ITEM_KEY 168 +/* + * The same as the BTRFS_EXTENT_ITEM_KEY, except it's metadata we already know + * the length, so we save the level in key->offset instead of the length. + */ +#define BTRFS_METADATA_ITEM_KEY 169 + #define BTRFS_TREE_BLOCK_REF_KEY 176 #define BTRFS_EXTENT_DATA_REF_KEY 178 @@ -2766,8 +2793,10 @@ BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, static inline int btrfs_super_csum_size(struct btrfs_super_block *s) { - int t = btrfs_super_csum_type(s); - BUG_ON(t >= ARRAY_SIZE(btrfs_csum_sizes)); + u16 t = btrfs_super_csum_type(s); + /* + * csum type is validated at mount time + */ return btrfs_csum_sizes[t]; } @@ -2864,8 +2893,8 @@ BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item, version, 64); BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item, flags, 64); -BTRFS_SETGET_FUNCS(qgroup_status_scan, struct btrfs_qgroup_status_item, - scan, 64); +BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item, + rescan, 64); /* btrfs_qgroup_info_item */ BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item, @@ -3005,7 +3034,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 *refs, u64 *flags); + u64 offset, int metadata, u64 *refs, u64 *flags); int btrfs_pin_extent(struct btrfs_root *root, u64 bytenr, u64 num, int reserved); int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, @@ -3017,8 +3046,6 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group( struct btrfs_fs_info *info, u64 bytenr); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); -u64 btrfs_find_block_group(struct btrfs_root *root, - u64 search_start, u64 search_hint, int owner); struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize, u64 parent, u64 root_objectid, @@ -3028,10 +3055,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, u64 parent, int last_ref); -struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u32 blocksize, - int level); int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 root_objectid, u64 owner, @@ -3044,7 +3067,7 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, u64 data); + struct btrfs_key *ins, int is_data); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref, int for_cow); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -3084,7 +3107,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start); void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); -u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); @@ -3161,8 +3183,7 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2); int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, int type); -void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, +void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); @@ -3198,12 +3219,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct extent_buffer **cow_ret, u64 new_root_objectid); int btrfs_block_can_be_shared(struct btrfs_root *root, struct extent_buffer *buf); -void btrfs_extend_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, +void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, u32 data_size); -void btrfs_truncate_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, +void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, u32 new_size, int from_end); int btrfs_split_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -3243,8 +3261,7 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans, return btrfs_del_items(trans, root, path, path->slots[0], 1); } -void setup_items_for_insert(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, +void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *cpu_key, u32 *data_size, u32 total_data, u32 total_size, int nr); int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root @@ -3264,9 +3281,6 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, } int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); -int btrfs_next_leaf_write(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, - int del); int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); static inline int btrfs_next_old_item(struct btrfs_root *root, @@ -3281,7 +3295,6 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) { return btrfs_next_old_item(root, p, 0); } -int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); int __must_check btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, @@ -3318,10 +3331,7 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem); void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem); -static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) -{ - return atomic_inc_return(&fs_info->tree_mod_seq); -} +u64 btrfs_tree_mod_seq_prev(u64 seq); int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); /* root-item.c */ @@ -3345,9 +3355,8 @@ int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_root_item *item); -void btrfs_read_root_item(struct btrfs_root *root, - struct extent_buffer *eb, int slot, - struct btrfs_root_item *item); +void btrfs_read_root_item(struct extent_buffer *eb, int slot, + struct btrfs_root_item *item); int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct btrfs_root_item *item, struct btrfs_key *key); int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); @@ -3380,9 +3389,6 @@ struct btrfs_dir_item * btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, const char *name, int name_len); -struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len); int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -3460,16 +3466,11 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid, u64 bytenr, int mod); -u64 btrfs_file_extent_length(struct btrfs_path *path); int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 file_start, int contig); -struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, int cow); int btrfs_csum_truncate(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 isize); @@ -3531,8 +3532,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state); -int btrfs_writepages(struct address_space *mapping, - struct writeback_control *wbc); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, u64 new_dirid); int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, @@ -3542,7 +3541,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); -int btrfs_dirty_inode(struct inode *inode); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); @@ -3560,7 +3558,6 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans, int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); -int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_orphan_cleanup(struct btrfs_root *root); void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); @@ -3611,7 +3608,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct inode *inode, u64 start, u64 end); int btrfs_release_file(struct inode *inode, struct file *file); -void btrfs_drop_pages(struct page **pages, size_t num_pages); int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, @@ -3634,14 +3630,31 @@ int btrfs_sync_fs(struct super_block *sb, int wait); #ifdef CONFIG_PRINTK __printf(2, 3) -void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...); +void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); #else static inline __printf(2, 3) -void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...) +void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { } #endif +#define btrfs_emerg(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_INFO fmt, ##args) +#define btrfs_debug(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) + __printf(5, 6) void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); @@ -3663,11 +3676,28 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, disk_super = fs_info->super_copy; features = btrfs_super_incompat_flags(disk_super); if (!(features & flag)) { - features |= flag; - btrfs_set_super_incompat_flags(disk_super, features); + spin_lock(&fs_info->super_lock); + features = btrfs_super_incompat_flags(disk_super); + if (!(features & flag)) { + features |= flag; + btrfs_set_super_incompat_flags(disk_super, features); + printk(KERN_INFO "btrfs: setting %llu feature flag\n", + flag); + } + spin_unlock(&fs_info->super_lock); } } +#define btrfs_fs_incompat(fs_info, opt) \ + __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt) + +static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) +{ + struct btrfs_super_block *disk_super; + disk_super = fs_info->super_copy; + return !!(btrfs_super_incompat_flags(disk_super) & flag); +} + /* * Call btrfs_abort_transaction as early as possible when an error condition is * detected, that way the exact line number is reported. @@ -3753,7 +3783,6 @@ void btrfs_scrub_continue_super(struct btrfs_root *root); int btrfs_scrub_cancel(struct btrfs_fs_info *info); int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info, struct btrfs_device *dev); -int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid); int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, struct btrfs_scrub_progress *progress); @@ -3784,7 +3813,7 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_quota_disable(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); -int btrfs_quota_rescan(struct btrfs_fs_info *fs_info); +int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 src, u64 dst); int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 14fce27b4780..f26f38ccd194 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -202,7 +202,7 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, spin_unlock(&root->lock); } -struct btrfs_delayed_node *btrfs_first_delayed_node( +static struct btrfs_delayed_node *btrfs_first_delayed_node( struct btrfs_delayed_root *delayed_root) { struct list_head *p; @@ -221,7 +221,7 @@ out: return node; } -struct btrfs_delayed_node *btrfs_next_delayed_node( +static struct btrfs_delayed_node *btrfs_next_delayed_node( struct btrfs_delayed_node *node) { struct btrfs_delayed_root *delayed_root; @@ -282,7 +282,7 @@ static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node) __btrfs_release_delayed_node(node, 0); } -struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( +static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( struct btrfs_delayed_root *delayed_root) { struct list_head *p; @@ -308,7 +308,7 @@ static inline void btrfs_release_prepared_delayed_node( __btrfs_release_delayed_node(node, 1); } -struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len) +static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len) { struct btrfs_delayed_item *item; item = kmalloc(sizeof(*item) + data_len, GFP_NOFS); @@ -383,7 +383,7 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_item( return NULL; } -struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item( +static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item( struct btrfs_delayed_node *delayed_node, struct btrfs_key *key) { @@ -394,45 +394,6 @@ struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item( return item; } -struct btrfs_delayed_item *__btrfs_lookup_delayed_deletion_item( - struct btrfs_delayed_node *delayed_node, - struct btrfs_key *key) -{ - struct btrfs_delayed_item *item; - - item = __btrfs_lookup_delayed_item(&delayed_node->del_root, key, - NULL, NULL); - return item; -} - -struct btrfs_delayed_item *__btrfs_search_delayed_insertion_item( - struct btrfs_delayed_node *delayed_node, - struct btrfs_key *key) -{ - struct btrfs_delayed_item *item, *next; - - item = __btrfs_lookup_delayed_item(&delayed_node->ins_root, key, - NULL, &next); - if (!item) - item = next; - - return item; -} - -struct btrfs_delayed_item *__btrfs_search_delayed_deletion_item( - struct btrfs_delayed_node *delayed_node, - struct btrfs_key *key) -{ - struct btrfs_delayed_item *item, *next; - - item = __btrfs_lookup_delayed_item(&delayed_node->del_root, key, - NULL, &next); - if (!item) - item = next; - - return item; -} - static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, struct btrfs_delayed_item *ins, int action) @@ -535,7 +496,7 @@ static void btrfs_release_delayed_item(struct btrfs_delayed_item *item) } } -struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item( +static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item( struct btrfs_delayed_node *delayed_node) { struct rb_node *p; @@ -548,7 +509,7 @@ struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item( return item; } -struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item( +static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item( struct btrfs_delayed_node *delayed_node) { struct rb_node *p; @@ -561,7 +522,7 @@ struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item( return item; } -struct btrfs_delayed_item *__btrfs_next_delayed_item( +static struct btrfs_delayed_item *__btrfs_next_delayed_item( struct btrfs_delayed_item *item) { struct rb_node *p; @@ -766,10 +727,9 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root, * This helper will insert some continuous items into the same leaf according * to the free space of the leaf. */ -static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_delayed_item *item) +static int btrfs_batch_insert_items(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_item *item) { struct btrfs_delayed_item *curr, *next; int free_space; @@ -848,7 +808,7 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans, btrfs_clear_path_blocking(path, NULL, 0); /* insert the keys of the items */ - setup_items_for_insert(trans, root, path, keys, data_size, + setup_items_for_insert(root, path, keys, data_size, total_data_size, total_size, nitems); /* insert the dir index items */ @@ -932,7 +892,7 @@ do_again: if (curr && btrfs_is_continuous_delayed_item(prev, curr)) { /* insert the continuous items into the same leaf */ path->slots[0]++; - btrfs_batch_insert_items(trans, root, path, curr); + btrfs_batch_insert_items(root, path, curr); } btrfs_release_delayed_item(prev); btrfs_mark_buffer_dirty(path->nodes[0]); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index b7a0641ead77..c219463fb1fd 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -40,16 +40,19 @@ struct kmem_cache *btrfs_delayed_extent_op_cachep; * compare two delayed tree backrefs with same bytenr and type */ static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2, - struct btrfs_delayed_tree_ref *ref1) + struct btrfs_delayed_tree_ref *ref1, int type) { - if (ref1->root < ref2->root) - return -1; - if (ref1->root > ref2->root) - return 1; - if (ref1->parent < ref2->parent) - return -1; - if (ref1->parent > ref2->parent) - return 1; + if (type == BTRFS_TREE_BLOCK_REF_KEY) { + if (ref1->root < ref2->root) + return -1; + if (ref1->root > ref2->root) + return 1; + } else { + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; + } return 0; } @@ -113,7 +116,8 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2, if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), - btrfs_delayed_node_to_tree_ref(ref1)); + btrfs_delayed_node_to_tree_ref(ref1), + ref1->type); } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY || ref1->type == BTRFS_SHARED_DATA_REF_KEY) { return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2), @@ -357,8 +361,10 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, elem = list_first_entry(&fs_info->tree_mod_seq_list, struct seq_list, list); if (seq >= elem->seq) { - pr_debug("holding back delayed_ref %llu, lowest is " - "%llu (%p)\n", seq, elem->seq, delayed_refs); + pr_debug("holding back delayed_ref %#x.%x, lowest is %#x.%x (%p)\n", + (u32)(seq >> 32), (u32)seq, + (u32)(elem->seq >> 32), (u32)elem->seq, + delayed_refs); ret = 1; } } diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 502c2158167c..79e594e341c7 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -21,6 +21,10 @@ #include "hash.h" #include "transaction.h" +static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len); + /* * insert a name into a directory, doing overflow properly if there is a hash * collision. data_size indicates how big the item inserted should be. On @@ -49,7 +53,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle di = btrfs_match_dir_item_name(root, path, name, name_len); if (di) return ERR_PTR(-EEXIST); - btrfs_extend_item(trans, root, path, data_size); + btrfs_extend_item(root, path, data_size); } else if (ret < 0) return ERR_PTR(ret); WARN_ON(ret > 0); @@ -379,7 +383,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, * this walks through all the entries in a dir item and finds one * for a specific name. */ -struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, +static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, struct btrfs_path *path, const char *name, int name_len) { @@ -442,8 +446,7 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_len - (ptr + sub_item_len - start)); - btrfs_truncate_item(trans, root, path, - item_len - sub_item_len, 1); + btrfs_truncate_item(root, path, item_len - sub_item_len, 1); } return ret; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6d19a0a554aa..4e9ebe1f1827 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -30,6 +30,7 @@ #include <linux/slab.h> #include <linux/migrate.h> #include <linux/ratelimit.h> +#include <linux/uuid.h> #include <asm/unaligned.h> #include "compat.h" #include "ctree.h" @@ -69,6 +70,8 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, int mark); static int btrfs_destroy_pinned_extent(struct btrfs_root *root, struct extent_io_tree *pinned_extents); +static int btrfs_cleanup_transaction(struct btrfs_root *root); +static void btrfs_error_commit_super(struct btrfs_root *root); /* * end_io_wq structs are used to do processing in task context when an IO is @@ -222,7 +225,7 @@ static struct extent_map *btree_get_extent(struct inode *inode, em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); + ret = add_extent_mapping(em_tree, em, 0); if (ret == -EEXIST) { free_extent_map(em); em = lookup_extent_mapping(em_tree, start, len); @@ -238,7 +241,7 @@ out: return em; } -u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len) +u32 btrfs_csum_data(char *data, u32 seed, size_t len) { return crc32c(seed, data, len); } @@ -274,7 +277,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, if (err) return 1; cur_len = min(len, map_len - (offset - map_start)); - crc = btrfs_csum_data(root, kaddr + offset - map_start, + crc = btrfs_csum_data(kaddr + offset - map_start, crc, cur_len); len -= cur_len; offset += cur_len; @@ -354,6 +357,49 @@ out: } /* + * Return 0 if the superblock checksum type matches the checksum value of that + * algorithm. Pass the raw disk superblock data. + */ +static int btrfs_check_super_csum(char *raw_disk_sb) +{ + struct btrfs_super_block *disk_sb = + (struct btrfs_super_block *)raw_disk_sb; + u16 csum_type = btrfs_super_csum_type(disk_sb); + int ret = 0; + + if (csum_type == BTRFS_CSUM_TYPE_CRC32) { + u32 crc = ~(u32)0; + const int csum_size = sizeof(crc); + char result[csum_size]; + + /* + * The super_block structure does not span the whole + * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space + * is filled with zeros and is included in the checkum. + */ + crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE, + crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); + btrfs_csum_final(crc, result); + + if (memcmp(raw_disk_sb, result, csum_size)) + ret = 1; + + if (ret && btrfs_super_generation(disk_sb) < 10) { + printk(KERN_WARNING "btrfs: super block crcs don't match, older mkfs detected\n"); + ret = 0; + } + } + + if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) { + printk(KERN_ERR "btrfs: unsupported checksum algorithm %u\n", + csum_type); + ret = 1; + } + + return ret; +} + +/* * helper to read a given tree block, doing retries as required when * the checksums don't match and we have alternate mirrors to try. */ @@ -530,41 +576,6 @@ static noinline int check_leaf(struct btrfs_root *root, return 0; } -struct extent_buffer *find_eb_for_page(struct extent_io_tree *tree, - struct page *page, int max_walk) -{ - struct extent_buffer *eb; - u64 start = page_offset(page); - u64 target = start; - u64 min_start; - - if (start < max_walk) - min_start = 0; - else - min_start = start - max_walk; - - while (start >= min_start) { - eb = find_extent_buffer(tree, start, 0); - if (eb) { - /* - * we found an extent buffer and it contains our page - * horray! - */ - if (eb->start <= target && - eb->start + eb->len > target) - return eb; - - /* we found an extent buffer that wasn't for us */ - free_extent_buffer(eb); - return NULL; - } - if (start == 0) - break; - start -= PAGE_CACHE_SIZE; - } - return NULL; -} - static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state, int mirror) { @@ -613,6 +624,12 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, goto err; } found_level = btrfs_header_level(eb); + if (found_level >= BTRFS_MAX_LEVEL) { + btrfs_info(root->fs_info, "bad tree block level %d\n", + (int)btrfs_header_level(eb)); + ret = -EIO; + goto err; + } btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, found_level); @@ -636,10 +653,9 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, if (!ret) set_extent_buffer_uptodate(eb); err: - if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { - clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); + if (reads_done && + test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) btree_readahead_hook(root, eb, eb->start, ret); - } if (ret) { /* @@ -993,14 +1009,8 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags) { if (PageWriteback(page) || PageDirty(page)) return 0; - /* - * We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing - * slab allocation from alloc_extent_state down the callchain where - * it'd hit a BUG_ON as those flags are not allowed. - */ - gfp_flags &= ~GFP_SLAB_BUG_MASK; - return try_release_extent_buffer(page, gfp_flags); + return try_release_extent_buffer(page); } static void btree_invalidatepage(struct page *page, unsigned long offset) @@ -1275,6 +1285,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, struct btrfs_key key; int ret = 0; u64 bytenr; + uuid_le uuid; root = btrfs_alloc_root(fs_info); if (!root) @@ -1324,6 +1335,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_set_root_used(&root->root_item, leaf->len); btrfs_set_root_last_snapshot(&root->root_item, 0); btrfs_set_root_dirid(&root->root_item, 0); + uuid_le_gen(&uuid); + memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE); root->root_item.drop_level = 0; key.objectid = objectid; @@ -1476,7 +1489,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, if (ret == 0) { l = path->nodes[0]; slot = path->slots[0]; - btrfs_read_root_item(tree_root, l, slot, &root->root_item); + btrfs_read_root_item(l, slot, &root->root_item); memcpy(&root->root_key, location, sizeof(*location)); } btrfs_free_path(path); @@ -1491,6 +1504,14 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); + if (!root->node || !extent_buffer_uptodate(root->node)) { + ret = (!root->node) ? -ENOMEM : -EIO; + + free_extent_buffer(root->node); + kfree(root); + return ERR_PTR(ret); + } + root->commit_root = btrfs_root_node(root); BUG_ON(!root->node); /* -ENOMEM */ out: @@ -1658,15 +1679,20 @@ static int cleaner_kthread(void *arg) struct btrfs_root *root = arg; do { + int again = 0; + if (!(root->fs_info->sb->s_flags & MS_RDONLY) && - mutex_trylock(&root->fs_info->cleaner_mutex)) { - btrfs_run_delayed_iputs(root); - btrfs_clean_old_snapshots(root); - mutex_unlock(&root->fs_info->cleaner_mutex); + down_read_trylock(&root->fs_info->sb->s_umount)) { + if (mutex_trylock(&root->fs_info->cleaner_mutex)) { + btrfs_run_delayed_iputs(root); + again = btrfs_clean_one_deleted_snapshot(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + } btrfs_run_defrag_inodes(root->fs_info); + up_read(&root->fs_info->sb->s_umount); } - if (!try_to_freeze()) { + if (!try_to_freeze() && !again) { set_current_state(TASK_INTERRUPTIBLE); if (!kthread_should_stop()) schedule(); @@ -1935,6 +1961,28 @@ static noinline int next_root_backup(struct btrfs_fs_info *info, return 0; } +/* helper to cleanup workers */ +static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) +{ + btrfs_stop_workers(&fs_info->generic_worker); + btrfs_stop_workers(&fs_info->fixup_workers); + btrfs_stop_workers(&fs_info->delalloc_workers); + btrfs_stop_workers(&fs_info->workers); + btrfs_stop_workers(&fs_info->endio_workers); + btrfs_stop_workers(&fs_info->endio_meta_workers); + btrfs_stop_workers(&fs_info->endio_raid56_workers); + btrfs_stop_workers(&fs_info->rmw_workers); + btrfs_stop_workers(&fs_info->endio_meta_write_workers); + btrfs_stop_workers(&fs_info->endio_write_workers); + btrfs_stop_workers(&fs_info->endio_freespace_worker); + btrfs_stop_workers(&fs_info->submit_workers); + btrfs_stop_workers(&fs_info->delayed_workers); + btrfs_stop_workers(&fs_info->caching_workers); + btrfs_stop_workers(&fs_info->readahead_workers); + btrfs_stop_workers(&fs_info->flush_workers); + btrfs_stop_workers(&fs_info->qgroup_rescan_workers); +} + /* helper to cleanup tree roots */ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) { @@ -1972,6 +2020,36 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) } } +static void del_fs_roots(struct btrfs_fs_info *fs_info) +{ + int ret; + struct btrfs_root *gang[8]; + int i; + + while (!list_empty(&fs_info->dead_roots)) { + gang[0] = list_entry(fs_info->dead_roots.next, + struct btrfs_root, root_list); + list_del(&gang[0]->root_list); + + if (gang[0]->in_radix) { + btrfs_free_fs_root(fs_info, gang[0]); + } else { + free_extent_buffer(gang[0]->node); + free_extent_buffer(gang[0]->commit_root); + kfree(gang[0]); + } + } + + while (1) { + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang)); + if (!ret) + break; + for (i = 0; i < ret; i++) + btrfs_free_fs_root(fs_info, gang[i]); + } +} int open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, @@ -2060,6 +2138,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->defrag_inodes_lock); spin_lock_init(&fs_info->free_chunk_lock); spin_lock_init(&fs_info->tree_mod_seq_lock); + spin_lock_init(&fs_info->super_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->reloc_mutex); seqlock_init(&fs_info->profiles_lock); @@ -2083,7 +2162,7 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->async_submit_draining, 0); atomic_set(&fs_info->nr_async_bios, 0); atomic_set(&fs_info->defrag_running, 0); - atomic_set(&fs_info->tree_mod_seq, 0); + atomic64_set(&fs_info->tree_mod_seq, 0); fs_info->sb = sb; fs_info->max_inline = 8192 * 1024; fs_info->metadata_ratio = 0; @@ -2187,11 +2266,13 @@ int open_ctree(struct super_block *sb, mutex_init(&fs_info->dev_replace.lock); spin_lock_init(&fs_info->qgroup_lock); + mutex_init(&fs_info->qgroup_ioctl_lock); fs_info->qgroup_tree = RB_ROOT; INIT_LIST_HEAD(&fs_info->dirty_qgroups); fs_info->qgroup_seq = 1; fs_info->quota_enabled = 0; fs_info->pending_quota_state = 0; + mutex_init(&fs_info->qgroup_rescan_lock); btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); btrfs_init_free_cluster(&fs_info->data_alloc_cluster); @@ -2211,12 +2292,31 @@ int open_ctree(struct super_block *sb, fs_info, BTRFS_ROOT_TREE_OBJECTID); invalidate_bdev(fs_devices->latest_bdev); + + /* + * Read super block and check the signature bytes only + */ bh = btrfs_read_dev_super(fs_devices->latest_bdev); if (!bh) { err = -EINVAL; goto fail_alloc; } + /* + * We want to check superblock checksum, the type is stored inside. + * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). + */ + if (btrfs_check_super_csum(bh->b_data)) { + printk(KERN_ERR "btrfs: superblock checksum mismatch\n"); + err = -EINVAL; + goto fail_alloc; + } + + /* + * super_copy is zeroed at allocation time and we never touch the + * following bytes up to INFO_SIZE, the checksum is calculated from + * the whole block of INFO_SIZE + */ memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy)); memcpy(fs_info->super_for_commit, fs_info->super_copy, sizeof(*fs_info->super_for_commit)); @@ -2224,6 +2324,13 @@ int open_ctree(struct super_block *sb, memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE); + ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); + if (ret) { + printk(KERN_ERR "btrfs: superblock contains fatal errors\n"); + err = -EINVAL; + goto fail_alloc; + } + disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) goto fail_alloc; @@ -2232,13 +2339,6 @@ int open_ctree(struct super_block *sb, if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); - ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); - if (ret) { - printk(KERN_ERR "btrfs: superblock contains fatal errors\n"); - err = ret; - goto fail_alloc; - } - /* * run through our array of backup supers and setup * our ring pointer to the oldest one @@ -2290,6 +2390,9 @@ int open_ctree(struct super_block *sb, if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO) features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) + printk(KERN_ERR "btrfs: has skinny extents\n"); + /* * flag our filesystem as having big metadata blocks if * they are bigger than the page size @@ -2319,6 +2422,10 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } + /* + * Needn't use the lock because there is no other task which will + * update the flag. + */ btrfs_set_super_incompat_flags(disk_super, features); features = btrfs_super_compat_ro_flags(disk_super) & @@ -2394,6 +2501,8 @@ int open_ctree(struct super_block *sb, btrfs_init_workers(&fs_info->readahead_workers, "readahead", fs_info->thread_pool_size, &fs_info->generic_worker); + btrfs_init_workers(&fs_info->qgroup_rescan_workers, "qgroup-rescan", 1, + &fs_info->generic_worker); /* * endios are largely parallel and should have a very @@ -2428,6 +2537,7 @@ int open_ctree(struct super_block *sb, ret |= btrfs_start_workers(&fs_info->caching_workers); ret |= btrfs_start_workers(&fs_info->readahead_workers); ret |= btrfs_start_workers(&fs_info->flush_workers); + ret |= btrfs_start_workers(&fs_info->qgroup_rescan_workers); if (ret) { err = -ENOMEM; goto fail_sb_buffer; @@ -2475,8 +2585,8 @@ int open_ctree(struct super_block *sb, chunk_root->node = read_tree_block(chunk_root, btrfs_super_chunk_root(disk_super), blocksize, generation); - BUG_ON(!chunk_root->node); /* -ENOMEM */ - if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { + if (!chunk_root->node || + !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", sb->s_id); goto fail_tree_roots; @@ -2661,6 +2771,13 @@ retry_root_backup: log_tree_root->node = read_tree_block(tree_root, bytenr, blocksize, generation + 1); + if (!log_tree_root->node || + !extent_buffer_uptodate(log_tree_root->node)) { + printk(KERN_ERR "btrfs: failed to read log tree\n"); + free_extent_buffer(log_tree_root->node); + kfree(log_tree_root); + goto fail_trans_kthread; + } /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); if (ret) { @@ -2740,6 +2857,8 @@ fail_qgroup: btrfs_free_qgroup_config(fs_info); fail_trans_kthread: kthread_stop(fs_info->transaction_kthread); + del_fs_roots(fs_info); + btrfs_cleanup_transaction(fs_info->tree_root); fail_cleaner: kthread_stop(fs_info->cleaner_kthread); @@ -2750,6 +2869,7 @@ fail_cleaner: filemap_write_and_wait(fs_info->btree_inode->i_mapping); fail_block_groups: + btrfs_put_block_group_cache(fs_info); btrfs_free_block_groups(fs_info); fail_tree_roots: @@ -2757,22 +2877,7 @@ fail_tree_roots: invalidate_inode_pages2(fs_info->btree_inode->i_mapping); fail_sb_buffer: - btrfs_stop_workers(&fs_info->generic_worker); - btrfs_stop_workers(&fs_info->readahead_workers); - btrfs_stop_workers(&fs_info->fixup_workers); - btrfs_stop_workers(&fs_info->delalloc_workers); - btrfs_stop_workers(&fs_info->workers); - btrfs_stop_workers(&fs_info->endio_workers); - btrfs_stop_workers(&fs_info->endio_meta_workers); - btrfs_stop_workers(&fs_info->endio_raid56_workers); - btrfs_stop_workers(&fs_info->rmw_workers); - btrfs_stop_workers(&fs_info->endio_meta_write_workers); - btrfs_stop_workers(&fs_info->endio_write_workers); - btrfs_stop_workers(&fs_info->endio_freespace_worker); - btrfs_stop_workers(&fs_info->submit_workers); - btrfs_stop_workers(&fs_info->delayed_workers); - btrfs_stop_workers(&fs_info->caching_workers); - btrfs_stop_workers(&fs_info->flush_workers); + btrfs_stop_all_workers(fs_info); fail_alloc: fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); @@ -2904,7 +3009,10 @@ static int write_dev_supers(struct btrfs_device *device, if (wait) { bh = __find_get_block(device->bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); - BUG_ON(!bh); + if (!bh) { + errors++; + continue; + } wait_on_buffer(bh); if (!buffer_uptodate(bh)) errors++; @@ -2919,7 +3027,7 @@ static int write_dev_supers(struct btrfs_device *device, btrfs_set_super_bytenr(sb, bytenr); crc = ~(u32)0; - crc = btrfs_csum_data(NULL, (char *)sb + + crc = btrfs_csum_data((char *)sb + BTRFS_CSUM_SIZE, crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); @@ -2931,6 +3039,13 @@ static int write_dev_supers(struct btrfs_device *device, */ bh = __getblk(device->bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); + if (!bh) { + printk(KERN_ERR "btrfs: couldn't get super " + "buffer head for bytenr %Lu\n", bytenr); + errors++; + continue; + } + memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); /* one reference for submit_bh */ @@ -3153,7 +3268,7 @@ int btrfs_calc_num_tolerated_disk_barrier_failures( return num_tolerated_disk_barrier_failures; } -int write_all_supers(struct btrfs_root *root, int max_mirrors) +static int write_all_supers(struct btrfs_root *root, int max_mirrors) { struct list_head *head; struct btrfs_device *dev; @@ -3283,37 +3398,6 @@ static void free_fs_root(struct btrfs_root *root) kfree(root); } -static void del_fs_roots(struct btrfs_fs_info *fs_info) -{ - int ret; - struct btrfs_root *gang[8]; - int i; - - while (!list_empty(&fs_info->dead_roots)) { - gang[0] = list_entry(fs_info->dead_roots.next, - struct btrfs_root, root_list); - list_del(&gang[0]->root_list); - - if (gang[0]->in_radix) { - btrfs_free_fs_root(fs_info, gang[0]); - } else { - free_extent_buffer(gang[0]->node); - free_extent_buffer(gang[0]->commit_root); - kfree(gang[0]); - } - } - - while (1) { - ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, - (void **)gang, 0, - ARRAY_SIZE(gang)); - if (!ret) - break; - for (i = 0; i < ret; i++) - btrfs_free_fs_root(fs_info, gang[i]); - } -} - int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) { u64 root_objectid = 0; @@ -3349,8 +3433,8 @@ int btrfs_commit_super(struct btrfs_root *root) mutex_lock(&root->fs_info->cleaner_mutex); btrfs_run_delayed_iputs(root); - btrfs_clean_old_snapshots(root); mutex_unlock(&root->fs_info->cleaner_mutex); + wake_up_process(root->fs_info->cleaner_kthread); /* wait until ongoing cleanup work done */ down_write(&root->fs_info->cleanup_work_sem); @@ -3426,20 +3510,7 @@ int close_ctree(struct btrfs_root *root) percpu_counter_sum(&fs_info->delalloc_bytes)); } - free_extent_buffer(fs_info->extent_root->node); - free_extent_buffer(fs_info->extent_root->commit_root); - free_extent_buffer(fs_info->tree_root->node); - free_extent_buffer(fs_info->tree_root->commit_root); - free_extent_buffer(fs_info->chunk_root->node); - free_extent_buffer(fs_info->chunk_root->commit_root); - free_extent_buffer(fs_info->dev_root->node); - free_extent_buffer(fs_info->dev_root->commit_root); - free_extent_buffer(fs_info->csum_root->node); - free_extent_buffer(fs_info->csum_root->commit_root); - if (fs_info->quota_root) { - free_extent_buffer(fs_info->quota_root->node); - free_extent_buffer(fs_info->quota_root->commit_root); - } + free_root_pointers(fs_info, 1); btrfs_free_block_groups(fs_info); @@ -3447,22 +3518,7 @@ int close_ctree(struct btrfs_root *root) iput(fs_info->btree_inode); - btrfs_stop_workers(&fs_info->generic_worker); - btrfs_stop_workers(&fs_info->fixup_workers); - btrfs_stop_workers(&fs_info->delalloc_workers); - btrfs_stop_workers(&fs_info->workers); - btrfs_stop_workers(&fs_info->endio_workers); - btrfs_stop_workers(&fs_info->endio_meta_workers); - btrfs_stop_workers(&fs_info->endio_raid56_workers); - btrfs_stop_workers(&fs_info->rmw_workers); - btrfs_stop_workers(&fs_info->endio_meta_write_workers); - btrfs_stop_workers(&fs_info->endio_write_workers); - btrfs_stop_workers(&fs_info->endio_freespace_worker); - btrfs_stop_workers(&fs_info->submit_workers); - btrfs_stop_workers(&fs_info->delayed_workers); - btrfs_stop_workers(&fs_info->caching_workers); - btrfs_stop_workers(&fs_info->readahead_workers); - btrfs_stop_workers(&fs_info->flush_workers); + btrfs_stop_all_workers(fs_info); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_test_opt(root, CHECK_INTEGRITY)) @@ -3567,18 +3623,13 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only) { - if (btrfs_super_csum_type(fs_info->super_copy) >= ARRAY_SIZE(btrfs_csum_sizes)) { - printk(KERN_ERR "btrfs: unsupported checksum algorithm\n"); - return -EINVAL; - } - - if (read_only) - return 0; - + /* + * Placeholder for checks + */ return 0; } -void btrfs_error_commit_super(struct btrfs_root *root) +static void btrfs_error_commit_super(struct btrfs_root *root) { mutex_lock(&root->fs_info->cleaner_mutex); btrfs_run_delayed_iputs(root); @@ -3669,6 +3720,9 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, continue; } + if (head->must_insert_reserved) + btrfs_pin_extent(root, ref->bytenr, + ref->num_bytes, 1); btrfs_free_delayed_extent_op(head->extent_op); delayed_refs->num_heads--; if (list_empty(&head->cluster)) @@ -3740,13 +3794,9 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, int mark) { int ret; - struct page *page; - struct inode *btree_inode = root->fs_info->btree_inode; struct extent_buffer *eb; u64 start = 0; u64 end; - u64 offset; - unsigned long index; while (1) { ret = find_first_extent_bit(dirty_pages, start, &start, &end, @@ -3756,36 +3806,17 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); while (start <= end) { - index = start >> PAGE_CACHE_SHIFT; - start = (u64)(index + 1) << PAGE_CACHE_SHIFT; - page = find_get_page(btree_inode->i_mapping, index); - if (!page) + eb = btrfs_find_tree_block(root, start, + root->leafsize); + start += eb->len; + if (!eb) continue; - offset = page_offset(page); - - spin_lock(&dirty_pages->buffer_lock); - eb = radix_tree_lookup( - &(&BTRFS_I(page->mapping->host)->io_tree)->buffer, - offset >> PAGE_CACHE_SHIFT); - spin_unlock(&dirty_pages->buffer_lock); - if (eb) - ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY, - &eb->bflags); - if (PageWriteback(page)) - end_page_writeback(page); - - lock_page(page); - if (PageDirty(page)) { - clear_page_dirty_for_io(page); - spin_lock_irq(&page->mapping->tree_lock); - radix_tree_tag_clear(&page->mapping->page_tree, - page_index(page), - PAGECACHE_TAG_DIRTY); - spin_unlock_irq(&page->mapping->tree_lock); - } + wait_on_extent_buffer_writeback(eb); - unlock_page(page); - page_cache_release(page); + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, + &eb->bflags)) + clear_extent_buffer_dirty(eb); + free_extent_buffer_stale(eb); } } @@ -3866,7 +3897,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, */ } -int btrfs_cleanup_transaction(struct btrfs_root *root) +static int btrfs_cleanup_transaction(struct btrfs_root *root) { struct btrfs_transaction *t; LIST_HEAD(list); @@ -3887,10 +3918,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) btrfs_destroy_delayed_refs(t, root); - btrfs_block_rsv_release(root, - &root->fs_info->trans_block_rsv, - t->dirty_pages.dirty_bytes); - /* FIXME: cleanup wait for commit */ t->in_commit = 1; t->blocked = 1; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 034d7dc552b2..be69ce1b07a2 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -61,7 +61,6 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors); struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); int btrfs_commit_super(struct btrfs_root *root); -void btrfs_error_commit_super(struct btrfs_root *root); struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, @@ -77,7 +76,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); int btrfs_set_buffer_uptodate(struct extent_buffer *buf); int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); -u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len); +u32 btrfs_csum_data(char *data, u32 seed, size_t len); void btrfs_csum_final(u32 crc, char *result); int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, int metadata); @@ -93,10 +92,8 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_cleanup_transaction(struct btrfs_root *root); void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, struct btrfs_root *root); -void btrfs_abort_devices(struct btrfs_root *root); struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 objectid); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3d551231caba..2305b5c5cf00 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -105,6 +105,8 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, u64 num_bytes, int reserve); static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); +int btrfs_pin_extent(struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int reserved); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -270,9 +272,27 @@ static int exclude_super_stripes(struct btrfs_root *root, return ret; while (nr--) { - cache->bytes_super += stripe_len; - ret = add_excluded_extent(root, logical[nr], - stripe_len); + u64 start, len; + + if (logical[nr] > cache->key.objectid + + cache->key.offset) + continue; + + if (logical[nr] + stripe_len <= cache->key.objectid) + continue; + + start = logical[nr]; + if (start < cache->key.objectid) { + start = cache->key.objectid; + len = (logical[nr] + stripe_len) - start; + } else { + len = min_t(u64, stripe_len, + cache->key.objectid + + cache->key.offset - start); + } + + cache->bytes_super += len; + ret = add_excluded_extent(root, start, len); if (ret) { kfree(logical); return ret; @@ -419,8 +439,7 @@ again: if (ret) break; - if (need_resched() || - btrfs_next_leaf(extent_root, path)) { + if (need_resched()) { caching_ctl->progress = last; btrfs_release_path(path); up_read(&fs_info->extent_commit_sem); @@ -428,6 +447,12 @@ again: cond_resched(); goto again; } + + ret = btrfs_next_leaf(extent_root, path); + if (ret < 0) + goto err; + if (ret) + break; leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); continue; @@ -442,11 +467,16 @@ again: block_group->key.offset) break; - if (key.type == BTRFS_EXTENT_ITEM_KEY) { + if (key.type == BTRFS_EXTENT_ITEM_KEY || + key.type == BTRFS_METADATA_ITEM_KEY) { total_found += add_new_free_space(block_group, fs_info, last, key.objectid); - last = key.objectid + key.offset; + if (key.type == BTRFS_METADATA_ITEM_KEY) + last = key.objectid + + fs_info->tree_root->leafsize; + else + last = key.objectid + key.offset; if (total_found > (1024 * 1024 * 2)) { total_found = 0; @@ -656,55 +686,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) rcu_read_unlock(); } -u64 btrfs_find_block_group(struct btrfs_root *root, - u64 search_start, u64 search_hint, int owner) -{ - struct btrfs_block_group_cache *cache; - u64 used; - u64 last = max(search_hint, search_start); - u64 group_start = 0; - int full_search = 0; - int factor = 9; - int wrapped = 0; -again: - while (1) { - cache = btrfs_lookup_first_block_group(root->fs_info, last); - if (!cache) - break; - - spin_lock(&cache->lock); - last = cache->key.objectid + cache->key.offset; - used = btrfs_block_group_used(&cache->item); - - if ((full_search || !cache->ro) && - block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) { - if (used + cache->pinned + cache->reserved < - div_factor(cache->key.offset, factor)) { - group_start = cache->key.objectid; - spin_unlock(&cache->lock); - btrfs_put_block_group(cache); - goto found; - } - } - spin_unlock(&cache->lock); - btrfs_put_block_group(cache); - cond_resched(); - } - if (!wrapped) { - last = search_start; - wrapped = 1; - goto again; - } - if (!full_search && factor < 10) { - last = search_start; - full_search = 1; - factor = 10; - goto again; - } -found: - return group_start; -} - /* simple helper to search for an existing extent at a given offset */ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) { @@ -718,15 +699,21 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) key.objectid = start; key.offset = len; - btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + key.type = BTRFS_EXTENT_ITEM_KEY; ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, 0, 0); + if (ret > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid == start && + key.type == BTRFS_METADATA_ITEM_KEY) + ret = 0; + } btrfs_free_path(path); return ret; } /* - * helper function to lookup reference count and flags of extent. + * helper function to lookup reference count and flags of a tree block. * * the head node for delayed ref is used to store the sum of all the * reference count modifications queued up in the rbtree. the head @@ -736,7 +723,7 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) */ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 *refs, u64 *flags) + u64 offset, int metadata, u64 *refs, u64 *flags) { struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; @@ -749,13 +736,29 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, u64 extent_flags; int ret; + /* + * If we don't have skinny metadata, don't bother doing anything + * different + */ + if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) { + offset = root->leafsize; + metadata = 0; + } + path = btrfs_alloc_path(); if (!path) return -ENOMEM; - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = num_bytes; + if (metadata) { + key.objectid = bytenr; + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = offset; + } else { + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = offset; + } + if (!trans) { path->skip_locking = 1; path->search_commit_root = 1; @@ -766,6 +769,13 @@ again: if (ret < 0) goto out_free; + if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = root->leafsize; + btrfs_release_path(path); + goto again; + } + if (ret == 0) { leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, path->slots[0]); @@ -1001,7 +1011,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans, return ret; BUG_ON(ret); /* Corruption */ - btrfs_extend_item(trans, root, path, new_size); + btrfs_extend_item(root, path, new_size); leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); @@ -1453,6 +1463,8 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, int want; int ret; int err = 0; + bool skinny_metadata = btrfs_fs_incompat(root->fs_info, + SKINNY_METADATA); key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; @@ -1464,11 +1476,46 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, path->keep_locks = 1; } else extra_size = -1; + + /* + * Owner is our parent level, so we can just add one to get the level + * for the block we are interested in. + */ + if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) { + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = owner; + } + +again: ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); if (ret < 0) { err = ret; goto out; } + + /* + * We may be a newly converted file system which still has the old fat + * extent entries for metadata, so try and see if we have one of those. + */ + if (ret > 0 && skinny_metadata) { + skinny_metadata = false; + if (path->slots[0]) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == num_bytes) + ret = 0; + } + if (ret) { + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + btrfs_release_path(path); + goto again; + } + } + if (ret && !insert) { err = -ENOENT; goto out; @@ -1504,11 +1551,9 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, ptr = (unsigned long)(ei + 1); end = (unsigned long)ei + item_size; - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) { ptr += sizeof(struct btrfs_tree_block_info); BUG_ON(ptr > end); - } else { - BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); } err = -ENOENT; @@ -1590,8 +1635,7 @@ out: * helper to add new inline back ref */ static noinline_for_stack -void setup_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +void setup_inline_extent_backref(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, u64 parent, u64 root_objectid, @@ -1614,7 +1658,7 @@ void setup_inline_extent_backref(struct btrfs_trans_handle *trans, type = extent_ref_type(parent, owner); size = btrfs_extent_inline_ref_size(type); - btrfs_extend_item(trans, root, path, size); + btrfs_extend_item(root, path, size); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, ei); @@ -1683,8 +1727,7 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans, * helper to update/remove inline back ref */ static noinline_for_stack -void update_inline_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +void update_inline_extent_backref(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_mod, @@ -1740,7 +1783,7 @@ void update_inline_extent_backref(struct btrfs_trans_handle *trans, memmove_extent_buffer(leaf, ptr, ptr + size, end - ptr - size); item_size -= size; - btrfs_truncate_item(trans, root, path, item_size, 1); + btrfs_truncate_item(root, path, item_size, 1); } btrfs_mark_buffer_dirty(leaf); } @@ -1762,10 +1805,10 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, root_objectid, owner, offset, 1); if (ret == 0) { BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); - update_inline_extent_backref(trans, root, path, iref, + update_inline_extent_backref(root, path, iref, refs_to_add, extent_op); } else if (ret == -ENOENT) { - setup_inline_extent_backref(trans, root, path, iref, parent, + setup_inline_extent_backref(root, path, iref, parent, root_objectid, owner, offset, refs_to_add, extent_op); ret = 0; @@ -1802,7 +1845,7 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, BUG_ON(!is_data && refs_to_drop != 1); if (iref) { - update_inline_extent_backref(trans, root, path, iref, + update_inline_extent_backref(root, path, iref, -refs_to_drop, NULL); } else if (is_data) { ret = remove_extent_data_ref(trans, root, path, refs_to_drop); @@ -1973,10 +2016,8 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, ref_root = ref->root; if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { - if (extent_op) { - BUG_ON(extent_op->update_key); + if (extent_op) flags |= extent_op->flags_to_set; - } ret = alloc_reserved_file_extent(trans, root, parent, ref_root, flags, ref->objectid, ref->offset, @@ -2029,18 +2070,33 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, u32 item_size; int ret; int err = 0; + int metadata = (node->type == BTRFS_TREE_BLOCK_REF_KEY || + node->type == BTRFS_SHARED_BLOCK_REF_KEY); if (trans->aborted) return 0; + if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) + metadata = 0; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = node->bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = node->num_bytes; + if (metadata) { + struct btrfs_delayed_tree_ref *tree_ref; + + tree_ref = btrfs_delayed_node_to_tree_ref(node); + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = tree_ref->level; + } else { + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = node->num_bytes; + } + +again: path->reada = 1; path->leave_spinning = 1; ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, @@ -2050,6 +2106,14 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, goto out; } if (ret > 0) { + if (metadata) { + btrfs_release_path(path); + metadata = 0; + + key.offset = node->num_bytes; + key.type = BTRFS_EXTENT_ITEM_KEY; + goto again; + } err = -EIO; goto out; } @@ -2089,10 +2153,8 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_key ins; u64 parent = 0; u64 ref_root = 0; - - ins.objectid = node->bytenr; - ins.offset = node->num_bytes; - ins.type = BTRFS_EXTENT_ITEM_KEY; + bool skinny_metadata = btrfs_fs_incompat(root->fs_info, + SKINNY_METADATA); ref = btrfs_delayed_node_to_tree_ref(node); if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) @@ -2100,10 +2162,18 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, else ref_root = ref->root; + ins.objectid = node->bytenr; + if (skinny_metadata) { + ins.offset = ref->level; + ins.type = BTRFS_METADATA_ITEM_KEY; + } else { + ins.offset = node->num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + } + BUG_ON(node->ref_mod != 1); if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { - BUG_ON(!extent_op || !extent_op->update_flags || - !extent_op->update_key); + BUG_ON(!extent_op || !extent_op->update_flags); ret = alloc_reserved_tree_block(trans, root, parent, ref_root, extent_op->flags_to_set, @@ -2307,9 +2377,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, btrfs_free_delayed_extent_op(extent_op); if (ret) { - printk(KERN_DEBUG - "btrfs: run_delayed_extent_op " - "returned %d\n", ret); + btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); spin_lock(&delayed_refs->lock); btrfs_delayed_ref_unlock(locked_ref); return ret; @@ -2348,8 +2416,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, if (ret) { btrfs_delayed_ref_unlock(locked_ref); btrfs_put_delayed_ref(ref); - printk(KERN_DEBUG - "btrfs: run_one_delayed_ref returned %d\n", ret); + btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret); spin_lock(&delayed_refs->lock); return ret; } @@ -2426,9 +2493,11 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, if (list_empty(&trans->qgroup_ref_list) != !trans->delayed_ref_elem.seq) { /* list without seq or seq without list */ - printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n", + btrfs_err(fs_info, + "qgroup accounting update error, list is%s empty, seq is %#x.%x", list_empty(&trans->qgroup_ref_list) ? "" : " not", - trans->delayed_ref_elem.seq); + (u32)(trans->delayed_ref_elem.seq >> 32), + (u32)trans->delayed_ref_elem.seq); BUG(); } @@ -3337,7 +3406,7 @@ static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) * progress (either running or paused) picks the target profile (if it's * already available), otherwise falls back to plain reducing. */ -u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) +static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { /* * we add in the count of missing devices because we want @@ -3557,6 +3626,11 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) rcu_read_unlock(); } +static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) +{ + return (global->size << 1); +} + static int should_alloc_chunk(struct btrfs_root *root, struct btrfs_space_info *sinfo, int force) { @@ -3574,7 +3648,7 @@ static int should_alloc_chunk(struct btrfs_root *root, * global_rsv, it doesn't change except when the transaction commits. */ if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA) - num_allocated += global_rsv->size; + num_allocated += calc_global_rsv_need_space(global_rsv); /* * in limited mode, we want to have some free space up to @@ -3627,8 +3701,8 @@ static void check_system_chunk(struct btrfs_trans_handle *trans, thresh = get_system_chunk_thresh(root, type); if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { - printk(KERN_INFO "left=%llu, need=%llu, flags=%llu\n", - left, thresh, type); + btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", + left, thresh, type); dump_space_info(info, 0, 0); } @@ -3746,7 +3820,7 @@ static int can_overcommit(struct btrfs_root *root, { struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; u64 profile = btrfs_get_alloc_profile(root, 0); - u64 rsv_size = 0; + u64 space_size; u64 avail; u64 used; u64 to_add; @@ -3754,18 +3828,16 @@ static int can_overcommit(struct btrfs_root *root, used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly; - spin_lock(&global_rsv->lock); - rsv_size = global_rsv->size; - spin_unlock(&global_rsv->lock); - /* * We only want to allow over committing if we have lots of actual space * free, but if we don't have enough space to handle the global reserve * space then we could end up having a real enospc problem when trying * to allocate a chunk or some other such important allocation. */ - rsv_size <<= 1; - if (used + rsv_size >= space_info->total_bytes) + spin_lock(&global_rsv->lock); + space_size = calc_global_rsv_need_space(global_rsv); + spin_unlock(&global_rsv->lock); + if (used + space_size >= space_info->total_bytes) return 0; used += space_info->bytes_may_use; @@ -3808,8 +3880,8 @@ static int can_overcommit(struct btrfs_root *root, return 0; } -void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, - unsigned long nr_pages) +static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, + unsigned long nr_pages) { struct super_block *sb = root->fs_info->sb; int started; @@ -3826,7 +3898,8 @@ void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, * the disk). */ btrfs_start_delalloc_inodes(root, 0); - btrfs_wait_ordered_extents(root, 0); + if (!current->journal_info) + btrfs_wait_ordered_extents(root, 0); } } @@ -5090,9 +5163,11 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, u64 bytenr, u64 num_bytes) { struct btrfs_block_group_cache *cache; + int ret; cache = btrfs_lookup_block_group(root->fs_info, bytenr); - BUG_ON(!cache); /* Logic error */ + if (!cache) + return -EINVAL; /* * pull in the free space cache (if any) so that our pin @@ -5105,9 +5180,9 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, pin_down_extent(root, cache, bytenr, num_bytes, 0); /* remove us from the free space cache (if we're there at all) */ - btrfs_remove_free_space(cache, bytenr, num_bytes); + ret = btrfs_remove_free_space(cache, bytenr, num_bytes); btrfs_put_block_group(cache); - return 0; + return ret; } /** @@ -5312,6 +5387,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, int num_to_del = 1; u32 item_size; u64 refs; + bool skinny_metadata = btrfs_fs_incompat(root->fs_info, + SKINNY_METADATA); path = btrfs_alloc_path(); if (!path) @@ -5323,6 +5400,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; BUG_ON(!is_data && refs_to_drop != 1); + if (is_data) + skinny_metadata = 0; + ret = lookup_extent_backref(trans, extent_root, path, &iref, bytenr, num_bytes, parent, root_objectid, owner_objectid, @@ -5339,6 +5419,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, found_extent = 1; break; } + if (key.type == BTRFS_METADATA_ITEM_KEY && + key.offset == owner_objectid) { + found_extent = 1; + break; + } if (path->slots[0] - extent_slot > 5) break; extent_slot--; @@ -5364,12 +5449,39 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; + if (!is_data && skinny_metadata) { + key.type = BTRFS_METADATA_ITEM_KEY; + key.offset = owner_objectid; + } + ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); + if (ret > 0 && skinny_metadata && path->slots[0]) { + /* + * Couldn't find our skinny metadata item, + * see if we have ye olde extent item. + */ + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == num_bytes) + ret = 0; + } + + if (ret > 0 && skinny_metadata) { + skinny_metadata = false; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + btrfs_release_path(path); + ret = btrfs_search_slot(trans, extent_root, + &key, path, -1, 1); + } + if (ret) { - printk(KERN_ERR "umm, got %d back from search" - ", was looking for %llu\n", ret, - (unsigned long long)bytenr); + btrfs_err(info, "umm, got %d back from search, was looking for %llu", + ret, (unsigned long long)bytenr); if (ret > 0) btrfs_print_leaf(extent_root, path->nodes[0]); @@ -5383,13 +5495,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } else if (ret == -ENOENT) { btrfs_print_leaf(extent_root, path->nodes[0]); WARN_ON(1); - printk(KERN_ERR "btrfs unable to find ref byte nr %llu " - "parent %llu root %llu owner %llu offset %llu\n", - (unsigned long long)bytenr, - (unsigned long long)parent, - (unsigned long long)root_objectid, - (unsigned long long)owner_objectid, - (unsigned long long)owner_offset); + btrfs_err(info, + "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", + (unsigned long long)bytenr, + (unsigned long long)parent, + (unsigned long long)root_objectid, + (unsigned long long)owner_objectid, + (unsigned long long)owner_offset); } else { btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -5417,9 +5529,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); if (ret) { - printk(KERN_ERR "umm, got %d back from search" - ", was looking for %llu\n", ret, - (unsigned long long)bytenr); + btrfs_err(info, "umm, got %d back from search, was looking for %llu", + ret, (unsigned long long)bytenr); btrfs_print_leaf(extent_root, path->nodes[0]); } if (ret < 0) { @@ -5435,7 +5546,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, BUG_ON(item_size < sizeof(*ei)); ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item); - if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { + if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && + key.type == BTRFS_EXTENT_ITEM_KEY) { struct btrfs_tree_block_info *bi; BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); bi = (struct btrfs_tree_block_info *)(ei + 1); @@ -5443,7 +5555,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } refs = btrfs_extent_refs(leaf, ei); - BUG_ON(refs < refs_to_drop); + if (refs < refs_to_drop) { + btrfs_err(info, "trying to drop %d refs but we only have %Lu " + "for bytenr %Lu\n", refs_to_drop, refs, bytenr); + ret = -EINVAL; + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } refs -= refs_to_drop; if (refs > 0) { @@ -5758,7 +5876,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *orig_root, u64 num_bytes, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, - u64 data) + u64 flags) { int ret = 0; struct btrfs_root *root = orig_root->fs_info->extent_root; @@ -5769,8 +5887,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, int empty_cluster = 2 * 1024 * 1024; struct btrfs_space_info *space_info; int loop = 0; - int index = __get_raid_index(data); - int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? + int index = __get_raid_index(flags); + int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ? RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; bool found_uncached_bg = false; bool failed_cluster_refill = false; @@ -5783,11 +5901,11 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, ins->objectid = 0; ins->offset = 0; - trace_find_free_extent(orig_root, num_bytes, empty_size, data); + trace_find_free_extent(orig_root, num_bytes, empty_size, flags); - space_info = __find_space_info(root->fs_info, data); + space_info = __find_space_info(root->fs_info, flags); if (!space_info) { - printk(KERN_ERR "No space info for %llu\n", data); + btrfs_err(root->fs_info, "No space info for %llu", flags); return -ENOSPC; } @@ -5798,13 +5916,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, if (btrfs_mixed_space_info(space_info)) use_cluster = false; - if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { + if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { last_ptr = &root->fs_info->meta_alloc_cluster; if (!btrfs_test_opt(root, SSD)) empty_cluster = 64 * 1024; } - if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster && + if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster && btrfs_test_opt(root, SSD)) { last_ptr = &root->fs_info->data_alloc_cluster; } @@ -5833,7 +5951,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, * However if we are re-searching with an ideal block group * picked out then we don't care that the block group is cached. */ - if (block_group && block_group_bits(block_group, data) && + if (block_group && block_group_bits(block_group, flags) && block_group->cached != BTRFS_CACHE_NO) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || @@ -5871,7 +5989,7 @@ search: * raid types, but we want to make sure we only allocate * for the proper type. */ - if (!block_group_bits(block_group, data)) { + if (!block_group_bits(block_group, flags)) { u64 extra = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | @@ -5883,7 +6001,7 @@ search: * doesn't provide them, bail. This does allow us to * fill raid0 from raid1. */ - if ((data & extra) && !(block_group->flags & extra)) + if ((flags & extra) && !(block_group->flags & extra)) goto loop; } @@ -5914,7 +6032,7 @@ have_block_group: if (used_block_group != block_group && (!used_block_group || used_block_group->ro || - !block_group_bits(used_block_group, data))) { + !block_group_bits(used_block_group, flags))) { used_block_group = block_group; goto refill_cluster; } @@ -6110,7 +6228,7 @@ loop: index = 0; loop++; if (loop == LOOP_ALLOC_CHUNK) { - ret = do_chunk_alloc(trans, root, data, + ret = do_chunk_alloc(trans, root, flags, CHUNK_ALLOC_FORCE); /* * Do not bail out on ENOSPC since we @@ -6188,16 +6306,17 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, u64 data) + struct btrfs_key *ins, int is_data) { bool final_tried = false; + u64 flags; int ret; - data = btrfs_get_alloc_profile(root, data); + flags = btrfs_get_alloc_profile(root, is_data); again: WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(trans, root, num_bytes, empty_size, - hint_byte, ins, data); + hint_byte, ins, flags); if (ret == -ENOSPC) { if (!final_tried) { @@ -6210,10 +6329,10 @@ again: } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) { struct btrfs_space_info *sinfo; - sinfo = __find_space_info(root->fs_info, data); - printk(KERN_ERR "btrfs allocation failed flags %llu, " - "wanted %llu\n", (unsigned long long)data, - (unsigned long long)num_bytes); + sinfo = __find_space_info(root->fs_info, flags); + btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu", + (unsigned long long)flags, + (unsigned long long)num_bytes); if (sinfo) dump_space_info(sinfo, num_bytes, 1); } @@ -6232,8 +6351,8 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root, cache = btrfs_lookup_block_group(root->fs_info, start); if (!cache) { - printk(KERN_ERR "Unable to find block group for %llu\n", - (unsigned long long)start); + btrfs_err(root->fs_info, "Unable to find block group for %llu", + (unsigned long long)start); return -ENOSPC; } @@ -6328,9 +6447,9 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, ret = update_block_group(root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ - printk(KERN_ERR "btrfs update block group failed for %llu " - "%llu\n", (unsigned long long)ins->objectid, - (unsigned long long)ins->offset); + btrfs_err(fs_info, "update block group failed for %llu %llu", + (unsigned long long)ins->objectid, + (unsigned long long)ins->offset); BUG(); } return ret; @@ -6349,7 +6468,12 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_extent_inline_ref *iref; struct btrfs_path *path; struct extent_buffer *leaf; - u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref); + u32 size = sizeof(*extent_item) + sizeof(*iref); + bool skinny_metadata = btrfs_fs_incompat(root->fs_info, + SKINNY_METADATA); + + if (!skinny_metadata) + size += sizeof(*block_info); path = btrfs_alloc_path(); if (!path) @@ -6370,12 +6494,16 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_set_extent_generation(leaf, extent_item, trans->transid); btrfs_set_extent_flags(leaf, extent_item, flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); - block_info = (struct btrfs_tree_block_info *)(extent_item + 1); - btrfs_set_tree_block_key(leaf, block_info, key); - btrfs_set_tree_block_level(leaf, block_info, level); + if (skinny_metadata) { + iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); + } else { + block_info = (struct btrfs_tree_block_info *)(extent_item + 1); + btrfs_set_tree_block_key(leaf, block_info, key); + btrfs_set_tree_block_level(leaf, block_info, level); + iref = (struct btrfs_extent_inline_ref *)(block_info + 1); + } - iref = (struct btrfs_extent_inline_ref *)(block_info + 1); if (parent > 0) { BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); btrfs_set_extent_inline_ref_type(leaf, iref, @@ -6390,11 +6518,11 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - ret = update_block_group(root, ins->objectid, ins->offset, 1); + ret = update_block_group(root, ins->objectid, root->leafsize, 1); if (ret) { /* -ENOENT, logic error */ - printk(KERN_ERR "btrfs update block group failed for %llu " - "%llu\n", (unsigned long long)ins->objectid, - (unsigned long long)ins->offset); + btrfs_err(fs_info, "update block group failed for %llu %llu", + (unsigned long long)ins->objectid, + (unsigned long long)ins->offset); BUG(); } return ret; @@ -6439,47 +6567,48 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, if (!caching_ctl) { BUG_ON(!block_group_cache_done(block_group)); ret = btrfs_remove_free_space(block_group, start, num_bytes); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + goto out; } else { mutex_lock(&caching_ctl->mutex); if (start >= caching_ctl->progress) { ret = add_excluded_extent(root, start, num_bytes); - BUG_ON(ret); /* -ENOMEM */ } else if (start + num_bytes <= caching_ctl->progress) { ret = btrfs_remove_free_space(block_group, start, num_bytes); - BUG_ON(ret); /* -ENOMEM */ } else { num_bytes = caching_ctl->progress - start; ret = btrfs_remove_free_space(block_group, start, num_bytes); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + goto out_lock; start = caching_ctl->progress; num_bytes = ins->objectid + ins->offset - caching_ctl->progress; ret = add_excluded_extent(root, start, num_bytes); - BUG_ON(ret); /* -ENOMEM */ } - +out_lock: mutex_unlock(&caching_ctl->mutex); put_caching_control(caching_ctl); + if (ret) + goto out; } ret = btrfs_update_reserved_bytes(block_group, ins->offset, RESERVE_ALLOC_NO_ACCOUNT); BUG_ON(ret); /* logic error */ - btrfs_put_block_group(block_group); ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 0, owner, offset, ins, 1); +out: + btrfs_put_block_group(block_group); return ret; } -struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u32 blocksize, - int level) +static struct extent_buffer * +btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, + u64 bytenr, u32 blocksize, int level) { struct extent_buffer *buf; @@ -6594,7 +6723,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf; u64 flags = 0; int ret; - + bool skinny_metadata = btrfs_fs_incompat(root->fs_info, + SKINNY_METADATA); block_rsv = use_block_rsv(trans, root, blocksize); if (IS_ERR(block_rsv)) @@ -6627,7 +6757,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, else memset(&extent_op->key, 0, sizeof(extent_op->key)); extent_op->flags_to_set = flags; - extent_op->update_key = 1; + if (skinny_metadata) + extent_op->update_key = 0; + else + extent_op->update_key = 1; extent_op->update_flags = 1; extent_op->is_data = 0; @@ -6704,8 +6837,9 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, continue; /* We don't lock the tree block, it's OK to be racy here */ - ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, - &refs, &flags); + ret = btrfs_lookup_extent_info(trans, root, bytenr, + wc->level - 1, 1, &refs, + &flags); /* We don't care about errors in readahead. */ if (ret < 0) continue; @@ -6772,7 +6906,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { BUG_ON(!path->locks[level]); ret = btrfs_lookup_extent_info(trans, root, - eb->start, eb->len, + eb->start, level, 1, &wc->refs[level], &wc->flags[level]); BUG_ON(ret == -ENOMEM); @@ -6870,7 +7004,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, + ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1, &wc->refs[level - 1], &wc->flags[level - 1]); if (ret < 0) { @@ -6878,7 +7012,10 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, return ret; } - BUG_ON(wc->refs[level - 1] == 0); + if (unlikely(wc->refs[level - 1] == 0)) { + btrfs_err(root->fs_info, "Missing references."); + BUG(); + } *lookup_info = 0; if (wc->stage == DROP_REFERENCE) { @@ -6917,8 +7054,10 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (reada && level == 1) reada_walk_down(trans, root, wc, path); next = read_tree_block(root, bytenr, blocksize, generation); - if (!next) + if (!next || !extent_buffer_uptodate(next)) { + free_extent_buffer(next); return -EIO; + } btrfs_tree_lock(next); btrfs_set_lock_blocking(next); } @@ -7001,7 +7140,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; ret = btrfs_lookup_extent_info(trans, root, - eb->start, eb->len, + eb->start, level, 1, &wc->refs[level], &wc->flags[level]); if (ret < 0) { @@ -7137,6 +7276,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * reference count by one. if update_ref is true, this function * also make sure backrefs for the shared block and all lower level * blocks are properly updated. + * + * If called with for_reloc == 0, may exit early with -EAGAIN */ int btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, int update_ref, @@ -7211,8 +7352,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, ret = btrfs_lookup_extent_info(trans, root, path->nodes[level]->start, - path->nodes[level]->len, - &wc->refs[level], + level, 1, &wc->refs[level], &wc->flags[level]); if (ret < 0) { err = ret; @@ -7238,6 +7378,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root, wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { + if (!for_reloc && btrfs_fs_closing(root->fs_info)) { + pr_debug("btrfs: drop snapshot early exit\n"); + err = -EAGAIN; + goto out_end_trans; + } + ret = walk_down_tree(trans, root, path, wc); if (ret < 0) { err = ret; @@ -8020,10 +8166,26 @@ int btrfs_read_block_groups(struct btrfs_root *root) free_excluded_extents(root, cache); } + ret = btrfs_add_block_group_cache(root->fs_info, cache); + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); + goto error; + } + ret = update_space_info(info, cache->flags, found_key.offset, btrfs_block_group_used(&cache->item), &space_info); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_remove_free_space_cache(cache); + spin_lock(&info->block_group_cache_lock); + rb_erase(&cache->cache_node, + &info->block_group_cache_tree); + spin_unlock(&info->block_group_cache_lock); + btrfs_put_block_group(cache); + goto error; + } + cache->space_info = space_info; spin_lock(&cache->space_info->lock); cache->space_info->bytes_readonly += cache->bytes_super; @@ -8031,9 +8193,6 @@ int btrfs_read_block_groups(struct btrfs_root *root) __link_block_group(space_info, cache); - ret = btrfs_add_block_group_cache(root->fs_info, cache); - BUG_ON(ret); /* Logic error */ - set_avail_alloc_bits(root->fs_info, cache->flags); if (btrfs_chunk_readonly(root, cache->key.objectid)) set_block_group_ro(cache, 1); @@ -8156,9 +8315,24 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, free_excluded_extents(root, cache); + ret = btrfs_add_block_group_cache(root->fs_info, cache); + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); + return ret; + } + ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_remove_free_space_cache(cache); + spin_lock(&root->fs_info->block_group_cache_lock); + rb_erase(&cache->cache_node, + &root->fs_info->block_group_cache_tree); + spin_unlock(&root->fs_info->block_group_cache_lock); + btrfs_put_block_group(cache); + return ret; + } update_global_block_rsv(root->fs_info); spin_lock(&cache->space_info->lock); @@ -8167,9 +8341,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, __link_block_group(cache->space_info, cache); - ret = btrfs_add_block_group_cache(root->fs_info, cache); - BUG_ON(ret); /* Logic error */ - list_add_tail(&cache->new_bg_list, &trans->new_bgs); set_avail_alloc_bits(extent_root->fs_info, type); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cdee391fc7bf..32d67a822e93 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -24,12 +24,62 @@ static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; +#ifdef CONFIG_BTRFS_DEBUG static LIST_HEAD(buffers); static LIST_HEAD(states); -#define LEAK_DEBUG 0 -#if LEAK_DEBUG static DEFINE_SPINLOCK(leak_lock); + +static inline +void btrfs_leak_debug_add(struct list_head *new, struct list_head *head) +{ + unsigned long flags; + + spin_lock_irqsave(&leak_lock, flags); + list_add(new, head); + spin_unlock_irqrestore(&leak_lock, flags); +} + +static inline +void btrfs_leak_debug_del(struct list_head *entry) +{ + unsigned long flags; + + spin_lock_irqsave(&leak_lock, flags); + list_del(entry); + spin_unlock_irqrestore(&leak_lock, flags); +} + +static inline +void btrfs_leak_debug_check(void) +{ + struct extent_state *state; + struct extent_buffer *eb; + + while (!list_empty(&states)) { + state = list_entry(states.next, struct extent_state, leak_list); + printk(KERN_ERR "btrfs state leak: start %llu end %llu " + "state %lu in tree %p refs %d\n", + (unsigned long long)state->start, + (unsigned long long)state->end, + state->state, state->tree, atomic_read(&state->refs)); + list_del(&state->leak_list); + kmem_cache_free(extent_state_cache, state); + } + + while (!list_empty(&buffers)) { + eb = list_entry(buffers.next, struct extent_buffer, leak_list); + printk(KERN_ERR "btrfs buffer leak start %llu len %lu " + "refs %d\n", (unsigned long long)eb->start, + eb->len, atomic_read(&eb->refs)); + list_del(&eb->leak_list); + kmem_cache_free(extent_buffer_cache, eb); + } +} +#else +#define btrfs_leak_debug_add(new, head) do {} while (0) +#define btrfs_leak_debug_del(entry) do {} while (0) +#define btrfs_leak_debug_check() do {} while (0) #endif #define BUFFER_LRU_MAX 64 @@ -84,29 +134,7 @@ free_state_cache: void extent_io_exit(void) { - struct extent_state *state; - struct extent_buffer *eb; - - while (!list_empty(&states)) { - state = list_entry(states.next, struct extent_state, leak_list); - printk(KERN_ERR "btrfs state leak: start %llu end %llu " - "state %lu in tree %p refs %d\n", - (unsigned long long)state->start, - (unsigned long long)state->end, - state->state, state->tree, atomic_read(&state->refs)); - list_del(&state->leak_list); - kmem_cache_free(extent_state_cache, state); - - } - - while (!list_empty(&buffers)) { - eb = list_entry(buffers.next, struct extent_buffer, leak_list); - printk(KERN_ERR "btrfs buffer leak start %llu len %lu " - "refs %d\n", (unsigned long long)eb->start, - eb->len, atomic_read(&eb->refs)); - list_del(&eb->leak_list); - kmem_cache_free(extent_buffer_cache, eb); - } + btrfs_leak_debug_check(); /* * Make sure all delayed rcu free are flushed before we @@ -134,9 +162,6 @@ void extent_io_tree_init(struct extent_io_tree *tree, static struct extent_state *alloc_extent_state(gfp_t mask) { struct extent_state *state; -#if LEAK_DEBUG - unsigned long flags; -#endif state = kmem_cache_alloc(extent_state_cache, mask); if (!state) @@ -144,11 +169,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask) state->state = 0; state->private = 0; state->tree = NULL; -#if LEAK_DEBUG - spin_lock_irqsave(&leak_lock, flags); - list_add(&state->leak_list, &states); - spin_unlock_irqrestore(&leak_lock, flags); -#endif + btrfs_leak_debug_add(&state->leak_list, &states); atomic_set(&state->refs, 1); init_waitqueue_head(&state->wq); trace_alloc_extent_state(state, mask, _RET_IP_); @@ -160,15 +181,8 @@ void free_extent_state(struct extent_state *state) if (!state) return; if (atomic_dec_and_test(&state->refs)) { -#if LEAK_DEBUG - unsigned long flags; -#endif WARN_ON(state->tree); -#if LEAK_DEBUG - spin_lock_irqsave(&leak_lock, flags); - list_del(&state->leak_list); - spin_unlock_irqrestore(&leak_lock, flags); -#endif + btrfs_leak_debug_del(&state->leak_list); trace_free_extent_state(state, _RET_IP_); kmem_cache_free(extent_state_cache, state); } @@ -308,21 +322,21 @@ static void merge_state(struct extent_io_tree *tree, } static void set_state_cb(struct extent_io_tree *tree, - struct extent_state *state, int *bits) + struct extent_state *state, unsigned long *bits) { if (tree->ops && tree->ops->set_bit_hook) tree->ops->set_bit_hook(tree->mapping->host, state, bits); } static void clear_state_cb(struct extent_io_tree *tree, - struct extent_state *state, int *bits) + struct extent_state *state, unsigned long *bits) { if (tree->ops && tree->ops->clear_bit_hook) tree->ops->clear_bit_hook(tree->mapping->host, state, bits); } static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, int *bits); + struct extent_state *state, unsigned long *bits); /* * insert an extent_state struct into the tree. 'bits' are set on the @@ -336,7 +350,7 @@ static void set_state_bits(struct extent_io_tree *tree, */ static int insert_state(struct extent_io_tree *tree, struct extent_state *state, u64 start, u64 end, - int *bits) + unsigned long *bits) { struct rb_node *node; @@ -424,10 +438,10 @@ static struct extent_state *next_state(struct extent_state *state) */ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, - int *bits, int wake) + unsigned long *bits, int wake) { struct extent_state *next; - int bits_to_clear = *bits & ~EXTENT_CTLBITS; + unsigned long bits_to_clear = *bits & ~EXTENT_CTLBITS; if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; @@ -463,7 +477,7 @@ alloc_extent_state_atomic(struct extent_state *prealloc) return prealloc; } -void extent_io_tree_panic(struct extent_io_tree *tree, int err) +static void extent_io_tree_panic(struct extent_io_tree *tree, int err) { btrfs_panic(tree_fs_info(tree), err, "Locking error: " "Extent tree was modified by another " @@ -483,7 +497,7 @@ void extent_io_tree_panic(struct extent_io_tree *tree, int err) * This takes the tree lock, and returns 0 on success and < 0 on error. */ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int wake, int delete, + unsigned long bits, int wake, int delete, struct extent_state **cached_state, gfp_t mask) { @@ -644,7 +658,8 @@ static void wait_on_state(struct extent_io_tree *tree, * The range [start, end] is inclusive. * The tree lock is taken by this function */ -void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) +static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned long bits) { struct extent_state *state; struct rb_node *node; @@ -685,9 +700,9 @@ out: static void set_state_bits(struct extent_io_tree *tree, struct extent_state *state, - int *bits) + unsigned long *bits) { - int bits_to_set = *bits & ~EXTENT_CTLBITS; + unsigned long bits_to_set = *bits & ~EXTENT_CTLBITS; set_state_cb(tree, state, bits); if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { @@ -730,8 +745,9 @@ static void uncache_state(struct extent_state **cached_ptr) static int __must_check __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int exclusive_bits, u64 *failed_start, - struct extent_state **cached_state, gfp_t mask) + unsigned long bits, unsigned long exclusive_bits, + u64 *failed_start, struct extent_state **cached_state, + gfp_t mask) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -923,9 +939,9 @@ search_again: goto again; } -int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, - u64 *failed_start, struct extent_state **cached_state, - gfp_t mask) +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned long bits, u64 * failed_start, + struct extent_state **cached_state, gfp_t mask) { return __set_extent_bit(tree, start, end, bits, 0, failed_start, cached_state, mask); @@ -950,7 +966,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, * boundary bits like LOCK. */ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int clear_bits, + unsigned long bits, unsigned long clear_bits, struct extent_state **cached_state, gfp_t mask) { struct extent_state *state; @@ -1143,14 +1159,14 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, } int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, gfp_t mask) + unsigned long bits, gfp_t mask) { return set_extent_bit(tree, start, end, bits, NULL, NULL, mask); } int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, gfp_t mask) + unsigned long bits, gfp_t mask) { return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); } @@ -1189,7 +1205,7 @@ int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL, cached_state, mask); } @@ -1205,7 +1221,7 @@ int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, * us if waiting is desired. */ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, struct extent_state **cached_state) + unsigned long bits, struct extent_state **cached_state) { int err; u64 failed_start; @@ -1313,8 +1329,9 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) * return it. tree->lock must be held. NULL will returned if * nothing was found after 'start' */ -struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, - u64 start, int bits) +static struct extent_state * +find_first_extent_bit_state(struct extent_io_tree *tree, + u64 start, unsigned long bits) { struct rb_node *node; struct extent_state *state; @@ -1348,7 +1365,7 @@ out: * If nothing was found, 1 is returned. If found something, return 0. */ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, int bits, + u64 *start_ret, u64 *end_ret, unsigned long bits, struct extent_state **cached_state) { struct extent_state *state; @@ -1638,7 +1655,7 @@ int extent_clear_unlock_delalloc(struct inode *inode, unsigned long end_index = end >> PAGE_CACHE_SHIFT; unsigned long nr_pages = end_index - index + 1; int i; - int clear_bits = 0; + unsigned long clear_bits = 0; if (op & EXTENT_CLEAR_UNLOCK) clear_bits |= EXTENT_LOCKED; @@ -1777,6 +1794,64 @@ out: return ret; } +void extent_cache_csums_dio(struct extent_io_tree *tree, u64 start, u32 csums[], + int count) +{ + struct rb_node *node; + struct extent_state *state; + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + BUG_ON(!node); + + state = rb_entry(node, struct extent_state, rb_node); + BUG_ON(state->start != start); + + while (count) { + state->private = *csums++; + count--; + state = next_state(state); + } + spin_unlock(&tree->lock); +} + +static inline u64 __btrfs_get_bio_offset(struct bio *bio, int bio_index) +{ + struct bio_vec *bvec = bio->bi_io_vec + bio_index; + + return page_offset(bvec->bv_page) + bvec->bv_offset; +} + +void extent_cache_csums(struct extent_io_tree *tree, struct bio *bio, int bio_index, + u32 csums[], int count) +{ + struct rb_node *node; + struct extent_state *state = NULL; + u64 start; + + spin_lock(&tree->lock); + do { + start = __btrfs_get_bio_offset(bio, bio_index); + if (state == NULL || state->start != start) { + node = tree_search(tree, start); + BUG_ON(!node); + + state = rb_entry(node, struct extent_state, rb_node); + BUG_ON(state->start != start); + } + state->private = *csums++; + count--; + bio_index++; + + state = next_state(state); + } while (count); + spin_unlock(&tree->lock); +} + int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) { struct rb_node *node; @@ -1811,7 +1886,7 @@ out: * range is found set. */ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int filled, struct extent_state *cached) + unsigned long bits, int filled, struct extent_state *cached) { struct extent_state *state = NULL; struct rb_node *node; @@ -2560,8 +2635,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, if (old_compressed) contig = bio->bi_sector == sector; else - contig = bio->bi_sector + (bio->bi_size >> 9) == - sector; + contig = bio_end_sector(bio) == sector; if (prev_bio_flags != bio_flags || !contig || merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) || @@ -2596,7 +2670,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, return ret; } -void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page) +static void attach_extent_buffer_page(struct extent_buffer *eb, + struct page *page) { if (!PagePrivate(page)) { SetPagePrivate(page); @@ -2626,7 +2701,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, struct bio **bio, int mirror_num, - unsigned long *bio_flags) + unsigned long *bio_flags, int rw) { struct inode *inode = page->mapping->host; u64 start = page_offset(page); @@ -2772,7 +2847,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, } pnr -= page->index; - ret = submit_extent_page(READ, tree, page, + ret = submit_extent_page(rw, tree, page, sector, disk_io_size, pg_offset, bdev, bio, pnr, end_bio_extent_readpage, mirror_num, @@ -2805,7 +2880,7 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, int ret; ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, - &bio_flags); + &bio_flags, READ); if (bio) ret = submit_one_bio(READ, bio, mirror_num, bio_flags); return ret; @@ -3104,7 +3179,7 @@ static int eb_wait(void *word) return 0; } -static void wait_on_extent_buffer_writeback(struct extent_buffer *eb) +void wait_on_extent_buffer_writeback(struct extent_buffer *eb) { wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait, TASK_UNINTERRUPTIBLE); @@ -3229,7 +3304,7 @@ static int write_one_eb(struct extent_buffer *eb, u64 offset = eb->start; unsigned long i, num_pages; unsigned long bio_flags = 0; - int rw = (epd->sync_io ? WRITE_SYNC : WRITE); + int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META; int ret = 0; clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); @@ -3666,14 +3741,14 @@ int extent_readpages(struct extent_io_tree *tree, continue; for (i = 0; i < nr; i++) { __extent_read_full_page(tree, pagepool[i], get_extent, - &bio, 0, &bio_flags); + &bio, 0, &bio_flags, READ); page_cache_release(pagepool[i]); } nr = 0; } for (i = 0; i < nr; i++) { __extent_read_full_page(tree, pagepool[i], get_extent, - &bio, 0, &bio_flags); + &bio, 0, &bio_flags, READ); page_cache_release(pagepool[i]); } @@ -3714,9 +3789,9 @@ int extent_invalidatepage(struct extent_io_tree *tree, * are locked or under IO and drops the related state bits if it is safe * to drop the page. */ -int try_release_extent_state(struct extent_map_tree *map, - struct extent_io_tree *tree, struct page *page, - gfp_t mask) +static int try_release_extent_state(struct extent_map_tree *map, + struct extent_io_tree *tree, + struct page *page, gfp_t mask) { u64 start = page_offset(page); u64 end = start + PAGE_CACHE_SIZE - 1; @@ -4007,12 +4082,7 @@ out: static void __free_extent_buffer(struct extent_buffer *eb) { -#if LEAK_DEBUG - unsigned long flags; - spin_lock_irqsave(&leak_lock, flags); - list_del(&eb->leak_list); - spin_unlock_irqrestore(&leak_lock, flags); -#endif + btrfs_leak_debug_del(&eb->leak_list); kmem_cache_free(extent_buffer_cache, eb); } @@ -4022,9 +4092,6 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, gfp_t mask) { struct extent_buffer *eb = NULL; -#if LEAK_DEBUG - unsigned long flags; -#endif eb = kmem_cache_zalloc(extent_buffer_cache, mask); if (eb == NULL) @@ -4044,11 +4111,8 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, init_waitqueue_head(&eb->write_lock_wq); init_waitqueue_head(&eb->read_lock_wq); -#if LEAK_DEBUG - spin_lock_irqsave(&leak_lock, flags); - list_add(&eb->leak_list, &buffers); - spin_unlock_irqrestore(&leak_lock, flags); -#endif + btrfs_leak_debug_add(&eb->leak_list, &buffers); + spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); atomic_set(&eb->io_pages, 0); @@ -4386,7 +4450,7 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) } /* Expects to have eb->eb_lock already held */ -static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask) +static int release_extent_buffer(struct extent_buffer *eb) { WARN_ON(atomic_read(&eb->refs) == 0); if (atomic_dec_and_test(&eb->refs)) { @@ -4444,7 +4508,7 @@ void free_extent_buffer(struct extent_buffer *eb) * I know this is terrible, but it's temporary until we stop tracking * the uptodate bits and such for the extent buffers. */ - release_extent_buffer(eb, GFP_ATOMIC); + release_extent_buffer(eb); } void free_extent_buffer_stale(struct extent_buffer *eb) @@ -4458,7 +4522,7 @@ void free_extent_buffer_stale(struct extent_buffer *eb) if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) && test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) atomic_dec(&eb->refs); - release_extent_buffer(eb, GFP_NOFS); + release_extent_buffer(eb); } void clear_extent_buffer_dirty(struct extent_buffer *eb) @@ -4510,17 +4574,6 @@ int set_extent_buffer_dirty(struct extent_buffer *eb) return was_dirty; } -static int range_straddles_pages(u64 start, u64 len) -{ - if (len < PAGE_CACHE_SIZE) - return 1; - if (start & (PAGE_CACHE_SIZE - 1)) - return 1; - if ((start + len) & (PAGE_CACHE_SIZE - 1)) - return 1; - return 0; -} - int clear_extent_buffer_uptodate(struct extent_buffer *eb) { unsigned long i; @@ -4552,37 +4605,6 @@ int set_extent_buffer_uptodate(struct extent_buffer *eb) return 0; } -int extent_range_uptodate(struct extent_io_tree *tree, - u64 start, u64 end) -{ - struct page *page; - int ret; - int pg_uptodate = 1; - int uptodate; - unsigned long index; - - if (range_straddles_pages(start, end - start + 1)) { - ret = test_range_bit(tree, start, end, - EXTENT_UPTODATE, 1, NULL); - if (ret) - return 1; - } - while (start <= end) { - index = start >> PAGE_CACHE_SHIFT; - page = find_get_page(tree->mapping, index); - if (!page) - return 1; - uptodate = PageUptodate(page); - page_cache_release(page); - if (!uptodate) { - pg_uptodate = 0; - break; - } - start += PAGE_CACHE_SIZE; - } - return pg_uptodate; -} - int extent_buffer_uptodate(struct extent_buffer *eb) { return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); @@ -4645,7 +4667,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, ClearPageError(page); err = __extent_read_full_page(tree, page, get_extent, &bio, - mirror_num, &bio_flags); + mirror_num, &bio_flags, + READ | REQ_META); if (err) ret = err; } else { @@ -4654,7 +4677,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, } if (bio) { - err = submit_one_bio(READ, bio, mirror_num, bio_flags); + err = submit_one_bio(READ | REQ_META, bio, mirror_num, + bio_flags); if (err) return err; } @@ -5018,7 +5042,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, } } -int try_release_extent_buffer(struct page *page, gfp_t mask) +int try_release_extent_buffer(struct page *page) { struct extent_buffer *eb; @@ -5048,9 +5072,6 @@ int try_release_extent_buffer(struct page *page, gfp_t mask) } spin_unlock(&page->mapping->private_lock); - if ((mask & GFP_NOFS) == GFP_NOFS) - mask = GFP_NOFS; - /* * If tree ref isn't set then we know the ref on this eb is a real ref, * so just return, this page will likely be freed soon anyway. @@ -5060,5 +5081,5 @@ int try_release_extent_buffer(struct page *page, gfp_t mask) return 0; } - return release_extent_buffer(eb, mask); + return release_extent_buffer(eb); } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 258c92156857..a2c03a175009 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -81,9 +81,9 @@ struct extent_io_ops { int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate); void (*set_bit_hook)(struct inode *inode, struct extent_state *state, - int *bits); + unsigned long *bits); void (*clear_bit_hook)(struct inode *inode, struct extent_state *state, - int *bits); + unsigned long *bits); void (*merge_extent_hook)(struct inode *inode, struct extent_state *new, struct extent_state *other); @@ -116,7 +116,9 @@ struct extent_state { /* for use by the FS */ u64 private; +#ifdef CONFIG_BTRFS_DEBUG struct list_head leak_list; +#endif }; #define INLINE_EXTENT_BUFFER_PAGES 16 @@ -132,7 +134,6 @@ struct extent_buffer { atomic_t refs; atomic_t io_pages; int read_mirror; - struct list_head leak_list; struct rcu_head rcu_head; pid_t lock_owner; @@ -159,6 +160,9 @@ struct extent_buffer { wait_queue_head_t read_lock_wq; wait_queue_head_t lock_wq; struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; +#ifdef CONFIG_BTRFS_DEBUG + struct list_head leak_list; +#endif }; static inline void extent_set_compress_type(unsigned long *bio_flags, @@ -185,13 +189,10 @@ void extent_io_tree_init(struct extent_io_tree *tree, int try_release_extent_mapping(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask); -int try_release_extent_buffer(struct page *page, gfp_t mask); -int try_release_extent_state(struct extent_map_tree *map, - struct extent_io_tree *tree, struct page *page, - gfp_t mask); +int try_release_extent_buffer(struct page *page); int lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, struct extent_state **cached); + unsigned long bits, struct extent_state **cached); int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end); int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached, gfp_t mask); @@ -207,16 +208,17 @@ u64 count_range_bits(struct extent_io_tree *tree, void free_extent_state(struct extent_state *state); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int filled, struct extent_state *cached_state); + unsigned long bits, int filled, + struct extent_state *cached_state); int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, gfp_t mask); + unsigned long bits, gfp_t mask); int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int wake, int delete, struct extent_state **cached, - gfp_t mask); + unsigned long bits, int wake, int delete, + struct extent_state **cached, gfp_t mask); int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, gfp_t mask); + unsigned long bits, gfp_t mask); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, u64 *failed_start, + unsigned long bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask); int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); @@ -229,17 +231,15 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int clear_bits, + unsigned long bits, unsigned long clear_bits, struct extent_state **cached_state, gfp_t mask); int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, int bits, + u64 *start_ret, u64 *end_ret, unsigned long bits, struct extent_state **cached_state); -struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, - u64 start, int bits); int extent_invalidatepage(struct extent_io_tree *tree, struct page *page, unsigned long offset); int extent_write_full_page(struct extent_io_tree *tree, struct page *page, @@ -261,6 +261,10 @@ int extent_readpages(struct extent_io_tree *tree, int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent); int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); +void extent_cache_csums_dio(struct extent_io_tree *tree, u64 start, u32 csums[], + int count); +void extent_cache_csums(struct extent_io_tree *tree, struct bio *bio, + int bvec_index, u32 csums[], int count); int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); void set_page_extent_mapped(struct page *page); @@ -278,6 +282,7 @@ void free_extent_buffer_stale(struct extent_buffer *eb); int read_extent_buffer_pages(struct extent_io_tree *tree, struct extent_buffer *eb, u64 start, int wait, get_extent_t *get_extent, int mirror_num); +void wait_on_extent_buffer_writeback(struct extent_buffer *eb); static inline unsigned long num_extent_pages(u64 start, u64 len) { @@ -313,7 +318,6 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len); void memset_extent_buffer(struct extent_buffer *eb, char c, unsigned long start, unsigned long len); -void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits); void clear_extent_buffer_dirty(struct extent_buffer *eb); int set_extent_buffer_dirty(struct extent_buffer *eb); int set_extent_buffer_uptodate(struct extent_buffer *eb); @@ -323,8 +327,6 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, unsigned long min_len, char **map, unsigned long *map_start, unsigned long *map_len); -int extent_range_uptodate(struct extent_io_tree *tree, - u64 start, u64 end); int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); int extent_clear_unlock_delalloc(struct inode *inode, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 2834ca5768ea..a4a7a1a8da95 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -174,6 +174,14 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) test_bit(EXTENT_FLAG_LOGGING, &next->flags)) return 0; + /* + * We don't want to merge stuff that hasn't been written to the log yet + * since it may not reflect exactly what is on disk, and that would be + * bad. + */ + if (!list_empty(&prev->list) || !list_empty(&next->list)) + return 0; + if (extent_map_end(prev) == next->start && prev->flags == next->flags && prev->bdev == next->bdev && @@ -209,9 +217,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; em->mod_start = merge->mod_start; em->generation = max(em->generation, merge->generation); - list_move(&em->list, &tree->modified_extents); - list_del_init(&merge->list); rb_erase(&merge->rb_node, &tree->map); free_extent_map(merge); } @@ -227,7 +233,6 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) merge->in_tree = 0; em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; em->generation = max(em->generation, merge->generation); - list_del_init(&merge->list); free_extent_map(merge); } } @@ -302,7 +307,7 @@ void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) * reference dropped if the merge attempt was successful. */ int add_extent_mapping(struct extent_map_tree *tree, - struct extent_map *em) + struct extent_map *em, int modified) { int ret = 0; struct rb_node *rb; @@ -324,7 +329,10 @@ int add_extent_mapping(struct extent_map_tree *tree, em->mod_start = em->start; em->mod_len = em->len; - try_merge_map(tree, em); + if (modified) + list_move(&em->list, &tree->modified_extents); + else + try_merge_map(tree, em); out: return ret; } @@ -337,8 +345,9 @@ static u64 range_end(u64 start, u64 len) return start + len; } -struct extent_map *__lookup_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len, int strict) +static struct extent_map * +__lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len, int strict) { struct extent_map *em; struct rb_node *rb_node; diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index c6598c89cff8..61adc44b7805 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -26,6 +26,7 @@ struct extent_map { u64 mod_len; u64 orig_start; u64 orig_block_len; + u64 ram_bytes; u64 block_start; u64 block_len; u64 generation; @@ -61,7 +62,7 @@ void extent_map_tree_init(struct extent_map_tree *tree); struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); int add_extent_mapping(struct extent_map_tree *tree, - struct extent_map *em); + struct extent_map *em, int modified); int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); struct extent_map *alloc_extent_map(void); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index c4628a201cb3..b193bf324a41 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -83,10 +83,11 @@ out: return ret; } -struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, int cow) +static struct btrfs_csum_item * +btrfs_lookup_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, int cow) { int ret; struct btrfs_key file_key; @@ -152,32 +153,12 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, return ret; } -u64 btrfs_file_extent_length(struct btrfs_path *path) -{ - int extent_type; - struct btrfs_file_extent_item *fi; - u64 len; - - fi = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - extent_type = btrfs_file_extent_type(path->nodes[0], fi); - - if (extent_type == BTRFS_FILE_EXTENT_REG || - extent_type == BTRFS_FILE_EXTENT_PREALLOC) - len = btrfs_file_extent_num_bytes(path->nodes[0], fi); - else if (extent_type == BTRFS_FILE_EXTENT_INLINE) - len = btrfs_file_extent_inline_len(path->nodes[0], fi); - else - BUG(); - - return len; -} - static int __btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 logical_offset, u32 *dst, int dio) { - u32 sum; + u32 sum[16]; + int len; struct bio_vec *bvec = bio->bi_io_vec; int bio_index = 0; u64 offset = 0; @@ -186,7 +167,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, u64 disk_bytenr; u32 diff; u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); - int ret; + int count; struct btrfs_path *path; struct btrfs_csum_item *item = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -214,10 +195,12 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, if (dio) offset = logical_offset; while (bio_index < bio->bi_vcnt) { + len = min_t(int, ARRAY_SIZE(sum), bio->bi_vcnt - bio_index); if (!dio) offset = page_offset(bvec->bv_page) + bvec->bv_offset; - ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum); - if (ret == 0) + count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, sum, + len); + if (count) goto found; if (!item || disk_bytenr < item_start_offset || @@ -230,10 +213,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, item = btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, disk_bytenr, 0); if (IS_ERR(item)) { - ret = PTR_ERR(item); - if (ret == -ENOENT || ret == -EFBIG) - ret = 0; - sum = 0; + count = 1; + sum[0] = 0; if (BTRFS_I(inode)->root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { set_extent_bits(io_tree, offset, @@ -269,19 +250,29 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, diff = disk_bytenr - item_start_offset; diff = diff / root->sectorsize; diff = diff * csum_size; - - read_extent_buffer(path->nodes[0], &sum, + count = min_t(int, len, (item_last_offset - disk_bytenr) >> + inode->i_sb->s_blocksize_bits); + read_extent_buffer(path->nodes[0], sum, ((unsigned long)item) + diff, - csum_size); + csum_size * count); found: - if (dst) - *dst++ = sum; - else - set_state_private(io_tree, offset, sum); - disk_bytenr += bvec->bv_len; - offset += bvec->bv_len; - bio_index++; - bvec++; + if (dst) { + memcpy(dst, sum, count * csum_size); + dst += count; + } else { + if (dio) + extent_cache_csums_dio(io_tree, offset, sum, + count); + else + extent_cache_csums(io_tree, bio, bio_index, sum, + count); + } + while (count--) { + disk_bytenr += bvec->bv_len; + offset += bvec->bv_len; + bio_index++; + bvec++; + } } btrfs_free_path(path); return 0; @@ -358,11 +349,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || - key.type != BTRFS_EXTENT_CSUM_KEY) - break; - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.offset > end) + key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset > end) break; if (key.offset > start) @@ -484,8 +472,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, data = kmap_atomic(bvec->bv_page); sector_sum->sum = ~(u32)0; - sector_sum->sum = btrfs_csum_data(root, - data + bvec->bv_offset, + sector_sum->sum = btrfs_csum_data(data + bvec->bv_offset, sector_sum->sum, bvec->bv_len); kunmap_atomic(data); @@ -518,8 +505,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, * This calls btrfs_truncate_item with the correct args based on the * overlap, and fixes up the key as required. */ -static noinline void truncate_one_csum(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static noinline void truncate_one_csum(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, u64 bytenr, u64 len) @@ -544,7 +530,7 @@ static noinline void truncate_one_csum(struct btrfs_trans_handle *trans, */ u32 new_size = (bytenr - key->offset) >> blocksize_bits; new_size *= csum_size; - btrfs_truncate_item(trans, root, path, new_size, 1); + btrfs_truncate_item(root, path, new_size, 1); } else if (key->offset >= bytenr && csum_end > end_byte && end_byte > key->offset) { /* @@ -556,10 +542,10 @@ static noinline void truncate_one_csum(struct btrfs_trans_handle *trans, u32 new_size = (csum_end - end_byte) >> blocksize_bits; new_size *= csum_size; - btrfs_truncate_item(trans, root, path, new_size, 0); + btrfs_truncate_item(root, path, new_size, 0); key->offset = end_byte; - btrfs_set_item_key_safe(trans, root, path, key); + btrfs_set_item_key_safe(root, path, key); } else { BUG(); } @@ -674,7 +660,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, key.offset = end_byte - 1; } else { - truncate_one_csum(trans, root, path, &key, bytenr, len); + truncate_one_csum(root, path, &key, bytenr, len); if (key.offset < bytenr) break; } @@ -835,7 +821,7 @@ again: diff /= csum_size; diff *= csum_size; - btrfs_extend_item(trans, root, path, diff); + btrfs_extend_item(root, path, diff); goto csum; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index bb8b7a0e28a6..4205ba752d40 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -24,6 +24,7 @@ #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/mpage.h> +#include <linux/aio.h> #include <linux/falloc.h> #include <linux/swap.h> #include <linux/writeback.h> @@ -192,8 +193,8 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, * the same inode in the tree, we will merge them together (by * __btrfs_add_inode_defrag()) and free the one that we want to requeue. */ -void btrfs_requeue_inode_defrag(struct inode *inode, - struct inode_defrag *defrag) +static void btrfs_requeue_inode_defrag(struct inode *inode, + struct inode_defrag *defrag) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret; @@ -473,7 +474,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, /* * unlocks pages after btrfs_file_write is done with them */ -void btrfs_drop_pages(struct page **pages, size_t num_pages) +static void btrfs_drop_pages(struct page **pages, size_t num_pages) { size_t i; for (i = 0; i < num_pages; i++) { @@ -497,9 +498,9 @@ void btrfs_drop_pages(struct page **pages, size_t num_pages) * doing real data extents, marking pages dirty and delalloc as required. */ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, - struct page **pages, size_t num_pages, - loff_t pos, size_t write_bytes, - struct extent_state **cached) + struct page **pages, size_t num_pages, + loff_t pos, size_t write_bytes, + struct extent_state **cached) { int err = 0; int i; @@ -552,6 +553,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int testend = 1; unsigned long flags; int compressed = 0; + bool modified; WARN_ON(end < start); if (end == (u64)-1) { @@ -561,6 +563,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, while (1) { int no_splits = 0; + modified = false; if (!split) split = alloc_extent_map(); if (!split2) @@ -592,6 +595,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); clear_bit(EXTENT_FLAG_PINNED, &em->flags); clear_bit(EXTENT_FLAG_LOGGING, &flags); + modified = !list_empty(&em->list); remove_extent_mapping(em_tree, em); if (no_splits) goto next; @@ -607,15 +611,15 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->block_len = em->block_len; else split->block_len = split->len; + split->ram_bytes = em->ram_bytes; split->orig_block_len = max(split->block_len, em->orig_block_len); split->generation = gen; split->bdev = em->bdev; split->flags = flags; split->compress_type = em->compress_type; - ret = add_extent_mapping(em_tree, split); + ret = add_extent_mapping(em_tree, split, modified); BUG_ON(ret); /* Logic error */ - list_move(&split->list, &em_tree->modified_extents); free_extent_map(split); split = split2; split2 = NULL; @@ -632,6 +636,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->generation = gen; split->orig_block_len = max(em->block_len, em->orig_block_len); + split->ram_bytes = em->ram_bytes; if (compressed) { split->block_len = em->block_len; @@ -643,9 +648,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->orig_start = em->orig_start; } - ret = add_extent_mapping(em_tree, split); + ret = add_extent_mapping(em_tree, split, modified); BUG_ON(ret); /* Logic error */ - list_move(&split->list, &em_tree->modified_extents); free_extent_map(split); split = NULL; } @@ -821,7 +825,7 @@ next_slot: memcpy(&new_key, &key, sizeof(new_key)); new_key.offset = end; - btrfs_set_item_key_safe(trans, root, path, &new_key); + btrfs_set_item_key_safe(root, path, &new_key); extent_offset += end - key.offset; btrfs_set_file_extent_offset(leaf, fi, extent_offset); @@ -1037,7 +1041,7 @@ again: ino, bytenr, orig_offset, &other_start, &other_end)) { new_key.offset = end; - btrfs_set_item_key_safe(trans, root, path, &new_key); + btrfs_set_item_key_safe(root, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, @@ -1071,7 +1075,7 @@ again: trans->transid); path->slots[0]++; new_key.offset = start; - btrfs_set_item_key_safe(trans, root, path, &new_key); + btrfs_set_item_key_safe(root, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -1882,7 +1886,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, path->slots[0]++; key.offset = offset; - btrfs_set_item_key_safe(trans, root, path, &key); + btrfs_set_item_key_safe(root, path, &key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - @@ -1912,6 +1916,7 @@ out: } else { hole_em->start = offset; hole_em->len = end - offset; + hole_em->ram_bytes = hole_em->len; hole_em->orig_start = offset; hole_em->block_start = EXTENT_MAP_HOLE; @@ -1924,10 +1929,7 @@ out: do { btrfs_drop_extent_cache(inode, offset, end - 1, 0); write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, hole_em); - if (!ret) - list_move(&hole_em->list, - &em_tree->modified_extents); + ret = add_extent_mapping(em_tree, hole_em, 1); write_unlock(&em_tree->lock); } while (ret == -EEXIST); free_extent_map(hole_em); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 1f84fc09c1a8..ecca6c7375a6 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -104,7 +104,8 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, spin_lock(&block_group->lock); if (!((BTRFS_I(inode)->flags & flags) == flags)) { - printk(KERN_INFO "Old style space inode found, converting.\n"); + btrfs_info(root->fs_info, + "Old style space inode found, converting."); BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; block_group->disk_cache_state = BTRFS_DC_CLEAR; @@ -119,9 +120,10 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, return inode; } -int __create_free_space_inode(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_path *path, u64 ino, u64 offset) +static int __create_free_space_inode(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, + u64 ino, u64 offset) { struct btrfs_key key; struct btrfs_disk_key disk_key; @@ -431,7 +433,7 @@ static void io_ctl_set_crc(struct io_ctl *io_ctl, int index) if (index == 0) offset = sizeof(u32) * io_ctl->num_pages; - crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc, + crc = btrfs_csum_data(io_ctl->orig + offset, crc, PAGE_CACHE_SIZE - offset); btrfs_csum_final(crc, (char *)&crc); io_ctl_unmap_page(io_ctl); @@ -461,7 +463,7 @@ static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) kunmap(io_ctl->pages[0]); io_ctl_map_page(io_ctl, 0); - crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc, + crc = btrfs_csum_data(io_ctl->orig + offset, crc, PAGE_CACHE_SIZE - offset); btrfs_csum_final(crc, (char *)&crc); if (val != crc) { @@ -624,9 +626,9 @@ next: spin_unlock(&ctl->tree_lock); } -int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, - struct btrfs_free_space_ctl *ctl, - struct btrfs_path *path, u64 offset) +static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, + struct btrfs_free_space_ctl *ctl, + struct btrfs_path *path, u64 offset) { struct btrfs_free_space_header *header; struct extent_buffer *leaf; @@ -669,10 +671,11 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, btrfs_release_path(path); if (BTRFS_I(inode)->generation != generation) { - printk(KERN_ERR "btrfs: free space inode generation (%llu) did" - " not match free space cache generation (%llu)\n", - (unsigned long long)BTRFS_I(inode)->generation, - (unsigned long long)generation); + btrfs_err(root->fs_info, + "free space inode generation (%llu) " + "did not match free space cache generation (%llu)", + (unsigned long long)BTRFS_I(inode)->generation, + (unsigned long long)generation); return 0; } @@ -721,8 +724,8 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, ret = link_free_space(ctl, e); spin_unlock(&ctl->tree_lock); if (ret) { - printk(KERN_ERR "Duplicate entries in " - "free space cache, dumping\n"); + btrfs_err(root->fs_info, + "Duplicate entries in free space cache, dumping"); kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } @@ -741,8 +744,8 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, ctl->op->recalc_thresholds(ctl); spin_unlock(&ctl->tree_lock); if (ret) { - printk(KERN_ERR "Duplicate entries in " - "free space cache, dumping\n"); + btrfs_err(root->fs_info, + "Duplicate entries in free space cache, dumping"); kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } @@ -833,8 +836,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, if (!matched) { __btrfs_remove_free_space_cache(ctl); - printk(KERN_ERR "block group %llu has an wrong amount of free " - "space\n", block_group->key.objectid); + btrfs_err(fs_info, "block group %llu has wrong amount of free space", + block_group->key.objectid); ret = -1; } out: @@ -845,8 +848,8 @@ out: spin_unlock(&block_group->lock); ret = 0; - printk(KERN_ERR "btrfs: failed to load free space cache " - "for block group %llu\n", block_group->key.objectid); + btrfs_err(fs_info, "failed to load free space cache for block group %llu", + block_group->key.objectid); } iput(inode); @@ -866,11 +869,11 @@ out: * on mount. This will return 0 if it was successfull in writing the cache out, * and -1 if it was not. */ -int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, - struct btrfs_free_space_ctl *ctl, - struct btrfs_block_group_cache *block_group, - struct btrfs_trans_handle *trans, - struct btrfs_path *path, u64 offset) +static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, + struct btrfs_free_space_ctl *ctl, + struct btrfs_block_group_cache *block_group, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, u64 offset) { struct btrfs_free_space_header *header; struct extent_buffer *leaf; @@ -1104,8 +1107,9 @@ int btrfs_write_out_cache(struct btrfs_root *root, spin_unlock(&block_group->lock); ret = 0; #ifdef DEBUG - printk(KERN_ERR "btrfs: failed to write free space cache " - "for block group %llu\n", block_group->key.objectid); + btrfs_err(root->fs_info, + "failed to write free space cache for block group %llu", + block_group->key.objectid); #endif } @@ -1564,7 +1568,8 @@ again: search_bytes = ctl->unit; search_bytes = min(search_bytes, end - search_start + 1); ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes); - BUG_ON(ret < 0 || search_start != *offset); + if (ret < 0 || search_start != *offset) + return -EINVAL; /* We may have found more bits than what we need */ search_bytes = min(search_bytes, *bytes); @@ -1970,7 +1975,6 @@ again: re_search = true; goto again; } - BUG_ON(ret); /* logic error */ out_lock: spin_unlock(&ctl->tree_lock); out: @@ -2064,7 +2068,8 @@ out: return 0; } -void __btrfs_remove_free_space_cache_locked(struct btrfs_free_space_ctl *ctl) +static void __btrfs_remove_free_space_cache_locked( + struct btrfs_free_space_ctl *ctl) { struct btrfs_free_space *info; struct rb_node *node; @@ -2931,8 +2936,9 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root) ret = __load_free_space_cache(root, inode, ctl, path, 0); if (ret < 0) - printk(KERN_ERR "btrfs: failed to load free ino cache for " - "root %llu\n", root->root_key.objectid); + btrfs_err(fs_info, + "failed to load free ino cache for root %llu", + root->root_key.objectid); out_put: iput(inode); out: @@ -2959,11 +2965,531 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, if (ret) { btrfs_delalloc_release_metadata(inode, inode->i_size); #ifdef DEBUG - printk(KERN_ERR "btrfs: failed to write free ino cache " - "for root %llu\n", root->root_key.objectid); + btrfs_err(root->fs_info, + "failed to write free ino cache for root %llu", + root->root_key.objectid); #endif } iput(inode); return ret; } + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +static struct btrfs_block_group_cache *init_test_block_group(void) +{ + struct btrfs_block_group_cache *cache; + + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) + return NULL; + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_NOFS); + if (!cache->free_space_ctl) { + kfree(cache); + return NULL; + } + + cache->key.objectid = 0; + cache->key.offset = 1024 * 1024 * 1024; + cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + cache->sectorsize = 4096; + + spin_lock_init(&cache->lock); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + INIT_LIST_HEAD(&cache->new_bg_list); + + btrfs_init_free_space_ctl(cache); + + return cache; +} + +/* + * Checks to see if the given range is in the free space cache. This is really + * just used to check the absence of space, so if there is free space in the + * range at all we will return 1. + */ +static int check_exists(struct btrfs_block_group_cache *cache, u64 offset, + u64 bytes) +{ + struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; + struct btrfs_free_space *info; + int ret = 0; + + spin_lock(&ctl->tree_lock); + info = tree_search_offset(ctl, offset, 0, 0); + if (!info) { + info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), + 1, 0); + if (!info) + goto out; + } + +have_info: + if (info->bitmap) { + u64 bit_off, bit_bytes; + struct rb_node *n; + struct btrfs_free_space *tmp; + + bit_off = offset; + bit_bytes = ctl->unit; + ret = search_bitmap(ctl, info, &bit_off, &bit_bytes); + if (!ret) { + if (bit_off == offset) { + ret = 1; + goto out; + } else if (bit_off > offset && + offset + bytes > bit_off) { + ret = 1; + goto out; + } + } + + n = rb_prev(&info->offset_index); + while (n) { + tmp = rb_entry(n, struct btrfs_free_space, + offset_index); + if (tmp->offset + tmp->bytes < offset) + break; + if (offset + bytes < tmp->offset) { + n = rb_prev(&info->offset_index); + continue; + } + info = tmp; + goto have_info; + } + + n = rb_next(&info->offset_index); + while (n) { + tmp = rb_entry(n, struct btrfs_free_space, + offset_index); + if (offset + bytes < tmp->offset) + break; + if (tmp->offset + tmp->bytes < offset) { + n = rb_next(&info->offset_index); + continue; + } + info = tmp; + goto have_info; + } + + goto out; + } + + if (info->offset == offset) { + ret = 1; + goto out; + } + + if (offset > info->offset && offset < info->offset + info->bytes) + ret = 1; +out: + spin_unlock(&ctl->tree_lock); + return ret; +} + +/* + * Use this if you need to make a bitmap or extent entry specifically, it + * doesn't do any of the merging that add_free_space does, this acts a lot like + * how the free space cache loading stuff works, so you can get really weird + * configurations. + */ +static int add_free_space_entry(struct btrfs_block_group_cache *cache, + u64 offset, u64 bytes, bool bitmap) +{ + struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; + struct btrfs_free_space *info = NULL, *bitmap_info; + void *map = NULL; + u64 bytes_added; + int ret; + +again: + if (!info) { + info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); + if (!info) + return -ENOMEM; + } + + if (!bitmap) { + spin_lock(&ctl->tree_lock); + info->offset = offset; + info->bytes = bytes; + ret = link_free_space(ctl, info); + spin_unlock(&ctl->tree_lock); + if (ret) + kmem_cache_free(btrfs_free_space_cachep, info); + return ret; + } + + if (!map) { + map = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + if (!map) { + kmem_cache_free(btrfs_free_space_cachep, info); + return -ENOMEM; + } + } + + spin_lock(&ctl->tree_lock); + bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), + 1, 0); + if (!bitmap_info) { + info->bitmap = map; + map = NULL; + add_new_bitmap(ctl, info, offset); + bitmap_info = info; + } + + bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); + bytes -= bytes_added; + offset += bytes_added; + spin_unlock(&ctl->tree_lock); + + if (bytes) + goto again; + + if (map) + kfree(map); + return 0; +} + +/* + * This test just does basic sanity checking, making sure we can add an exten + * entry and remove space from either end and the middle, and make sure we can + * remove space that covers adjacent extent entries. + */ +static int test_extents(struct btrfs_block_group_cache *cache) +{ + int ret = 0; + + printk(KERN_ERR "Running extent only tests\n"); + + /* First just make sure we can remove an entire entry */ + ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); + if (ret) { + printk(KERN_ERR "Error adding initial extents %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); + if (ret) { + printk(KERN_ERR "Error removing extent %d\n", ret); + return ret; + } + + if (check_exists(cache, 0, 4 * 1024 * 1024)) { + printk(KERN_ERR "Full remove left some lingering space\n"); + return -1; + } + + /* Ok edge and middle cases now */ + ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); + if (ret) { + printk(KERN_ERR "Error adding half extent %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024); + if (ret) { + printk(KERN_ERR "Error removing tail end %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); + if (ret) { + printk(KERN_ERR "Error removing front end %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096); + if (ret) { + printk(KERN_ERR "Error removing middle peice %d\n", ret); + return ret; + } + + if (check_exists(cache, 0, 1 * 1024 * 1024)) { + printk(KERN_ERR "Still have space at the front\n"); + return -1; + } + + if (check_exists(cache, 2 * 1024 * 1024, 4096)) { + printk(KERN_ERR "Still have space in the middle\n"); + return -1; + } + + if (check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) { + printk(KERN_ERR "Still have space at the end\n"); + return -1; + } + + /* Cleanup */ + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + return 0; +} + +static int test_bitmaps(struct btrfs_block_group_cache *cache) +{ + u64 next_bitmap_offset; + int ret; + + printk(KERN_ERR "Running bitmap only tests\n"); + + ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); + if (ret) { + printk(KERN_ERR "Couldn't create a bitmap entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); + if (ret) { + printk(KERN_ERR "Error removing bitmap full range %d\n", ret); + return ret; + } + + if (check_exists(cache, 0, 4 * 1024 * 1024)) { + printk(KERN_ERR "Left some space in bitmap\n"); + return -1; + } + + ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); + if (ret) { + printk(KERN_ERR "Couldn't add to our bitmap entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024); + if (ret) { + printk(KERN_ERR "Couldn't remove middle chunk %d\n", ret); + return ret; + } + + /* + * The first bitmap we have starts at offset 0 so the next one is just + * at the end of the first bitmap. + */ + next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); + + /* Test a bit straddling two bitmaps */ + ret = add_free_space_entry(cache, next_bitmap_offset - + (2 * 1024 * 1024), 4 * 1024 * 1024, 1); + if (ret) { + printk(KERN_ERR "Couldn't add space that straddles two bitmaps" + " %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, next_bitmap_offset - + (1 * 1024 * 1024), 2 * 1024 * 1024); + if (ret) { + printk(KERN_ERR "Couldn't remove overlapping space %d\n", ret); + return ret; + } + + if (check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024), + 2 * 1024 * 1024)) { + printk(KERN_ERR "Left some space when removing overlapping\n"); + return -1; + } + + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + return 0; +} + +/* This is the high grade jackassery */ +static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) +{ + u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); + int ret; + + printk(KERN_ERR "Running bitmap and extent tests\n"); + + /* + * First let's do something simple, an extent at the same offset as the + * bitmap, but the free space completely in the extent and then + * completely in the bitmap. + */ + ret = add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1); + if (ret) { + printk(KERN_ERR "Couldn't create bitmap entry %d\n", ret); + return ret; + } + + ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); + if (ret) { + printk(KERN_ERR "Couldn't add extent entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); + if (ret) { + printk(KERN_ERR "Couldn't remove extent entry %d\n", ret); + return ret; + } + + if (check_exists(cache, 0, 1 * 1024 * 1024)) { + printk(KERN_ERR "Left remnants after our remove\n"); + return -1; + } + + /* Now to add back the extent entry and remove from the bitmap */ + ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); + if (ret) { + printk(KERN_ERR "Couldn't re-add extent entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024); + if (ret) { + printk(KERN_ERR "Couldn't remove from bitmap %d\n", ret); + return ret; + } + + if (check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) { + printk(KERN_ERR "Left remnants in the bitmap\n"); + return -1; + } + + /* + * Ok so a little more evil, extent entry and bitmap at the same offset, + * removing an overlapping chunk. + */ + ret = add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1); + if (ret) { + printk(KERN_ERR "Couldn't add to a bitmap %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024); + if (ret) { + printk(KERN_ERR "Couldn't remove overlapping space %d\n", ret); + return ret; + } + + if (check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) { + printk(KERN_ERR "Left over peices after removing " + "overlapping\n"); + return -1; + } + + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + /* Now with the extent entry offset into the bitmap */ + ret = add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1); + if (ret) { + printk(KERN_ERR "Couldn't add space to the bitmap %d\n", ret); + return ret; + } + + ret = add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0); + if (ret) { + printk(KERN_ERR "Couldn't add extent to the cache %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024); + if (ret) { + printk(KERN_ERR "Problem removing overlapping space %d\n", ret); + return ret; + } + + if (check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) { + printk(KERN_ERR "Left something behind when removing space"); + return -1; + } + + /* + * This has blown up in the past, the extent entry starts before the + * bitmap entry, but we're trying to remove an offset that falls + * completely within the bitmap range and is in both the extent entry + * and the bitmap entry, looks like this + * + * [ extent ] + * [ bitmap ] + * [ del ] + */ + __btrfs_remove_free_space_cache(cache->free_space_ctl); + ret = add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024, + 4 * 1024 * 1024, 1); + if (ret) { + printk(KERN_ERR "Couldn't add bitmap %d\n", ret); + return ret; + } + + ret = add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024, + 5 * 1024 * 1024, 0); + if (ret) { + printk(KERN_ERR "Couldn't add extent entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024, + 5 * 1024 * 1024); + if (ret) { + printk(KERN_ERR "Failed to free our space %d\n", ret); + return ret; + } + + if (check_exists(cache, bitmap_offset + 1 * 1024 * 1024, + 5 * 1024 * 1024)) { + printk(KERN_ERR "Left stuff over\n"); + return -1; + } + + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + /* + * This blew up before, we have part of the free space in a bitmap and + * then the entirety of the rest of the space in an extent. This used + * to return -EAGAIN back from btrfs_remove_extent, make sure this + * doesn't happen. + */ + ret = add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1); + if (ret) { + printk(KERN_ERR "Couldn't add bitmap entry %d\n", ret); + return ret; + } + + ret = add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0); + if (ret) { + printk(KERN_ERR "Couldn't add extent entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024); + if (ret) { + printk(KERN_ERR "Error removing bitmap and extent " + "overlapping %d\n", ret); + return ret; + } + + __btrfs_remove_free_space_cache(cache->free_space_ctl); + return 0; +} + +void btrfs_test_free_space_cache(void) +{ + struct btrfs_block_group_cache *cache; + + printk(KERN_ERR "Running btrfs free space cache tests\n"); + + cache = init_test_block_group(); + if (!cache) { + printk(KERN_ERR "Couldn't run the tests\n"); + return; + } + + if (test_extents(cache)) + goto out; + if (test_bitmaps(cache)) + goto out; + if (test_bitmaps_and_extents(cache)) + goto out; +out: + __btrfs_remove_free_space_cache(cache->free_space_ctl); + kfree(cache->free_space_ctl); + kfree(cache); + printk(KERN_ERR "Free space cache tests finished\n"); +} +#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */ diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 8f2613f779ed..4dc17d8809c7 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -110,4 +110,9 @@ int btrfs_return_cluster_to_free_space( struct btrfs_free_cluster *cluster); int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen); + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +void btrfs_test_free_space_cache(void); +#endif + #endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 48b8fda93132..e0b7034d6343 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -183,10 +183,11 @@ int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans, return -ENOENT; } -int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, u64 *index) +static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, + u64 *index) { struct btrfs_path *path; struct btrfs_key key; @@ -246,7 +247,7 @@ int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, memmove_extent_buffer(leaf, ptr, ptr + del_len, item_size - (ptr + del_len - item_start)); - btrfs_truncate_item(trans, root, path, item_size - del_len, 1); + btrfs_truncate_item(root, path, item_size - del_len, 1); out: btrfs_free_path(path); @@ -309,7 +310,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_size - (ptr + sub_item_len - item_start)); - btrfs_truncate_item(trans, root, path, item_size - sub_item_len, 1); + btrfs_truncate_item(root, path, item_size - sub_item_len, 1); out: btrfs_free_path(path); @@ -361,7 +362,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, name, name_len, NULL)) goto out; - btrfs_extend_item(trans, root, path, ins_len); + btrfs_extend_item(root, path, ins_len); ret = 0; } if (ret < 0) @@ -417,7 +418,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, goto out; old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); - btrfs_extend_item(trans, root, path, ins_len); + btrfs_extend_item(root, path, ins_len); ref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 09c58a35b429..9b31b3b091fc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -32,6 +32,7 @@ #include <linux/writeback.h> #include <linux/statfs.h> #include <linux/compat.h> +#include <linux/aio.h> #include <linux/bit_spinlock.h> #include <linux/xattr.h> #include <linux/posix_acl.h> @@ -100,7 +101,10 @@ static noinline int cow_file_range(struct inode *inode, static struct extent_map *create_pinned_em(struct inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, - u64 orig_block_len, int type); + u64 orig_block_len, u64 ram_bytes, + int type); + +static int btrfs_dirty_inode(struct inode *inode); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, @@ -722,6 +726,7 @@ retry: em->block_start = ins.objectid; em->block_len = ins.offset; em->orig_block_len = ins.offset; + em->ram_bytes = async_extent->ram_size; em->bdev = root->fs_info->fs_devices->latest_bdev; em->compress_type = async_extent->compress_type; set_bit(EXTENT_FLAG_PINNED, &em->flags); @@ -730,10 +735,7 @@ retry: while (1) { write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); - if (!ret) - list_move(&em->list, - &em_tree->modified_extents); + ret = add_extent_mapping(em_tree, em, 1); write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); @@ -921,7 +923,8 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, } em = alloc_extent_map(); - BUG_ON(!em); /* -ENOMEM */ + if (!em) + goto out_reserve; em->start = start; em->orig_start = em->start; ram_size = ins.offset; @@ -932,16 +935,14 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, em->block_start = ins.objectid; em->block_len = ins.offset; em->orig_block_len = ins.offset; + em->ram_bytes = ram_size; em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); em->generation = -1; while (1) { write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); - if (!ret) - list_move(&em->list, - &em_tree->modified_extents); + ret = add_extent_mapping(em_tree, em, 1); write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); @@ -950,11 +951,14 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); } + if (ret) + goto out_reserve; cur_alloc_size = ins.offset; ret = btrfs_add_ordered_extent(inode, start, ins.objectid, ram_size, cur_alloc_size, 0); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + goto out_reserve; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { @@ -962,7 +966,7 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, cur_alloc_size); if (ret) { btrfs_abort_transaction(trans, root, ret); - goto out_unlock; + goto out_reserve; } } @@ -991,6 +995,8 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, out: return ret; +out_reserve: + btrfs_free_reserved_extent(root, ins.objectid, ins.offset); out_unlock: extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, @@ -1194,6 +1200,7 @@ static noinline int run_delalloc_nocow(struct inode *inode, u64 disk_bytenr; u64 num_bytes; u64 disk_num_bytes; + u64 ram_bytes; int extent_type; int ret, err; int type; @@ -1290,6 +1297,7 @@ next_slot: struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); + ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); @@ -1373,6 +1381,7 @@ out_check: em->block_len = num_bytes; em->block_start = disk_bytenr; em->orig_block_len = disk_num_bytes; + em->ram_bytes = ram_bytes; em->bdev = root->fs_info->fs_devices->latest_bdev; em->mod_start = em->start; em->mod_len = em->len; @@ -1381,10 +1390,7 @@ out_check: em->generation = -1; while (1) { write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); - if (!ret) - list_move(&em->list, - &em_tree->modified_extents); + ret = add_extent_mapping(em_tree, em, 1); write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); @@ -1525,7 +1531,7 @@ static void btrfs_merge_extent_hook(struct inode *inode, * have pending delalloc work to be done. */ static void btrfs_set_bit_hook(struct inode *inode, - struct extent_state *state, int *bits) + struct extent_state *state, unsigned long *bits) { /* @@ -1569,7 +1575,8 @@ static void btrfs_set_bit_hook(struct inode *inode, * extent_io.c clear_bit_hook, see set_bit_hook for why */ static void btrfs_clear_bit_hook(struct inode *inode, - struct extent_state *state, int *bits) + struct extent_state *state, + unsigned long *bits) { /* * set_bit and clear bit hooks normally require _irqsave/restore @@ -2793,6 +2800,8 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, int ret; struct btrfs_root *root = BTRFS_I(inode)->root; u32 csum = ~(u32)0; + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); if (PageChecked(page)) { ClearPageChecked(page); @@ -2819,7 +2828,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, if (ret) goto zeroit; - csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1); + csum = btrfs_csum_data(kaddr + offset, csum, end - start + 1); btrfs_csum_final(csum, (char *)&csum); if (csum != private) goto zeroit; @@ -2829,11 +2838,11 @@ good: return 0; zeroit: - printk_ratelimited(KERN_INFO "btrfs csum failed ino %llu off %llu csum %u " - "private %llu\n", - (unsigned long long)btrfs_ino(page->mapping->host), - (unsigned long long)start, csum, - (unsigned long long)private); + if (__ratelimit(&_rs)) + btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u private %llu", + (unsigned long long)btrfs_ino(page->mapping->host), + (unsigned long long)start, csum, + (unsigned long long)private); memset(kaddr + offset, 1, end - start + 1); flush_dcache_page(page); kunmap_atomic(kaddr); @@ -3019,7 +3028,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) * We have done the truncate/delete so we can go ahead and remove the orphan * item for this particular inode. */ -int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) +static int btrfs_orphan_del(struct btrfs_trans_handle *trans, + struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; int delete_item = 0; @@ -3114,8 +3124,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) */ if (found_key.offset == last_objectid) { - printk(KERN_ERR "btrfs: Error removing orphan entry, " - "stopping orphan cleanup\n"); + btrfs_err(root->fs_info, + "Error removing orphan entry, stopping orphan cleanup"); ret = -EINVAL; goto out; } @@ -3172,8 +3182,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) ret = PTR_ERR(trans); goto out; } - printk(KERN_ERR "auto deleting %Lu\n", - found_key.objectid); + btrfs_debug(root->fs_info, "auto deleting %Lu", + found_key.objectid); ret = btrfs_del_orphan_item(trans, root, found_key.objectid); BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ @@ -3237,13 +3247,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) } if (nr_unlink) - printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); + btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink); if (nr_truncate) - printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); + btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate); out: if (ret) - printk(KERN_CRIT "btrfs: could not do orphan cleanup %d\n", ret); + btrfs_crit(root->fs_info, + "could not do orphan cleanup %d", ret); btrfs_free_path(path); return ret; } @@ -3591,9 +3602,10 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, dir_ino, &index); if (ret) { - printk(KERN_INFO "btrfs failed to delete reference to %.*s, " - "inode %llu parent %llu\n", name_len, name, - (unsigned long long)ino, (unsigned long long)dir_ino); + btrfs_info(root->fs_info, + "failed to delete reference to %.*s, inode %llu parent %llu", + name_len, name, + (unsigned long long)ino, (unsigned long long)dir_ino); btrfs_abort_transaction(trans, root, ret); goto err; } @@ -3615,6 +3627,8 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, dir, index); if (ret == -ENOENT) ret = 0; + else if (ret) + btrfs_abort_transaction(trans, root, ret); err: btrfs_free_path(path); if (ret) @@ -3660,7 +3674,7 @@ static int check_path_shared(struct btrfs_root *root, eb = path->nodes[level]; if (!btrfs_block_can_be_shared(root, eb)) continue; - ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len, + ret = btrfs_lookup_extent_info(NULL, root, eb->start, level, 1, &refs, NULL); if (refs > 1) return 1; @@ -4175,8 +4189,7 @@ search_again: } size = btrfs_file_extent_calc_inline_size(size); - btrfs_truncate_item(trans, root, path, - size, 1); + btrfs_truncate_item(root, path, size, 1); } else if (root->ref_cows) { inode_sub_bytes(inode, item_end + 1 - found_key.offset); @@ -4450,16 +4463,14 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) hole_em->block_start = EXTENT_MAP_HOLE; hole_em->block_len = 0; hole_em->orig_block_len = 0; + hole_em->ram_bytes = hole_size; hole_em->bdev = root->fs_info->fs_devices->latest_bdev; hole_em->compress_type = BTRFS_COMPRESS_NONE; hole_em->generation = trans->transid; while (1) { write_lock(&em_tree->lock); - err = add_extent_mapping(em_tree, hole_em); - if (!err) - list_move(&hole_em->list, - &em_tree->modified_extents); + err = add_extent_mapping(em_tree, hole_em, 1); write_unlock(&em_tree->lock); if (err != -EEXIST) break; @@ -4670,8 +4681,9 @@ void btrfs_evict_inode(struct inode *inode) ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size); if (ret) { - printk(KERN_WARNING "Could not get space for a " - "delete, will truncate on mount %d\n", ret); + btrfs_warn(root->fs_info, + "Could not get space for a delete, will truncate on mount %d", + ret); btrfs_orphan_del(NULL, inode); btrfs_free_block_rsv(root, rsv); goto no_delete; @@ -5335,7 +5347,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) * FIXME, needs more benchmarking...there are no reasons other than performance * to keep or drop this code. */ -int btrfs_dirty_inode(struct inode *inode) +static int btrfs_dirty_inode(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; @@ -5977,7 +5989,7 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree, em->block_start += start_diff; em->block_len -= start_diff; } - return add_extent_mapping(em_tree, em); + return add_extent_mapping(em_tree, em, 0); } static noinline int uncompress_inline(struct btrfs_path *path, @@ -6151,6 +6163,7 @@ again: goto not_found_em; } + em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, item); if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { em->start = extent_start; @@ -6259,18 +6272,18 @@ not_found_em: insert: btrfs_release_path(path); if (em->start > start || extent_map_end(em) <= start) { - printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed " - "[%llu %llu]\n", (unsigned long long)em->start, - (unsigned long long)em->len, - (unsigned long long)start, - (unsigned long long)len); + btrfs_err(root->fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]", + (unsigned long long)em->start, + (unsigned long long)em->len, + (unsigned long long)start, + (unsigned long long)len); err = -EIO; goto out; } err = 0; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); + ret = add_extent_mapping(em_tree, em, 0); /* it is possible that someone inserted the extent into the tree * while we had the lock dropped. It is also possible that * an overlapping map exists in the tree @@ -6482,7 +6495,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, } em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, - ins.offset, ins.offset, 0); + ins.offset, ins.offset, ins.offset, 0); if (IS_ERR(em)) goto out; @@ -6502,7 +6515,9 @@ out: * block must be cow'd */ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, - struct inode *inode, u64 offset, u64 len) + struct inode *inode, u64 offset, u64 *len, + u64 *orig_start, u64 *orig_block_len, + u64 *ram_bytes) { struct btrfs_path *path; int ret; @@ -6559,8 +6574,12 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); backref_offset = btrfs_file_extent_offset(leaf, fi); + *orig_start = key.offset - backref_offset; + *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); + *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); - if (extent_end < offset + len) { + if (extent_end < offset + *len) { /* extent doesn't include our full range, must cow */ goto out; } @@ -6584,13 +6603,14 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, */ disk_bytenr += backref_offset; disk_bytenr += offset - key.offset; - num_bytes = min(offset + len, extent_end) - offset; + num_bytes = min(offset + *len, extent_end) - offset; if (csum_exist_in_range(root, disk_bytenr, num_bytes)) goto out; /* * all of the above have passed, it is safe to overwrite this extent * without cow */ + *len = num_bytes; ret = 1; out: btrfs_free_path(path); @@ -6661,7 +6681,8 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, static struct extent_map *create_pinned_em(struct inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, - u64 orig_block_len, int type) + u64 orig_block_len, u64 ram_bytes, + int type) { struct extent_map_tree *em_tree; struct extent_map *em; @@ -6682,6 +6703,7 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, em->block_start = block_start; em->bdev = root->fs_info->fs_devices->latest_bdev; em->orig_block_len = orig_block_len; + em->ram_bytes = ram_bytes; em->generation = -1; set_bit(EXTENT_FLAG_PINNED, &em->flags); if (type == BTRFS_ORDERED_PREALLOC) @@ -6691,10 +6713,7 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, btrfs_drop_extent_cache(inode, em->start, em->start + em->len - 1, 0); write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); - if (!ret) - list_move(&em->list, - &em_tree->modified_extents); + ret = add_extent_mapping(em_tree, em, 1); write_unlock(&em_tree->lock); } while (ret == -EEXIST); @@ -6789,7 +6808,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, em->block_start != EXTENT_MAP_HOLE)) { int type; int ret; - u64 block_start; + u64 block_start, orig_start, orig_block_len, ram_bytes; if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) type = BTRFS_ORDERED_PREALLOC; @@ -6807,16 +6826,15 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (IS_ERR(trans)) goto must_cow; - if (can_nocow_odirect(trans, inode, start, len) == 1) { - u64 orig_start = em->orig_start; - u64 orig_block_len = em->orig_block_len; - + if (can_nocow_odirect(trans, inode, start, &len, &orig_start, + &orig_block_len, &ram_bytes) == 1) { if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); em = create_pinned_em(inode, start, len, orig_start, block_start, len, - orig_block_len, type); + orig_block_len, + ram_bytes, type); if (IS_ERR(em)) { btrfs_end_transaction(trans, root); goto unlock_err; @@ -6936,7 +6954,7 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) goto failed; local_irq_save(flags); kaddr = kmap_atomic(page); - csum = btrfs_csum_data(root, kaddr + bvec->bv_offset, + csum = btrfs_csum_data(kaddr + bvec->bv_offset, csum, bvec->bv_len); btrfs_csum_final(csum, (char *)&csum); kunmap_atomic(kaddr); @@ -6945,11 +6963,10 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) flush_dcache_page(bvec->bv_page); if (csum != private) { failed: - printk(KERN_ERR "btrfs csum failed ino %llu off" - " %llu csum %u private %u\n", - (unsigned long long)btrfs_ino(inode), - (unsigned long long)start, - csum, (unsigned)private); + btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u private %u", + (unsigned long long)btrfs_ino(inode), + (unsigned long long)start, + csum, (unsigned)private); err = -EIO; } } @@ -7425,8 +7442,8 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc) return extent_write_full_page(tree, page, btrfs_get_extent, wbc); } -int btrfs_writepages(struct address_space *mapping, - struct writeback_control *wbc) +static int btrfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) { struct extent_io_tree *tree; @@ -7941,8 +7958,8 @@ void btrfs_destroy_inode(struct inode *inode) if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &BTRFS_I(inode)->runtime_flags)) { - printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n", - (unsigned long long)btrfs_ino(inode)); + btrfs_info(root->fs_info, "inode %llu still on the orphan list", + (unsigned long long)btrfs_ino(inode)); atomic_dec(&root->orphan_inodes); } @@ -7951,10 +7968,9 @@ void btrfs_destroy_inode(struct inode *inode) if (!ordered) break; else { - printk(KERN_ERR "btrfs found ordered " - "extent %llu %llu on inode cleanup\n", - (unsigned long long)ordered->file_offset, - (unsigned long long)ordered->len); + btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup", + (unsigned long long)ordered->file_offset, + (unsigned long long)ordered->len); btrfs_remove_ordered_extent(inode, ordered); btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); @@ -8571,16 +8587,14 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, em->block_start = ins.objectid; em->block_len = ins.offset; em->orig_block_len = ins.offset; + em->ram_bytes = ins.offset; em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PREALLOC, &em->flags); em->generation = trans->transid; while (1) { write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); - if (!ret) - list_move(&em->list, - &em_tree->modified_extents); + ret = add_extent_mapping(em_tree, em, 1); write_unlock(&em_tree->lock); if (ret != -EEXIST) break; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2c02310ff2d9..0de4a2fcfb24 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -723,7 +723,9 @@ static noinline int btrfs_mksubvol(struct path *parent, struct dentry *dentry; int error; - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + error = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); + if (error == -EINTR) + return error; dentry = lookup_one_len(name, parent->dentry, namelen); error = PTR_ERR(dentry); @@ -1152,8 +1154,11 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, u64 new_align = ~((u64)128 * 1024 - 1); struct page **pages = NULL; - if (extent_thresh == 0) - extent_thresh = 256 * 1024; + if (isize == 0) + return 0; + + if (range->start >= isize) + return -EINVAL; if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) { if (range->compress_type > BTRFS_COMPRESS_TYPES) @@ -1162,8 +1167,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, compress_type = range->compress_type; } - if (isize == 0) - return 0; + if (extent_thresh == 0) + extent_thresh = 256 * 1024; /* * if we were not given a file, allocate a readahead @@ -2086,7 +2091,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (err) goto out; - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); + if (err == -EINTR) + goto out; dentry = lookup_one_len(vol_args->name, parent, namelen); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); @@ -2425,7 +2432,6 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) mutex_lock(&fs_devices->device_list_mutex); dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL); - mutex_unlock(&fs_devices->device_list_mutex); if (!dev) { ret = -ENODEV; @@ -2449,6 +2455,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) } out: + mutex_unlock(&fs_devices->device_list_mutex); if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args))) ret = -EFAULT; @@ -3003,7 +3010,7 @@ void btrfs_get_block_group_info(struct list_head *groups_list, } } -long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) { struct btrfs_ioctl_space_args space_args; struct btrfs_ioctl_space_info space; @@ -3693,12 +3700,11 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) goto drop_write; } - if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) { - trans = btrfs_start_transaction(root, 2); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out; - } + down_write(&root->fs_info->subvol_sem); + trans = btrfs_start_transaction(root->fs_info->tree_root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; } switch (sa->cmd) { @@ -3708,9 +3714,6 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) case BTRFS_QUOTA_CTL_DISABLE: ret = btrfs_quota_disable(trans, root->fs_info); break; - case BTRFS_QUOTA_CTL_RESCAN: - ret = btrfs_quota_rescan(root->fs_info); - break; default: ret = -EINVAL; break; @@ -3719,13 +3722,12 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) if (copy_to_user(arg, sa, sizeof(*sa))) ret = -EFAULT; - if (trans) { - err = btrfs_commit_transaction(trans, root); - if (err && !ret) - ret = err; - } + err = btrfs_commit_transaction(trans, root->fs_info->tree_root); + if (err && !ret) + ret = err; out: kfree(sa); + up_write(&root->fs_info->subvol_sem); drop_write: mnt_drop_write_file(file); return ret; @@ -3877,6 +3879,64 @@ drop_write: return ret; } +static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_ioctl_quota_rescan_args *qsa; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + qsa = memdup_user(arg, sizeof(*qsa)); + if (IS_ERR(qsa)) { + ret = PTR_ERR(qsa); + goto drop_write; + } + + if (qsa->flags) { + ret = -EINVAL; + goto out; + } + + ret = btrfs_qgroup_rescan(root->fs_info); + +out: + kfree(qsa); +drop_write: + mnt_drop_write_file(file); + return ret; +} + +static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_ioctl_quota_rescan_args *qsa; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + qsa = kzalloc(sizeof(*qsa), GFP_NOFS); + if (!qsa) + return -ENOMEM; + + if (root->fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + qsa->flags = 1; + qsa->progress = root->fs_info->qgroup_rescan_progress.objectid; + } + + if (copy_to_user(arg, qsa, sizeof(*qsa))) + ret = -EFAULT; + + kfree(qsa); + return ret; +} + static long btrfs_ioctl_set_received_subvol(struct file *file, void __user *arg) { @@ -4115,6 +4175,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_qgroup_create(file, argp); case BTRFS_IOC_QGROUP_LIMIT: return btrfs_ioctl_qgroup_limit(file, argp); + case BTRFS_IOC_QUOTA_RESCAN: + return btrfs_ioctl_quota_rescan(file, argp); + case BTRFS_IOC_QUOTA_RESCAN_STATUS: + return btrfs_ioctl_quota_rescan_status(file, argp); case BTRFS_IOC_DEV_REPLACE: return btrfs_ioctl_dev_replace(root, argp); case BTRFS_IOC_GET_FSLABEL: diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index e95df435d897..01277b8f2373 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -24,7 +24,7 @@ #include "extent_io.h" #include "locking.h" -void btrfs_assert_tree_read_locked(struct extent_buffer *eb); +static void btrfs_assert_tree_read_locked(struct extent_buffer *eb); /* * if we currently have a spinning reader or writer lock @@ -264,7 +264,7 @@ void btrfs_assert_tree_locked(struct extent_buffer *eb) BUG_ON(!atomic_read(&eb->write_locks)); } -void btrfs_assert_tree_read_locked(struct extent_buffer *eb) +static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { BUG_ON(!atomic_read(&eb->read_locks)); } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 005c45db699e..1ddd728541ee 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -986,7 +986,7 @@ out: * be reclaimed before their checksum is actually put into the btree */ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, - u32 *sum) + u32 *sum, int len) { struct btrfs_ordered_sum *ordered_sum; struct btrfs_sector_sum *sector_sums; @@ -995,22 +995,28 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, unsigned long num_sectors; unsigned long i; u32 sectorsize = BTRFS_I(inode)->root->sectorsize; - int ret = 1; + int index = 0; ordered = btrfs_lookup_ordered_extent(inode, offset); if (!ordered) - return 1; + return 0; spin_lock_irq(&tree->lock); list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { - if (disk_bytenr >= ordered_sum->bytenr) { - num_sectors = ordered_sum->len / sectorsize; - sector_sums = ordered_sum->sums; - for (i = 0; i < num_sectors; i++) { + if (disk_bytenr >= ordered_sum->bytenr && + disk_bytenr < ordered_sum->bytenr + ordered_sum->len) { + i = (disk_bytenr - ordered_sum->bytenr) >> + inode->i_sb->s_blocksize_bits; + sector_sums = ordered_sum->sums + i; + num_sectors = ordered_sum->len >> + inode->i_sb->s_blocksize_bits; + for (; i < num_sectors; i++) { if (sector_sums[i].bytenr == disk_bytenr) { - *sum = sector_sums[i].sum; - ret = 0; - goto out; + sum[index] = sector_sums[i].sum; + index++; + if (index == len) + goto out; + disk_bytenr += sectorsize; } } } @@ -1018,7 +1024,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, out: spin_unlock_irq(&tree->lock); btrfs_put_ordered_extent(ordered); - return ret; + return index; } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 8eadfe406cdd..58b0e3b0ebad 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -196,7 +196,8 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, u64 len); int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); -int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); +int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, + u32 *sum, int len); int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, struct btrfs_root *root, int wait); void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 920957ecb27e..dc0024f17c1f 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -176,7 +176,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) nr = btrfs_header_nritems(l); - printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", + btrfs_info(root->fs_info, "leaf %llu total ptrs %d free space %d", (unsigned long long)btrfs_header_bytenr(l), nr, btrfs_leaf_free_space(root, l)); for (i = 0 ; i < nr ; i++) { @@ -319,10 +319,9 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) btrfs_print_leaf(root, c); return; } - printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n", - (unsigned long long)btrfs_header_bytenr(c), - level, nr, - (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); + btrfs_info(root->fs_info, "node %llu level %d total ptrs %d free spc %u", + (unsigned long long)btrfs_header_bytenr(c), + level, nr, (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); for (i = 0; i < nr; i++) { btrfs_node_key_to_cpu(c, &key, i); printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n", diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index da75efe534d5..7faddfacc5bd 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -19,5 +19,5 @@ #ifndef __PRINT_TREE_ #define __PRINT_TREE_ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l); -void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t); +void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c); #endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index b44124dd2370..9d49c586995a 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -31,13 +31,13 @@ #include "locking.h" #include "ulist.h" #include "backref.h" +#include "extent_io.h" /* TODO XXX FIXME * - subvol delete -> delete when ref goes to 0? delete limits also? * - reorganize keys * - compressed * - sync - * - rescan * - copy also limits on subvol creation * - limit * - caches fuer ulists @@ -98,7 +98,15 @@ struct btrfs_qgroup_list { struct btrfs_qgroup *member; }; -/* must be called with qgroup_lock held */ +struct qgroup_rescan { + struct btrfs_work work; + struct btrfs_fs_info *fs_info; +}; + +static void qgroup_rescan_start(struct btrfs_fs_info *fs_info, + struct qgroup_rescan *qscan); + +/* must be called with qgroup_ioctl_lock held */ static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) { @@ -298,7 +306,20 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) } fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr); - /* FIXME read scan element */ + fs_info->qgroup_rescan_progress.objectid = + btrfs_qgroup_status_rescan(l, ptr); + if (fs_info->qgroup_flags & + BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + struct qgroup_rescan *qscan = + kmalloc(sizeof(*qscan), GFP_NOFS); + if (!qscan) { + ret = -ENOMEM; + goto out; + } + fs_info->qgroup_rescan_progress.type = 0; + fs_info->qgroup_rescan_progress.offset = 0; + qgroup_rescan_start(fs_info, qscan); + } goto next1; } @@ -420,8 +441,6 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) qgroup = rb_entry(n, struct btrfs_qgroup, node); rb_erase(n, &fs_info->qgroup_tree); - WARN_ON(!list_empty(&qgroup->dirty)); - while (!list_empty(&qgroup->groups)) { list = list_first_entry(&qgroup->groups, struct btrfs_qgroup_list, @@ -721,7 +740,8 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans, ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item); btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags); btrfs_set_qgroup_status_generation(l, ptr, trans->transid); - /* XXX scan */ + btrfs_set_qgroup_status_rescan(l, ptr, + fs_info->qgroup_rescan_progress.objectid); btrfs_mark_buffer_dirty(l); @@ -783,19 +803,21 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { struct btrfs_root *quota_root; + struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_path *path = NULL; struct btrfs_qgroup_status_item *ptr; struct extent_buffer *leaf; struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_qgroup *qgroup = NULL; int ret = 0; + int slot; - spin_lock(&fs_info->qgroup_lock); + mutex_lock(&fs_info->qgroup_ioctl_lock); if (fs_info->quota_root) { fs_info->pending_quota_state = 1; - spin_unlock(&fs_info->qgroup_lock); goto out; } - spin_unlock(&fs_info->qgroup_lock); /* * initially create the quota tree @@ -830,10 +852,57 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans, fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON | BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags); - btrfs_set_qgroup_status_scan(leaf, ptr, 0); + btrfs_set_qgroup_status_rescan(leaf, ptr, 0); btrfs_mark_buffer_dirty(leaf); + key.objectid = 0; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = 0; + + btrfs_release_path(path); + ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0); + if (ret > 0) + goto out_add_root; + if (ret < 0) + goto out_free_path; + + + while (1) { + slot = path->slots[0]; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.type == BTRFS_ROOT_REF_KEY) { + ret = add_qgroup_item(trans, quota_root, + found_key.offset); + if (ret) + goto out_free_path; + + qgroup = add_qgroup_rb(fs_info, found_key.offset); + if (IS_ERR(qgroup)) { + ret = PTR_ERR(qgroup); + goto out_free_path; + } + } + ret = btrfs_next_item(tree_root, path); + if (ret < 0) + goto out_free_path; + if (ret) + break; + } + +out_add_root: + btrfs_release_path(path); + ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID); + if (ret) + goto out_free_path; + + qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID); + if (IS_ERR(qgroup)) { + ret = PTR_ERR(qgroup); + goto out_free_path; + } spin_lock(&fs_info->qgroup_lock); fs_info->quota_root = quota_root; fs_info->pending_quota_state = 1; @@ -847,6 +916,7 @@ out_free_root: kfree(quota_root); } out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } @@ -857,11 +927,10 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, struct btrfs_root *quota_root; int ret = 0; + mutex_lock(&fs_info->qgroup_ioctl_lock); + if (!fs_info->quota_root) + goto out; spin_lock(&fs_info->qgroup_lock); - if (!fs_info->quota_root) { - spin_unlock(&fs_info->qgroup_lock); - return 0; - } fs_info->quota_enabled = 0; fs_info->pending_quota_state = 0; quota_root = fs_info->quota_root; @@ -869,8 +938,10 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, btrfs_free_qgroup_config(fs_info); spin_unlock(&fs_info->qgroup_lock); - if (!quota_root) - return -EINVAL; + if (!quota_root) { + ret = -EINVAL; + goto out; + } ret = btrfs_clean_quota_tree(trans, quota_root); if (ret) @@ -891,39 +962,62 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, free_extent_buffer(quota_root->commit_root); kfree(quota_root); out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } -int btrfs_quota_rescan(struct btrfs_fs_info *fs_info) +static void qgroup_dirty(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *qgroup) { - /* FIXME */ - return 0; + if (list_empty(&qgroup->dirty)) + list_add(&qgroup->dirty, &fs_info->dirty_qgroups); } int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 src, u64 dst) { struct btrfs_root *quota_root; + struct btrfs_qgroup *parent; + struct btrfs_qgroup *member; + struct btrfs_qgroup_list *list; int ret = 0; + mutex_lock(&fs_info->qgroup_ioctl_lock); quota_root = fs_info->quota_root; - if (!quota_root) - return -EINVAL; + if (!quota_root) { + ret = -EINVAL; + goto out; + } + member = find_qgroup_rb(fs_info, src); + parent = find_qgroup_rb(fs_info, dst); + if (!member || !parent) { + ret = -EINVAL; + goto out; + } + + /* check if such qgroup relation exist firstly */ + list_for_each_entry(list, &member->groups, next_group) { + if (list->group == parent) { + ret = -EEXIST; + goto out; + } + } ret = add_qgroup_relation_item(trans, quota_root, src, dst); if (ret) - return ret; + goto out; ret = add_qgroup_relation_item(trans, quota_root, dst, src); if (ret) { del_qgroup_relation_item(trans, quota_root, src, dst); - return ret; + goto out; } spin_lock(&fs_info->qgroup_lock); ret = add_relation_rb(quota_root->fs_info, src, dst); spin_unlock(&fs_info->qgroup_lock); - +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } @@ -931,13 +1025,34 @@ int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 src, u64 dst) { struct btrfs_root *quota_root; + struct btrfs_qgroup *parent; + struct btrfs_qgroup *member; + struct btrfs_qgroup_list *list; int ret = 0; int err; + mutex_lock(&fs_info->qgroup_ioctl_lock); quota_root = fs_info->quota_root; - if (!quota_root) - return -EINVAL; + if (!quota_root) { + ret = -EINVAL; + goto out; + } + member = find_qgroup_rb(fs_info, src); + parent = find_qgroup_rb(fs_info, dst); + if (!member || !parent) { + ret = -EINVAL; + goto out; + } + + /* check if such qgroup relation exist firstly */ + list_for_each_entry(list, &member->groups, next_group) { + if (list->group == parent) + goto exist; + } + ret = -ENOENT; + goto out; +exist: ret = del_qgroup_relation_item(trans, quota_root, src, dst); err = del_qgroup_relation_item(trans, quota_root, dst, src); if (err && !ret) @@ -945,9 +1060,9 @@ int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, spin_lock(&fs_info->qgroup_lock); del_relation_rb(fs_info, src, dst); - spin_unlock(&fs_info->qgroup_lock); - +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } @@ -958,11 +1073,21 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, struct btrfs_qgroup *qgroup; int ret = 0; + mutex_lock(&fs_info->qgroup_ioctl_lock); quota_root = fs_info->quota_root; - if (!quota_root) - return -EINVAL; + if (!quota_root) { + ret = -EINVAL; + goto out; + } + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (qgroup) { + ret = -EEXIST; + goto out; + } ret = add_qgroup_item(trans, quota_root, qgroupid); + if (ret) + goto out; spin_lock(&fs_info->qgroup_lock); qgroup = add_qgroup_rb(fs_info, qgroupid); @@ -970,7 +1095,8 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, if (IS_ERR(qgroup)) ret = PTR_ERR(qgroup); - +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } @@ -981,27 +1107,32 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, struct btrfs_qgroup *qgroup; int ret = 0; + mutex_lock(&fs_info->qgroup_ioctl_lock); quota_root = fs_info->quota_root; - if (!quota_root) - return -EINVAL; + if (!quota_root) { + ret = -EINVAL; + goto out; + } - /* check if there are no relations to this qgroup */ - spin_lock(&fs_info->qgroup_lock); qgroup = find_qgroup_rb(fs_info, qgroupid); - if (qgroup) { - if (!list_empty(&qgroup->groups) || !list_empty(&qgroup->members)) { - spin_unlock(&fs_info->qgroup_lock); - return -EBUSY; + if (!qgroup) { + ret = -ENOENT; + goto out; + } else { + /* check if there are no relations to this qgroup */ + if (!list_empty(&qgroup->groups) || + !list_empty(&qgroup->members)) { + ret = -EBUSY; + goto out; } } - spin_unlock(&fs_info->qgroup_lock); - ret = del_qgroup_item(trans, quota_root, qgroupid); spin_lock(&fs_info->qgroup_lock); del_qgroup_rb(quota_root->fs_info, qgroupid); spin_unlock(&fs_info->qgroup_lock); - +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } @@ -1009,13 +1140,22 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 qgroupid, struct btrfs_qgroup_limit *limit) { - struct btrfs_root *quota_root = fs_info->quota_root; + struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; int ret = 0; - if (!quota_root) - return -EINVAL; + mutex_lock(&fs_info->qgroup_ioctl_lock); + quota_root = fs_info->quota_root; + if (!quota_root) { + ret = -EINVAL; + goto out; + } + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (!qgroup) { + ret = -ENOENT; + goto out; + } ret = update_qgroup_limit_item(trans, quota_root, qgroupid, limit->flags, limit->max_rfer, limit->max_excl, limit->rsv_rfer, @@ -1027,31 +1167,17 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, } spin_lock(&fs_info->qgroup_lock); - - qgroup = find_qgroup_rb(fs_info, qgroupid); - if (!qgroup) { - ret = -ENOENT; - goto unlock; - } qgroup->lim_flags = limit->flags; qgroup->max_rfer = limit->max_rfer; qgroup->max_excl = limit->max_excl; qgroup->rsv_rfer = limit->rsv_rfer; qgroup->rsv_excl = limit->rsv_excl; - -unlock: spin_unlock(&fs_info->qgroup_lock); - +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } -static void qgroup_dirty(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup *qgroup) -{ - if (list_empty(&qgroup->dirty)) - list_add(&qgroup->dirty, &fs_info->dirty_qgroups); -} - /* * btrfs_qgroup_record_ref is called when the ref is added or deleted. it puts * the modification into a list that's later used by btrfs_end_transaction to @@ -1075,6 +1201,144 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, return 0; } +static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info, + struct ulist *roots, struct ulist *tmp, + u64 seq) +{ + struct ulist_node *unode; + struct ulist_iterator uiter; + struct ulist_node *tmp_unode; + struct ulist_iterator tmp_uiter; + struct btrfs_qgroup *qg; + int ret; + + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(roots, &uiter))) { + qg = find_qgroup_rb(fs_info, unode->val); + if (!qg) + continue; + + ulist_reinit(tmp); + /* XXX id not needed */ + ret = ulist_add(tmp, qg->qgroupid, + (u64)(uintptr_t)qg, GFP_ATOMIC); + if (ret < 0) + return ret; + ULIST_ITER_INIT(&tmp_uiter); + while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { + struct btrfs_qgroup_list *glist; + + qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; + if (qg->refcnt < seq) + qg->refcnt = seq + 1; + else + ++qg->refcnt; + + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + (u64)(uintptr_t)glist->group, + GFP_ATOMIC); + if (ret < 0) + return ret; + } + } + } + + return 0; +} + +static int qgroup_account_ref_step2(struct btrfs_fs_info *fs_info, + struct ulist *roots, struct ulist *tmp, + u64 seq, int sgn, u64 num_bytes, + struct btrfs_qgroup *qgroup) +{ + struct ulist_node *unode; + struct ulist_iterator uiter; + struct btrfs_qgroup *qg; + struct btrfs_qgroup_list *glist; + int ret; + + ulist_reinit(tmp); + ret = ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); + if (ret < 0) + return ret; + + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(tmp, &uiter))) { + qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; + if (qg->refcnt < seq) { + /* not visited by step 1 */ + qg->rfer += sgn * num_bytes; + qg->rfer_cmpr += sgn * num_bytes; + if (roots->nnodes == 0) { + qg->excl += sgn * num_bytes; + qg->excl_cmpr += sgn * num_bytes; + } + qgroup_dirty(fs_info, qg); + } + WARN_ON(qg->tag >= seq); + qg->tag = seq; + + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + (uintptr_t)glist->group, GFP_ATOMIC); + if (ret < 0) + return ret; + } + } + + return 0; +} + +static int qgroup_account_ref_step3(struct btrfs_fs_info *fs_info, + struct ulist *roots, struct ulist *tmp, + u64 seq, int sgn, u64 num_bytes) +{ + struct ulist_node *unode; + struct ulist_iterator uiter; + struct btrfs_qgroup *qg; + struct ulist_node *tmp_unode; + struct ulist_iterator tmp_uiter; + int ret; + + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(roots, &uiter))) { + qg = find_qgroup_rb(fs_info, unode->val); + if (!qg) + continue; + + ulist_reinit(tmp); + ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC); + if (ret < 0) + return ret; + + ULIST_ITER_INIT(&tmp_uiter); + while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { + struct btrfs_qgroup_list *glist; + + qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; + if (qg->tag == seq) + continue; + + if (qg->refcnt - seq == roots->nnodes) { + qg->excl -= sgn * num_bytes; + qg->excl_cmpr -= sgn * num_bytes; + qgroup_dirty(fs_info, qg); + } + + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + (uintptr_t)glist->group, + GFP_ATOMIC); + if (ret < 0) + return ret; + } + } + } + + return 0; +} + /* * btrfs_qgroup_account_ref is called for every ref that is added to or deleted * from the fs. First, all roots referencing the extent are searched, and @@ -1090,10 +1354,8 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, struct btrfs_root *quota_root; u64 ref_root; struct btrfs_qgroup *qgroup; - struct ulist_node *unode; struct ulist *roots = NULL; struct ulist *tmp = NULL; - struct ulist_iterator uiter; u64 seq; int ret = 0; int sgn; @@ -1132,9 +1394,11 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, case BTRFS_ADD_DELAYED_REF: case BTRFS_ADD_DELAYED_EXTENT: sgn = 1; + seq = btrfs_tree_mod_seq_prev(node->seq); break; case BTRFS_DROP_DELAYED_REF: sgn = -1; + seq = node->seq; break; case BTRFS_UPDATE_DELAYED_HEAD: return 0; @@ -1142,20 +1406,37 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, BUG(); } + mutex_lock(&fs_info->qgroup_rescan_lock); + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) { + mutex_unlock(&fs_info->qgroup_rescan_lock); + return 0; + } + } + mutex_unlock(&fs_info->qgroup_rescan_lock); + /* * the delayed ref sequence number we pass depends on the direction of - * the operation. for add operations, we pass (node->seq - 1) to skip + * the operation. for add operations, we pass + * tree_mod_log_prev_seq(node->seq) to skip * the delayed ref's current sequence number, because we need the state * of the tree before the add operation. for delete operations, we pass * (node->seq) to include the delayed ref's current sequence number, * because we need the state of the tree after the delete operation. */ - ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, - sgn > 0 ? node->seq - 1 : node->seq, &roots); + ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, seq, &roots); if (ret < 0) return ret; + mutex_lock(&fs_info->qgroup_rescan_lock); spin_lock(&fs_info->qgroup_lock); + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) { + ret = 0; + goto unlock; + } + } + quota_root = fs_info->quota_root; if (!quota_root) goto unlock; @@ -1175,106 +1456,29 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, seq = fs_info->qgroup_seq; fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(roots, &uiter))) { - struct ulist_node *tmp_unode; - struct ulist_iterator tmp_uiter; - struct btrfs_qgroup *qg; - - qg = find_qgroup_rb(fs_info, unode->val); - if (!qg) - continue; - - ulist_reinit(tmp); - /* XXX id not needed */ - ulist_add(tmp, qg->qgroupid, (u64)(uintptr_t)qg, GFP_ATOMIC); - ULIST_ITER_INIT(&tmp_uiter); - while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { - struct btrfs_qgroup_list *glist; - - qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; - if (qg->refcnt < seq) - qg->refcnt = seq + 1; - else - ++qg->refcnt; - - list_for_each_entry(glist, &qg->groups, next_group) { - ulist_add(tmp, glist->group->qgroupid, - (u64)(uintptr_t)glist->group, - GFP_ATOMIC); - } - } - } + ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq); + if (ret) + goto unlock; /* * step 2: walk from the new root */ - ulist_reinit(tmp); - ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(tmp, &uiter))) { - struct btrfs_qgroup *qg; - struct btrfs_qgroup_list *glist; - - qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; - if (qg->refcnt < seq) { - /* not visited by step 1 */ - qg->rfer += sgn * node->num_bytes; - qg->rfer_cmpr += sgn * node->num_bytes; - if (roots->nnodes == 0) { - qg->excl += sgn * node->num_bytes; - qg->excl_cmpr += sgn * node->num_bytes; - } - qgroup_dirty(fs_info, qg); - } - WARN_ON(qg->tag >= seq); - qg->tag = seq; - - list_for_each_entry(glist, &qg->groups, next_group) { - ulist_add(tmp, glist->group->qgroupid, - (uintptr_t)glist->group, GFP_ATOMIC); - } - } + ret = qgroup_account_ref_step2(fs_info, roots, tmp, seq, sgn, + node->num_bytes, qgroup); + if (ret) + goto unlock; /* * step 3: walk again from old refs */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(roots, &uiter))) { - struct btrfs_qgroup *qg; - struct ulist_node *tmp_unode; - struct ulist_iterator tmp_uiter; - - qg = find_qgroup_rb(fs_info, unode->val); - if (!qg) - continue; - - ulist_reinit(tmp); - ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC); - ULIST_ITER_INIT(&tmp_uiter); - while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { - struct btrfs_qgroup_list *glist; - - qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux; - if (qg->tag == seq) - continue; - - if (qg->refcnt - seq == roots->nnodes) { - qg->excl -= sgn * node->num_bytes; - qg->excl_cmpr -= sgn * node->num_bytes; - qgroup_dirty(fs_info, qg); - } + ret = qgroup_account_ref_step3(fs_info, roots, tmp, seq, sgn, + node->num_bytes); + if (ret) + goto unlock; - list_for_each_entry(glist, &qg->groups, next_group) { - ulist_add(tmp, glist->group->qgroupid, - (uintptr_t)glist->group, - GFP_ATOMIC); - } - } - } - ret = 0; unlock: spin_unlock(&fs_info->qgroup_lock); + mutex_unlock(&fs_info->qgroup_rescan_lock); ulist_free(roots); ulist_free(tmp); @@ -1289,10 +1493,14 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, { struct btrfs_root *quota_root = fs_info->quota_root; int ret = 0; + int start_rescan_worker = 0; if (!quota_root) goto out; + if (!fs_info->quota_enabled && fs_info->pending_quota_state) + start_rescan_worker = 1; + fs_info->quota_enabled = fs_info->pending_quota_state; spin_lock(&fs_info->qgroup_lock); @@ -1318,6 +1526,13 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, if (ret) fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + if (!ret && start_rescan_worker) { + ret = btrfs_qgroup_rescan(fs_info); + if (ret) + pr_err("btrfs: start rescan quota failed: %d\n", ret); + ret = 0; + } + out: return ret; @@ -1338,12 +1553,30 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, struct btrfs_qgroup *srcgroup; struct btrfs_qgroup *dstgroup; u32 level_size = 0; + u64 nums; + mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_enabled) - return 0; + goto out; - if (!quota_root) - return -EINVAL; + if (!quota_root) { + ret = -EINVAL; + goto out; + } + + if (inherit) { + i_qgroups = (u64 *)(inherit + 1); + nums = inherit->num_qgroups + 2 * inherit->num_ref_copies + + 2 * inherit->num_excl_copies; + for (i = 0; i < nums; ++i) { + srcgroup = find_qgroup_rb(fs_info, *i_qgroups); + if (!srcgroup) { + ret = -EINVAL; + goto out; + } + ++i_qgroups; + } + } /* * create a tracking group for the subvol itself @@ -1470,6 +1703,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, unlock: spin_unlock(&fs_info->qgroup_lock); out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } @@ -1514,7 +1748,10 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) ret = -ENOMEM; goto out; } - ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); + ret = ulist_add(ulist, qgroup->qgroupid, + (uintptr_t)qgroup, GFP_ATOMIC); + if (ret < 0) + goto out; ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(ulist, &uiter))) { struct btrfs_qgroup *qg; @@ -1523,25 +1760,27 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux; if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && - qg->reserved + qg->rfer + num_bytes > + qg->reserved + (s64)qg->rfer + num_bytes > qg->max_rfer) { ret = -EDQUOT; goto out; } if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) && - qg->reserved + qg->excl + num_bytes > + qg->reserved + (s64)qg->excl + num_bytes > qg->max_excl) { ret = -EDQUOT; goto out; } list_for_each_entry(glist, &qg->groups, next_group) { - ulist_add(ulist, glist->group->qgroupid, - (uintptr_t)glist->group, GFP_ATOMIC); + ret = ulist_add(ulist, glist->group->qgroupid, + (uintptr_t)glist->group, GFP_ATOMIC); + if (ret < 0) + goto out; } } - + ret = 0; /* * no limits exceeded, now record the reservation into all qgroups */ @@ -1570,6 +1809,7 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) struct ulist_node *unode; struct ulist_iterator uiter; u64 ref_root = root->root_key.objectid; + int ret = 0; if (!is_fstree(ref_root)) return; @@ -1592,7 +1832,10 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) btrfs_std_error(fs_info, -ENOMEM); goto out; } - ulist_add(ulist, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC); + ret = ulist_add(ulist, qgroup->qgroupid, + (uintptr_t)qgroup, GFP_ATOMIC); + if (ret < 0) + goto out; ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(ulist, &uiter))) { struct btrfs_qgroup *qg; @@ -1603,8 +1846,10 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) qg->reserved -= num_bytes; list_for_each_entry(glist, &qg->groups, next_group) { - ulist_add(ulist, glist->group->qgroupid, - (uintptr_t)glist->group, GFP_ATOMIC); + ret = ulist_add(ulist, glist->group->qgroupid, + (uintptr_t)glist->group, GFP_ATOMIC); + if (ret < 0) + goto out; } } @@ -1617,8 +1862,265 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) { if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) return; - printk(KERN_ERR "btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %llu\n", + pr_err("btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %#x.%x\n", trans, list_empty(&trans->qgroup_ref_list) ? "" : " not", - trans->delayed_ref_elem.seq); + (u32)(trans->delayed_ref_elem.seq >> 32), + (u32)trans->delayed_ref_elem.seq); BUG(); } + +/* + * returns < 0 on error, 0 when more leafs are to be scanned. + * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared. + */ +static int +qgroup_rescan_leaf(struct qgroup_rescan *qscan, struct btrfs_path *path, + struct btrfs_trans_handle *trans, struct ulist *tmp, + struct extent_buffer *scratch_leaf) +{ + struct btrfs_key found; + struct btrfs_fs_info *fs_info = qscan->fs_info; + struct ulist *roots = NULL; + struct ulist_node *unode; + struct ulist_iterator uiter; + struct seq_list tree_mod_seq_elem = {}; + u64 seq; + int slot; + int ret; + + path->leave_spinning = 1; + mutex_lock(&fs_info->qgroup_rescan_lock); + ret = btrfs_search_slot_for_read(fs_info->extent_root, + &fs_info->qgroup_rescan_progress, + path, 1, 0); + + pr_debug("current progress key (%llu %u %llu), search_slot ret %d\n", + (unsigned long long)fs_info->qgroup_rescan_progress.objectid, + fs_info->qgroup_rescan_progress.type, + (unsigned long long)fs_info->qgroup_rescan_progress.offset, + ret); + + if (ret) { + /* + * The rescan is about to end, we will not be scanning any + * further blocks. We cannot unset the RESCAN flag here, because + * we want to commit the transaction if everything went well. + * To make the live accounting work in this phase, we set our + * scan progress pointer such that every real extent objectid + * will be smaller. + */ + fs_info->qgroup_rescan_progress.objectid = (u64)-1; + btrfs_release_path(path); + mutex_unlock(&fs_info->qgroup_rescan_lock); + return ret; + } + + btrfs_item_key_to_cpu(path->nodes[0], &found, + btrfs_header_nritems(path->nodes[0]) - 1); + fs_info->qgroup_rescan_progress.objectid = found.objectid + 1; + + btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); + memcpy(scratch_leaf, path->nodes[0], sizeof(*scratch_leaf)); + slot = path->slots[0]; + btrfs_release_path(path); + mutex_unlock(&fs_info->qgroup_rescan_lock); + + for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { + btrfs_item_key_to_cpu(scratch_leaf, &found, slot); + if (found.type != BTRFS_EXTENT_ITEM_KEY) + continue; + ret = btrfs_find_all_roots(trans, fs_info, found.objectid, + tree_mod_seq_elem.seq, &roots); + if (ret < 0) + goto out; + spin_lock(&fs_info->qgroup_lock); + seq = fs_info->qgroup_seq; + fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ + + ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq); + if (ret) { + spin_unlock(&fs_info->qgroup_lock); + ulist_free(roots); + goto out; + } + + /* + * step2 of btrfs_qgroup_account_ref works from a single root, + * we're doing all at once here. + */ + ulist_reinit(tmp); + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(roots, &uiter))) { + struct btrfs_qgroup *qg; + + qg = find_qgroup_rb(fs_info, unode->val); + if (!qg) + continue; + + ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, + GFP_ATOMIC); + if (ret < 0) { + spin_unlock(&fs_info->qgroup_lock); + ulist_free(roots); + goto out; + } + } + + /* this loop is similar to step 2 of btrfs_qgroup_account_ref */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(tmp, &uiter))) { + struct btrfs_qgroup *qg; + struct btrfs_qgroup_list *glist; + + qg = (struct btrfs_qgroup *)(uintptr_t) unode->aux; + qg->rfer += found.offset; + qg->rfer_cmpr += found.offset; + WARN_ON(qg->tag >= seq); + if (qg->refcnt - seq == roots->nnodes) { + qg->excl += found.offset; + qg->excl_cmpr += found.offset; + } + qgroup_dirty(fs_info, qg); + + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + (uintptr_t)glist->group, + GFP_ATOMIC); + if (ret < 0) { + spin_unlock(&fs_info->qgroup_lock); + ulist_free(roots); + goto out; + } + } + } + + spin_unlock(&fs_info->qgroup_lock); + ulist_free(roots); + ret = 0; + } + +out: + btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); + + return ret; +} + +static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) +{ + struct qgroup_rescan *qscan = container_of(work, struct qgroup_rescan, + work); + struct btrfs_path *path; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_fs_info *fs_info = qscan->fs_info; + struct ulist *tmp = NULL; + struct extent_buffer *scratch_leaf = NULL; + int err = -ENOMEM; + + path = btrfs_alloc_path(); + if (!path) + goto out; + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + goto out; + scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS); + if (!scratch_leaf) + goto out; + + err = 0; + while (!err) { + trans = btrfs_start_transaction(fs_info->fs_root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + break; + } + if (!fs_info->quota_enabled) { + err = -EINTR; + } else { + err = qgroup_rescan_leaf(qscan, path, trans, + tmp, scratch_leaf); + } + if (err > 0) + btrfs_commit_transaction(trans, fs_info->fs_root); + else + btrfs_end_transaction(trans, fs_info->fs_root); + } + +out: + kfree(scratch_leaf); + ulist_free(tmp); + btrfs_free_path(path); + kfree(qscan); + + mutex_lock(&fs_info->qgroup_rescan_lock); + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + + if (err == 2 && + fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + } else if (err < 0) { + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + } + mutex_unlock(&fs_info->qgroup_rescan_lock); + + if (err >= 0) { + pr_info("btrfs: qgroup scan completed%s\n", + err == 2 ? " (inconsistency flag cleared)" : ""); + } else { + pr_err("btrfs: qgroup scan failed with %d\n", err); + } +} + +static void +qgroup_rescan_start(struct btrfs_fs_info *fs_info, struct qgroup_rescan *qscan) +{ + memset(&qscan->work, 0, sizeof(qscan->work)); + qscan->work.func = btrfs_qgroup_rescan_worker; + qscan->fs_info = fs_info; + + pr_info("btrfs: qgroup scan started\n"); + btrfs_queue_worker(&fs_info->qgroup_rescan_workers, &qscan->work); +} + +int +btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) +{ + int ret = 0; + struct rb_node *n; + struct btrfs_qgroup *qgroup; + struct qgroup_rescan *qscan = kmalloc(sizeof(*qscan), GFP_NOFS); + + if (!qscan) + return -ENOMEM; + + mutex_lock(&fs_info->qgroup_rescan_lock); + spin_lock(&fs_info->qgroup_lock); + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) + ret = -EINPROGRESS; + else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) + ret = -EINVAL; + if (ret) { + spin_unlock(&fs_info->qgroup_lock); + mutex_unlock(&fs_info->qgroup_rescan_lock); + kfree(qscan); + return ret; + } + + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN; + memset(&fs_info->qgroup_rescan_progress, 0, + sizeof(fs_info->qgroup_rescan_progress)); + + /* clear all current qgroup tracking information */ + for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) { + qgroup = rb_entry(n, struct btrfs_qgroup, node); + qgroup->rfer = 0; + qgroup->rfer_cmpr = 0; + qgroup->excl = 0; + qgroup->excl_cmpr = 0; + } + spin_unlock(&fs_info->qgroup_lock); + mutex_unlock(&fs_info->qgroup_rescan_lock); + + qgroup_rescan_start(fs_info, qscan); + + return 0; +} diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 9a79fb790adb..0740621daf6c 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -410,7 +410,7 @@ static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) /* * remove everything in the cache */ -void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) +static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) { struct btrfs_stripe_hash_table *table; unsigned long flags; @@ -1010,12 +1010,12 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) * this will try to merge into existing bios if possible, and returns * zero if all went well. */ -int rbio_add_io_page(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list, - struct page *page, - int stripe_nr, - unsigned long page_index, - unsigned long bio_max_len) +static int rbio_add_io_page(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list, + struct page *page, + int stripe_nr, + unsigned long page_index, + unsigned long bio_max_len) { struct bio *last = bio_list->tail; u64 last_end = 0; diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 96b93daa0bbb..1031b69252c5 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -955,10 +955,11 @@ int btrfs_reada_wait(void *handle) while (atomic_read(&rc->elems)) { wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, 5 * HZ); - dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0); + dump_devs(rc->root->fs_info, + atomic_read(&rc->elems) < 10 ? 1 : 0); } - dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0); + dump_devs(rc->root->fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0); kref_put(&rc->refcnt, reada_control_release); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b67171e6d688..704a1b8d2a2b 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -326,8 +326,7 @@ static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) return NULL; } -void backref_tree_panic(struct rb_node *rb_node, int errno, - u64 bytenr) +static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr) { struct btrfs_fs_info *fs_info = NULL; @@ -619,10 +618,13 @@ static noinline_for_stack int find_inline_backref(struct extent_buffer *leaf, int slot, unsigned long *ptr, unsigned long *end) { + struct btrfs_key key; struct btrfs_extent_item *ei; struct btrfs_tree_block_info *bi; u32 item_size; + btrfs_item_key_to_cpu(leaf, &key, slot); + item_size = btrfs_item_size_nr(leaf, slot); #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 if (item_size < sizeof(*ei)) { @@ -634,13 +636,18 @@ int find_inline_backref(struct extent_buffer *leaf, int slot, WARN_ON(!(btrfs_extent_flags(leaf, ei) & BTRFS_EXTENT_FLAG_TREE_BLOCK)); - if (item_size <= sizeof(*ei) + sizeof(*bi)) { + if (key.type == BTRFS_EXTENT_ITEM_KEY && + item_size <= sizeof(*ei) + sizeof(*bi)) { WARN_ON(item_size < sizeof(*ei) + sizeof(*bi)); return 1; } - bi = (struct btrfs_tree_block_info *)(ei + 1); - *ptr = (unsigned long)(bi + 1); + if (key.type == BTRFS_EXTENT_ITEM_KEY) { + bi = (struct btrfs_tree_block_info *)(ei + 1); + *ptr = (unsigned long)(bi + 1); + } else { + *ptr = (unsigned long)(ei + 1); + } *end = (unsigned long)ei + item_size; return 0; } @@ -708,7 +715,7 @@ again: end = 0; ptr = 0; key.objectid = cur->bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; + key.type = BTRFS_METADATA_ITEM_KEY; key.offset = (u64)-1; path1->search_commit_root = 1; @@ -766,7 +773,8 @@ again: break; } - if (key.type == BTRFS_EXTENT_ITEM_KEY) { + if (key.type == BTRFS_EXTENT_ITEM_KEY || + key.type == BTRFS_METADATA_ITEM_KEY) { ret = find_inline_backref(eb, path1->slots[0], &ptr, &end); if (ret) @@ -1762,7 +1770,11 @@ again: eb = read_tree_block(dest, old_bytenr, blocksize, old_ptr_gen); - BUG_ON(!eb); + if (!eb || !extent_buffer_uptodate(eb)) { + ret = (!eb) ? -ENOMEM : -EIO; + free_extent_buffer(eb); + return ret; + } btrfs_tree_lock(eb); if (cow) { ret = btrfs_cow_block(trans, dest, eb, parent, @@ -1915,6 +1927,10 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, bytenr = btrfs_node_blockptr(eb, path->slots[i]); blocksize = btrfs_level_size(root, i - 1); eb = read_tree_block(root, bytenr, blocksize, ptr_gen); + if (!eb || !extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + return -EIO; + } BUG_ON(btrfs_header_level(eb) != i - 1); path->nodes[i - 1] = eb; path->slots[i - 1] = 0; @@ -2592,7 +2608,8 @@ static int do_relocation(struct btrfs_trans_handle *trans, blocksize = btrfs_level_size(root, node->level); generation = btrfs_node_ptr_generation(upper->eb, slot); eb = read_tree_block(root, bytenr, blocksize, generation); - if (!eb) { + if (!eb || !extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); err = -EIO; goto next; } @@ -2753,7 +2770,10 @@ static int get_tree_block_key(struct reloc_control *rc, BUG_ON(block->key_ready); eb = read_tree_block(rc->extent_root, block->bytenr, block->key.objectid, block->key.offset); - BUG_ON(!eb); + if (!eb || !extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + return -EIO; + } WARN_ON(btrfs_header_level(eb) != block->level); if (block->level == 0) btrfs_item_key_to_cpu(eb, &block->key, 0); @@ -2768,8 +2788,13 @@ static int reada_tree_block(struct reloc_control *rc, struct tree_block *block) { BUG_ON(block->key_ready); - readahead_tree_block(rc->extent_root, block->bytenr, - block->key.objectid, block->key.offset); + if (block->key.type == BTRFS_METADATA_ITEM_KEY) + readahead_tree_block(rc->extent_root, block->bytenr, + block->key.objectid, + rc->extent_root->leafsize); + else + readahead_tree_block(rc->extent_root, block->bytenr, + block->key.objectid, block->key.offset); return 0; } @@ -2850,7 +2875,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; - goto out_path; + goto out_free_blocks; } rb_node = rb_first(blocks); @@ -2864,8 +2889,11 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, rb_node = rb_first(blocks); while (rb_node) { block = rb_entry(rb_node, struct tree_block, rb_node); - if (!block->key_ready) - get_tree_block_key(rc, block); + if (!block->key_ready) { + err = get_tree_block_key(rc, block); + if (err) + goto out_free_path; + } rb_node = rb_next(rb_node); } @@ -2892,8 +2920,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, out: err = finish_pending_nodes(trans, rc, path, err); +out_free_path: btrfs_free_path(path); -out_path: +out_free_blocks: free_block_list(blocks); return err; } @@ -2965,7 +2994,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end, lock_extent(&BTRFS_I(inode)->io_tree, start, end); while (1) { write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); + ret = add_extent_mapping(em_tree, em, 0); write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); @@ -3176,12 +3205,17 @@ static int add_tree_block(struct reloc_control *rc, eb = path->nodes[0]; item_size = btrfs_item_size_nr(eb, path->slots[0]); - if (item_size >= sizeof(*ei) + sizeof(*bi)) { + if (extent_key->type == BTRFS_METADATA_ITEM_KEY || + item_size >= sizeof(*ei) + sizeof(*bi)) { ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); - bi = (struct btrfs_tree_block_info *)(ei + 1); + if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) { + bi = (struct btrfs_tree_block_info *)(ei + 1); + level = btrfs_tree_block_level(eb, bi); + } else { + level = (int)extent_key->offset; + } generation = btrfs_extent_generation(eb, ei); - level = btrfs_tree_block_level(eb, bi); } else { #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 u64 ref_owner; @@ -3210,7 +3244,7 @@ static int add_tree_block(struct reloc_control *rc, return -ENOMEM; block->bytenr = extent_key->objectid; - block->key.objectid = extent_key->offset; + block->key.objectid = rc->extent_root->leafsize; block->key.offset = generation; block->level = level; block->key_ready = 0; @@ -3252,9 +3286,15 @@ static int __add_tree_block(struct reloc_control *rc, ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); if (ret < 0) goto out; - BUG_ON(ret); btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (ret > 0) { + if (key.objectid == bytenr && + key.type == BTRFS_METADATA_ITEM_KEY) + ret = 0; + } + BUG_ON(ret); + ret = add_tree_block(rc, &key, path, blocks); out: btrfs_free_path(path); @@ -3275,7 +3315,8 @@ static int block_use_full_backref(struct reloc_control *rc, return 1; ret = btrfs_lookup_extent_info(NULL, rc->extent_root, - eb->start, eb->len, NULL, &flags); + eb->start, btrfs_header_level(eb), 1, + NULL, &flags); BUG_ON(ret); if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) @@ -3644,12 +3685,25 @@ next: break; } - if (key.type != BTRFS_EXTENT_ITEM_KEY || + if (key.type != BTRFS_EXTENT_ITEM_KEY && + key.type != BTRFS_METADATA_ITEM_KEY) { + path->slots[0]++; + goto next; + } + + if (key.type == BTRFS_EXTENT_ITEM_KEY && key.objectid + key.offset <= rc->search_start) { path->slots[0]++; goto next; } + if (key.type == BTRFS_METADATA_ITEM_KEY && + key.objectid + rc->extent_root->leafsize <= + rc->search_start) { + path->slots[0]++; + goto next; + } + ret = find_first_extent_bit(&rc->processed_blocks, key.objectid, &start, &end, EXTENT_DIRTY, NULL); @@ -3658,7 +3712,11 @@ next: btrfs_release_path(path); rc->search_start = end + 1; } else { - rc->search_start = key.objectid + key.offset; + if (key.type == BTRFS_EXTENT_ITEM_KEY) + rc->search_start = key.objectid + key.offset; + else + rc->search_start = key.objectid + + rc->extent_root->leafsize; memcpy(extent_key, &key, sizeof(key)); return 0; } @@ -4105,10 +4163,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) while (1) { mutex_lock(&fs_info->cleaner_mutex); - - btrfs_clean_old_snapshots(fs_info->tree_root); ret = relocate_block_group(rc); - mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { err = ret; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 668af537a3ea..5bf1ed57f178 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -29,9 +29,8 @@ * generation numbers as then we know the root was once mounted with an older * kernel that was not aware of the root item structure change. */ -void btrfs_read_root_item(struct btrfs_root *root, - struct extent_buffer *eb, int slot, - struct btrfs_root_item *item) +void btrfs_read_root_item(struct extent_buffer *eb, int slot, + struct btrfs_root_item *item) { uuid_le uuid; int len; @@ -104,7 +103,7 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, goto out; } if (item) - btrfs_read_root_item(root, l, slot, item); + btrfs_read_root_item(l, slot, item); if (key) memcpy(key, &found_key, sizeof(found_key)); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 85e072b956d5..f489e24659a4 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1336,7 +1336,6 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, int page_num; u8 calculated_csum[BTRFS_CSUM_SIZE]; u32 crc = ~(u32)0; - struct btrfs_root *root = fs_info->extent_root; void *mapped_buffer; WARN_ON(!sblock->pagev[0]->page); @@ -1365,12 +1364,11 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, for (page_num = 0;;) { if (page_num == 0 && is_metadata) - crc = btrfs_csum_data(root, + crc = btrfs_csum_data( ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE, crc, PAGE_SIZE - BTRFS_CSUM_SIZE); else - crc = btrfs_csum_data(root, mapped_buffer, crc, - PAGE_SIZE); + crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE); kunmap_atomic(mapped_buffer); page_num++; @@ -1657,7 +1655,6 @@ static int scrub_checksum_data(struct scrub_block *sblock) void *buffer; u32 crc = ~(u32)0; int fail = 0; - struct btrfs_root *root = sctx->dev_root; u64 len; int index; @@ -1674,7 +1671,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) for (;;) { u64 l = min_t(u64, len, PAGE_SIZE); - crc = btrfs_csum_data(root, buffer, crc, l); + crc = btrfs_csum_data(buffer, crc, l); kunmap_atomic(buffer); len -= l; if (len == 0) @@ -1744,7 +1741,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) for (;;) { u64 l = min_t(u64, len, mapped_size); - crc = btrfs_csum_data(root, p, crc, l); + crc = btrfs_csum_data(p, crc, l); kunmap_atomic(mapped_buffer); len -= l; if (len == 0) @@ -1805,7 +1802,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) for (;;) { u64 l = min_t(u64, len, mapped_size); - crc = btrfs_csum_data(root, p, crc, l); + crc = btrfs_csum_data(p, crc, l); kunmap_atomic(mapped_buffer); len -= l; if (len == 0) @@ -2236,12 +2233,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, u64 flags; int ret; int slot; - int i; u64 nstripes; struct extent_buffer *l; struct btrfs_key key; u64 physical; u64 logical; + u64 logic_end; u64 generation; int mirror_num; struct reada_control *reada1; @@ -2255,6 +2252,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, u64 extent_len; struct btrfs_device *extent_dev; int extent_mirror_num; + int stop_loop; if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { @@ -2315,8 +2313,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, key_start.type = BTRFS_EXTENT_ITEM_KEY; key_start.offset = (u64)0; key_end.objectid = base + offset + nstripes * increment; - key_end.type = BTRFS_EXTENT_ITEM_KEY; - key_end.offset = (u64)0; + key_end.type = BTRFS_METADATA_ITEM_KEY; + key_end.offset = (u64)-1; reada1 = btrfs_reada_add(root, &key_start, &key_end); key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID; @@ -2354,8 +2352,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, */ logical = base + offset; physical = map->stripes[num].physical; + logic_end = logical + increment * nstripes; ret = 0; - for (i = 0; i < nstripes; ++i) { + while (logical < logic_end) { /* * canceled? */ @@ -2391,19 +2390,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, wake_up(&fs_info->scrub_pause_wait); } - ret = btrfs_lookup_csums_range(csum_root, logical, - logical + map->stripe_len - 1, - &sctx->csum_list, 1); - if (ret) - goto out; - key.objectid = logical; key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = (u64)0; + key.offset = (u64)-1; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; + if (ret > 0) { ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY); @@ -2420,7 +2414,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, } } + stop_loop = 0; while (1) { + u64 bytes; + l = path->nodes[0]; slot = path->slots[0]; if (slot >= btrfs_header_nritems(l)) { @@ -2430,19 +2427,30 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, if (ret < 0) goto out; + stop_loop = 1; break; } btrfs_item_key_to_cpu(l, &key, slot); - if (key.objectid + key.offset <= logical) - goto next; + if (key.type == BTRFS_METADATA_ITEM_KEY) + bytes = root->leafsize; + else + bytes = key.offset; - if (key.objectid >= logical + map->stripe_len) - break; + if (key.objectid + bytes <= logical) + goto next; - if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY) + if (key.type != BTRFS_EXTENT_ITEM_KEY && + key.type != BTRFS_METADATA_ITEM_KEY) goto next; + if (key.objectid >= logical + map->stripe_len) { + /* out of this device extent */ + if (key.objectid >= logic_end) + stop_loop = 1; + break; + } + extent = btrfs_item_ptr(l, slot, struct btrfs_extent_item); flags = btrfs_extent_flags(l, extent); @@ -2458,22 +2466,24 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, goto next; } +again: + extent_logical = key.objectid; + extent_len = bytes; + /* * trim extent to this stripe */ - if (key.objectid < logical) { - key.offset -= logical - key.objectid; - key.objectid = logical; + if (extent_logical < logical) { + extent_len -= logical - extent_logical; + extent_logical = logical; } - if (key.objectid + key.offset > + if (extent_logical + extent_len > logical + map->stripe_len) { - key.offset = logical + map->stripe_len - - key.objectid; + extent_len = logical + map->stripe_len - + extent_logical; } - extent_logical = key.objectid; - extent_physical = key.objectid - logical + physical; - extent_len = key.offset; + extent_physical = extent_logical - logical + physical; extent_dev = scrub_dev; extent_mirror_num = mirror_num; if (is_dev_replace) @@ -2481,13 +2491,35 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, extent_len, &extent_physical, &extent_dev, &extent_mirror_num); + + ret = btrfs_lookup_csums_range(csum_root, logical, + logical + map->stripe_len - 1, + &sctx->csum_list, 1); + if (ret) + goto out; + ret = scrub_extent(sctx, extent_logical, extent_len, extent_physical, extent_dev, flags, generation, extent_mirror_num, - key.objectid - logical + physical); + extent_physical); if (ret) goto out; + if (extent_logical + extent_len < + key.objectid + bytes) { + logical += increment; + physical += map->stripe_len; + + if (logical < key.objectid + bytes) { + cond_resched(); + goto again; + } + + if (logical >= logic_end) { + stop_loop = 1; + break; + } + } next: path->slots[0]++; } @@ -2495,8 +2527,14 @@ next: logical += increment; physical += map->stripe_len; spin_lock(&sctx->stat_lock); - sctx->stat.last_physical = physical; + if (stop_loop) + sctx->stat.last_physical = map->stripes[num].physical + + length; + else + sctx->stat.last_physical = physical; spin_unlock(&sctx->stat_lock); + if (stop_loop) + break; } out: /* push queued extents */ @@ -3005,28 +3043,6 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info, return 0; } -int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_device *dev; - int ret; - - /* - * we have to hold the device_list_mutex here so the device - * does not go away in cancel_dev. FIXME: find a better solution - */ - mutex_lock(&fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info, devid, NULL, NULL); - if (!dev) { - mutex_unlock(&fs_info->fs_devices->device_list_mutex); - return -ENODEV; - } - ret = btrfs_scrub_cancel_dev(fs_info, dev); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); - - return ret; -} - int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, struct btrfs_scrub_progress *progress) { diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index c85e7c6b4598..ff40f1c00ce3 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -387,7 +387,7 @@ static struct btrfs_path *alloc_path_for_send(void) return path; } -int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off) +static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off) { int ret; mm_segment_t old_fs; @@ -3479,7 +3479,6 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key, struct send_ctx *sctx = ctx; char *found_data = NULL; int found_data_len = 0; - struct fs_path *p = NULL; ret = find_xattr(sctx, sctx->parent_root, sctx->right_path, sctx->cmp_key, name, name_len, &found_data, @@ -3498,7 +3497,6 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key, } kfree(found_data); - fs_path_free(sctx, p); return ret; } @@ -4529,9 +4527,11 @@ static int send_subvol(struct send_ctx *sctx) { int ret; - ret = send_header(sctx); - if (ret < 0) - goto out; + if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_STREAM_HEADER)) { + ret = send_header(sctx); + if (ret < 0) + goto out; + } ret = send_subvol_begin(sctx); if (ret < 0) @@ -4593,7 +4593,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) goto out; } - if (arg->flags & ~BTRFS_SEND_FLAG_NO_FILE_DATA) { + if (arg->flags & ~BTRFS_SEND_FLAG_MASK) { ret = -EINVAL; goto out; } @@ -4612,8 +4612,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) sctx->flags = arg->flags; sctx->send_filp = fget(arg->send_fd); - if (IS_ERR(sctx->send_filp)) { - ret = PTR_ERR(sctx->send_filp); + if (!sctx->send_filp) { + ret = -EBADF; goto out; } @@ -4704,12 +4704,14 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) if (ret < 0) goto out; - ret = begin_cmd(sctx, BTRFS_SEND_C_END); - if (ret < 0) - goto out; - ret = send_cmd(sctx); - if (ret < 0) - goto out; + if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) { + ret = begin_cmd(sctx, BTRFS_SEND_C_END); + if (ret < 0) + goto out; + ret = send_cmd(sctx); + if (ret < 0) + goto out; + } out: kfree(arg); diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 8bb18f7ccaa6..48d425aef05b 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -131,5 +131,4 @@ enum { #ifdef __KERNEL__ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg); -int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off); #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f6b88595f858..a4807ced23cc 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -56,6 +56,7 @@ #include "compression.h" #include "rcu-string.h" #include "dev-replace.h" +#include "free-space-cache.h" #define CREATE_TRACE_POINTS #include <trace/events/btrfs.h> @@ -63,9 +64,9 @@ static const struct super_operations btrfs_super_ops; static struct file_system_type btrfs_fs_type; -static const char *btrfs_decode_error(int errno, char nbuf[16]) +static const char *btrfs_decode_error(int errno) { - char *errstr = NULL; + char *errstr = "unknown"; switch (errno) { case -EIO: @@ -80,18 +81,18 @@ static const char *btrfs_decode_error(int errno, char nbuf[16]) case -EEXIST: errstr = "Object already exists"; break; - default: - if (nbuf) { - if (snprintf(nbuf, 16, "error %d", -errno) >= 0) - errstr = nbuf; - } + case -ENOSPC: + errstr = "No space left"; + break; + case -ENOENT: + errstr = "No such entry"; break; } return errstr; } -static void __save_error_info(struct btrfs_fs_info *fs_info) +static void save_error_info(struct btrfs_fs_info *fs_info) { /* * today we only save the error info into ram. Long term we'll @@ -100,11 +101,6 @@ static void __save_error_info(struct btrfs_fs_info *fs_info) set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); } -static void save_error_info(struct btrfs_fs_info *fs_info) -{ - __save_error_info(fs_info); -} - /* btrfs handle error by forcing the filesystem readonly */ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) { @@ -115,7 +111,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { sb->s_flags |= MS_RDONLY; - printk(KERN_INFO "btrfs is forced readonly\n"); + btrfs_info(fs_info, "forced readonly"); /* * Note that a running device replace operation is not * canceled here although there is no way to update @@ -126,7 +122,6 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) * mounted writeable again, the device replace * operation continues. */ -// WARN_ON(1); } } @@ -139,7 +134,6 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...) { struct super_block *sb = fs_info->sb; - char nbuf[16]; const char *errstr; /* @@ -149,7 +143,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) return; - errstr = btrfs_decode_error(errno, nbuf); + errstr = btrfs_decode_error(errno); if (fmt) { struct va_format vaf; va_list args; @@ -158,19 +152,18 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, vaf.fmt = fmt; vaf.va = &args; - printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n", - sb->s_id, function, line, errstr, &vaf); + printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: errno=%d %s (%pV)\n", + sb->s_id, function, line, errno, errstr, &vaf); va_end(args); } else { - printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n", - sb->s_id, function, line, errstr); + printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: errno=%d %s\n", + sb->s_id, function, line, errno, errstr); } /* Don't go through full error handling during mount */ - if (sb->s_flags & MS_BORN) { - save_error_info(fs_info); + save_error_info(fs_info); + if (sb->s_flags & MS_BORN) btrfs_handle_error(fs_info); - } } static const char * const logtypes[] = { @@ -184,7 +177,7 @@ static const char * const logtypes[] = { "debug", }; -void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...) +void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { struct super_block *sb = fs_info->sb; char lvl[4]; @@ -208,7 +201,7 @@ void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...) vaf.fmt = fmt; vaf.va = &args; - printk("%sBTRFS %s (device %s): %pV", lvl, type, sb->s_id, &vaf); + printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf); va_end(args); } @@ -252,18 +245,24 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *function, unsigned int line, int errno) { - WARN_ONCE(1, KERN_DEBUG "btrfs: Transaction aborted\n"); + /* + * Report first abort since mount + */ + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, + &root->fs_info->fs_state)) { + WARN(1, KERN_DEBUG "btrfs: Transaction aborted (error %d)\n", + errno); + } trans->aborted = errno; /* Nothing used. The other threads that have joined this * transaction may be able to continue. */ if (!trans->blocks_used) { - char nbuf[16]; const char *errstr; - errstr = btrfs_decode_error(errno, nbuf); - btrfs_printk(root->fs_info, - "%s:%d: Aborting unused transaction(%s).\n", - function, line, errstr); + errstr = btrfs_decode_error(errno); + btrfs_warn(root->fs_info, + "%s:%d: Aborting unused transaction(%s).", + function, line, errstr); return; } ACCESS_ONCE(trans->transaction->aborted) = errno; @@ -276,7 +275,6 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...) { - char nbuf[16]; char *s_id = "<unknown>"; const char *errstr; struct va_format vaf = { .fmt = fmt }; @@ -288,13 +286,13 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, va_start(args, fmt); vaf.va = &args; - errstr = btrfs_decode_error(errno, nbuf); + errstr = btrfs_decode_error(errno); if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)) - panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n", - s_id, function, line, &vaf, errstr); + panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", + s_id, function, line, &vaf, errno, errstr); - printk(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n", - s_id, function, line, &vaf, errstr); + printk(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", + s_id, function, line, &vaf, errno, errstr); va_end(args); /* Caller calls BUG() */ } @@ -650,7 +648,7 @@ out: */ static int btrfs_parse_early_options(const char *options, fmode_t flags, void *holder, char **subvol_name, u64 *subvol_objectid, - u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices) + struct btrfs_fs_devices **fs_devices) { substring_t args[MAX_OPT_ARGS]; char *device_name, *opts, *orig, *p; @@ -693,16 +691,8 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, } break; case Opt_subvolrootid: - intarg = 0; - error = match_int(&args[0], &intarg); - if (!error) { - /* we want the original fs_tree */ - if (!intarg) - *subvol_rootid = - BTRFS_FS_TREE_OBJECTID; - else - *subvol_rootid = intarg; - } + printk(KERN_WARNING + "btrfs: 'subvolrootid' mount option is deprecated and has no effect\n"); break; case Opt_device: device_name = match_strdup(&args[0]); @@ -876,7 +866,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return 0; } - btrfs_wait_ordered_extents(root, 0); + btrfs_wait_ordered_extents(root, 1); trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { @@ -1080,7 +1070,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, fmode_t mode = FMODE_READ; char *subvol_name = NULL; u64 subvol_objectid = 0; - u64 subvol_rootid = 0; int error = 0; if (!(flags & MS_RDONLY)) @@ -1088,7 +1077,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error = btrfs_parse_early_options(data, mode, fs_type, &subvol_name, &subvol_objectid, - &subvol_rootid, &fs_devices); + &fs_devices); if (error) { kfree(subvol_name); return ERR_PTR(error); @@ -1202,11 +1191,14 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, new_pool_size); } -static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info, - unsigned long old_opts, int flags) +static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info) { set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); +} +static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info, + unsigned long old_opts, int flags) +{ if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || (flags & MS_RDONLY))) { @@ -1247,7 +1239,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) unsigned int old_metadata_ratio = fs_info->metadata_ratio; int ret; - btrfs_remount_prepare(fs_info, old_opts, *flags); + btrfs_remount_prepare(fs_info); ret = btrfs_parse_options(root, data); if (ret) { @@ -1255,6 +1247,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; } + btrfs_remount_begin(fs_info, old_opts, *flags); btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size, old_thread_pool_size); @@ -1739,6 +1732,10 @@ static int __init init_btrfs_fs(void) btrfs_init_lockdep(); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + btrfs_test_free_space_cache(); +#endif + printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION); return 0; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 50767bbaad6c..0544587d74f4 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -34,7 +34,7 @@ #define BTRFS_ROOT_TRANS_TAG 0 -void put_transaction(struct btrfs_transaction *transaction) +static void put_transaction(struct btrfs_transaction *transaction) { WARN_ON(atomic_read(&transaction->use_count) == 0); if (atomic_dec_and_test(&transaction->use_count)) { @@ -162,7 +162,7 @@ loop: if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when " "creating a fresh transaction\n"); - atomic_set(&fs_info->tree_mod_seq, 0); + atomic64_set(&fs_info->tree_mod_seq, 0); spin_lock_init(&cur_trans->commit_lock); spin_lock_init(&cur_trans->delayed_refs.lock); @@ -707,23 +707,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, int btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - int ret; - - ret = __btrfs_end_transaction(trans, root, 0); - if (ret) - return ret; - return 0; + return __btrfs_end_transaction(trans, root, 0); } int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - int ret; - - ret = __btrfs_end_transaction(trans, root, 1); - if (ret) - return ret; - return 0; + return __btrfs_end_transaction(trans, root, 1); } int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans, @@ -948,7 +938,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, int btrfs_add_dead_root(struct btrfs_root *root) { spin_lock(&root->fs_info->trans_lock); - list_add(&root->root_list, &root->fs_info->dead_roots); + list_add_tail(&root->root_list, &root->fs_info->dead_roots); spin_unlock(&root->fs_info->trans_lock); return 0; } @@ -1179,13 +1169,17 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE); memcpy(new_root_item->parent_uuid, root->root_item.uuid, BTRFS_UUID_SIZE); + if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) { + memset(new_root_item->received_uuid, 0, + sizeof(new_root_item->received_uuid)); + memset(&new_root_item->stime, 0, sizeof(new_root_item->stime)); + memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime)); + btrfs_set_root_stransid(new_root_item, 0); + btrfs_set_root_rtransid(new_root_item, 0); + } new_root_item->otime.sec = cpu_to_le64(cur_time.tv_sec); new_root_item->otime.nsec = cpu_to_le32(cur_time.tv_nsec); btrfs_set_root_otransid(new_root_item, trans->transid); - memset(&new_root_item->stime, 0, sizeof(new_root_item->stime)); - memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime)); - btrfs_set_root_stransid(new_root_item, 0); - btrfs_set_root_rtransid(new_root_item, 0); old = btrfs_lock_root_node(root); ret = btrfs_cow_block(trans, root, old, NULL, 0, &old); @@ -1487,6 +1481,10 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, current->journal_info = NULL; kmem_cache_free(btrfs_trans_handle_cachep, trans); + + spin_lock(&root->fs_info->trans_lock); + root->fs_info->trans_no_join = 0; + spin_unlock(&root->fs_info->trans_lock); } static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, @@ -1808,7 +1806,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = btrfs_write_and_wait_transaction(trans, root); if (ret) { btrfs_error(root->fs_info, ret, - "Error while writing out transaction."); + "Error while writing out transaction"); mutex_unlock(&root->fs_info->tree_log_mutex); goto cleanup_transaction; } @@ -1864,8 +1862,7 @@ cleanup_transaction: btrfs_qgroup_free(root, trans->qgroup_reserved); trans->qgroup_reserved = 0; } - btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n"); -// WARN_ON(1); + btrfs_warn(root->fs_info, "Skipping commit of aborted transaction."); if (current->journal_info == trans) current->journal_info = NULL; cleanup_transaction(trans, root, ret); @@ -1874,31 +1871,49 @@ cleanup_transaction: } /* - * interface function to delete all the snapshots we have scheduled for deletion + * return < 0 if error + * 0 if there are no more dead_roots at the time of call + * 1 there are more to be processed, call me again + * + * The return value indicates there are certainly more snapshots to delete, but + * if there comes a new one during processing, it may return 0. We don't mind, + * because btrfs_commit_super will poke cleaner thread and it will process it a + * few seconds later. */ -int btrfs_clean_old_snapshots(struct btrfs_root *root) +int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) { - LIST_HEAD(list); + int ret; struct btrfs_fs_info *fs_info = root->fs_info; + if (fs_info->sb->s_flags & MS_RDONLY) { + pr_debug("btrfs: cleaner called for RO fs!\n"); + return 0; + } + spin_lock(&fs_info->trans_lock); - list_splice_init(&fs_info->dead_roots, &list); + if (list_empty(&fs_info->dead_roots)) { + spin_unlock(&fs_info->trans_lock); + return 0; + } + root = list_first_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); + list_del(&root->root_list); spin_unlock(&fs_info->trans_lock); - while (!list_empty(&list)) { - int ret; - - root = list_entry(list.next, struct btrfs_root, root_list); - list_del(&root->root_list); + pr_debug("btrfs: cleaner removing %llu\n", + (unsigned long long)root->objectid); - btrfs_kill_all_delayed_nodes(root); + btrfs_kill_all_delayed_nodes(root); - if (btrfs_header_backref_rev(root->node) < - BTRFS_MIXED_BACKREF_REV) - ret = btrfs_drop_snapshot(root, NULL, 0, 0); - else - ret =btrfs_drop_snapshot(root, NULL, 1, 0); - BUG_ON(ret < 0); - } - return 0; + if (btrfs_header_backref_rev(root->node) < + BTRFS_MIXED_BACKREF_REV) + ret = btrfs_drop_snapshot(root, NULL, 0, 0); + else + ret = btrfs_drop_snapshot(root, NULL, 1, 0); + /* + * If we encounter a transaction abort during snapshot cleaning, we + * don't want to crash here + */ + BUG_ON(ret < 0 && ret != -EAGAIN && ret != -EROFS); + return 1; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 3c8e0d25c8e4..24c97335a59f 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -123,7 +123,7 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, int btrfs_add_dead_root(struct btrfs_root *root); int btrfs_defrag_root(struct btrfs_root *root); -int btrfs_clean_old_snapshots(struct btrfs_root *root); +int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, @@ -146,5 +146,4 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark); int btrfs_transaction_blocked(struct btrfs_fs_info *info); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); -void put_transaction(struct btrfs_transaction *transaction); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ef96381569a4..c276ac9a0ec3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -277,17 +277,19 @@ static int process_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, struct walk_control *wc, u64 gen) { + int ret = 0; + if (wc->pin) - btrfs_pin_extent_for_log_replay(log->fs_info->extent_root, - eb->start, eb->len); + ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root, + eb->start, eb->len); - if (btrfs_buffer_uptodate(eb, gen, 0)) { + if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { if (wc->write) btrfs_write_tree_block(eb); if (wc->wait) btrfs_wait_tree_block_writeback(eb); } - return 0; + return ret; } /* @@ -408,9 +410,9 @@ insert: found_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); if (found_size > item_size) - btrfs_truncate_item(trans, root, path, item_size, 1); + btrfs_truncate_item(root, path, item_size, 1); else if (found_size < item_size) - btrfs_extend_item(trans, root, path, + btrfs_extend_item(root, path, item_size - found_size); } else if (ret) { return ret; @@ -587,7 +589,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, /* drop any overlapping extents */ ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1); - BUG_ON(ret); + if (ret) + goto out; if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { @@ -597,7 +600,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, root, path, key, sizeof(*item)); - BUG_ON(ret); + if (ret) + goto out; dest_offset = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); copy_extent_buffer(path->nodes[0], eb, dest_offset, @@ -623,7 +627,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ins.objectid, ins.offset, 0, root->root_key.objectid, key->objectid, offset, 0); - BUG_ON(ret); + if (ret) + goto out; } else { /* * insert the extent pointer in the extent @@ -632,7 +637,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_alloc_logged_file_extent(trans, root, root->root_key.objectid, key->objectid, offset, &ins); - BUG_ON(ret); + if (ret) + goto out; } btrfs_release_path(path); @@ -649,26 +655,30 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_lookup_csums_range(root->log_root, csum_start, csum_end - 1, &ordered_sums, 0); - BUG_ON(ret); + if (ret) + goto out; while (!list_empty(&ordered_sums)) { struct btrfs_ordered_sum *sums; sums = list_entry(ordered_sums.next, struct btrfs_ordered_sum, list); - ret = btrfs_csum_file_blocks(trans, + if (!ret) + ret = btrfs_csum_file_blocks(trans, root->fs_info->csum_root, sums); - BUG_ON(ret); list_del(&sums->list); kfree(sums); } + if (ret) + goto out; } else { btrfs_release_path(path); } } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { /* inline extents are easy, we just overwrite them */ ret = overwrite_item(trans, root, path, eb, slot, key); - BUG_ON(ret); + if (ret) + goto out; } inode_add_bytes(inode, nbytes); @@ -713,20 +723,21 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, inode = read_one_inode(root, location.objectid); if (!inode) { - kfree(name); - return -EIO; + ret = -EIO; + goto out; } ret = link_to_fixup_dir(trans, root, path, location.objectid); - BUG_ON(ret); + if (ret) + goto out; ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); - BUG_ON(ret); + if (ret) + goto out; + btrfs_run_delayed_items(trans, root); +out: kfree(name); - iput(inode); - - btrfs_run_delayed_items(trans, root); return ret; } @@ -879,7 +890,8 @@ again: victim_name_len = btrfs_inode_ref_name_len(leaf, victim_ref); victim_name = kmalloc(victim_name_len, GFP_NOFS); - BUG_ON(!victim_name); + if (!victim_name) + return -ENOMEM; read_extent_buffer(leaf, victim_name, (unsigned long)(victim_ref + 1), @@ -895,9 +907,10 @@ again: ret = btrfs_unlink_inode(trans, root, dir, inode, victim_name, victim_name_len); - BUG_ON(ret); - btrfs_run_delayed_items(trans, root); kfree(victim_name); + if (ret) + return ret; + btrfs_run_delayed_items(trans, root); *search_done = 1; goto again; } @@ -905,7 +918,6 @@ again: ptr = (unsigned long)(victim_ref + 1) + victim_name_len; } - BUG_ON(ret); /* * NOTE: we have searched root tree and checked the @@ -939,6 +951,8 @@ again: goto next; victim_name = kmalloc(victim_name_len, GFP_NOFS); + if (!victim_name) + return -ENOMEM; read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name, victim_name_len); @@ -965,14 +979,16 @@ again: victim_name_len); btrfs_run_delayed_items(trans, root); } - BUG_ON(ret); iput(victim_parent); kfree(victim_name); + if (ret) + return ret; *search_done = 1; goto again; } kfree(victim_name); - BUG_ON(ret); + if (ret) + return ret; next: cur_offset += victim_name_len + sizeof(*extref); } @@ -985,7 +1001,8 @@ next: ref_index, name, namelen, 0); if (di && !IS_ERR(di)) { ret = drop_one_dir_item(trans, root, path, dir, di); - BUG_ON(ret); + if (ret) + return ret; } btrfs_release_path(path); @@ -994,7 +1011,8 @@ next: name, namelen, 0); if (di && !IS_ERR(di)) { ret = drop_one_dir_item(trans, root, path, dir, di); - BUG_ON(ret); + if (ret) + return ret; } btrfs_release_path(path); @@ -1139,15 +1157,19 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, parent_objectid, ref_index, name, namelen, &search_done); - if (ret == 1) + if (ret == 1) { + ret = 0; + goto out; + } + if (ret) goto out; - BUG_ON(ret); } /* insert our name */ ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, ref_index); - BUG_ON(ret); + if (ret) + goto out; btrfs_update_inode(trans, root, inode); } @@ -1162,13 +1184,11 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, /* finally write the back reference in the inode */ ret = overwrite_item(trans, root, path, eb, slot, key); - BUG_ON(ret); - out: btrfs_release_path(path); iput(dir); iput(inode); - return 0; + return ret; } static int insert_orphan_item(struct btrfs_trans_handle *trans, @@ -1326,10 +1346,10 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (S_ISDIR(inode->i_mode)) { ret = replay_dir_deletes(trans, root, NULL, path, ino, 1); - BUG_ON(ret); + if (ret) + goto out; } ret = insert_orphan_item(trans, root, ino); - BUG_ON(ret); } out: @@ -1374,9 +1394,9 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, return -EIO; ret = fixup_inode_link_count(trans, root, inode); - BUG_ON(ret); - iput(inode); + if (ret) + goto out; /* * fixup on a directory may create new entries, @@ -1426,7 +1446,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, } else if (ret == -EEXIST) { ret = 0; } else { - BUG(); + BUG(); /* Logic Error */ } iput(inode); @@ -1495,7 +1515,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct inode *dir; u8 log_type; int exists; - int ret; + int ret = 0; dir = read_one_inode(root, key->objectid); if (!dir) @@ -1527,7 +1547,9 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, key->offset, name, name_len, 1); } else { - BUG(); + /* Corruption */ + ret = -EINVAL; + goto out; } if (IS_ERR_OR_NULL(dst_di)) { /* we need a sequence number to insert, so we only @@ -1555,7 +1577,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, goto out; ret = drop_one_dir_item(trans, root, path, dir, dst_di); - BUG_ON(ret); + if (ret) + goto out; if (key->type == BTRFS_DIR_INDEX_KEY) goto insert; @@ -1563,14 +1586,15 @@ out: btrfs_release_path(path); kfree(name); iput(dir); - return 0; + return ret; insert: btrfs_release_path(path); ret = insert_one_name(trans, root, path, key->objectid, key->offset, name, name_len, log_type, &log_key); - - BUG_ON(ret && ret != -ENOENT); + if (ret && ret != -ENOENT) + goto out; + ret = 0; goto out; } @@ -1601,7 +1625,8 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, return -EIO; name_len = btrfs_dir_name_len(eb, di); ret = replay_one_name(trans, root, path, eb, di, key); - BUG_ON(ret); + if (ret) + return ret; ptr = (unsigned long)(di + 1); ptr += name_len; } @@ -1762,16 +1787,21 @@ again: ret = link_to_fixup_dir(trans, root, path, location.objectid); - BUG_ON(ret); + if (ret) { + kfree(name); + iput(inode); + goto out; + } + btrfs_inc_nlink(inode); ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); - BUG_ON(ret); - - btrfs_run_delayed_items(trans, root); - + if (!ret) + btrfs_run_delayed_items(trans, root); kfree(name); iput(inode); + if (ret) + goto out; /* there might still be more names under this key * check and repeat if required @@ -1875,7 +1905,8 @@ again: ret = check_item_in_log(trans, root, log, path, log_path, dir, &found_key); - BUG_ON(ret); + if (ret) + goto out; if (found_key.offset == (u64)-1) break; dir_key.offset = found_key.offset + 1; @@ -1952,11 +1983,13 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, if (S_ISDIR(mode)) { ret = replay_dir_deletes(wc->trans, root, log, path, key.objectid, 0); - BUG_ON(ret); + if (ret) + break; } ret = overwrite_item(wc->trans, root, path, eb, i, &key); - BUG_ON(ret); + if (ret) + break; /* for regular files, make sure corresponding * orhpan item exist. extents past the new EOF @@ -1965,12 +1998,14 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, if (S_ISREG(mode)) { ret = insert_orphan_item(wc->trans, root, key.objectid); - BUG_ON(ret); + if (ret) + break; } ret = link_to_fixup_dir(wc->trans, root, path, key.objectid); - BUG_ON(ret); + if (ret) + break; } if (wc->stage < LOG_WALK_REPLAY_ALL) continue; @@ -1979,28 +2014,35 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, if (key.type == BTRFS_XATTR_ITEM_KEY) { ret = overwrite_item(wc->trans, root, path, eb, i, &key); - BUG_ON(ret); + if (ret) + break; } else if (key.type == BTRFS_INODE_REF_KEY) { ret = add_inode_ref(wc->trans, root, log, path, eb, i, &key); - BUG_ON(ret && ret != -ENOENT); + if (ret && ret != -ENOENT) + break; + ret = 0; } else if (key.type == BTRFS_INODE_EXTREF_KEY) { ret = add_inode_ref(wc->trans, root, log, path, eb, i, &key); - BUG_ON(ret && ret != -ENOENT); + if (ret && ret != -ENOENT) + break; + ret = 0; } else if (key.type == BTRFS_EXTENT_DATA_KEY) { ret = replay_one_extent(wc->trans, root, path, eb, i, &key); - BUG_ON(ret); + if (ret) + break; } else if (key.type == BTRFS_DIR_ITEM_KEY || key.type == BTRFS_DIR_INDEX_KEY) { ret = replay_one_dir_item(wc->trans, root, path, eb, i, &key); - BUG_ON(ret); + if (ret) + break; } } btrfs_free_path(path); - return 0; + return ret; } static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, @@ -2045,8 +2087,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, if (*level == 1) { ret = wc->process_func(root, next, wc, ptr_gen); - if (ret) + if (ret) { + free_extent_buffer(next); return ret; + } path->slots[*level]++; if (wc->free) { @@ -2066,7 +2110,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_and_pin_reserved_extent(root, bytenr, blocksize); - BUG_ON(ret); /* -ENOMEM or logic errors */ + if (ret) { + free_extent_buffer(next); + return ret; + } } free_extent_buffer(next); continue; @@ -2139,7 +2186,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, ret = btrfs_free_and_pin_reserved_extent(root, path->nodes[*level]->start, path->nodes[*level]->len); - BUG_ON(ret); + if (ret) + return ret; } free_extent_buffer(path->nodes[*level]); path->nodes[*level] = NULL; @@ -2161,7 +2209,6 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, int wret; int level; struct btrfs_path *path; - int i; int orig_level; path = btrfs_alloc_path(); @@ -2213,17 +2260,12 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_and_pin_reserved_extent(log, next->start, next->len); - BUG_ON(ret); /* -ENOMEM or logic errors */ + if (ret) + goto out; } } out: - for (i = 0; i <= orig_level; i++) { - if (path->nodes[i]) { - free_extent_buffer(path->nodes[i]); - path->nodes[i] = NULL; - } - } btrfs_free_path(path); return ret; } @@ -2507,7 +2549,10 @@ static void free_log_tree(struct btrfs_trans_handle *trans, if (trans) { ret = walk_log_tree(trans, log, &wc); - BUG_ON(ret); + + /* I don't think this can happen but just in case */ + if (ret) + btrfs_abort_transaction(trans, log, ret); } while (1) { @@ -2615,7 +2660,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, if (di) { ret = btrfs_delete_one_dir_name(trans, log, path, di); bytes_del += name_len; - BUG_ON(ret); + if (ret) { + err = ret; + goto fail; + } } btrfs_release_path(path); di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, @@ -2627,7 +2675,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, if (di) { ret = btrfs_delete_one_dir_name(trans, log, path, di); bytes_del += name_len; - BUG_ON(ret); + if (ret) { + err = ret; + goto fail; + } } /* update the directory size in the log to reflect the names @@ -2966,7 +3017,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, while (1) { ret = btrfs_search_slot(trans, log, &key, path, -1, 1); - BUG_ON(ret == 0); + BUG_ON(ret == 0); /* Logic error */ if (ret < 0) break; @@ -3169,7 +3220,11 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, log->fs_info->csum_root, ds + cs, ds + cs + cl - 1, &ordered_sums, 0); - BUG_ON(ret); + if (ret) { + btrfs_release_path(dst_path); + kfree(ins_data); + return ret; + } } } } @@ -3209,115 +3264,6 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) return 0; } -static int drop_adjacent_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - struct extent_map *em, - struct btrfs_path *path) -{ - struct btrfs_file_extent_item *fi; - struct extent_buffer *leaf; - struct btrfs_key key, new_key; - struct btrfs_map_token token; - u64 extent_end; - u64 extent_offset = 0; - int extent_type; - int del_slot = 0; - int del_nr = 0; - int ret = 0; - - while (1) { - btrfs_init_map_token(&token); - leaf = path->nodes[0]; - path->slots[0]++; - if (path->slots[0] >= btrfs_header_nritems(leaf)) { - if (del_nr) { - ret = btrfs_del_items(trans, root, path, - del_slot, del_nr); - if (ret) - return ret; - del_nr = 0; - } - - ret = btrfs_next_leaf_write(trans, root, path, 1); - if (ret < 0) - return ret; - if (ret > 0) - return 0; - leaf = path->nodes[0]; - } - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != btrfs_ino(inode) || - key.type != BTRFS_EXTENT_DATA_KEY || - key.offset >= em->start + em->len) - break; - - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - extent_type = btrfs_token_file_extent_type(leaf, fi, &token); - if (extent_type == BTRFS_FILE_EXTENT_REG || - extent_type == BTRFS_FILE_EXTENT_PREALLOC) { - extent_offset = btrfs_token_file_extent_offset(leaf, - fi, &token); - extent_end = key.offset + - btrfs_token_file_extent_num_bytes(leaf, fi, - &token); - } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - extent_end = key.offset + - btrfs_file_extent_inline_len(leaf, fi); - } else { - BUG(); - } - - if (extent_end <= em->len + em->start) { - if (!del_nr) { - del_slot = path->slots[0]; - } - del_nr++; - continue; - } - - /* - * Ok so we'll ignore previous items if we log a new extent, - * which can lead to overlapping extents, so if we have an - * existing extent we want to adjust we _have_ to check the next - * guy to make sure we even need this extent anymore, this keeps - * us from panicing in set_item_key_safe. - */ - if (path->slots[0] < btrfs_header_nritems(leaf) - 1) { - struct btrfs_key tmp_key; - - btrfs_item_key_to_cpu(leaf, &tmp_key, - path->slots[0] + 1); - if (tmp_key.objectid == btrfs_ino(inode) && - tmp_key.type == BTRFS_EXTENT_DATA_KEY && - tmp_key.offset <= em->start + em->len) { - if (!del_nr) - del_slot = path->slots[0]; - del_nr++; - continue; - } - } - - BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); - memcpy(&new_key, &key, sizeof(new_key)); - new_key.offset = em->start + em->len; - btrfs_set_item_key_safe(trans, root, path, &new_key); - extent_offset += em->start + em->len - key.offset; - btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, - &token); - btrfs_set_token_file_extent_num_bytes(leaf, fi, extent_end - - (em->start + em->len), - &token); - btrfs_mark_buffer_dirty(leaf); - } - - if (del_nr) - ret = btrfs_del_items(trans, root, path, del_slot, del_nr); - - return ret; -} - static int log_one_extent(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_root *root, struct extent_map *em, struct btrfs_path *path) @@ -3339,39 +3285,24 @@ static int log_one_extent(struct btrfs_trans_handle *trans, int index = log->log_transid % 2; bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; -insert: + ret = __btrfs_drop_extents(trans, log, inode, path, em->start, + em->start + em->len, NULL, 0); + if (ret) + return ret; + INIT_LIST_HEAD(&ordered_sums); btrfs_init_map_token(&token); key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = em->start; - path->really_keep_locks = 1; ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi)); - if (ret && ret != -EEXIST) { - path->really_keep_locks = 0; + if (ret) return ret; - } leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - /* - * If we are overwriting an inline extent with a real one then we need - * to just delete the inline extent as it may not be large enough to - * have the entire file_extent_item. - */ - if (ret && btrfs_token_file_extent_type(leaf, fi, &token) == - BTRFS_FILE_EXTENT_INLINE) { - ret = btrfs_del_item(trans, log, path); - btrfs_release_path(path); - if (ret) { - path->really_keep_locks = 0; - return ret; - } - goto insert; - } - btrfs_set_token_file_extent_generation(leaf, fi, em->generation, &token); if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { @@ -3410,22 +3341,14 @@ insert: em->start - em->orig_start, &token); btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); - btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->len, &token); + btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, &token); btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); btrfs_mark_buffer_dirty(leaf); - /* - * Have to check the extent to the right of us to make sure it doesn't - * fall in our current range. We're ok if the previous extent is in our - * range since the recovery stuff will run us in key order and thus just - * drop the part we overwrote. - */ - ret = drop_adjacent_extents(trans, log, inode, em, path); btrfs_release_path(path); - path->really_keep_locks = 0; if (ret) { return ret; } @@ -3650,8 +3573,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, bool fast_search = false; u64 ino = btrfs_ino(inode); - log = root->log_root; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -3918,9 +3839,9 @@ out: * only logging is done of any parent directories that are older than * the last committed transaction */ -int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - struct dentry *parent, int exists_only) +static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct dentry *parent, int exists_only) { int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; struct super_block *sb; @@ -4111,6 +4032,9 @@ again: wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); if (IS_ERR(wc.replay_dest)) { ret = PTR_ERR(wc.replay_dest); + free_extent_buffer(log->node); + free_extent_buffer(log->commit_root); + kfree(log); btrfs_error(fs_info, ret, "Couldn't read target root " "for tree log recovery."); goto error; @@ -4119,12 +4043,10 @@ again: wc.replay_dest->log_root = log; btrfs_record_root_in_trans(trans, wc.replay_dest); ret = walk_log_tree(trans, log, &wc); - BUG_ON(ret); - if (wc.stage == LOG_WALK_REPLAY_ALL) { + if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { ret = fixup_inode_link_counts(trans, wc.replay_dest, path); - BUG_ON(ret); } key.offset = found_key.offset - 1; @@ -4133,6 +4055,9 @@ again: free_extent_buffer(log->commit_root); kfree(log); + if (ret) + goto error; + if (found_key.offset == 0) break; } @@ -4153,17 +4078,20 @@ again: btrfs_free_path(path); + /* step 4: commit the transaction, which also unpins the blocks */ + ret = btrfs_commit_transaction(trans, fs_info->tree_root); + if (ret) + return ret; + free_extent_buffer(log_root_tree->node); log_root_tree->log_root = NULL; fs_info->log_root_recovering = 0; - - /* step 4: commit the transaction, which also unpins the blocks */ - btrfs_commit_transaction(trans, fs_info->tree_root); - kfree(log_root_tree); - return 0; + return 0; error: + if (wc.trans) + btrfs_end_transaction(wc.trans, fs_info->tree_root); btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 862ac813f6b8..1d4ae0d15a70 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -40,9 +40,6 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct inode *inode, u64 dirid); void btrfs_end_log_trans(struct btrfs_root *root); int btrfs_pin_log_trans(struct btrfs_root *root); -int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - struct dentry *parent, int exists_only); void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, struct inode *dir, struct inode *inode, int for_rename); diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index ddc61cad0080..7b417e20efe2 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -53,6 +53,7 @@ void ulist_init(struct ulist *ulist) ulist->nnodes = 0; ulist->nodes = ulist->int_nodes; ulist->nodes_alloced = ULIST_SIZE; + ulist->root = RB_ROOT; } EXPORT_SYMBOL(ulist_init); @@ -72,6 +73,7 @@ void ulist_fini(struct ulist *ulist) if (ulist->nodes_alloced > ULIST_SIZE) kfree(ulist->nodes); ulist->nodes_alloced = 0; /* in case ulist_fini is called twice */ + ulist->root = RB_ROOT; } EXPORT_SYMBOL(ulist_fini); @@ -123,6 +125,45 @@ void ulist_free(struct ulist *ulist) } EXPORT_SYMBOL(ulist_free); +static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val) +{ + struct rb_node *n = ulist->root.rb_node; + struct ulist_node *u = NULL; + + while (n) { + u = rb_entry(n, struct ulist_node, rb_node); + if (u->val < val) + n = n->rb_right; + else if (u->val > val) + n = n->rb_left; + else + return u; + } + return NULL; +} + +static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins) +{ + struct rb_node **p = &ulist->root.rb_node; + struct rb_node *parent = NULL; + struct ulist_node *cur = NULL; + + while (*p) { + parent = *p; + cur = rb_entry(parent, struct ulist_node, rb_node); + + if (cur->val < ins->val) + p = &(*p)->rb_right; + else if (cur->val > ins->val) + p = &(*p)->rb_left; + else + return -EEXIST; + } + rb_link_node(&ins->rb_node, parent, p); + rb_insert_color(&ins->rb_node, &ulist->root); + return 0; +} + /** * ulist_add - add an element to the ulist * @ulist: ulist to add the element to @@ -151,14 +192,13 @@ int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask) int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, u64 *old_aux, gfp_t gfp_mask) { - int i; - - for (i = 0; i < ulist->nnodes; ++i) { - if (ulist->nodes[i].val == val) { - if (old_aux) - *old_aux = ulist->nodes[i].aux; - return 0; - } + int ret = 0; + struct ulist_node *node = NULL; + node = ulist_rbtree_search(ulist, val); + if (node) { + if (old_aux) + *old_aux = node->aux; + return 0; } if (ulist->nnodes >= ulist->nodes_alloced) { @@ -187,6 +227,8 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, } ulist->nodes[ulist->nnodes].val = val; ulist->nodes[ulist->nnodes].aux = aux; + ret = ulist_rbtree_insert(ulist, &ulist->nodes[ulist->nnodes]); + BUG_ON(ret); ++ulist->nnodes; return 1; diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index 21a1963439c3..fb36731074b5 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -8,6 +8,9 @@ #ifndef __ULIST__ #define __ULIST__ +#include <linux/list.h> +#include <linux/rbtree.h> + /* * ulist is a generic data structure to hold a collection of unique u64 * values. The only operations it supports is adding to the list and @@ -34,6 +37,7 @@ struct ulist_iterator { struct ulist_node { u64 val; /* value to store */ u64 aux; /* auxiliary value saved along with the val */ + struct rb_node rb_node; /* used to speed up search */ }; struct ulist { @@ -54,6 +58,8 @@ struct ulist { */ struct ulist_node *nodes; + struct rb_root root; + /* * inline storage space for the first ULIST_SIZE entries */ diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2854c824ab64..0e925ced971b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -46,6 +46,7 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_device *device); static int btrfs_relocate_sys_chunks(struct btrfs_root *root); static void __btrfs_reset_dev_stats(struct btrfs_device *dev); +static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); static DEFINE_MUTEX(uuid_mutex); @@ -717,9 +718,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, if (!device->name) continue; - ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, - &bdev, &bh); - if (ret) + /* Just open everything we can; ignore failures here */ + if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, + &bdev, &bh)) continue; disk_super = (struct btrfs_super_block *)bh->b_data; @@ -1199,10 +1200,10 @@ out: return ret; } -int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, - u64 chunk_tree, u64 chunk_objectid, - u64 chunk_offset, u64 start, u64 num_bytes) +static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset, u64 start, u64 num_bytes) { int ret; struct btrfs_path *path; @@ -1329,9 +1330,9 @@ error: * the device information is stored in the chunk root * the btrfs_device struct should be fully filled in */ -int btrfs_add_device(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_device *device) +static int btrfs_add_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device) { int ret; struct btrfs_path *path; @@ -1710,8 +1711,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, mutex_unlock(&fs_info->fs_devices->device_list_mutex); } -int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, - struct btrfs_device **device) +static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, + struct btrfs_device **device) { int ret = 0; struct btrfs_super_block *disk_super; @@ -3607,7 +3608,7 @@ static int btrfs_cmp_device_info(const void *a, const void *b) return 0; } -struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { +static struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { .sub_stripes = 2, .dev_stripes = 1, @@ -3674,18 +3675,10 @@ static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) { - u64 features; - if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))) return; - features = btrfs_super_incompat_flags(info->super_copy); - if (features & BTRFS_FEATURE_INCOMPAT_RAID56) - return; - - features |= BTRFS_FEATURE_INCOMPAT_RAID56; - btrfs_set_super_incompat_flags(info->super_copy, features); - printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n"); + btrfs_set_fs_incompat(info, RAID56); } static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, @@ -3932,7 +3925,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, em_tree = &extent_root->fs_info->mapping_tree.map_tree; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); + ret = add_extent_mapping(em_tree, em, 0); write_unlock(&em_tree->lock); if (ret) { free_extent_map(em); @@ -4240,9 +4233,25 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, len); read_unlock(&em_tree->lock); - BUG_ON(!em); - BUG_ON(em->start > logical || em->start + em->len < logical); + /* + * We could return errors for these cases, but that could get ugly and + * we'd probably do the same thing which is just not do anything else + * and exit, so return 1 so the callers don't try to use other copies. + */ + if (!em) { + btrfs_emerg(fs_info, "No mapping for %Lu-%Lu\n", logical, + logical+len); + return 1; + } + + if (em->start > logical || em->start + em->len < logical) { + btrfs_emerg(fs_info, "Invalid mapping for %Lu-%Lu, got " + "%Lu-%Lu\n", logical, logical+len, em->start, + em->start + em->len); + return 1; + } + map = (struct map_lookup *)em->bdev; if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) ret = map->num_stripes; @@ -4411,13 +4420,19 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, read_unlock(&em_tree->lock); if (!em) { - printk(KERN_CRIT "btrfs: unable to find logical %llu len %llu\n", - (unsigned long long)logical, - (unsigned long long)*length); - BUG(); + btrfs_crit(fs_info, "unable to find logical %llu len %llu", + (unsigned long long)logical, + (unsigned long long)*length); + return -EINVAL; + } + + if (em->start > logical || em->start + em->len < logical) { + btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, " + "found %Lu-%Lu\n", logical, em->start, + em->start + em->len); + return -EINVAL; } - BUG_ON(em->start > logical || em->start + em->len < logical); map = (struct map_lookup *)em->bdev; offset = logical - em->start; @@ -5106,9 +5121,9 @@ struct async_sched { * This will add one bio to the pending list for a device and make sure * the work struct is scheduled. */ -noinline void btrfs_schedule_bio(struct btrfs_root *root, - struct btrfs_device *device, - int rw, struct bio *bio) +static noinline void btrfs_schedule_bio(struct btrfs_root *root, + struct btrfs_device *device, + int rw, struct bio *bio) { int should_queue = 1; struct btrfs_pending_bios *pending_bios; @@ -5177,7 +5192,7 @@ static int bio_size_ok(struct block_device *bdev, struct bio *bio, } prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; - if ((bio->bi_size >> 9) > max_sectors) + if (bio_sectors(bio) > max_sectors) return 0; if (!q->merge_bvec_fn) @@ -5308,10 +5323,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, } if (map_length < length) { - printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu " - "len %llu\n", (unsigned long long)logical, - (unsigned long long)length, - (unsigned long long)map_length); + btrfs_crit(root->fs_info, "mapping failed logical %llu bio len %llu len %llu", + (unsigned long long)logical, + (unsigned long long)length, + (unsigned long long)map_length); BUG(); } @@ -5476,7 +5491,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, } write_lock(&map_tree->map_tree.lock); - ret = add_extent_mapping(&map_tree->map_tree, em); + ret = add_extent_mapping(&map_tree->map_tree, em, 0); write_unlock(&map_tree->map_tree.lock); BUG_ON(ret); /* Tree corruption */ free_extent_map(em); @@ -5583,8 +5598,8 @@ static int read_one_dev(struct btrfs_root *root, return -EIO; if (!device) { - printk(KERN_WARNING "warning devid %llu missing\n", - (unsigned long long)devid); + btrfs_warn(root->fs_info, "devid %llu missing", + (unsigned long long)devid); device = add_missing_dev(root, devid, dev_uuid); if (!device) return -ENOMEM; @@ -5926,7 +5941,7 @@ void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) btrfs_dev_stat_print_on_error(dev); } -void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) +static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) { if (!dev->dev_stats_valid) return; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 062d8604d35b..845ccbb0d2e3 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -254,10 +254,6 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, #define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \ (sizeof(struct btrfs_bio_stripe) * (n))) -int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, - u64 chunk_tree, u64 chunk_objectid, - u64 chunk_offset, u64 start, u64 num_bytes); int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num); @@ -282,11 +278,6 @@ void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, char *device_path, struct btrfs_device **device); -int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, - struct btrfs_device **device); -int btrfs_add_device(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_device *device); int btrfs_rm_device(struct btrfs_root *root, char *device_path); void btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); @@ -307,7 +298,6 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); -void btrfs_dev_stat_print_on_error(struct btrfs_device *device); void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); int btrfs_get_dev_stats(struct btrfs_root *root, struct btrfs_ioctl_get_dev_stats *stats); @@ -321,9 +311,6 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); int btrfs_scratch_superblock(struct btrfs_device *device); -void btrfs_schedule_bio(struct btrfs_root *root, - struct btrfs_device *device, - int rw, struct bio *bio); int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len, int mirror_num); unsigned long btrfs_full_stripe_len(struct btrfs_root *root, diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 446a6848c554..05740b9789e4 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -406,8 +406,8 @@ int btrfs_removexattr(struct dentry *dentry, const char *name) XATTR_REPLACE); } -int btrfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, - void *fs_info) +static int btrfs_initxattrs(struct inode *inode, + const struct xattr *xattr_array, void *fs_info) { const struct xattr *xattr; struct btrfs_trans_handle *trans = fs_info; diff --git a/fs/buffer.c b/fs/buffer.c index bc1fe14aaa3e..d2a4d1bb2d57 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2977,7 +2977,6 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) bio->bi_io_vec[0].bv_offset = bh_offset(bh); bio->bi_vcnt = 1; - bio->bi_idx = 0; bio->bi_size = bh->b_size; bio->bi_end_io = end_bio_bh_io_sync; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d70830c66833..656e16907430 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -7,6 +7,7 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/writeback.h> +#include <linux/aio.h> #include "super.h" #include "mds_client.h" diff --git a/fs/compat.c b/fs/compat.c index 93f7d021b716..fc3b55dce184 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -47,6 +47,7 @@ #include <linux/fs_struct.h> #include <linux/slab.h> #include <linux/pagemap.h> +#include <linux/aio.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> diff --git a/fs/direct-io.c b/fs/direct-io.c index cfb816dc6d9f..7ab90f5081ee 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -37,6 +37,7 @@ #include <linux/uio.h> #include <linux/atomic.h> #include <linux/prefetch.h> +#include <linux/aio.h> /* * How many user pages to map in one call to get_user_pages(). This determines @@ -441,8 +442,8 @@ static struct bio *dio_await_one(struct dio *dio) static int dio_bio_complete(struct dio *dio, struct bio *bio) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec; - int page_no; + struct bio_vec *bvec; + unsigned i; if (!uptodate) dio->io_error = -EIO; @@ -450,8 +451,8 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio) if (dio->is_async && dio->rw == READ) { bio_check_pages_dirty(bio); /* transfers ownership */ } else { - for (page_no = 0; page_no < bio->bi_vcnt; page_no++) { - struct page *page = bvec[page_no].bv_page; + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; if (dio->rw == READ && !PageCompound(page)) set_page_dirty_lock(page); diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 63b1f54b6a1f..201f0a0d6b0a 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -31,6 +31,7 @@ #include <linux/security.h> #include <linux/compat.h> #include <linux/fs_stack.h> +#include <linux/aio.h> #include "ecryptfs_kernel.h" /** diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index f936cb50dc0d..b74422888604 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -401,7 +401,7 @@ static void _clear_bio(struct bio *bio) struct bio_vec *bv; unsigned i; - __bio_for_each_segment(bv, bio, i, 0) { + bio_for_each_segment_all(bv, bio, i) { unsigned this_count = bv->bv_len; if (likely(PAGE_SIZE == this_count)) diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index b963f38ac298..7682b970d0f1 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -432,7 +432,7 @@ static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) if (!bio) continue; - __bio_for_each_segment(bv, bio, i, 0) { + bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; SetPageUptodate(page); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index fe60cc1117d8..0a87bb10998d 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -31,6 +31,7 @@ #include <linux/mpage.h> #include <linux/fiemap.h> #include <linux/namei.h> +#include <linux/aio.h> #include "ext2.h" #include "acl.h" #include "xip.h" diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index d706dbfa6220..23c712825640 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -27,6 +27,7 @@ #include <linux/writeback.h> #include <linux/mpage.h> #include <linux/namei.h> +#include <linux/aio.h> #include "ext3.h" #include "xattr.h" #include "acl.h" diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 64848b595b24..4959e29573b6 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -23,6 +23,7 @@ #include <linux/jbd2.h> #include <linux/mount.h> #include <linux/path.h> +#include <linux/aio.h> #include <linux/quotaops.h> #include <linux/pagevec.h> #include "ext4.h" diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 98be6f697463..b8d5d351e24f 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -20,6 +20,7 @@ * (sct@redhat.com), 1993, 1998 */ +#include <linux/aio.h> #include "ext4_jbd2.h" #include "truncate.h" #include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 793d44b84d7f..0723774bdfb5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -37,6 +37,7 @@ #include <linux/printk.h> #include <linux/slab.h> #include <linux/ratelimit.h> +#include <linux/aio.h> #include "ext4_jbd2.h" #include "xattr.h" diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 5929cd0baa20..19599bded62a 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -18,6 +18,7 @@ #include <linux/pagevec.h> #include <linux/mpage.h> #include <linux/namei.h> +#include <linux/aio.h> #include <linux/uio.h> #include <linux/bio.h> #include <linux/workqueue.h> diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 2b6fc131e2ce..b1de01da1a40 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -20,6 +20,7 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include <trace/events/f2fs.h> static struct kmem_cache *orphan_entry_slab; static struct kmem_cache *inode_entry_slab; @@ -57,13 +58,19 @@ repeat: cond_resched(); goto repeat; } - if (f2fs_readpage(sbi, page, index, READ_SYNC)) { + if (PageUptodate(page)) + goto out; + + if (f2fs_readpage(sbi, page, index, READ_SYNC)) + goto repeat; + + lock_page(page); + if (page->mapping != mapping) { f2fs_put_page(page, 1); goto repeat; } +out: mark_page_accessed(page); - - /* We do not allow returning an errorneous page */ return page; } @@ -541,54 +548,44 @@ retry: */ static void block_operations(struct f2fs_sb_info *sbi) { - int t; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .for_reclaim = 0, }; + struct blk_plug plug; - /* Stop renaming operation */ - mutex_lock_op(sbi, RENAME); - mutex_lock_op(sbi, DENTRY_OPS); + blk_start_plug(&plug); -retry_dents: - /* write all the dirty dentry pages */ - sync_dirty_dir_inodes(sbi); +retry_flush_dents: + mutex_lock_all(sbi); - mutex_lock_op(sbi, DATA_WRITE); + /* write all the dirty dentry pages */ if (get_pages(sbi, F2FS_DIRTY_DENTS)) { - mutex_unlock_op(sbi, DATA_WRITE); - goto retry_dents; + mutex_unlock_all(sbi); + sync_dirty_dir_inodes(sbi); + goto retry_flush_dents; } - /* block all the operations */ - for (t = DATA_NEW; t <= NODE_TRUNC; t++) - mutex_lock_op(sbi, t); - - mutex_lock(&sbi->write_inode); - /* * POR: we should ensure that there is no dirty node pages * until finishing nat/sit flush. */ -retry: - sync_node_pages(sbi, 0, &wbc); - - mutex_lock_op(sbi, NODE_WRITE); +retry_flush_nodes: + mutex_lock(&sbi->node_write); if (get_pages(sbi, F2FS_DIRTY_NODES)) { - mutex_unlock_op(sbi, NODE_WRITE); - goto retry; + mutex_unlock(&sbi->node_write); + sync_node_pages(sbi, 0, &wbc); + goto retry_flush_nodes; } - mutex_unlock(&sbi->write_inode); + blk_finish_plug(&plug); } static void unblock_operations(struct f2fs_sb_info *sbi) { - int t; - for (t = NODE_WRITE; t >= RENAME; t--) - mutex_unlock_op(sbi, t); + mutex_unlock(&sbi->node_write); + mutex_unlock_all(sbi); } static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) @@ -727,9 +724,13 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long long ckpt_ver; + trace_f2fs_write_checkpoint(sbi->sb, is_umount, "start block_ops"); + mutex_lock(&sbi->cp_mutex); block_operations(sbi); + trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops"); + f2fs_submit_bio(sbi, DATA, true); f2fs_submit_bio(sbi, NODE, true); f2fs_submit_bio(sbi, META, true); @@ -746,13 +747,13 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) flush_nat_entries(sbi); flush_sit_entries(sbi); - reset_victim_segmap(sbi); - /* unlock all the fs_lock[] in do_checkpoint() */ do_checkpoint(sbi, is_umount); unblock_operations(sbi); mutex_unlock(&sbi->cp_mutex); + + trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint"); } void init_orphan_info(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7bd22a201125..91ff93b0b0f4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -12,6 +12,7 @@ #include <linux/f2fs_fs.h> #include <linux/buffer_head.h> #include <linux/mpage.h> +#include <linux/aio.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/blkdev.h> @@ -21,6 +22,7 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include <trace/events/f2fs.h> /* * Lock ordering for the change of data block address: @@ -54,6 +56,8 @@ int reserve_new_block(struct dnode_of_data *dn) if (!inc_valid_block_count(sbi, dn->inode, 1)) return -ENOSPC; + trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node); + __set_data_blkaddr(dn, NEW_ADDR); dn->data_blkaddr = NEW_ADDR; sync_inode_page(dn); @@ -133,7 +137,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) goto end_update; } - /* Frone merge */ + /* Front merge */ if (fofs == start_fofs - 1 && blk_addr == start_blkaddr - 1) { fi->ext.fofs--; fi->ext.blk_addr--; @@ -169,7 +173,7 @@ end_update: return; } -struct page *find_data_page(struct inode *inode, pgoff_t index) +struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct address_space *mapping = inode->i_mapping; @@ -183,7 +187,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) f2fs_put_page(page, 0); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, RDONLY_NODE); + err = get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err) return ERR_PTR(err); f2fs_put_dnode(&dn); @@ -199,12 +203,20 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) if (!page) return ERR_PTR(-ENOMEM); - err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); - if (err) { - f2fs_put_page(page, 1); - return ERR_PTR(err); + if (PageUptodate(page)) { + unlock_page(page); + return page; + } + + err = f2fs_readpage(sbi, page, dn.data_blkaddr, + sync ? READ_SYNC : READA); + if (sync) { + wait_on_page_locked(page); + if (!PageUptodate(page)) { + f2fs_put_page(page, 0); + return ERR_PTR(-EIO); + } } - unlock_page(page); return page; } @@ -222,14 +234,14 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index) int err; set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, RDONLY_NODE); + err = get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err) return ERR_PTR(err); f2fs_put_dnode(&dn); if (dn.data_blkaddr == NULL_ADDR) return ERR_PTR(-ENOENT); - +repeat: page = grab_cache_page(mapping, index); if (!page) return ERR_PTR(-ENOMEM); @@ -241,9 +253,17 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index) BUG_ON(dn.data_blkaddr == NULL_ADDR); err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); - if (err) { - f2fs_put_page(page, 1); + if (err) return ERR_PTR(err); + + lock_page(page); + if (!PageUptodate(page)) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } + if (page->mapping != mapping) { + f2fs_put_page(page, 1); + goto repeat; } return page; } @@ -251,6 +271,9 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index) /* * Caller ensures that this data page is never allocated. * A new zero-filled data page is allocated in the page cache. + * + * Also, caller should grab and release a mutex by calling mutex_lock_op() and + * mutex_unlock_op(). */ struct page *get_new_data_page(struct inode *inode, pgoff_t index, bool new_i_size) @@ -262,7 +285,7 @@ struct page *get_new_data_page(struct inode *inode, pgoff_t index, int err; set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, 0); + err = get_dnode_of_data(&dn, index, ALLOC_NODE); if (err) return ERR_PTR(err); @@ -273,7 +296,7 @@ struct page *get_new_data_page(struct inode *inode, pgoff_t index, } } f2fs_put_dnode(&dn); - +repeat: page = grab_cache_page(mapping, index); if (!page) return ERR_PTR(-ENOMEM); @@ -283,14 +306,21 @@ struct page *get_new_data_page(struct inode *inode, pgoff_t index, if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); } else { err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); - if (err) { - f2fs_put_page(page, 1); + if (err) return ERR_PTR(err); + lock_page(page); + if (!PageUptodate(page)) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } + if (page->mapping != mapping) { + f2fs_put_page(page, 1); + goto repeat; } } - SetPageUptodate(page); if (new_i_size && i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) { @@ -325,21 +355,15 @@ static void read_end_io(struct bio *bio, int err) /* * Fill the locked page with data located in the block address. - * Read operation is synchronous, and caller must unlock the page. + * Return unlocked page. */ int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page, block_t blk_addr, int type) { struct block_device *bdev = sbi->sb->s_bdev; - bool sync = (type == READ_SYNC); struct bio *bio; - /* This page can be already read by other threads */ - if (PageUptodate(page)) { - if (!sync) - unlock_page(page); - return 0; - } + trace_f2fs_readpage(page, blk_addr, type); down_read(&sbi->bio_sem); @@ -354,18 +378,12 @@ int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page, kfree(bio->bi_private); bio_put(bio); up_read(&sbi->bio_sem); + f2fs_put_page(page, 1); return -EFAULT; } submit_bio(type, bio); up_read(&sbi->bio_sem); - - /* wait for read completion if sync */ - if (sync) { - lock_page(page); - if (PageError(page)) - return -EIO; - } return 0; } @@ -387,14 +405,18 @@ static int get_data_block_ro(struct inode *inode, sector_t iblock, /* Get the page offset from the block offset(iblock) */ pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits)); - if (check_extent_cache(inode, pgofs, bh_result)) + if (check_extent_cache(inode, pgofs, bh_result)) { + trace_f2fs_get_data_block(inode, iblock, bh_result, 0); return 0; + } /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pgofs, RDONLY_NODE); - if (err) + err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); + if (err) { + trace_f2fs_get_data_block(inode, iblock, bh_result, err); return (err == -ENOENT) ? 0 : err; + } /* It does not support data allocation */ BUG_ON(create); @@ -419,6 +441,7 @@ static int get_data_block_ro(struct inode *inode, sector_t iblock, bh_result->b_size = (i << blkbits); } f2fs_put_dnode(&dn); + trace_f2fs_get_data_block(inode, iblock, bh_result, 0); return 0; } @@ -437,13 +460,12 @@ static int f2fs_read_data_pages(struct file *file, int do_write_data_page(struct page *page) { struct inode *inode = page->mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); block_t old_blk_addr, new_blk_addr; struct dnode_of_data dn; int err = 0; set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, page->index, RDONLY_NODE); + err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); if (err) return err; @@ -467,8 +489,6 @@ int do_write_data_page(struct page *page) write_data_page(inode, page, &dn, old_blk_addr, &new_blk_addr); update_extent_cache(new_blk_addr, &dn); - F2FS_I(inode)->data_version = - le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver); } out_writepage: f2fs_put_dnode(&dn); @@ -484,10 +504,11 @@ static int f2fs_write_data_page(struct page *page, const pgoff_t end_index = ((unsigned long long) i_size) >> PAGE_CACHE_SHIFT; unsigned offset; + bool need_balance_fs = false; int err = 0; if (page->index < end_index) - goto out; + goto write; /* * If the offset is out-of-range of file size, @@ -499,50 +520,46 @@ static int f2fs_write_data_page(struct page *page, dec_page_count(sbi, F2FS_DIRTY_DENTS); inode_dec_dirty_dents(inode); } - goto unlock_out; + goto out; } zero_user_segment(page, offset, PAGE_CACHE_SIZE); -out: - if (sbi->por_doing) - goto redirty_out; - - if (wbc->for_reclaim && !S_ISDIR(inode->i_mode) && !is_cold_data(page)) +write: + if (sbi->por_doing) { + err = AOP_WRITEPAGE_ACTIVATE; goto redirty_out; + } - mutex_lock_op(sbi, DATA_WRITE); + /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { dec_page_count(sbi, F2FS_DIRTY_DENTS); inode_dec_dirty_dents(inode); + err = do_write_data_page(page); + } else { + int ilock = mutex_lock_op(sbi); + err = do_write_data_page(page); + mutex_unlock_op(sbi, ilock); + need_balance_fs = true; } - err = do_write_data_page(page); - if (err && err != -ENOENT) { - wbc->pages_skipped++; - set_page_dirty(page); - } - mutex_unlock_op(sbi, DATA_WRITE); + if (err == -ENOENT) + goto out; + else if (err) + goto redirty_out; if (wbc->for_reclaim) f2fs_submit_bio(sbi, DATA, true); - if (err == -ENOENT) - goto unlock_out; - clear_cold_data(page); +out: unlock_page(page); - - if (!wbc->for_reclaim && !S_ISDIR(inode->i_mode)) + if (need_balance_fs) f2fs_balance_fs(sbi); return 0; -unlock_out: - unlock_page(page); - return (err == -ENOENT) ? 0 : err; - redirty_out: wbc->pages_skipped++; set_page_dirty(page); - return AOP_WRITEPAGE_ACTIVATE; + return err; } #define MAX_DESIRED_PAGES_WP 4096 @@ -561,19 +578,26 @@ static int f2fs_write_data_pages(struct address_space *mapping, { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + bool locked = false; int ret; long excess_nrtw = 0, desired_nrtw; + /* deal with chardevs and other special file */ + if (!mapping->a_ops->writepage) + return 0; + if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) { desired_nrtw = MAX_DESIRED_PAGES_WP; excess_nrtw = desired_nrtw - wbc->nr_to_write; wbc->nr_to_write = desired_nrtw; } - if (!S_ISDIR(inode->i_mode)) + if (!S_ISDIR(inode->i_mode)) { mutex_lock(&sbi->writepages); + locked = true; + } ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); - if (!S_ISDIR(inode->i_mode)) + if (locked) mutex_unlock(&sbi->writepages); f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL)); @@ -593,39 +617,33 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT; struct dnode_of_data dn; int err = 0; + int ilock; /* for nobh_write_end */ *fsdata = NULL; f2fs_balance_fs(sbi); - +repeat: page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; *pagep = page; - mutex_lock_op(sbi, DATA_NEW); + ilock = mutex_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, 0); - if (err) { - mutex_unlock_op(sbi, DATA_NEW); - f2fs_put_page(page, 1); - return err; - } + err = get_dnode_of_data(&dn, index, ALLOC_NODE); + if (err) + goto err; - if (dn.data_blkaddr == NULL_ADDR) { + if (dn.data_blkaddr == NULL_ADDR) err = reserve_new_block(&dn); - if (err) { - f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, DATA_NEW); - f2fs_put_page(page, 1); - return err; - } - } + f2fs_put_dnode(&dn); + if (err) + goto err; - mutex_unlock_op(sbi, DATA_NEW); + mutex_unlock_op(sbi, ilock); if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) return 0; @@ -636,21 +654,34 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, /* Reading beyond i_size is simple: memset to zero */ zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE); - return 0; + goto out; } if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_CACHE_SIZE); } else { err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC); - if (err) { - f2fs_put_page(page, 1); + if (err) return err; + lock_page(page); + if (!PageUptodate(page)) { + f2fs_put_page(page, 1); + return -EIO; + } + if (page->mapping != mapping) { + f2fs_put_page(page, 1); + goto repeat; } } +out: SetPageUptodate(page); clear_cold_data(page); return 0; + +err: + mutex_unlock_op(sbi, ilock); + f2fs_put_page(page, 1); + return err; } static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, @@ -681,7 +712,7 @@ static void f2fs_invalidate_data_page(struct page *page, unsigned long offset) static int f2fs_release_data_page(struct page *page, gfp_t wait) { ClearPagePrivate(page); - return 0; + return 1; } static int f2fs_set_data_page_dirty(struct page *page) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 025b9e2f935d..8d9943786c31 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -13,7 +13,6 @@ #include <linux/fs.h> #include <linux/backing-dev.h> -#include <linux/proc_fs.h> #include <linux/f2fs_fs.h> #include <linux/blkdev.h> #include <linux/debugfs.h> @@ -106,7 +105,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi) } } mutex_unlock(&sit_i->sentry_lock); - dist = sbi->total_sections * hblks_per_sec * hblks_per_sec / 100; + dist = TOTAL_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100; si->bimodal = bimodal / dist; if (si->dirty_count) si->avg_vblocks = total_vblocks / ndirty; @@ -138,14 +137,13 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi)); si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * TOTAL_SEGS(sbi); if (sbi->segs_per_sec > 1) - si->base_mem += sbi->total_sections * - sizeof(struct sec_entry); + si->base_mem += TOTAL_SECS(sbi) * sizeof(struct sec_entry); si->base_mem += __bitmap_size(sbi, SIT_BITMAP); /* build free segmap */ si->base_mem += sizeof(struct free_segmap_info); si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi)); - si->base_mem += f2fs_bitmap_size(sbi->total_sections); + si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi)); /* build curseg */ si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE; @@ -154,7 +152,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) /* build dirty segmap */ si->base_mem += sizeof(struct dirty_seglist_info); si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(TOTAL_SEGS(sbi)); - si->base_mem += 2 * f2fs_bitmap_size(TOTAL_SEGS(sbi)); + si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi)); /* buld nm */ si->base_mem += sizeof(struct f2fs_nm_info); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 1be948768e2f..1ac6b93036b7 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -148,7 +148,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, for (; bidx < end_block; bidx++) { /* no need to allocate new dentry pages to all the indices */ - dentry_page = find_data_page(dir, bidx); + dentry_page = find_data_page(dir, bidx, true); if (IS_ERR(dentry_page)) { room = true; continue; @@ -189,6 +189,9 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, unsigned int max_depth; unsigned int level; + if (namelen > F2FS_NAME_LEN) + return NULL; + if (npages == 0) return NULL; @@ -246,9 +249,6 @@ ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr) void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, struct page *page, struct inode *inode) { - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); - - mutex_lock_op(sbi, DENTRY_OPS); lock_page(page); wait_on_page_writeback(page); de->ino = cpu_to_le32(inode->i_ino); @@ -262,7 +262,6 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, F2FS_I(inode)->i_pino = dir->i_ino; f2fs_put_page(page, 1); - mutex_unlock_op(sbi, DENTRY_OPS); } void init_dent_inode(const struct qstr *name, struct page *ipage) @@ -281,6 +280,43 @@ void init_dent_inode(const struct qstr *name, struct page *ipage) set_page_dirty(ipage); } +static int make_empty_dir(struct inode *inode, struct inode *parent) +{ + struct page *dentry_page; + struct f2fs_dentry_block *dentry_blk; + struct f2fs_dir_entry *de; + void *kaddr; + + dentry_page = get_new_data_page(inode, 0, true); + if (IS_ERR(dentry_page)) + return PTR_ERR(dentry_page); + + kaddr = kmap_atomic(dentry_page); + dentry_blk = (struct f2fs_dentry_block *)kaddr; + + de = &dentry_blk->dentry[0]; + de->name_len = cpu_to_le16(1); + de->hash_code = 0; + de->ino = cpu_to_le32(inode->i_ino); + memcpy(dentry_blk->filename[0], ".", 1); + set_de_type(de, inode); + + de = &dentry_blk->dentry[1]; + de->hash_code = 0; + de->name_len = cpu_to_le16(2); + de->ino = cpu_to_le32(parent->i_ino); + memcpy(dentry_blk->filename[1], "..", 2); + set_de_type(de, inode); + + test_and_set_bit_le(0, &dentry_blk->dentry_bitmap); + test_and_set_bit_le(1, &dentry_blk->dentry_bitmap); + kunmap_atomic(kaddr); + + set_page_dirty(dentry_page); + f2fs_put_page(dentry_page, 1); + return 0; +} + static int init_inode_metadata(struct inode *inode, struct inode *dir, const struct qstr *name) { @@ -291,7 +327,7 @@ static int init_inode_metadata(struct inode *inode, return err; if (S_ISDIR(inode->i_mode)) { - err = f2fs_make_empty(inode, dir); + err = make_empty_dir(inode, dir); if (err) { remove_inode_page(inode); return err; @@ -314,7 +350,7 @@ static int init_inode_metadata(struct inode *inode, } if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) { inc_nlink(inode); - f2fs_write_inode(inode, NULL); + update_inode_page(inode); } return 0; } @@ -338,7 +374,7 @@ static void update_parent_metadata(struct inode *dir, struct inode *inode, } if (need_dir_update) - f2fs_write_inode(dir, NULL); + update_inode_page(dir); else mark_inode_dirty(dir); @@ -370,6 +406,10 @@ next: goto next; } +/* + * Caller should grab and release a mutex by calling mutex_lock_op() and + * mutex_unlock_op(). + */ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode) { unsigned int bit_pos; @@ -379,7 +419,6 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *in f2fs_hash_t dentry_hash; struct f2fs_dir_entry *de; unsigned int nbucket, nblock; - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); size_t namelen = name->len; struct page *dentry_page = NULL; struct f2fs_dentry_block *dentry_blk = NULL; @@ -409,12 +448,9 @@ start: bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket)); for (block = bidx; block <= (bidx + nblock - 1); block++) { - mutex_lock_op(sbi, DENTRY_OPS); dentry_page = get_new_data_page(dir, block, true); - if (IS_ERR(dentry_page)) { - mutex_unlock_op(sbi, DENTRY_OPS); + if (IS_ERR(dentry_page)) return PTR_ERR(dentry_page); - } dentry_blk = kmap(dentry_page); bit_pos = room_for_filename(dentry_blk, slots); @@ -423,7 +459,6 @@ start: kunmap(dentry_page); f2fs_put_page(dentry_page, 1); - mutex_unlock_op(sbi, DENTRY_OPS); } /* Move to next level to find the empty slot for new dentry */ @@ -453,7 +488,6 @@ add_dentry: fail: kunmap(dentry_page); f2fs_put_page(dentry_page, 1); - mutex_unlock_op(sbi, DENTRY_OPS); return err; } @@ -473,8 +507,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, void *kaddr = page_address(page); int i; - mutex_lock_op(sbi, DENTRY_OPS); - lock_page(page); wait_on_page_writeback(page); @@ -494,7 +526,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (inode && S_ISDIR(inode->i_mode)) { drop_nlink(dir); - f2fs_write_inode(dir, NULL); + update_inode_page(dir); } else { mark_inode_dirty(dir); } @@ -506,7 +538,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, drop_nlink(inode); i_size_write(inode, 0); } - f2fs_write_inode(inode, NULL); + update_inode_page(inode); + if (inode->i_nlink == 0) add_orphan_inode(sbi, inode->i_ino); } @@ -519,45 +552,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, inode_dec_dirty_dents(dir); } f2fs_put_page(page, 1); - - mutex_unlock_op(sbi, DENTRY_OPS); -} - -int f2fs_make_empty(struct inode *inode, struct inode *parent) -{ - struct page *dentry_page; - struct f2fs_dentry_block *dentry_blk; - struct f2fs_dir_entry *de; - void *kaddr; - - dentry_page = get_new_data_page(inode, 0, true); - if (IS_ERR(dentry_page)) - return PTR_ERR(dentry_page); - - kaddr = kmap_atomic(dentry_page); - dentry_blk = (struct f2fs_dentry_block *)kaddr; - - de = &dentry_blk->dentry[0]; - de->name_len = cpu_to_le16(1); - de->hash_code = f2fs_dentry_hash(".", 1); - de->ino = cpu_to_le32(inode->i_ino); - memcpy(dentry_blk->filename[0], ".", 1); - set_de_type(de, inode); - - de = &dentry_blk->dentry[1]; - de->hash_code = f2fs_dentry_hash("..", 2); - de->name_len = cpu_to_le16(2); - de->ino = cpu_to_le32(parent->i_ino); - memcpy(dentry_blk->filename[1], "..", 2); - set_de_type(de, inode); - - test_and_set_bit_le(0, &dentry_blk->dentry_bitmap); - test_and_set_bit_le(1, &dentry_blk->dentry_bitmap); - kunmap_atomic(kaddr); - - set_page_dirty(dentry_page); - f2fs_put_page(dentry_page, 1); - return 0; } bool f2fs_empty_dir(struct inode *dir) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 201c8d3b0f86..20aab02f2a42 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -125,11 +125,15 @@ static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i) * file keeping -1 as its node offset to * distinguish from index node blocks. */ -#define RDONLY_NODE 1 /* - * specify a read-only mode when getting - * a node block. 0 is read-write mode. - * used by get_dnode_of_data(). +enum { + ALLOC_NODE, /* allocate a new node page if needed */ + LOOKUP_NODE, /* look up a node without readahead */ + LOOKUP_NODE_RA, /* + * look up a node with readahead called + * by get_datablock_ro. */ +}; + #define F2FS_LINK_MAX 32000 /* maximum link count per file */ /* for in-memory extent cache entry */ @@ -144,6 +148,7 @@ struct extent_info { * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. */ #define FADVISE_COLD_BIT 0x01 +#define FADVISE_CP_BIT 0x02 struct f2fs_inode_info { struct inode vfs_inode; /* serve a vfs inode */ @@ -155,7 +160,6 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ - unsigned long long data_version;/* latest version of data for fsync */ atomic_t dirty_dents; /* # of dirty dentry pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ @@ -186,7 +190,6 @@ static inline void set_raw_extent(struct extent_info *ext, struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ - nid_t init_scan_nid; /* the first nid to be scanned */ nid_t next_scan_nid; /* the next nid to be scanned */ /* NAT cache management */ @@ -305,23 +308,12 @@ enum count_type { }; /* - * FS_LOCK nesting subclasses for the lock validator: - * - * The locking order between these classes is - * RENAME -> DENTRY_OPS -> DATA_WRITE -> DATA_NEW - * -> DATA_TRUNC -> NODE_WRITE -> NODE_NEW -> NODE_TRUNC + * Uses as sbi->fs_lock[NR_GLOBAL_LOCKS]. + * The checkpoint procedure blocks all the locks in this fs_lock array. + * Some FS operations grab free locks, and if there is no free lock, + * then wait to grab a lock in a round-robin manner. */ -enum lock_type { - RENAME, /* for renaming operations */ - DENTRY_OPS, /* for directory operations */ - DATA_WRITE, /* for data write */ - DATA_NEW, /* for data allocation */ - DATA_TRUNC, /* for data truncate */ - NODE_NEW, /* for node allocation */ - NODE_TRUNC, /* for node truncate */ - NODE_WRITE, /* for node write */ - NR_LOCK_TYPE, -}; +#define NR_GLOBAL_LOCKS 8 /* * The below are the page types of bios used in submti_bio(). @@ -361,11 +353,13 @@ struct f2fs_sb_info { /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ struct inode *meta_inode; /* cache meta blocks */ - struct mutex cp_mutex; /* for checkpoint procedure */ - struct mutex fs_lock[NR_LOCK_TYPE]; /* for blocking FS operations */ - struct mutex write_inode; /* mutex for write inode */ + struct mutex cp_mutex; /* checkpoint procedure lock */ + struct mutex fs_lock[NR_GLOBAL_LOCKS]; /* blocking FS operations */ + struct mutex node_write; /* locking node writes */ struct mutex writepages; /* mutex for writepages() */ + unsigned char next_lock_num; /* round-robin global locks */ int por_doing; /* recovery is doing or not */ + int on_build_free_nids; /* build_free_nids is doing */ /* for orphan inode management */ struct list_head orphan_inode_list; /* orphan inode list */ @@ -406,6 +400,7 @@ struct f2fs_sb_info { /* for cleaning operations */ struct mutex gc_mutex; /* mutex for GC */ struct f2fs_gc_kthread *gc_thread; /* GC thread */ + unsigned int cur_victim_sec; /* current victim section num */ /* * for stat information. @@ -498,22 +493,51 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) cp->ckpt_flags = cpu_to_le32(ckpt_flags); } -static inline void mutex_lock_op(struct f2fs_sb_info *sbi, enum lock_type t) +static inline void mutex_lock_all(struct f2fs_sb_info *sbi) { - mutex_lock_nested(&sbi->fs_lock[t], t); + int i = 0; + for (; i < NR_GLOBAL_LOCKS; i++) + mutex_lock(&sbi->fs_lock[i]); } -static inline void mutex_unlock_op(struct f2fs_sb_info *sbi, enum lock_type t) +static inline void mutex_unlock_all(struct f2fs_sb_info *sbi) { - mutex_unlock(&sbi->fs_lock[t]); + int i = 0; + for (; i < NR_GLOBAL_LOCKS; i++) + mutex_unlock(&sbi->fs_lock[i]); +} + +static inline int mutex_lock_op(struct f2fs_sb_info *sbi) +{ + unsigned char next_lock = sbi->next_lock_num % NR_GLOBAL_LOCKS; + int i = 0; + + for (; i < NR_GLOBAL_LOCKS; i++) + if (mutex_trylock(&sbi->fs_lock[i])) + return i; + + mutex_lock(&sbi->fs_lock[next_lock]); + sbi->next_lock_num++; + return next_lock; +} + +static inline void mutex_unlock_op(struct f2fs_sb_info *sbi, int ilock) +{ + if (ilock < 0) + return; + BUG_ON(ilock >= NR_GLOBAL_LOCKS); + mutex_unlock(&sbi->fs_lock[ilock]); } /* * Check whether the given nid is within node id range. */ -static inline void check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) +static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) { - BUG_ON((nid >= NM_I(sbi)->max_nid)); + WARN_ON((nid >= NM_I(sbi)->max_nid)); + if (nid >= NM_I(sbi)->max_nid) + return -EINVAL; + return 0; } #define F2FS_DEFAULT_ALLOCATED_BLOCKS 1 @@ -819,7 +843,6 @@ static inline int f2fs_clear_bit(unsigned int nr, char *addr) /* used for f2fs_inode_info->flags */ enum { FI_NEW_INODE, /* indicate newly allocated inode */ - FI_NEED_CP, /* need to do checkpoint during fsync */ FI_INC_LINK, /* need to increment i_nlink */ FI_ACL_MODE, /* indicate acl mode */ FI_NO_ALLOC, /* should not allocate any blocks */ @@ -872,6 +895,7 @@ long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); void f2fs_set_inode_flags(struct inode *); struct inode *f2fs_iget(struct super_block *, unsigned long); void update_inode(struct inode *, struct page *); +int update_inode_page(struct inode *); int f2fs_write_inode(struct inode *, struct writeback_control *); void f2fs_evict_inode(struct inode *); @@ -973,7 +997,6 @@ int lookup_journal_in_cursum(struct f2fs_summary_block *, int, unsigned int, int); void flush_sit_entries(struct f2fs_sb_info *); int build_segment_manager(struct f2fs_sb_info *); -void reset_victim_segmap(struct f2fs_sb_info *); void destroy_segment_manager(struct f2fs_sb_info *); /* @@ -1000,7 +1023,7 @@ void destroy_checkpoint_caches(void); */ int reserve_new_block(struct dnode_of_data *); void update_extent_cache(block_t, struct dnode_of_data *); -struct page *find_data_page(struct inode *, pgoff_t); +struct page *find_data_page(struct inode *, pgoff_t, bool); struct page *get_lock_data_page(struct inode *, pgoff_t); struct page *get_new_data_page(struct inode *, pgoff_t, bool); int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int); @@ -1020,7 +1043,7 @@ void destroy_gc_caches(void); /* * recovery.c */ -void recover_fsync_data(struct f2fs_sb_info *); +int recover_fsync_data(struct f2fs_sb_info *); bool space_for_roll_forward(struct f2fs_sb_info *); /* diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index db626282d424..1cae864f8dfc 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -13,6 +13,7 @@ #include <linux/stat.h> #include <linux/buffer_head.h> #include <linux/writeback.h> +#include <linux/blkdev.h> #include <linux/falloc.h> #include <linux/types.h> #include <linux/compat.h> @@ -24,6 +25,7 @@ #include "segment.h" #include "xattr.h" #include "acl.h" +#include <trace/events/f2fs.h> static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -33,19 +35,18 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); block_t old_blk_addr; struct dnode_of_data dn; - int err; + int err, ilock; f2fs_balance_fs(sbi); sb_start_pagefault(inode->i_sb); - mutex_lock_op(sbi, DATA_NEW); - /* block allocation */ + ilock = mutex_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, page->index, 0); + err = get_dnode_of_data(&dn, page->index, ALLOC_NODE); if (err) { - mutex_unlock_op(sbi, DATA_NEW); + mutex_unlock_op(sbi, ilock); goto out; } @@ -55,13 +56,12 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, err = reserve_new_block(&dn); if (err) { f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, DATA_NEW); + mutex_unlock_op(sbi, ilock); goto out; } } f2fs_put_dnode(&dn); - - mutex_unlock_op(sbi, DATA_NEW); + mutex_unlock_op(sbi, ilock); lock_page(page); if (page->mapping != inode->i_mapping || @@ -102,28 +102,10 @@ static const struct vm_operations_struct f2fs_file_vm_ops = { .remap_pages = generic_file_remap_pages, }; -static int need_to_sync_dir(struct f2fs_sb_info *sbi, struct inode *inode) -{ - struct dentry *dentry; - nid_t pino; - - inode = igrab(inode); - dentry = d_find_any_alias(inode); - if (!dentry) { - iput(inode); - return 0; - } - pino = dentry->d_parent->d_inode->i_ino; - dput(dentry); - iput(inode); - return !is_checkpointed_node(sbi, pino); -} - int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - unsigned long long cur_version; int ret = 0; bool need_cp = false; struct writeback_control wbc = { @@ -135,9 +117,12 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (inode->i_sb->s_flags & MS_RDONLY) return 0; + trace_f2fs_sync_file_enter(inode); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) + if (ret) { + trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); return ret; + } /* guarantee free sections for fsync */ f2fs_balance_fs(sbi); @@ -147,28 +132,18 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) goto out; - mutex_lock(&sbi->cp_mutex); - cur_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver); - mutex_unlock(&sbi->cp_mutex); - - if (F2FS_I(inode)->data_version != cur_version && - !(inode->i_state & I_DIRTY)) - goto out; - F2FS_I(inode)->data_version--; - if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) need_cp = true; - else if (is_inode_flag_set(F2FS_I(inode), FI_NEED_CP)) + else if (is_cp_file(inode)) need_cp = true; else if (!space_for_roll_forward(sbi)) need_cp = true; - else if (need_to_sync_dir(sbi, inode)) + else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) need_cp = true; if (need_cp) { /* all the dirty node pages should be flushed for POR */ ret = f2fs_sync_fs(inode->i_sb, 1); - clear_inode_flag(F2FS_I(inode), FI_NEED_CP); } else { /* if there is no written node page, write its inode page */ while (!sync_node_pages(sbi, inode->i_ino, &wbc)) { @@ -178,9 +153,11 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } filemap_fdatawait_range(sbi->node_inode->i_mapping, 0, LONG_MAX); + ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); } out: mutex_unlock(&inode->i_mutex); + trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); return ret; } @@ -216,6 +193,9 @@ static int truncate_data_blocks_range(struct dnode_of_data *dn, int count) sync_inode_page(dn); } dn->ofs_in_node = ofs; + + trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid, + dn->ofs_in_node, nr_free); return nr_free; } @@ -232,11 +212,15 @@ static void truncate_partial_data_page(struct inode *inode, u64 from) if (!offset) return; - page = find_data_page(inode, from >> PAGE_CACHE_SHIFT); + page = find_data_page(inode, from >> PAGE_CACHE_SHIFT, false); if (IS_ERR(page)) return; lock_page(page); + if (page->mapping != inode->i_mapping) { + f2fs_put_page(page, 1); + return; + } wait_on_page_writeback(page); zero_user(page, offset, PAGE_CACHE_SIZE - offset); set_page_dirty(page); @@ -249,20 +233,22 @@ static int truncate_blocks(struct inode *inode, u64 from) unsigned int blocksize = inode->i_sb->s_blocksize; struct dnode_of_data dn; pgoff_t free_from; - int count = 0; + int count = 0, ilock = -1; int err; + trace_f2fs_truncate_blocks_enter(inode, from); + free_from = (pgoff_t) ((from + blocksize - 1) >> (sbi->log_blocksize)); - mutex_lock_op(sbi, DATA_TRUNC); - + ilock = mutex_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, free_from, RDONLY_NODE); + err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); if (err) { if (err == -ENOENT) goto free_next; - mutex_unlock_op(sbi, DATA_TRUNC); + mutex_unlock_op(sbi, ilock); + trace_f2fs_truncate_blocks_exit(inode, err); return err; } @@ -273,6 +259,7 @@ static int truncate_blocks(struct inode *inode, u64 from) count -= dn.ofs_in_node; BUG_ON(count < 0); + if (dn.ofs_in_node || IS_INODE(dn.node_page)) { truncate_data_blocks_range(&dn, count); free_from += count; @@ -281,11 +268,12 @@ static int truncate_blocks(struct inode *inode, u64 from) f2fs_put_dnode(&dn); free_next: err = truncate_inode_blocks(inode, free_from); - mutex_unlock_op(sbi, DATA_TRUNC); + mutex_unlock_op(sbi, ilock); /* lastly zero out the first data page */ truncate_partial_data_page(inode, from); + trace_f2fs_truncate_blocks_exit(inode, err); return err; } @@ -295,6 +283,8 @@ void f2fs_truncate(struct inode *inode) S_ISLNK(inode->i_mode))) return; + trace_f2fs_truncate(inode); + if (!truncate_blocks(inode, i_size_read(inode))) { inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); @@ -389,15 +379,16 @@ static void fill_zero(struct inode *inode, pgoff_t index, { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *page; + int ilock; if (!len) return; f2fs_balance_fs(sbi); - mutex_lock_op(sbi, DATA_NEW); + ilock = mutex_lock_op(sbi); page = get_new_data_page(inode, index, false); - mutex_unlock_op(sbi, DATA_NEW); + mutex_unlock_op(sbi, ilock); if (!IS_ERR(page)) { wait_on_page_writeback(page); @@ -414,15 +405,10 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) for (index = pg_start; index < pg_end; index++) { struct dnode_of_data dn; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - - f2fs_balance_fs(sbi); - mutex_lock_op(sbi, DATA_TRUNC); set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, RDONLY_NODE); + err = get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err) { - mutex_unlock_op(sbi, DATA_TRUNC); if (err == -ENOENT) continue; return err; @@ -431,7 +417,6 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) if (dn.data_blkaddr != NULL_ADDR) truncate_data_blocks_range(&dn, 1); f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, DATA_TRUNC); } return 0; } @@ -461,12 +446,19 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode) if (pg_start < pg_end) { struct address_space *mapping = inode->i_mapping; loff_t blk_start, blk_end; + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + int ilock; + + f2fs_balance_fs(sbi); blk_start = pg_start << PAGE_CACHE_SHIFT; blk_end = pg_end << PAGE_CACHE_SHIFT; truncate_inode_pages_range(mapping, blk_start, blk_end - 1); + + ilock = mutex_lock_op(sbi); ret = truncate_hole(inode, pg_start, pg_end); + mutex_unlock_op(sbi, ilock); } } @@ -500,13 +492,13 @@ static int expand_inode_data(struct inode *inode, loff_t offset, for (index = pg_start; index <= pg_end; index++) { struct dnode_of_data dn; + int ilock; - mutex_lock_op(sbi, DATA_NEW); - + ilock = mutex_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, index, 0); + ret = get_dnode_of_data(&dn, index, ALLOC_NODE); if (ret) { - mutex_unlock_op(sbi, DATA_NEW); + mutex_unlock_op(sbi, ilock); break; } @@ -514,13 +506,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset, ret = reserve_new_block(&dn); if (ret) { f2fs_put_dnode(&dn); - mutex_unlock_op(sbi, DATA_NEW); + mutex_unlock_op(sbi, ilock); break; } } f2fs_put_dnode(&dn); - - mutex_unlock_op(sbi, DATA_NEW); + mutex_unlock_op(sbi, ilock); if (pg_start == pg_end) new_size = offset + len; @@ -559,6 +550,7 @@ static long f2fs_fallocate(struct file *file, int mode, inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); } + trace_f2fs_fallocate(inode, mode, offset, len, ret); return ret; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 2e3eb2d4fc30..14961593e93c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -11,7 +11,6 @@ #include <linux/fs.h> #include <linux/module.h> #include <linux/backing-dev.h> -#include <linux/proc_fs.h> #include <linux/init.h> #include <linux/f2fs_fs.h> #include <linux/kthread.h> @@ -23,6 +22,7 @@ #include "node.h" #include "segment.h" #include "gc.h" +#include <trace/events/f2fs.h> static struct kmem_cache *winode_slab; @@ -81,9 +81,6 @@ static int gc_thread_func(void *data) /* if return value is not zero, no victim was selected */ if (f2fs_gc(sbi)) wait_ms = GC_THREAD_NOGC_SLEEP_TIME; - else if (wait_ms == GC_THREAD_NOGC_SLEEP_TIME) - wait_ms = GC_THREAD_MAX_SLEEP_TIME; - } while (!kthread_should_stop()); return 0; } @@ -131,7 +128,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - if (p->alloc_mode) { + if (p->alloc_mode == SSR) { p->gc_mode = GC_GREEDY; p->dirty_segmap = dirty_i->dirty_segmap[type]; p->ofs_unit = 1; @@ -160,18 +157,21 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int segno; + unsigned int hint = 0; + unsigned int secno; /* * If the gc_type is FG_GC, we can select victim segments * selected by background GC before. * Those segments guarantee they have small valid blocks. */ - segno = find_next_bit(dirty_i->victim_segmap[BG_GC], - TOTAL_SEGS(sbi), 0); - if (segno < TOTAL_SEGS(sbi)) { - clear_bit(segno, dirty_i->victim_segmap[BG_GC]); - return segno; +next: + secno = find_next_bit(dirty_i->victim_secmap, TOTAL_SECS(sbi), hint++); + if (secno < TOTAL_SECS(sbi)) { + if (sec_usage_check(sbi, secno)) + goto next; + clear_bit(secno, dirty_i->victim_secmap); + return secno * sbi->segs_per_sec; } return NULL_SEGNO; } @@ -234,7 +234,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct victim_sel_policy p; - unsigned int segno; + unsigned int secno; int nsearched = 0; p.alloc_mode = alloc_mode; @@ -253,6 +253,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, while (1) { unsigned long cost; + unsigned int segno; segno = find_next_bit(p.dirty_segmap, TOTAL_SEGS(sbi), p.offset); @@ -265,13 +266,11 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, break; } p.offset = ((segno / p.ofs_unit) * p.ofs_unit) + p.ofs_unit; + secno = GET_SECNO(sbi, segno); - if (test_bit(segno, dirty_i->victim_segmap[FG_GC])) - continue; - if (gc_type == BG_GC && - test_bit(segno, dirty_i->victim_segmap[BG_GC])) + if (sec_usage_check(sbi, secno)) continue; - if (IS_CURSEC(sbi, GET_SECNO(sbi, segno))) + if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) continue; cost = get_gc_cost(sbi, segno, &p); @@ -291,13 +290,18 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, } got_it: if (p.min_segno != NULL_SEGNO) { - *result = (p.min_segno / p.ofs_unit) * p.ofs_unit; if (p.alloc_mode == LFS) { - int i; - for (i = 0; i < p.ofs_unit; i++) - set_bit(*result + i, - dirty_i->victim_segmap[gc_type]); + secno = GET_SECNO(sbi, p.min_segno); + if (gc_type == FG_GC) + sbi->cur_victim_sec = secno; + else + set_bit(secno, dirty_i->victim_secmap); } + *result = (p.min_segno / p.ofs_unit) * p.ofs_unit; + + trace_f2fs_get_victim(sbi->sb, type, gc_type, &p, + sbi->cur_victim_sec, + prefree_segments(sbi), free_segments(sbi)); } mutex_unlock(&dirty_i->seglist_lock); @@ -381,6 +385,7 @@ static void gc_node_segment(struct f2fs_sb_info *sbi, next_step: entry = sum; + for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { nid_t nid = le32_to_cpu(entry->nid); struct page *node_page; @@ -401,11 +406,18 @@ next_step: continue; /* set page dirty and write it */ - if (!PageWriteback(node_page)) + if (gc_type == FG_GC) { + f2fs_submit_bio(sbi, NODE, true); + wait_on_page_writeback(node_page); set_page_dirty(node_page); + } else { + if (!PageWriteback(node_page)) + set_page_dirty(node_page); + } f2fs_put_page(node_page, 1); stat_inc_node_blk_count(sbi, 1); } + if (initial) { initial = false; goto next_step; @@ -418,6 +430,13 @@ next_step: .for_reclaim = 0, }; sync_node_pages(sbi, 0, &wbc); + + /* + * In the case of FG_GC, it'd be better to reclaim this victim + * completely. + */ + if (get_valid_blocks(sbi, segno, 1) != 0) + goto next_step; } } @@ -481,21 +500,19 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, static void move_data_page(struct inode *inode, struct page *page, int gc_type) { - if (page->mapping != inode->i_mapping) - goto out; - - if (inode != page->mapping->host) - goto out; - - if (PageWriteback(page)) - goto out; - if (gc_type == BG_GC) { + if (PageWriteback(page)) + goto out; set_page_dirty(page); set_cold_data(page); } else { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - mutex_lock_op(sbi, DATA_WRITE); + + if (PageWriteback(page)) { + f2fs_submit_bio(sbi, DATA, true); + wait_on_page_writeback(page); + } + if (clear_page_dirty_for_io(page) && S_ISDIR(inode->i_mode)) { dec_page_count(sbi, F2FS_DIRTY_DENTS); @@ -503,7 +520,6 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type) } set_cold_data(page); do_write_data_page(page); - mutex_unlock_op(sbi, DATA_WRITE); clear_cold_data(page); } out: @@ -530,6 +546,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, next_step: entry = sum; + for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { struct page *data_page; struct inode *inode; @@ -567,7 +584,7 @@ next_step: continue; data_page = find_data_page(inode, - start_bidx + ofs_in_node); + start_bidx + ofs_in_node, false); if (IS_ERR(data_page)) goto next_iput; @@ -588,11 +605,22 @@ next_step: next_iput: iput(inode); } + if (++phase < 4) goto next_step; - if (gc_type == FG_GC) + if (gc_type == FG_GC) { f2fs_submit_bio(sbi, DATA, true); + + /* + * In the case of FG_GC, it'd be better to reclaim this victim + * completely. + */ + if (get_valid_blocks(sbi, segno, 1) != 0) { + phase = 2; + goto next_step; + } + } } static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, @@ -611,18 +639,15 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, { struct page *sum_page; struct f2fs_summary_block *sum; + struct blk_plug plug; /* read segment summary of victim */ sum_page = get_sum_page(sbi, segno); if (IS_ERR(sum_page)) return; - /* - * CP needs to lock sum_page. In this time, we don't need - * to lock this page, because this summary page is not gone anywhere. - * Also, this page is not gonna be updated before GC is done. - */ - unlock_page(sum_page); + blk_start_plug(&plug); + sum = page_address(sum_page); switch (GET_SUM_TYPE((&sum->footer))) { @@ -633,10 +658,12 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, gc_data_segment(sbi, sum->entries, ilist, segno, gc_type); break; } + blk_finish_plug(&plug); + stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer))); stat_inc_call_count(sbi->stat_info); - f2fs_put_page(sum_page, 0); + f2fs_put_page(sum_page, 1); } int f2fs_gc(struct f2fs_sb_info *sbi) @@ -652,8 +679,10 @@ gc_more: if (!(sbi->sb->s_flags & MS_ACTIVE)) goto stop; - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { gc_type = FG_GC; + write_checkpoint(sbi, false); + } if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) goto stop; @@ -662,9 +691,11 @@ gc_more: for (i = 0; i < sbi->segs_per_sec; i++) do_garbage_collect(sbi, segno + i, &ilist, gc_type); - if (gc_type == FG_GC && - get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0) + if (gc_type == FG_GC) { + sbi->cur_victim_sec = NULL_SEGNO; nfree++; + WARN_ON(get_valid_blocks(sbi, segno, sbi->segs_per_sec)); + } if (has_not_enough_free_secs(sbi, nfree)) goto gc_more; diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 30b2db003acd..2c6a6bd08322 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -13,9 +13,9 @@ * whether IO subsystem is idle * or not */ -#define GC_THREAD_MIN_SLEEP_TIME 10000 /* milliseconds */ -#define GC_THREAD_MAX_SLEEP_TIME 30000 -#define GC_THREAD_NOGC_SLEEP_TIME 10000 +#define GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */ +#define GC_THREAD_MAX_SLEEP_TIME 60000 +#define GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */ #define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */ #define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */ @@ -58,6 +58,9 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi) static inline long increase_sleep_time(long wait) { + if (wait == GC_THREAD_NOGC_SLEEP_TIME) + return wait; + wait += GC_THREAD_MIN_SLEEP_TIME; if (wait > GC_THREAD_MAX_SLEEP_TIME) wait = GC_THREAD_MAX_SLEEP_TIME; @@ -66,6 +69,9 @@ static inline long increase_sleep_time(long wait) static inline long decrease_sleep_time(long wait) { + if (wait == GC_THREAD_NOGC_SLEEP_TIME) + wait = GC_THREAD_MAX_SLEEP_TIME; + wait -= GC_THREAD_MIN_SLEEP_TIME; if (wait <= GC_THREAD_MIN_SLEEP_TIME) wait = GC_THREAD_MIN_SLEEP_TIME; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index ddae412d30c8..91ac7f9d88ee 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -16,6 +16,8 @@ #include "f2fs.h" #include "node.h" +#include <trace/events/f2fs.h> + void f2fs_set_inode_flags(struct inode *inode) { unsigned int flags = F2FS_I(inode)->i_flags; @@ -44,7 +46,11 @@ static int do_read_inode(struct inode *inode) struct f2fs_inode *ri; /* Check if ino is within scope */ - check_nid_range(sbi, inode->i_ino); + if (check_nid_range(sbi, inode->i_ino)) { + f2fs_msg(inode->i_sb, KERN_ERR, "bad inode number: %lu", + (unsigned long) inode->i_ino); + return -EINVAL; + } node_page = get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) @@ -76,7 +82,6 @@ static int do_read_inode(struct inode *inode) fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid); fi->i_flags = le32_to_cpu(ri->i_flags); fi->flags = 0; - fi->data_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver) - 1; fi->i_advise = ri->i_advise; fi->i_pino = le32_to_cpu(ri->i_pino); get_extent_info(&fi->ext, ri->i_ext); @@ -88,13 +93,16 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) { struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; - int ret; + int ret = 0; inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + + if (!(inode->i_state & I_NEW)) { + trace_f2fs_iget(inode); return inode; + } if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi)) goto make_now; @@ -136,11 +144,12 @@ make_now: goto bad_inode; } unlock_new_inode(inode); - + trace_f2fs_iget(inode); return inode; bad_inode: iget_failed(inode); + trace_f2fs_iget_exit(inode, ret); return ERR_PTR(ret); } @@ -192,47 +201,51 @@ void update_inode(struct inode *inode, struct page *node_page) set_page_dirty(node_page); } -int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) +int update_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *node_page; - bool need_lock = false; - - if (inode->i_ino == F2FS_NODE_INO(sbi) || - inode->i_ino == F2FS_META_INO(sbi)) - return 0; - - if (wbc) - f2fs_balance_fs(sbi); node_page = get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) return PTR_ERR(node_page); - if (!PageDirty(node_page)) { - need_lock = true; - f2fs_put_page(node_page, 1); - mutex_lock(&sbi->write_inode); - node_page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(node_page)) { - mutex_unlock(&sbi->write_inode); - return PTR_ERR(node_page); - } - } update_inode(inode, node_page); f2fs_put_page(node_page, 1); - if (need_lock) - mutex_unlock(&sbi->write_inode); return 0; } +int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + int ret, ilock; + + if (inode->i_ino == F2FS_NODE_INO(sbi) || + inode->i_ino == F2FS_META_INO(sbi)) + return 0; + + if (wbc) + f2fs_balance_fs(sbi); + + /* + * We need to lock here to prevent from producing dirty node pages + * during the urgent cleaning time when runing out of free sections. + */ + ilock = mutex_lock_op(sbi); + ret = update_inode_page(inode); + mutex_unlock_op(sbi, ilock); + return ret; +} + /* * Called at the last iput() if i_nlink is zero */ void f2fs_evict_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + int ilock; + trace_f2fs_evict_inode(inode); truncate_inode_pages(&inode->i_data, 0); if (inode->i_ino == F2FS_NODE_INO(sbi) || @@ -252,7 +265,10 @@ void f2fs_evict_inode(struct inode *inode) if (F2FS_HAS_BLOCKS(inode)) f2fs_truncate(inode); + ilock = mutex_lock_op(sbi); remove_inode_page(inode); + mutex_unlock_op(sbi, ilock); + sb_end_intwrite(inode->i_sb); no_delete: clear_inode(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 1a49b881bac0..47abc9722b17 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -15,8 +15,10 @@ #include <linux/ctype.h> #include "f2fs.h" +#include "node.h" #include "xattr.h" #include "acl.h" +#include <trace/events/f2fs.h> static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) { @@ -25,19 +27,19 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) nid_t ino; struct inode *inode; bool nid_free = false; - int err; + int err, ilock; inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); - mutex_lock_op(sbi, NODE_NEW); + ilock = mutex_lock_op(sbi); if (!alloc_nid(sbi, &ino)) { - mutex_unlock_op(sbi, NODE_NEW); + mutex_unlock_op(sbi, ilock); err = -ENOSPC; goto fail; } - mutex_unlock_op(sbi, NODE_NEW); + mutex_unlock_op(sbi, ilock); inode->i_uid = current_fsuid(); @@ -61,7 +63,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) nid_free = true; goto out; } - + trace_f2fs_new_inode(inode, 0); mark_inode_dirty(inode); return inode; @@ -69,6 +71,8 @@ out: clear_nlink(inode); unlock_new_inode(inode); fail: + trace_f2fs_new_inode(inode, err); + make_bad_inode(inode); iput(inode); if (nid_free) alloc_nid_failed(sbi, ino); @@ -82,7 +86,7 @@ static int is_multimedia_file(const unsigned char *s, const char *sub) int ret; if (sublen > slen) - return 1; + return 0; ret = memcmp(s + slen - sublen, sub, sublen); if (ret) { /* compare upper case */ @@ -90,16 +94,16 @@ static int is_multimedia_file(const unsigned char *s, const char *sub) char upper_sub[8]; for (i = 0; i < sublen && i < sizeof(upper_sub); i++) upper_sub[i] = toupper(sub[i]); - return memcmp(s + slen - sublen, upper_sub, sublen); + return !memcmp(s + slen - sublen, upper_sub, sublen); } - return ret; + return !ret; } /* * Set multimedia files as cold files for hot/cold data separation */ -static inline void set_cold_file(struct f2fs_sb_info *sbi, struct inode *inode, +static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode, const unsigned char *name) { int i; @@ -107,8 +111,8 @@ static inline void set_cold_file(struct f2fs_sb_info *sbi, struct inode *inode, int count = le32_to_cpu(sbi->raw_super->extension_count); for (i = 0; i < count; i++) { - if (!is_multimedia_file(name, extlist[i])) { - F2FS_I(inode)->i_advise |= FADVISE_COLD_BIT; + if (is_multimedia_file(name, extlist[i])) { + set_cold_file(inode); break; } } @@ -121,7 +125,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; nid_t ino = 0; - int err; + int err, ilock; f2fs_balance_fs(sbi); @@ -130,14 +134,16 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, return PTR_ERR(inode); if (!test_opt(sbi, DISABLE_EXT_IDENTIFY)) - set_cold_file(sbi, inode, dentry->d_name.name); + set_cold_files(sbi, inode, dentry->d_name.name); inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; ino = inode->i_ino; + ilock = mutex_lock_op(sbi); err = f2fs_add_link(dentry, inode); + mutex_unlock_op(sbi, ilock); if (err) goto out; @@ -150,6 +156,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, out: clear_nlink(inode); unlock_new_inode(inode); + make_bad_inode(inode); iput(inode); alloc_nid_failed(sbi, ino); return err; @@ -161,7 +168,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, struct inode *inode = old_dentry->d_inode; struct super_block *sb = dir->i_sb; struct f2fs_sb_info *sbi = F2FS_SB(sb); - int err; + int err, ilock; f2fs_balance_fs(sbi); @@ -169,14 +176,23 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, atomic_inc(&inode->i_count); set_inode_flag(F2FS_I(inode), FI_INC_LINK); + ilock = mutex_lock_op(sbi); err = f2fs_add_link(dentry, inode); + mutex_unlock_op(sbi, ilock); if (err) goto out; + /* + * This file should be checkpointed during fsync. + * We lost i_pino from now on. + */ + set_cp_file(inode); + d_instantiate(dentry, inode); return 0; out: clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + make_bad_inode(inode); iput(inode); return err; } @@ -197,7 +213,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, struct f2fs_dir_entry *de; struct page *page; - if (dentry->d_name.len > F2FS_MAX_NAME_LEN) + if (dentry->d_name.len > F2FS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); de = f2fs_find_entry(dir, &dentry->d_name, &page); @@ -222,7 +238,9 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) struct f2fs_dir_entry *de; struct page *page; int err = -ENOENT; + int ilock; + trace_f2fs_unlink_enter(dir, dentry); f2fs_balance_fs(sbi); de = f2fs_find_entry(dir, &dentry->d_name, &page); @@ -236,11 +254,14 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) goto fail; } + ilock = mutex_lock_op(sbi); f2fs_delete_entry(de, page, inode); + mutex_unlock_op(sbi, ilock); /* In order to evict this inode, we set it dirty */ mark_inode_dirty(inode); fail: + trace_f2fs_unlink_exit(inode, err); return err; } @@ -251,7 +272,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; size_t symlen = strlen(symname) + 1; - int err; + int err, ilock; f2fs_balance_fs(sbi); @@ -262,7 +283,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &f2fs_symlink_inode_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; + ilock = mutex_lock_op(sbi); err = f2fs_add_link(dentry, inode); + mutex_unlock_op(sbi, ilock); if (err) goto out; @@ -275,6 +298,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, out: clear_nlink(inode); unlock_new_inode(inode); + make_bad_inode(inode); iput(inode); alloc_nid_failed(sbi, inode->i_ino); return err; @@ -284,7 +308,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct inode *inode; - int err; + int err, ilock; f2fs_balance_fs(sbi); @@ -298,7 +322,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); set_inode_flag(F2FS_I(inode), FI_INC_LINK); + ilock = mutex_lock_op(sbi); err = f2fs_add_link(dentry, inode); + mutex_unlock_op(sbi, ilock); if (err) goto out_fail; @@ -313,6 +339,7 @@ out_fail: clear_inode_flag(F2FS_I(inode), FI_INC_LINK); clear_nlink(inode); unlock_new_inode(inode); + make_bad_inode(inode); iput(inode); alloc_nid_failed(sbi, inode->i_ino); return err; @@ -333,6 +360,7 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; int err = 0; + int ilock; if (!new_valid_dev(rdev)) return -EINVAL; @@ -346,7 +374,9 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &f2fs_special_inode_operations; + ilock = mutex_lock_op(sbi); err = f2fs_add_link(dentry, inode); + mutex_unlock_op(sbi, ilock); if (err) goto out; @@ -357,6 +387,7 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, out: clear_nlink(inode); unlock_new_inode(inode); + make_bad_inode(inode); iput(inode); alloc_nid_failed(sbi, inode->i_ino); return err; @@ -374,7 +405,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, struct f2fs_dir_entry *old_dir_entry = NULL; struct f2fs_dir_entry *old_entry; struct f2fs_dir_entry *new_entry; - int err = -ENOENT; + int err = -ENOENT, ilock = -1; f2fs_balance_fs(sbi); @@ -389,7 +420,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_old; } - mutex_lock_op(sbi, RENAME); + ilock = mutex_lock_op(sbi); if (new_inode) { struct page *new_page; @@ -412,7 +443,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, drop_nlink(new_inode); if (!new_inode->i_nlink) add_orphan_inode(sbi, new_inode->i_ino); - f2fs_write_inode(new_inode, NULL); + update_inode_page(new_inode); } else { err = f2fs_add_link(new_dentry, old_inode); if (err) @@ -420,12 +451,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dir_entry) { inc_nlink(new_dir); - f2fs_write_inode(new_dir, NULL); + update_inode_page(new_dir); } } old_inode->i_ctime = CURRENT_TIME; - set_inode_flag(F2FS_I(old_inode), FI_NEED_CP); mark_inode_dirty(old_inode); f2fs_delete_entry(old_entry, old_page, NULL); @@ -439,10 +469,10 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_put_page(old_dir_page, 0); } drop_nlink(old_dir); - f2fs_write_inode(old_dir, NULL); + update_inode_page(old_dir); } - mutex_unlock_op(sbi, RENAME); + mutex_unlock_op(sbi, ilock); return 0; out_dir: @@ -450,7 +480,7 @@ out_dir: kunmap(old_dir_page); f2fs_put_page(old_dir_page, 0); } - mutex_unlock_op(sbi, RENAME); + mutex_unlock_op(sbi, ilock); out_old: kunmap(old_page); f2fs_put_page(old_page, 0); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e275218904ed..3df43b4efd89 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -19,6 +19,7 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include <trace/events/f2fs.h> static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; @@ -88,10 +89,13 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) { struct address_space *mapping = sbi->meta_inode->i_mapping; struct f2fs_nm_info *nm_i = NM_I(sbi); + struct blk_plug plug; struct page *page; pgoff_t index; int i; + blk_start_plug(&plug); + for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) { if (nid >= nm_i->max_nid) nid = 0; @@ -100,12 +104,16 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid) page = grab_cache_page(mapping, index); if (!page) continue; - if (f2fs_readpage(sbi, page, index, READ)) { + if (PageUptodate(page)) { f2fs_put_page(page, 1); continue; } + if (f2fs_readpage(sbi, page, index, READ)) + continue; + f2fs_put_page(page, 0); } + blk_finish_plug(&plug); } static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) @@ -236,7 +244,7 @@ static int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) { struct f2fs_nm_info *nm_i = NM_I(sbi); - if (nm_i->nat_cnt < 2 * NM_WOUT_THRESHOLD) + if (nm_i->nat_cnt <= NM_WOUT_THRESHOLD) return 0; write_lock(&nm_i->nat_tree_lock); @@ -320,15 +328,14 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4]) noffset[0] = 0; if (block < direct_index) { - offset[n++] = block; - level = 0; + offset[n] = block; goto got; } block -= direct_index; if (block < direct_blks) { offset[n++] = NODE_DIR1_BLOCK; noffset[n] = 1; - offset[n++] = block; + offset[n] = block; level = 1; goto got; } @@ -336,7 +343,7 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4]) if (block < direct_blks) { offset[n++] = NODE_DIR2_BLOCK; noffset[n] = 2; - offset[n++] = block; + offset[n] = block; level = 1; goto got; } @@ -346,7 +353,7 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4]) noffset[n] = 3; offset[n++] = block / direct_blks; noffset[n] = 4 + offset[n - 1]; - offset[n++] = block % direct_blks; + offset[n] = block % direct_blks; level = 2; goto got; } @@ -356,7 +363,7 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4]) noffset[n] = 4 + dptrs_per_blk; offset[n++] = block / direct_blks; noffset[n] = 5 + dptrs_per_blk + offset[n - 1]; - offset[n++] = block % direct_blks; + offset[n] = block % direct_blks; level = 2; goto got; } @@ -371,7 +378,7 @@ static int get_node_path(long block, int offset[4], unsigned int noffset[4]) noffset[n] = 7 + (dptrs_per_blk * 2) + offset[n - 2] * (dptrs_per_blk + 1) + offset[n - 1]; - offset[n++] = block % direct_blks; + offset[n] = block % direct_blks; level = 3; goto got; } else { @@ -383,8 +390,11 @@ got: /* * Caller should call f2fs_put_dnode(dn). + * Also, it should grab and release a mutex by calling mutex_lock_op() and + * mutex_unlock_op() only if ro is not set RDONLY_NODE. + * In the case of RDONLY_NODE, we don't need to care about mutex. */ -int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro) +int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) { struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); struct page *npage[4]; @@ -403,7 +413,8 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro) return PTR_ERR(npage[0]); parent = npage[0]; - nids[1] = get_nid(parent, offset[0], true); + if (level != 0) + nids[1] = get_nid(parent, offset[0], true); dn->inode_page = npage[0]; dn->inode_page_locked = true; @@ -411,12 +422,9 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro) for (i = 1; i <= level; i++) { bool done = false; - if (!nids[i] && !ro) { - mutex_lock_op(sbi, NODE_NEW); - + if (!nids[i] && mode == ALLOC_NODE) { /* alloc new node */ if (!alloc_nid(sbi, &(nids[i]))) { - mutex_unlock_op(sbi, NODE_NEW); err = -ENOSPC; goto release_pages; } @@ -425,16 +433,14 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro) npage[i] = new_node_page(dn, noffset[i]); if (IS_ERR(npage[i])) { alloc_nid_failed(sbi, nids[i]); - mutex_unlock_op(sbi, NODE_NEW); err = PTR_ERR(npage[i]); goto release_pages; } set_nid(parent, offset[i - 1], nids[i], i == 1); alloc_nid_done(sbi, nids[i]); - mutex_unlock_op(sbi, NODE_NEW); done = true; - } else if (ro && i == level && level > 1) { + } else if (mode == LOOKUP_NODE_RA && i == level && level > 1) { npage[i] = get_node_page_ra(parent, offset[i - 1]); if (IS_ERR(npage[i])) { err = PTR_ERR(npage[i]); @@ -507,6 +513,7 @@ invalidate: f2fs_put_page(dn->node_page, 1); dn->node_page = NULL; + trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr); } static int truncate_dnode(struct dnode_of_data *dn) @@ -547,9 +554,13 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, if (dn->nid == 0) return NIDS_PER_BLOCK + 1; + trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr); + page = get_node_page(sbi, dn->nid); - if (IS_ERR(page)) + if (IS_ERR(page)) { + trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page)); return PTR_ERR(page); + } rn = (struct f2fs_node *)page_address(page); if (depth < 3) { @@ -591,10 +602,12 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, } else { f2fs_put_page(page, 1); } + trace_f2fs_truncate_nodes_exit(dn->inode, freed); return freed; out_err: f2fs_put_page(page, 1); + trace_f2fs_truncate_nodes_exit(dn->inode, ret); return ret; } @@ -649,6 +662,9 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, fail: for (i = depth - 3; i >= 0; i--) f2fs_put_page(pages[i], 1); + + trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err); + return err; } @@ -658,6 +674,7 @@ fail: int truncate_inode_blocks(struct inode *inode, pgoff_t from) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct address_space *node_mapping = sbi->node_inode->i_mapping; int err = 0, cont = 1; int level, offset[4], noffset[4]; unsigned int nofs = 0; @@ -665,11 +682,15 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) struct dnode_of_data dn; struct page *page; - level = get_node_path(from, offset, noffset); + trace_f2fs_truncate_inode_blocks_enter(inode, from); + level = get_node_path(from, offset, noffset); +restart: page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) + if (IS_ERR(page)) { + trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page)); return PTR_ERR(page); + } set_new_dnode(&dn, inode, page, NULL, 0); unlock_page(page); @@ -728,6 +749,10 @@ skip_partial: if (offset[1] == 0 && rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]) { lock_page(page); + if (page->mapping != node_mapping) { + f2fs_put_page(page, 1); + goto restart; + } wait_on_page_writeback(page); rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; set_page_dirty(page); @@ -739,9 +764,14 @@ skip_partial: } fail: f2fs_put_page(page, 0); + trace_f2fs_truncate_inode_blocks_exit(inode, err); return err > 0 ? 0 : err; } +/* + * Caller should grab and release a mutex by calling mutex_lock_op() and + * mutex_unlock_op(). + */ int remove_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); @@ -749,21 +779,16 @@ int remove_inode_page(struct inode *inode) nid_t ino = inode->i_ino; struct dnode_of_data dn; - mutex_lock_op(sbi, NODE_TRUNC); page = get_node_page(sbi, ino); - if (IS_ERR(page)) { - mutex_unlock_op(sbi, NODE_TRUNC); + if (IS_ERR(page)) return PTR_ERR(page); - } if (F2FS_I(inode)->i_xattr_nid) { nid_t nid = F2FS_I(inode)->i_xattr_nid; struct page *npage = get_node_page(sbi, nid); - if (IS_ERR(npage)) { - mutex_unlock_op(sbi, NODE_TRUNC); + if (IS_ERR(npage)) return PTR_ERR(npage); - } F2FS_I(inode)->i_xattr_nid = 0; set_new_dnode(&dn, inode, page, npage, nid); @@ -775,23 +800,18 @@ int remove_inode_page(struct inode *inode) BUG_ON(inode->i_blocks != 0 && inode->i_blocks != 1); set_new_dnode(&dn, inode, page, page, ino); truncate_node(&dn); - - mutex_unlock_op(sbi, NODE_TRUNC); return 0; } int new_inode_page(struct inode *inode, const struct qstr *name) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *page; struct dnode_of_data dn; /* allocate inode page for new inode */ set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); - mutex_lock_op(sbi, NODE_NEW); page = new_node_page(&dn, 0); init_dent_inode(name, page); - mutex_unlock_op(sbi, NODE_NEW); if (IS_ERR(page)) return PTR_ERR(page); f2fs_put_page(page, 1); @@ -844,6 +864,12 @@ fail: return ERR_PTR(err); } +/* + * Caller should do after getting the following values. + * 0: f2fs_put_page(page, 0) + * LOCKED_PAGE: f2fs_put_page(page, 1) + * error: nothing + */ static int read_node_page(struct page *page, int type) { struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); @@ -851,8 +877,14 @@ static int read_node_page(struct page *page, int type) get_node_info(sbi, page->index, &ni); - if (ni.blk_addr == NULL_ADDR) + if (ni.blk_addr == NULL_ADDR) { + f2fs_put_page(page, 1); return -ENOENT; + } + + if (PageUptodate(page)) + return LOCKED_PAGE; + return f2fs_readpage(sbi, page, ni.blk_addr, type); } @@ -863,40 +895,53 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) { struct address_space *mapping = sbi->node_inode->i_mapping; struct page *apage; + int err; apage = find_get_page(mapping, nid); - if (apage && PageUptodate(apage)) - goto release_out; + if (apage && PageUptodate(apage)) { + f2fs_put_page(apage, 0); + return; + } f2fs_put_page(apage, 0); apage = grab_cache_page(mapping, nid); if (!apage) return; - if (read_node_page(apage, READA)) - unlock_page(apage); - -release_out: - f2fs_put_page(apage, 0); + err = read_node_page(apage, READA); + if (err == 0) + f2fs_put_page(apage, 0); + else if (err == LOCKED_PAGE) + f2fs_put_page(apage, 1); return; } struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) { - int err; - struct page *page; struct address_space *mapping = sbi->node_inode->i_mapping; - + struct page *page; + int err; +repeat: page = grab_cache_page(mapping, nid); if (!page) return ERR_PTR(-ENOMEM); err = read_node_page(page, READ_SYNC); - if (err) { - f2fs_put_page(page, 1); + if (err < 0) return ERR_PTR(err); - } + else if (err == LOCKED_PAGE) + goto got_it; + lock_page(page); + if (!PageUptodate(page)) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } + if (page->mapping != mapping) { + f2fs_put_page(page, 1); + goto repeat; + } +got_it: BUG_ON(nid != nid_of_node(page)); mark_page_accessed(page); return page; @@ -910,31 +955,27 @@ struct page *get_node_page_ra(struct page *parent, int start) { struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb); struct address_space *mapping = sbi->node_inode->i_mapping; - int i, end; - int err = 0; - nid_t nid; + struct blk_plug plug; struct page *page; + int err, i, end; + nid_t nid; /* First, try getting the desired direct node. */ nid = get_nid(parent, start, false); if (!nid) return ERR_PTR(-ENOENT); - - page = find_get_page(mapping, nid); - if (page && PageUptodate(page)) - goto page_hit; - f2fs_put_page(page, 0); - repeat: page = grab_cache_page(mapping, nid); if (!page) return ERR_PTR(-ENOMEM); - err = read_node_page(page, READA); - if (err) { - f2fs_put_page(page, 1); + err = read_node_page(page, READ_SYNC); + if (err < 0) return ERR_PTR(err); - } + else if (err == LOCKED_PAGE) + goto page_hit; + + blk_start_plug(&plug); /* Then, try readahead for siblings of the desired node */ end = start + MAX_RA_NODE; @@ -946,18 +987,19 @@ repeat: ra_node_page(sbi, nid); } -page_hit: - lock_page(page); - if (PageError(page)) { - f2fs_put_page(page, 1); - return ERR_PTR(-EIO); - } + blk_finish_plug(&plug); - /* Has the page been truncated? */ + lock_page(page); if (page->mapping != mapping) { f2fs_put_page(page, 1); goto repeat; } +page_hit: + if (!PageUptodate(page)) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } + mark_page_accessed(page); return page; } @@ -972,7 +1014,7 @@ void sync_inode_page(struct dnode_of_data *dn) if (!dn->inode_page_locked) unlock_page(dn->inode_page); } else { - f2fs_write_inode(dn->inode, NULL); + update_inode_page(dn->inode); } } @@ -1087,17 +1129,8 @@ static int f2fs_write_node_page(struct page *page, block_t new_addr; struct node_info ni; - if (wbc->for_reclaim) { - dec_page_count(sbi, F2FS_DIRTY_NODES); - wbc->pages_skipped++; - set_page_dirty(page); - return AOP_WRITEPAGE_ACTIVATE; - } - wait_on_page_writeback(page); - mutex_lock_op(sbi, NODE_WRITE); - /* get old block addr of this node page */ nid = nid_of_node(page); BUG_ON(page->index != nid); @@ -1105,17 +1138,25 @@ static int f2fs_write_node_page(struct page *page, get_node_info(sbi, nid, &ni); /* This page is already truncated */ - if (ni.blk_addr == NULL_ADDR) + if (ni.blk_addr == NULL_ADDR) { + dec_page_count(sbi, F2FS_DIRTY_NODES); + unlock_page(page); return 0; + } - set_page_writeback(page); + if (wbc->for_reclaim) { + dec_page_count(sbi, F2FS_DIRTY_NODES); + wbc->pages_skipped++; + set_page_dirty(page); + return AOP_WRITEPAGE_ACTIVATE; + } - /* insert node offset */ + mutex_lock(&sbi->node_write); + set_page_writeback(page); write_node_page(sbi, page, nid, ni.blk_addr, &new_addr); set_node_addr(sbi, &ni, new_addr); dec_page_count(sbi, F2FS_DIRTY_NODES); - - mutex_unlock_op(sbi, NODE_WRITE); + mutex_unlock(&sbi->node_write); unlock_page(page); return 0; } @@ -1130,12 +1171,11 @@ static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); - struct block_device *bdev = sbi->sb->s_bdev; long nr_to_write = wbc->nr_to_write; /* First check balancing cached NAT entries */ if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK)) { - write_checkpoint(sbi, false); + f2fs_sync_fs(sbi->sb, true); return 0; } @@ -1144,10 +1184,9 @@ static int f2fs_write_node_pages(struct address_space *mapping, return 0; /* if mounting is failed, skip writing node pages */ - wbc->nr_to_write = bio_get_nr_vecs(bdev); + wbc->nr_to_write = max_hw_blocks(sbi); sync_node_pages(sbi, 0, wbc); - wbc->nr_to_write = nr_to_write - - (bio_get_nr_vecs(bdev) - wbc->nr_to_write); + wbc->nr_to_write = nr_to_write - (max_hw_blocks(sbi) - wbc->nr_to_write); return 0; } @@ -1178,7 +1217,7 @@ static void f2fs_invalidate_node_page(struct page *page, unsigned long offset) static int f2fs_release_node_page(struct page *page, gfp_t wait) { ClearPagePrivate(page); - return 0; + return 1; } /* @@ -1195,14 +1234,13 @@ const struct address_space_operations f2fs_node_aops = { static struct free_nid *__lookup_free_nid_list(nid_t n, struct list_head *head) { struct list_head *this; - struct free_nid *i = NULL; + struct free_nid *i; list_for_each(this, head) { i = list_entry(this, struct free_nid, list); if (i->nid == n) - break; - i = NULL; + return i; } - return i; + return NULL; } static void __del_from_free_nid_list(struct free_nid *i) @@ -1211,11 +1249,29 @@ static void __del_from_free_nid_list(struct free_nid *i) kmem_cache_free(free_nid_slab, i); } -static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) +static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) { struct free_nid *i; + struct nat_entry *ne; + bool allocated = false; if (nm_i->fcnt > 2 * MAX_FREE_NIDS) + return -1; + + /* 0 nid should not be used */ + if (nid == 0) + return 0; + + if (!build) + goto retry; + + /* do not add allocated nids */ + read_lock(&nm_i->nat_tree_lock); + ne = __lookup_nat_cache(nm_i, nid); + if (ne && nat_get_blkaddr(ne) != NULL_ADDR) + allocated = true; + read_unlock(&nm_i->nat_tree_lock); + if (allocated) return 0; retry: i = kmem_cache_alloc(free_nid_slab, GFP_NOFS); @@ -1250,63 +1306,59 @@ static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) spin_unlock(&nm_i->free_nid_list_lock); } -static int scan_nat_page(struct f2fs_nm_info *nm_i, +static void scan_nat_page(struct f2fs_nm_info *nm_i, struct page *nat_page, nid_t start_nid) { struct f2fs_nat_block *nat_blk = page_address(nat_page); block_t blk_addr; - int fcnt = 0; int i; - /* 0 nid should not be used */ - if (start_nid == 0) - ++start_nid; - i = start_nid % NAT_ENTRY_PER_BLOCK; for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) { - blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); + + if (start_nid >= nm_i->max_nid) + break; + + blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); BUG_ON(blk_addr == NEW_ADDR); - if (blk_addr == NULL_ADDR) - fcnt += add_free_nid(nm_i, start_nid); + if (blk_addr == NULL_ADDR) { + if (add_free_nid(nm_i, start_nid, true) < 0) + break; + } } - return fcnt; } static void build_free_nids(struct f2fs_sb_info *sbi) { - struct free_nid *fnid, *next_fnid; struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_summary_block *sum = curseg->sum_blk; - nid_t nid = 0; - bool is_cycled = false; - int fcnt = 0; - int i; + int i = 0; + nid_t nid = nm_i->next_scan_nid; - nid = nm_i->next_scan_nid; - nm_i->init_scan_nid = nid; + /* Enough entries */ + if (nm_i->fcnt > NAT_ENTRY_PER_BLOCK) + return; + /* readahead nat pages to be scanned */ ra_nat_pages(sbi, nid); while (1) { struct page *page = get_current_nat_page(sbi, nid); - fcnt += scan_nat_page(nm_i, page, nid); + scan_nat_page(nm_i, page, nid); f2fs_put_page(page, 1); nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK)); - - if (nid >= nm_i->max_nid) { + if (nid >= nm_i->max_nid) nid = 0; - is_cycled = true; - } - if (fcnt > MAX_FREE_NIDS) - break; - if (is_cycled && nm_i->init_scan_nid <= nid) + + if (i++ == FREE_NID_PAGES) break; } + /* go to the next free nat pages to find free nids abundantly */ nm_i->next_scan_nid = nid; /* find free nids from current sum_pages */ @@ -1315,22 +1367,11 @@ static void build_free_nids(struct f2fs_sb_info *sbi) block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr); nid = le32_to_cpu(nid_in_journal(sum, i)); if (addr == NULL_ADDR) - add_free_nid(nm_i, nid); + add_free_nid(nm_i, nid, true); else remove_free_nid(nm_i, nid); } mutex_unlock(&curseg->curseg_mutex); - - /* remove the free nids from current allocated nids */ - list_for_each_entry_safe(fnid, next_fnid, &nm_i->free_nid_list, list) { - struct nat_entry *ne; - - read_lock(&nm_i->nat_tree_lock); - ne = __lookup_nat_cache(nm_i, fnid->nid); - if (ne && nat_get_blkaddr(ne) != NULL_ADDR) - remove_free_nid(nm_i, fnid->nid); - read_unlock(&nm_i->nat_tree_lock); - } } /* @@ -1344,41 +1385,36 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct free_nid *i = NULL; struct list_head *this; retry: - mutex_lock(&nm_i->build_lock); - if (!nm_i->fcnt) { - /* scan NAT in order to build free nid list */ - build_free_nids(sbi); - if (!nm_i->fcnt) { - mutex_unlock(&nm_i->build_lock); - return false; - } - } - mutex_unlock(&nm_i->build_lock); + if (sbi->total_valid_node_count + 1 >= nm_i->max_nid) + return false; - /* - * We check fcnt again since previous check is racy as - * we didn't hold free_nid_list_lock. So other thread - * could consume all of free nids. - */ spin_lock(&nm_i->free_nid_list_lock); - if (!nm_i->fcnt) { - spin_unlock(&nm_i->free_nid_list_lock); - goto retry; - } - BUG_ON(list_empty(&nm_i->free_nid_list)); - list_for_each(this, &nm_i->free_nid_list) { - i = list_entry(this, struct free_nid, list); - if (i->state == NID_NEW) - break; - } + /* We should not use stale free nids created by build_free_nids */ + if (nm_i->fcnt && !sbi->on_build_free_nids) { + BUG_ON(list_empty(&nm_i->free_nid_list)); + list_for_each(this, &nm_i->free_nid_list) { + i = list_entry(this, struct free_nid, list); + if (i->state == NID_NEW) + break; + } - BUG_ON(i->state != NID_NEW); - *nid = i->nid; - i->state = NID_ALLOC; - nm_i->fcnt--; + BUG_ON(i->state != NID_NEW); + *nid = i->nid; + i->state = NID_ALLOC; + nm_i->fcnt--; + spin_unlock(&nm_i->free_nid_list_lock); + return true; + } spin_unlock(&nm_i->free_nid_list_lock); - return true; + + /* Let's scan nat pages and its caches to get free nids */ + mutex_lock(&nm_i->build_lock); + sbi->on_build_free_nids = 1; + build_free_nids(sbi); + sbi->on_build_free_nids = 0; + mutex_unlock(&nm_i->build_lock); + goto retry; } /* @@ -1391,10 +1427,8 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->free_nid_list_lock); i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); - if (i) { - BUG_ON(i->state != NID_ALLOC); - __del_from_free_nid_list(i); - } + BUG_ON(!i || i->state != NID_ALLOC); + __del_from_free_nid_list(i); spin_unlock(&nm_i->free_nid_list_lock); } @@ -1403,8 +1437,19 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) */ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) { - alloc_nid_done(sbi, nid); - add_free_nid(NM_I(sbi), nid); + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct free_nid *i; + + spin_lock(&nm_i->free_nid_list_lock); + i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); + BUG_ON(!i || i->state != NID_ALLOC); + if (nm_i->fcnt > 2 * MAX_FREE_NIDS) { + __del_from_free_nid_list(i); + } else { + i->state = NID_NEW; + nm_i->fcnt++; + } + spin_unlock(&nm_i->free_nid_list_lock); } void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, @@ -1475,23 +1520,24 @@ int restore_node_summary(struct f2fs_sb_info *sbi, sum_entry = &sum->entries[0]; for (i = 0; i < last_offset; i++, sum_entry++) { + /* + * In order to read next node page, + * we must clear PageUptodate flag. + */ + ClearPageUptodate(page); + if (f2fs_readpage(sbi, page, addr, READ_SYNC)) goto out; + lock_page(page); rn = (struct f2fs_node *)page_address(page); sum_entry->nid = rn->footer.nid; sum_entry->version = 0; sum_entry->ofs_in_node = 0; addr++; - - /* - * In order to read next node page, - * we must clear PageUptodate flag. - */ - ClearPageUptodate(page); } -out: unlock_page(page); +out: __free_pages(page, 0); return 0; } @@ -1614,13 +1660,11 @@ flush_now: nid_in_journal(sum, offset) = cpu_to_le32(nid); } - if (nat_get_blkaddr(ne) == NULL_ADDR) { + if (nat_get_blkaddr(ne) == NULL_ADDR && + add_free_nid(NM_I(sbi), nid, false) <= 0) { write_lock(&nm_i->nat_tree_lock); __del_from_nat_cache(nm_i, ne); write_unlock(&nm_i->nat_tree_lock); - - /* We can reuse this freed nid at this point */ - add_free_nid(NM_I(sbi), nid); } else { write_lock(&nm_i->nat_tree_lock); __clear_nat_cache_dirty(nm_i, ne); @@ -1661,19 +1705,16 @@ static int init_node_manager(struct f2fs_sb_info *sbi) spin_lock_init(&nm_i->free_nid_list_lock); rwlock_init(&nm_i->nat_tree_lock); - nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP); - nm_i->init_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); - - nm_i->nat_bitmap = kzalloc(nm_i->bitmap_size, GFP_KERNEL); - if (!nm_i->nat_bitmap) - return -ENOMEM; + nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP); version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP); if (!version_bitmap) return -EFAULT; - /* copy version bitmap */ - memcpy(nm_i->nat_bitmap, version_bitmap, nm_i->bitmap_size); + nm_i->nat_bitmap = kmemdup(version_bitmap, nm_i->bitmap_size, + GFP_KERNEL); + if (!nm_i->nat_bitmap) + return -ENOMEM; return 0; } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index afdb130f782e..0a2d72f0024d 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -29,6 +29,9 @@ /* vector size for gang look-up from nat cache that consists of radix tree */ #define NATVEC_SIZE 64 +/* return value for read_node_page */ +#define LOCKED_PAGE 1 + /* * For node information */ @@ -239,7 +242,7 @@ static inline bool IS_DNODE(struct page *node_page) return false; if (ofs >= 6 + 2 * NIDS_PER_BLOCK) { ofs -= 6 + 2 * NIDS_PER_BLOCK; - if ((long int)ofs % (NIDS_PER_BLOCK + 1)) + if (!((long int)ofs % (NIDS_PER_BLOCK + 1))) return false; } return true; @@ -277,6 +280,21 @@ static inline int is_cold_file(struct inode *inode) return F2FS_I(inode)->i_advise & FADVISE_COLD_BIT; } +static inline void set_cold_file(struct inode *inode) +{ + F2FS_I(inode)->i_advise |= FADVISE_COLD_BIT; +} + +static inline int is_cp_file(struct inode *inode) +{ + return F2FS_I(inode)->i_advise & FADVISE_CP_BIT; +} + +static inline void set_cp_file(struct inode *inode) +{ + F2FS_I(inode)->i_advise |= FADVISE_CP_BIT; +} + static inline int is_cold_data(struct page *page) { return PageChecked(page); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index b235215ac138..60c8a5097058 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -53,7 +53,7 @@ static int recover_dentry(struct page *ipage, struct inode *inode) dir = f2fs_iget(inode->i_sb, le32_to_cpu(raw_inode->i_pino)); if (IS_ERR(dir)) { - err = -EINVAL; + err = PTR_ERR(dir); goto out; } @@ -112,11 +112,14 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) while (1) { struct fsync_inode_entry *entry; - if (f2fs_readpage(sbi, page, blkaddr, READ_SYNC)) + err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC); + if (err) goto out; + lock_page(page); + if (cp_ver != cpver_of_node(page)) - goto out; + goto unlock_out; if (!is_fsync_dnode(page)) goto next; @@ -129,24 +132,23 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) FI_INC_LINK); } else { if (IS_INODE(page) && is_dent_dnode(page)) { - if (recover_inode_page(sbi, page)) { - err = -ENOMEM; - goto out; - } + err = recover_inode_page(sbi, page); + if (err) + goto unlock_out; } /* add this fsync inode to the list */ entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS); if (!entry) { err = -ENOMEM; - goto out; + goto unlock_out; } entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); if (IS_ERR(entry->inode)) { err = PTR_ERR(entry->inode); kmem_cache_free(fsync_entry_slab, entry); - goto out; + goto unlock_out; } list_add_tail(&entry->list, head); @@ -154,16 +156,20 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) } if (IS_INODE(page)) { err = recover_inode(entry->inode, page); - if (err) - goto out; + if (err == -ENOENT) { + goto next; + } else if (err) { + err = -EINVAL; + goto unlock_out; + } } next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); - ClearPageUptodate(page); } -out: +unlock_out: unlock_page(page); +out: __free_pages(page, 0); return err; } @@ -232,13 +238,15 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi, iput(inode); } -static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, +static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, block_t blkaddr) { unsigned int start, end; struct dnode_of_data dn; struct f2fs_summary sum; struct node_info ni; + int err = 0; + int ilock; start = start_bidx_of_node(ofs_of_node(page)); if (IS_INODE(page)) @@ -246,9 +254,14 @@ static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, else end = start + ADDRS_PER_BLOCK; + ilock = mutex_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); - if (get_dnode_of_data(&dn, start, 0)) - return; + + err = get_dnode_of_data(&dn, start, ALLOC_NODE); + if (err) { + mutex_unlock_op(sbi, ilock); + return err; + } wait_on_page_writeback(dn.node_page); @@ -293,14 +306,17 @@ static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr); f2fs_put_dnode(&dn); + mutex_unlock_op(sbi, ilock); + return 0; } -static void recover_data(struct f2fs_sb_info *sbi, +static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head, int type) { unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver); struct curseg_info *curseg; struct page *page; + int err = 0; block_t blkaddr; /* get node pages in the current segment */ @@ -310,23 +326,29 @@ static void recover_data(struct f2fs_sb_info *sbi, /* read node page */ page = alloc_page(GFP_NOFS | __GFP_ZERO); if (IS_ERR(page)) - return; + return -ENOMEM; + lock_page(page); while (1) { struct fsync_inode_entry *entry; - if (f2fs_readpage(sbi, page, blkaddr, READ_SYNC)) + err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC); + if (err) goto out; + lock_page(page); + if (cp_ver != cpver_of_node(page)) - goto out; + goto unlock_out; entry = get_fsync_inode(head, ino_of_node(page)); if (!entry) goto next; - do_recover_data(sbi, entry->inode, page, blkaddr); + err = do_recover_data(sbi, entry->inode, page, blkaddr); + if (err) + goto out; if (entry->blkaddr == blkaddr) { iput(entry->inode); @@ -336,28 +358,32 @@ static void recover_data(struct f2fs_sb_info *sbi, next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); - ClearPageUptodate(page); } -out: +unlock_out: unlock_page(page); +out: __free_pages(page, 0); - allocate_new_segments(sbi); + if (!err) + allocate_new_segments(sbi); + return err; } -void recover_fsync_data(struct f2fs_sb_info *sbi) +int recover_fsync_data(struct f2fs_sb_info *sbi) { struct list_head inode_list; + int err; fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", sizeof(struct fsync_inode_entry), NULL); if (unlikely(!fsync_entry_slab)) - return; + return -ENOMEM; INIT_LIST_HEAD(&inode_list); /* step #1: find fsynced inode numbers */ - if (find_fsync_dnodes(sbi, &inode_list)) + err = find_fsync_dnodes(sbi, &inode_list); + if (err) goto out; if (list_empty(&inode_list)) @@ -365,11 +391,12 @@ void recover_fsync_data(struct f2fs_sb_info *sbi) /* step #2: recover data */ sbi->por_doing = 1; - recover_data(sbi, &inode_list, CURSEG_WARM_NODE); + err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); sbi->por_doing = 0; BUG_ON(!list_empty(&inode_list)); out: destroy_fsync_dnodes(sbi, &inode_list); kmem_cache_destroy(fsync_entry_slab); write_checkpoint(sbi, false); + return err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 777f17e496e6..d8e84e49a5c3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -18,6 +18,7 @@ #include "f2fs.h" #include "segment.h" #include "node.h" +#include <trace/events/f2fs.h> /* * This function balances dirty node and dentry pages. @@ -49,9 +50,20 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, if (dirty_type == DIRTY) { struct seg_entry *sentry = get_seg_entry(sbi, segno); + enum dirty_type t = DIRTY_HOT_DATA; + dirty_type = sentry->type; + if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type])) dirty_i->nr_dirty[dirty_type]++; + + /* Only one bitmap should be set */ + for (; t <= DIRTY_COLD_NODE; t++) { + if (t == dirty_type) + continue; + if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) + dirty_i->nr_dirty[t]--; + } } } @@ -64,13 +76,16 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, dirty_i->nr_dirty[dirty_type]--; if (dirty_type == DIRTY) { - struct seg_entry *sentry = get_seg_entry(sbi, segno); - dirty_type = sentry->type; - if (test_and_clear_bit(segno, - dirty_i->dirty_segmap[dirty_type])) - dirty_i->nr_dirty[dirty_type]--; - clear_bit(segno, dirty_i->victim_segmap[FG_GC]); - clear_bit(segno, dirty_i->victim_segmap[BG_GC]); + enum dirty_type t = DIRTY_HOT_DATA; + + /* clear all the bitmaps */ + for (; t <= DIRTY_COLD_NODE; t++) + if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) + dirty_i->nr_dirty[t]--; + + if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0) + clear_bit(GET_SECNO(sbi, segno), + dirty_i->victim_secmap); } } @@ -296,13 +311,12 @@ static void write_sum_page(struct f2fs_sb_info *sbi, f2fs_put_page(page, 1); } -static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi, - int ofs_unit, int type) +static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi, int type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_segmap = dirty_i->dirty_segmap[PRE]; - unsigned int segno, next_segno, i; - int ofs = 0; + unsigned int segno; + unsigned int ofs = 0; /* * If there is not enough reserved sections, @@ -318,28 +332,46 @@ static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi, if (IS_NODESEG(type)) return NULL_SEGNO; next: - segno = find_next_bit(prefree_segmap, TOTAL_SEGS(sbi), ofs++); - ofs = ((segno / ofs_unit) * ofs_unit) + ofs_unit; + segno = find_next_bit(prefree_segmap, TOTAL_SEGS(sbi), ofs); + ofs += sbi->segs_per_sec; + if (segno < TOTAL_SEGS(sbi)) { + int i; + /* skip intermediate segments in a section */ - if (segno % ofs_unit) + if (segno % sbi->segs_per_sec) goto next; - /* skip if whole section is not prefree */ - next_segno = find_next_zero_bit(prefree_segmap, - TOTAL_SEGS(sbi), segno + 1); - if (next_segno - segno < ofs_unit) + /* skip if the section is currently used */ + if (sec_usage_check(sbi, GET_SECNO(sbi, segno))) goto next; + /* skip if whole section is not prefree */ + for (i = 1; i < sbi->segs_per_sec; i++) + if (!test_bit(segno + i, prefree_segmap)) + goto next; + /* skip if whole section was not free at the last checkpoint */ - for (i = 0; i < ofs_unit; i++) - if (get_seg_entry(sbi, segno)->ckpt_valid_blocks) + for (i = 0; i < sbi->segs_per_sec; i++) + if (get_seg_entry(sbi, segno + i)->ckpt_valid_blocks) goto next; + return segno; } return NULL_SEGNO; } +static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned int segno = curseg->segno; + struct free_segmap_info *free_i = FREE_I(sbi); + + if (segno + 1 < TOTAL_SEGS(sbi) && (segno + 1) % sbi->segs_per_sec) + return !test_bit(segno + 1, free_i->free_segmap); + return 0; +} + /* * Find a new segment from the free segments bitmap to right order * This function should be returned with success, otherwise BUG @@ -348,9 +380,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, unsigned int *newseg, bool new_sec, int dir) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int total_secs = sbi->total_sections; unsigned int segno, secno, zoneno; - unsigned int total_zones = sbi->total_sections / sbi->secs_per_zone; + unsigned int total_zones = TOTAL_SECS(sbi) / sbi->secs_per_zone; unsigned int hint = *newseg / sbi->segs_per_sec; unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg); unsigned int left_start = hint; @@ -363,16 +394,17 @@ static void get_new_segment(struct f2fs_sb_info *sbi, if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { segno = find_next_zero_bit(free_i->free_segmap, TOTAL_SEGS(sbi), *newseg + 1); - if (segno < TOTAL_SEGS(sbi)) + if (segno - *newseg < sbi->segs_per_sec - + (*newseg % sbi->segs_per_sec)) goto got_it; } find_other_zone: - secno = find_next_zero_bit(free_i->free_secmap, total_secs, hint); - if (secno >= total_secs) { + secno = find_next_zero_bit(free_i->free_secmap, TOTAL_SECS(sbi), hint); + if (secno >= TOTAL_SECS(sbi)) { if (dir == ALLOC_RIGHT) { secno = find_next_zero_bit(free_i->free_secmap, - total_secs, 0); - BUG_ON(secno >= total_secs); + TOTAL_SECS(sbi), 0); + BUG_ON(secno >= TOTAL_SECS(sbi)); } else { go_left = 1; left_start = hint - 1; @@ -387,8 +419,8 @@ find_other_zone: continue; } left_start = find_next_zero_bit(free_i->free_secmap, - total_secs, 0); - BUG_ON(left_start >= total_secs); + TOTAL_SECS(sbi), 0); + BUG_ON(left_start >= TOTAL_SECS(sbi)); break; } secno = left_start; @@ -561,20 +593,20 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, int type, bool force) { struct curseg_info *curseg = CURSEG_I(sbi, type); - unsigned int ofs_unit; if (force) { new_curseg(sbi, type, true); goto out; } - ofs_unit = need_SSR(sbi) ? 1 : sbi->segs_per_sec; - curseg->next_segno = check_prefree_segments(sbi, ofs_unit, type); + curseg->next_segno = check_prefree_segments(sbi, type); if (curseg->next_segno != NULL_SEGNO) change_curseg(sbi, type, false); else if (type == CURSEG_WARM_NODE) new_curseg(sbi, type, false); + else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) + new_curseg(sbi, type, false); else if (need_SSR(sbi) && get_ssr_segment(sbi, type)) change_curseg(sbi, type, true); else @@ -656,10 +688,16 @@ static void do_submit_bio(struct f2fs_sb_info *sbi, if (type >= META_FLUSH) rw = WRITE_FLUSH_FUA; + if (btype == META) + rw |= REQ_META; + if (sbi->bio[btype]) { struct bio_private *p = sbi->bio[btype]->bi_private; p->sbi = sbi; sbi->bio[btype]->bi_end_io = f2fs_end_io_write; + + trace_f2fs_do_submit_bio(sbi->sb, btype, sync, sbi->bio[btype]); + if (type == META_FLUSH) { DECLARE_COMPLETION_ONSTACK(wait); p->is_sync = true; @@ -696,7 +734,7 @@ static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page, do_submit_bio(sbi, type, false); alloc_new: if (sbi->bio[type] == NULL) { - sbi->bio[type] = f2fs_bio_alloc(bdev, bio_get_nr_vecs(bdev)); + sbi->bio[type] = f2fs_bio_alloc(bdev, max_hw_blocks(sbi)); sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); /* * The end_io will be assigned at the sumbission phase. @@ -714,6 +752,7 @@ alloc_new: sbi->last_block_in_bio[type] = blk_addr; up_write(&sbi->bio_sem); + trace_f2fs_submit_write_page(page, blk_addr, type); } static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) @@ -1390,7 +1429,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) } if (sbi->segs_per_sec > 1) { - sit_i->sec_entries = vzalloc(sbi->total_sections * + sit_i->sec_entries = vzalloc(TOTAL_SECS(sbi) * sizeof(struct sec_entry)); if (!sit_i->sec_entries) return -ENOMEM; @@ -1403,10 +1442,9 @@ static int build_sit_info(struct f2fs_sb_info *sbi) bitmap_size = __bitmap_size(sbi, SIT_BITMAP); src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP); - dst_bitmap = kzalloc(bitmap_size, GFP_KERNEL); + dst_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); if (!dst_bitmap) return -ENOMEM; - memcpy(dst_bitmap, src_bitmap, bitmap_size); /* init SIT information */ sit_i->s_ops = &default_salloc_ops; @@ -1442,7 +1480,7 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) if (!free_i->free_segmap) return -ENOMEM; - sec_bitmap_size = f2fs_bitmap_size(sbi->total_sections); + sec_bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi)); free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL); if (!free_i->free_secmap) return -ENOMEM; @@ -1559,14 +1597,13 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) } } -static int init_victim_segmap(struct f2fs_sb_info *sbi) +static int init_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); + unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi)); - dirty_i->victim_segmap[FG_GC] = kzalloc(bitmap_size, GFP_KERNEL); - dirty_i->victim_segmap[BG_GC] = kzalloc(bitmap_size, GFP_KERNEL); - if (!dirty_i->victim_segmap[FG_GC] || !dirty_i->victim_segmap[BG_GC]) + dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL); + if (!dirty_i->victim_secmap) return -ENOMEM; return 0; } @@ -1593,7 +1630,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) } init_dirty_segmap(sbi); - return init_victim_segmap(sbi); + return init_victim_secmap(sbi); } /* @@ -1680,18 +1717,10 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi, mutex_unlock(&dirty_i->seglist_lock); } -void reset_victim_segmap(struct f2fs_sb_info *sbi) -{ - unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); - memset(DIRTY_I(sbi)->victim_segmap[FG_GC], 0, bitmap_size); -} - -static void destroy_victim_segmap(struct f2fs_sb_info *sbi) +static void destroy_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - - kfree(dirty_i->victim_segmap[FG_GC]); - kfree(dirty_i->victim_segmap[BG_GC]); + kfree(dirty_i->victim_secmap); } static void destroy_dirty_segmap(struct f2fs_sb_info *sbi) @@ -1706,7 +1735,7 @@ static void destroy_dirty_segmap(struct f2fs_sb_info *sbi) for (i = 0; i < NR_DIRTY_TYPE; i++) discard_dirty_segmap(sbi, i); - destroy_victim_segmap(sbi); + destroy_victim_secmap(sbi); SM_I(sbi)->dirty_info = NULL; kfree(dirty_i); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 552dadbb2327..062424a0e4c3 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -8,10 +8,13 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ +#include <linux/blkdev.h> + /* constant macro */ #define NULL_SEGNO ((unsigned int)(~0)) +#define NULL_SECNO ((unsigned int)(~0)) -/* V: Logical segment # in volume, R: Relative segment # in main area */ +/* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) #define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) @@ -23,13 +26,13 @@ ((t == CURSEG_HOT_NODE) || (t == CURSEG_COLD_NODE) || \ (t == CURSEG_WARM_NODE)) -#define IS_CURSEG(sbi, segno) \ - ((segno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ - (segno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ - (segno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ - (segno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ - (segno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ - (segno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) +#define IS_CURSEG(sbi, seg) \ + ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ + (seg == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ + (seg == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ + (seg == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ + (seg == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ + (seg == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) #define IS_CURSEC(sbi, secno) \ ((secno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ @@ -81,9 +84,12 @@ #define f2fs_bitmap_size(nr) \ (BITS_TO_LONGS(nr) * sizeof(unsigned long)) #define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments) +#define TOTAL_SECS(sbi) (sbi->total_sections) #define SECTOR_FROM_BLOCK(sbi, blk_addr) \ (blk_addr << ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE)) +#define SECTOR_TO_BLOCK(sbi, sectors) \ + (sectors >> ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE)) /* during checkpoint, bio_private is used to synchronize the last bio */ struct bio_private { @@ -213,7 +219,7 @@ struct dirty_seglist_info { unsigned long *dirty_segmap[NR_DIRTY_TYPE]; struct mutex seglist_lock; /* lock for segment bitmaps */ int nr_dirty[NR_DIRTY_TYPE]; /* # of dirty segments */ - unsigned long *victim_segmap[2]; /* BG_GC, FG_GC */ + unsigned long *victim_secmap; /* background GC victims */ }; /* victim selection function for cleaning and SSR */ @@ -464,8 +470,7 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) static inline int utilization(struct f2fs_sb_info *sbi) { - return (long int)valid_user_blocks(sbi) * 100 / - (long int)sbi->user_block_count; + return div_u64(valid_user_blocks(sbi) * 100, sbi->user_block_count); } /* @@ -616,3 +621,17 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_total_block_count) - (base + 1) + type; } + +static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) +{ + if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno)) + return true; + return false; +} + +static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi) +{ + struct block_device *bdev = sbi->sb->s_bdev; + struct request_queue *q = bdev_get_queue(bdev); + return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q)); +} diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 62e017743af6..8555f7df82c7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -12,7 +12,6 @@ #include <linux/init.h> #include <linux/fs.h> #include <linux/statfs.h> -#include <linux/proc_fs.h> #include <linux/buffer_head.h> #include <linux/backing-dev.h> #include <linux/kthread.h> @@ -21,12 +20,17 @@ #include <linux/seq_file.h> #include <linux/random.h> #include <linux/exportfs.h> +#include <linux/blkdev.h> #include <linux/f2fs_fs.h> #include "f2fs.h" #include "node.h" +#include "segment.h" #include "xattr.h" +#define CREATE_TRACE_POINTS +#include <trace/events/f2fs.h> + static struct kmem_cache *f2fs_inode_cachep; enum { @@ -94,6 +98,20 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) return &fi->vfs_inode; } +static int f2fs_drop_inode(struct inode *inode) +{ + /* + * This is to avoid a deadlock condition like below. + * writeback_single_inode(inode) + * - f2fs_write_data_page + * - f2fs_gc -> iput -> evict + * - inode_wait_for_writeback(inode) + */ + if (!inode_unhashed(inode) && inode->i_state & I_SYNC) + return 0; + return generic_drop_inode(inode); +} + static void f2fs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); @@ -132,13 +150,18 @@ int f2fs_sync_fs(struct super_block *sb, int sync) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + trace_f2fs_sync_fs(sb, sync); + if (!sbi->s_dirty && !get_pages(sbi, F2FS_DIRTY_NODES)) return 0; - if (sync) + if (sync) { + mutex_lock(&sbi->gc_mutex); write_checkpoint(sbi, false); - else + mutex_unlock(&sbi->gc_mutex); + } else { f2fs_balance_fs(sbi); + } return 0; } @@ -180,7 +203,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = sbi->total_node_count; buf->f_ffree = sbi->total_node_count - valid_inode_count(sbi); - buf->f_namelen = F2FS_MAX_NAME_LEN; + buf->f_namelen = F2FS_NAME_LEN; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); @@ -223,6 +246,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) static struct super_operations f2fs_sops = { .alloc_inode = f2fs_alloc_inode, + .drop_inode = f2fs_drop_inode, .destroy_inode = f2fs_destroy_inode, .write_inode = f2fs_write_inode, .show_options = f2fs_show_options, @@ -457,6 +481,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->root_ino_num = le32_to_cpu(raw_super->root_ino); sbi->node_ino_num = le32_to_cpu(raw_super->node_ino); sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino); + sbi->cur_victim_sec = NULL_SECNO; for (i = 0; i < NR_COUNT_TYPE; i++) atomic_set(&sbi->nr_pages[i], 0); @@ -473,7 +498,7 @@ static int validate_superblock(struct super_block *sb, if (!*raw_super_buf) { f2fs_msg(sb, KERN_ERR, "unable to read %s superblock", super); - return 1; + return -EIO; } *raw_super = (struct f2fs_super_block *) @@ -485,7 +510,7 @@ static int validate_superblock(struct super_block *sb, f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem " "in %s superblock", super); - return 1; + return -EINVAL; } static int f2fs_fill_super(struct super_block *sb, void *data, int silent) @@ -508,9 +533,12 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - if (validate_superblock(sb, &raw_super, &raw_super_buf, 0)) { + err = validate_superblock(sb, &raw_super, &raw_super_buf, 0); + if (err) { brelse(raw_super_buf); - if (validate_superblock(sb, &raw_super, &raw_super_buf, 1)) + /* check secondary superblock when primary failed */ + err = validate_superblock(sb, &raw_super, &raw_super_buf, 1); + if (err) goto free_sb_buf; } /* init some FS parameters */ @@ -525,7 +553,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) set_opt(sbi, POSIX_ACL); #endif /* parse mount options */ - if (parse_options(sb, sbi, (char *)data)) + err = parse_options(sb, sbi, (char *)data); + if (err) goto free_sb_buf; sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize)); @@ -547,11 +576,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sbi->raw_super = raw_super; sbi->raw_super_buf = raw_super_buf; mutex_init(&sbi->gc_mutex); - mutex_init(&sbi->write_inode); mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); - for (i = 0; i < NR_LOCK_TYPE; i++) + for (i = 0; i < NR_GLOBAL_LOCKS; i++) mutex_init(&sbi->fs_lock[i]); + mutex_init(&sbi->node_write); sbi->por_doing = 0; spin_lock_init(&sbi->stat_lock); init_rwsem(&sbi->bio_sem); @@ -638,8 +667,12 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) } /* recover fsynced data */ - if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) - recover_fsync_data(sbi); + if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { + err = recover_fsync_data(sbi); + if (err) + f2fs_msg(sb, KERN_ERR, + "Cannot recover all fsync data errno=%ld", err); + } /* After POR, we can run background GC thread */ err = start_gc_thread(sbi); @@ -650,6 +683,14 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (err) goto fail; + if (test_opt(sbi, DISCARD)) { + struct request_queue *q = bdev_get_queue(sb->s_bdev); + if (!blk_queue_discard(q)) + f2fs_msg(sb, KERN_WARNING, + "mounting with \"discard\" option, but " + "the device does not support discard"); + } + return 0; fail: stop_gc_thread(sbi); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 8038c0496504..0b02dce31356 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -307,27 +307,30 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, int error, found, free, newsize; size_t name_len; char *pval; + int ilock; if (name == NULL) return -EINVAL; - name_len = strlen(name); if (value == NULL) value_len = 0; - if (name_len > 255 || value_len > MAX_VALUE_LEN) + name_len = strlen(name); + + if (name_len > F2FS_NAME_LEN || value_len > MAX_VALUE_LEN) return -ERANGE; f2fs_balance_fs(sbi); - mutex_lock_op(sbi, NODE_NEW); + ilock = mutex_lock_op(sbi); + if (!fi->i_xattr_nid) { /* Allocate new attribute block */ struct dnode_of_data dn; if (!alloc_nid(sbi, &fi->i_xattr_nid)) { - mutex_unlock_op(sbi, NODE_NEW); - return -ENOSPC; + error = -ENOSPC; + goto exit; } set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid); mark_inode_dirty(inode); @@ -336,8 +339,8 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, if (IS_ERR(page)) { alloc_nid_failed(sbi, fi->i_xattr_nid); fi->i_xattr_nid = 0; - mutex_unlock_op(sbi, NODE_NEW); - return PTR_ERR(page); + error = PTR_ERR(page); + goto exit; } alloc_nid_done(sbi, fi->i_xattr_nid); @@ -349,8 +352,8 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, /* The inode already has an extended attribute block. */ page = get_node_page(sbi, fi->i_xattr_nid); if (IS_ERR(page)) { - mutex_unlock_op(sbi, NODE_NEW); - return PTR_ERR(page); + error = PTR_ERR(page); + goto exit; } base_addr = page_address(page); @@ -432,12 +435,13 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, inode->i_ctime = CURRENT_TIME; clear_inode_flag(fi, FI_ACL_MODE); } - f2fs_write_inode(inode, NULL); - mutex_unlock_op(sbi, NODE_NEW); + update_inode_page(inode); + mutex_unlock_op(sbi, ilock); return 0; cleanup: f2fs_put_page(page, 1); - mutex_unlock_op(sbi, NODE_NEW); +exit: + mutex_unlock_op(sbi, ilock); return error; } diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 4ff901632b26..dfce656ddb33 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -19,6 +19,7 @@ #include <linux/mpage.h> #include <linux/buffer_head.h> #include <linux/mount.h> +#include <linux/aio.h> #include <linux/vfs.h> #include <linux/parser.h> #include <linux/uio.h> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 798d4458a4d3..3be57189efd5 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -22,7 +22,6 @@ #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/kthread.h> -#include <linux/freezer.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> @@ -88,20 +87,6 @@ static inline struct inode *wb_inode(struct list_head *head) #define CREATE_TRACE_POINTS #include <trace/events/writeback.h> -/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */ -static void bdi_wakeup_flusher(struct backing_dev_info *bdi) -{ - if (bdi->wb.task) { - wake_up_process(bdi->wb.task); - } else { - /* - * The bdi thread isn't there, wake up the forker thread which - * will create and run it. - */ - wake_up_process(default_backing_dev_info.wb.task); - } -} - static void bdi_queue_work(struct backing_dev_info *bdi, struct wb_writeback_work *work) { @@ -109,10 +94,9 @@ static void bdi_queue_work(struct backing_dev_info *bdi, spin_lock_bh(&bdi->wb_lock); list_add_tail(&work->list, &bdi->work_list); - if (!bdi->wb.task) - trace_writeback_nothread(bdi, work); - bdi_wakeup_flusher(bdi); spin_unlock_bh(&bdi->wb_lock); + + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); } static void @@ -127,10 +111,8 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, */ work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { - if (bdi->wb.task) { - trace_writeback_nowork(bdi); - wake_up_process(bdi->wb.task); - } + trace_writeback_nowork(bdi); + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); return; } @@ -177,9 +159,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) * writeback as soon as there is no other work to do. */ trace_writeback_wake_background(bdi); - spin_lock_bh(&bdi->wb_lock); - bdi_wakeup_flusher(bdi); - spin_unlock_bh(&bdi->wb_lock); + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); } /* @@ -1020,67 +1000,49 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) /* * Handle writeback of dirty data for the device backed by this bdi. Also - * wakes up periodically and does kupdated style flushing. + * reschedules periodically and does kupdated style flushing. */ -int bdi_writeback_thread(void *data) +void bdi_writeback_workfn(struct work_struct *work) { - struct bdi_writeback *wb = data; + struct bdi_writeback *wb = container_of(to_delayed_work(work), + struct bdi_writeback, dwork); struct backing_dev_info *bdi = wb->bdi; long pages_written; set_worker_desc("flush-%s", dev_name(bdi->dev)); current->flags |= PF_SWAPWRITE; - set_freezable(); - wb->last_active = jiffies; - - /* - * Our parent may run at a different priority, just set us to normal - */ - set_user_nice(current, 0); - - trace_writeback_thread_start(bdi); - while (!kthread_freezable_should_stop(NULL)) { + if (likely(!current_is_workqueue_rescuer() || + list_empty(&bdi->bdi_list))) { /* - * Remove own delayed wake-up timer, since we are already awake - * and we'll take care of the periodic write-back. + * The normal path. Keep writing back @bdi until its + * work_list is empty. Note that this path is also taken + * if @bdi is shutting down even when we're running off the + * rescuer as work_list needs to be drained. */ - del_timer(&wb->wakeup_timer); - - pages_written = wb_do_writeback(wb, 0); - + do { + pages_written = wb_do_writeback(wb, 0); + trace_writeback_pages_written(pages_written); + } while (!list_empty(&bdi->work_list)); + } else { + /* + * bdi_wq can't get enough workers and we're running off + * the emergency worker. Don't hog it. Hopefully, 1024 is + * enough for efficient IO. + */ + pages_written = writeback_inodes_wb(&bdi->wb, 1024, + WB_REASON_FORKER_THREAD); trace_writeback_pages_written(pages_written); - - if (pages_written) - wb->last_active = jiffies; - - set_current_state(TASK_INTERRUPTIBLE); - if (!list_empty(&bdi->work_list) || kthread_should_stop()) { - __set_current_state(TASK_RUNNING); - continue; - } - - if (wb_has_dirty_io(wb) && dirty_writeback_interval) - schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); - else { - /* - * We have nothing to do, so can go sleep without any - * timeout and save power. When a work is queued or - * something is made dirty - we will be woken up. - */ - schedule(); - } } - /* Flush any work that raced with us exiting */ - if (!list_empty(&bdi->work_list)) - wb_do_writeback(wb, 1); + if (!list_empty(&bdi->work_list) || + (wb_has_dirty_io(wb) && dirty_writeback_interval)) + queue_delayed_work(bdi_wq, &wb->dwork, + msecs_to_jiffies(dirty_writeback_interval * 10)); - trace_writeback_thread_stop(bdi); - return 0; + current->flags &= ~PF_SWAPWRITE; } - /* * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back * the whole world. diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index b3aaf7b3578b..aef34b1e635e 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -38,6 +38,7 @@ #include <linux/device.h> #include <linux/file.h> #include <linux/fs.h> +#include <linux/aio.h> #include <linux/kdev_t.h> #include <linux/kthread.h> #include <linux/list.h> diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a6c1664e330b..1d55f9465400 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -19,6 +19,7 @@ #include <linux/pipe_fs_i.h> #include <linux/swap.h> #include <linux/splice.h> +#include <linux/aio.h> MODULE_ALIAS_MISCDEV(FUSE_MINOR); MODULE_ALIAS("devname:fuse"); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4655e59d545b..d1c9b85b3f58 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -15,6 +15,7 @@ #include <linux/module.h> #include <linux/compat.h> #include <linux/swap.h> +#include <linux/aio.h> static const struct file_operations fuse_direct_io_file_operations; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 9883694f1e7c..0bad69ed6336 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -20,6 +20,7 @@ #include <linux/swap.h> #include <linux/gfs2_ondisk.h> #include <linux/backing-dev.h> +#include <linux/aio.h> #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index d79c2dadc536..acd16764b133 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -25,6 +25,7 @@ #include <asm/uaccess.h> #include <linux/dlm.h> #include <linux/dlm_plock.h> +#include <linux/aio.h> #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 7318abf9d0fb..c5fa758fd844 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -300,7 +300,7 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno) u64 nblk; if (bio) { - nblk = bio->bi_sector + bio_sectors(bio); + nblk = bio_end_sector(bio); nblk >>= sdp->sd_fsb2bb_shift; if (blkno == nblk) return bio; diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 716e1aafb2e2..f9299d8a64e3 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -14,6 +14,7 @@ #include <linux/pagemap.h> #include <linux/mpage.h> #include <linux/sched.h> +#include <linux/aio.h> #include "hfs_fs.h" #include "btree.h" diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 7faaa964968e..f833d35630ab 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -14,6 +14,7 @@ #include <linux/pagemap.h> #include <linux/mpage.h> #include <linux/sched.h> +#include <linux/aio.h> #include "hfsplus_fs.h" #include "hfsplus_raw.h" diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 523464e62849..a3f868ae3fd4 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -909,11 +909,8 @@ static int can_do_hugetlb_shm(void) static int get_hstate_idx(int page_size_log) { - struct hstate *h; + struct hstate *h = hstate_sizelog(page_size_log); - if (!page_size_log) - return default_hstate_idx; - h = size_to_hstate(1 << page_size_log); if (!h) return -1; return h - hstates; @@ -929,9 +926,12 @@ static struct dentry_operations anon_ops = { .d_dname = hugetlb_dname }; -struct file *hugetlb_file_setup(const char *name, unsigned long addr, - size_t size, vm_flags_t acctflag, - struct user_struct **user, +/* + * Note that size should be aligned to proper hugepage size in caller side, + * otherwise hugetlb_reserve_pages reserves one less hugepages than intended. + */ +struct file *hugetlb_file_setup(const char *name, size_t size, + vm_flags_t acctflag, struct user_struct **user, int creat_flags, int page_size_log) { struct file *file = ERR_PTR(-ENOMEM); @@ -939,8 +939,6 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr, struct path path; struct super_block *sb; struct qstr quick_string; - struct hstate *hstate; - unsigned long num_pages; int hstate_idx; hstate_idx = get_hstate_idx(page_size_log); @@ -980,12 +978,10 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr, if (!inode) goto out_dentry; - hstate = hstate_inode(inode); - size += addr & ~huge_page_mask(hstate); - num_pages = ALIGN(size, huge_page_size(hstate)) >> - huge_page_shift(hstate); file = ERR_PTR(-ENOMEM); - if (hugetlb_reserve_pages(inode, 0, num_pages, NULL, acctflag)) + if (hugetlb_reserve_pages(inode, 0, + size >> huge_page_shift(hstate_inode(inode)), NULL, + acctflag)) goto out_inode; d_instantiate(path.dentry, inode); diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 77554b61d124..730f24e282a6 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -23,6 +23,7 @@ #include <linux/pagemap.h> #include <linux/quotaops.h> #include <linux/writeback.h> +#include <linux/aio.h> #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_filsys.h" diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index cbe48ea9318e..c57499dca89c 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -2005,7 +2005,6 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bio->bi_io_vec[0].bv_offset = bp->l_offset; bio->bi_vcnt = 1; - bio->bi_idx = 0; bio->bi_size = LOGPSIZE; bio->bi_end_io = lbmIODone; @@ -2146,7 +2145,6 @@ static void lbmStartIO(struct lbuf * bp) bio->bi_io_vec[0].bv_offset = bp->l_offset; bio->bi_vcnt = 1; - bio->bi_idx = 0; bio->bi_size = LOGPSIZE; bio->bi_end_io = lbmIODone; diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index e784a217b500..550475ca6a0e 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -32,7 +32,6 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw) bio_vec.bv_len = PAGE_SIZE; bio_vec.bv_offset = 0; bio.bi_vcnt = 1; - bio.bi_idx = 0; bio.bi_size = PAGE_SIZE; bio.bi_bdev = bdev; bio.bi_sector = page->index * (PAGE_SIZE >> 9); @@ -108,7 +107,6 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, if (i >= max_pages) { /* Block layer cannot split bios :( */ bio->bi_vcnt = i; - bio->bi_idx = 0; bio->bi_size = i * PAGE_SIZE; bio->bi_bdev = super->s_bdev; bio->bi_sector = ofs >> 9; @@ -136,7 +134,6 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, unlock_page(page); } bio->bi_vcnt = nr_pages; - bio->bi_idx = 0; bio->bi_size = nr_pages * PAGE_SIZE; bio->bi_bdev = super->s_bdev; bio->bi_sector = ofs >> 9; @@ -202,7 +199,6 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, if (i >= max_pages) { /* Block layer cannot split bios :( */ bio->bi_vcnt = i; - bio->bi_idx = 0; bio->bi_size = i * PAGE_SIZE; bio->bi_bdev = super->s_bdev; bio->bi_sector = ofs >> 9; @@ -224,7 +220,6 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, bio->bi_io_vec[i].bv_offset = 0; } bio->bi_vcnt = nr_pages; - bio->bi_idx = 0; bio->bi_size = nr_pages * PAGE_SIZE; bio->bi_bdev = super->s_bdev; bio->bi_sector = ofs >> 9; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 553a83cc4106..a1dd768d0a35 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -47,6 +47,8 @@ struct nfs4_minor_version_ops { const nfs4_stateid *); int (*find_root_sec)(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); + int (*free_lock_state)(struct nfs_server *, + struct nfs4_lock_state *); const struct nfs4_state_recovery_ops *reboot_recovery_ops; const struct nfs4_state_recovery_ops *nograce_recovery_ops; const struct nfs4_state_maintenance_ops *state_renewal_ops; @@ -234,7 +236,6 @@ extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struc extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct qstr *, struct nfs_fh *, struct nfs_fattr *); extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *); -extern int nfs4_release_lockowner(struct nfs4_lock_state *); extern const struct xattr_handler *nfs4_xattr_handlers[]; extern int nfs4_set_rw_stateid(nfs4_stateid *stateid, const struct nfs_open_context *ctx, diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index b8da95548d3d..235ff952d3c8 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -70,6 +70,8 @@ struct nfs4_pnfs_ds { struct list_head ds_addrs; struct nfs_client *ds_clp; atomic_t ds_count; + unsigned long ds_state; +#define NFS4DS_CONNECTING 0 /* ds is establishing connection */ }; struct nfs4_file_layout_dsaddr { diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index 1fe284f01f8b..661a0f611215 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -775,6 +775,22 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j) return flseg->fh_array[i]; } +static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) +{ + might_sleep(); + wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING, + nfs_wait_bit_killable, TASK_KILLABLE); +} + +static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) +{ + smp_mb__before_clear_bit(); + clear_bit(NFS4DS_CONNECTING, &ds->ds_state); + smp_mb__after_clear_bit(); + wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING); +} + + struct nfs4_pnfs_ds * nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) { @@ -791,16 +807,22 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) filelayout_mark_devid_invalid(devid); return NULL; } + if (ds->ds_clp) + return ds; - if (!ds->ds_clp) { + if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) { struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode); int err; err = nfs4_ds_connect(s, ds); if (err) { nfs4_mark_deviceid_unavailable(devid); - return NULL; + ds = NULL; } + nfs4_clear_ds_conn_bit(ds); + } else { + /* Either ds is connected, or ds is NULL */ + nfs4_wait_ds_connect(ds); } return ds; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 9da4bd55eb30..8fbc10054115 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4766,9 +4766,9 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * if (status != 0) goto out; /* Is this a delegated lock? */ - if (test_bit(NFS_DELEGATED_STATE, &state->flags)) - goto out; lsp = request->fl_u.nfs4_fl.owner; + if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) == 0) + goto out; seqid = nfs_alloc_seqid(&lsp->ls_seqid, GFP_KERNEL); status = -ENOMEM; if (seqid == NULL) @@ -5238,9 +5238,8 @@ static const struct rpc_call_ops nfs4_release_lockowner_ops = { .rpc_release = nfs4_release_lockowner_release, }; -int nfs4_release_lockowner(struct nfs4_lock_state *lsp) +static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp) { - struct nfs_server *server = lsp->ls_state->owner->so_server; struct nfs_release_lockowner_data *data; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER], @@ -6783,26 +6782,76 @@ static int nfs41_test_stateid(struct nfs_server *server, nfs4_stateid *stateid) return err; } -static int _nfs4_free_stateid(struct nfs_server *server, nfs4_stateid *stateid) -{ - struct nfs41_free_stateid_args args = { - .stateid = stateid, - }; +struct nfs_free_stateid_data { + struct nfs_server *server; + struct nfs41_free_stateid_args args; struct nfs41_free_stateid_res res; +}; + +static void nfs41_free_stateid_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs_free_stateid_data *data = calldata; + nfs41_setup_sequence(nfs4_get_session(data->server), + &data->args.seq_args, + &data->res.seq_res, + task); +} + +static void nfs41_free_stateid_done(struct rpc_task *task, void *calldata) +{ + struct nfs_free_stateid_data *data = calldata; + + nfs41_sequence_done(task, &data->res.seq_res); + + switch (task->tk_status) { + case -NFS4ERR_DELAY: + if (nfs4_async_handle_error(task, data->server, NULL) == -EAGAIN) + rpc_restart_call_prepare(task); + } +} + +static void nfs41_free_stateid_release(void *calldata) +{ + kfree(calldata); +} + +const struct rpc_call_ops nfs41_free_stateid_ops = { + .rpc_call_prepare = nfs41_free_stateid_prepare, + .rpc_call_done = nfs41_free_stateid_done, + .rpc_release = nfs41_free_stateid_release, +}; + +static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + bool privileged) +{ struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FREE_STATEID], - .rpc_argp = &args, - .rpc_resp = &res, }; - int status; + struct rpc_task_setup task_setup = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = &nfs41_free_stateid_ops, + .flags = RPC_TASK_ASYNC, + }; + struct nfs_free_stateid_data *data; dprintk("NFS call free_stateid %p\n", stateid); - nfs41_init_sequence(&args.seq_args, &res.seq_res, 0); - nfs4_set_sequence_privileged(&args.seq_args); - status = nfs4_call_sync_sequence(server->client, server, &msg, - &args.seq_args, &res.seq_res); - dprintk("NFS reply free_stateid: %d\n", status); - return status; + data = kmalloc(sizeof(*data), GFP_NOFS); + if (!data) + return ERR_PTR(-ENOMEM); + data->server = server; + nfs4_stateid_copy(&data->args.stateid, stateid); + + task_setup.callback_data = data; + + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + nfs41_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); + if (privileged) + nfs4_set_sequence_privileged(&data->args.seq_args); + + return rpc_run_task(&task_setup); } /** @@ -6816,15 +6865,29 @@ static int _nfs4_free_stateid(struct nfs_server *server, nfs4_stateid *stateid) */ static int nfs41_free_stateid(struct nfs_server *server, nfs4_stateid *stateid) { - struct nfs4_exception exception = { }; - int err; - do { - err = _nfs4_free_stateid(server, stateid); - if (err != -NFS4ERR_DELAY) - break; - nfs4_handle_exception(server, err, &exception); - } while (exception.retry); - return err; + struct rpc_task *task; + int ret; + + task = _nfs41_free_stateid(server, stateid, true); + if (IS_ERR(task)) + return PTR_ERR(task); + ret = rpc_wait_for_completion_task(task); + if (!ret) + ret = task->tk_status; + rpc_put_task(task); + return ret; +} + +static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) +{ + struct rpc_task *task; + + task = _nfs41_free_stateid(server, &lsp->ls_stateid, false); + nfs4_free_lock_state(server, lsp); + if (IS_ERR(task)) + return PTR_ERR(task); + rpc_put_task(task); + return 0; } static bool nfs41_match_stateid(const nfs4_stateid *s1, @@ -6916,6 +6979,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { .call_sync = _nfs4_call_sync, .match_stateid = nfs4_match_stateid, .find_root_sec = nfs4_find_root_sec, + .free_lock_state = nfs4_release_lockowner, .reboot_recovery_ops = &nfs40_reboot_recovery_ops, .nograce_recovery_ops = &nfs40_nograce_recovery_ops, .state_renewal_ops = &nfs40_state_renewal_ops, @@ -6933,6 +6997,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { .call_sync = nfs4_call_sync_sequence, .match_stateid = nfs41_match_stateid, .find_root_sec = nfs41_find_root_sec, + .free_lock_state = nfs41_free_lock_state, .reboot_recovery_ops = &nfs41_reboot_recovery_ops, .nograce_recovery_ops = &nfs41_nograce_recovery_ops, .state_renewal_ops = &nfs41_state_renewal_ops, diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 0b32f9483b7a..300d17d85c0e 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -921,6 +921,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_ */ void nfs4_put_lock_state(struct nfs4_lock_state *lsp) { + struct nfs_server *server; struct nfs4_state *state; if (lsp == NULL) @@ -932,11 +933,13 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp) if (list_empty(&state->lock_states)) clear_bit(LK_STATE_IN_USE, &state->flags); spin_unlock(&state->state_lock); + server = state->owner->so_server; if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { - if (nfs4_release_lockowner(lsp) == 0) - return; - } - nfs4_free_lock_state(lsp->ls_state->owner->so_server, lsp); + struct nfs_client *clp = server->nfs_client; + + clp->cl_mvops->free_lock_state(server, lsp); + } else + nfs4_free_lock_state(server, lsp); } static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 3c79c5878c6d..4be8d135ed61 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2003,7 +2003,7 @@ static void encode_free_stateid(struct xdr_stream *xdr, struct compound_hdr *hdr) { encode_op_hdr(xdr, OP_FREE_STATEID, decode_free_stateid_maxsz, hdr); - encode_nfs4_stateid(xdr, args->stateid); + encode_nfs4_stateid(xdr, &args->stateid); } #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 1bb071dca9ab..a366107a7331 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1610,16 +1610,15 @@ out_security_failure: /* * Select a security flavor for this mount. The selected flavor * is planted in args->auth_flavors[0]. + * + * Returns 0 on success, -EACCES on failure. */ -static void nfs_select_flavor(struct nfs_parsed_mount_data *args, +static int nfs_select_flavor(struct nfs_parsed_mount_data *args, struct nfs_mount_request *request) { unsigned int i, count = *(request->auth_flav_len); rpc_authflavor_t flavor; - if (args->auth_flavors[0] != RPC_AUTH_MAXFLAVOR) - goto out; - /* * The NFSv2 MNT operation does not return a flavor list. */ @@ -1634,6 +1633,25 @@ static void nfs_select_flavor(struct nfs_parsed_mount_data *args, goto out_default; /* + * If the sec= mount option is used, the specified flavor or AUTH_NULL + * must be in the list returned by the server. + * + * AUTH_NULL has a special meaning when it's in the server list - it + * means that the server will ignore the rpc creds, so any flavor + * can be used. + */ + if (args->auth_flavors[0] != RPC_AUTH_MAXFLAVOR) { + for (i = 0; i < count; i++) { + if (args->auth_flavors[0] == request->auth_flavs[i] || + request->auth_flavs[i] == RPC_AUTH_NULL) + goto out; + } + dfprintk(MOUNT, "NFS: auth flavor %d not supported by server\n", + args->auth_flavors[0]); + goto out_err; + } + + /* * RFC 2623, section 2.7 suggests we SHOULD prefer the * flavor listed first. However, some servers list * AUTH_NULL first. Avoid ever choosing AUTH_NULL. @@ -1653,12 +1671,29 @@ static void nfs_select_flavor(struct nfs_parsed_mount_data *args, } } + /* + * As a last chance, see if the server list contains AUTH_NULL - + * if it does, use the default flavor. + */ + for (i = 0; i < count; i++) { + if (request->auth_flavs[i] == RPC_AUTH_NULL) + goto out_default; + } + + dfprintk(MOUNT, "NFS: no auth flavors in common with server\n"); + goto out_err; + out_default: - flavor = RPC_AUTH_UNIX; + /* use default if flavor not already set */ + flavor = (args->auth_flavors[0] == RPC_AUTH_MAXFLAVOR) ? + RPC_AUTH_UNIX : args->auth_flavors[0]; out_set: args->auth_flavors[0] = flavor; out: dfprintk(MOUNT, "NFS: using auth flavor %d\n", args->auth_flavors[0]); + return 0; +out_err: + return -EACCES; } /* @@ -1721,8 +1756,7 @@ static int nfs_request_mount(struct nfs_parsed_mount_data *args, return status; } - nfs_select_flavor(args, &request); - return 0; + return nfs_select_flavor(args, &request); } struct dentry *nfs_try_mount(int flags, const char *dev_name, diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index cf02f5530713..689fb608648e 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -25,7 +25,7 @@ #include <linux/gfp.h> #include <linux/mpage.h> #include <linux/writeback.h> -#include <linux/uio.h> +#include <linux/aio.h> #include "nilfs.h" #include "btnode.h" #include "segment.h" diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 1da4b81e6f76..c5670b8d198c 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -27,6 +27,7 @@ #include <linux/swap.h> #include <linux/uio.h> #include <linux/writeback.h> +#include <linux/aio.h> #include <asm/page.h> #include <asm/uaccess.h> diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index d3e118cc6ffa..2778b0255dc6 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -28,6 +28,7 @@ #include <linux/quotaops.h> #include <linux/slab.h> #include <linux/log2.h> +#include <linux/aio.h> #include "aops.h" #include "attrib.h" diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index ffb2da370a99..f671e49beb34 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -22,6 +22,8 @@ #ifndef OCFS2_AOPS_H #define OCFS2_AOPS_H +#include <linux/aio.h> + handle_t *ocfs2_start_walk_page_trans(struct inode *inode, struct page *page, unsigned from, diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 12ae194ac943..3a44a648dae7 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2322,7 +2322,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags, subclass, _RET_IP_); if (status < 0) { - if (status != -EAGAIN && status != -EIOCBRETRY) + if (status != -EAGAIN) mlog_errno(status); goto bail; } diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 88924a3133fa..621fc73bf23d 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -147,8 +147,6 @@ void ocfs2_refresh_inode(struct inode *inode, int ocfs2_mark_inode_dirty(handle_t *handle, struct inode *inode, struct buffer_head *bh); -int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb); -int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); struct buffer_head *ocfs2_bread(struct inode *inode, int block, int *err, int reada); diff --git a/fs/pipe.c b/fs/pipe.c index a029a14bacf1..d2c45e14e6d8 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -21,6 +21,7 @@ #include <linux/audit.h> #include <linux/syscalls.h> #include <linux/fcntl.h> +#include <linux/aio.h> #include <asm/uaccess.h> #include <asm/ioctls.h> diff --git a/fs/read_write.c b/fs/read_write.c index 90ba3b350e50..03430008704e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -9,6 +9,7 @@ #include <linux/fcntl.h> #include <linux/file.h> #include <linux/uio.h> +#include <linux/aio.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/export.h> @@ -329,16 +330,6 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count return count > MAX_RW_COUNT ? MAX_RW_COUNT : count; } -static void wait_on_retry_sync_kiocb(struct kiocb *iocb) -{ - set_current_state(TASK_UNINTERRUPTIBLE); - if (!kiocbIsKicked(iocb)) - schedule(); - else - kiocbClearKicked(iocb); - __set_current_state(TASK_RUNNING); -} - ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { struct iovec iov = { .iov_base = buf, .iov_len = len }; @@ -350,13 +341,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp kiocb.ki_left = len; kiocb.ki_nbytes = len; - for (;;) { - ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); - if (ret != -EIOCBRETRY) - break; - wait_on_retry_sync_kiocb(&kiocb); - } - + ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); if (-EIOCBQUEUED == ret) ret = wait_on_sync_kiocb(&kiocb); *ppos = kiocb.ki_pos; @@ -406,13 +391,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof kiocb.ki_left = len; kiocb.ki_nbytes = len; - for (;;) { - ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); - if (ret != -EIOCBRETRY) - break; - wait_on_retry_sync_kiocb(&kiocb); - } - + ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); if (-EIOCBQUEUED == ret) ret = wait_on_sync_kiocb(&kiocb); *ppos = kiocb.ki_pos; @@ -592,13 +571,7 @@ static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, kiocb.ki_left = len; kiocb.ki_nbytes = len; - for (;;) { - ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); - if (ret != -EIOCBRETRY) - break; - wait_on_retry_sync_kiocb(&kiocb); - } - + ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); if (ret == -EIOCBQUEUED) ret = wait_on_sync_kiocb(&kiocb); *ppos = kiocb.ki_pos; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index ea5061fd4f3e..77d6d47abc83 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -18,6 +18,7 @@ #include <linux/writeback.h> #include <linux/quotaops.h> #include <linux/swap.h> +#include <linux/aio.h> int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index f12189d2db1d..14374530784c 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -50,6 +50,7 @@ */ #include "ubifs.h" +#include <linux/aio.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/slab.h> diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 7a12e48ad819..b6d15d349810 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -38,6 +38,7 @@ #include <linux/slab.h> #include <linux/crc-itu-t.h> #include <linux/mpage.h> +#include <linux/aio.h> #include "udf_i.h" #include "udf_sb.h" diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index cc33aaf219f1..399e8cec6e60 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -69,6 +69,19 @@ config XFS_RT If unsure, say N. +config XFS_WARN + bool "XFS Verbose Warnings" + depends on XFS_FS && !XFS_DEBUG + help + Say Y here to get an XFS build with many additional warnings. + It converts ASSERT checks to WARN, so will log any out-of-bounds + conditions that occur that would otherwise be missed. It is much + lighter weight than XFS_DEBUG and does not modify algorithms and will + not cause the kernel to panic on non-fatal errors. + + However, similar to XFS_DEBUG, it is only advisable to use this if you + are debugging a particular problem. + config XFS_DEBUG bool "XFS Debugging support" depends on XFS_FS diff --git a/fs/xfs/mrlock.h b/fs/xfs/mrlock.h index ff6a19873e5c..e3c92d19e540 100644 --- a/fs/xfs/mrlock.h +++ b/fs/xfs/mrlock.h @@ -22,12 +22,12 @@ typedef struct { struct rw_semaphore mr_lock; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) int mr_writer; #endif } mrlock_t; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) #define mrinit(mrp, name) \ do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0) #else @@ -46,7 +46,7 @@ static inline void mraccess_nested(mrlock_t *mrp, int subclass) static inline void mrupdate_nested(mrlock_t *mrp, int subclass) { down_write_nested(&mrp->mr_lock, subclass); -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) mrp->mr_writer = 1; #endif } @@ -60,7 +60,7 @@ static inline int mrtryupdate(mrlock_t *mrp) { if (!down_write_trylock(&mrp->mr_lock)) return 0; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) mrp->mr_writer = 1; #endif return 1; @@ -68,7 +68,7 @@ static inline int mrtryupdate(mrlock_t *mrp) static inline void mrunlock_excl(mrlock_t *mrp) { -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) mrp->mr_writer = 0; #endif up_write(&mrp->mr_lock); @@ -81,7 +81,7 @@ static inline void mrunlock_shared(mrlock_t *mrp) static inline void mrdemote(mrlock_t *mrp) { -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) mrp->mr_writer = 0; #endif downgrade_write(&mrp->mr_lock); diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h index d8b11b7f94aa..a742c47f7d5a 100644 --- a/fs/xfs/xfs.h +++ b/fs/xfs/xfs.h @@ -24,6 +24,11 @@ #define XFS_BUF_LOCK_TRACKING 1 #endif +#ifdef CONFIG_XFS_WARN +#define XFS_WARN 1 +#endif + + #include "xfs_linux.h" #endif /* __XFS_H__ */ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index 30c4c1434faf..cafc90251d19 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -386,7 +386,7 @@ const struct xfs_buf_ops xfs_allocbt_buf_ops = { }; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) STATIC int xfs_allocbt_keys_inorder( struct xfs_btree_cur *cur, @@ -442,7 +442,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = { .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_allocbt_key_diff, .buf_ops = &xfs_allocbt_buf_ops, -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) .keys_inorder = xfs_allocbt_keys_inorder, .recs_inorder = xfs_allocbt_recs_inorder, #endif diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3244c988d379..2b2691b73428 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -31,6 +31,7 @@ #include "xfs_vnodeops.h" #include "xfs_trace.h" #include "xfs_bmap.h" +#include <linux/aio.h> #include <linux/gfp.h> #include <linux/mpage.h> #include <linux/pagevec.h> diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 3a86c3fa6de1..0c61a22be6fd 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -813,7 +813,7 @@ const struct xfs_buf_ops xfs_bmbt_buf_ops = { }; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) STATIC int xfs_bmbt_keys_inorder( struct xfs_btree_cur *cur, @@ -853,7 +853,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, .key_diff = xfs_bmbt_key_diff, .buf_ops = &xfs_bmbt_buf_ops, -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) .keys_inorder = xfs_bmbt_keys_inorder, .recs_inorder = xfs_bmbt_recs_inorder, #endif diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 6e6c915673fe..55e3c7cc3c3d 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -215,7 +215,7 @@ struct xfs_btree_ops { const struct xfs_buf_ops *buf_ops; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) /* check that k1 is lower than k2 */ int (*keys_inorder)(struct xfs_btree_cur *cur, union xfs_btree_key *k1, diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index ecc6c661064c..5246de4912d4 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -993,7 +993,7 @@ xfs_dir2_leafn_rebalance( xfs_dir2_leaf_t *leaf1; /* first leaf structure */ xfs_dir2_leaf_t *leaf2; /* second leaf structure */ int mid; /* midpoint leaf index */ -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) int oldstale; /* old count of stale leaves */ #endif int oldsum; /* old total leaf count */ @@ -1022,7 +1022,7 @@ xfs_dir2_leafn_rebalance( ents2 = xfs_dir3_leaf_ents_p(leaf2); oldsum = hdr1.count + hdr2.count; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) oldstale = hdr1.stale + hdr2.stale; #endif mid = oldsum >> 1; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 054d60c0ac57..a5f2042aec8b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -36,6 +36,7 @@ #include "xfs_ioctl.h" #include "xfs_trace.h" +#include <linux/aio.h> #include <linux/dcache.h> #include <linux/falloc.h> #include <linux/pagevec.h> diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index c82ac8867421..5448eb6b8c12 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -272,7 +272,7 @@ const struct xfs_buf_ops xfs_inobt_buf_ops = { .verify_write = xfs_inobt_write_verify, }; -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) STATIC int xfs_inobt_keys_inorder( struct xfs_btree_cur *cur, @@ -310,7 +310,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = { .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, .buf_ops = &xfs_inobt_buf_ops, -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, #endif diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 558ef4947206..efbe1accb6ca 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -287,7 +287,7 @@ xfs_ilock_demote( trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); } -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) int xfs_isilocked( xfs_inode_t *ip, diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index d681e34c2950..5e999680094a 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -422,9 +422,12 @@ xfs_attrlist_by_handle( if (IS_ERR(dentry)) return PTR_ERR(dentry); - kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL); - if (!kbuf) - goto out_dput; + kbuf = kmem_zalloc(al_hreq.buflen, KM_SLEEP | KM_MAYFAIL); + if (!kbuf) { + kbuf = kmem_zalloc_large(al_hreq.buflen); + if (!kbuf) + goto out_dput; + } cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, @@ -436,7 +439,10 @@ xfs_attrlist_by_handle( error = -EFAULT; out_kfree: - kfree(kbuf); + if (is_vmalloc_addr(kbuf)) + kmem_free_large(kbuf); + else + kmem_free(kbuf); out_dput: dput(dentry); return error; diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 63b8fc432151..c0c66259cc91 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -373,9 +373,12 @@ xfs_compat_attrlist_by_handle( return PTR_ERR(dentry); error = -ENOMEM; - kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); - if (!kbuf) - goto out_dput; + kbuf = kmem_zalloc(al_hreq.buflen, KM_SLEEP | KM_MAYFAIL); + if (!kbuf) { + kbuf = kmem_zalloc_large(al_hreq.buflen); + if (!kbuf) + goto out_dput; + } cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, @@ -387,7 +390,10 @@ xfs_compat_attrlist_by_handle( error = -EFAULT; out_kfree: - kfree(kbuf); + if (is_vmalloc_addr(kbuf)) + kmem_free_large(kbuf); + else + kmem_free(kbuf); out_dput: dput(dentry); return error; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 14e59d953b7b..800f896a6cc4 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -293,22 +293,34 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) #define ASSERT_ALWAYS(expr) \ (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) -#ifndef DEBUG -#define ASSERT(expr) ((void)0) +#ifdef DEBUG +#define ASSERT(expr) \ + (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) #ifndef STATIC -# define STATIC static noinline +# define STATIC noinline #endif -#else /* DEBUG */ +#else /* !DEBUG */ + +#ifdef XFS_WARN #define ASSERT(expr) \ - (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) + (unlikely(expr) ? (void)0 : asswarn(#expr, __FILE__, __LINE__)) #ifndef STATIC -# define STATIC noinline +# define STATIC static noinline +#endif + +#else /* !DEBUG && !XFS_WARN */ + +#define ASSERT(expr) ((void)0) + +#ifndef STATIC +# define STATIC static noinline #endif +#endif /* XFS_WARN */ #endif /* DEBUG */ #endif /* __XFS_LINUX__ */ diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index 331cd9f83a7f..9163dc140532 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -93,6 +93,14 @@ xfs_alert_tag( } void +asswarn(char *expr, char *file, int line) +{ + xfs_warn(NULL, "Assertion failed: %s, file: %s, line: %d", + expr, file, line); + WARN_ON(1); +} + +void assfail(char *expr, char *file, int line) { xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d", diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index 76c81982f964..85401155750e 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -57,6 +57,7 @@ do { \ xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__) extern void assfail(char *expr, char *f, int l); +extern void asswarn(char *expr, char *f, int l); extern void xfs_hex_dump(void *p, int length); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index cd29f6171021..a44dba5b2cdb 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -405,7 +405,7 @@ typedef struct xfs_trans { int64_t t_res_fdblocks_delta; /* on-disk only chg */ int64_t t_frextents_delta;/* superblock freextents chg*/ int64_t t_res_frextents_delta; /* on-disk only chg */ -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) int64_t t_ag_freeblks_delta; /* debugging counter */ int64_t t_ag_flist_delta; /* debugging counter */ int64_t t_ag_btree_delta; /* debugging counter */ @@ -433,7 +433,7 @@ typedef struct xfs_trans { #define xfs_trans_get_block_res(tp) ((tp)->t_blk_res) #define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC) -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) #define xfs_trans_agblocks_delta(tp, d) ((tp)->t_ag_freeblks_delta += (int64_t)d) #define xfs_trans_agflist_delta(tp, d) ((tp)->t_ag_flist_delta += (int64_t)d) #define xfs_trans_agbtree_delta(tp, d) ((tp)->t_ag_btree_delta += (int64_t)d) |