diff options
Diffstat (limited to 'fs')
164 files changed, 7641 insertions, 6025 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 58e6cbce4156..08f2e1e9a7e6 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -603,10 +603,11 @@ static int v9fs_cache_register(void) if (ret < 0) return ret; #ifdef CONFIG_9P_FSCACHE - return fscache_register_netfs(&v9fs_cache_netfs); -#else - return ret; + ret = fscache_register_netfs(&v9fs_cache_netfs); + if (ret < 0) + v9fs_destroy_inode_cache(); #endif + return ret; } static void v9fs_cache_unregister(void) diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 53687bbf2296..a7c481402c46 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -267,14 +267,8 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, } /* Only creates */ - if (!(flags & O_CREAT)) + if (!(flags & O_CREAT) || dentry->d_inode) return finish_no_open(file, res); - else if (dentry->d_inode) { - if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) - return -EEXIST; - else - return finish_no_open(file, res); - } v9ses = v9fs_inode2v9ses(dir); diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 5f95d1ed9c6d..b9acadafa4a1 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -50,7 +50,7 @@ static void adfs_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); } static int adfs_write_begin(struct file *file, struct address_space *mapping, diff --git a/fs/affs/file.c b/fs/affs/file.c index 776e3935a758..8669b6ecddee 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -406,7 +406,7 @@ static void affs_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) { - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); affs_truncate(inode); } } @@ -26,6 +26,7 @@ #include <linux/mm.h> #include <linux/mman.h> #include <linux/mmu_context.h> +#include <linux/percpu.h> #include <linux/slab.h> #include <linux/timer.h> #include <linux/aio.h> @@ -35,6 +36,10 @@ #include <linux/eventfd.h> #include <linux/blkdev.h> #include <linux/compat.h> +#include <linux/anon_inodes.h> +#include <linux/migrate.h> +#include <linux/ramfs.h> +#include <linux/percpu-refcount.h> #include <asm/kmap_types.h> #include <asm/uaccess.h> @@ -61,14 +66,29 @@ struct aio_ring { #define AIO_RING_PAGES 8 +struct kioctx_table { + struct rcu_head rcu; + unsigned nr; + struct kioctx *table[]; +}; + +struct kioctx_cpu { + unsigned reqs_available; +}; + struct kioctx { - atomic_t users; + struct percpu_ref users; atomic_t dead; - /* This needs improving */ unsigned long user_id; - struct hlist_node list; + struct __percpu kioctx_cpu *cpu; + + /* + * For percpu reqs_available, number of slots we move to/from global + * counter at a time: + */ + unsigned req_batch; /* * This is what userspace passed to io_setup(), it's not used for * anything but counting against the global max_reqs quota. @@ -88,10 +108,18 @@ struct kioctx { long nr_pages; struct rcu_head rcu_head; - struct work_struct rcu_work; + struct work_struct free_work; struct { - atomic_t reqs_active; + /* + * This counts the number of available slots in the ringbuffer, + * so we avoid overflowing it: it's decremented (if positive) + * when allocating a kiocb and incremented when the resulting + * io_event is pulled off the ringbuffer. + * + * We batch accesses to it with a percpu version. + */ + atomic_t reqs_available; } ____cacheline_aligned_in_smp; struct { @@ -110,6 +138,9 @@ struct kioctx { } ____cacheline_aligned_in_smp; struct page *internal_pages[AIO_RING_PAGES]; + struct file *aio_ring_file; + + unsigned id; }; /*------ sysctl variables----*/ @@ -138,15 +169,77 @@ __initcall(aio_setup); static void aio_free_ring(struct kioctx *ctx) { - long i; + int i; + struct file *aio_ring_file = ctx->aio_ring_file; - for (i = 0; i < ctx->nr_pages; i++) + for (i = 0; i < ctx->nr_pages; i++) { + pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i, + page_count(ctx->ring_pages[i])); put_page(ctx->ring_pages[i]); + } if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) kfree(ctx->ring_pages); + + if (aio_ring_file) { + truncate_setsize(aio_ring_file->f_inode, 0); + fput(aio_ring_file); + ctx->aio_ring_file = NULL; + } +} + +static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_ops = &generic_file_vm_ops; + return 0; } +static const struct file_operations aio_ring_fops = { + .mmap = aio_ring_mmap, +}; + +static int aio_set_page_dirty(struct page *page) +{ + return 0; +} + +#if IS_ENABLED(CONFIG_MIGRATION) +static int aio_migratepage(struct address_space *mapping, struct page *new, + struct page *old, enum migrate_mode mode) +{ + struct kioctx *ctx = mapping->private_data; + unsigned long flags; + unsigned idx = old->index; + int rc; + + /* Writeback must be complete */ + BUG_ON(PageWriteback(old)); + put_page(old); + + rc = migrate_page_move_mapping(mapping, new, old, NULL, mode); + if (rc != MIGRATEPAGE_SUCCESS) { + get_page(old); + return rc; + } + + get_page(new); + + spin_lock_irqsave(&ctx->completion_lock, flags); + migrate_page_copy(new, old); + ctx->ring_pages[idx] = new; + spin_unlock_irqrestore(&ctx->completion_lock, flags); + + return rc; +} +#endif + +static const struct address_space_operations aio_ctx_aops = { + .set_page_dirty = aio_set_page_dirty, +#if IS_ENABLED(CONFIG_MIGRATION) + .migratepage = aio_migratepage, +#endif +}; + static int aio_setup_ring(struct kioctx *ctx) { struct aio_ring *ring; @@ -154,20 +247,45 @@ static int aio_setup_ring(struct kioctx *ctx) struct mm_struct *mm = current->mm; unsigned long size, populate; int nr_pages; + int i; + struct file *file; /* Compensate for the ring buffer's head/tail overlap entry */ nr_events += 2; /* 1 is required, 2 for good luck */ size = sizeof(struct aio_ring); size += sizeof(struct io_event) * nr_events; - nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; + nr_pages = PFN_UP(size); if (nr_pages < 0) return -EINVAL; - nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); + file = anon_inode_getfile_private("[aio]", &aio_ring_fops, ctx, O_RDWR); + if (IS_ERR(file)) { + ctx->aio_ring_file = NULL; + return -EAGAIN; + } + + file->f_inode->i_mapping->a_ops = &aio_ctx_aops; + file->f_inode->i_mapping->private_data = ctx; + file->f_inode->i_size = PAGE_SIZE * (loff_t)nr_pages; + + for (i = 0; i < nr_pages; i++) { + struct page *page; + page = find_or_create_page(file->f_inode->i_mapping, + i, GFP_HIGHUSER | __GFP_ZERO); + if (!page) + break; + pr_debug("pid(%d) page[%d]->count=%d\n", + current->pid, i, page_count(page)); + SetPageUptodate(page); + SetPageDirty(page); + unlock_page(page); + } + ctx->aio_ring_file = file; + nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) + / sizeof(struct io_event); - ctx->nr_events = 0; ctx->ring_pages = ctx->internal_pages; if (nr_pages > AIO_RING_PAGES) { ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), @@ -178,10 +296,11 @@ static int aio_setup_ring(struct kioctx *ctx) ctx->mmap_size = nr_pages * PAGE_SIZE; pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size); + down_write(&mm->mmap_sem); - ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size, - PROT_READ|PROT_WRITE, - MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate); + ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, 0, &populate); if (IS_ERR((void *)ctx->mmap_base)) { up_write(&mm->mmap_sem); ctx->mmap_size = 0; @@ -190,23 +309,34 @@ static int aio_setup_ring(struct kioctx *ctx) } pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); + + /* We must do this while still holding mmap_sem for write, as we + * need to be protected against userspace attempting to mremap() + * or munmap() the ring buffer. + */ ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages, 1, 0, ctx->ring_pages, NULL); + + /* Dropping the reference here is safe as the page cache will hold + * onto the pages for us. It is also required so that page migration + * can unmap the pages and get the right reference count. + */ + for (i = 0; i < ctx->nr_pages; i++) + put_page(ctx->ring_pages[i]); + up_write(&mm->mmap_sem); if (unlikely(ctx->nr_pages != nr_pages)) { aio_free_ring(ctx); return -EAGAIN; } - if (populate) - mm_populate(ctx->mmap_base, populate); ctx->user_id = ctx->mmap_base; ctx->nr_events = nr_events; /* trusted copy */ ring = kmap_atomic(ctx->ring_pages[0]); ring->nr = nr_events; /* user copy */ - ring->id = ctx->user_id; + ring->id = ~0U; ring->head = ring->tail = 0; ring->magic = AIO_RING_MAGIC; ring->compat_features = AIO_RING_COMPAT_FEATURES; @@ -238,11 +368,9 @@ void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) } EXPORT_SYMBOL(kiocb_set_cancel_fn); -static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb, - struct io_event *res) +static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb) { kiocb_cancel_fn *old, *cancel; - int ret = -EINVAL; /* * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it @@ -252,28 +380,20 @@ static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb, cancel = ACCESS_ONCE(kiocb->ki_cancel); do { if (!cancel || cancel == KIOCB_CANCELLED) - return ret; + return -EINVAL; old = cancel; cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED); } while (cancel != old); - atomic_inc(&kiocb->ki_users); - spin_unlock_irq(&ctx->ctx_lock); - - memset(res, 0, sizeof(*res)); - res->obj = (u64)(unsigned long)kiocb->ki_obj.user; - res->data = kiocb->ki_user_data; - ret = cancel(kiocb, res); - - spin_lock_irq(&ctx->ctx_lock); - - return ret; + return cancel(kiocb); } static void free_ioctx_rcu(struct rcu_head *head) { struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); + + free_percpu(ctx->cpu); kmem_cache_free(kioctx_cachep, ctx); } @@ -282,12 +402,13 @@ static void free_ioctx_rcu(struct rcu_head *head) * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted - * now it's safe to cancel any that need to be. */ -static void free_ioctx(struct kioctx *ctx) +static void free_ioctx(struct work_struct *work) { + struct kioctx *ctx = container_of(work, struct kioctx, free_work); struct aio_ring *ring; - struct io_event res; struct kiocb *req; - unsigned head, avail; + unsigned cpu, avail; + DEFINE_WAIT(wait); spin_lock_irq(&ctx->ctx_lock); @@ -296,28 +417,38 @@ static void free_ioctx(struct kioctx *ctx) struct kiocb, ki_list); list_del_init(&req->ki_list); - kiocb_cancel(ctx, req, &res); + kiocb_cancel(ctx, req); } spin_unlock_irq(&ctx->ctx_lock); - ring = kmap_atomic(ctx->ring_pages[0]); - head = ring->head; - kunmap_atomic(ring); + for_each_possible_cpu(cpu) { + struct kioctx_cpu *kcpu = per_cpu_ptr(ctx->cpu, cpu); - while (atomic_read(&ctx->reqs_active) > 0) { - wait_event(ctx->wait, - head != ctx->tail || - atomic_read(&ctx->reqs_active) <= 0); + atomic_add(kcpu->reqs_available, &ctx->reqs_available); + kcpu->reqs_available = 0; + } - avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; + while (1) { + prepare_to_wait(&ctx->wait, &wait, TASK_UNINTERRUPTIBLE); - atomic_sub(avail, &ctx->reqs_active); - head += avail; - head %= ctx->nr_events; + ring = kmap_atomic(ctx->ring_pages[0]); + avail = (ring->head <= ring->tail) + ? ring->tail - ring->head + : ctx->nr_events - ring->head + ring->tail; + + atomic_add(avail, &ctx->reqs_available); + ring->head = ring->tail; + kunmap_atomic(ring); + + if (atomic_read(&ctx->reqs_available) >= ctx->nr_events - 1) + break; + + schedule(); } + finish_wait(&ctx->wait, &wait); - WARN_ON(atomic_read(&ctx->reqs_active) < 0); + WARN_ON(atomic_read(&ctx->reqs_available) > ctx->nr_events - 1); aio_free_ring(ctx); @@ -333,10 +464,68 @@ static void free_ioctx(struct kioctx *ctx) call_rcu(&ctx->rcu_head, free_ioctx_rcu); } -static void put_ioctx(struct kioctx *ctx) +static void free_ioctx_ref(struct percpu_ref *ref) { - if (unlikely(atomic_dec_and_test(&ctx->users))) - free_ioctx(ctx); + struct kioctx *ctx = container_of(ref, struct kioctx, users); + + INIT_WORK(&ctx->free_work, free_ioctx); + schedule_work(&ctx->free_work); +} + +static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) +{ + unsigned i, new_nr; + struct kioctx_table *table, *old; + struct aio_ring *ring; + + spin_lock(&mm->ioctx_lock); + rcu_read_lock(); + table = rcu_dereference(mm->ioctx_table); + + while (1) { + if (table) + for (i = 0; i < table->nr; i++) + if (!table->table[i]) { + ctx->id = i; + table->table[i] = ctx; + rcu_read_unlock(); + spin_unlock(&mm->ioctx_lock); + + ring = kmap_atomic(ctx->ring_pages[0]); + ring->id = ctx->id; + kunmap_atomic(ring); + return 0; + } + + new_nr = (table ? table->nr : 1) * 4; + + rcu_read_unlock(); + spin_unlock(&mm->ioctx_lock); + + table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) * + new_nr, GFP_KERNEL); + if (!table) + return -ENOMEM; + + table->nr = new_nr; + + spin_lock(&mm->ioctx_lock); + rcu_read_lock(); + old = rcu_dereference(mm->ioctx_table); + + if (!old) { + rcu_assign_pointer(mm->ioctx_table, table); + } else if (table->nr > old->nr) { + memcpy(table->table, old->table, + old->nr * sizeof(struct kioctx *)); + + rcu_assign_pointer(mm->ioctx_table, table); + kfree_rcu(old, rcu); + } else { + kfree(table); + table = old; + } + } } /* ioctx_alloc @@ -348,6 +537,18 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) struct kioctx *ctx; int err = -ENOMEM; + /* + * We keep track of the number of available ringbuffer slots, to prevent + * overflow (reqs_available), and we also use percpu counters for this. + * + * So since up to half the slots might be on other cpu's percpu counters + * and unavailable, double nr_events so userspace sees what they + * expected: additionally, we move req_batch slots to/from percpu + * counters at a time, so make sure that isn't 0: + */ + nr_events = max(nr_events, num_possible_cpus() * 4); + nr_events *= 2; + /* Prevent overflows */ if ((nr_events > (0x10000000U / sizeof(struct io_event))) || (nr_events > (0x10000000U / sizeof(struct kiocb)))) { @@ -355,7 +556,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) return ERR_PTR(-EINVAL); } - if (!nr_events || (unsigned long)nr_events > aio_max_nr) + if (!nr_events || (unsigned long)nr_events > (aio_max_nr * 2UL)) return ERR_PTR(-EAGAIN); ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL); @@ -364,8 +565,9 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) ctx->max_reqs = nr_events; - atomic_set(&ctx->users, 2); - atomic_set(&ctx->dead, 0); + if (percpu_ref_init(&ctx->users, free_ioctx_ref)) + goto out_freectx; + spin_lock_init(&ctx->ctx_lock); spin_lock_init(&ctx->completion_lock); mutex_init(&ctx->ring_lock); @@ -373,12 +575,21 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) INIT_LIST_HEAD(&ctx->active_reqs); + ctx->cpu = alloc_percpu(struct kioctx_cpu); + if (!ctx->cpu) + goto out_freeref; + if (aio_setup_ring(ctx) < 0) - goto out_freectx; + goto out_freepcpu; + + atomic_set(&ctx->reqs_available, ctx->nr_events - 1); + ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4); + if (ctx->req_batch < 1) + ctx->req_batch = 1; /* limit the number of system wide aios */ spin_lock(&aio_nr_lock); - if (aio_nr + nr_events > aio_max_nr || + if (aio_nr + nr_events > (aio_max_nr * 2UL) || aio_nr + nr_events < aio_nr) { spin_unlock(&aio_nr_lock); goto out_cleanup; @@ -386,49 +597,54 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) aio_nr += ctx->max_reqs; spin_unlock(&aio_nr_lock); - /* now link into global list. */ - spin_lock(&mm->ioctx_lock); - hlist_add_head_rcu(&ctx->list, &mm->ioctx_list); - spin_unlock(&mm->ioctx_lock); + percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */ + + err = ioctx_add_table(ctx, mm); + if (err) + goto out_cleanup_put; pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", ctx, ctx->user_id, mm, ctx->nr_events); return ctx; +out_cleanup_put: + percpu_ref_put(&ctx->users); out_cleanup: err = -EAGAIN; aio_free_ring(ctx); +out_freepcpu: + free_percpu(ctx->cpu); +out_freeref: + free_percpu(ctx->users.pcpu_count); out_freectx: + if (ctx->aio_ring_file) + fput(ctx->aio_ring_file); kmem_cache_free(kioctx_cachep, ctx); pr_debug("error allocating ioctx %d\n", err); return ERR_PTR(err); } -static void kill_ioctx_work(struct work_struct *work) -{ - struct kioctx *ctx = container_of(work, struct kioctx, rcu_work); - - wake_up_all(&ctx->wait); - put_ioctx(ctx); -} - -static void kill_ioctx_rcu(struct rcu_head *head) -{ - struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); - - INIT_WORK(&ctx->rcu_work, kill_ioctx_work); - schedule_work(&ctx->rcu_work); -} - /* kill_ioctx * Cancels all outstanding aio requests on an aio context. Used * when the processes owning a context have all exited to encourage * the rapid destruction of the kioctx. */ -static void kill_ioctx(struct kioctx *ctx) +static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx) { if (!atomic_xchg(&ctx->dead, 1)) { - hlist_del_rcu(&ctx->list); + struct kioctx_table *table; + + spin_lock(&mm->ioctx_lock); + rcu_read_lock(); + table = rcu_dereference(mm->ioctx_table); + + WARN_ON(ctx != table->table[ctx->id]); + table->table[ctx->id] = NULL; + rcu_read_unlock(); + spin_unlock(&mm->ioctx_lock); + + /* percpu_ref_kill() will do the necessary call_rcu() */ + wake_up_all(&ctx->wait); /* * It'd be more correct to do this in free_ioctx(), after all @@ -445,24 +661,23 @@ static void kill_ioctx(struct kioctx *ctx) if (ctx->mmap_size) vm_munmap(ctx->mmap_base, ctx->mmap_size); - /* Between hlist_del_rcu() and dropping the initial ref */ - call_rcu(&ctx->rcu_head, kill_ioctx_rcu); + percpu_ref_kill(&ctx->users); } } /* wait_on_sync_kiocb: * Waits on the given sync kiocb to complete. */ -ssize_t wait_on_sync_kiocb(struct kiocb *iocb) +ssize_t wait_on_sync_kiocb(struct kiocb *req) { - while (atomic_read(&iocb->ki_users)) { + while (!req->ki_ctx) { set_current_state(TASK_UNINTERRUPTIBLE); - if (!atomic_read(&iocb->ki_users)) + if (req->ki_ctx) break; io_schedule(); } __set_current_state(TASK_RUNNING); - return iocb->ki_user_data; + return req->ki_user_data; } EXPORT_SYMBOL(wait_on_sync_kiocb); @@ -476,16 +691,28 @@ EXPORT_SYMBOL(wait_on_sync_kiocb); */ void exit_aio(struct mm_struct *mm) { + struct kioctx_table *table; struct kioctx *ctx; - struct hlist_node *n; - - hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) { - if (1 != atomic_read(&ctx->users)) - printk(KERN_DEBUG - "exit_aio:ioctx still alive: %d %d %d\n", - atomic_read(&ctx->users), - atomic_read(&ctx->dead), - atomic_read(&ctx->reqs_active)); + unsigned i = 0; + + while (1) { + rcu_read_lock(); + table = rcu_dereference(mm->ioctx_table); + + do { + if (!table || i >= table->nr) { + rcu_read_unlock(); + rcu_assign_pointer(mm->ioctx_table, NULL); + if (table) + kfree(table); + return; + } + + ctx = table->table[i++]; + } while (!ctx); + + rcu_read_unlock(); + /* * We don't need to bother with munmap() here - * exit_mmap(mm) is coming and it'll unmap everything. @@ -496,40 +723,75 @@ void exit_aio(struct mm_struct *mm) */ ctx->mmap_size = 0; - kill_ioctx(ctx); + kill_ioctx(mm, ctx); + } +} + +static void put_reqs_available(struct kioctx *ctx, unsigned nr) +{ + struct kioctx_cpu *kcpu; + + preempt_disable(); + kcpu = this_cpu_ptr(ctx->cpu); + + kcpu->reqs_available += nr; + while (kcpu->reqs_available >= ctx->req_batch * 2) { + kcpu->reqs_available -= ctx->req_batch; + atomic_add(ctx->req_batch, &ctx->reqs_available); + } + + preempt_enable(); +} + +static bool get_reqs_available(struct kioctx *ctx) +{ + struct kioctx_cpu *kcpu; + bool ret = false; + + preempt_disable(); + kcpu = this_cpu_ptr(ctx->cpu); + + if (!kcpu->reqs_available) { + int old, avail = atomic_read(&ctx->reqs_available); + + do { + if (avail < ctx->req_batch) + goto out; + + old = avail; + avail = atomic_cmpxchg(&ctx->reqs_available, + avail, avail - ctx->req_batch); + } while (avail != old); + + kcpu->reqs_available += ctx->req_batch; } + + ret = true; + kcpu->reqs_available--; +out: + preempt_enable(); + return ret; } /* aio_get_req - * Allocate a slot for an aio request. Increments the ki_users count - * of the kioctx so that the kioctx stays around until all requests are - * complete. Returns NULL if no requests are free. - * - * Returns with kiocb->ki_users set to 2. The io submit code path holds - * an extra reference while submitting the i/o. - * This prevents races between the aio code path referencing the - * req (after submitting it) and aio_complete() freeing the req. + * Allocate a slot for an aio request. + * Returns NULL if no requests are free. */ static inline struct kiocb *aio_get_req(struct kioctx *ctx) { struct kiocb *req; - if (atomic_read(&ctx->reqs_active) >= ctx->nr_events) + if (!get_reqs_available(ctx)) return NULL; - if (atomic_inc_return(&ctx->reqs_active) > ctx->nr_events - 1) - goto out_put; - req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); if (unlikely(!req)) goto out_put; - atomic_set(&req->ki_users, 2); req->ki_ctx = ctx; - return req; out_put: - atomic_dec(&ctx->reqs_active); + put_reqs_available(ctx, 1); return NULL; } @@ -539,35 +801,32 @@ static void kiocb_free(struct kiocb *req) fput(req->ki_filp); if (req->ki_eventfd != NULL) eventfd_ctx_put(req->ki_eventfd); - if (req->ki_dtor) - req->ki_dtor(req); - if (req->ki_iovec != &req->ki_inline_vec) - kfree(req->ki_iovec); kmem_cache_free(kiocb_cachep, req); } -void aio_put_req(struct kiocb *req) -{ - if (atomic_dec_and_test(&req->ki_users)) - kiocb_free(req); -} -EXPORT_SYMBOL(aio_put_req); - static struct kioctx *lookup_ioctx(unsigned long ctx_id) { + struct aio_ring __user *ring = (void __user *)ctx_id; struct mm_struct *mm = current->mm; struct kioctx *ctx, *ret = NULL; + struct kioctx_table *table; + unsigned id; + + if (get_user(id, &ring->id)) + return NULL; rcu_read_lock(); + table = rcu_dereference(mm->ioctx_table); - hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) { - if (ctx->user_id == ctx_id) { - atomic_inc(&ctx->users); - ret = ctx; - break; - } - } + if (!table || id >= table->nr) + goto out; + ctx = table->table[id]; + if (ctx && ctx->user_id == ctx_id) { + percpu_ref_get(&ctx->users); + ret = ctx; + } +out: rcu_read_unlock(); return ret; } @@ -591,16 +850,16 @@ void aio_complete(struct kiocb *iocb, long res, long res2) * - the sync task helpfully left a reference to itself in the iocb */ if (is_sync_kiocb(iocb)) { - BUG_ON(atomic_read(&iocb->ki_users) != 1); iocb->ki_user_data = res; - atomic_set(&iocb->ki_users, 0); + smp_wmb(); + iocb->ki_ctx = ERR_PTR(-EXDEV); wake_up_process(iocb->ki_obj.tsk); return; } /* * Take rcu_read_lock() in case the kioctx is being destroyed, as we - * need to issue a wakeup after decrementing reqs_active. + * need to issue a wakeup after incrementing reqs_available. */ rcu_read_lock(); @@ -613,17 +872,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2) } /* - * cancelled requests don't get events, userland was given one - * when the event got cancelled. - */ - if (unlikely(xchg(&iocb->ki_cancel, - KIOCB_CANCELLED) == KIOCB_CANCELLED)) { - atomic_dec(&ctx->reqs_active); - /* Still need the wake_up in case free_ioctx is waiting */ - goto put_rq; - } - - /* * Add a completion event to the ring buffer. Must be done holding * ctx->completion_lock to prevent other code from messing with the tail * pointer since we might be called from irq context. @@ -675,9 +923,8 @@ void aio_complete(struct kiocb *iocb, long res, long res2) if (iocb->ki_eventfd != NULL) eventfd_signal(iocb->ki_eventfd, 1); -put_rq: /* everything turned out well, dispose of the aiocb. */ - aio_put_req(iocb); + kiocb_free(iocb); /* * We have to order our ring_info tail store above and test @@ -702,7 +949,7 @@ static long aio_read_events_ring(struct kioctx *ctx, struct io_event __user *event, long nr) { struct aio_ring *ring; - unsigned head, pos; + unsigned head, tail, pos; long ret = 0; int copy_ret; @@ -710,11 +957,12 @@ static long aio_read_events_ring(struct kioctx *ctx, ring = kmap_atomic(ctx->ring_pages[0]); head = ring->head; + tail = ring->tail; kunmap_atomic(ring); - pr_debug("h%u t%u m%u\n", head, ctx->tail, ctx->nr_events); + pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events); - if (head == ctx->tail) + if (head == tail) goto out; while (ret < nr) { @@ -722,8 +970,8 @@ static long aio_read_events_ring(struct kioctx *ctx, struct io_event *ev; struct page *page; - avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; - if (head == ctx->tail) + avail = (head <= tail ? tail : ctx->nr_events) - head; + if (head == tail) break; avail = min(avail, nr - ret); @@ -754,9 +1002,9 @@ static long aio_read_events_ring(struct kioctx *ctx, kunmap_atomic(ring); flush_dcache_page(ctx->ring_pages[0]); - pr_debug("%li h%u t%u\n", ret, head, ctx->tail); + pr_debug("%li h%u t%u\n", ret, head, tail); - atomic_sub(ret, &ctx->reqs_active); + put_reqs_available(ctx, ret); out: mutex_unlock(&ctx->ring_lock); @@ -854,8 +1102,8 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); if (ret) - kill_ioctx(ioctx); - put_ioctx(ioctx); + kill_ioctx(current->mm, ioctx); + percpu_ref_put(&ioctx->users); } out: @@ -872,101 +1120,37 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) { struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { - kill_ioctx(ioctx); - put_ioctx(ioctx); + kill_ioctx(current->mm, ioctx); + percpu_ref_put(&ioctx->users); return 0; } pr_debug("EINVAL: io_destroy: invalid context id\n"); return -EINVAL; } -static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret) -{ - struct iovec *iov = &iocb->ki_iovec[iocb->ki_cur_seg]; - - BUG_ON(ret <= 0); - - while (iocb->ki_cur_seg < iocb->ki_nr_segs && ret > 0) { - ssize_t this = min((ssize_t)iov->iov_len, ret); - iov->iov_base += this; - iov->iov_len -= this; - iocb->ki_left -= this; - ret -= this; - if (iov->iov_len == 0) { - iocb->ki_cur_seg++; - iov++; - } - } - - /* the caller should not have done more io than what fit in - * the remaining iovecs */ - BUG_ON(ret > 0 && iocb->ki_left == 0); -} - typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *, unsigned long, loff_t); -static ssize_t aio_rw_vect_retry(struct kiocb *iocb, int rw, aio_rw_op *rw_op) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - ssize_t ret = 0; - - /* This matches the pread()/pwrite() logic */ - if (iocb->ki_pos < 0) - return -EINVAL; - - if (rw == WRITE) - file_start_write(file); - do { - ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg], - iocb->ki_nr_segs - iocb->ki_cur_seg, - iocb->ki_pos); - if (ret > 0) - aio_advance_iovec(iocb, ret); - - /* retry all partial writes. retry partial reads as long as its a - * regular file. */ - } while (ret > 0 && iocb->ki_left > 0 && - (rw == WRITE || - (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode)))); - if (rw == WRITE) - file_end_write(file); - - /* This means we must have transferred all that we could */ - /* No need to retry anymore */ - if ((ret == 0) || (iocb->ki_left == 0)) - ret = iocb->ki_nbytes - iocb->ki_left; - - /* If we managed to write some out we return that, rather than - * the eventual error. */ - if (rw == WRITE - && ret < 0 && ret != -EIOCBQUEUED - && iocb->ki_nbytes - iocb->ki_left) - ret = iocb->ki_nbytes - iocb->ki_left; - - return ret; -} - -static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat) +static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb, + int rw, char __user *buf, + unsigned long *nr_segs, + struct iovec **iovec, + bool compat) { ssize_t ret; - kiocb->ki_nr_segs = kiocb->ki_nbytes; + *nr_segs = kiocb->ki_nbytes; #ifdef CONFIG_COMPAT if (compat) ret = compat_rw_copy_check_uvector(rw, - (struct compat_iovec __user *)kiocb->ki_buf, - kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, - &kiocb->ki_iovec); + (struct compat_iovec __user *)buf, + *nr_segs, 1, *iovec, iovec); else #endif ret = rw_copy_check_uvector(rw, - (struct iovec __user *)kiocb->ki_buf, - kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, - &kiocb->ki_iovec); + (struct iovec __user *)buf, + *nr_segs, 1, *iovec, iovec); if (ret < 0) return ret; @@ -975,15 +1159,17 @@ static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat) return 0; } -static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb) +static ssize_t aio_setup_single_vector(struct kiocb *kiocb, + int rw, char __user *buf, + unsigned long *nr_segs, + struct iovec *iovec) { - if (unlikely(!access_ok(!rw, kiocb->ki_buf, kiocb->ki_nbytes))) + if (unlikely(!access_ok(!rw, buf, kiocb->ki_nbytes))) return -EFAULT; - kiocb->ki_iovec = &kiocb->ki_inline_vec; - kiocb->ki_iovec->iov_base = kiocb->ki_buf; - kiocb->ki_iovec->iov_len = kiocb->ki_nbytes; - kiocb->ki_nr_segs = 1; + iovec->iov_base = buf; + iovec->iov_len = kiocb->ki_nbytes; + *nr_segs = 1; return 0; } @@ -992,15 +1178,18 @@ static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb) * Performs the initial checks and aio retry method * setup for the kiocb at the time of io submission. */ -static ssize_t aio_run_iocb(struct kiocb *req, bool compat) +static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, + char __user *buf, bool compat) { struct file *file = req->ki_filp; ssize_t ret; + unsigned long nr_segs; int rw; fmode_t mode; aio_rw_op *rw_op; + struct iovec inline_vec, *iovec = &inline_vec; - switch (req->ki_opcode) { + switch (opcode) { case IOCB_CMD_PREAD: case IOCB_CMD_PREADV: mode = FMODE_READ; @@ -1021,21 +1210,38 @@ rw_common: if (!rw_op) return -EINVAL; - ret = (req->ki_opcode == IOCB_CMD_PREADV || - req->ki_opcode == IOCB_CMD_PWRITEV) - ? aio_setup_vectored_rw(rw, req, compat) - : aio_setup_single_vector(rw, req); + ret = (opcode == IOCB_CMD_PREADV || + opcode == IOCB_CMD_PWRITEV) + ? aio_setup_vectored_rw(req, rw, buf, &nr_segs, + &iovec, compat) + : aio_setup_single_vector(req, rw, buf, &nr_segs, + iovec); if (ret) return ret; ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); - if (ret < 0) + if (ret < 0) { + if (iovec != &inline_vec) + kfree(iovec); return ret; + } req->ki_nbytes = ret; - req->ki_left = ret; - ret = aio_rw_vect_retry(req, rw, rw_op); + /* XXX: move/kill - rw_verify_area()? */ + /* This matches the pread()/pwrite() logic */ + if (req->ki_pos < 0) { + ret = -EINVAL; + break; + } + + if (rw == WRITE) + file_start_write(file); + + ret = rw_op(req, iovec, nr_segs, req->ki_pos); + + if (rw == WRITE) + file_end_write(file); break; case IOCB_CMD_FDSYNC: @@ -1057,6 +1263,9 @@ rw_common: return -EINVAL; } + if (iovec != &inline_vec) + kfree(iovec); + if (ret != -EIOCBQUEUED) { /* * There's no easy way to restart the syscall since other AIO's @@ -1128,21 +1337,18 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->ki_obj.user = user_iocb; req->ki_user_data = iocb->aio_data; req->ki_pos = iocb->aio_offset; + req->ki_nbytes = iocb->aio_nbytes; - req->ki_buf = (char __user *)(unsigned long)iocb->aio_buf; - req->ki_left = req->ki_nbytes = iocb->aio_nbytes; - req->ki_opcode = iocb->aio_lio_opcode; - - ret = aio_run_iocb(req, compat); + ret = aio_run_iocb(req, iocb->aio_lio_opcode, + (char __user *)(unsigned long)iocb->aio_buf, + compat); if (ret) goto out_put_req; - aio_put_req(req); /* drop extra ref to req */ return 0; out_put_req: - atomic_dec(&ctx->reqs_active); - aio_put_req(req); /* drop extra ref to req */ - aio_put_req(req); /* drop i/o ref to req */ + put_reqs_available(ctx, 1); + kiocb_free(req); return ret; } @@ -1195,7 +1401,7 @@ long do_io_submit(aio_context_t ctx_id, long nr, } blk_finish_plug(&plug); - put_ioctx(ctx); + percpu_ref_put(&ctx->users); return i ? i : ret; } @@ -1252,7 +1458,6 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, struct io_event __user *, result) { - struct io_event res; struct kioctx *ctx; struct kiocb *kiocb; u32 key; @@ -1270,21 +1475,22 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, kiocb = lookup_kiocb(ctx, iocb, key); if (kiocb) - ret = kiocb_cancel(ctx, kiocb, &res); + ret = kiocb_cancel(ctx, kiocb); else ret = -EINVAL; spin_unlock_irq(&ctx->ctx_lock); if (!ret) { - /* Cancellation succeeded -- copy the result - * into the user's buffer. + /* + * The result argument is no longer used - the io_event is + * always delivered via the ring buffer. -EINPROGRESS indicates + * cancellation is progress: */ - if (copy_to_user(result, &res, sizeof(res))) - ret = -EFAULT; + ret = -EINPROGRESS; } - put_ioctx(ctx); + percpu_ref_put(&ctx->users); return ret; } @@ -1313,7 +1519,7 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, if (likely(ioctx)) { if (likely(min_nr <= nr && min_nr >= 0)) ret = read_events(ioctx, min_nr, nr, events, timeout); - put_ioctx(ioctx); + percpu_ref_put(&ioctx->users); } return ret; } diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 47a65df8c871..85c961849953 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -109,6 +109,72 @@ static struct file_system_type anon_inode_fs_type = { }; /** + * anon_inode_getfile_private - creates a new file instance by hooking it up to an + * anonymous inode, and a dentry that describe the "class" + * of the file + * + * @name: [in] name of the "class" of the new file + * @fops: [in] file operations for the new file + * @priv: [in] private data for the new file (will be file's private_data) + * @flags: [in] flags + * + * + * Similar to anon_inode_getfile, but each file holds a single inode. + * + */ +struct file *anon_inode_getfile_private(const char *name, + const struct file_operations *fops, + void *priv, int flags) +{ + struct qstr this; + struct path path; + struct file *file; + struct inode *inode; + + if (fops->owner && !try_module_get(fops->owner)) + return ERR_PTR(-ENOENT); + + inode = anon_inode_mkinode(anon_inode_mnt->mnt_sb); + if (IS_ERR(inode)) { + file = ERR_PTR(-ENOMEM); + goto err_module; + } + + /* + * Link the inode to a directory entry by creating a unique name + * using the inode sequence number. + */ + file = ERR_PTR(-ENOMEM); + this.name = name; + this.len = strlen(name); + this.hash = 0; + path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this); + if (!path.dentry) + goto err_module; + + path.mnt = mntget(anon_inode_mnt); + + d_instantiate(path.dentry, inode); + + file = alloc_file(&path, OPEN_FMODE(flags), fops); + if (IS_ERR(file)) + goto err_dput; + + file->f_mapping = inode->i_mapping; + file->f_flags = flags & (O_ACCMODE | O_NONBLOCK); + file->private_data = priv; + + return file; + +err_dput: + path_put(&path); +err_module: + module_put(fops->owner); + return file; +} +EXPORT_SYMBOL_GPL(anon_inode_getfile_private); + +/** * anon_inode_getfile - creates a new file instance by hooking it up to an * anonymous inode, and a dentry that describe the "class" * of the file diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 3db70dae40d3..689e40d983ad 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -109,13 +109,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, pkt.hdr.proto_version = sbi->version; pkt.hdr.type = type; - mutex_lock(&sbi->wq_mutex); - /* Check if we have become catatonic */ - if (sbi->catatonic) { - mutex_unlock(&sbi->wq_mutex); - return; - } switch (type) { /* Kernel protocol v4 missing and expire packets */ case autofs_ptype_missing: @@ -427,7 +421,6 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, wq->tgid = current->tgid; wq->status = -EINTR; /* Status return if interrupted */ wq->wait_ctr = 2; - mutex_unlock(&sbi->wq_mutex); if (sbi->version < 5) { if (notify == NFY_MOUNT) @@ -449,15 +442,15 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, notify); - /* autofs4_notify_daemon() may block */ + /* autofs4_notify_daemon() may block; it will unlock ->wq_mutex */ autofs4_notify_daemon(sbi, wq, type); } else { wq->wait_ctr++; - mutex_unlock(&sbi->wq_mutex); - kfree(qstr.name); DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d", (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, notify); + mutex_unlock(&sbi->wq_mutex); + kfree(qstr.name); } /* diff --git a/fs/bfs/file.c b/fs/bfs/file.c index ad3ea1497cc3..ae2892218335 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -166,7 +166,7 @@ static void bfs_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); } static int bfs_write_begin(struct file *file, struct address_space *mapping, diff --git a/fs/block_dev.c b/fs/block_dev.c index 1173a4ee0830..1e86823a9cbd 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -592,7 +592,7 @@ static struct block_device *bd_acquire(struct inode *inode) return bdev; } -static inline int sb_is_blkdev_sb(struct super_block *sb) +int sb_is_blkdev_sb(struct super_block *sb) { return sb == blockdev_superblock; } @@ -1542,7 +1542,7 @@ static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, return 0; size -= pos; - if (size < iocb->ki_left) + if (size < iocb->ki_nbytes) nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size); return generic_file_aio_read(iocb, iov, nr_segs, pos); } diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 2b3b83296977..398cbd517be2 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -72,3 +72,12 @@ config BTRFS_DEBUG performance, or export extra information via sysfs. If unsure, say N. + +config BTRFS_ASSERT + bool "Btrfs assert support" + depends on BTRFS_FS + help + Enable run-time assertion checking. This will result in panics if + any of the assertions trip. This is meant for btrfs developers only. + + If unsure, say N. diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 3932224f99e9..a91a6a355cc5 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -8,7 +8,10 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ - reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o + reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ + uuid-tree.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o + +btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 8bc5e8ccb091..0552a599b28f 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -119,6 +119,26 @@ struct __prelim_ref { u64 wanted_disk_byte; }; +static struct kmem_cache *btrfs_prelim_ref_cache; + +int __init btrfs_prelim_ref_init(void) +{ + btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref", + sizeof(struct __prelim_ref), + 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); + if (!btrfs_prelim_ref_cache) + return -ENOMEM; + return 0; +} + +void btrfs_prelim_ref_exit(void) +{ + if (btrfs_prelim_ref_cache) + kmem_cache_destroy(btrfs_prelim_ref_cache); +} + /* * the rules for all callers of this function are: * - obtaining the parent is the goal @@ -160,12 +180,12 @@ struct __prelim_ref { static int __add_prelim_ref(struct list_head *head, u64 root_id, struct btrfs_key *key, int level, - u64 parent, u64 wanted_disk_byte, int count) + u64 parent, u64 wanted_disk_byte, int count, + gfp_t gfp_mask) { struct __prelim_ref *ref; - /* in case we're adding delayed refs, we're holding the refs spinlock */ - ref = kmalloc(sizeof(*ref), GFP_ATOMIC); + ref = kmem_cache_alloc(btrfs_prelim_ref_cache, gfp_mask); if (!ref) return -ENOMEM; @@ -295,10 +315,9 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq); pr_debug("search slot in root %llu (level %d, ref count %d) returned " "%d for key (%llu %u %llu)\n", - (unsigned long long)ref->root_id, level, ref->count, ret, - (unsigned long long)ref->key_for_search.objectid, - ref->key_for_search.type, - (unsigned long long)ref->key_for_search.offset); + ref->root_id, level, ref->count, ret, + ref->key_for_search.objectid, ref->key_for_search.type, + ref->key_for_search.offset); if (ret < 0) goto out; @@ -365,11 +384,12 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, node = ulist_next(parents, &uiter); ref->parent = node ? node->val : 0; ref->inode_list = node ? - (struct extent_inode_elem *)(uintptr_t)node->aux : 0; + (struct extent_inode_elem *)(uintptr_t)node->aux : NULL; /* additional parents require new refs being added here */ while ((node = ulist_next(parents, &uiter))) { - new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS); + new_ref = kmem_cache_alloc(btrfs_prelim_ref_cache, + GFP_NOFS); if (!new_ref) { ret = -ENOMEM; goto out; @@ -493,7 +513,7 @@ static void __merge_refs(struct list_head *head, int mode) ref1->count += ref2->count; list_del(&ref2->list); - kfree(ref2); + kmem_cache_free(btrfs_prelim_ref_cache, ref2); } } @@ -548,7 +568,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, ref = btrfs_delayed_node_to_tree_ref(node); ret = __add_prelim_ref(prefs, ref->root, &op_key, ref->level + 1, 0, node->bytenr, - node->ref_mod * sgn); + node->ref_mod * sgn, GFP_ATOMIC); break; } case BTRFS_SHARED_BLOCK_REF_KEY: { @@ -558,7 +578,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, ret = __add_prelim_ref(prefs, ref->root, NULL, ref->level + 1, ref->parent, node->bytenr, - node->ref_mod * sgn); + node->ref_mod * sgn, GFP_ATOMIC); break; } case BTRFS_EXTENT_DATA_REF_KEY: { @@ -570,7 +590,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, key.offset = ref->offset; ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0, node->bytenr, - node->ref_mod * sgn); + node->ref_mod * sgn, GFP_ATOMIC); break; } case BTRFS_SHARED_DATA_REF_KEY: { @@ -583,7 +603,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, key.offset = ref->offset; ret = __add_prelim_ref(prefs, ref->root, &key, 0, ref->parent, node->bytenr, - node->ref_mod * sgn); + node->ref_mod * sgn, GFP_ATOMIC); break; } default: @@ -657,7 +677,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, case BTRFS_SHARED_BLOCK_REF_KEY: ret = __add_prelim_ref(prefs, 0, NULL, *info_level + 1, offset, - bytenr, 1); + bytenr, 1, GFP_NOFS); break; case BTRFS_SHARED_DATA_REF_KEY: { struct btrfs_shared_data_ref *sdref; @@ -666,13 +686,13 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, sdref = (struct btrfs_shared_data_ref *)(iref + 1); count = btrfs_shared_data_ref_count(leaf, sdref); ret = __add_prelim_ref(prefs, 0, NULL, 0, offset, - bytenr, count); + bytenr, count, GFP_NOFS); break; } case BTRFS_TREE_BLOCK_REF_KEY: ret = __add_prelim_ref(prefs, offset, NULL, *info_level + 1, 0, - bytenr, 1); + bytenr, 1, GFP_NOFS); break; case BTRFS_EXTENT_DATA_REF_KEY: { struct btrfs_extent_data_ref *dref; @@ -687,7 +707,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, key.offset = btrfs_extent_data_ref_offset(leaf, dref); root = btrfs_extent_data_ref_root(leaf, dref); ret = __add_prelim_ref(prefs, root, &key, 0, 0, - bytenr, count); + bytenr, count, GFP_NOFS); break; } default: @@ -738,7 +758,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, case BTRFS_SHARED_BLOCK_REF_KEY: ret = __add_prelim_ref(prefs, 0, NULL, info_level + 1, key.offset, - bytenr, 1); + bytenr, 1, GFP_NOFS); break; case BTRFS_SHARED_DATA_REF_KEY: { struct btrfs_shared_data_ref *sdref; @@ -748,13 +768,13 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, struct btrfs_shared_data_ref); count = btrfs_shared_data_ref_count(leaf, sdref); ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset, - bytenr, count); + bytenr, count, GFP_NOFS); break; } case BTRFS_TREE_BLOCK_REF_KEY: ret = __add_prelim_ref(prefs, key.offset, NULL, info_level + 1, 0, - bytenr, 1); + bytenr, 1, GFP_NOFS); break; case BTRFS_EXTENT_DATA_REF_KEY: { struct btrfs_extent_data_ref *dref; @@ -770,7 +790,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, key.offset = btrfs_extent_data_ref_offset(leaf, dref); root = btrfs_extent_data_ref_root(leaf, dref); ret = __add_prelim_ref(prefs, root, &key, 0, 0, - bytenr, count); + bytenr, count, GFP_NOFS); break; } default: @@ -911,7 +931,6 @@ again: while (!list_empty(&prefs)) { ref = list_first_entry(&prefs, struct __prelim_ref, list); - list_del(&ref->list); WARN_ON(ref->count < 0); if (ref->count && ref->root_id && ref->parent == 0) { /* no parent == root of tree */ @@ -935,8 +954,10 @@ again: } ret = find_extent_in_eb(eb, bytenr, *extent_item_pos, &eie); - ref->inode_list = eie; free_extent_buffer(eb); + if (ret < 0) + goto out; + ref->inode_list = eie; } ret = ulist_add_merge(refs, ref->parent, (uintptr_t)ref->inode_list, @@ -954,7 +975,8 @@ again: eie->next = ref->inode_list; } } - kfree(ref); + list_del(&ref->list); + kmem_cache_free(btrfs_prelim_ref_cache, ref); } out: @@ -962,13 +984,13 @@ out: while (!list_empty(&prefs)) { ref = list_first_entry(&prefs, struct __prelim_ref, list); list_del(&ref->list); - kfree(ref); + kmem_cache_free(btrfs_prelim_ref_cache, ref); } while (!list_empty(&prefs_delayed)) { ref = list_first_entry(&prefs_delayed, struct __prelim_ref, list); list_del(&ref->list); - kfree(ref); + kmem_cache_free(btrfs_prelim_ref_cache, ref); } return ret; @@ -1326,8 +1348,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, found_key->type != BTRFS_METADATA_ITEM_KEY) || found_key->objectid > logical || found_key->objectid + size <= logical) { - pr_debug("logical %llu is not within any extent\n", - (unsigned long long)logical); + pr_debug("logical %llu is not within any extent\n", logical); return -ENOENT; } @@ -1340,11 +1361,8 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, pr_debug("logical %llu is at position %llu within the extent (%llu " "EXTENT_ITEM %llu) flags %#llx size %u\n", - (unsigned long long)logical, - (unsigned long long)(logical - found_key->objectid), - (unsigned long long)found_key->objectid, - (unsigned long long)found_key->offset, - (unsigned long long)flags, item_size); + logical, logical - found_key->objectid, found_key->objectid, + found_key->offset, flags, item_size); WARN_ON(!flags_ret); if (flags_ret) { @@ -1516,7 +1534,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, while (!ret && (root_node = ulist_next(roots, &root_uiter))) { pr_debug("root %llu references leaf %llu, data list " "%#llx\n", root_node->val, ref_node->val, - (long long)ref_node->aux); + ref_node->aux); ret = iterate_leaf_refs((struct extent_inode_elem *) (uintptr_t)ref_node->aux, root_node->val, @@ -1608,9 +1626,8 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, name_len = btrfs_inode_ref_name_len(eb, iref); /* path must be released before calling iterate()! */ pr_debug("following ref at offset %u for inode %llu in " - "tree %llu\n", cur, - (unsigned long long)found_key.objectid, - (unsigned long long)fs_root->objectid); + "tree %llu\n", cur, found_key.objectid, + fs_root->objectid); ret = iterate(parent, name_len, (unsigned long)(iref + 1), eb, ctx); if (ret) diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 8f2e76702932..a910b27a8ad9 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -72,4 +72,6 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, struct btrfs_inode_extref **ret_extref, u64 *found_off); +int __init btrfs_prelim_ref_init(void); +void btrfs_prelim_ref_exit(void); #endif diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 08b286b2a2c5..d0ae226926ee 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -218,6 +218,27 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) return 0; } +struct btrfs_dio_private { + struct inode *inode; + u64 logical_offset; + u64 disk_bytenr; + u64 bytes; + void *private; + + /* number of bios pending for this dio */ + atomic_t pending_bios; + + /* IO errors */ + int errors; + + /* orig_bio is our btrfs_io_bio */ + struct bio *orig_bio; + + /* dio_bio came from fs/direct-io.c */ + struct bio *dio_bio; + u8 csum[0]; +}; + /* * Disable DIO read nolock optimization, so new dio readers will be forced * to grab i_mutex. It is used to avoid the endless truncate due to diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 1431a6965017..1c47be187240 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -701,15 +701,13 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, next_bytenr = btrfs_super_root(selected_super); if (state->print_mask & BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "root@%llu\n", - (unsigned long long)next_bytenr); + printk(KERN_INFO "root@%llu\n", next_bytenr); break; case 1: next_bytenr = btrfs_super_chunk_root(selected_super); if (state->print_mask & BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "chunk@%llu\n", - (unsigned long long)next_bytenr); + printk(KERN_INFO "chunk@%llu\n", next_bytenr); break; case 2: next_bytenr = btrfs_super_log_root(selected_super); @@ -717,8 +715,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, continue; if (state->print_mask & BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "log@%llu\n", - (unsigned long long)next_bytenr); + printk(KERN_INFO "log@%llu\n", next_bytenr); break; } @@ -727,7 +724,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", - (unsigned long long)next_bytenr, num_copies); + next_bytenr, num_copies); for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { struct btrfsic_block *next_block; @@ -742,8 +739,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, printk(KERN_INFO "btrfsic:" " btrfsic_map_block(root @%llu," " mirror %d) failed!\n", - (unsigned long long)next_bytenr, - mirror_num); + next_bytenr, mirror_num); kfree(selected_super); return -1; } @@ -767,7 +763,6 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, if (ret < (int)PAGE_CACHE_SIZE) { printk(KERN_INFO "btrfsic: read @logical %llu failed!\n", - (unsigned long long) tmp_next_block_ctx.start); btrfsic_release_block_ctx(&tmp_next_block_ctx); kfree(selected_super); @@ -813,7 +808,7 @@ static int btrfsic_process_superblock_dev_mirror( (bh->b_data + (dev_bytenr & 4095)); if (btrfs_super_bytenr(super_tmp) != dev_bytenr || - super_tmp->magic != cpu_to_le64(BTRFS_MAGIC) || + btrfs_super_magic(super_tmp) != BTRFS_MAGIC || memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) || btrfs_super_nodesize(super_tmp) != state->metablock_size || btrfs_super_leafsize(super_tmp) != state->metablock_size || @@ -847,10 +842,8 @@ static int btrfsic_process_superblock_dev_mirror( printk_in_rcu(KERN_INFO "New initial S-block (bdev %p, %s)" " @%llu (%s/%llu/%d)\n", superblock_bdev, - rcu_str_deref(device->name), - (unsigned long long)dev_bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, + rcu_str_deref(device->name), dev_bytenr, + dev_state->name, dev_bytenr, superblock_mirror_num); list_add(&superblock_tmp->all_blocks_node, &state->all_blocks_list); @@ -880,20 +873,20 @@ static int btrfsic_process_superblock_dev_mirror( tmp_disk_key.offset = 0; switch (pass) { case 0: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID); + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_ROOT_TREE_OBJECTID); additional_string = "initial root "; next_bytenr = btrfs_super_root(super_tmp); break; case 1: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID); + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_CHUNK_TREE_OBJECTID); additional_string = "initial chunk "; next_bytenr = btrfs_super_chunk_root(super_tmp); break; case 2: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_TREE_LOG_OBJECTID); + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_TREE_LOG_OBJECTID); additional_string = "initial log "; next_bytenr = btrfs_super_log_root(super_tmp); if (0 == next_bytenr) @@ -906,7 +899,7 @@ static int btrfsic_process_superblock_dev_mirror( next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", - (unsigned long long)next_bytenr, num_copies); + next_bytenr, num_copies); for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { struct btrfsic_block *next_block; struct btrfsic_block_data_ctx tmp_next_block_ctx; @@ -918,8 +911,7 @@ static int btrfsic_process_superblock_dev_mirror( mirror_num)) { printk(KERN_INFO "btrfsic: btrfsic_map_block(" "bytenr @%llu, mirror %d) failed!\n", - (unsigned long long)next_bytenr, - mirror_num); + next_bytenr, mirror_num); brelse(bh); return -1; } @@ -1003,19 +995,17 @@ continue_with_new_stack_frame: (struct btrfs_leaf *)sf->hdr; if (-1 == sf->i) { - sf->nr = le32_to_cpu(leafhdr->header.nritems); + sf->nr = btrfs_stack_header_nritems(&leafhdr->header); if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) printk(KERN_INFO "leaf %llu items %d generation %llu" " owner %llu\n", - (unsigned long long) - sf->block_ctx->start, - sf->nr, - (unsigned long long) - le64_to_cpu(leafhdr->header.generation), - (unsigned long long) - le64_to_cpu(leafhdr->header.owner)); + sf->block_ctx->start, sf->nr, + btrfs_stack_header_generation( + &leafhdr->header), + btrfs_stack_header_owner( + &leafhdr->header)); } continue_with_current_leaf_stack_frame: @@ -1047,10 +1037,10 @@ leaf_item_out_of_bounce_error: &disk_item, disk_item_offset, sizeof(struct btrfs_item)); - item_offset = le32_to_cpu(disk_item.offset); - item_size = le32_to_cpu(disk_item.size); + item_offset = btrfs_stack_item_offset(&disk_item); + item_size = btrfs_stack_item_offset(&disk_item); disk_key = &disk_item.key; - type = disk_key->type; + type = btrfs_disk_key_type(disk_key); if (BTRFS_ROOT_ITEM_KEY == type) { struct btrfs_root_item root_item; @@ -1066,7 +1056,7 @@ leaf_item_out_of_bounce_error: sf->block_ctx, &root_item, root_item_offset, item_size); - next_bytenr = le64_to_cpu(root_item.bytenr); + next_bytenr = btrfs_root_bytenr(&root_item); sf->error = btrfsic_create_link_to_next_block( @@ -1081,8 +1071,8 @@ leaf_item_out_of_bounce_error: &sf->num_copies, &sf->mirror_num, disk_key, - le64_to_cpu(root_item. - generation)); + btrfs_root_generation( + &root_item)); if (sf->error) goto one_stack_frame_backwards; @@ -1130,18 +1120,17 @@ leaf_item_out_of_bounce_error: struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr; if (-1 == sf->i) { - sf->nr = le32_to_cpu(nodehdr->header.nritems); + sf->nr = btrfs_stack_header_nritems(&nodehdr->header); if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) printk(KERN_INFO "node %llu level %d items %d" " generation %llu owner %llu\n", - (unsigned long long) sf->block_ctx->start, nodehdr->header.level, sf->nr, - (unsigned long long) - le64_to_cpu(nodehdr->header.generation), - (unsigned long long) - le64_to_cpu(nodehdr->header.owner)); + btrfs_stack_header_generation( + &nodehdr->header), + btrfs_stack_header_owner( + &nodehdr->header)); } continue_with_current_node_stack_frame: @@ -1168,7 +1157,7 @@ continue_with_current_node_stack_frame: btrfsic_read_from_block_data( sf->block_ctx, &key_ptr, key_ptr_offset, sizeof(struct btrfs_key_ptr)); - next_bytenr = le64_to_cpu(key_ptr.blockptr); + next_bytenr = btrfs_stack_key_blockptr(&key_ptr); sf->error = btrfsic_create_link_to_next_block( state, @@ -1182,7 +1171,7 @@ continue_with_current_node_stack_frame: &sf->num_copies, &sf->mirror_num, &key_ptr.key, - le64_to_cpu(key_ptr.generation)); + btrfs_stack_key_generation(&key_ptr)); if (sf->error) goto one_stack_frame_backwards; @@ -1247,8 +1236,7 @@ static void btrfsic_read_from_block_data( unsigned long i = (start_offset + offset) >> PAGE_CACHE_SHIFT; WARN_ON(offset + len > block_ctx->len); - offset_in_page = (start_offset + offset) & - ((unsigned long)PAGE_CACHE_SIZE - 1); + offset_in_page = (start_offset + offset) & (PAGE_CACHE_SIZE - 1); while (len > 0) { cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page)); @@ -1290,7 +1278,7 @@ static int btrfsic_create_link_to_next_block( next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", - (unsigned long long)next_bytenr, *num_copiesp); + next_bytenr, *num_copiesp); *mirror_nump = 1; } @@ -1307,7 +1295,7 @@ static int btrfsic_create_link_to_next_block( if (ret) { printk(KERN_INFO "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n", - (unsigned long long)next_bytenr, *mirror_nump); + next_bytenr, *mirror_nump); btrfsic_release_block_ctx(next_block_ctx); *next_blockp = NULL; return -1; @@ -1335,20 +1323,16 @@ static int btrfsic_create_link_to_next_block( "Referenced block @%llu (%s/%llu/%d)" " found in hash table, %c," " bytenr mismatch (!= stored %llu).\n", - (unsigned long long)next_bytenr, - next_block_ctx->dev->name, - (unsigned long long)next_block_ctx->dev_bytenr, - *mirror_nump, + next_bytenr, next_block_ctx->dev->name, + next_block_ctx->dev_bytenr, *mirror_nump, btrfsic_get_block_type(state, next_block), - (unsigned long long)next_block->logical_bytenr); + next_block->logical_bytenr); } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) printk(KERN_INFO "Referenced block @%llu (%s/%llu/%d)" " found in hash table, %c.\n", - (unsigned long long)next_bytenr, - next_block_ctx->dev->name, - (unsigned long long)next_block_ctx->dev_bytenr, - *mirror_nump, + next_bytenr, next_block_ctx->dev->name, + next_block_ctx->dev_bytenr, *mirror_nump, btrfsic_get_block_type(state, next_block)); next_block->logical_bytenr = next_bytenr; @@ -1400,7 +1384,7 @@ static int btrfsic_create_link_to_next_block( if (ret < (int)next_block_ctx->len) { printk(KERN_INFO "btrfsic: read block @logical %llu failed!\n", - (unsigned long long)next_bytenr); + next_bytenr); btrfsic_release_block_ctx(next_block_ctx); *next_blockp = NULL; return -1; @@ -1444,12 +1428,12 @@ static int btrfsic_handle_extent_data( file_extent_item_offset, offsetof(struct btrfs_file_extent_item, disk_num_bytes)); if (BTRFS_FILE_EXTENT_REG != file_extent_item.type || - ((u64)0) == le64_to_cpu(file_extent_item.disk_bytenr)) { + btrfs_stack_file_extent_disk_bytenr(&file_extent_item) == 0) { if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu\n", file_extent_item.type, - (unsigned long long) - le64_to_cpu(file_extent_item.disk_bytenr)); + btrfs_stack_file_extent_disk_bytenr( + &file_extent_item)); return 0; } @@ -1463,20 +1447,19 @@ static int btrfsic_handle_extent_data( btrfsic_read_from_block_data(block_ctx, &file_extent_item, file_extent_item_offset, sizeof(struct btrfs_file_extent_item)); - next_bytenr = le64_to_cpu(file_extent_item.disk_bytenr) + - le64_to_cpu(file_extent_item.offset); - generation = le64_to_cpu(file_extent_item.generation); - num_bytes = le64_to_cpu(file_extent_item.num_bytes); - generation = le64_to_cpu(file_extent_item.generation); + next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item) + + btrfs_stack_file_extent_offset(&file_extent_item); + generation = btrfs_stack_file_extent_generation(&file_extent_item); + num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item); + generation = btrfs_stack_file_extent_generation(&file_extent_item); if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu," " offset = %llu, num_bytes = %llu\n", file_extent_item.type, - (unsigned long long) - le64_to_cpu(file_extent_item.disk_bytenr), - (unsigned long long)le64_to_cpu(file_extent_item.offset), - (unsigned long long)num_bytes); + btrfs_stack_file_extent_disk_bytenr(&file_extent_item), + btrfs_stack_file_extent_offset(&file_extent_item), + num_bytes); while (num_bytes > 0) { u32 chunk_len; int num_copies; @@ -1492,7 +1475,7 @@ static int btrfsic_handle_extent_data( next_bytenr, state->datablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", - (unsigned long long)next_bytenr, num_copies); + next_bytenr, num_copies); for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { struct btrfsic_block_data_ctx next_block_ctx; struct btrfsic_block *next_block; @@ -1504,8 +1487,7 @@ static int btrfsic_handle_extent_data( if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) printk(KERN_INFO "\tdisk_bytenr = %llu, num_bytes %u\n", - (unsigned long long)next_bytenr, - chunk_len); + next_bytenr, chunk_len); ret = btrfsic_map_block(state, next_bytenr, chunk_len, &next_block_ctx, mirror_num); @@ -1513,8 +1495,7 @@ static int btrfsic_handle_extent_data( printk(KERN_INFO "btrfsic: btrfsic_map_block(@%llu," " mirror=%d) failed!\n", - (unsigned long long)next_bytenr, - mirror_num); + next_bytenr, mirror_num); return -1; } @@ -1543,12 +1524,10 @@ static int btrfsic_handle_extent_data( " found in hash table, D," " bytenr mismatch" " (!= stored %llu).\n", - (unsigned long long)next_bytenr, + next_bytenr, next_block_ctx.dev->name, - (unsigned long long) next_block_ctx.dev_bytenr, mirror_num, - (unsigned long long) next_block->logical_bytenr); } next_block->logical_bytenr = next_bytenr; @@ -1675,7 +1654,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, if (block_ctx->dev_bytenr & ((u64)PAGE_CACHE_SIZE - 1)) { printk(KERN_INFO "btrfsic: read_block() with unaligned bytenr %llu\n", - (unsigned long long)block_ctx->dev_bytenr); + block_ctx->dev_bytenr); return -1; } @@ -1772,10 +1751,8 @@ static void btrfsic_dump_database(struct btrfsic_state *state) printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n", btrfsic_get_block_type(state, b_all), - (unsigned long long)b_all->logical_bytenr, - b_all->dev_state->name, - (unsigned long long)b_all->dev_bytenr, - b_all->mirror_num); + b_all->logical_bytenr, b_all->dev_state->name, + b_all->dev_bytenr, b_all->mirror_num); list_for_each(elem_ref_to, &b_all->ref_to_list) { const struct btrfsic_block_link *const l = @@ -1787,16 +1764,13 @@ static void btrfsic_dump_database(struct btrfsic_state *state) " refers %u* to" " %c @%llu (%s/%llu/%d)\n", btrfsic_get_block_type(state, b_all), - (unsigned long long)b_all->logical_bytenr, - b_all->dev_state->name, - (unsigned long long)b_all->dev_bytenr, - b_all->mirror_num, + b_all->logical_bytenr, b_all->dev_state->name, + b_all->dev_bytenr, b_all->mirror_num, l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) l->block_ref_to->logical_bytenr, l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); } @@ -1810,16 +1784,12 @@ static void btrfsic_dump_database(struct btrfsic_state *state) " is ref %u* from" " %c @%llu (%s/%llu/%d)\n", btrfsic_get_block_type(state, b_all), - (unsigned long long)b_all->logical_bytenr, - b_all->dev_state->name, - (unsigned long long)b_all->dev_bytenr, - b_all->mirror_num, + b_all->logical_bytenr, b_all->dev_state->name, + b_all->dev_bytenr, b_all->mirror_num, l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_from), - (unsigned long long) l->block_ref_from->logical_bytenr, l->block_ref_from->dev_state->name, - (unsigned long long) l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num); } @@ -1896,8 +1866,8 @@ again: struct list_head *tmp_ref_to; if (block->is_superblock) { - bytenr = le64_to_cpu(((struct btrfs_super_block *) - mapped_datav[0])->bytenr); + bytenr = btrfs_super_bytenr((struct btrfs_super_block *) + mapped_datav[0]); if (num_pages * PAGE_CACHE_SIZE < BTRFS_SUPER_INFO_SIZE) { printk(KERN_INFO @@ -1923,8 +1893,9 @@ again: return; } processed_len = state->metablock_size; - bytenr = le64_to_cpu(((struct btrfs_header *) - mapped_datav[0])->bytenr); + bytenr = btrfs_stack_header_bytenr( + (struct btrfs_header *) + mapped_datav[0]); btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, dev_bytenr); @@ -1935,12 +1906,9 @@ again: " found in hash table, %c," " bytenr mismatch" " (!= stored %llu).\n", - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, + bytenr, dev_state->name, dev_bytenr, block->mirror_num, btrfsic_get_block_type(state, block), - (unsigned long long) block->logical_bytenr); block->logical_bytenr = bytenr; } else if (state->print_mask & @@ -1948,9 +1916,7 @@ again: printk(KERN_INFO "Written block @%llu (%s/%llu/%d)" " found in hash table, %c.\n", - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, + bytenr, dev_state->name, dev_bytenr, block->mirror_num, btrfsic_get_block_type(state, block)); } else { @@ -1966,9 +1932,7 @@ again: printk(KERN_INFO "Written block @%llu (%s/%llu/%d)" " found in hash table, %c.\n", - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, + bytenr, dev_state->name, dev_bytenr, block->mirror_num, btrfsic_get_block_type(state, block)); } @@ -1985,21 +1949,14 @@ again: " new(gen=%llu)," " which is referenced by most recent superblock" " (superblockgen=%llu)!\n", - btrfsic_get_block_type(state, block), - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, - block->mirror_num, - (unsigned long long)block->generation, - (unsigned long long) - le64_to_cpu(block->disk_key.objectid), + btrfsic_get_block_type(state, block), bytenr, + dev_state->name, dev_bytenr, block->mirror_num, + block->generation, + btrfs_disk_key_objectid(&block->disk_key), block->disk_key.type, - (unsigned long long) - le64_to_cpu(block->disk_key.offset), - (unsigned long long) - le64_to_cpu(((struct btrfs_header *) - mapped_datav[0])->generation), - (unsigned long long) + btrfs_disk_key_offset(&block->disk_key), + btrfs_stack_header_generation( + (struct btrfs_header *) mapped_datav[0]), state->max_superblock_generation); btrfsic_dump_tree(state); } @@ -2008,15 +1965,12 @@ again: printk(KERN_INFO "btrfs: attempt to overwrite %c-block" " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu," " which is not yet iodone!\n", - btrfsic_get_block_type(state, block), - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr, - block->mirror_num, - (unsigned long long)block->generation, - (unsigned long long) - le64_to_cpu(((struct btrfs_header *) - mapped_datav[0])->generation)); + btrfsic_get_block_type(state, block), bytenr, + dev_state->name, dev_bytenr, block->mirror_num, + block->generation, + btrfs_stack_header_generation( + (struct btrfs_header *) + mapped_datav[0])); /* it would not be safe to go on */ btrfsic_dump_tree(state); goto continue_loop; @@ -2056,7 +2010,7 @@ again: if (ret) { printk(KERN_INFO "btrfsic: btrfsic_map_block(root @%llu)" - " failed!\n", (unsigned long long)bytenr); + " failed!\n", bytenr); goto continue_loop; } block_ctx.datav = mapped_datav; @@ -2140,7 +2094,7 @@ again: printk(KERN_INFO "btrfsic: btrfsic_process_metablock" "(root @%llu) failed!\n", - (unsigned long long)dev_bytenr); + dev_bytenr); } else { block->is_metadata = 0; block->mirror_num = 0; /* unknown */ @@ -2168,8 +2122,7 @@ again: if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) printk(KERN_INFO "Written block (%s/%llu/?)" " !found in hash table, D.\n", - dev_state->name, - (unsigned long long)dev_bytenr); + dev_state->name, dev_bytenr); if (!state->include_extent_data) { /* ignore that written D block */ goto continue_loop; @@ -2184,17 +2137,16 @@ again: block_ctx.pagev = NULL; } else { processed_len = state->metablock_size; - bytenr = le64_to_cpu(((struct btrfs_header *) - mapped_datav[0])->bytenr); + bytenr = btrfs_stack_header_bytenr( + (struct btrfs_header *) + mapped_datav[0]); btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, dev_bytenr); if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) printk(KERN_INFO "Written block @%llu (%s/%llu/?)" " !found in hash table, M.\n", - (unsigned long long)bytenr, - dev_state->name, - (unsigned long long)dev_bytenr); + bytenr, dev_state->name, dev_bytenr); ret = btrfsic_map_block(state, bytenr, processed_len, &block_ctx, 0); @@ -2202,7 +2154,7 @@ again: printk(KERN_INFO "btrfsic: btrfsic_map_block(root @%llu)" " failed!\n", - (unsigned long long)dev_bytenr); + dev_bytenr); goto continue_loop; } } @@ -2267,10 +2219,8 @@ again: printk(KERN_INFO "New written %c-block @%llu (%s/%llu/%d)\n", is_metadata ? 'M' : 'D', - (unsigned long long)block->logical_bytenr, - block->dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num); + block->logical_bytenr, block->dev_state->name, + block->dev_bytenr, block->mirror_num); list_add(&block->all_blocks_node, &state->all_blocks_list); btrfsic_block_hashtable_add(block, &state->block_hashtable); @@ -2281,7 +2231,7 @@ again: printk(KERN_INFO "btrfsic: process_metablock(root @%llu)" " failed!\n", - (unsigned long long)dev_bytenr); + dev_bytenr); } btrfsic_release_block_ctx(&block_ctx); } @@ -2319,10 +2269,8 @@ static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n", bio_error_status, btrfsic_get_block_type(dev_state->state, block), - (unsigned long long)block->logical_bytenr, - dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num); + block->logical_bytenr, dev_state->name, + block->dev_bytenr, block->mirror_num); next_block = block->next_in_same_bio; block->iodone_w_error = iodone_w_error; if (block->submit_bio_bh_rw & REQ_FLUSH) { @@ -2332,7 +2280,6 @@ static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) printk(KERN_INFO "bio_end_io() new %s flush_gen=%llu\n", dev_state->name, - (unsigned long long) dev_state->last_flush_gen); } if (block->submit_bio_bh_rw & REQ_FUA) @@ -2358,10 +2305,8 @@ static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate) "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n", iodone_w_error, btrfsic_get_block_type(dev_state->state, block), - (unsigned long long)block->logical_bytenr, - block->dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num); + block->logical_bytenr, block->dev_state->name, + block->dev_bytenr, block->mirror_num); block->iodone_w_error = iodone_w_error; if (block->submit_bio_bh_rw & REQ_FLUSH) { @@ -2370,8 +2315,7 @@ static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate) BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) printk(KERN_INFO "bh_end_io() new %s flush_gen=%llu\n", - dev_state->name, - (unsigned long long)dev_state->last_flush_gen); + dev_state->name, dev_state->last_flush_gen); } if (block->submit_bio_bh_rw & REQ_FUA) block->flush_gen = 0; /* FUA completed means block is on disk */ @@ -2396,26 +2340,20 @@ static int btrfsic_process_written_superblock( printk(KERN_INFO "btrfsic: superblock @%llu (%s/%llu/%d)" " with old gen %llu <= %llu\n", - (unsigned long long)superblock->logical_bytenr, + superblock->logical_bytenr, superblock->dev_state->name, - (unsigned long long)superblock->dev_bytenr, - superblock->mirror_num, - (unsigned long long) + superblock->dev_bytenr, superblock->mirror_num, btrfs_super_generation(super_hdr), - (unsigned long long) state->max_superblock_generation); } else { if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) printk(KERN_INFO "btrfsic: got new superblock @%llu (%s/%llu/%d)" " with new gen %llu > %llu\n", - (unsigned long long)superblock->logical_bytenr, + superblock->logical_bytenr, superblock->dev_state->name, - (unsigned long long)superblock->dev_bytenr, - superblock->mirror_num, - (unsigned long long) + superblock->dev_bytenr, superblock->mirror_num, btrfs_super_generation(super_hdr), - (unsigned long long) state->max_superblock_generation); state->max_superblock_generation = @@ -2432,43 +2370,41 @@ static int btrfsic_process_written_superblock( int num_copies; int mirror_num; const char *additional_string = NULL; - struct btrfs_disk_key tmp_disk_key; + struct btrfs_disk_key tmp_disk_key = {0}; - tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY; - tmp_disk_key.offset = 0; + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_ROOT_ITEM_KEY); + btrfs_set_disk_key_objectid(&tmp_disk_key, 0); switch (pass) { case 0: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID); + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_ROOT_TREE_OBJECTID); additional_string = "root "; next_bytenr = btrfs_super_root(super_hdr); if (state->print_mask & BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "root@%llu\n", - (unsigned long long)next_bytenr); + printk(KERN_INFO "root@%llu\n", next_bytenr); break; case 1: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID); + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_CHUNK_TREE_OBJECTID); additional_string = "chunk "; next_bytenr = btrfs_super_chunk_root(super_hdr); if (state->print_mask & BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "chunk@%llu\n", - (unsigned long long)next_bytenr); + printk(KERN_INFO "chunk@%llu\n", next_bytenr); break; case 2: - tmp_disk_key.objectid = - cpu_to_le64(BTRFS_TREE_LOG_OBJECTID); + btrfs_set_disk_key_objectid(&tmp_disk_key, + BTRFS_TREE_LOG_OBJECTID); additional_string = "log "; next_bytenr = btrfs_super_log_root(super_hdr); if (0 == next_bytenr) continue; if (state->print_mask & BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - printk(KERN_INFO "log@%llu\n", - (unsigned long long)next_bytenr); + printk(KERN_INFO "log@%llu\n", next_bytenr); break; } @@ -2477,7 +2413,7 @@ static int btrfsic_process_written_superblock( next_bytenr, BTRFS_SUPER_INFO_SIZE); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n", - (unsigned long long)next_bytenr, num_copies); + next_bytenr, num_copies); for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { int was_created; @@ -2493,8 +2429,7 @@ static int btrfsic_process_written_superblock( printk(KERN_INFO "btrfsic: btrfsic_map_block(@%llu," " mirror=%d) failed!\n", - (unsigned long long)next_bytenr, - mirror_num); + next_bytenr, mirror_num); return -1; } @@ -2579,26 +2514,22 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, " %u* refers to %c @%llu (%s/%llu/%d)\n", recursion_level, btrfsic_get_block_type(state, block), - (unsigned long long)block->logical_bytenr, - block->dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num, + block->logical_bytenr, block->dev_state->name, + block->dev_bytenr, block->mirror_num, l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) l->block_ref_to->logical_bytenr, l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); if (l->block_ref_to->never_written) { printk(KERN_INFO "btrfs: attempt to write superblock" " which references block %c @%llu (%s/%llu/%d)" " which is never written!\n", btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) l->block_ref_to->logical_bytenr, l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); ret = -1; } else if (!l->block_ref_to->is_iodone) { @@ -2606,10 +2537,9 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, " which references block %c @%llu (%s/%llu/%d)" " which is not yet iodone!\n", btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) l->block_ref_to->logical_bytenr, l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); ret = -1; } else if (l->block_ref_to->iodone_w_error) { @@ -2617,10 +2547,9 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, " which references block %c @%llu (%s/%llu/%d)" " which has write error!\n", btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) l->block_ref_to->logical_bytenr, l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); ret = -1; } else if (l->parent_generation != @@ -2634,13 +2563,12 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, " with generation %llu !=" " parent generation %llu!\n", btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) l->block_ref_to->logical_bytenr, l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num, - (unsigned long long)l->block_ref_to->generation, - (unsigned long long)l->parent_generation); + l->block_ref_to->generation, + l->parent_generation); ret = -1; } else if (l->block_ref_to->flush_gen > l->block_ref_to->dev_state->last_flush_gen) { @@ -2650,13 +2578,10 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, " (block flush_gen=%llu," " dev->flush_gen=%llu)!\n", btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long) l->block_ref_to->logical_bytenr, l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num, - (unsigned long long)block->flush_gen, - (unsigned long long) + l->block_ref_to->dev_bytenr, + l->block_ref_to->mirror_num, block->flush_gen, l->block_ref_to->dev_state->last_flush_gen); ret = -1; } else if (-1 == btrfsic_check_all_ref_blocks(state, @@ -2701,16 +2626,12 @@ static int btrfsic_is_block_ref_by_superblock( " is ref %u* from %c @%llu (%s/%llu/%d)\n", recursion_level, btrfsic_get_block_type(state, block), - (unsigned long long)block->logical_bytenr, - block->dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num, + block->logical_bytenr, block->dev_state->name, + block->dev_bytenr, block->mirror_num, l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_from), - (unsigned long long) l->block_ref_from->logical_bytenr, l->block_ref_from->dev_state->name, - (unsigned long long) l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num); if (l->block_ref_from->is_superblock && @@ -2737,14 +2658,12 @@ static void btrfsic_print_add_link(const struct btrfsic_state *state, " to %c @%llu (%s/%llu/%d).\n", l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_from), - (unsigned long long)l->block_ref_from->logical_bytenr, + l->block_ref_from->logical_bytenr, l->block_ref_from->dev_state->name, - (unsigned long long)l->block_ref_from->dev_bytenr, - l->block_ref_from->mirror_num, + l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num, btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long)l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); } @@ -2756,14 +2675,12 @@ static void btrfsic_print_rem_link(const struct btrfsic_state *state, " to %c @%llu (%s/%llu/%d).\n", l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_from), - (unsigned long long)l->block_ref_from->logical_bytenr, + l->block_ref_from->logical_bytenr, l->block_ref_from->dev_state->name, - (unsigned long long)l->block_ref_from->dev_bytenr, - l->block_ref_from->mirror_num, + l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num, btrfsic_get_block_type(state, l->block_ref_to), - (unsigned long long)l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, - (unsigned long long)l->block_ref_to->dev_bytenr, + l->block_ref_to->logical_bytenr, + l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); } @@ -2807,10 +2724,8 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, */ indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)", btrfsic_get_block_type(state, block), - (unsigned long long)block->logical_bytenr, - block->dev_state->name, - (unsigned long long)block->dev_bytenr, - block->mirror_num); + block->logical_bytenr, block->dev_state->name, + block->dev_bytenr, block->mirror_num); if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { printk("[...]\n"); return; @@ -2943,10 +2858,8 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add( "New %s%c-block @%llu (%s/%llu/%d)\n", additional_string, btrfsic_get_block_type(state, block), - (unsigned long long)block->logical_bytenr, - dev_state->name, - (unsigned long long)block->dev_bytenr, - mirror_num); + block->logical_bytenr, dev_state->name, + block->dev_bytenr, mirror_num); list_add(&block->all_blocks_node, &state->all_blocks_list); btrfsic_block_hashtable_add(block, &state->block_hashtable); if (NULL != was_created) @@ -2980,7 +2893,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, printk(KERN_INFO "btrfsic:" " btrfsic_map_block(logical @%llu," " mirror %d) failed!\n", - (unsigned long long)bytenr, mirror_num); + bytenr, mirror_num); continue; } @@ -2997,8 +2910,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio," " buffer->log_bytenr=%llu, submit_bio(bdev=%s," " phys_bytenr=%llu)!\n", - (unsigned long long)bytenr, dev_state->name, - (unsigned long long)dev_bytenr); + bytenr, dev_state->name, dev_bytenr); for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { ret = btrfsic_map_block(state, bytenr, state->metablock_size, @@ -3008,10 +2920,8 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, printk(KERN_INFO "Read logical bytenr @%llu maps to" " (%s/%llu/%d)\n", - (unsigned long long)bytenr, - block_ctx.dev->name, - (unsigned long long)block_ctx.dev_bytenr, - mirror_num); + bytenr, block_ctx.dev->name, + block_ctx.dev_bytenr, mirror_num); } WARN_ON(1); } @@ -3048,12 +2958,10 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh) if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) printk(KERN_INFO - "submit_bh(rw=0x%x, blocknr=%lu (bytenr %llu)," - " size=%lu, data=%p, bdev=%p)\n", - rw, (unsigned long)bh->b_blocknr, - (unsigned long long)dev_bytenr, - (unsigned long)bh->b_size, bh->b_data, - bh->b_bdev); + "submit_bh(rw=0x%x, blocknr=%llu (bytenr %llu)," + " size=%zu, data=%p, bdev=%p)\n", + rw, (unsigned long long)bh->b_blocknr, + dev_bytenr, bh->b_size, bh->b_data, bh->b_bdev); btrfsic_process_written_block(dev_state, dev_bytenr, &bh->b_data, 1, NULL, NULL, bh, rw); @@ -3118,9 +3026,9 @@ void btrfsic_submit_bio(int rw, struct bio *bio) BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) printk(KERN_INFO "submit_bio(rw=0x%x, bi_vcnt=%u," - " bi_sector=%lu (bytenr %llu), bi_bdev=%p)\n", - rw, bio->bi_vcnt, (unsigned long)bio->bi_sector, - (unsigned long long)dev_bytenr, + " bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", + rw, bio->bi_vcnt, + (unsigned long long)bio->bi_sector, dev_bytenr, bio->bi_bdev); mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt, @@ -3213,19 +3121,19 @@ int btrfsic_mount(struct btrfs_root *root, if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) { printk(KERN_INFO "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", - root->nodesize, (unsigned long)PAGE_CACHE_SIZE); + root->nodesize, PAGE_CACHE_SIZE); return -1; } if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) { printk(KERN_INFO "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", - root->leafsize, (unsigned long)PAGE_CACHE_SIZE); + root->leafsize, PAGE_CACHE_SIZE); return -1; } if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) { printk(KERN_INFO "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", - root->sectorsize, (unsigned long)PAGE_CACHE_SIZE); + root->sectorsize, PAGE_CACHE_SIZE); return -1; } state = kzalloc(sizeof(*state), GFP_NOFS); @@ -3369,10 +3277,8 @@ void btrfsic_unmount(struct btrfs_root *root, " @%llu (%s/%llu/%d) on umount which is" " not yet iodone!\n", btrfsic_get_block_type(state, b_all), - (unsigned long long)b_all->logical_bytenr, - b_all->dev_state->name, - (unsigned long long)b_all->dev_bytenr, - b_all->mirror_num); + b_all->logical_bytenr, b_all->dev_state->name, + b_all->dev_bytenr, b_all->mirror_num); } mutex_unlock(&btrfsic_mutex); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index b189bd1e7a3e..6aad98cb343f 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -132,9 +132,8 @@ static int check_compressed_csum(struct inode *inode, printk(KERN_INFO "btrfs csum failed ino %llu " "extent %llu csum %u " "wanted %u mirror %d\n", - (unsigned long long)btrfs_ino(inode), - (unsigned long long)disk_start, - csum, *cb_sum, cb->mirror_num); + btrfs_ino(inode), disk_start, csum, *cb_sum, + cb->mirror_num); ret = -EIO; goto fail; } @@ -639,7 +638,11 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, faili = nr_pages - 1; cb->nr_pages = nr_pages; - add_ra_bio_pages(inode, em_start + em_len, cb); + /* In the parent-locked case, we only locked the range we are + * interested in. In all other cases, we can opportunistically + * cache decompressed data that goes beyond the requested range. */ + if (!(bio_flags & EXTENT_BIO_PARENT_LOCKED)) + add_ra_bio_pages(inode, em_start + em_len, cb); /* include any pages we added in add_ra-bio_pages */ uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index ed504607d8ec..64346721173f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -274,8 +274,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, else btrfs_set_header_owner(cow, new_root_objectid); - write_extent_buffer(cow, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(cow), + write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(cow), BTRFS_FSID_SIZE); WARN_ON(btrfs_header_generation(buf) > trans->transid); @@ -484,8 +483,27 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) struct rb_node **new; struct rb_node *parent = NULL; struct tree_mod_elem *cur; + int ret = 0; + + BUG_ON(!tm); + + tree_mod_log_write_lock(fs_info); + if (list_empty(&fs_info->tree_mod_seq_list)) { + tree_mod_log_write_unlock(fs_info); + /* + * Ok we no longer care about logging modifications, free up tm + * and return 0. Any callers shouldn't be using tm after + * calling tree_mod_log_insert, but if they do we can just + * change this to return a special error code to let the callers + * do their own thing. + */ + kfree(tm); + return 0; + } - BUG_ON(!tm || !tm->seq); + spin_lock(&fs_info->tree_mod_seq_lock); + tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info); + spin_unlock(&fs_info->tree_mod_seq_lock); tm_root = &fs_info->tree_mod_log; new = &tm_root->rb_node; @@ -501,14 +519,17 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) else if (cur->seq > tm->seq) new = &((*new)->rb_right); else { + ret = -EEXIST; kfree(tm); - return -EEXIST; + goto out; } } rb_link_node(&tm->node, parent, new); rb_insert_color(&tm->node, tm_root); - return 0; +out: + tree_mod_log_write_unlock(fs_info); + return ret; } /* @@ -524,57 +545,19 @@ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, return 1; if (eb && btrfs_header_level(eb) == 0) return 1; - - tree_mod_log_write_lock(fs_info); - if (list_empty(&fs_info->tree_mod_seq_list)) { - /* - * someone emptied the list while we were waiting for the lock. - * we must not add to the list when no blocker exists. - */ - tree_mod_log_write_unlock(fs_info); - return 1; - } - return 0; } -/* - * This allocates memory and gets a tree modification sequence number. - * - * Returns <0 on error. - * Returns >0 (the added sequence number) on success. - */ -static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags, - struct tree_mod_elem **tm_ret) -{ - struct tree_mod_elem *tm; - - /* - * once we switch from spin locks to something different, we should - * honor the flags parameter here. - */ - tm = *tm_ret = kzalloc(sizeof(*tm), GFP_ATOMIC); - if (!tm) - return -ENOMEM; - - spin_lock(&fs_info->tree_mod_seq_lock); - tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info); - spin_unlock(&fs_info->tree_mod_seq_lock); - - return tm->seq; -} - static inline int __tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int slot, enum mod_log_op op, gfp_t flags) { - int ret; struct tree_mod_elem *tm; - ret = tree_mod_alloc(fs_info, flags, &tm); - if (ret < 0) - return ret; + tm = kzalloc(sizeof(*tm), flags); + if (!tm) + return -ENOMEM; tm->index = eb->start >> PAGE_CACHE_SHIFT; if (op != MOD_LOG_KEY_ADD) { @@ -589,34 +572,14 @@ __tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, } static noinline int -tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int slot, - enum mod_log_op op, gfp_t flags) +tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int slot, + enum mod_log_op op, gfp_t flags) { - int ret; - if (tree_mod_dont_log(fs_info, eb)) return 0; - ret = __tree_mod_log_insert_key(fs_info, eb, slot, op, flags); - - tree_mod_log_write_unlock(fs_info); - return ret; -} - -static noinline int -tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, - int slot, enum mod_log_op op) -{ - return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS); -} - -static noinline int -tree_mod_log_insert_key_locked(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int slot, - enum mod_log_op op) -{ - return __tree_mod_log_insert_key(fs_info, eb, slot, op, GFP_NOFS); + return __tree_mod_log_insert_key(fs_info, eb, slot, op, flags); } static noinline int @@ -637,14 +600,14 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, * buffer, i.e. dst_slot < src_slot. */ for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { - ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot, - MOD_LOG_KEY_REMOVE_WHILE_MOVING); + ret = __tree_mod_log_insert_key(fs_info, eb, i + dst_slot, + MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS); BUG_ON(ret < 0); } - ret = tree_mod_alloc(fs_info, flags, &tm); - if (ret < 0) - goto out; + tm = kzalloc(sizeof(*tm), flags); + if (!tm) + return -ENOMEM; tm->index = eb->start >> PAGE_CACHE_SHIFT; tm->slot = src_slot; @@ -652,10 +615,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, tm->move.nr_items = nr_items; tm->op = MOD_LOG_MOVE_KEYS; - ret = __tree_mod_log_insert(fs_info, tm); -out: - tree_mod_log_write_unlock(fs_info); - return ret; + return __tree_mod_log_insert(fs_info, tm); } static inline void @@ -670,8 +630,8 @@ __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) nritems = btrfs_header_nritems(eb); for (i = nritems - 1; i >= 0; i--) { - ret = tree_mod_log_insert_key_locked(fs_info, eb, i, - MOD_LOG_KEY_REMOVE_WHILE_FREEING); + ret = __tree_mod_log_insert_key(fs_info, eb, i, + MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); BUG_ON(ret < 0); } } @@ -683,7 +643,6 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, int log_removal) { struct tree_mod_elem *tm; - int ret; if (tree_mod_dont_log(fs_info, NULL)) return 0; @@ -691,9 +650,9 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, if (log_removal) __tree_mod_log_free_eb(fs_info, old_root); - ret = tree_mod_alloc(fs_info, flags, &tm); - if (ret < 0) - goto out; + tm = kzalloc(sizeof(*tm), flags); + if (!tm) + return -ENOMEM; tm->index = new_root->start >> PAGE_CACHE_SHIFT; tm->old_root.logical = old_root->start; @@ -701,10 +660,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, tm->generation = btrfs_header_generation(old_root); tm->op = MOD_LOG_ROOT_REPLACE; - ret = __tree_mod_log_insert(fs_info, tm); -out: - tree_mod_log_write_unlock(fs_info); - return ret; + return __tree_mod_log_insert(fs_info, tm); } static struct tree_mod_elem * @@ -784,23 +740,20 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, if (tree_mod_dont_log(fs_info, NULL)) return; - if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) { - tree_mod_log_write_unlock(fs_info); + if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) return; - } for (i = 0; i < nr_items; i++) { - ret = tree_mod_log_insert_key_locked(fs_info, src, + ret = __tree_mod_log_insert_key(fs_info, src, i + src_offset, - MOD_LOG_KEY_REMOVE); + MOD_LOG_KEY_REMOVE, GFP_NOFS); BUG_ON(ret < 0); - ret = tree_mod_log_insert_key_locked(fs_info, dst, + ret = __tree_mod_log_insert_key(fs_info, dst, i + dst_offset, - MOD_LOG_KEY_ADD); + MOD_LOG_KEY_ADD, + GFP_NOFS); BUG_ON(ret < 0); } - - tree_mod_log_write_unlock(fs_info); } static inline void @@ -819,9 +772,9 @@ tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info, { int ret; - ret = tree_mod_log_insert_key_mask(fs_info, eb, slot, - MOD_LOG_KEY_REPLACE, - atomic ? GFP_ATOMIC : GFP_NOFS); + ret = __tree_mod_log_insert_key(fs_info, eb, slot, + MOD_LOG_KEY_REPLACE, + atomic ? GFP_ATOMIC : GFP_NOFS); BUG_ON(ret < 0); } @@ -830,10 +783,7 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { if (tree_mod_dont_log(fs_info, eb)) return; - __tree_mod_log_free_eb(fs_info, eb); - - tree_mod_log_write_unlock(fs_info); } static noinline void @@ -1046,8 +996,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, else btrfs_set_header_owner(cow, root->root_key.objectid); - write_extent_buffer(cow, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(cow), + write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(cow), BTRFS_FSID_SIZE); ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); @@ -1083,7 +1032,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, WARN_ON(trans->transid != btrfs_header_generation(parent)); tree_mod_log_insert_key(root->fs_info, parent, parent_slot, - MOD_LOG_KEY_REPLACE); + MOD_LOG_KEY_REPLACE, GFP_NOFS); btrfs_set_node_blockptr(parent, parent_slot, cow->start); btrfs_set_node_ptr_generation(parent, parent_slot, @@ -1116,7 +1065,7 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, int looped = 0; if (!time_seq) - return 0; + return NULL; /* * the very last operation that's logged for a root is the replacement @@ -1127,7 +1076,7 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, tm = tree_mod_log_search_oldest(fs_info, root_logical, time_seq); if (!looped && !tm) - return 0; + return NULL; /* * if there are no tree operation for the oldest root, we simply * return it. this should only happen if that (old) root is at @@ -1240,8 +1189,8 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, * is freed (its refcount is decremented). */ static struct extent_buffer * -tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, - u64 time_seq) +tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, + struct extent_buffer *eb, u64 time_seq) { struct extent_buffer *eb_rewin; struct tree_mod_elem *tm; @@ -1256,11 +1205,18 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, if (!tm) return eb; + btrfs_set_path_blocking(path); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { BUG_ON(tm->slot != 0); eb_rewin = alloc_dummy_extent_buffer(eb->start, fs_info->tree_root->nodesize); - BUG_ON(!eb_rewin); + if (!eb_rewin) { + btrfs_tree_read_unlock_blocking(eb); + free_extent_buffer(eb); + return NULL; + } btrfs_set_header_bytenr(eb_rewin, eb->start); btrfs_set_header_backref_rev(eb_rewin, btrfs_header_backref_rev(eb)); @@ -1268,10 +1224,15 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, btrfs_set_header_level(eb_rewin, btrfs_header_level(eb)); } else { eb_rewin = btrfs_clone_extent_buffer(eb); - BUG_ON(!eb_rewin); + if (!eb_rewin) { + btrfs_tree_read_unlock_blocking(eb); + free_extent_buffer(eb); + return NULL; + } } - btrfs_tree_read_unlock(eb); + btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK); + btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); extent_buffer_get(eb_rewin); @@ -1335,8 +1296,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq) free_extent_buffer(eb_root); eb = alloc_dummy_extent_buffer(logical, root->nodesize); } else { + btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK); eb = btrfs_clone_extent_buffer(eb_root); - btrfs_tree_read_unlock(eb_root); + btrfs_tree_read_unlock_blocking(eb_root); free_extent_buffer(eb_root); } @@ -1419,14 +1381,12 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, if (trans->transaction != root->fs_info->running_transaction) WARN(1, KERN_CRIT "trans %llu running %llu\n", - (unsigned long long)trans->transid, - (unsigned long long) + trans->transid, root->fs_info->running_transaction->transid); if (trans->transid != root->fs_info->generation) WARN(1, KERN_CRIT "trans %llu running %llu\n", - (unsigned long long)trans->transid, - (unsigned long long)root->fs_info->generation); + trans->transid, root->fs_info->generation); if (!should_cow_block(trans, root, buf)) { *cow_ret = buf; @@ -2466,6 +2426,40 @@ done: return ret; } +static void key_search_validate(struct extent_buffer *b, + struct btrfs_key *key, + int level) +{ +#ifdef CONFIG_BTRFS_ASSERT + struct btrfs_disk_key disk_key; + + btrfs_cpu_key_to_disk(&disk_key, key); + + if (level == 0) + ASSERT(!memcmp_extent_buffer(b, &disk_key, + offsetof(struct btrfs_leaf, items[0].key), + sizeof(disk_key))); + else + ASSERT(!memcmp_extent_buffer(b, &disk_key, + offsetof(struct btrfs_node, ptrs[0].key), + sizeof(disk_key))); +#endif +} + +static int key_search(struct extent_buffer *b, struct btrfs_key *key, + int level, int *prev_cmp, int *slot) +{ + if (*prev_cmp != 0) { + *prev_cmp = bin_search(b, key, level, slot); + return *prev_cmp; + } + + key_search_validate(b, key, level); + *slot = 0; + + return 0; +} + /* * look for key in the tree. path is filled in with nodes along the way * if key is found, we return zero and you can find the item in the leaf @@ -2494,6 +2488,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root int write_lock_level = 0; u8 lowest_level = 0; int min_write_lock_level; + int prev_cmp; lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); @@ -2524,6 +2519,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root min_write_lock_level = write_lock_level; again: + prev_cmp = -1; /* * we try very hard to do read locks on the root */ @@ -2624,7 +2620,7 @@ cow_done: if (!cow) btrfs_unlock_up_safe(p, level + 1); - ret = bin_search(b, key, level, &slot); + ret = key_search(b, key, level, &prev_cmp, &slot); if (level != 0) { int dec = 0; @@ -2759,6 +2755,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, int level; int lowest_unlock = 1; u8 lowest_level = 0; + int prev_cmp; lowest_level = p->lowest_level; WARN_ON(p->nodes[0] != NULL); @@ -2769,6 +2766,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, } again: + prev_cmp = -1; b = get_old_root(root, time_seq); level = btrfs_header_level(b); p->locks[level] = BTRFS_READ_LOCK; @@ -2786,7 +2784,7 @@ again: */ btrfs_unlock_up_safe(p, level + 1); - ret = bin_search(b, key, level, &slot); + ret = key_search(b, key, level, &prev_cmp, &slot); if (level != 0) { int dec = 0; @@ -2820,7 +2818,11 @@ again: btrfs_clear_path_blocking(p, b, BTRFS_READ_LOCK); } - b = tree_mod_log_rewind(root->fs_info, b, time_seq); + b = tree_mod_log_rewind(root->fs_info, p, b, time_seq); + if (!b) { + ret = -ENOMEM; + goto done; + } p->locks[level] = BTRFS_READ_LOCK; p->nodes[level] = b; } else { @@ -3143,13 +3145,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(c, root->root_key.objectid); - write_extent_buffer(c, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(c), + write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(c), BTRFS_FSID_SIZE); write_extent_buffer(c, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(c), - BTRFS_UUID_SIZE); + btrfs_header_chunk_tree_uuid(c), BTRFS_UUID_SIZE); btrfs_set_node_key(c, &lower_key, 0); btrfs_set_node_blockptr(c, 0, lower->start); @@ -3208,7 +3208,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans, } if (level) { ret = tree_mod_log_insert_key(root->fs_info, lower, slot, - MOD_LOG_KEY_ADD); + MOD_LOG_KEY_ADD, GFP_NOFS); BUG_ON(ret < 0); } btrfs_set_node_key(lower, key, slot); @@ -3284,10 +3284,9 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(split, root->root_key.objectid); write_extent_buffer(split, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(split), - BTRFS_FSID_SIZE); + btrfs_header_fsid(split), BTRFS_FSID_SIZE); write_extent_buffer(split, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(split), + btrfs_header_chunk_tree_uuid(split), BTRFS_UUID_SIZE); tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); @@ -4040,11 +4039,10 @@ again: btrfs_set_header_owner(right, root->root_key.objectid); btrfs_set_header_level(right, 0); write_extent_buffer(right, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(right), - BTRFS_FSID_SIZE); + btrfs_header_fsid(right), BTRFS_FSID_SIZE); write_extent_buffer(right, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(right), + btrfs_header_chunk_tree_uuid(right), BTRFS_UUID_SIZE); if (split == 0) { @@ -4642,7 +4640,7 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, (nritems - slot - 1)); } else if (level) { ret = tree_mod_log_insert_key(root->fs_info, parent, slot, - MOD_LOG_KEY_REMOVE); + MOD_LOG_KEY_REMOVE, GFP_NOFS); BUG_ON(ret < 0); } @@ -4814,7 +4812,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, * This may release the path, and so you may lose any locks held at the * time you call it. */ -int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) +static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) { struct btrfs_key key; struct btrfs_disk_key found_key; @@ -5329,19 +5327,20 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; advance_right = ADVANCE; } else { + enum btrfs_compare_tree_result cmp; + WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); ret = tree_compare_item(left_root, left_path, right_path, tmp_buf); - if (ret) { - WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); - ret = changed_cb(left_root, right_root, - left_path, right_path, - &left_key, - BTRFS_COMPARE_TREE_CHANGED, - ctx); - if (ret < 0) - goto out; - } + if (ret) + cmp = BTRFS_COMPARE_TREE_CHANGED; + else + cmp = BTRFS_COMPARE_TREE_SAME; + ret = changed_cb(left_root, right_root, + left_path, right_path, + &left_key, cmp, ctx); + if (ret < 0) + goto out; advance_left = ADVANCE; advance_right = ADVANCE; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e795bf135e80..3c1da6f98a4d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -23,6 +23,7 @@ #include <linux/highmem.h> #include <linux/fs.h> #include <linux/rwsem.h> +#include <linux/semaphore.h> #include <linux/completion.h> #include <linux/backing-dev.h> #include <linux/wait.h> @@ -91,6 +92,9 @@ struct btrfs_ordered_sum; /* holds quota configuration and tracking */ #define BTRFS_QUOTA_TREE_OBJECTID 8ULL +/* for storing items that use the BTRFS_UUID_KEY* types */ +#define BTRFS_UUID_TREE_OBJECTID 9ULL + /* for storing balance parameters in the root tree */ #define BTRFS_BALANCE_OBJECTID -4ULL @@ -142,7 +146,7 @@ struct btrfs_ordered_sum; #define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 -#define BTRFS_DEV_REPLACE_DEVID 0 +#define BTRFS_DEV_REPLACE_DEVID 0ULL /* * the max metadata block size. This limit is somewhat artificial, @@ -478,9 +482,10 @@ struct btrfs_super_block { char label[BTRFS_LABEL_SIZE]; __le64 cache_generation; + __le64 uuid_tree_generation; /* future expansion */ - __le64 reserved[31]; + __le64 reserved[30]; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; } __attribute__ ((__packed__)); @@ -1188,6 +1193,7 @@ enum btrfs_caching_type { BTRFS_CACHE_STARTED = 1, BTRFS_CACHE_FAST = 2, BTRFS_CACHE_FINISHED = 3, + BTRFS_CACHE_ERROR = 4, }; enum btrfs_disk_cache_state { @@ -1302,6 +1308,7 @@ struct btrfs_fs_info { struct btrfs_root *fs_root; struct btrfs_root *csum_root; struct btrfs_root *quota_root; + struct btrfs_root *uuid_root; /* the log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; @@ -1350,6 +1357,7 @@ struct btrfs_fs_info { u64 last_trans_log_full_commit; unsigned long mount_opt; unsigned long compress_type:4; + int commit_interval; /* * It is a suggestive number, the read side is safe even it gets a * wrong number because we will write out the data into a regular @@ -1411,6 +1419,13 @@ struct btrfs_fs_info { * before jumping into the main commit. */ struct mutex ordered_operations_mutex; + + /* + * Same as ordered_operations_mutex except this is for ordered extents + * and not the operations. + */ + struct mutex ordered_extent_flush_mutex; + struct rw_semaphore extent_commit_sem; struct rw_semaphore cleanup_work_sem; @@ -1641,6 +1656,9 @@ struct btrfs_fs_info { struct btrfs_dev_replace dev_replace; atomic_t mutually_exclusive_operation_running; + + struct semaphore uuid_tree_rescan_sem; + unsigned int update_uuid_tree_gen:1; }; /* @@ -1934,6 +1952,19 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_DEV_REPLACE_KEY 250 /* + * Stores items that allow to quickly map UUIDs to something else. + * These items are part of the filesystem UUID tree. + * The key is built like this: + * (UUID_upper_64_bits, BTRFS_UUID_KEY*, UUID_lower_64_bits). + */ +#if BTRFS_UUID_SIZE != 16 +#error "UUID items require BTRFS_UUID_SIZE == 16!" +#endif +#define BTRFS_UUID_KEY_SUBVOL 251 /* for UUIDs assigned to subvols */ +#define BTRFS_UUID_KEY_RECEIVED_SUBVOL 252 /* for UUIDs assigned to + * received subvols */ + +/* * string items are for debugging. They just store a short string of * data in the FS */ @@ -1967,6 +1998,9 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20) #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) +#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23) + +#define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -2130,14 +2164,14 @@ BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item, BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item, generation, 64); -static inline char *btrfs_device_uuid(struct btrfs_dev_item *d) +static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d) { - return (char *)d + offsetof(struct btrfs_dev_item, uuid); + return (unsigned long)d + offsetof(struct btrfs_dev_item, uuid); } -static inline char *btrfs_device_fsid(struct btrfs_dev_item *d) +static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d) { - return (char *)d + offsetof(struct btrfs_dev_item, fsid); + return (unsigned long)d + offsetof(struct btrfs_dev_item, fsid); } BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64); @@ -2240,6 +2274,23 @@ BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item, + sequence, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item, + transid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item, + nbytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item, + block_group, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); static inline struct btrfs_timespec * btrfs_inode_atime(struct btrfs_inode_item *inode_item) @@ -2267,6 +2318,8 @@ btrfs_inode_ctime(struct btrfs_inode_item *inode_item) BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); +BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); +BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32); /* struct btrfs_dev_extent */ BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, @@ -2277,10 +2330,10 @@ BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, chunk_offset, 64); BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); -static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev) +static inline unsigned long btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev) { unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid); - return (u8 *)((unsigned long)dev + ptr); + return (unsigned long)dev + ptr; } BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); @@ -2348,6 +2401,10 @@ BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32); /* struct btrfs_node */ BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr, + blockptr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr, + generation, 64); static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr) { @@ -2404,6 +2461,8 @@ static inline void btrfs_set_node_key(struct extent_buffer *eb, /* struct btrfs_item */ BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32); BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32); +BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32); static inline unsigned long btrfs_item_nr_offset(int nr) { @@ -2466,6 +2525,13 @@ BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8); BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dir_type, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item, + data_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item, + name_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item, + transid, 64); static inline void btrfs_dir_item_key(struct extent_buffer *eb, struct btrfs_dir_item *item, @@ -2568,6 +2634,12 @@ BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64); BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32); BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64); BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8); +BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64); +BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header, + nritems, 32); +BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64); static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag) { @@ -2603,16 +2675,14 @@ static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, btrfs_set_header_flags(eb, flags); } -static inline u8 *btrfs_header_fsid(struct extent_buffer *eb) +static inline unsigned long btrfs_header_fsid(struct extent_buffer *eb) { - unsigned long ptr = offsetof(struct btrfs_header, fsid); - return (u8 *)ptr; + return offsetof(struct btrfs_header, fsid); } -static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb) +static inline unsigned long btrfs_header_chunk_tree_uuid(struct extent_buffer *eb) { - unsigned long ptr = offsetof(struct btrfs_header, chunk_tree_uuid); - return (u8 *)ptr; + return offsetof(struct btrfs_header, chunk_tree_uuid); } static inline int btrfs_is_leaf(struct extent_buffer *eb) @@ -2830,6 +2900,9 @@ BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, csum_type, 16); BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, cache_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); +BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, + uuid_tree_generation, 64); static inline int btrfs_super_csum_size(struct btrfs_super_block *s) { @@ -2847,6 +2920,14 @@ static inline unsigned long btrfs_leaf_data(struct extent_buffer *l) /* struct btrfs_file_extent_item */ BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr, + struct btrfs_file_extent_item, disk_bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset, + struct btrfs_file_extent_item, offset, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation, + struct btrfs_file_extent_item, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes, + struct btrfs_file_extent_item, num_bytes, 64); static inline unsigned long btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e) @@ -3107,11 +3188,9 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 root_objectid, u64 owner, u64 offset, struct btrfs_key *ins); -int btrfs_reserve_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 num_bytes, u64 min_alloc_size, - u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, int is_data); +int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes, + u64 min_alloc_size, u64 empty_size, u64 hint_byte, + struct btrfs_key *ins, int is_data); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref, int for_cow); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -3175,7 +3254,7 @@ void btrfs_orphan_release_metadata(struct inode *inode); int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, - u64 *qgroup_reserved); + u64 *qgroup_reserved, bool use_global_rsv); void btrfs_subvolume_release_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, u64 qgroup_reserved); @@ -3245,6 +3324,7 @@ enum btrfs_compare_tree_result { BTRFS_COMPARE_TREE_NEW, BTRFS_COMPARE_TREE_DELETED, BTRFS_COMPARE_TREE_CHANGED, + BTRFS_COMPARE_TREE_SAME, }; typedef int (*btrfs_changed_cb_t)(struct btrfs_root *left_root, struct btrfs_root *right_root, @@ -3380,6 +3460,7 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info) kfree(fs_info->dev_root); kfree(fs_info->csum_root); kfree(fs_info->quota_root); + kfree(fs_info->uuid_root); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); kfree(fs_info); @@ -3414,8 +3495,6 @@ int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_root_item *item); -void btrfs_read_root_item(struct extent_buffer *eb, int slot, - struct btrfs_root_item *item); int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key, struct btrfs_path *path, struct btrfs_root_item *root_item, struct btrfs_key *root_key); @@ -3426,6 +3505,17 @@ void btrfs_check_and_init_root_item(struct btrfs_root_item *item); void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct btrfs_root *root); +/* uuid-tree.c */ +int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, + struct btrfs_root *uuid_root, u8 *uuid, u8 type, + u64 subid); +int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans, + struct btrfs_root *uuid_root, u8 *uuid, u8 type, + u64 subid); +int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, + int (*check_func)(struct btrfs_fs_info *, u8 *, u8, + u64)); + /* dir-item.c */ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, const char *name, int name_len); @@ -3509,12 +3599,14 @@ int btrfs_find_name_in_ext_backref(struct btrfs_path *path, struct btrfs_inode_extref **extref_ret); /* file-item.c */ +struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio, u32 *dst); int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, - struct bio *bio, u64 logical_offset); + struct btrfs_dio_private *dip, struct bio *bio, + u64 logical_offset); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, @@ -3552,8 +3644,7 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work); struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len, int create); -noinline int can_nocow_extent(struct btrfs_trans_handle *trans, - struct inode *inode, u64 offset, u64 *len, +noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes); @@ -3643,11 +3734,15 @@ extern const struct dentry_operations btrfs_dentry_operations; long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); void btrfs_update_iflags(struct inode *inode); void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); +int btrfs_is_empty_uuid(u8 *uuid); int btrfs_defrag_file(struct inode *inode, struct file *file, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_pages); void btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); +void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, + struct btrfs_ioctl_balance_args *bargs); + /* file.c */ int btrfs_auto_defrag_init(void); @@ -3720,6 +3815,22 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) #define btrfs_debug(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) +#ifdef CONFIG_BTRFS_ASSERT + +static inline void assfail(char *expr, char *file, int line) +{ + printk(KERN_ERR "BTRFS assertion failed: %s, file: %s, line: %d", + expr, file, line); + BUG(); +} + +#define ASSERT(expr) \ + (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) +#else +#define ASSERT(expr) ((void)0) +#endif + +#define btrfs_assert() __printf(5, 6) void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 375510913fe7..cbd9523ad09c 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -21,6 +21,7 @@ #include "delayed-inode.h" #include "disk-io.h" #include "transaction.h" +#include "ctree.h" #define BTRFS_DELAYED_WRITEBACK 512 #define BTRFS_DELAYED_BACKGROUND 128 @@ -1453,10 +1454,10 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, dir_item = (struct btrfs_dir_item *)delayed_item->data; dir_item->location = *disk_key; - dir_item->transid = cpu_to_le64(trans->transid); - dir_item->data_len = 0; - dir_item->name_len = cpu_to_le16(name_len); - dir_item->type = type; + btrfs_set_stack_dir_transid(dir_item, trans->transid); + btrfs_set_stack_dir_data_len(dir_item, 0); + btrfs_set_stack_dir_name_len(dir_item, name_len); + btrfs_set_stack_dir_type(dir_item, type); memcpy((char *)(dir_item + 1), name, name_len); ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item); @@ -1470,13 +1471,11 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, mutex_lock(&delayed_node->mutex); ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); if (unlikely(ret)) { - printk(KERN_ERR "err add delayed dir index item(name: %s) into " - "the insertion tree of the delayed node" + printk(KERN_ERR "err add delayed dir index item(name: %.*s) " + "into the insertion tree of the delayed node" "(root id: %llu, inode id: %llu, errno: %d)\n", - name, - (unsigned long long)delayed_node->root->objectid, - (unsigned long long)delayed_node->inode_id, - ret); + name_len, name, delayed_node->root->objectid, + delayed_node->inode_id, ret); BUG(); } mutex_unlock(&delayed_node->mutex); @@ -1547,9 +1546,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, printk(KERN_ERR "err add delayed dir index item(index: %llu) " "into the deletion tree of the delayed node" "(root id: %llu, inode id: %llu, errno: %d)\n", - (unsigned long long)index, - (unsigned long long)node->root->objectid, - (unsigned long long)node->inode_id, + index, node->root->objectid, node->inode_id, ret); BUG(); } @@ -1699,7 +1696,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, di = (struct btrfs_dir_item *)curr->data; name = (char *)(di + 1); - name_len = le16_to_cpu(di->name_len); + name_len = btrfs_stack_dir_name_len(di); d_type = btrfs_filetype_table[di->type]; btrfs_disk_key_to_cpu(&location, &di->location); @@ -1716,27 +1713,6 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, return 0; } -BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item, - generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item, - sequence, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item, - transid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item, - nbytes, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item, - block_group, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); - -BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); -BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32); - static void fill_stack_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *inode_item, struct inode *inode) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index c219463fb1fd..e4d467be2dd4 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -241,7 +241,7 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, return 0; } -static void inline drop_delayed_ref(struct btrfs_trans_handle *trans, +static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_node *ref) { @@ -600,7 +600,7 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&head_ref->cluster); mutex_init(&head_ref->mutex); - trace_btrfs_delayed_ref_head(ref, head_ref, action); + trace_add_delayed_ref_head(ref, head_ref, action); existing = tree_insert(&delayed_refs->root, &ref->rb_node); @@ -661,7 +661,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->type = BTRFS_TREE_BLOCK_REF_KEY; full_ref->level = level; - trace_btrfs_delayed_tree_ref(ref, full_ref, action); + trace_add_delayed_tree_ref(ref, full_ref, action); existing = tree_insert(&delayed_refs->root, &ref->rb_node); @@ -722,7 +722,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, full_ref->objectid = owner; full_ref->offset = offset; - trace_btrfs_delayed_data_ref(ref, full_ref, action); + trace_add_delayed_data_ref(ref, full_ref, action); existing = tree_insert(&delayed_refs->root, &ref->rb_node); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 5f8f3341c099..a64435359385 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -148,13 +148,13 @@ no_valid_dev_replace_entry_found: !btrfs_test_opt(dev_root, DEGRADED)) { ret = -EIO; pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n", - (unsigned long long)src_devid); + src_devid); } if (!dev_replace->tgtdev && !btrfs_test_opt(dev_root, DEGRADED)) { ret = -EIO; pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n", - (unsigned long long)BTRFS_DEV_REPLACE_DEVID); + BTRFS_DEV_REPLACE_DEVID); } if (dev_replace->tgtdev) { if (dev_replace->srcdev) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6b092a1c4e37..4cbb00af92ff 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -31,6 +31,7 @@ #include <linux/migrate.h> #include <linux/ratelimit.h> #include <linux/uuid.h> +#include <linux/semaphore.h> #include <asm/unaligned.h> #include "compat.h" #include "ctree.h" @@ -302,9 +303,8 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, printk_ratelimited(KERN_INFO "btrfs: %s checksum verify " "failed on %llu wanted %X found %X " "level %d\n", - root->fs_info->sb->s_id, - (unsigned long long)buf->start, val, found, - btrfs_header_level(buf)); + root->fs_info->sb->s_id, buf->start, + val, found, btrfs_header_level(buf)); if (result != (char *)&inline_result) kfree(result); return 1; @@ -345,9 +345,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, } printk_ratelimited("parent transid verify failed on %llu wanted %llu " "found %llu\n", - (unsigned long long)eb->start, - (unsigned long long)parent_transid, - (unsigned long long)btrfs_header_generation(eb)); + eb->start, parent_transid, btrfs_header_generation(eb)); ret = 1; clear_extent_buffer_uptodate(eb); out: @@ -497,8 +495,7 @@ static int check_tree_block_fsid(struct btrfs_root *root, u8 fsid[BTRFS_UUID_SIZE]; int ret = 1; - read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb), - BTRFS_FSID_SIZE); + read_extent_buffer(eb, fsid, btrfs_header_fsid(eb), BTRFS_FSID_SIZE); while (fs_devices) { if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) { ret = 0; @@ -512,8 +509,7 @@ static int check_tree_block_fsid(struct btrfs_root *root, #define CORRUPT(reason, eb, root, slot) \ printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu," \ "root=%llu, slot=%d\n", reason, \ - (unsigned long long)btrfs_header_bytenr(eb), \ - (unsigned long long)root->objectid, slot) + btrfs_header_bytenr(eb), root->objectid, slot) static noinline int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf) @@ -576,8 +572,9 @@ static noinline int check_leaf(struct btrfs_root *root, return 0; } -static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, - struct extent_state *state, int mirror) +static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, + u64 phy_offset, struct page *page, + u64 start, u64 end, int mirror) { struct extent_io_tree *tree; u64 found_start; @@ -612,14 +609,13 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, if (found_start != eb->start) { printk_ratelimited(KERN_INFO "btrfs bad tree block start " "%llu %llu\n", - (unsigned long long)found_start, - (unsigned long long)eb->start); + found_start, eb->start); ret = -EIO; goto err; } if (check_tree_block_fsid(root, eb)) { printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n", - (unsigned long long)eb->start); + eb->start); ret = -EIO; goto err; } @@ -1148,6 +1144,10 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, return NULL; ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); + if (ret) { + free_extent_buffer(buf); + return NULL; + } return buf; } @@ -1291,11 +1291,10 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_set_header_owner(leaf, objectid); root->node = leaf; - write_extent_buffer(leaf, fs_info->fsid, - (unsigned long)btrfs_header_fsid(leaf), + write_extent_buffer(leaf, fs_info->fsid, btrfs_header_fsid(leaf), BTRFS_FSID_SIZE); write_extent_buffer(leaf, fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(leaf), + btrfs_header_chunk_tree_uuid(leaf), BTRFS_UUID_SIZE); btrfs_mark_buffer_dirty(leaf); @@ -1379,8 +1378,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, root->node = leaf; write_extent_buffer(root->node, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(root->node), - BTRFS_FSID_SIZE); + btrfs_header_fsid(root->node), BTRFS_FSID_SIZE); btrfs_mark_buffer_dirty(root->node); btrfs_tree_unlock(root->node); return root; @@ -1413,11 +1411,11 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, log_root->root_key.offset = root->root_key.objectid; inode_item = &log_root->root_item.inode; - inode_item->generation = cpu_to_le64(1); - inode_item->size = cpu_to_le64(3); - inode_item->nlink = cpu_to_le32(1); - inode_item->nbytes = cpu_to_le64(root->leafsize); - inode_item->mode = cpu_to_le32(S_IFDIR | 0755); + btrfs_set_stack_inode_generation(inode_item, 1); + btrfs_set_stack_inode_size(inode_item, 3); + btrfs_set_stack_inode_nlink(inode_item, 1); + btrfs_set_stack_inode_nbytes(inode_item, root->leafsize); + btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); btrfs_set_root_node(&log_root->root_item, log_root->node); @@ -1428,8 +1426,8 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, return 0; } -struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, - struct btrfs_key *key) +static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, + struct btrfs_key *key) { struct btrfs_root *root; struct btrfs_fs_info *fs_info = tree_root->fs_info; @@ -1529,8 +1527,8 @@ fail: return ret; } -struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, - u64 root_id) +static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_id) { struct btrfs_root *root; @@ -1581,10 +1579,16 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID) return fs_info->quota_root ? fs_info->quota_root : ERR_PTR(-ENOENT); + if (location->objectid == BTRFS_UUID_TREE_OBJECTID) + return fs_info->uuid_root ? fs_info->uuid_root : + ERR_PTR(-ENOENT); again: root = btrfs_lookup_fs_root(fs_info, location->objectid); - if (root) + if (root) { + if (btrfs_root_refs(&root->root_item) == 0) + return ERR_PTR(-ENOENT); return root; + } root = btrfs_read_fs_root(fs_info->tree_root, location); if (IS_ERR(root)) @@ -1737,7 +1741,7 @@ static int transaction_kthread(void *arg) do { cannot_commit = false; - delay = HZ * 30; + delay = HZ * root->fs_info->commit_interval; mutex_lock(&root->fs_info->transaction_kthread_mutex); spin_lock(&root->fs_info->trans_lock); @@ -1749,7 +1753,8 @@ static int transaction_kthread(void *arg) now = get_seconds(); if (cur->state < TRANS_STATE_BLOCKED && - (now < cur->start_time || now - cur->start_time < 30)) { + (now < cur->start_time || + now - cur->start_time < root->fs_info->commit_interval)) { spin_unlock(&root->fs_info->trans_lock); delay = HZ * 5; goto sleep; @@ -2038,6 +2043,12 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) info->quota_root->node = NULL; info->quota_root->commit_root = NULL; } + if (info->uuid_root) { + free_extent_buffer(info->uuid_root->node); + free_extent_buffer(info->uuid_root->commit_root); + info->uuid_root->node = NULL; + info->uuid_root->commit_root = NULL; + } if (chunk_root) { free_extent_buffer(info->chunk_root->node); free_extent_buffer(info->chunk_root->commit_root); @@ -2098,11 +2109,14 @@ int open_ctree(struct super_block *sb, struct btrfs_root *chunk_root; struct btrfs_root *dev_root; struct btrfs_root *quota_root; + struct btrfs_root *uuid_root; struct btrfs_root *log_tree_root; int ret; int err = -EINVAL; int num_backups_tried = 0; int backup_index = 0; + bool create_uuid_tree; + bool check_uuid_tree; tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); @@ -2189,6 +2203,7 @@ int open_ctree(struct super_block *sb, fs_info->defrag_inodes = RB_ROOT; fs_info->free_chunk_space = 0; fs_info->tree_mod_log = RB_ROOT; + fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; /* readahead state */ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); @@ -2270,6 +2285,7 @@ int open_ctree(struct super_block *sb, mutex_init(&fs_info->ordered_operations_mutex); + mutex_init(&fs_info->ordered_extent_flush_mutex); mutex_init(&fs_info->tree_log_mutex); mutex_init(&fs_info->chunk_mutex); mutex_init(&fs_info->transaction_kthread_mutex); @@ -2278,6 +2294,7 @@ int open_ctree(struct super_block *sb, init_rwsem(&fs_info->extent_commit_sem); init_rwsem(&fs_info->cleanup_work_sem); init_rwsem(&fs_info->subvol_sem); + sema_init(&fs_info->uuid_tree_rescan_sem, 1); fs_info->dev_replace.lock_owner = 0; atomic_set(&fs_info->dev_replace.nesting_level, 0); mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); @@ -2383,7 +2400,7 @@ int open_ctree(struct super_block *sb, if (features) { printk(KERN_ERR "BTRFS: couldn't mount because of " "unsupported optional features (%Lx).\n", - (unsigned long long)features); + features); err = -EINVAL; goto fail_alloc; } @@ -2453,7 +2470,7 @@ int open_ctree(struct super_block *sb, if (!(sb->s_flags & MS_RDONLY) && features) { printk(KERN_ERR "BTRFS: couldn't mount RDWR because of " "unsupported option features (%Lx).\n", - (unsigned long long)features); + features); err = -EINVAL; goto fail_alloc; } @@ -2466,20 +2483,17 @@ int open_ctree(struct super_block *sb, &fs_info->generic_worker); btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", - fs_info->thread_pool_size, - &fs_info->generic_worker); + fs_info->thread_pool_size, NULL); btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc", - fs_info->thread_pool_size, - &fs_info->generic_worker); + fs_info->thread_pool_size, NULL); btrfs_init_workers(&fs_info->submit_workers, "submit", min_t(u64, fs_devices->num_devices, - fs_info->thread_pool_size), - &fs_info->generic_worker); + fs_info->thread_pool_size), NULL); btrfs_init_workers(&fs_info->caching_workers, "cache", - 2, &fs_info->generic_worker); + fs_info->thread_pool_size, NULL); /* a higher idle thresh on the submit workers makes it much more * likely that bios will be send down in a sane order to the @@ -2575,7 +2589,7 @@ int open_ctree(struct super_block *sb, sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); - if (disk_super->magic != cpu_to_le64(BTRFS_MAGIC)) { + if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) { printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id); goto fail_sb_buffer; } @@ -2615,8 +2629,7 @@ int open_ctree(struct super_block *sb, chunk_root->commit_root = btrfs_root_node(chunk_root); read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), - BTRFS_UUID_SIZE); + btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE); ret = btrfs_read_chunk_tree(chunk_root); if (ret) { @@ -2696,6 +2709,22 @@ retry_root_backup: fs_info->quota_root = quota_root; } + location.objectid = BTRFS_UUID_TREE_OBJECTID; + uuid_root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(uuid_root)) { + ret = PTR_ERR(uuid_root); + if (ret != -ENOENT) + goto recovery_tree_root; + create_uuid_tree = true; + check_uuid_tree = false; + } else { + uuid_root->track_dirty = 1; + fs_info->uuid_root = uuid_root; + create_uuid_tree = false; + check_uuid_tree = + generation != btrfs_super_uuid_tree_generation(disk_super); + } + fs_info->generation = generation; fs_info->last_trans_committed = generation; @@ -2882,6 +2911,29 @@ retry_root_backup: btrfs_qgroup_rescan_resume(fs_info); + if (create_uuid_tree) { + pr_info("btrfs: creating UUID tree\n"); + ret = btrfs_create_uuid_tree(fs_info); + if (ret) { + pr_warn("btrfs: failed to create the UUID tree %d\n", + ret); + close_ctree(tree_root); + return ret; + } + } else if (check_uuid_tree || + btrfs_test_opt(tree_root, RESCAN_UUID_TREE)) { + pr_info("btrfs: checking UUID tree\n"); + ret = btrfs_check_uuid_tree(fs_info); + if (ret) { + pr_warn("btrfs: failed to check the UUID tree %d\n", + ret); + close_ctree(tree_root); + return ret; + } + } else { + fs_info->update_uuid_tree_gen = 1; + } + return 0; fail_qgroup: @@ -2983,15 +3035,17 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) */ for (i = 0; i < 1; i++) { bytenr = btrfs_sb_offset(i); - if (bytenr + 4096 >= i_size_read(bdev->bd_inode)) + if (bytenr + BTRFS_SUPER_INFO_SIZE >= + i_size_read(bdev->bd_inode)) break; - bh = __bread(bdev, bytenr / 4096, 4096); + bh = __bread(bdev, bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); if (!bh) continue; super = (struct btrfs_super_block *)bh->b_data; if (btrfs_super_bytenr(super) != bytenr || - super->magic != cpu_to_le64(BTRFS_MAGIC)) { + btrfs_super_magic(super) != BTRFS_MAGIC) { brelse(bh); continue; } @@ -3311,7 +3365,6 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) int total_errors = 0; u64 flags; - max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1; do_barriers = !btrfs_test_opt(root, NOBARRIER); backup_super_roots(root->fs_info); @@ -3320,6 +3373,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) mutex_lock(&root->fs_info->fs_devices->device_list_mutex); head = &root->fs_info->fs_devices->devices; + max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1; if (do_barriers) { ret = barrier_all_devices(root->fs_info); @@ -3362,8 +3416,10 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) printk(KERN_ERR "btrfs: %d errors while writing supers\n", total_errors); - /* This shouldn't happen. FUA is masked off if unsupported */ - BUG(); + /* FUA is masked off if unsupported and can't be the reason */ + btrfs_error(root->fs_info, -EIO, + "%d errors while writing supers", total_errors); + return -EIO; } total_errors = 0; @@ -3421,6 +3477,8 @@ static void free_fs_root(struct btrfs_root *root) { iput(root->cache_inode); WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); + btrfs_free_block_rsv(root, root->orphan_block_rsv); + root->orphan_block_rsv = NULL; if (root->anon_dev) free_anon_bdev(root->anon_dev); free_extent_buffer(root->node); @@ -3510,6 +3568,11 @@ int close_ctree(struct btrfs_root *root) fs_info->closing = 1; smp_mb(); + /* wait for the uuid_scan task to finish */ + down(&fs_info->uuid_tree_rescan_sem); + /* avoid complains from lockdep et al., set sem back to initial state */ + up(&fs_info->uuid_tree_rescan_sem); + /* pause restriper - we want to resume on mount */ btrfs_pause_balance(fs_info); @@ -3573,6 +3636,9 @@ int close_ctree(struct btrfs_root *root) btrfs_free_stripe_hash_table(fs_info); + btrfs_free_block_rsv(root, root->orphan_block_rsv); + root->orphan_block_rsv = NULL; + return 0; } @@ -3608,9 +3674,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) if (transid != root->fs_info->generation) WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, " "found %llu running %llu\n", - (unsigned long long)buf->start, - (unsigned long long)transid, - (unsigned long long)root->fs_info->generation); + buf->start, transid, root->fs_info->generation); was_dirty = set_extent_buffer_dirty(buf); if (!was_dirty) __percpu_counter_add(&root->fs_info->dirty_metadata_bytes, @@ -3744,8 +3808,8 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->ordered_root_lock); } -int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, - struct btrfs_root *root) +static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_root *root) { struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1204c8ef6f32..cfb3cf711b34 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -113,7 +113,8 @@ static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) { smp_mb(); - return cache->cached == BTRFS_CACHE_FINISHED; + return cache->cached == BTRFS_CACHE_FINISHED || + cache->cached == BTRFS_CACHE_ERROR; } static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) @@ -389,7 +390,7 @@ static noinline void caching_thread(struct btrfs_work *work) u64 total_found = 0; u64 last = 0; u32 nritems; - int ret = 0; + int ret = -ENOMEM; caching_ctl = container_of(work, struct btrfs_caching_control, work); block_group = caching_ctl->block_group; @@ -420,6 +421,7 @@ again: /* need to make sure the commit_root doesn't disappear */ down_read(&fs_info->extent_commit_sem); +next: ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto err; @@ -459,6 +461,16 @@ again: continue; } + if (key.objectid < last) { + key.objectid = last; + key.offset = 0; + key.type = BTRFS_EXTENT_ITEM_KEY; + + caching_ctl->progress = last; + btrfs_release_path(path); + goto next; + } + if (key.objectid < block_group->key.objectid) { path->slots[0]++; continue; @@ -506,6 +518,12 @@ err: mutex_unlock(&caching_ctl->mutex); out: + if (ret) { + spin_lock(&block_group->lock); + block_group->caching_ctl = NULL; + block_group->cached = BTRFS_CACHE_ERROR; + spin_unlock(&block_group->lock); + } wake_up(&caching_ctl->wait); put_caching_control(caching_ctl); @@ -771,10 +789,23 @@ again: goto out_free; if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = root->leafsize; - btrfs_release_path(path); - goto again; + metadata = 0; + if (path->slots[0]) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == bytenr && + key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == root->leafsize) + ret = 0; + } + if (ret) { + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = root->leafsize; + btrfs_release_path(path); + goto again; + } } if (ret == 0) { @@ -2011,6 +2042,8 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, ins.type = BTRFS_EXTENT_ITEM_KEY; ref = btrfs_delayed_node_to_data_ref(node); + trace_run_delayed_data_ref(node, ref, node->action); + if (node->type == BTRFS_SHARED_DATA_REF_KEY) parent = ref->parent; else @@ -2154,6 +2187,8 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, SKINNY_METADATA); ref = btrfs_delayed_node_to_tree_ref(node); + trace_run_delayed_tree_ref(node, ref, node->action); + if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) parent = ref->parent; else @@ -2212,6 +2247,8 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, */ BUG_ON(extent_op); head = btrfs_delayed_node_to_head(node); + trace_run_delayed_ref_head(node, head, node->action); + if (insert_reserved) { btrfs_pin_extent(root, node->bytenr, node->num_bytes, 1); @@ -2403,6 +2440,8 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, default: WARN_ON(1); } + } else { + list_del_init(&locked_ref->cluster); } spin_unlock(&delayed_refs->lock); @@ -2425,7 +2464,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, * list before we release it. */ if (btrfs_delayed_ref_is_head(ref)) { - list_del_init(&locked_ref->cluster); btrfs_delayed_ref_unlock(locked_ref); locked_ref = NULL; } @@ -3799,8 +3837,12 @@ again: if (force < space_info->force_alloc) force = space_info->force_alloc; if (space_info->full) { + if (should_alloc_chunk(extent_root, space_info, force)) + ret = -ENOSPC; + else + ret = 0; spin_unlock(&space_info->lock); - return 0; + return ret; } if (!should_alloc_chunk(extent_root, space_info, force)) { @@ -4320,6 +4362,9 @@ static struct btrfs_block_rsv *get_block_rsv( if (root == root->fs_info->csum_root && trans->adding_csums) block_rsv = trans->block_rsv; + if (root == root->fs_info->uuid_root) + block_rsv = trans->block_rsv; + if (!block_rsv) block_rsv = root->block_rsv; @@ -4729,10 +4774,12 @@ void btrfs_orphan_release_metadata(struct inode *inode) int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int items, - u64 *qgroup_reserved) + u64 *qgroup_reserved, + bool use_global_rsv) { u64 num_bytes; int ret; + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; if (root->fs_info->quota_enabled) { /* One for parent inode, two for dir entries */ @@ -4751,6 +4798,10 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, BTRFS_BLOCK_GROUP_METADATA); ret = btrfs_block_rsv_add(root, rsv, num_bytes, BTRFS_RESERVE_FLUSH_ALL); + + if (ret == -ENOSPC && use_global_rsv) + ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes); + if (ret) { if (*qgroup_reserved) btrfs_qgroup_free(root, *qgroup_reserved); @@ -5668,7 +5719,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (ret) { btrfs_err(info, "umm, got %d back from search, was looking for %llu", - ret, (unsigned long long)bytenr); + ret, bytenr); if (ret > 0) btrfs_print_leaf(extent_root, path->nodes[0]); @@ -5684,11 +5735,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, WARN_ON(1); btrfs_err(info, "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", - (unsigned long long)bytenr, - (unsigned long long)parent, - (unsigned long long)root_objectid, - (unsigned long long)owner_objectid, - (unsigned long long)owner_offset); + bytenr, parent, root_objectid, owner_objectid, + owner_offset); } else { btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -5717,7 +5765,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, -1, 1); if (ret) { btrfs_err(info, "umm, got %d back from search, was looking for %llu", - ret, (unsigned long long)bytenr); + ret, bytenr); btrfs_print_leaf(extent_root, path->nodes[0]); } if (ret < 0) { @@ -5999,8 +6047,11 @@ static u64 stripe_align(struct btrfs_root *root, * for our min num_bytes. Another option is to have it go ahead * and look in the rbtree for a free extent of a given size, but this * is a good start. + * + * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using + * any of the information in this block group. */ -static noinline int +static noinline void wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, u64 num_bytes) { @@ -6008,28 +6059,29 @@ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, caching_ctl = get_caching_control(cache); if (!caching_ctl) - return 0; + return; wait_event(caching_ctl->wait, block_group_cache_done(cache) || (cache->free_space_ctl->free_space >= num_bytes)); put_caching_control(caching_ctl); - return 0; } static noinline int wait_block_group_cache_done(struct btrfs_block_group_cache *cache) { struct btrfs_caching_control *caching_ctl; + int ret = 0; caching_ctl = get_caching_control(cache); if (!caching_ctl) - return 0; + return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0; wait_event(caching_ctl->wait, block_group_cache_done(cache)); - + if (cache->cached == BTRFS_CACHE_ERROR) + ret = -EIO; put_caching_control(caching_ctl); - return 0; + return ret; } int __get_raid_index(u64 flags) @@ -6070,8 +6122,7 @@ enum btrfs_loop_type { * ins->offset == number of blocks * Any available blocks before search_start are skipped. */ -static noinline int find_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *orig_root, +static noinline int find_free_extent(struct btrfs_root *orig_root, u64 num_bytes, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, u64 flags) @@ -6212,6 +6263,8 @@ have_block_group: ret = 0; } + if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) + goto loop; if (unlikely(block_group->ro)) goto loop; @@ -6292,10 +6345,10 @@ refill_cluster: block_group->full_stripe_len); /* allocate a cluster in this block group */ - ret = btrfs_find_space_cluster(trans, root, - block_group, last_ptr, - search_start, num_bytes, - aligned_cluster); + ret = btrfs_find_space_cluster(root, block_group, + last_ptr, search_start, + num_bytes, + aligned_cluster); if (ret == 0) { /* * now pull our allocation out of this @@ -6426,17 +6479,28 @@ loop: index = 0; loop++; if (loop == LOOP_ALLOC_CHUNK) { + struct btrfs_trans_handle *trans; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + ret = do_chunk_alloc(trans, root, flags, CHUNK_ALLOC_FORCE); /* * Do not bail out on ENOSPC since we * can do more things. */ - if (ret < 0 && ret != -ENOSPC) { + if (ret < 0 && ret != -ENOSPC) btrfs_abort_transaction(trans, root, ret); + else + ret = 0; + btrfs_end_transaction(trans, root); + if (ret) goto out; - } } if (loop == LOOP_NO_EMPTY_SIZE) { @@ -6463,19 +6527,15 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, spin_lock(&info->lock); printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n", - (unsigned long long)info->flags, - (unsigned long long)(info->total_bytes - info->bytes_used - - info->bytes_pinned - info->bytes_reserved - - info->bytes_readonly), + info->flags, + info->total_bytes - info->bytes_used - info->bytes_pinned - + info->bytes_reserved - info->bytes_readonly, (info->full) ? "" : "not "); printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, " "reserved=%llu, may_use=%llu, readonly=%llu\n", - (unsigned long long)info->total_bytes, - (unsigned long long)info->bytes_used, - (unsigned long long)info->bytes_pinned, - (unsigned long long)info->bytes_reserved, - (unsigned long long)info->bytes_may_use, - (unsigned long long)info->bytes_readonly); + info->total_bytes, info->bytes_used, info->bytes_pinned, + info->bytes_reserved, info->bytes_may_use, + info->bytes_readonly); spin_unlock(&info->lock); if (!dump_block_groups) @@ -6486,12 +6546,9 @@ again: list_for_each_entry(cache, &info->block_groups[index], list) { spin_lock(&cache->lock); printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n", - (unsigned long long)cache->key.objectid, - (unsigned long long)cache->key.offset, - (unsigned long long)btrfs_block_group_used(&cache->item), - (unsigned long long)cache->pinned, - (unsigned long long)cache->reserved, - cache->ro ? "[readonly]" : ""); + cache->key.objectid, cache->key.offset, + btrfs_block_group_used(&cache->item), cache->pinned, + cache->reserved, cache->ro ? "[readonly]" : ""); btrfs_dump_free_space(cache, bytes); spin_unlock(&cache->lock); } @@ -6500,8 +6557,7 @@ again: up_read(&info->groups_sem); } -int btrfs_reserve_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data) @@ -6513,8 +6569,8 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, flags = btrfs_get_alloc_profile(root, is_data); again: WARN_ON(num_bytes < root->sectorsize); - ret = find_free_extent(trans, root, num_bytes, empty_size, - hint_byte, ins, flags); + ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, + flags); if (ret == -ENOSPC) { if (!final_tried) { @@ -6529,8 +6585,7 @@ again: sinfo = __find_space_info(root->fs_info, flags); btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu", - (unsigned long long)flags, - (unsigned long long)num_bytes); + flags, num_bytes); if (sinfo) dump_space_info(sinfo, num_bytes, 1); } @@ -6550,7 +6605,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root, cache = btrfs_lookup_block_group(root->fs_info, start); if (!cache) { btrfs_err(root->fs_info, "Unable to find block group for %llu", - (unsigned long long)start); + start); return -ENOSPC; } @@ -6646,8 +6701,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, ret = update_block_group(root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", - (unsigned long long)ins->objectid, - (unsigned long long)ins->offset); + ins->objectid, ins->offset); BUG(); } return ret; @@ -6719,8 +6773,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, ret = update_block_group(root, ins->objectid, root->leafsize, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", - (unsigned long long)ins->objectid, - (unsigned long long)ins->offset); + ins->objectid, ins->offset); BUG(); } return ret; @@ -6902,7 +6955,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, if (IS_ERR(block_rsv)) return ERR_CAST(block_rsv); - ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, + ret = btrfs_reserve_extent(root, blocksize, blocksize, empty_size, hint, &ins, 0); if (ret) { unuse_block_rsv(root->fs_info, block_rsv, blocksize); @@ -7173,6 +7226,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, next = btrfs_find_create_tree_block(root, bytenr, blocksize); if (!next) return -ENOMEM; + btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, + level - 1); reada = 1; } btrfs_tree_lock(next); @@ -7658,7 +7713,7 @@ out: * don't have it in the radix (like when we recover after a power fail * or unmount) so we don't leak memory. */ - if (root_dropped == false) + if (!for_reloc && root_dropped == false) btrfs_add_dead_root(root); if (err) btrfs_std_error(root->fs_info, err); @@ -8192,7 +8247,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) * We haven't cached this block group, which means we could * possibly have excluded extents on this block group. */ - if (block_group->cached == BTRFS_CACHE_NO) + if (block_group->cached == BTRFS_CACHE_NO || + block_group->cached == BTRFS_CACHE_ERROR) free_excluded_extents(info->extent_root, block_group); btrfs_remove_free_space_cache(block_group); @@ -8409,9 +8465,13 @@ int btrfs_read_block_groups(struct btrfs_root *root) * avoid allocating from un-mirrored block group if there are * mirrored block groups. */ - list_for_each_entry(cache, &space_info->block_groups[3], list) + list_for_each_entry(cache, + &space_info->block_groups[BTRFS_RAID_RAID0], + list) set_block_group_ro(cache, 1); - list_for_each_entry(cache, &space_info->block_groups[4], list) + list_for_each_entry(cache, + &space_info->block_groups[BTRFS_RAID_SINGLE], + list) set_block_group_ro(cache, 1); } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index fe443fece851..09582b81640c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -61,9 +61,8 @@ void btrfs_leak_debug_check(void) state = list_entry(states.next, struct extent_state, leak_list); printk(KERN_ERR "btrfs state leak: start %llu end %llu " "state %lu in tree %p refs %d\n", - (unsigned long long)state->start, - (unsigned long long)state->end, - state->state, state->tree, atomic_read(&state->refs)); + state->start, state->end, state->state, state->tree, + atomic_read(&state->refs)); list_del(&state->leak_list); kmem_cache_free(extent_state_cache, state); } @@ -71,8 +70,8 @@ void btrfs_leak_debug_check(void) while (!list_empty(&buffers)) { eb = list_entry(buffers.next, struct extent_buffer, leak_list); printk(KERN_ERR "btrfs buffer leak start %llu len %lu " - "refs %d\n", (unsigned long long)eb->start, - eb->len, atomic_read(&eb->refs)); + "refs %d\n", + eb->start, eb->len, atomic_read(&eb->refs)); list_del(&eb->leak_list); kmem_cache_free(extent_buffer_cache, eb); } @@ -88,11 +87,7 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { printk_ratelimited(KERN_DEBUG "btrfs: %s: ino %llu isize %llu odd range [%llu,%llu]\n", - caller, - (unsigned long long)btrfs_ino(inode), - (unsigned long long)isize, - (unsigned long long)start, - (unsigned long long)end); + caller, btrfs_ino(inode), isize, start, end); } } #else @@ -388,8 +383,7 @@ static int insert_state(struct extent_io_tree *tree, if (end < start) WARN(1, KERN_ERR "btrfs end < start %llu %llu\n", - (unsigned long long)end, - (unsigned long long)start); + end, start); state->start = start; state->end = end; @@ -400,9 +394,8 @@ static int insert_state(struct extent_io_tree *tree, struct extent_state *found; found = rb_entry(node, struct extent_state, rb_node); printk(KERN_ERR "btrfs found node %llu %llu on insert of " - "%llu %llu\n", (unsigned long long)found->start, - (unsigned long long)found->end, - (unsigned long long)start, (unsigned long long)end); + "%llu %llu\n", + found->start, found->end, start, end); return -EEXIST; } state->tree = tree; @@ -762,15 +755,6 @@ static void cache_state(struct extent_state *state, } } -static void uncache_state(struct extent_state **cached_ptr) -{ - if (cached_ptr && (*cached_ptr)) { - struct extent_state *state = *cached_ptr; - *cached_ptr = NULL; - free_extent_state(state); - } -} - /* * set some bits on a range in the tree. This may require allocations or * sleeping, so the gfp mask is used to indicate what is allowed. @@ -1687,31 +1671,21 @@ out_failed: return found; } -int extent_clear_unlock_delalloc(struct inode *inode, - struct extent_io_tree *tree, - u64 start, u64 end, struct page *locked_page, - unsigned long op) +int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, + struct page *locked_page, + unsigned long clear_bits, + unsigned long page_ops) { + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; int ret; struct page *pages[16]; unsigned long index = start >> PAGE_CACHE_SHIFT; unsigned long end_index = end >> PAGE_CACHE_SHIFT; unsigned long nr_pages = end_index - index + 1; int i; - unsigned long clear_bits = 0; - - if (op & EXTENT_CLEAR_UNLOCK) - clear_bits |= EXTENT_LOCKED; - if (op & EXTENT_CLEAR_DIRTY) - clear_bits |= EXTENT_DIRTY; - - if (op & EXTENT_CLEAR_DELALLOC) - clear_bits |= EXTENT_DELALLOC; clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); - if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | - EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | - EXTENT_SET_PRIVATE2))) + if (page_ops == 0) return 0; while (nr_pages > 0) { @@ -1720,20 +1694,20 @@ int extent_clear_unlock_delalloc(struct inode *inode, nr_pages, ARRAY_SIZE(pages)), pages); for (i = 0; i < ret; i++) { - if (op & EXTENT_SET_PRIVATE2) + if (page_ops & PAGE_SET_PRIVATE2) SetPagePrivate2(pages[i]); if (pages[i] == locked_page) { page_cache_release(pages[i]); continue; } - if (op & EXTENT_CLEAR_DIRTY) + if (page_ops & PAGE_CLEAR_DIRTY) clear_page_dirty_for_io(pages[i]); - if (op & EXTENT_SET_WRITEBACK) + if (page_ops & PAGE_SET_WRITEBACK) set_page_writeback(pages[i]); - if (op & EXTENT_END_WRITEBACK) + if (page_ops & PAGE_END_WRITEBACK) end_page_writeback(pages[i]); - if (op & EXTENT_CLEAR_UNLOCK_PAGE) + if (page_ops & PAGE_UNLOCK) unlock_page(pages[i]); page_cache_release(pages[i]); } @@ -1810,7 +1784,7 @@ out: * set the private field for a given byte offset in the tree. If there isn't * an extent_state there already, this does nothing. */ -int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) +static int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) { struct rb_node *node; struct extent_state *state; @@ -1837,64 +1811,6 @@ out: return ret; } -void extent_cache_csums_dio(struct extent_io_tree *tree, u64 start, u32 csums[], - int count) -{ - struct rb_node *node; - struct extent_state *state; - - spin_lock(&tree->lock); - /* - * this search will find all the extents that end after - * our range starts. - */ - node = tree_search(tree, start); - BUG_ON(!node); - - state = rb_entry(node, struct extent_state, rb_node); - BUG_ON(state->start != start); - - while (count) { - state->private = *csums++; - count--; - state = next_state(state); - } - spin_unlock(&tree->lock); -} - -static inline u64 __btrfs_get_bio_offset(struct bio *bio, int bio_index) -{ - struct bio_vec *bvec = bio->bi_io_vec + bio_index; - - return page_offset(bvec->bv_page) + bvec->bv_offset; -} - -void extent_cache_csums(struct extent_io_tree *tree, struct bio *bio, int bio_index, - u32 csums[], int count) -{ - struct rb_node *node; - struct extent_state *state = NULL; - u64 start; - - spin_lock(&tree->lock); - do { - start = __btrfs_get_bio_offset(bio, bio_index); - if (state == NULL || state->start != start) { - node = tree_search(tree, start); - BUG_ON(!node); - - state = rb_entry(node, struct extent_state, rb_node); - BUG_ON(state->start != start); - } - state->private = *csums++; - count--; - bio_index++; - - state = next_state(state); - } while (count); - spin_unlock(&tree->lock); -} - int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) { struct rb_node *node; @@ -2173,7 +2089,8 @@ static int clean_io_failure(u64 start, struct page *page) EXTENT_LOCKED); spin_unlock(&BTRFS_I(inode)->io_tree.lock); - if (state && state->start == failrec->start) { + if (state && state->start <= failrec->start && + state->end >= failrec->start + failrec->len - 1) { fs_info = BTRFS_I(inode)->root->fs_info; num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); @@ -2201,9 +2118,9 @@ out: * needed */ -static int bio_readpage_error(struct bio *failed_bio, struct page *page, - u64 start, u64 end, int failed_mirror, - struct extent_state *state) +static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, + int failed_mirror) { struct io_failure_record *failrec = NULL; u64 private; @@ -2213,6 +2130,8 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct bio *bio; + struct btrfs_io_bio *btrfs_failed_bio; + struct btrfs_io_bio *btrfs_bio; int num_copies; int ret; int read_mode; @@ -2296,23 +2215,12 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, * all the retry and error correction code that follows. no * matter what the error is, it is very likely to persist. */ - pr_debug("bio_readpage_error: cannot repair, num_copies == 1. " - "state=%p, num_copies=%d, next_mirror %d, " - "failed_mirror %d\n", state, num_copies, - failrec->this_mirror, failed_mirror); + pr_debug("bio_readpage_error: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n", + num_copies, failrec->this_mirror, failed_mirror); free_io_failure(inode, failrec, 0); return -EIO; } - if (!state) { - spin_lock(&tree->lock); - state = find_first_extent_bit_state(tree, failrec->start, - EXTENT_LOCKED); - if (state && state->start != failrec->start) - state = NULL; - spin_unlock(&tree->lock); - } - /* * there are two premises: * a) deliver good data to the caller @@ -2349,9 +2257,8 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, read_mode = READ_SYNC; } - if (!state || failrec->this_mirror > num_copies) { - pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, " - "next_mirror %d, failed_mirror %d\n", state, + if (failrec->this_mirror > num_copies) { + pr_debug("bio_readpage_error: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n", num_copies, failrec->this_mirror, failed_mirror); free_io_failure(inode, failrec, 0); return -EIO; @@ -2362,12 +2269,24 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, free_io_failure(inode, failrec, 0); return -EIO; } - bio->bi_private = state; bio->bi_end_io = failed_bio->bi_end_io; bio->bi_sector = failrec->logical >> 9; bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; bio->bi_size = 0; + btrfs_failed_bio = btrfs_io_bio(failed_bio); + if (btrfs_failed_bio->csum) { + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + + btrfs_bio = btrfs_io_bio(bio); + btrfs_bio->csum = btrfs_bio->csum_inline; + phy_offset >>= inode->i_sb->s_blocksize_bits; + phy_offset *= csum_size; + memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + phy_offset, + csum_size); + } + bio_add_page(bio, page, failrec->len, start - page_offset(page)); pr_debug("bio_readpage_error: submitting new read[%#x] to " @@ -2450,6 +2369,18 @@ static void end_bio_extent_writepage(struct bio *bio, int err) bio_put(bio); } +static void +endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len, + int uptodate) +{ + struct extent_state *cached = NULL; + u64 end = start + len - 1; + + if (uptodate && tree->track_uptodate) + set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC); + unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); +} + /* * after a readpage IO is done, we need to: * clear the uptodate bits on error @@ -2466,9 +2397,14 @@ static void end_bio_extent_readpage(struct bio *bio, int err) int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; struct bio_vec *bvec = bio->bi_io_vec; + struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct extent_io_tree *tree; + u64 offset = 0; u64 start; u64 end; + u64 len; + u64 extent_start = 0; + u64 extent_len = 0; int mirror; int ret; @@ -2477,9 +2413,6 @@ static void end_bio_extent_readpage(struct bio *bio, int err) do { struct page *page = bvec->bv_page; - struct extent_state *cached = NULL; - struct extent_state *state; - struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct inode *inode = page->mapping->host; pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " @@ -2500,37 +2433,32 @@ static void end_bio_extent_readpage(struct bio *bio, int err) start = page_offset(page); end = start + bvec->bv_offset + bvec->bv_len - 1; + len = bvec->bv_len; if (++bvec <= bvec_end) prefetchw(&bvec->bv_page->flags); - spin_lock(&tree->lock); - state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED); - if (state && state->start == start) { - /* - * take a reference on the state, unlock will drop - * the ref - */ - cache_state(state, &cached); - } - spin_unlock(&tree->lock); - mirror = io_bio->mirror_num; - if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { - ret = tree->ops->readpage_end_io_hook(page, start, end, - state, mirror); + if (likely(uptodate && tree->ops && + tree->ops->readpage_end_io_hook)) { + ret = tree->ops->readpage_end_io_hook(io_bio, offset, + page, start, end, + mirror); if (ret) uptodate = 0; else clean_io_failure(start, page); } - if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { + if (likely(uptodate)) + goto readpage_ok; + + if (tree->ops && tree->ops->readpage_io_failed_hook) { ret = tree->ops->readpage_io_failed_hook(page, mirror); if (!ret && !err && test_bit(BIO_UPTODATE, &bio->bi_flags)) uptodate = 1; - } else if (!uptodate) { + } else { /* * The generic bio_readpage_error handles errors the * following way: If possible, new read requests are @@ -2541,24 +2469,18 @@ static void end_bio_extent_readpage(struct bio *bio, int err) * can't handle the error it will return -EIO and we * remain responsible for that page. */ - ret = bio_readpage_error(bio, page, start, end, mirror, NULL); + ret = bio_readpage_error(bio, offset, page, start, end, + mirror); if (ret == 0) { uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); if (err) uptodate = 0; - uncache_state(&cached); continue; } } - - if (uptodate && tree->track_uptodate) { - set_extent_uptodate(tree, start, end, &cached, - GFP_ATOMIC); - } - unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); - - if (uptodate) { +readpage_ok: + if (likely(uptodate)) { loff_t i_size = i_size_read(inode); pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; unsigned offset; @@ -2573,8 +2495,36 @@ static void end_bio_extent_readpage(struct bio *bio, int err) SetPageError(page); } unlock_page(page); + offset += len; + + if (unlikely(!uptodate)) { + if (extent_len) { + endio_readpage_release_extent(tree, + extent_start, + extent_len, 1); + extent_start = 0; + extent_len = 0; + } + endio_readpage_release_extent(tree, start, + end - start + 1, 0); + } else if (!extent_len) { + extent_start = start; + extent_len = end + 1 - start; + } else if (extent_start + extent_len == start) { + extent_len += end + 1 - start; + } else { + endio_readpage_release_extent(tree, extent_start, + extent_len, uptodate); + extent_start = start; + extent_len = end + 1 - start; + } } while (bvec <= bvec_end); + if (extent_len) + endio_readpage_release_extent(tree, extent_start, extent_len, + uptodate); + if (io_bio->end_io) + io_bio->end_io(io_bio, err); bio_put(bio); } @@ -2586,6 +2536,7 @@ struct bio * btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, gfp_t gfp_flags) { + struct btrfs_io_bio *btrfs_bio; struct bio *bio; bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset); @@ -2601,6 +2552,10 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, bio->bi_size = 0; bio->bi_bdev = bdev; bio->bi_sector = first_sector; + btrfs_bio = btrfs_io_bio(bio); + btrfs_bio->csum = NULL; + btrfs_bio->csum_allocated = NULL; + btrfs_bio->end_io = NULL; } return bio; } @@ -2614,7 +2569,17 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) /* this also allocates from the btrfs_bioset */ struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) { - return bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset); + struct btrfs_io_bio *btrfs_bio; + struct bio *bio; + + bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset); + if (bio) { + btrfs_bio = btrfs_io_bio(bio); + btrfs_bio->csum = NULL; + btrfs_bio->csum_allocated = NULL; + btrfs_bio->end_io = NULL; + } + return bio; } @@ -2738,17 +2703,45 @@ void set_page_extent_mapped(struct page *page) } } +static struct extent_map * +__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, + u64 start, u64 len, get_extent_t *get_extent, + struct extent_map **em_cached) +{ + struct extent_map *em; + + if (em_cached && *em_cached) { + em = *em_cached; + if (em->in_tree && start >= em->start && + start < extent_map_end(em)) { + atomic_inc(&em->refs); + return em; + } + + free_extent_map(em); + *em_cached = NULL; + } + + em = get_extent(inode, page, pg_offset, start, len, 0); + if (em_cached && !IS_ERR_OR_NULL(em)) { + BUG_ON(*em_cached); + atomic_inc(&em->refs); + *em_cached = em; + } + return em; +} /* * basic readpage implementation. Locked extent state structs are inserted * into the tree that are removed when the IO is done (by the end_io * handlers) * XXX JDM: This needs looking at to ensure proper page locking */ -static int __extent_read_full_page(struct extent_io_tree *tree, - struct page *page, - get_extent_t *get_extent, - struct bio **bio, int mirror_num, - unsigned long *bio_flags, int rw) +static int __do_readpage(struct extent_io_tree *tree, + struct page *page, + get_extent_t *get_extent, + struct extent_map **em_cached, + struct bio **bio, int mirror_num, + unsigned long *bio_flags, int rw) { struct inode *inode = page->mapping->host; u64 start = page_offset(page); @@ -2762,35 +2755,26 @@ static int __extent_read_full_page(struct extent_io_tree *tree, sector_t sector; struct extent_map *em; struct block_device *bdev; - struct btrfs_ordered_extent *ordered; int ret; int nr = 0; + int parent_locked = *bio_flags & EXTENT_BIO_PARENT_LOCKED; size_t pg_offset = 0; size_t iosize; size_t disk_io_size; size_t blocksize = inode->i_sb->s_blocksize; - unsigned long this_bio_flag = 0; + unsigned long this_bio_flag = *bio_flags & EXTENT_BIO_PARENT_LOCKED; set_page_extent_mapped(page); + end = page_end; if (!PageUptodate(page)) { if (cleancache_get_page(page) == 0) { BUG_ON(blocksize != PAGE_SIZE); + unlock_extent(tree, start, end); goto out; } } - end = page_end; - while (1) { - lock_extent(tree, start, end); - ordered = btrfs_lookup_ordered_extent(inode, start); - if (!ordered) - break; - unlock_extent(tree, start, end); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - } - if (page->index == last_byte >> PAGE_CACHE_SHIFT) { char *userpage; size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); @@ -2817,15 +2801,18 @@ static int __extent_read_full_page(struct extent_io_tree *tree, kunmap_atomic(userpage); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); - unlock_extent_cached(tree, cur, cur + iosize - 1, - &cached, GFP_NOFS); + if (!parent_locked) + unlock_extent_cached(tree, cur, + cur + iosize - 1, + &cached, GFP_NOFS); break; } - em = get_extent(inode, page, pg_offset, cur, - end - cur + 1, 0); + em = __get_extent_map(inode, page, pg_offset, cur, + end - cur + 1, get_extent, em_cached); if (IS_ERR_OR_NULL(em)) { SetPageError(page); - unlock_extent(tree, cur, end); + if (!parent_locked) + unlock_extent(tree, cur, end); break; } extent_offset = cur - em->start; @@ -2833,7 +2820,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, BUG_ON(end < cur); if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - this_bio_flag = EXTENT_BIO_COMPRESSED; + this_bio_flag |= EXTENT_BIO_COMPRESSED; extent_set_compress_type(&this_bio_flag, em->compress_type); } @@ -2877,7 +2864,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree, if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1, NULL)) { check_page_uptodate(tree, page); - unlock_extent(tree, cur, cur + iosize - 1); + if (!parent_locked) + unlock_extent(tree, cur, cur + iosize - 1); cur = cur + iosize; pg_offset += iosize; continue; @@ -2887,7 +2875,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree, */ if (block_start == EXTENT_MAP_INLINE) { SetPageError(page); - unlock_extent(tree, cur, cur + iosize - 1); + if (!parent_locked) + unlock_extent(tree, cur, cur + iosize - 1); cur = cur + iosize; pg_offset += iosize; continue; @@ -2905,7 +2894,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree, *bio_flags = this_bio_flag; } else { SetPageError(page); - unlock_extent(tree, cur, cur + iosize - 1); + if (!parent_locked) + unlock_extent(tree, cur, cur + iosize - 1); } cur = cur + iosize; pg_offset += iosize; @@ -2919,6 +2909,104 @@ out: return 0; } +static inline void __do_contiguous_readpages(struct extent_io_tree *tree, + struct page *pages[], int nr_pages, + u64 start, u64 end, + get_extent_t *get_extent, + struct extent_map **em_cached, + struct bio **bio, int mirror_num, + unsigned long *bio_flags, int rw) +{ + struct inode *inode; + struct btrfs_ordered_extent *ordered; + int index; + + inode = pages[0]->mapping->host; + while (1) { + lock_extent(tree, start, end); + ordered = btrfs_lookup_ordered_range(inode, start, + end - start + 1); + if (!ordered) + break; + unlock_extent(tree, start, end); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } + + for (index = 0; index < nr_pages; index++) { + __do_readpage(tree, pages[index], get_extent, em_cached, bio, + mirror_num, bio_flags, rw); + page_cache_release(pages[index]); + } +} + +static void __extent_readpages(struct extent_io_tree *tree, + struct page *pages[], + int nr_pages, get_extent_t *get_extent, + struct extent_map **em_cached, + struct bio **bio, int mirror_num, + unsigned long *bio_flags, int rw) +{ + u64 start = 0; + u64 end = 0; + u64 page_start; + int index; + int first_index = 0; + + for (index = 0; index < nr_pages; index++) { + page_start = page_offset(pages[index]); + if (!end) { + start = page_start; + end = start + PAGE_CACHE_SIZE - 1; + first_index = index; + } else if (end + 1 == page_start) { + end += PAGE_CACHE_SIZE; + } else { + __do_contiguous_readpages(tree, &pages[first_index], + index - first_index, start, + end, get_extent, em_cached, + bio, mirror_num, bio_flags, + rw); + start = page_start; + end = start + PAGE_CACHE_SIZE - 1; + first_index = index; + } + } + + if (end) + __do_contiguous_readpages(tree, &pages[first_index], + index - first_index, start, + end, get_extent, em_cached, bio, + mirror_num, bio_flags, rw); +} + +static int __extent_read_full_page(struct extent_io_tree *tree, + struct page *page, + get_extent_t *get_extent, + struct bio **bio, int mirror_num, + unsigned long *bio_flags, int rw) +{ + struct inode *inode = page->mapping->host; + struct btrfs_ordered_extent *ordered; + u64 start = page_offset(page); + u64 end = start + PAGE_CACHE_SIZE - 1; + int ret; + + while (1) { + lock_extent(tree, start, end); + ordered = btrfs_lookup_ordered_extent(inode, start); + if (!ordered) + break; + unlock_extent(tree, start, end); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } + + ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, + bio_flags, rw); + return ret; +} + int extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, int mirror_num) { @@ -2933,6 +3021,20 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, return ret; } +int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent, int mirror_num) +{ + struct bio *bio = NULL; + unsigned long bio_flags = EXTENT_BIO_PARENT_LOCKED; + int ret; + + ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num, + &bio_flags, READ); + if (bio) + ret = submit_one_bio(READ, bio, mirror_num, bio_flags); + return ret; +} + static noinline void update_nr_written(struct page *page, struct writeback_control *wbc, unsigned long nr_written) @@ -3189,8 +3291,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, if (!PageWriteback(page)) { printk(KERN_ERR "btrfs warning page %lu not " "writeback, cur %llu end %llu\n", - page->index, (unsigned long long)cur, - (unsigned long long)end); + page->index, cur, end); } ret = submit_extent_page(write_flags, tree, page, @@ -3769,7 +3870,7 @@ int extent_readpages(struct extent_io_tree *tree, unsigned long bio_flags = 0; struct page *pagepool[16]; struct page *page; - int i = 0; + struct extent_map *em_cached = NULL; int nr = 0; for (page_idx = 0; page_idx < nr_pages; page_idx++) { @@ -3786,18 +3887,16 @@ int extent_readpages(struct extent_io_tree *tree, pagepool[nr++] = page; if (nr < ARRAY_SIZE(pagepool)) continue; - for (i = 0; i < nr; i++) { - __extent_read_full_page(tree, pagepool[i], get_extent, - &bio, 0, &bio_flags, READ); - page_cache_release(pagepool[i]); - } + __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, + &bio, 0, &bio_flags, READ); nr = 0; } - for (i = 0; i < nr; i++) { - __extent_read_full_page(tree, pagepool[i], get_extent, - &bio, 0, &bio_flags, READ); - page_cache_release(pagepool[i]); - } + if (nr) + __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, + &bio, 0, &bio_flags, READ); + + if (em_cached) + free_extent_map(em_cached); BUG_ON(!list_empty(pages)); if (bio) @@ -4136,6 +4235,76 @@ static void __free_extent_buffer(struct extent_buffer *eb) kmem_cache_free(extent_buffer_cache, eb); } +static int extent_buffer_under_io(struct extent_buffer *eb) +{ + return (atomic_read(&eb->io_pages) || + test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || + test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); +} + +/* + * Helper for releasing extent buffer page. + */ +static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, + unsigned long start_idx) +{ + unsigned long index; + unsigned long num_pages; + struct page *page; + int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); + + BUG_ON(extent_buffer_under_io(eb)); + + num_pages = num_extent_pages(eb->start, eb->len); + index = start_idx + num_pages; + if (start_idx >= index) + return; + + do { + index--; + page = extent_buffer_page(eb, index); + if (page && mapped) { + spin_lock(&page->mapping->private_lock); + /* + * We do this since we'll remove the pages after we've + * removed the eb from the radix tree, so we could race + * and have this page now attached to the new eb. So + * only clear page_private if it's still connected to + * this eb. + */ + if (PagePrivate(page) && + page->private == (unsigned long)eb) { + BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + BUG_ON(PageDirty(page)); + BUG_ON(PageWriteback(page)); + /* + * We need to make sure we haven't be attached + * to a new eb. + */ + ClearPagePrivate(page); + set_page_private(page, 0); + /* One for the page private */ + page_cache_release(page); + } + spin_unlock(&page->mapping->private_lock); + + } + if (page) { + /* One for when we alloced the page */ + page_cache_release(page); + } + } while (index != start_idx); +} + +/* + * Helper for releasing the extent buffer. + */ +static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) +{ + btrfs_release_extent_buffer_page(eb, 0); + __free_extent_buffer(eb); +} + static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len, @@ -4184,13 +4353,16 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) struct extent_buffer *new; unsigned long num_pages = num_extent_pages(src->start, src->len); - new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_ATOMIC); + new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_NOFS); if (new == NULL) return NULL; for (i = 0; i < num_pages; i++) { - p = alloc_page(GFP_ATOMIC); - BUG_ON(!p); + p = alloc_page(GFP_NOFS); + if (!p) { + btrfs_release_extent_buffer(new); + return NULL; + } attach_extent_buffer_page(new, p); WARN_ON(PageDirty(p)); SetPageUptodate(p); @@ -4210,12 +4382,12 @@ struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len) unsigned long num_pages = num_extent_pages(0, len); unsigned long i; - eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC); + eb = __alloc_extent_buffer(NULL, start, len, GFP_NOFS); if (!eb) return NULL; for (i = 0; i < num_pages; i++) { - eb->pages[i] = alloc_page(GFP_ATOMIC); + eb->pages[i] = alloc_page(GFP_NOFS); if (!eb->pages[i]) goto err; } @@ -4231,76 +4403,6 @@ err: return NULL; } -static int extent_buffer_under_io(struct extent_buffer *eb) -{ - return (atomic_read(&eb->io_pages) || - test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || - test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); -} - -/* - * Helper for releasing extent buffer page. - */ -static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, - unsigned long start_idx) -{ - unsigned long index; - unsigned long num_pages; - struct page *page; - int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); - - BUG_ON(extent_buffer_under_io(eb)); - - num_pages = num_extent_pages(eb->start, eb->len); - index = start_idx + num_pages; - if (start_idx >= index) - return; - - do { - index--; - page = extent_buffer_page(eb, index); - if (page && mapped) { - spin_lock(&page->mapping->private_lock); - /* - * We do this since we'll remove the pages after we've - * removed the eb from the radix tree, so we could race - * and have this page now attached to the new eb. So - * only clear page_private if it's still connected to - * this eb. - */ - if (PagePrivate(page) && - page->private == (unsigned long)eb) { - BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); - BUG_ON(PageDirty(page)); - BUG_ON(PageWriteback(page)); - /* - * We need to make sure we haven't be attached - * to a new eb. - */ - ClearPagePrivate(page); - set_page_private(page, 0); - /* One for the page private */ - page_cache_release(page); - } - spin_unlock(&page->mapping->private_lock); - - } - if (page) { - /* One for when we alloced the page */ - page_cache_release(page); - } - } while (index != start_idx); -} - -/* - * Helper for releasing the extent buffer. - */ -static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) -{ - btrfs_release_extent_buffer_page(eb, 0); - __free_extent_buffer(eb); -} - static void check_buffer_tree_ref(struct extent_buffer *eb) { int refs; @@ -4771,7 +4873,7 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); while (len > 0) { page = extent_buffer_page(eb, i); @@ -4813,8 +4915,8 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, if (start + min_len > eb->len) { WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, " - "wanted %lu %lu\n", (unsigned long long)eb->start, - eb->len, start, min_len); + "wanted %lu %lu\n", + eb->start, eb->len, start, min_len); return -EINVAL; } @@ -4841,7 +4943,7 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); while (len > 0) { page = extent_buffer_page(eb, i); @@ -4875,7 +4977,7 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv, WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); while (len > 0) { page = extent_buffer_page(eb, i); @@ -4905,7 +5007,7 @@ void memset_extent_buffer(struct extent_buffer *eb, char c, WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); while (len > 0) { page = extent_buffer_page(eb, i); @@ -4936,7 +5038,7 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, WARN_ON(src->len != dst_len); offset = (start_offset + dst_offset) & - ((unsigned long)PAGE_CACHE_SIZE - 1); + (PAGE_CACHE_SIZE - 1); while (len > 0) { page = extent_buffer_page(dst, i); @@ -5022,9 +5124,9 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, while (len > 0) { dst_off_in_page = (start_offset + dst_offset) & - ((unsigned long)PAGE_CACHE_SIZE - 1); + (PAGE_CACHE_SIZE - 1); src_off_in_page = (start_offset + src_offset) & - ((unsigned long)PAGE_CACHE_SIZE - 1); + (PAGE_CACHE_SIZE - 1); dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; @@ -5075,9 +5177,9 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; dst_off_in_page = (start_offset + dst_end) & - ((unsigned long)PAGE_CACHE_SIZE - 1); + (PAGE_CACHE_SIZE - 1); src_off_in_page = (start_offset + src_end) & - ((unsigned long)PAGE_CACHE_SIZE - 1); + (PAGE_CACHE_SIZE - 1); cur = min_t(unsigned long, len, src_off_in_page + 1); cur = min(cur, dst_off_in_page + 1); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 3b8c4e26e1da..6dbc645f1f3d 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -29,6 +29,7 @@ */ #define EXTENT_BIO_COMPRESSED 1 #define EXTENT_BIO_TREE_LOG 2 +#define EXTENT_BIO_PARENT_LOCKED 4 #define EXTENT_BIO_FLAG_SHIFT 16 /* these are bit numbers for test/set bit */ @@ -44,14 +45,11 @@ #define EXTENT_BUFFER_DUMMY 9 /* these are flags for extent_clear_unlock_delalloc */ -#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 -#define EXTENT_CLEAR_UNLOCK 0x2 -#define EXTENT_CLEAR_DELALLOC 0x4 -#define EXTENT_CLEAR_DIRTY 0x8 -#define EXTENT_SET_WRITEBACK 0x10 -#define EXTENT_END_WRITEBACK 0x20 -#define EXTENT_SET_PRIVATE2 0x40 -#define EXTENT_CLEAR_ACCOUNTING 0x80 +#define PAGE_UNLOCK (1 << 0) +#define PAGE_CLEAR_DIRTY (1 << 1) +#define PAGE_SET_WRITEBACK (1 << 2) +#define PAGE_END_WRITEBACK (1 << 3) +#define PAGE_SET_PRIVATE2 (1 << 4) /* * page->private values. Every page that is controlled by the extent @@ -62,6 +60,7 @@ struct extent_state; struct btrfs_root; +struct btrfs_io_bio; typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, struct bio *bio, int mirror_num, @@ -77,8 +76,9 @@ struct extent_io_ops { size_t size, struct bio *bio, unsigned long bio_flags); int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); - int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, - struct extent_state *state, int mirror); + int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, + int mirror); int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate); void (*set_bit_hook)(struct inode *inode, struct extent_state *state, @@ -200,6 +200,8 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, int mirror_num); +int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent, int mirror_num); int __init extent_io_init(void); void extent_io_exit(void); @@ -261,11 +263,6 @@ int extent_readpages(struct extent_io_tree *tree, get_extent_t get_extent); int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent); -int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); -void extent_cache_csums_dio(struct extent_io_tree *tree, u64 start, u32 csums[], - int count); -void extent_cache_csums(struct extent_io_tree *tree, struct bio *bio, - int bvec_index, u32 csums[], int count); int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); void set_page_extent_mapped(struct page *page); @@ -330,10 +327,10 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, unsigned long *map_len); int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); -int extent_clear_unlock_delalloc(struct inode *inode, - struct extent_io_tree *tree, - u64 start, u64 end, struct page *locked_page, - unsigned long op); +int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, + struct page *locked_page, + unsigned long bits_to_clear, + unsigned long page_ops); struct bio * btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, gfp_t gfp_flags); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index a7bfc9541803..4f53159bdb9d 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -23,6 +23,7 @@ #include "ctree.h" #include "disk-io.h" #include "transaction.h" +#include "volumes.h" #include "print-tree.h" #define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ @@ -152,28 +153,54 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, return ret; } +static void btrfs_io_bio_endio_readpage(struct btrfs_io_bio *bio, int err) +{ + kfree(bio->csum_allocated); +} + static int __btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 logical_offset, u32 *dst, int dio) { - u32 sum[16]; - int len; struct bio_vec *bvec = bio->bi_io_vec; - int bio_index = 0; + struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio); + struct btrfs_csum_item *item = NULL; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_path *path; + u8 *csum; u64 offset = 0; u64 item_start_offset = 0; u64 item_last_offset = 0; u64 disk_bytenr; u32 diff; - u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + int nblocks; + int bio_index = 0; int count; - struct btrfs_path *path; - struct btrfs_csum_item *item = NULL; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); path = btrfs_alloc_path(); if (!path) return -ENOMEM; + + nblocks = bio->bi_size >> inode->i_sb->s_blocksize_bits; + if (!dst) { + if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { + btrfs_bio->csum_allocated = kmalloc(nblocks * csum_size, + GFP_NOFS); + if (!btrfs_bio->csum_allocated) { + btrfs_free_path(path); + return -ENOMEM; + } + btrfs_bio->csum = btrfs_bio->csum_allocated; + btrfs_bio->end_io = btrfs_io_bio_endio_readpage; + } else { + btrfs_bio->csum = btrfs_bio->csum_inline; + } + csum = btrfs_bio->csum; + } else { + csum = (u8 *)dst; + } + if (bio->bi_size > PAGE_CACHE_SIZE * 8) path->reada = 2; @@ -194,11 +221,10 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, if (dio) offset = logical_offset; while (bio_index < bio->bi_vcnt) { - len = min_t(int, ARRAY_SIZE(sum), bio->bi_vcnt - bio_index); if (!dio) offset = page_offset(bvec->bv_page) + bvec->bv_offset; - count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, sum, - len); + count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, + (u32 *)csum, nblocks); if (count) goto found; @@ -213,7 +239,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, path, disk_bytenr, 0); if (IS_ERR(item)) { count = 1; - sum[0] = 0; + memset(csum, 0, csum_size); if (BTRFS_I(inode)->root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { set_extent_bits(io_tree, offset, @@ -222,9 +248,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, } else { printk(KERN_INFO "btrfs no csum found " "for inode %llu start %llu\n", - (unsigned long long) - btrfs_ino(inode), - (unsigned long long)offset); + btrfs_ino(inode), offset); } item = NULL; btrfs_release_path(path); @@ -249,23 +273,14 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, diff = disk_bytenr - item_start_offset; diff = diff / root->sectorsize; diff = diff * csum_size; - count = min_t(int, len, (item_last_offset - disk_bytenr) >> - inode->i_sb->s_blocksize_bits); - read_extent_buffer(path->nodes[0], sum, + count = min_t(int, nblocks, (item_last_offset - disk_bytenr) >> + inode->i_sb->s_blocksize_bits); + read_extent_buffer(path->nodes[0], csum, ((unsigned long)item) + diff, csum_size * count); found: - if (dst) { - memcpy(dst, sum, count * csum_size); - dst += count; - } else { - if (dio) - extent_cache_csums_dio(io_tree, offset, sum, - count); - else - extent_cache_csums(io_tree, bio, bio_index, sum, - count); - } + csum += count * csum_size; + nblocks -= count; while (count--) { disk_bytenr += bvec->bv_len; offset += bvec->bv_len; @@ -284,9 +299,19 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, } int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, - struct bio *bio, u64 offset) + struct btrfs_dio_private *dip, struct bio *bio, + u64 offset) { - return __btrfs_lookup_bio_sums(root, inode, bio, offset, NULL, 1); + int len = (bio->bi_sector << 9) - dip->disk_bytenr; + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + int ret; + + len >>= inode->i_sb->s_blocksize_bits; + len *= csum_size; + + ret = __btrfs_lookup_bio_sums(root, inode, bio, offset, + (u32 *)(dip->csum + len), 1); + return ret; } int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4d2eb6417145..bc5072b2db53 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1334,7 +1334,6 @@ fail: static noinline int check_can_nocow(struct inode *inode, loff_t pos, size_t *write_bytes) { - struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ordered_extent *ordered; u64 lockstart, lockend; @@ -1356,16 +1355,8 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, btrfs_put_ordered_extent(ordered); } - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); - return PTR_ERR(trans); - } - num_bytes = lockend - lockstart + 1; - ret = can_nocow_extent(trans, inode, lockstart, &num_bytes, NULL, NULL, - NULL); - btrfs_end_transaction(trans, root); + ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); if (ret <= 0) { ret = 0; } else { diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index b21a3cd667d8..3f0ddfce96e6 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -221,12 +221,10 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct btrfs_path *path, struct inode *inode) { - loff_t oldsize; int ret = 0; - oldsize = i_size_read(inode); btrfs_i_size_write(inode, 0); - truncate_pagecache(inode, oldsize, 0); + truncate_pagecache(inode, 0); /* * We don't need an orphan item because truncating the free space cache @@ -308,7 +306,7 @@ static void io_ctl_unmap_page(struct io_ctl *io_ctl) static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) { - BUG_ON(io_ctl->index >= io_ctl->num_pages); + ASSERT(io_ctl->index < io_ctl->num_pages); io_ctl->page = io_ctl->pages[io_ctl->index++]; io_ctl->cur = kmap(io_ctl->page); io_ctl->orig = io_ctl->cur; @@ -673,8 +671,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, btrfs_err(root->fs_info, "free space inode generation (%llu) " "did not match free space cache generation (%llu)", - (unsigned long long)BTRFS_I(inode)->generation, - (unsigned long long)generation); + BTRFS_I(inode)->generation, generation); return 0; } @@ -729,7 +726,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, goto free_cache; } } else { - BUG_ON(!num_bitmaps); + ASSERT(num_bitmaps); num_bitmaps--; e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); if (!e->bitmap) { @@ -1029,7 +1026,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, leaf = path->nodes[0]; if (ret > 0) { struct btrfs_key found_key; - BUG_ON(!path->slots[0]); + ASSERT(path->slots[0]); path->slots[0]--; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || @@ -1117,7 +1114,7 @@ int btrfs_write_out_cache(struct btrfs_root *root, static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit, u64 offset) { - BUG_ON(offset < bitmap_start); + ASSERT(offset >= bitmap_start); offset -= bitmap_start; return (unsigned long)(div_u64(offset, unit)); } @@ -1272,7 +1269,7 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, if (n) { entry = rb_entry(n, struct btrfs_free_space, offset_index); - BUG_ON(entry->offset > offset); + ASSERT(entry->offset <= offset); } else { if (fuzzy) return entry; @@ -1336,7 +1333,7 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, { int ret = 0; - BUG_ON(!info->bitmap && !info->bytes); + ASSERT(info->bytes || info->bitmap); ret = tree_insert_offset(&ctl->free_space_offset, info->offset, &info->offset_index, (info->bitmap != NULL)); if (ret) @@ -1359,7 +1356,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) max_bitmaps = max(max_bitmaps, 1); - BUG_ON(ctl->total_bitmaps > max_bitmaps); + ASSERT(ctl->total_bitmaps <= max_bitmaps); /* * The goal is to keep the total amount of memory used per 1gb of space @@ -1403,7 +1400,7 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, start = offset_to_bit(info->offset, ctl->unit, offset); count = bytes_to_bits(bytes, ctl->unit); - BUG_ON(start + count > BITS_PER_BITMAP); + ASSERT(start + count <= BITS_PER_BITMAP); bitmap_clear(info->bitmap, start, count); @@ -1426,7 +1423,7 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, start = offset_to_bit(info->offset, ctl->unit, offset); count = bytes_to_bits(bytes, ctl->unit); - BUG_ON(start + count > BITS_PER_BITMAP); + ASSERT(start + count <= BITS_PER_BITMAP); bitmap_set(info->bitmap, start, count); @@ -1742,7 +1739,7 @@ no_cluster_bitmap: bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!bitmap_info) { - BUG_ON(added); + ASSERT(added == 0); goto new_bitmap; } @@ -1882,7 +1879,7 @@ out: if (ret) { printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret); - BUG_ON(ret == -EEXIST); + ASSERT(ret != -EEXIST); } return ret; @@ -1991,8 +1988,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, if (info->bytes >= bytes && !block_group->ro) count++; printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n", - (unsigned long long)info->offset, - (unsigned long long)info->bytes, + info->offset, info->bytes, (info->bitmap) ? "yes" : "no"); } printk(KERN_INFO "block group has cluster?: %s\n", @@ -2371,7 +2367,7 @@ again: rb_erase(&entry->offset_index, &ctl->free_space_offset); ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 1); - BUG_ON(ret); /* -EEXIST; Logic error */ + ASSERT(!ret); /* -EEXIST; Logic error */ trace_btrfs_setup_cluster(block_group, cluster, total_found * ctl->unit, 1); @@ -2464,7 +2460,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 0); total_size += entry->bytes; - BUG_ON(ret); /* -EEXIST; Logic error */ + ASSERT(!ret); /* -EEXIST; Logic error */ } while (node && entry != last); cluster->max_size = max_extent; @@ -2525,8 +2521,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, * returns zero and sets up cluster if things worked out, otherwise * it returns -enospc */ -int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_find_space_cluster(struct btrfs_root *root, struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, u64 empty_size) @@ -2856,7 +2851,7 @@ u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root) ret = search_bitmap(ctl, entry, &offset, &count); /* Logic error; Should be empty if it can't find anything */ - BUG_ON(ret); + ASSERT(!ret); ino = offset; bitmap_clear_bits(ctl, entry, offset, 1); @@ -2973,33 +2968,68 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, } #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -static struct btrfs_block_group_cache *init_test_block_group(void) +/* + * Use this if you need to make a bitmap or extent entry specifically, it + * doesn't do any of the merging that add_free_space does, this acts a lot like + * how the free space cache loading stuff works, so you can get really weird + * configurations. + */ +int test_add_free_space_entry(struct btrfs_block_group_cache *cache, + u64 offset, u64 bytes, bool bitmap) { - struct btrfs_block_group_cache *cache; + struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; + struct btrfs_free_space *info = NULL, *bitmap_info; + void *map = NULL; + u64 bytes_added; + int ret; - cache = kzalloc(sizeof(*cache), GFP_NOFS); - if (!cache) - return NULL; - cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), - GFP_NOFS); - if (!cache->free_space_ctl) { - kfree(cache); - return NULL; +again: + if (!info) { + info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); + if (!info) + return -ENOMEM; } - cache->key.objectid = 0; - cache->key.offset = 1024 * 1024 * 1024; - cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - cache->sectorsize = 4096; + if (!bitmap) { + spin_lock(&ctl->tree_lock); + info->offset = offset; + info->bytes = bytes; + ret = link_free_space(ctl, info); + spin_unlock(&ctl->tree_lock); + if (ret) + kmem_cache_free(btrfs_free_space_cachep, info); + return ret; + } + + if (!map) { + map = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + if (!map) { + kmem_cache_free(btrfs_free_space_cachep, info); + return -ENOMEM; + } + } + + spin_lock(&ctl->tree_lock); + bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), + 1, 0); + if (!bitmap_info) { + info->bitmap = map; + map = NULL; + add_new_bitmap(ctl, info, offset); + bitmap_info = info; + } - spin_lock_init(&cache->lock); - INIT_LIST_HEAD(&cache->list); - INIT_LIST_HEAD(&cache->cluster_list); - INIT_LIST_HEAD(&cache->new_bg_list); + bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); + bytes -= bytes_added; + offset += bytes_added; + spin_unlock(&ctl->tree_lock); - btrfs_init_free_space_ctl(cache); + if (bytes) + goto again; - return cache; + if (map) + kfree(map); + return 0; } /* @@ -3007,8 +3037,8 @@ static struct btrfs_block_group_cache *init_test_block_group(void) * just used to check the absence of space, so if there is free space in the * range at all we will return 1. */ -static int check_exists(struct btrfs_block_group_cache *cache, u64 offset, - u64 bytes) +int test_check_exists(struct btrfs_block_group_cache *cache, + u64 offset, u64 bytes) { struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; struct btrfs_free_space *info; @@ -3085,411 +3115,4 @@ out: spin_unlock(&ctl->tree_lock); return ret; } - -/* - * Use this if you need to make a bitmap or extent entry specifically, it - * doesn't do any of the merging that add_free_space does, this acts a lot like - * how the free space cache loading stuff works, so you can get really weird - * configurations. - */ -static int add_free_space_entry(struct btrfs_block_group_cache *cache, - u64 offset, u64 bytes, bool bitmap) -{ - struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; - struct btrfs_free_space *info = NULL, *bitmap_info; - void *map = NULL; - u64 bytes_added; - int ret; - -again: - if (!info) { - info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); - if (!info) - return -ENOMEM; - } - - if (!bitmap) { - spin_lock(&ctl->tree_lock); - info->offset = offset; - info->bytes = bytes; - ret = link_free_space(ctl, info); - spin_unlock(&ctl->tree_lock); - if (ret) - kmem_cache_free(btrfs_free_space_cachep, info); - return ret; - } - - if (!map) { - map = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); - if (!map) { - kmem_cache_free(btrfs_free_space_cachep, info); - return -ENOMEM; - } - } - - spin_lock(&ctl->tree_lock); - bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), - 1, 0); - if (!bitmap_info) { - info->bitmap = map; - map = NULL; - add_new_bitmap(ctl, info, offset); - bitmap_info = info; - } - - bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); - bytes -= bytes_added; - offset += bytes_added; - spin_unlock(&ctl->tree_lock); - - if (bytes) - goto again; - - if (map) - kfree(map); - return 0; -} - -#define test_msg(fmt, ...) printk(KERN_INFO "btrfs: selftest: " fmt, ##__VA_ARGS__) - -/* - * This test just does basic sanity checking, making sure we can add an exten - * entry and remove space from either end and the middle, and make sure we can - * remove space that covers adjacent extent entries. - */ -static int test_extents(struct btrfs_block_group_cache *cache) -{ - int ret = 0; - - test_msg("Running extent only tests\n"); - - /* First just make sure we can remove an entire entry */ - ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); - if (ret) { - test_msg("Error adding initial extents %d\n", ret); - return ret; - } - - ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); - if (ret) { - test_msg("Error removing extent %d\n", ret); - return ret; - } - - if (check_exists(cache, 0, 4 * 1024 * 1024)) { - test_msg("Full remove left some lingering space\n"); - return -1; - } - - /* Ok edge and middle cases now */ - ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); - if (ret) { - test_msg("Error adding half extent %d\n", ret); - return ret; - } - - ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024); - if (ret) { - test_msg("Error removing tail end %d\n", ret); - return ret; - } - - ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); - if (ret) { - test_msg("Error removing front end %d\n", ret); - return ret; - } - - ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096); - if (ret) { - test_msg("Error removing middle piece %d\n", ret); - return ret; - } - - if (check_exists(cache, 0, 1 * 1024 * 1024)) { - test_msg("Still have space at the front\n"); - return -1; - } - - if (check_exists(cache, 2 * 1024 * 1024, 4096)) { - test_msg("Still have space in the middle\n"); - return -1; - } - - if (check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) { - test_msg("Still have space at the end\n"); - return -1; - } - - /* Cleanup */ - __btrfs_remove_free_space_cache(cache->free_space_ctl); - - return 0; -} - -static int test_bitmaps(struct btrfs_block_group_cache *cache) -{ - u64 next_bitmap_offset; - int ret; - - test_msg("Running bitmap only tests\n"); - - ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); - if (ret) { - test_msg("Couldn't create a bitmap entry %d\n", ret); - return ret; - } - - ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); - if (ret) { - test_msg("Error removing bitmap full range %d\n", ret); - return ret; - } - - if (check_exists(cache, 0, 4 * 1024 * 1024)) { - test_msg("Left some space in bitmap\n"); - return -1; - } - - ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); - if (ret) { - test_msg("Couldn't add to our bitmap entry %d\n", ret); - return ret; - } - - ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024); - if (ret) { - test_msg("Couldn't remove middle chunk %d\n", ret); - return ret; - } - - /* - * The first bitmap we have starts at offset 0 so the next one is just - * at the end of the first bitmap. - */ - next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); - - /* Test a bit straddling two bitmaps */ - ret = add_free_space_entry(cache, next_bitmap_offset - - (2 * 1024 * 1024), 4 * 1024 * 1024, 1); - if (ret) { - test_msg("Couldn't add space that straddles two bitmaps %d\n", - ret); - return ret; - } - - ret = btrfs_remove_free_space(cache, next_bitmap_offset - - (1 * 1024 * 1024), 2 * 1024 * 1024); - if (ret) { - test_msg("Couldn't remove overlapping space %d\n", ret); - return ret; - } - - if (check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024), - 2 * 1024 * 1024)) { - test_msg("Left some space when removing overlapping\n"); - return -1; - } - - __btrfs_remove_free_space_cache(cache->free_space_ctl); - - return 0; -} - -/* This is the high grade jackassery */ -static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) -{ - u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); - int ret; - - test_msg("Running bitmap and extent tests\n"); - - /* - * First let's do something simple, an extent at the same offset as the - * bitmap, but the free space completely in the extent and then - * completely in the bitmap. - */ - ret = add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1); - if (ret) { - test_msg("Couldn't create bitmap entry %d\n", ret); - return ret; - } - - ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); - if (ret) { - test_msg("Couldn't add extent entry %d\n", ret); - return ret; - } - - ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); - if (ret) { - test_msg("Couldn't remove extent entry %d\n", ret); - return ret; - } - - if (check_exists(cache, 0, 1 * 1024 * 1024)) { - test_msg("Left remnants after our remove\n"); - return -1; - } - - /* Now to add back the extent entry and remove from the bitmap */ - ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); - if (ret) { - test_msg("Couldn't re-add extent entry %d\n", ret); - return ret; - } - - ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024); - if (ret) { - test_msg("Couldn't remove from bitmap %d\n", ret); - return ret; - } - - if (check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) { - test_msg("Left remnants in the bitmap\n"); - return -1; - } - - /* - * Ok so a little more evil, extent entry and bitmap at the same offset, - * removing an overlapping chunk. - */ - ret = add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1); - if (ret) { - test_msg("Couldn't add to a bitmap %d\n", ret); - return ret; - } - - ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024); - if (ret) { - test_msg("Couldn't remove overlapping space %d\n", ret); - return ret; - } - - if (check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) { - test_msg("Left over peices after removing overlapping\n"); - return -1; - } - - __btrfs_remove_free_space_cache(cache->free_space_ctl); - - /* Now with the extent entry offset into the bitmap */ - ret = add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1); - if (ret) { - test_msg("Couldn't add space to the bitmap %d\n", ret); - return ret; - } - - ret = add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0); - if (ret) { - test_msg("Couldn't add extent to the cache %d\n", ret); - return ret; - } - - ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024); - if (ret) { - test_msg("Problem removing overlapping space %d\n", ret); - return ret; - } - - if (check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) { - test_msg("Left something behind when removing space"); - return -1; - } - - /* - * This has blown up in the past, the extent entry starts before the - * bitmap entry, but we're trying to remove an offset that falls - * completely within the bitmap range and is in both the extent entry - * and the bitmap entry, looks like this - * - * [ extent ] - * [ bitmap ] - * [ del ] - */ - __btrfs_remove_free_space_cache(cache->free_space_ctl); - ret = add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024, - 4 * 1024 * 1024, 1); - if (ret) { - test_msg("Couldn't add bitmap %d\n", ret); - return ret; - } - - ret = add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024, - 5 * 1024 * 1024, 0); - if (ret) { - test_msg("Couldn't add extent entry %d\n", ret); - return ret; - } - - ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024, - 5 * 1024 * 1024); - if (ret) { - test_msg("Failed to free our space %d\n", ret); - return ret; - } - - if (check_exists(cache, bitmap_offset + 1 * 1024 * 1024, - 5 * 1024 * 1024)) { - test_msg("Left stuff over\n"); - return -1; - } - - __btrfs_remove_free_space_cache(cache->free_space_ctl); - - /* - * This blew up before, we have part of the free space in a bitmap and - * then the entirety of the rest of the space in an extent. This used - * to return -EAGAIN back from btrfs_remove_extent, make sure this - * doesn't happen. - */ - ret = add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1); - if (ret) { - test_msg("Couldn't add bitmap entry %d\n", ret); - return ret; - } - - ret = add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0); - if (ret) { - test_msg("Couldn't add extent entry %d\n", ret); - return ret; - } - - ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024); - if (ret) { - test_msg("Error removing bitmap and extent overlapping %d\n", ret); - return ret; - } - - __btrfs_remove_free_space_cache(cache->free_space_ctl); - return 0; -} - -void btrfs_test_free_space_cache(void) -{ - struct btrfs_block_group_cache *cache; - - test_msg("Running btrfs free space cache tests\n"); - - cache = init_test_block_group(); - if (!cache) { - test_msg("Couldn't run the tests\n"); - return; - } - - if (test_extents(cache)) - goto out; - if (test_bitmaps(cache)) - goto out; - if (test_bitmaps_and_extents(cache)) - goto out; -out: - __btrfs_remove_free_space_cache(cache->free_space_ctl); - kfree(cache->free_space_ctl); - kfree(cache); - test_msg("Free space cache tests finished\n"); -} -#undef test_msg -#else /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */ -void btrfs_test_free_space_cache(void) {} -#endif /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */ +#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */ diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 894116b71304..c74904167476 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -98,8 +98,7 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root); void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, u64 bytes); -int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_find_space_cluster(struct btrfs_root *root, struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, u64 empty_size); @@ -113,6 +112,12 @@ int btrfs_return_cluster_to_free_space( int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen); -void btrfs_test_free_space_cache(void); +/* Support functions for runnint our sanity tests */ +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int test_add_free_space_entry(struct btrfs_block_group_cache *cache, + u64 offset, u64 bytes, bool bitmap); +int test_check_exists(struct btrfs_block_group_cache *cache, + u64 offset, u64 bytes); +#endif #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7bdc83d04d54..f338c5672d58 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -230,12 +230,13 @@ fail: * does the checks required to make sure the data is small enough * to fit as an inline extent. */ -static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode, u64 start, u64 end, - size_t compressed_size, int compress_type, - struct page **compressed_pages) +static noinline int cow_file_range_inline(struct btrfs_root *root, + struct inode *inode, u64 start, + u64 end, size_t compressed_size, + int compress_type, + struct page **compressed_pages) { + struct btrfs_trans_handle *trans; u64 isize = i_size_read(inode); u64 actual_end = min(end + 1, isize); u64 inline_len = actual_end - start; @@ -256,9 +257,16 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, return 1; } + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1); - if (ret) - return ret; + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } if (isize > actual_end) inline_len = min_t(u64, isize, actual_end); @@ -267,15 +275,18 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, compress_type, compressed_pages); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, root, ret); - return ret; + goto out; } else if (ret == -ENOSPC) { - return 1; + ret = 1; + goto out; } set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); btrfs_delalloc_release_metadata(inode, end + 1 - start); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); - return 0; +out: + btrfs_end_transaction(trans, root); + return ret; } struct async_extent { @@ -343,7 +354,6 @@ static noinline int compress_file_range(struct inode *inode, int *num_added) { struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; u64 num_bytes; u64 blocksize = root->sectorsize; u64 actual_end; @@ -461,45 +471,36 @@ again: } cont: if (start == 0) { - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - trans = NULL; - goto cleanup_and_out; - } - trans->block_rsv = &root->fs_info->delalloc_block_rsv; - /* lets try to make an inline extent */ if (ret || total_in < (actual_end - start)) { /* we didn't compress the entire range, try * to make an uncompressed inline extent. */ - ret = cow_file_range_inline(trans, root, inode, - start, end, 0, 0, NULL); + ret = cow_file_range_inline(root, inode, start, end, + 0, 0, NULL); } else { /* try making a compressed inline extent */ - ret = cow_file_range_inline(trans, root, inode, - start, end, + ret = cow_file_range_inline(root, inode, start, end, total_compressed, compress_type, pages); } if (ret <= 0) { + unsigned long clear_flags = EXTENT_DELALLOC | + EXTENT_DEFRAG; + clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0; + /* * inline extent creation worked or returned error, * we don't need to create any more async work items. * Unlock and free up our temp pages. */ - extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, NULL, - EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | - EXTENT_CLEAR_DELALLOC | - EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); - - btrfs_end_transaction(trans, root); + extent_clear_unlock_delalloc(inode, start, end, NULL, + clear_flags, PAGE_UNLOCK | + PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK | + PAGE_END_WRITEBACK); goto free_pages_out; } - btrfs_end_transaction(trans, root); } if (will_compress) { @@ -590,20 +591,6 @@ free_pages_out: kfree(pages); goto out; - -cleanup_and_out: - extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, - start, end, NULL, - EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_DIRTY | - EXTENT_CLEAR_DELALLOC | - EXTENT_SET_WRITEBACK | - EXTENT_END_WRITEBACK); - if (!trans || IS_ERR(trans)) - btrfs_error(root->fs_info, ret, "Failed to join transaction"); - else - btrfs_abort_transaction(trans, root, ret); - goto free_pages_out; } /* @@ -617,7 +604,6 @@ static noinline int submit_compressed_extents(struct inode *inode, { struct async_extent *async_extent; u64 alloc_hint = 0; - struct btrfs_trans_handle *trans; struct btrfs_key ins; struct extent_map *em; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -678,20 +664,10 @@ retry: lock_extent(io_tree, async_extent->start, async_extent->start + async_extent->ram_size - 1); - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - } else { - trans->block_rsv = &root->fs_info->delalloc_block_rsv; - ret = btrfs_reserve_extent(trans, root, + ret = btrfs_reserve_extent(root, async_extent->compressed_size, async_extent->compressed_size, 0, alloc_hint, &ins, 1); - if (ret && ret != -ENOSPC) - btrfs_abort_transaction(trans, root, ret); - btrfs_end_transaction(trans, root); - } - if (ret) { int i; @@ -770,16 +746,12 @@ retry: /* * clear dirty, set writeback and unlock the pages. */ - extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - async_extent->start, + extent_clear_unlock_delalloc(inode, async_extent->start, async_extent->start + async_extent->ram_size - 1, - NULL, EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_UNLOCK | - EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK); - + NULL, EXTENT_LOCKED | EXTENT_DELALLOC, + PAGE_UNLOCK | PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK); ret = btrfs_submit_compressed_write(inode, async_extent->start, async_extent->ram_size, @@ -798,16 +770,13 @@ out: out_free_reserve: btrfs_free_reserved_extent(root, ins.objectid, ins.offset); out_free: - extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, - async_extent->start, + extent_clear_unlock_delalloc(inode, async_extent->start, async_extent->start + async_extent->ram_size - 1, - NULL, EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_UNLOCK | - EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_DIRTY | - EXTENT_SET_WRITEBACK | - EXTENT_END_WRITEBACK); + NULL, EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, + PAGE_UNLOCK | PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); kfree(async_extent); goto again; } @@ -857,14 +826,13 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start, * required to start IO on it. It may be clean and already done with * IO when we return. */ -static noinline int __cow_file_range(struct btrfs_trans_handle *trans, - struct inode *inode, - struct btrfs_root *root, - struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, - int unlock) +static noinline int cow_file_range(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, + int unlock) { + struct btrfs_root *root = BTRFS_I(inode)->root; u64 alloc_hint = 0; u64 num_bytes; unsigned long ram_size; @@ -885,29 +853,24 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, /* if this is a small write inside eof, kick off defrag */ if (num_bytes < 64 * 1024 && (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) - btrfs_add_inode_defrag(trans, inode); + btrfs_add_inode_defrag(NULL, inode); if (start == 0) { /* lets try to make an inline extent */ - ret = cow_file_range_inline(trans, root, inode, - start, end, 0, 0, NULL); + ret = cow_file_range_inline(root, inode, start, end, 0, 0, + NULL); if (ret == 0) { - extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, NULL, - EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_UNLOCK | - EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_DIRTY | - EXTENT_SET_WRITEBACK | - EXTENT_END_WRITEBACK); + extent_clear_unlock_delalloc(inode, start, end, NULL, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DEFRAG, PAGE_UNLOCK | + PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | + PAGE_END_WRITEBACK); *nr_written = *nr_written + (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; *page_started = 1; goto out; } else if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); goto out_unlock; } } @@ -922,13 +885,11 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, unsigned long op; cur_alloc_size = disk_num_bytes; - ret = btrfs_reserve_extent(trans, root, cur_alloc_size, + ret = btrfs_reserve_extent(root, cur_alloc_size, root->sectorsize, 0, alloc_hint, &ins, 1); - if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + if (ret < 0) goto out_unlock; - } em = alloc_extent_map(); if (!em) { @@ -974,10 +935,8 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, BTRFS_DATA_RELOC_TREE_OBJECTID) { ret = btrfs_reloc_clone_csums(inode, start, cur_alloc_size); - if (ret) { - btrfs_abort_transaction(trans, root, ret); + if (ret) goto out_reserve; - } } if (disk_num_bytes < cur_alloc_size) @@ -990,13 +949,13 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, * Do set the Private2 bit so we know this page was properly * setup for writepage */ - op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0; - op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | - EXTENT_SET_PRIVATE2; + op = unlock ? PAGE_UNLOCK : 0; + op |= PAGE_SET_PRIVATE2; - extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, - start, start + ram_size - 1, - locked_page, op); + extent_clear_unlock_delalloc(inode, start, + start + ram_size - 1, locked_page, + EXTENT_LOCKED | EXTENT_DELALLOC, + op); disk_num_bytes -= cur_alloc_size; num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; @@ -1008,52 +967,14 @@ out: out_reserve: btrfs_free_reserved_extent(root, ins.objectid, ins.offset); out_unlock: - extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, locked_page, - EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_UNLOCK | - EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_DIRTY | - EXTENT_SET_WRITEBACK | - EXTENT_END_WRITEBACK); - + extent_clear_unlock_delalloc(inode, start, end, locked_page, + EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | + EXTENT_DELALLOC | EXTENT_DEFRAG, + PAGE_UNLOCK | PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); goto out; } -static noinline int cow_file_range(struct inode *inode, - struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, - int unlock) -{ - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(inode)->root; - int ret; - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, locked_page, - EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_UNLOCK | - EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_DIRTY | - EXTENT_SET_WRITEBACK | - EXTENT_END_WRITEBACK); - return PTR_ERR(trans); - } - trans->block_rsv = &root->fs_info->delalloc_block_rsv; - - ret = __cow_file_range(trans, inode, root, locked_page, start, end, - page_started, nr_written, unlock); - - btrfs_end_transaction(trans, root); - - return ret; -} - /* * work queue call back to started compression on a file and pages */ @@ -1221,15 +1142,13 @@ static noinline int run_delalloc_nocow(struct inode *inode, path = btrfs_alloc_path(); if (!path) { - extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, locked_page, - EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_UNLOCK | - EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_DIRTY | - EXTENT_SET_WRITEBACK | - EXTENT_END_WRITEBACK); + extent_clear_unlock_delalloc(inode, start, end, locked_page, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, PAGE_UNLOCK | + PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK | + PAGE_END_WRITEBACK); return -ENOMEM; } @@ -1241,15 +1160,13 @@ static noinline int run_delalloc_nocow(struct inode *inode, trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { - extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, locked_page, - EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_UNLOCK | - EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_DIRTY | - EXTENT_SET_WRITEBACK | - EXTENT_END_WRITEBACK); + extent_clear_unlock_delalloc(inode, start, end, locked_page, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, PAGE_UNLOCK | + PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK | + PAGE_END_WRITEBACK); btrfs_free_path(path); return PTR_ERR(trans); } @@ -1369,9 +1286,9 @@ out_check: btrfs_release_path(path); if (cow_start != (u64)-1) { - ret = __cow_file_range(trans, inode, root, locked_page, - cow_start, found_key.offset - 1, - page_started, nr_written, 1); + ret = cow_file_range(inode, locked_page, + cow_start, found_key.offset - 1, + page_started, nr_written, 1); if (ret) { btrfs_abort_transaction(trans, root, ret); goto error; @@ -1428,11 +1345,11 @@ out_check: } } - extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, - cur_offset, cur_offset + num_bytes - 1, - locked_page, EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | - EXTENT_SET_PRIVATE2); + extent_clear_unlock_delalloc(inode, cur_offset, + cur_offset + num_bytes - 1, + locked_page, EXTENT_LOCKED | + EXTENT_DELALLOC, PAGE_UNLOCK | + PAGE_SET_PRIVATE2); cur_offset = extent_end; if (cur_offset > end) break; @@ -1445,9 +1362,8 @@ out_check: } if (cow_start != (u64)-1) { - ret = __cow_file_range(trans, inode, root, locked_page, - cow_start, end, - page_started, nr_written, 1); + ret = cow_file_range(inode, locked_page, cow_start, end, + page_started, nr_written, 1); if (ret) { btrfs_abort_transaction(trans, root, ret); goto error; @@ -1460,16 +1376,13 @@ error: ret = err; if (ret && cur_offset < end) - extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - cur_offset, end, locked_page, - EXTENT_CLEAR_UNLOCK_PAGE | - EXTENT_CLEAR_UNLOCK | - EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_DIRTY | - EXTENT_SET_WRITEBACK | - EXTENT_END_WRITEBACK); - + extent_clear_unlock_delalloc(inode, cur_offset, end, + locked_page, EXTENT_LOCKED | + EXTENT_DELALLOC | EXTENT_DEFRAG | + EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | + PAGE_CLEAR_DIRTY | + PAGE_SET_WRITEBACK | + PAGE_END_WRITEBACK); btrfs_free_path(path); return ret; } @@ -2132,6 +2045,7 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, WARN_ON(1); return ret; } + ret = 0; while (1) { cond_resched(); @@ -2181,8 +2095,6 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, old->len || extent_offset + num_bytes <= old->extent_offset + old->offset) continue; - - ret = 0; break; } @@ -2238,16 +2150,18 @@ static noinline bool record_extent_backrefs(struct btrfs_path *path, static int relink_is_mergable(struct extent_buffer *leaf, struct btrfs_file_extent_item *fi, - u64 disk_bytenr) + struct new_sa_defrag_extent *new) { - if (btrfs_file_extent_disk_bytenr(leaf, fi) != disk_bytenr) + if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr) return 0; if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) return 0; - if (btrfs_file_extent_compression(leaf, fi) || - btrfs_file_extent_encryption(leaf, fi) || + if (btrfs_file_extent_compression(leaf, fi) != new->compress_type) + return 0; + + if (btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)) return 0; @@ -2391,8 +2305,8 @@ again: struct btrfs_file_extent_item); extent_len = btrfs_file_extent_num_bytes(leaf, fi); - if (relink_is_mergable(leaf, fi, new->bytenr) && - extent_len + found_key.offset == start) { + if (extent_len + found_key.offset == start && + relink_is_mergable(leaf, fi, new)) { btrfs_set_file_extent_num_bytes(leaf, fi, extent_len + len); btrfs_mark_buffer_dirty(leaf); @@ -2648,8 +2562,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) struct extent_state *cached_state = NULL; struct new_sa_defrag_extent *new = NULL; int compress_type = 0; - int ret; + int ret = 0; + u64 logical_len = ordered_extent->len; bool nolock; + bool truncated = false; nolock = btrfs_is_free_space_inode(inode); @@ -2658,6 +2574,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } + if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { + truncated = true; + logical_len = ordered_extent->truncated_len; + /* Truncated the entire extent, don't bother adding */ + if (!logical_len) + goto out; + } + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ btrfs_ordered_update_i_size(inode, 0, ordered_extent); @@ -2713,15 +2637,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ret = btrfs_mark_extent_written(trans, inode, ordered_extent->file_offset, ordered_extent->file_offset + - ordered_extent->len); + logical_len); } else { BUG_ON(root == root->fs_info->tree_root); ret = insert_reserved_file_extent(trans, inode, ordered_extent->file_offset, ordered_extent->start, ordered_extent->disk_len, - ordered_extent->len, - ordered_extent->len, + logical_len, logical_len, compress_type, 0, 0, BTRFS_FILE_EXTENT_REG); } @@ -2753,17 +2676,27 @@ out: if (trans) btrfs_end_transaction(trans, root); - if (ret) { - clear_extent_uptodate(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + - ordered_extent->len - 1, NULL, GFP_NOFS); + if (ret || truncated) { + u64 start, end; + + if (truncated) + start = ordered_extent->file_offset + logical_len; + else + start = ordered_extent->file_offset; + end = ordered_extent->file_offset + ordered_extent->len - 1; + clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS); + + /* Drop the cache for the part of the extent we didn't write. */ + btrfs_drop_extent_cache(inode, start, end, 0); /* * If the ordered extent had an IOERR or something else went * wrong we need to return the space for this ordered extent - * back to the allocator. + * back to the allocator. We only free the extent in the + * truncated case if we didn't write out the extent at all. */ - if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && + if ((ret || !logical_len) && + !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) btrfs_free_reserved_extent(root, ordered_extent->start, ordered_extent->disk_len); @@ -2827,16 +2760,16 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, * if there's a match, we allow the bio to finish. If not, the code in * extent_io.c will try to find good copies for us. */ -static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, - struct extent_state *state, int mirror) +static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, + u64 phy_offset, struct page *page, + u64 start, u64 end, int mirror) { size_t offset = start - page_offset(page); struct inode *inode = page->mapping->host; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; char *kaddr; - u64 private = ~(u32)0; - int ret; struct btrfs_root *root = BTRFS_I(inode)->root; + u32 csum_expected; u32 csum = ~(u32)0; static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -2856,19 +2789,13 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, return 0; } - if (state && state->start == start) { - private = state->private; - ret = 0; - } else { - ret = get_state_private(io_tree, start, &private); - } - kaddr = kmap_atomic(page); - if (ret) - goto zeroit; + phy_offset >>= inode->i_sb->s_blocksize_bits; + csum_expected = *(((u32 *)io_bio->csum) + phy_offset); + kaddr = kmap_atomic(page); csum = btrfs_csum_data(kaddr + offset, csum, end - start + 1); btrfs_csum_final(csum, (char *)&csum); - if (csum != private) + if (csum != csum_expected) goto zeroit; kunmap_atomic(kaddr); @@ -2877,14 +2804,12 @@ good: zeroit: if (__ratelimit(&_rs)) - btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u private %llu", - (unsigned long long)btrfs_ino(page->mapping->host), - (unsigned long long)start, csum, - (unsigned long long)private); + btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u", + btrfs_ino(page->mapping->host), start, csum, csum_expected); memset(kaddr + offset, 1, end - start + 1); flush_dcache_page(page); kunmap_atomic(kaddr); - if (private == 0) + if (csum_expected == 0) return 0; return -EIO; } @@ -2971,8 +2896,10 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, btrfs_root_refs(&root->root_item) > 0) { ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, root->root_key.objectid); - BUG_ON(ret); - root->orphan_item_inserted = 0; + if (ret) + btrfs_abort_transaction(trans, root, ret); + else + root->orphan_item_inserted = 0; } if (block_rsv) { @@ -3041,11 +2968,18 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) /* insert an orphan item to track this unlinked/truncated file */ if (insert >= 1) { ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); - if (ret && ret != -EEXIST) { - clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, - &BTRFS_I(inode)->runtime_flags); - btrfs_abort_transaction(trans, root, ret); - return ret; + if (ret) { + if (reserve) { + clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, + &BTRFS_I(inode)->runtime_flags); + btrfs_orphan_release_metadata(inode); + } + if (ret != -EEXIST) { + clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags); + btrfs_abort_transaction(trans, root, ret); + return ret; + } } ret = 0; } @@ -3084,17 +3018,15 @@ static int btrfs_orphan_del(struct btrfs_trans_handle *trans, release_rsv = 1; spin_unlock(&root->orphan_lock); - if (trans && delete_item) { + if (trans && delete_item) ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode)); - BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ - } if (release_rsv) { btrfs_orphan_release_metadata(inode); atomic_dec(&root->orphan_inodes); } - return 0; + return ret; } /* @@ -3224,8 +3156,9 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) found_key.objectid); ret = btrfs_del_orphan_item(trans, root, found_key.objectid); - BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */ btrfs_end_transaction(trans, root); + if (ret) + goto out; continue; } @@ -3657,8 +3590,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, if (ret) { btrfs_info(root->fs_info, "failed to delete reference to %.*s, inode %llu parent %llu", - name_len, name, - (unsigned long long)ino, (unsigned long long)dir_ino); + name_len, name, ino, dir_ino); btrfs_abort_transaction(trans, root, ret); goto err; } @@ -3929,6 +3861,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u64 extent_num_bytes = 0; u64 extent_offset = 0; u64 item_end = 0; + u64 last_size = (u64)-1; u32 found_type = (u8)-1; int found_extent; int del_item; @@ -4026,6 +3959,11 @@ search_again: if (found_type != BTRFS_EXTENT_DATA_KEY) goto delete; + if (del_item) + last_size = found_key.offset; + else + last_size = new_size; + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { u64 num_dec; extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); @@ -4137,6 +4075,8 @@ out: btrfs_abort_transaction(trans, root, ret); } error: + if (last_size != (u64)-1) + btrfs_ordered_update_i_size(inode, last_size, NULL); btrfs_free_path(path); return err; } @@ -4409,7 +4349,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); if (newsize > oldsize) { - truncate_pagecache(inode, oldsize, newsize); + truncate_pagecache(inode, newsize); ret = btrfs_cont_expand(inode, oldsize, newsize); if (ret) return ret; @@ -4465,8 +4405,26 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) btrfs_inode_resume_unlocked_dio(inode); ret = btrfs_truncate(inode); - if (ret && inode->i_nlink) - btrfs_orphan_del(NULL, inode); + if (ret && inode->i_nlink) { + int err; + + /* + * failed to truncate, disk_i_size is only adjusted down + * as we remove extents, so it should represent the true + * size of the inode, so reset the in memory size and + * delete our orphan entry. + */ + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + btrfs_orphan_del(NULL, inode); + return ret; + } + i_size_write(inode, BTRFS_I(inode)->disk_i_size); + err = btrfs_orphan_del(trans, inode); + if (err) + btrfs_abort_transaction(trans, root, err); + btrfs_end_transaction(trans, root); + } } return ret; @@ -4601,10 +4559,15 @@ void btrfs_evict_inode(struct inode *inode) btrfs_free_block_rsv(root, rsv); + /* + * Errors here aren't a big deal, it just means we leave orphan items + * in the tree. They will be cleaned up on the next mount. + */ if (ret == 0) { trans->block_rsv = root->orphan_block_rsv; - ret = btrfs_orphan_del(trans, inode); - BUG_ON(ret); + btrfs_orphan_del(trans, inode); + } else { + btrfs_orphan_del(NULL, inode); } trans->block_rsv = &root->fs_info->trans_block_rsv; @@ -6161,10 +6124,7 @@ insert: btrfs_release_path(path); if (em->start > start || extent_map_end(em) <= start) { btrfs_err(root->fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]", - (unsigned long long)em->start, - (unsigned long long)em->len, - (unsigned long long)start, - (unsigned long long)len); + em->start, em->len, start, len); err = -EIO; goto out; } @@ -6362,39 +6322,32 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, u64 start, u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; struct extent_map *em; struct btrfs_key ins; u64 alloc_hint; int ret; - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return ERR_CAST(trans); - - trans->block_rsv = &root->fs_info->delalloc_block_rsv; - alloc_hint = get_extent_allocation_hint(inode, start, len); - ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0, + ret = btrfs_reserve_extent(root, len, root->sectorsize, 0, alloc_hint, &ins, 1); - if (ret) { - em = ERR_PTR(ret); - goto out; - } + if (ret) + return ERR_PTR(ret); em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, ins.offset, ins.offset, ins.offset, 0); - if (IS_ERR(em)) - goto out; + if (IS_ERR(em)) { + btrfs_free_reserved_extent(root, ins.objectid, ins.offset); + return em; + } ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, ins.offset, ins.offset, 0); if (ret) { btrfs_free_reserved_extent(root, ins.objectid, ins.offset); - em = ERR_PTR(ret); + free_extent_map(em); + return ERR_PTR(ret); } -out: - btrfs_end_transaction(trans, root); + return em; } @@ -6402,11 +6355,11 @@ out: * returns 1 when the nocow is safe, < 1 on error, 0 if the * block must be cow'd */ -noinline int can_nocow_extent(struct btrfs_trans_handle *trans, - struct inode *inode, u64 offset, u64 *len, +noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes) { + struct btrfs_trans_handle *trans; struct btrfs_path *path; int ret; struct extent_buffer *leaf; @@ -6424,7 +6377,7 @@ noinline int can_nocow_extent(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), offset, 0); if (ret < 0) goto out; @@ -6489,9 +6442,19 @@ noinline int can_nocow_extent(struct btrfs_trans_handle *trans, * look for other files referencing this extent, if we * find any we must cow */ - if (btrfs_cross_ref_exist(trans, root, btrfs_ino(inode), - key.offset - backref_offset, disk_bytenr)) + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = 0; goto out; + } + + ret = btrfs_cross_ref_exist(trans, root, btrfs_ino(inode), + key.offset - backref_offset, disk_bytenr); + btrfs_end_transaction(trans, root); + if (ret) { + ret = 0; + goto out; + } /* * adjust disk_bytenr and num_bytes to cover just the bytes @@ -6633,7 +6596,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, u64 start = iblock << inode->i_blkbits; u64 lockstart, lockend; u64 len = bh_result->b_size; - struct btrfs_trans_handle *trans; int unlock_bits = EXTENT_LOCKED; int ret = 0; @@ -6715,16 +6677,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, len = min(len, em->len - (start - em->start)); block_start = em->block_start + (start - em->start); - /* - * we're not going to log anything, but we do need - * to make sure the current transaction stays open - * while we look for nocow cross refs - */ - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - goto must_cow; - - if (can_nocow_extent(trans, inode, start, &len, &orig_start, + if (can_nocow_extent(inode, start, &len, &orig_start, &orig_block_len, &ram_bytes) == 1) { if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); @@ -6733,24 +6686,20 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, block_start, len, orig_block_len, ram_bytes, type); - if (IS_ERR(em)) { - btrfs_end_transaction(trans, root); + if (IS_ERR(em)) goto unlock_err; - } } ret = btrfs_add_ordered_extent_dio(inode, start, block_start, len, len, type); - btrfs_end_transaction(trans, root); if (ret) { free_extent_map(em); goto unlock_err; } goto unlock; } - btrfs_end_transaction(trans, root); } -must_cow: + /* * this will cow the extent, reset the len in case we changed * it above @@ -6813,26 +6762,6 @@ unlock_err: return ret; } -struct btrfs_dio_private { - struct inode *inode; - u64 logical_offset; - u64 disk_bytenr; - u64 bytes; - void *private; - - /* number of bios pending for this dio */ - atomic_t pending_bios; - - /* IO errors */ - int errors; - - /* orig_bio is our btrfs_io_bio */ - struct bio *orig_bio; - - /* dio_bio came from fs/direct-io.c */ - struct bio *dio_bio; -}; - static void btrfs_endio_direct_read(struct bio *bio, int err) { struct btrfs_dio_private *dip = bio->bi_private; @@ -6841,6 +6770,8 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct bio *dio_bio; + u32 *csums = (u32 *)dip->csum; + int index = 0; u64 start; start = dip->logical_offset; @@ -6849,12 +6780,8 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) struct page *page = bvec->bv_page; char *kaddr; u32 csum = ~(u32)0; - u64 private = ~(u32)0; unsigned long flags; - if (get_state_private(&BTRFS_I(inode)->io_tree, - start, &private)) - goto failed; local_irq_save(flags); kaddr = kmap_atomic(page); csum = btrfs_csum_data(kaddr + bvec->bv_offset, @@ -6864,18 +6791,17 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) local_irq_restore(flags); flush_dcache_page(bvec->bv_page); - if (csum != private) { -failed: - btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u private %u", - (unsigned long long)btrfs_ino(inode), - (unsigned long long)start, - csum, (unsigned)private); + if (csum != csums[index]) { + btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u", + btrfs_ino(inode), start, csum, + csums[index]); err = -EIO; } } start += bvec->bv_len; bvec++; + index++; } while (bvec <= bvec_end); unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, @@ -6956,7 +6882,7 @@ static void btrfs_end_dio_bio(struct bio *bio, int err) if (err) { printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu " "sector %#Lx len %u err no %d\n", - (unsigned long long)btrfs_ino(dip->inode), bio->bi_rw, + btrfs_ino(dip->inode), bio->bi_rw, (unsigned long long)bio->bi_sector, bio->bi_size, err); dip->errors = 1; @@ -6992,6 +6918,7 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, int rw, u64 file_offset, int skip_sum, int async_submit) { + struct btrfs_dio_private *dip = bio->bi_private; int write = rw & REQ_WRITE; struct btrfs_root *root = BTRFS_I(inode)->root; int ret; @@ -7026,7 +6953,8 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, if (ret) goto err; } else if (!skip_sum) { - ret = btrfs_lookup_bio_sums_dio(root, inode, bio, file_offset); + ret = btrfs_lookup_bio_sums_dio(root, inode, dip, bio, + file_offset); if (ret) goto err; } @@ -7061,6 +6989,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, bio_put(orig_bio); return -EIO; } + if (map_length >= orig_bio->bi_size) { bio = orig_bio; goto submit; @@ -7156,19 +7085,28 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, struct btrfs_dio_private *dip; struct bio *io_bio; int skip_sum; + int sum_len; int write = rw & REQ_WRITE; int ret = 0; + u16 csum_size; skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS); - if (!io_bio) { ret = -ENOMEM; goto free_ordered; } - dip = kmalloc(sizeof(*dip), GFP_NOFS); + if (!skip_sum && !write) { + csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + sum_len = dio_bio->bi_size >> inode->i_sb->s_blocksize_bits; + sum_len *= csum_size; + } else { + sum_len = 0; + } + + dip = kmalloc(sizeof(*dip) + sum_len, GFP_NOFS); if (!dip) { ret = -ENOMEM; goto free_io_bio; @@ -7443,10 +7381,23 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, * whoever cleared the private bit is responsible * for the finish_ordered_io */ - if (TestClearPagePrivate2(page) && - btrfs_dec_test_ordered_pending(inode, &ordered, page_start, - PAGE_CACHE_SIZE, 1)) { - btrfs_finish_ordered_io(ordered); + if (TestClearPagePrivate2(page)) { + struct btrfs_ordered_inode_tree *tree; + u64 new_len; + + tree = &BTRFS_I(inode)->ordered_tree; + + spin_lock_irq(&tree->lock); + set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); + new_len = page_start - ordered->file_offset; + if (new_len < ordered->truncated_len) + ordered->truncated_len = new_len; + spin_unlock_irq(&tree->lock); + + if (btrfs_dec_test_ordered_pending(inode, &ordered, + page_start, + PAGE_CACHE_SIZE, 1)) + btrfs_finish_ordered_io(ordered); } btrfs_put_ordered_extent(ordered); cached_state = NULL; @@ -7612,7 +7563,6 @@ static int btrfs_truncate(struct inode *inode) u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); - btrfs_ordered_update_i_size(inode, inode->i_size, NULL); /* * Yes ladies and gentelment, this is indeed ugly. The fact is we have @@ -7876,7 +7826,7 @@ void btrfs_destroy_inode(struct inode *inode) if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &BTRFS_I(inode)->runtime_flags)) { btrfs_info(root->fs_info, "inode %llu still on the orphan list", - (unsigned long long)btrfs_ino(inode)); + btrfs_ino(inode)); atomic_dec(&root->orphan_inodes); } @@ -7886,8 +7836,7 @@ void btrfs_destroy_inode(struct inode *inode) break; else { btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup", - (unsigned long long)ordered->file_offset, - (unsigned long long)ordered->len); + ordered->file_offset, ordered->len); btrfs_remove_ordered_extent(inode, ordered); btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); @@ -8161,10 +8110,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_dentry->d_name.name, new_dentry->d_name.len); } - if (!ret && new_inode->i_nlink == 0) { + if (!ret && new_inode->i_nlink == 0) ret = btrfs_orphan_add(trans, new_dentry->d_inode); - BUG_ON(ret); - } if (ret) { btrfs_abort_transaction(trans, root, ret); goto out_fail; @@ -8525,8 +8472,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, cur_bytes = min(num_bytes, 256ULL * 1024 * 1024); cur_bytes = max(cur_bytes, min_size); - ret = btrfs_reserve_extent(trans, root, cur_bytes, - min_size, 0, *alloc_hint, &ins, 1); + ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0, + *alloc_hint, &ins, 1); if (ret) { if (own_trans) btrfs_end_transaction(trans, root); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 238a05545ee2..1a5b9462dd9a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -43,6 +43,7 @@ #include <linux/blkdev.h> #include <linux/uuid.h> #include <linux/btrfs.h> +#include <linux/uaccess.h> #include "compat.h" #include "ctree.h" #include "disk-io.h" @@ -57,6 +58,9 @@ #include "send.h" #include "dev-replace.h" +static int btrfs_clone(struct inode *src, struct inode *inode, + u64 off, u64 olen, u64 olen_aligned, u64 destoff); + /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) { @@ -363,6 +367,13 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) return 0; } +int btrfs_is_empty_uuid(u8 *uuid) +{ + static char empty_uuid[BTRFS_UUID_SIZE] = {0}; + + return !memcmp(uuid, empty_uuid, BTRFS_UUID_SIZE); +} + static noinline int create_subvol(struct inode *dir, struct dentry *dentry, char *name, int namelen, @@ -396,7 +407,7 @@ static noinline int create_subvol(struct inode *dir, * of create_snapshot(). */ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, - 7, &qgroup_reserved); + 8, &qgroup_reserved, false); if (ret) return ret; @@ -425,26 +436,25 @@ static noinline int create_subvol(struct inode *dir, btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(leaf, objectid); - write_extent_buffer(leaf, root->fs_info->fsid, - (unsigned long)btrfs_header_fsid(leaf), + write_extent_buffer(leaf, root->fs_info->fsid, btrfs_header_fsid(leaf), BTRFS_FSID_SIZE); write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_header_chunk_tree_uuid(leaf), + btrfs_header_chunk_tree_uuid(leaf), BTRFS_UUID_SIZE); btrfs_mark_buffer_dirty(leaf); memset(&root_item, 0, sizeof(root_item)); inode_item = &root_item.inode; - inode_item->generation = cpu_to_le64(1); - inode_item->size = cpu_to_le64(3); - inode_item->nlink = cpu_to_le32(1); - inode_item->nbytes = cpu_to_le64(root->leafsize); - inode_item->mode = cpu_to_le32(S_IFDIR | 0755); + btrfs_set_stack_inode_generation(inode_item, 1); + btrfs_set_stack_inode_size(inode_item, 3); + btrfs_set_stack_inode_nlink(inode_item, 1); + btrfs_set_stack_inode_nbytes(inode_item, root->leafsize); + btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); - root_item.flags = 0; - root_item.byte_limit = 0; - inode_item->flags = cpu_to_le64(BTRFS_INODE_ROOT_ITEM_INIT); + btrfs_set_root_flags(&root_item, 0); + btrfs_set_root_limit(&root_item, 0); + btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT); btrfs_set_root_bytenr(&root_item, leaf->start); btrfs_set_root_generation(&root_item, trans->transid); @@ -457,8 +467,8 @@ static noinline int create_subvol(struct inode *dir, btrfs_root_generation(&root_item)); uuid_le_gen(&new_uuid); memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE); - root_item.otime.sec = cpu_to_le64(cur_time.tv_sec); - root_item.otime.nsec = cpu_to_le32(cur_time.tv_nsec); + btrfs_set_stack_timespec_sec(&root_item.otime, cur_time.tv_sec); + btrfs_set_stack_timespec_nsec(&root_item.otime, cur_time.tv_nsec); root_item.ctime = root_item.otime; btrfs_set_root_ctransid(&root_item, trans->transid); btrfs_set_root_otransid(&root_item, trans->transid); @@ -518,9 +528,14 @@ static noinline int create_subvol(struct inode *dir, ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, objectid, root->root_key.objectid, btrfs_ino(dir), index, name, namelen); - BUG_ON(ret); + ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root, + root_item.uuid, BTRFS_UUID_KEY_SUBVOL, + objectid); + if (ret) + btrfs_abort_transaction(trans, root, ret); + fail: trans->block_rsv = NULL; trans->bytes_reserved = 0; @@ -573,10 +588,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, * 1 - root item * 2 - root ref/backref * 1 - root of snapshot + * 1 - UUID item */ ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, - &pending_snapshot->block_rsv, 7, - &pending_snapshot->qgroup_reserved); + &pending_snapshot->block_rsv, 8, + &pending_snapshot->qgroup_reserved, + false); if (ret) goto out; @@ -1267,9 +1284,6 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, cluster = max_cluster; } - if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) - BTRFS_I(inode)->force_compress = compress_type; - if (i + cluster > ra_index) { ra_index = max(i, ra_index); btrfs_force_ra(inode->i_mapping, ra, file, ra_index, @@ -1278,6 +1292,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, } mutex_lock(&inode->i_mutex); + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) + BTRFS_I(inode)->force_compress = compress_type; ret = cluster_pages_for_defrag(inode, pages, i, cluster); if (ret < 0) { mutex_unlock(&inode->i_mutex); @@ -1334,10 +1350,6 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, atomic_read(&root->fs_info->async_delalloc_pages) == 0)); } atomic_dec(&root->fs_info->async_submit_draining); - - mutex_lock(&inode->i_mutex); - BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE; - mutex_unlock(&inode->i_mutex); } if (range->compress_type == BTRFS_COMPRESS_LZO) { @@ -1347,6 +1359,11 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ret = defrag_count; out_ra: + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) { + mutex_lock(&inode->i_mutex); + BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE; + mutex_unlock(&inode->i_mutex); + } if (!file) kfree(ra); kfree(pages); @@ -1377,9 +1394,8 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { - pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); mnt_drop_write_file(file); - return -EINVAL; + return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } mutex_lock(&root->fs_info->volume_mutex); @@ -1403,14 +1419,13 @@ static noinline int btrfs_ioctl_resize(struct file *file, ret = -EINVAL; goto out_free; } - printk(KERN_INFO "btrfs: resizing devid %llu\n", - (unsigned long long)devid); + printk(KERN_INFO "btrfs: resizing devid %llu\n", devid); } device = btrfs_find_device(root->fs_info, devid, NULL, NULL); if (!device) { printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", - (unsigned long long)devid); + devid); ret = -ENODEV; goto out_free; } @@ -1418,7 +1433,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (!device->writeable) { printk(KERN_INFO "btrfs: resizer unable to apply on " "readonly device %llu\n", - (unsigned long long)devid); + devid); ret = -EPERM; goto out_free; } @@ -1470,8 +1485,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, new_size *= root->sectorsize; printk_in_rcu(KERN_INFO "btrfs: new size for %s is %llu\n", - rcu_str_deref(device->name), - (unsigned long long)new_size); + rcu_str_deref(device->name), new_size); if (new_size > old_size) { trans = btrfs_start_transaction(root, 0); @@ -1721,13 +1735,28 @@ out: static noinline int may_destroy_subvol(struct btrfs_root *root) { struct btrfs_path *path; + struct btrfs_dir_item *di; struct btrfs_key key; + u64 dir_id; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; + /* Make sure this root isn't set as the default subvol */ + dir_id = btrfs_super_root_dir(root->fs_info->super_copy); + di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, path, + dir_id, "default", 7, 0); + if (di && !IS_ERR(di)) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); + if (key.objectid == root->root_key.objectid) { + ret = -ENOTEMPTY; + goto out; + } + btrfs_release_path(path); + } + key.objectid = root->root_key.objectid; key.type = BTRFS_ROOT_REF_KEY; key.offset = (u64)-1; @@ -1993,25 +2022,29 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; + else if (ret > 0) { + ret = btrfs_previous_item(root, path, dirid, + BTRFS_INODE_REF_KEY); + if (ret < 0) + goto out; + else if (ret > 0) { + ret = -ENOENT; + goto out; + } + } l = path->nodes[0]; slot = path->slots[0]; - if (ret > 0 && slot > 0) - slot--; btrfs_item_key_to_cpu(l, &key, slot); - if (ret > 0 && (key.objectid != dirid || - key.type != BTRFS_INODE_REF_KEY)) { - ret = -ENOENT; - goto out; - } - iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref); len = btrfs_inode_ref_name_len(l, iref); ptr -= len + 1; total_len += len + 1; - if (ptr < name) + if (ptr < name) { + ret = -ENAMETOOLONG; goto out; + } *(ptr + len) = '/'; read_extent_buffer(l, ptr,(unsigned long)(iref + 1), len); @@ -2024,8 +2057,6 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, key.offset = (u64)-1; dirid = key.objectid; } - if (ptr < name) - goto out; memmove(name, ptr, total_len); name[total_len]='\0'; ret = 0; @@ -2174,7 +2205,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * ref/backref. */ err = btrfs_subvolume_reserve_metadata(root, &block_rsv, - 5, &qgroup_reserved); + 5, &qgroup_reserved, true); if (err) goto out_up_write; @@ -2213,6 +2244,27 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out_end_trans; } } + + ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root, + dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL, + dest->root_key.objectid); + if (ret && ret != -ENOENT) { + btrfs_abort_transaction(trans, root, ret); + err = ret; + goto out_end_trans; + } + if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) { + ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root, + dest->root_item.received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + dest->root_key.objectid); + if (ret && ret != -ENOENT) { + btrfs_abort_transaction(trans, root, ret); + err = ret; + goto out_end_trans; + } + } + out_end_trans: trans->block_rsv = NULL; trans->bytes_reserved = 0; @@ -2326,8 +2378,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { - pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); - return -EINVAL; + return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } mutex_lock(&root->fs_info->volume_mutex); @@ -2400,10 +2451,10 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) if (!fi_args) return -ENOMEM; + mutex_lock(&fs_devices->device_list_mutex); fi_args->num_devices = fs_devices->num_devices; memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid)); - mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { if (device->devid > fi_args->max_id) fi_args->max_id = device->devid; @@ -2424,7 +2475,6 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; int ret = 0; char *s_uuid = NULL; - char empty_uuid[BTRFS_UUID_SIZE] = {0}; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -2433,7 +2483,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) if (IS_ERR(di_args)) return PTR_ERR(di_args); - if (memcmp(empty_uuid, di_args->uuid, BTRFS_UUID_SIZE) != 0) + if (!btrfs_is_empty_uuid(di_args->uuid)) s_uuid = di_args->uuid; mutex_lock(&fs_devices->device_list_mutex); @@ -2469,150 +2519,336 @@ out: return ret; } -static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, - u64 off, u64 olen, u64 destoff) +static struct page *extent_same_get_page(struct inode *inode, u64 off) +{ + struct page *page; + pgoff_t index; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + + index = off >> PAGE_CACHE_SHIFT; + + page = grab_cache_page(inode->i_mapping, index); + if (!page) + return NULL; + + if (!PageUptodate(page)) { + if (extent_read_full_page_nolock(tree, page, btrfs_get_extent, + 0)) + return NULL; + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + return NULL; + } + } + unlock_page(page); + + return page; +} + +static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) +{ + /* do any pending delalloc/csum calc on src, one way or + another, and lock file content */ + while (1) { + struct btrfs_ordered_extent *ordered; + lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); + ordered = btrfs_lookup_first_ordered_extent(inode, + off + len - 1); + if (!ordered && + !test_range_bit(&BTRFS_I(inode)->io_tree, off, + off + len - 1, EXTENT_DELALLOC, 0, NULL)) + break; + unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1); + if (ordered) + btrfs_put_ordered_extent(ordered); + btrfs_wait_ordered_range(inode, off, len); + } +} + +static void btrfs_double_unlock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) +{ + unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); + unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); + + mutex_unlock(&inode1->i_mutex); + mutex_unlock(&inode2->i_mutex); +} + +static void btrfs_double_lock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) +{ + if (inode1 < inode2) { + swap(inode1, inode2); + swap(loff1, loff2); + } + + mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); + lock_extent_range(inode1, loff1, len); + if (inode1 != inode2) { + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); + lock_extent_range(inode2, loff2, len); + } +} + +static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, + u64 dst_loff, u64 len) +{ + int ret = 0; + struct page *src_page, *dst_page; + unsigned int cmp_len = PAGE_CACHE_SIZE; + void *addr, *dst_addr; + + while (len) { + if (len < PAGE_CACHE_SIZE) + cmp_len = len; + + src_page = extent_same_get_page(src, loff); + if (!src_page) + return -EINVAL; + dst_page = extent_same_get_page(dst, dst_loff); + if (!dst_page) { + page_cache_release(src_page); + return -EINVAL; + } + addr = kmap_atomic(src_page); + dst_addr = kmap_atomic(dst_page); + + flush_dcache_page(src_page); + flush_dcache_page(dst_page); + + if (memcmp(addr, dst_addr, cmp_len)) + ret = BTRFS_SAME_DATA_DIFFERS; + + kunmap_atomic(addr); + kunmap_atomic(dst_addr); + page_cache_release(src_page); + page_cache_release(dst_page); + + if (ret) + break; + + loff += cmp_len; + dst_loff += cmp_len; + len -= cmp_len; + } + + return ret; +} + +static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len) +{ + u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize; + + if (off + len > inode->i_size || off + len < off) + return -EINVAL; + /* Check that we are block aligned - btrfs_clone() requires this */ + if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs)) + return -EINVAL; + + return 0; +} + +static int btrfs_extent_same(struct inode *src, u64 loff, u64 len, + struct inode *dst, u64 dst_loff) { - struct inode *inode = file_inode(file); - struct btrfs_root *root = BTRFS_I(inode)->root; - struct fd src_file; - struct inode *src; - struct btrfs_trans_handle *trans; - struct btrfs_path *path; - struct extent_buffer *leaf; - char *buf; - struct btrfs_key key; - u32 nritems; - int slot; int ret; - u64 len = olen; - u64 bs = root->fs_info->sb->s_blocksize; - int same_inode = 0; /* - * TODO: - * - split compressed inline extents. annoying: we need to - * decompress into destination's address_space (the file offset - * may change, so source mapping won't do), then recompress (or - * otherwise reinsert) a subrange. - * - allow ranges within the same file to be cloned (provided - * they don't overlap)? + * btrfs_clone() can't handle extents in the same file + * yet. Once that works, we can drop this check and replace it + * with a check for the same inode, but overlapping extents. */ - - /* the destination must be opened for writing */ - if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) + if (src == dst) return -EINVAL; - if (btrfs_root_readonly(root)) - return -EROFS; + btrfs_double_lock(src, loff, dst, dst_loff, len); + + ret = extent_same_check_offsets(src, loff, len); + if (ret) + goto out_unlock; + + ret = extent_same_check_offsets(dst, dst_loff, len); + if (ret) + goto out_unlock; + + /* don't make the dst file partly checksummed */ + if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != + (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) { + ret = -EINVAL; + goto out_unlock; + } + + ret = btrfs_cmp_data(src, loff, dst, dst_loff, len); + if (ret == 0) + ret = btrfs_clone(src, dst, loff, len, len, dst_loff); + +out_unlock: + btrfs_double_unlock(src, loff, dst, dst_loff, len); + + return ret; +} + +#define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024) + +static long btrfs_ioctl_file_extent_same(struct file *file, + void __user *argp) +{ + struct btrfs_ioctl_same_args *args = argp; + struct btrfs_ioctl_same_args same; + struct btrfs_ioctl_same_extent_info info; + struct inode *src = file->f_dentry->d_inode; + struct file *dst_file = NULL; + struct inode *dst; + u64 off; + u64 len; + int i; + int ret; + u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; + bool is_admin = capable(CAP_SYS_ADMIN); + + if (!(file->f_mode & FMODE_READ)) + return -EINVAL; ret = mnt_want_write_file(file); if (ret) return ret; - src_file = fdget(srcfd); - if (!src_file.file) { - ret = -EBADF; - goto out_drop_write; + if (copy_from_user(&same, + (struct btrfs_ioctl_same_args __user *)argp, + sizeof(same))) { + ret = -EFAULT; + goto out; } - ret = -EXDEV; - if (src_file.file->f_path.mnt != file->f_path.mnt) - goto out_fput; + off = same.logical_offset; + len = same.length; - src = file_inode(src_file.file); + /* + * Limit the total length we will dedupe for each operation. + * This is intended to bound the total time spent in this + * ioctl to something sane. + */ + if (len > BTRFS_MAX_DEDUPE_LEN) + len = BTRFS_MAX_DEDUPE_LEN; - ret = -EINVAL; - if (src == inode) - same_inode = 1; + if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) { + /* + * Btrfs does not support blocksize < page_size. As a + * result, btrfs_cmp_data() won't correctly handle + * this situation without an update. + */ + ret = -EINVAL; + goto out; + } - /* the src must be open for reading */ - if (!(src_file.file->f_mode & FMODE_READ)) - goto out_fput; + ret = -EISDIR; + if (S_ISDIR(src->i_mode)) + goto out; - /* don't make the dst file partly checksummed */ - if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != - (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) - goto out_fput; + ret = -EACCES; + if (!S_ISREG(src->i_mode)) + goto out; - ret = -EISDIR; - if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) - goto out_fput; + ret = 0; + for (i = 0; i < same.dest_count; i++) { + if (copy_from_user(&info, &args->info[i], sizeof(info))) { + ret = -EFAULT; + goto out; + } - ret = -EXDEV; - if (src->i_sb != inode->i_sb) - goto out_fput; + info.bytes_deduped = 0; - ret = -ENOMEM; - buf = vmalloc(btrfs_level_size(root, 0)); - if (!buf) - goto out_fput; + dst_file = fget(info.fd); + if (!dst_file) { + info.status = -EBADF; + goto next; + } - path = btrfs_alloc_path(); - if (!path) { - vfree(buf); - goto out_fput; - } - path->reada = 2; + if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) { + info.status = -EINVAL; + goto next; + } - if (!same_inode) { - if (inode < src) { - mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD); - } else { - mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + info.status = -EXDEV; + if (file->f_path.mnt != dst_file->f_path.mnt) + goto next; + + dst = dst_file->f_dentry->d_inode; + if (src->i_sb != dst->i_sb) + goto next; + + if (S_ISDIR(dst->i_mode)) { + info.status = -EISDIR; + goto next; } - } else { - mutex_lock(&src->i_mutex); - } - /* determine range to clone */ - ret = -EINVAL; - if (off + len > src->i_size || off + len < off) - goto out_unlock; - if (len == 0) - olen = len = src->i_size - off; - /* if we extend to eof, continue to block boundary */ - if (off + len == src->i_size) - len = ALIGN(src->i_size, bs) - off; + if (!S_ISREG(dst->i_mode)) { + info.status = -EACCES; + goto next; + } - /* verify the end result is block aligned */ - if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || - !IS_ALIGNED(destoff, bs)) - goto out_unlock; + info.status = btrfs_extent_same(src, off, len, dst, + info.logical_offset); + if (info.status == 0) + info.bytes_deduped += len; - /* verify if ranges are overlapped within the same file */ - if (same_inode) { - if (destoff + len > off && destoff < off + len) - goto out_unlock; - } +next: + if (dst_file) + fput(dst_file); - if (destoff > inode->i_size) { - ret = btrfs_cont_expand(inode, inode->i_size, destoff); - if (ret) - goto out_unlock; + if (__put_user_unaligned(info.status, &args->info[i].status) || + __put_user_unaligned(info.bytes_deduped, + &args->info[i].bytes_deduped)) { + ret = -EFAULT; + goto out; + } } - /* truncate page cache pages from target inode range */ - truncate_inode_pages_range(&inode->i_data, destoff, - PAGE_CACHE_ALIGN(destoff + len) - 1); +out: + mnt_drop_write_file(file); + return ret; +} - /* do any pending delalloc/csum calc on src, one way or - another, and lock file content */ - while (1) { - struct btrfs_ordered_extent *ordered; - lock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); - ordered = btrfs_lookup_first_ordered_extent(src, off + len - 1); - if (!ordered && - !test_range_bit(&BTRFS_I(src)->io_tree, off, off + len - 1, - EXTENT_DELALLOC, 0, NULL)) - break; - unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); - if (ordered) - btrfs_put_ordered_extent(ordered); - btrfs_wait_ordered_range(src, off, len); +/** + * btrfs_clone() - clone a range from inode file to another + * + * @src: Inode to clone from + * @inode: Inode to clone to + * @off: Offset within source to start clone from + * @olen: Original length, passed by user, of range to clone + * @olen_aligned: Block-aligned value of olen, extent_same uses + * identical values here + * @destoff: Offset within @inode to start clone + */ +static int btrfs_clone(struct inode *src, struct inode *inode, + u64 off, u64 olen, u64 olen_aligned, u64 destoff) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path = NULL; + struct extent_buffer *leaf; + struct btrfs_trans_handle *trans; + char *buf = NULL; + struct btrfs_key key; + u32 nritems; + int slot; + int ret; + u64 len = olen_aligned; + + ret = -ENOMEM; + buf = vmalloc(btrfs_level_size(root, 0)); + if (!buf) + return ret; + + path = btrfs_alloc_path(); + if (!path) { + vfree(buf); + return ret; } + path->reada = 2; /* clone data */ key.objectid = btrfs_ino(src); key.type = BTRFS_EXTENT_DATA_KEY; @@ -2858,15 +3094,132 @@ next: key.offset++; } ret = 0; + out: btrfs_release_path(path); + btrfs_free_path(path); + vfree(buf); + return ret; +} + +static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, + u64 off, u64 olen, u64 destoff) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct fd src_file; + struct inode *src; + int ret; + u64 len = olen; + u64 bs = root->fs_info->sb->s_blocksize; + int same_inode = 0; + + /* + * TODO: + * - split compressed inline extents. annoying: we need to + * decompress into destination's address_space (the file offset + * may change, so source mapping won't do), then recompress (or + * otherwise reinsert) a subrange. + * - allow ranges within the same file to be cloned (provided + * they don't overlap)? + */ + + /* the destination must be opened for writing */ + if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) + return -EINVAL; + + if (btrfs_root_readonly(root)) + return -EROFS; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + src_file = fdget(srcfd); + if (!src_file.file) { + ret = -EBADF; + goto out_drop_write; + } + + ret = -EXDEV; + if (src_file.file->f_path.mnt != file->f_path.mnt) + goto out_fput; + + src = file_inode(src_file.file); + + ret = -EINVAL; + if (src == inode) + same_inode = 1; + + /* the src must be open for reading */ + if (!(src_file.file->f_mode & FMODE_READ)) + goto out_fput; + + /* don't make the dst file partly checksummed */ + if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != + (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) + goto out_fput; + + ret = -EISDIR; + if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) + goto out_fput; + + ret = -EXDEV; + if (src->i_sb != inode->i_sb) + goto out_fput; + + if (!same_inode) { + if (inode < src) { + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD); + } else { + mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + } + } else { + mutex_lock(&src->i_mutex); + } + + /* determine range to clone */ + ret = -EINVAL; + if (off + len > src->i_size || off + len < off) + goto out_unlock; + if (len == 0) + olen = len = src->i_size - off; + /* if we extend to eof, continue to block boundary */ + if (off + len == src->i_size) + len = ALIGN(src->i_size, bs) - off; + + /* verify the end result is block aligned */ + if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || + !IS_ALIGNED(destoff, bs)) + goto out_unlock; + + /* verify if ranges are overlapped within the same file */ + if (same_inode) { + if (destoff + len > off && destoff < off + len) + goto out_unlock; + } + + if (destoff > inode->i_size) { + ret = btrfs_cont_expand(inode, inode->i_size, destoff); + if (ret) + goto out_unlock; + } + + /* truncate page cache pages from target inode range */ + truncate_inode_pages_range(&inode->i_data, destoff, + PAGE_CACHE_ALIGN(destoff + len) - 1); + + lock_extent_range(src, off, len); + + ret = btrfs_clone(src, inode, off, olen, len, destoff); + unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1); out_unlock: mutex_unlock(&src->i_mutex); if (!same_inode) mutex_unlock(&inode->i_mutex); - vfree(buf); - btrfs_free_path(path); out_fput: fdput(src_file); out_drop_write: @@ -3312,11 +3665,13 @@ static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg) switch (p->cmd) { case BTRFS_IOCTL_DEV_REPLACE_CMD_START: + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + if (atomic_xchg( &root->fs_info->mutually_exclusive_operation_running, 1)) { - pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); - ret = -EINPROGRESS; + ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } else { ret = btrfs_dev_replace_start(root, p); atomic_set( @@ -3560,8 +3915,7 @@ again: } else { /* this is (1) */ mutex_unlock(&fs_info->balance_mutex); - pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); - ret = -EINVAL; + ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out; } @@ -3967,6 +4321,7 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, struct btrfs_trans_handle *trans; struct timespec ct = CURRENT_TIME; int ret = 0; + int received_uuid_changed; ret = mnt_want_write_file(file); if (ret < 0) @@ -3996,7 +4351,11 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, goto out; } - trans = btrfs_start_transaction(root, 1); + /* + * 1 - root item + * 2 - uuid items (received uuid + subvol uuid) + */ + trans = btrfs_start_transaction(root, 3); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; @@ -4007,24 +4366,42 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, sa->rtime.sec = ct.tv_sec; sa->rtime.nsec = ct.tv_nsec; + received_uuid_changed = memcmp(root_item->received_uuid, sa->uuid, + BTRFS_UUID_SIZE); + if (received_uuid_changed && + !btrfs_is_empty_uuid(root_item->received_uuid)) + btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root, + root_item->received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + root->root_key.objectid); memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE); btrfs_set_root_stransid(root_item, sa->stransid); btrfs_set_root_rtransid(root_item, sa->rtransid); - root_item->stime.sec = cpu_to_le64(sa->stime.sec); - root_item->stime.nsec = cpu_to_le32(sa->stime.nsec); - root_item->rtime.sec = cpu_to_le64(sa->rtime.sec); - root_item->rtime.nsec = cpu_to_le32(sa->rtime.nsec); + btrfs_set_stack_timespec_sec(&root_item->stime, sa->stime.sec); + btrfs_set_stack_timespec_nsec(&root_item->stime, sa->stime.nsec); + btrfs_set_stack_timespec_sec(&root_item->rtime, sa->rtime.sec); + btrfs_set_stack_timespec_nsec(&root_item->rtime, sa->rtime.nsec); ret = btrfs_update_root(trans, root->fs_info->tree_root, &root->root_key, &root->root_item); if (ret < 0) { btrfs_end_transaction(trans, root); - trans = NULL; goto out; - } else { - ret = btrfs_commit_transaction(trans, root); - if (ret < 0) + } + if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) { + ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root, + sa->uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + root->root_key.objectid); + if (ret < 0 && ret != -EEXIST) { + btrfs_abort_transaction(trans, root, ret); goto out; + } + } + ret = btrfs_commit_transaction(trans, root); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + goto out; } ret = copy_to_user(arg, sa, sizeof(*sa)); @@ -4041,18 +4418,22 @@ out: static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg) { struct btrfs_root *root = BTRFS_I(file_inode(file))->root; - const char *label = root->fs_info->super_copy->label; - size_t len = strnlen(label, BTRFS_LABEL_SIZE); + size_t len; int ret; + char label[BTRFS_LABEL_SIZE]; + + spin_lock(&root->fs_info->super_lock); + memcpy(label, root->fs_info->super_copy->label, BTRFS_LABEL_SIZE); + spin_unlock(&root->fs_info->super_lock); + + len = strnlen(label, BTRFS_LABEL_SIZE); if (len == BTRFS_LABEL_SIZE) { pr_warn("btrfs: label is too long, return the first %zu bytes\n", --len); } - mutex_lock(&root->fs_info->volume_mutex); ret = copy_to_user(arg, label, len); - mutex_unlock(&root->fs_info->volume_mutex); return ret ? -EFAULT : 0; } @@ -4081,18 +4462,18 @@ static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) if (ret) return ret; - mutex_lock(&root->fs_info->volume_mutex); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_unlock; } + spin_lock(&root->fs_info->super_lock); strcpy(super_block->label, label); + spin_unlock(&root->fs_info->super_lock); ret = btrfs_end_transaction(trans, root); out_unlock: - mutex_unlock(&root->fs_info->volume_mutex); mnt_drop_write_file(file); return ret; } @@ -4207,6 +4588,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_get_fslabel(file, argp); case BTRFS_IOC_SET_FSLABEL: return btrfs_ioctl_set_fslabel(file, argp); + case BTRFS_IOC_FILE_EXTENT_SAME: + return btrfs_ioctl_file_extent_same(file, argp); } return -ENOTTY; diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index f93151a98886..b6a6f07c5ce2 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -207,8 +207,10 @@ static int lzo_compress_pages(struct list_head *ws, } /* we're making it bigger, give up */ - if (tot_in > 8192 && tot_in < tot_out) + if (tot_in > 8192 && tot_in < tot_out) { + ret = -1; goto out; + } /* we're all done */ if (tot_in >= len) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 81369827e514..966b413a33b8 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -67,7 +67,7 @@ static void ordered_data_tree_panic(struct inode *inode, int errno, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset " - "%llu\n", (unsigned long long)offset); + "%llu\n", offset); } /* @@ -205,6 +205,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->bytes_left = len; entry->inode = igrab(inode); entry->compress_type = compress_type; + entry->truncated_len = (u64)-1; if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) set_bit(type, &entry->flags); @@ -336,14 +337,12 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, *file_offset = dec_end; if (dec_start > dec_end) { printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n", - (unsigned long long)dec_start, - (unsigned long long)dec_end); + dec_start, dec_end); } to_dec = dec_end - dec_start; if (to_dec > entry->bytes_left) { printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", - (unsigned long long)entry->bytes_left, - (unsigned long long)to_dec); + entry->bytes_left, to_dec); } entry->bytes_left -= to_dec; if (!uptodate) @@ -403,8 +402,7 @@ have_entry: if (io_size > entry->bytes_left) { printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", - (unsigned long long)entry->bytes_left, - (unsigned long long)io_size); + entry->bytes_left, io_size); } entry->bytes_left -= io_size; if (!uptodate) @@ -671,7 +669,7 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, INIT_LIST_HEAD(&splice); INIT_LIST_HEAD(&works); - mutex_lock(&root->fs_info->ordered_operations_mutex); + mutex_lock(&root->fs_info->ordered_extent_flush_mutex); spin_lock(&root->fs_info->ordered_root_lock); list_splice_init(&cur_trans->ordered_operations, &splice); while (!list_empty(&splice)) { @@ -718,7 +716,7 @@ out: list_del_init(&work->list); btrfs_wait_and_free_delalloc_work(work); } - mutex_unlock(&root->fs_info->ordered_operations_mutex); + mutex_unlock(&root->fs_info->ordered_extent_flush_mutex); return ret; } @@ -923,12 +921,16 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *test; int ret = 1; - if (ordered) + spin_lock_irq(&tree->lock); + if (ordered) { offset = entry_end(ordered); - else + if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags)) + offset = min(offset, + ordered->file_offset + + ordered->truncated_len); + } else { offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); - - spin_lock_irq(&tree->lock); + } disk_i_size = BTRFS_I(inode)->disk_i_size; /* truncate file */ diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 68844d59ee6f..d9a5aa097b4f 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -69,6 +69,7 @@ struct btrfs_ordered_sum { * the isize. */ #define BTRFS_ORDERED_LOGGED_CSUM 8 /* We've logged the csums on this ordered ordered extent */ +#define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */ struct btrfs_ordered_extent { /* logical offset in the file */ @@ -96,6 +97,12 @@ struct btrfs_ordered_extent { */ u64 outstanding_isize; + /* + * If we get truncated we need to adjust the file extent we enter for + * this ordered extent so that we do not expose stale data. + */ + u64 truncated_len; + /* flags (described above) */ unsigned long flags; diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index dc0024f17c1f..0088bedc8631 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -26,14 +26,12 @@ static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk) int i; printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu " "num_stripes %d\n", - (unsigned long long)btrfs_chunk_length(eb, chunk), - (unsigned long long)btrfs_chunk_owner(eb, chunk), - (unsigned long long)btrfs_chunk_type(eb, chunk), - num_stripes); + btrfs_chunk_length(eb, chunk), btrfs_chunk_owner(eb, chunk), + btrfs_chunk_type(eb, chunk), num_stripes); for (i = 0 ; i < num_stripes ; i++) { printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n", i, - (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i), - (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i)); + btrfs_stripe_devid_nr(eb, chunk, i), + btrfs_stripe_offset_nr(eb, chunk, i)); } } static void print_dev_item(struct extent_buffer *eb, @@ -41,18 +39,18 @@ static void print_dev_item(struct extent_buffer *eb, { printk(KERN_INFO "\t\tdev item devid %llu " "total_bytes %llu bytes used %llu\n", - (unsigned long long)btrfs_device_id(eb, dev_item), - (unsigned long long)btrfs_device_total_bytes(eb, dev_item), - (unsigned long long)btrfs_device_bytes_used(eb, dev_item)); + btrfs_device_id(eb, dev_item), + btrfs_device_total_bytes(eb, dev_item), + btrfs_device_bytes_used(eb, dev_item)); } static void print_extent_data_ref(struct extent_buffer *eb, struct btrfs_extent_data_ref *ref) { printk(KERN_INFO "\t\textent data backref root %llu " "objectid %llu offset %llu count %u\n", - (unsigned long long)btrfs_extent_data_ref_root(eb, ref), - (unsigned long long)btrfs_extent_data_ref_objectid(eb, ref), - (unsigned long long)btrfs_extent_data_ref_offset(eb, ref), + btrfs_extent_data_ref_root(eb, ref), + btrfs_extent_data_ref_objectid(eb, ref), + btrfs_extent_data_ref_offset(eb, ref), btrfs_extent_data_ref_count(eb, ref)); } @@ -87,19 +85,17 @@ static void print_extent_item(struct extent_buffer *eb, int slot) flags = btrfs_extent_flags(eb, ei); printk(KERN_INFO "\t\textent refs %llu gen %llu flags %llu\n", - (unsigned long long)btrfs_extent_refs(eb, ei), - (unsigned long long)btrfs_extent_generation(eb, ei), - (unsigned long long)flags); + btrfs_extent_refs(eb, ei), btrfs_extent_generation(eb, ei), + flags); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { struct btrfs_tree_block_info *info; info = (struct btrfs_tree_block_info *)(ei + 1); btrfs_tree_block_key(eb, info, &key); - printk(KERN_INFO "\t\ttree block key (%llu %x %llu) " + printk(KERN_INFO "\t\ttree block key (%llu %u %llu) " "level %d\n", - (unsigned long long)btrfs_disk_key_objectid(&key), - key.type, - (unsigned long long)btrfs_disk_key_offset(&key), + btrfs_disk_key_objectid(&key), key.type, + btrfs_disk_key_offset(&key), btrfs_tree_block_level(eb, info)); iref = (struct btrfs_extent_inline_ref *)(info + 1); } else { @@ -115,11 +111,11 @@ static void print_extent_item(struct extent_buffer *eb, int slot) switch (type) { case BTRFS_TREE_BLOCK_REF_KEY: printk(KERN_INFO "\t\ttree block backref " - "root %llu\n", (unsigned long long)offset); + "root %llu\n", offset); break; case BTRFS_SHARED_BLOCK_REF_KEY: printk(KERN_INFO "\t\tshared block backref " - "parent %llu\n", (unsigned long long)offset); + "parent %llu\n", offset); break; case BTRFS_EXTENT_DATA_REF_KEY: dref = (struct btrfs_extent_data_ref *)(&iref->offset); @@ -129,8 +125,7 @@ static void print_extent_item(struct extent_buffer *eb, int slot) sref = (struct btrfs_shared_data_ref *)(iref + 1); printk(KERN_INFO "\t\tshared data backref " "parent %llu count %u\n", - (unsigned long long)offset, - btrfs_shared_data_ref_count(eb, sref)); + offset, btrfs_shared_data_ref_count(eb, sref)); break; default: BUG(); @@ -148,13 +143,32 @@ static void print_extent_ref_v0(struct extent_buffer *eb, int slot) ref0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_ref_v0); printk("\t\textent back ref root %llu gen %llu " "owner %llu num_refs %lu\n", - (unsigned long long)btrfs_ref_root_v0(eb, ref0), - (unsigned long long)btrfs_ref_generation_v0(eb, ref0), - (unsigned long long)btrfs_ref_objectid_v0(eb, ref0), + btrfs_ref_root_v0(eb, ref0), + btrfs_ref_generation_v0(eb, ref0), + btrfs_ref_objectid_v0(eb, ref0), (unsigned long)btrfs_ref_count_v0(eb, ref0)); } #endif +static void print_uuid_item(struct extent_buffer *l, unsigned long offset, + u32 item_size) +{ + if (!IS_ALIGNED(item_size, sizeof(u64))) { + pr_warn("btrfs: uuid item with illegal size %lu!\n", + (unsigned long)item_size); + return; + } + while (item_size) { + __le64 subvol_id; + + read_extent_buffer(l, &subvol_id, offset, sizeof(subvol_id)); + printk(KERN_INFO "\t\tsubvol_id %llu\n", + (unsigned long long)le64_to_cpu(subvol_id)); + item_size -= sizeof(u64); + offset += sizeof(u64); + } +} + void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) { int i; @@ -177,39 +191,34 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) nr = btrfs_header_nritems(l); btrfs_info(root->fs_info, "leaf %llu total ptrs %d free space %d", - (unsigned long long)btrfs_header_bytenr(l), nr, - btrfs_leaf_free_space(root, l)); + btrfs_header_bytenr(l), nr, btrfs_leaf_free_space(root, l)); for (i = 0 ; i < nr ; i++) { item = btrfs_item_nr(l, i); btrfs_item_key_to_cpu(l, &key, i); type = btrfs_key_type(&key); - printk(KERN_INFO "\titem %d key (%llu %x %llu) itemoff %d " + printk(KERN_INFO "\titem %d key (%llu %u %llu) itemoff %d " "itemsize %d\n", - i, - (unsigned long long)key.objectid, type, - (unsigned long long)key.offset, + i, key.objectid, type, key.offset, btrfs_item_offset(l, item), btrfs_item_size(l, item)); switch (type) { case BTRFS_INODE_ITEM_KEY: ii = btrfs_item_ptr(l, i, struct btrfs_inode_item); printk(KERN_INFO "\t\tinode generation %llu size %llu " "mode %o\n", - (unsigned long long) btrfs_inode_generation(l, ii), - (unsigned long long)btrfs_inode_size(l, ii), + btrfs_inode_size(l, ii), btrfs_inode_mode(l, ii)); break; case BTRFS_DIR_ITEM_KEY: di = btrfs_item_ptr(l, i, struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(l, di, &found_key); printk(KERN_INFO "\t\tdir oid %llu type %u\n", - (unsigned long long)found_key.objectid, + found_key.objectid, btrfs_dir_type(l, di)); break; case BTRFS_ROOT_ITEM_KEY: ri = btrfs_item_ptr(l, i, struct btrfs_root_item); printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n", - (unsigned long long) btrfs_disk_root_bytenr(l, ri), btrfs_disk_root_refs(l, ri)); break; @@ -245,17 +254,12 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) } printk(KERN_INFO "\t\textent data disk bytenr %llu " "nr %llu\n", - (unsigned long long) btrfs_file_extent_disk_bytenr(l, fi), - (unsigned long long) btrfs_file_extent_disk_num_bytes(l, fi)); printk(KERN_INFO "\t\textent data offset %llu " "nr %llu ram %llu\n", - (unsigned long long) btrfs_file_extent_offset(l, fi), - (unsigned long long) btrfs_file_extent_num_bytes(l, fi), - (unsigned long long) btrfs_file_extent_ram_bytes(l, fi)); break; case BTRFS_EXTENT_REF_V0_KEY: @@ -269,7 +273,6 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) bi = btrfs_item_ptr(l, i, struct btrfs_block_group_item); printk(KERN_INFO "\t\tblock group used %llu\n", - (unsigned long long) btrfs_disk_block_group_used(l, bi)); break; case BTRFS_CHUNK_ITEM_KEY: @@ -286,13 +289,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n" "\t\tchunk objectid %llu chunk offset %llu " "length %llu\n", - (unsigned long long) btrfs_dev_extent_chunk_tree(l, dev_extent), - (unsigned long long) btrfs_dev_extent_chunk_objectid(l, dev_extent), - (unsigned long long) btrfs_dev_extent_chunk_offset(l, dev_extent), - (unsigned long long) btrfs_dev_extent_length(l, dev_extent)); break; case BTRFS_DEV_STATS_KEY: @@ -301,6 +300,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) case BTRFS_DEV_REPLACE_KEY: printk(KERN_INFO "\t\tdev replace\n"); break; + case BTRFS_UUID_KEY_SUBVOL: + case BTRFS_UUID_KEY_RECEIVED_SUBVOL: + print_uuid_item(l, btrfs_item_ptr_offset(l, i), + btrfs_item_size_nr(l, i)); + break; }; } } @@ -320,16 +324,13 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) return; } btrfs_info(root->fs_info, "node %llu level %d total ptrs %d free spc %u", - (unsigned long long)btrfs_header_bytenr(c), - level, nr, (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); + btrfs_header_bytenr(c), level, nr, + (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); for (i = 0; i < nr; i++) { btrfs_node_key_to_cpu(c, &key, i); printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n", - i, - (unsigned long long)key.objectid, - key.type, - (unsigned long long)key.offset, - (unsigned long long)btrfs_node_blockptr(c, i)); + i, key.objectid, key.type, key.offset, + btrfs_node_blockptr(c, i)); } for (i = 0; i < nr; i++) { struct extent_buffer *next = read_tree_block(root, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 1280eff8af56..4e6ef490619e 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -157,18 +157,11 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, return qgroup; } -/* must be called with qgroup_lock held */ -static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) +static void __del_qgroup_rb(struct btrfs_qgroup *qgroup) { - struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid); struct btrfs_qgroup_list *list; - if (!qgroup) - return -ENOENT; - - rb_erase(&qgroup->node, &fs_info->qgroup_tree); list_del(&qgroup->dirty); - while (!list_empty(&qgroup->groups)) { list = list_first_entry(&qgroup->groups, struct btrfs_qgroup_list, next_group); @@ -185,7 +178,18 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) kfree(list); } kfree(qgroup); +} +/* must be called with qgroup_lock held */ +static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) +{ + struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid); + + if (!qgroup) + return -ENOENT; + + rb_erase(&qgroup->node, &fs_info->qgroup_tree); + __del_qgroup_rb(qgroup); return 0; } @@ -394,8 +398,7 @@ next1: if (ret == -ENOENT) { printk(KERN_WARNING "btrfs: orphan qgroup relation 0x%llx->0x%llx\n", - (unsigned long long)found_key.objectid, - (unsigned long long)found_key.offset); + found_key.objectid, found_key.offset); ret = 0; /* ignore the error */ } if (ret) @@ -428,39 +431,28 @@ out: } /* - * This is only called from close_ctree() or open_ctree(), both in single- - * treaded paths. Clean up the in-memory structures. No locking needed. + * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(), + * first two are in single-threaded paths.And for the third one, we have set + * quota_root to be null with qgroup_lock held before, so it is safe to clean + * up the in-memory structures without qgroup_lock held. */ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) { struct rb_node *n; struct btrfs_qgroup *qgroup; - struct btrfs_qgroup_list *list; while ((n = rb_first(&fs_info->qgroup_tree))) { qgroup = rb_entry(n, struct btrfs_qgroup, node); rb_erase(n, &fs_info->qgroup_tree); - - while (!list_empty(&qgroup->groups)) { - list = list_first_entry(&qgroup->groups, - struct btrfs_qgroup_list, - next_group); - list_del(&list->next_group); - list_del(&list->next_member); - kfree(list); - } - - while (!list_empty(&qgroup->members)) { - list = list_first_entry(&qgroup->members, - struct btrfs_qgroup_list, - next_member); - list_del(&list->next_group); - list_del(&list->next_member); - kfree(list); - } - kfree(qgroup); + __del_qgroup_rb(qgroup); } + /* + * we call btrfs_free_qgroup_config() when umounting + * filesystem and disabling quota, so we set qgroup_ulit + * to be null here to avoid double free. + */ ulist_free(fs_info->qgroup_ulist); + fs_info->qgroup_ulist = NULL; } static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, @@ -946,13 +938,9 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, fs_info->pending_quota_state = 0; quota_root = fs_info->quota_root; fs_info->quota_root = NULL; - btrfs_free_qgroup_config(fs_info); spin_unlock(&fs_info->qgroup_lock); - if (!quota_root) { - ret = -EINVAL; - goto out; - } + btrfs_free_qgroup_config(fs_info); ret = btrfs_clean_quota_tree(trans, quota_root); if (ret) @@ -1174,7 +1162,7 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, if (ret) { fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; printk(KERN_INFO "unable to update quota limit for %llu\n", - (unsigned long long)qgroupid); + qgroupid); } spin_lock(&fs_info->qgroup_lock); @@ -1884,10 +1872,9 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, path, 1, 0); pr_debug("current progress key (%llu %u %llu), search_slot ret %d\n", - (unsigned long long)fs_info->qgroup_rescan_progress.objectid, + fs_info->qgroup_rescan_progress.objectid, fs_info->qgroup_rescan_progress.type, - (unsigned long long)fs_info->qgroup_rescan_progress.offset, - ret); + fs_info->qgroup_rescan_progress.offset, ret); if (ret) { /* diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 0525e1389f5b..d0ecfbd9cc9f 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1540,8 +1540,10 @@ static int full_stripe_write(struct btrfs_raid_bio *rbio) int ret; ret = alloc_rbio_parity_pages(rbio); - if (ret) + if (ret) { + __free_raid_bio(rbio); return ret; + } ret = lock_stripe_add(rbio); if (ret == 0) @@ -1687,11 +1689,8 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio, struct blk_plug_cb *cb; rbio = alloc_rbio(root, bbio, raid_map, stripe_len); - if (IS_ERR(rbio)) { - kfree(raid_map); - kfree(bbio); + if (IS_ERR(rbio)) return PTR_ERR(rbio); - } bio_list_add(&rbio->bio_list, bio); rbio->bio_list_bytes = bio->bi_size; @@ -2041,9 +2040,8 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, int ret; rbio = alloc_rbio(root, bbio, raid_map, stripe_len); - if (IS_ERR(rbio)) { + if (IS_ERR(rbio)) return PTR_ERR(rbio); - } rbio->read_rebuild = 1; bio_list_add(&rbio->bio_list, bio); @@ -2052,6 +2050,8 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { BUG(); + kfree(raid_map); + kfree(bbio); kfree(rbio); return -EIO; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 12096496cc99..aacc2121e87c 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -335,7 +335,7 @@ static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr) if (bnode->root) fs_info = bnode->root->fs_info; btrfs_panic(fs_info, errno, "Inconsistency in backref cache " - "found at offset %llu\n", (unsigned long long)bytenr); + "found at offset %llu\n", bytenr); } /* @@ -641,6 +641,11 @@ int find_inline_backref(struct extent_buffer *leaf, int slot, WARN_ON(item_size < sizeof(*ei) + sizeof(*bi)); return 1; } + if (key.type == BTRFS_METADATA_ITEM_KEY && + item_size <= sizeof(*ei)) { + WARN_ON(item_size < sizeof(*ei)); + return 1; + } if (key.type == BTRFS_EXTENT_ITEM_KEY) { bi = (struct btrfs_tree_block_info *)(ei + 1); @@ -691,6 +696,7 @@ struct backref_node *build_backref_tree(struct reloc_control *rc, int cowonly; int ret; int err = 0; + bool need_check = true; path1 = btrfs_alloc_path(); path2 = btrfs_alloc_path(); @@ -914,6 +920,7 @@ again: cur->bytenr); lower = cur; + need_check = true; for (; level < BTRFS_MAX_LEVEL; level++) { if (!path2->nodes[level]) { BUG_ON(btrfs_root_bytenr(&root->root_item) != @@ -957,14 +964,12 @@ again: /* * add the block to pending list if we - * need check its backrefs. only block - * at 'cur->level + 1' is added to the - * tail of pending list. this guarantees - * we check backrefs from lower level - * blocks to upper level blocks. + * need check its backrefs, we only do this once + * while walking up a tree as we will catch + * anything else later on. */ - if (!upper->checked && - level == cur->level + 1) { + if (!upper->checked && need_check) { + need_check = false; list_add_tail(&edge->list[UPPER], &list); } else @@ -2314,8 +2319,13 @@ again: BUG_ON(root->reloc_root != reloc_root); ret = merge_reloc_root(rc, root); - if (ret) + if (ret) { + __update_reloc_root(reloc_root, 1); + free_extent_buffer(reloc_root->node); + free_extent_buffer(reloc_root->commit_root); + kfree(reloc_root); goto out; + } } else { list_del_init(&reloc_root->root_list); } @@ -2344,9 +2354,6 @@ again: if (IS_ERR(root)) continue; - if (btrfs_root_refs(&root->root_item) == 0) - continue; - trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); @@ -3628,7 +3635,7 @@ int add_data_references(struct reloc_control *rc, unsigned long ptr; unsigned long end; u32 blocksize = btrfs_level_size(rc->extent_root, 0); - int ret; + int ret = 0; int err = 0; eb = path->nodes[0]; @@ -3655,6 +3662,10 @@ int add_data_references(struct reloc_control *rc, } else { BUG(); } + if (ret) { + err = ret; + goto out; + } ptr += btrfs_extent_inline_ref_size(key.type); } WARN_ON(ptr > end); @@ -3700,6 +3711,7 @@ int add_data_references(struct reloc_control *rc, } path->slots[0]++; } +out: btrfs_release_path(path); if (err) free_block_list(blocks); @@ -4219,8 +4231,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) } printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n", - (unsigned long long)rc->block_group->key.objectid, - (unsigned long long)rc->block_group->flags); + rc->block_group->key.objectid, rc->block_group->flags); ret = btrfs_start_all_delalloc_inodes(fs_info, 0); if (ret < 0) { @@ -4242,7 +4253,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) break; printk(KERN_INFO "btrfs: found %llu extents\n", - (unsigned long long)rc->extents_found); + rc->extents_found); if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index ffb1036ef10d..0b1f4ef8db98 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -29,8 +29,8 @@ * generation numbers as then we know the root was once mounted with an older * kernel that was not aware of the root item structure change. */ -void btrfs_read_root_item(struct extent_buffer *eb, int slot, - struct btrfs_root_item *item) +static void btrfs_read_root_item(struct extent_buffer *eb, int slot, + struct btrfs_root_item *item) { uuid_le uuid; int len; @@ -155,8 +155,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root if (ret != 0) { btrfs_print_leaf(root, path->nodes[0]); printk(KERN_CRIT "unable to update root key %llu %u %llu\n", - (unsigned long long)key->objectid, key->type, - (unsigned long long)key->offset); + key->objectid, key->type, key->offset); BUG_ON(1); } @@ -490,13 +489,13 @@ again: */ void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item) { - u64 inode_flags = le64_to_cpu(root_item->inode.flags); + u64 inode_flags = btrfs_stack_inode_flags(&root_item->inode); if (!(inode_flags & BTRFS_INODE_ROOT_ITEM_INIT)) { inode_flags |= BTRFS_INODE_ROOT_ITEM_INIT; - root_item->inode.flags = cpu_to_le64(inode_flags); - root_item->flags = 0; - root_item->byte_limit = 0; + btrfs_set_stack_inode_flags(&root_item->inode, inode_flags); + btrfs_set_root_flags(root_item, 0); + btrfs_set_root_limit(root_item, 0); } } @@ -507,8 +506,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct timespec ct = CURRENT_TIME; spin_lock(&root->root_item_lock); - item->ctransid = cpu_to_le64(trans->transid); - item->ctime.sec = cpu_to_le64(ct.tv_sec); - item->ctime.nsec = cpu_to_le32(ct.tv_nsec); + btrfs_set_root_ctransid(item, trans->transid); + btrfs_set_stack_timespec_sec(&item->ctime, ct.tv_sec); + btrfs_set_stack_timespec_nsec(&item->ctime, ct.tv_nsec); spin_unlock(&root->root_item_lock); } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 64a157becbe5..0afcd452fcb3 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -754,8 +754,7 @@ out: num_uncorrectable_read_errors); printk_ratelimited_in_rcu(KERN_ERR "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n", - (unsigned long long)fixup->logical, - rcu_str_deref(fixup->dev->name)); + fixup->logical, rcu_str_deref(fixup->dev->name)); } btrfs_free_path(path); @@ -1154,8 +1153,7 @@ corrected_error: spin_unlock(&sctx->stat_lock); printk_ratelimited_in_rcu(KERN_ERR "btrfs: fixed up error at logical %llu on dev %s\n", - (unsigned long long)logical, - rcu_str_deref(dev->name)); + logical, rcu_str_deref(dev->name)); } } else { did_not_correct_error: @@ -1164,8 +1162,7 @@ did_not_correct_error: spin_unlock(&sctx->stat_lock); printk_ratelimited_in_rcu(KERN_ERR "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n", - (unsigned long long)logical, - rcu_str_deref(dev->name)); + logical, rcu_str_deref(dev->name)); } out: @@ -1345,12 +1342,12 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, mapped_buffer = kmap_atomic(sblock->pagev[0]->page); h = (struct btrfs_header *)mapped_buffer; - if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) || + if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) || memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) { sblock->header_error = 1; - } else if (generation != le64_to_cpu(h->generation)) { + } else if (generation != btrfs_stack_header_generation(h)) { sblock->header_error = 1; sblock->generation_error = 1; } @@ -1720,10 +1717,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) * b) the page is already kmapped */ - if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr)) + if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h)) ++fail; - if (sblock->pagev[0]->generation != le64_to_cpu(h->generation)) + if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) ++fail; if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) @@ -1786,10 +1783,10 @@ static int scrub_checksum_super(struct scrub_block *sblock) s = (struct btrfs_super_block *)mapped_buffer; memcpy(on_disk_csum, s->csum, sctx->csum_size); - if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr)) + if (sblock->pagev[0]->logical != btrfs_super_bytenr(s)) ++fail_cor; - if (sblock->pagev[0]->generation != le64_to_cpu(s->generation)) + if (sblock->pagev[0]->generation != btrfs_super_generation(s)) ++fail_gen; if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) @@ -2455,8 +2452,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, printk(KERN_ERR "btrfs scrub: tree block %llu spanning " "stripes, ignored. logical=%llu\n", - (unsigned long long)key.objectid, - (unsigned long long)logical); + key.objectid, logical); goto next; } @@ -2863,9 +2859,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (fs_info->chunk_root->sectorsize != PAGE_SIZE) { /* not supported for data w/o checksums */ printk(KERN_ERR - "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n", - fs_info->chunk_root->sectorsize, - (unsigned long long)PAGE_SIZE); + "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails\n", + fs_info->chunk_root->sectorsize, PAGE_SIZE); return -EINVAL; } @@ -3175,11 +3170,9 @@ static void copy_nocow_pages_worker(struct btrfs_work *work) copy_nocow_pages_for_inode, nocow_ctx); if (ret != 0 && ret != -ENOENT) { - pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n", - (unsigned long long)logical, - (unsigned long long)physical_for_dev_replace, - (unsigned long long)len, - (unsigned long long)mirror_num, ret); + pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d\n", + logical, physical_for_dev_replace, len, mirror_num, + ret); not_written = 1; goto out; } @@ -3224,11 +3217,6 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) return PTR_ERR(local_root); } - if (btrfs_root_refs(&local_root->root_item) == 0) { - srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); - return -ENOENT; - } - key.type = BTRFS_INODE_ITEM_KEY; key.objectid = inum; key.offset = 0; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 2e14fd89a8b4..e46e0ed74925 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -26,6 +26,7 @@ #include <linux/radix-tree.h> #include <linux/crc32c.h> #include <linux/vmalloc.h> +#include <linux/string.h> #include "send.h" #include "backref.h" @@ -54,8 +55,8 @@ struct fs_path { char *buf; int buf_len; - int reversed:1; - int virtual_mem:1; + unsigned int reversed:1; + unsigned int virtual_mem:1; char inline_buf[]; }; char pad[PAGE_SIZE]; @@ -1668,6 +1669,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, u64 *who_ino, u64 *who_gen) { int ret = 0; + u64 gen; u64 other_inode = 0; u8 other_type = 0; @@ -1678,6 +1680,24 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, if (ret <= 0) goto out; + /* + * If we have a parent root we need to verify that the parent dir was + * not delted and then re-created, if it was then we have no overwrite + * and we can just unlink this entry. + */ + if (sctx->parent_root) { + ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, + NULL, NULL, NULL); + if (ret < 0 && ret != -ENOENT) + goto out; + if (ret) { + ret = 0; + goto out; + } + if (gen != dir_gen) + goto out; + } + ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len, &other_inode, &other_type); if (ret < 0 && ret != -ENOENT) @@ -2519,7 +2539,8 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(eb, di, &di_key); - if (di_key.objectid < sctx->send_progress) { + if (di_key.type != BTRFS_ROOT_ITEM_KEY && + di_key.objectid < sctx->send_progress) { ret = 1; goto out; } @@ -2581,7 +2602,6 @@ static int record_ref(struct list_head *head, u64 dir, u64 dir_gen, struct fs_path *path) { struct recorded_ref *ref; - char *tmp; ref = kmalloc(sizeof(*ref), GFP_NOFS); if (!ref) @@ -2591,25 +2611,35 @@ static int record_ref(struct list_head *head, u64 dir, ref->dir_gen = dir_gen; ref->full_path = path; - tmp = strrchr(ref->full_path->start, '/'); - if (!tmp) { - ref->name_len = ref->full_path->end - ref->full_path->start; - ref->name = ref->full_path->start; + ref->name = (char *)kbasename(ref->full_path->start); + ref->name_len = ref->full_path->end - ref->name; + ref->dir_path = ref->full_path->start; + if (ref->name == ref->full_path->start) ref->dir_path_len = 0; - ref->dir_path = ref->full_path->start; - } else { - tmp++; - ref->name_len = ref->full_path->end - tmp; - ref->name = tmp; - ref->dir_path = ref->full_path->start; + else ref->dir_path_len = ref->full_path->end - ref->full_path->start - 1 - ref->name_len; - } list_add_tail(&ref->list, head); return 0; } +static int dup_ref(struct recorded_ref *ref, struct list_head *list) +{ + struct recorded_ref *new; + + new = kmalloc(sizeof(*ref), GFP_NOFS); + if (!new) + return -ENOMEM; + + new->dir = ref->dir; + new->dir_gen = ref->dir_gen; + new->full_path = NULL; + INIT_LIST_HEAD(&new->list); + list_add_tail(&new->list, list); + return 0; +} + static void __free_recorded_refs(struct list_head *head) { struct recorded_ref *cur; @@ -2724,9 +2754,7 @@ static int process_recorded_refs(struct send_ctx *sctx) int ret = 0; struct recorded_ref *cur; struct recorded_ref *cur2; - struct ulist *check_dirs = NULL; - struct ulist_iterator uit; - struct ulist_node *un; + struct list_head check_dirs; struct fs_path *valid_path = NULL; u64 ow_inode = 0; u64 ow_gen; @@ -2740,6 +2768,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); * which is always '..' */ BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID); + INIT_LIST_HEAD(&check_dirs); valid_path = fs_path_alloc(); if (!valid_path) { @@ -2747,12 +2776,6 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); goto out; } - check_dirs = ulist_alloc(GFP_NOFS); - if (!check_dirs) { - ret = -ENOMEM; - goto out; - } - /* * First, check if the first ref of the current inode was overwritten * before. If yes, we know that the current inode was already orphanized @@ -2889,8 +2912,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); goto out; } } - ret = ulist_add(check_dirs, cur->dir, cur->dir_gen, - GFP_NOFS); + ret = dup_ref(cur, &check_dirs); if (ret < 0) goto out; } @@ -2918,8 +2940,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); } list_for_each_entry(cur, &sctx->deleted_refs, list) { - ret = ulist_add(check_dirs, cur->dir, cur->dir_gen, - GFP_NOFS); + ret = dup_ref(cur, &check_dirs); if (ret < 0) goto out; } @@ -2930,8 +2951,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); */ cur = list_entry(sctx->deleted_refs.next, struct recorded_ref, list); - ret = ulist_add(check_dirs, cur->dir, cur->dir_gen, - GFP_NOFS); + ret = dup_ref(cur, &check_dirs); if (ret < 0) goto out; } else if (!S_ISDIR(sctx->cur_inode_mode)) { @@ -2951,12 +2971,10 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); if (ret < 0) goto out; } - ret = ulist_add(check_dirs, cur->dir, cur->dir_gen, - GFP_NOFS); + ret = dup_ref(cur, &check_dirs); if (ret < 0) goto out; } - /* * If the inode is still orphan, unlink the orphan. This may * happen when a previous inode did overwrite the first ref @@ -2978,33 +2996,32 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); * deletion and if it's finally possible to perform the rmdir now. * We also update the inode stats of the parent dirs here. */ - ULIST_ITER_INIT(&uit); - while ((un = ulist_next(check_dirs, &uit))) { + list_for_each_entry(cur, &check_dirs, list) { /* * In case we had refs into dirs that were not processed yet, * we don't need to do the utime and rmdir logic for these dirs. * The dir will be processed later. */ - if (un->val > sctx->cur_ino) + if (cur->dir > sctx->cur_ino) continue; - ret = get_cur_inode_state(sctx, un->val, un->aux); + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; if (ret == inode_state_did_create || ret == inode_state_no_change) { /* TODO delayed utimes */ - ret = send_utimes(sctx, un->val, un->aux); + ret = send_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; } else if (ret == inode_state_did_delete) { - ret = can_rmdir(sctx, un->val, sctx->cur_ino); + ret = can_rmdir(sctx, cur->dir, sctx->cur_ino); if (ret < 0) goto out; if (ret) { - ret = get_cur_path(sctx, un->val, un->aux, - valid_path); + ret = get_cur_path(sctx, cur->dir, + cur->dir_gen, valid_path); if (ret < 0) goto out; ret = send_rmdir(sctx, valid_path); @@ -3017,8 +3034,8 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); ret = 0; out: + __free_recorded_refs(&check_dirs); free_recorded_refs(sctx); - ulist_free(check_dirs); fs_path_free(valid_path); return ret; } @@ -3119,6 +3136,8 @@ out: struct find_ref_ctx { u64 dir; + u64 dir_gen; + struct btrfs_root *root; struct fs_path *name; int found_idx; }; @@ -3128,9 +3147,21 @@ static int __find_iref(int num, u64 dir, int index, void *ctx_) { struct find_ref_ctx *ctx = ctx_; + u64 dir_gen; + int ret; if (dir == ctx->dir && fs_path_len(name) == fs_path_len(ctx->name) && strncmp(name->start, ctx->name->start, fs_path_len(name)) == 0) { + /* + * To avoid doing extra lookups we'll only do this if everything + * else matches. + */ + ret = get_inode_info(ctx->root, dir, NULL, &dir_gen, NULL, + NULL, NULL, NULL); + if (ret) + return ret; + if (dir_gen != ctx->dir_gen) + return 0; ctx->found_idx = num; return 1; } @@ -3140,14 +3171,16 @@ static int __find_iref(int num, u64 dir, int index, static int find_iref(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, - u64 dir, struct fs_path *name) + u64 dir, u64 dir_gen, struct fs_path *name) { int ret; struct find_ref_ctx ctx; ctx.dir = dir; ctx.name = name; + ctx.dir_gen = dir_gen; ctx.found_idx = -1; + ctx.root = root; ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx); if (ret < 0) @@ -3163,11 +3196,17 @@ static int __record_changed_new_ref(int num, u64 dir, int index, struct fs_path *name, void *ctx) { + u64 dir_gen; int ret; struct send_ctx *sctx = ctx; + ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL, + NULL, NULL, NULL); + if (ret) + return ret; + ret = find_iref(sctx->parent_root, sctx->right_path, - sctx->cmp_key, dir, name); + sctx->cmp_key, dir, dir_gen, name); if (ret == -ENOENT) ret = __record_new_ref(num, dir, index, name, sctx); else if (ret > 0) @@ -3180,11 +3219,17 @@ static int __record_changed_deleted_ref(int num, u64 dir, int index, struct fs_path *name, void *ctx) { + u64 dir_gen; int ret; struct send_ctx *sctx = ctx; + ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL, + NULL, NULL, NULL); + if (ret) + return ret; + ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key, - dir, name); + dir, dir_gen, name); if (ret == -ENOENT) ret = __record_deleted_ref(num, dir, index, name, sctx); else if (ret > 0) @@ -3869,7 +3914,8 @@ static int is_extent_unchanged(struct send_ctx *sctx, btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || found_key.type != key.type) { - ret = 0; + /* If we're a hole then just pretend nothing changed */ + ret = (left_disknr) ? 0 : 1; goto out; } @@ -3895,7 +3941,8 @@ static int is_extent_unchanged(struct send_ctx *sctx, * This may only happen on the first iteration. */ if (found_key.offset + right_len <= ekey->offset) { - ret = 0; + /* If we're a hole just pretend nothing changed */ + ret = (left_disknr) ? 0 : 1; goto out; } @@ -3960,8 +4007,8 @@ static int process_extent(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key) { - int ret = 0; struct clone_root *found_clone = NULL; + int ret = 0; if (S_ISLNK(sctx->cur_inode_mode)) return 0; @@ -3974,6 +4021,32 @@ static int process_extent(struct send_ctx *sctx, ret = 0; goto out; } + } else { + struct btrfs_file_extent_item *ei; + u8 type; + + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + type = btrfs_file_extent_type(path->nodes[0], ei); + if (type == BTRFS_FILE_EXTENT_PREALLOC || + type == BTRFS_FILE_EXTENT_REG) { + /* + * The send spec does not have a prealloc command yet, + * so just leave a hole for prealloc'ed extents until + * we have enough commands queued up to justify rev'ing + * the send spec. + */ + if (type == BTRFS_FILE_EXTENT_PREALLOC) { + ret = 0; + goto out; + } + + /* Have a hole, just skip it. */ + if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) { + ret = 0; + goto out; + } + } } ret = find_extent_clone(sctx, path, key->objectid, key->offset, @@ -4361,6 +4434,64 @@ static int changed_extent(struct send_ctx *sctx, return ret; } +static int dir_changed(struct send_ctx *sctx, u64 dir) +{ + u64 orig_gen, new_gen; + int ret; + + ret = get_inode_info(sctx->send_root, dir, NULL, &new_gen, NULL, NULL, + NULL, NULL); + if (ret) + return ret; + + ret = get_inode_info(sctx->parent_root, dir, NULL, &orig_gen, NULL, + NULL, NULL, NULL); + if (ret) + return ret; + + return (orig_gen != new_gen) ? 1 : 0; +} + +static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path, + struct btrfs_key *key) +{ + struct btrfs_inode_extref *extref; + struct extent_buffer *leaf; + u64 dirid = 0, last_dirid = 0; + unsigned long ptr; + u32 item_size; + u32 cur_offset = 0; + int ref_name_len; + int ret = 0; + + /* Easy case, just check this one dirid */ + if (key->type == BTRFS_INODE_REF_KEY) { + dirid = key->offset; + + ret = dir_changed(sctx, dirid); + goto out; + } + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + while (cur_offset < item_size) { + extref = (struct btrfs_inode_extref *)(ptr + + cur_offset); + dirid = btrfs_inode_extref_parent(leaf, extref); + ref_name_len = btrfs_inode_extref_name_len(leaf, extref); + cur_offset += ref_name_len + sizeof(*extref); + if (dirid == last_dirid) + continue; + ret = dir_changed(sctx, dirid); + if (ret) + break; + last_dirid = dirid; + } +out: + return ret; +} + /* * Updates compare related fields in sctx and simply forwards to the actual * changed_xxx functions. @@ -4376,6 +4507,19 @@ static int changed_cb(struct btrfs_root *left_root, int ret = 0; struct send_ctx *sctx = ctx; + if (result == BTRFS_COMPARE_TREE_SAME) { + if (key->type != BTRFS_INODE_REF_KEY && + key->type != BTRFS_INODE_EXTREF_KEY) + return 0; + ret = compare_refs(sctx, left_path, key); + if (!ret) + return 0; + if (ret < 0) + return ret; + result = BTRFS_COMPARE_TREE_CHANGED; + ret = 0; + } + sctx->left_path = left_path; sctx->right_path = right_path; sctx->cmp_key = key; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 8eb6191d86da..3aab10ce63e8 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -56,6 +56,8 @@ #include "rcu-string.h" #include "dev-replace.h" #include "free-space-cache.h" +#include "backref.h" +#include "tests/btrfs-tests.h" #define CREATE_TRACE_POINTS #include <trace/events/btrfs.h> @@ -320,14 +322,15 @@ enum { Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_skip_balance, Opt_check_integrity, Opt_check_integrity_including_extent_data, - Opt_check_integrity_print_mask, Opt_fatal_errors, + Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, + Opt_commit_interval, Opt_err, }; static match_table_t tokens = { {Opt_degraded, "degraded"}, {Opt_subvol, "subvol=%s"}, - {Opt_subvolid, "subvolid=%d"}, + {Opt_subvolid, "subvolid=%s"}, {Opt_device, "device=%s"}, {Opt_nodatasum, "nodatasum"}, {Opt_nodatacow, "nodatacow"}, @@ -360,7 +363,9 @@ static match_table_t tokens = { {Opt_check_integrity, "check_int"}, {Opt_check_integrity_including_extent_data, "check_int_data"}, {Opt_check_integrity_print_mask, "check_int_print_mask=%d"}, + {Opt_rescan_uuid_tree, "rescan_uuid_tree"}, {Opt_fatal_errors, "fatal_errors=%s"}, + {Opt_commit_interval, "commit=%d"}, {Opt_err, NULL}, }; @@ -496,10 +501,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, NOBARRIER); break; case Opt_thread_pool: - intarg = 0; - match_int(&args[0], &intarg); - if (intarg) + ret = match_int(&args[0], &intarg); + if (ret) { + goto out; + } else if (intarg > 0) { info->thread_pool_size = intarg; + } else { + ret = -EINVAL; + goto out; + } break; case Opt_max_inline: num = match_strdup(&args[0]); @@ -513,7 +523,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) root->sectorsize); } printk(KERN_INFO "btrfs: max_inline at %llu\n", - (unsigned long long)info->max_inline); + info->max_inline); + } else { + ret = -ENOMEM; + goto out; } break; case Opt_alloc_start: @@ -525,7 +538,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) kfree(num); printk(KERN_INFO "btrfs: allocations start at %llu\n", - (unsigned long long)info->alloc_start); + info->alloc_start); + } else { + ret = -ENOMEM; + goto out; } break; case Opt_noacl: @@ -540,12 +556,16 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT); break; case Opt_ratio: - intarg = 0; - match_int(&args[0], &intarg); - if (intarg) { + ret = match_int(&args[0], &intarg); + if (ret) { + goto out; + } else if (intarg >= 0) { info->metadata_ratio = intarg; printk(KERN_INFO "btrfs: metadata ratio %d\n", info->metadata_ratio); + } else { + ret = -EINVAL; + goto out; } break; case Opt_discard: @@ -554,6 +574,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_space_cache: btrfs_set_opt(info->mount_opt, SPACE_CACHE); break; + case Opt_rescan_uuid_tree: + btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); + break; case Opt_no_space_cache: printk(KERN_INFO "btrfs: disabling disk space caching\n"); btrfs_clear_opt(info->mount_opt, SPACE_CACHE); @@ -596,13 +619,17 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); break; case Opt_check_integrity_print_mask: - intarg = 0; - match_int(&args[0], &intarg); - if (intarg) { + ret = match_int(&args[0], &intarg); + if (ret) { + goto out; + } else if (intarg >= 0) { info->check_integrity_print_mask = intarg; printk(KERN_INFO "btrfs:" " check_integrity_print_mask 0x%x\n", info->check_integrity_print_mask); + } else { + ret = -EINVAL; + goto out; } break; #else @@ -626,6 +653,29 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) goto out; } break; + case Opt_commit_interval: + intarg = 0; + ret = match_int(&args[0], &intarg); + if (ret < 0) { + printk(KERN_ERR + "btrfs: invalid commit interval\n"); + ret = -EINVAL; + goto out; + } + if (intarg > 0) { + if (intarg > 300) { + printk(KERN_WARNING + "btrfs: excessive commit interval %d\n", + intarg); + } + info->commit_interval = intarg; + } else { + printk(KERN_INFO + "btrfs: using default commit interval %ds\n", + BTRFS_DEFAULT_COMMIT_INTERVAL); + info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; + } + break; case Opt_err: printk(KERN_INFO "btrfs: unrecognized mount option " "'%s'\n", p); @@ -654,8 +704,8 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, { substring_t args[MAX_OPT_ARGS]; char *device_name, *opts, *orig, *p; + char *num = NULL; int error = 0; - int intarg; if (!options) return 0; @@ -679,17 +729,23 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, case Opt_subvol: kfree(*subvol_name); *subvol_name = match_strdup(&args[0]); + if (!*subvol_name) { + error = -ENOMEM; + goto out; + } break; case Opt_subvolid: - intarg = 0; - error = match_int(&args[0], &intarg); - if (!error) { + num = match_strdup(&args[0]); + if (num) { + *subvol_objectid = memparse(num, NULL); + kfree(num); /* we want the original fs_tree */ - if (!intarg) + if (!*subvol_objectid) *subvol_objectid = BTRFS_FS_TREE_OBJECTID; - else - *subvol_objectid = intarg; + } else { + error = -EINVAL; + goto out; } break; case Opt_subvolrootid: @@ -892,11 +948,9 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) if (btrfs_test_opt(root, NOBARRIER)) seq_puts(seq, ",nobarrier"); if (info->max_inline != 8192 * 1024) - seq_printf(seq, ",max_inline=%llu", - (unsigned long long)info->max_inline); + seq_printf(seq, ",max_inline=%llu", info->max_inline); if (info->alloc_start != 0) - seq_printf(seq, ",alloc_start=%llu", - (unsigned long long)info->alloc_start); + seq_printf(seq, ",alloc_start=%llu", info->alloc_start); if (info->thread_pool_size != min_t(unsigned long, num_online_cpus() + 2, 8)) seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); @@ -928,6 +982,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",space_cache"); else seq_puts(seq, ",nospace_cache"); + if (btrfs_test_opt(root, RESCAN_UUID_TREE)) + seq_puts(seq, ",rescan_uuid_tree"); if (btrfs_test_opt(root, CLEAR_CACHE)) seq_puts(seq, ",clear_cache"); if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) @@ -940,8 +996,24 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",inode_cache"); if (btrfs_test_opt(root, SKIP_BALANCE)) seq_puts(seq, ",skip_balance"); + if (btrfs_test_opt(root, RECOVERY)) + seq_puts(seq, ",recovery"); +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + if (btrfs_test_opt(root, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA)) + seq_puts(seq, ",check_int_data"); + else if (btrfs_test_opt(root, CHECK_INTEGRITY)) + seq_puts(seq, ",check_int"); + if (info->check_integrity_print_mask) + seq_printf(seq, ",check_int_print_mask=%d", + info->check_integrity_print_mask); +#endif + if (info->metadata_ratio) + seq_printf(seq, ",metadata_ratio=%d", + info->metadata_ratio); if (btrfs_test_opt(root, PANIC_ON_FATAL_ERROR)) seq_puts(seq, ",fatal_errors=panic"); + if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) + seq_printf(seq, ",commit=%d", info->commit_interval); return 0; } @@ -1696,6 +1768,11 @@ static void btrfs_print_info(void) "\n"); } +static int btrfs_run_sanity_tests(void) +{ + return btrfs_test_free_space_cache(); +} + static int __init init_btrfs_fs(void) { int err; @@ -1734,23 +1811,32 @@ static int __init init_btrfs_fs(void) if (err) goto free_auto_defrag; - err = btrfs_interface_init(); + err = btrfs_prelim_ref_init(); if (err) - goto free_delayed_ref; + goto free_prelim_ref; - err = register_filesystem(&btrfs_fs_type); + err = btrfs_interface_init(); if (err) - goto unregister_ioctl; + goto free_delayed_ref; btrfs_init_lockdep(); btrfs_print_info(); - btrfs_test_free_space_cache(); + + err = btrfs_run_sanity_tests(); + if (err) + goto unregister_ioctl; + + err = register_filesystem(&btrfs_fs_type); + if (err) + goto unregister_ioctl; return 0; unregister_ioctl: btrfs_interface_exit(); +free_prelim_ref: + btrfs_prelim_ref_exit(); free_delayed_ref: btrfs_delayed_ref_exit(); free_auto_defrag: @@ -1777,6 +1863,7 @@ static void __exit exit_btrfs_fs(void) btrfs_delayed_ref_exit(); btrfs_auto_defrag_exit(); btrfs_delayed_inode_exit(); + btrfs_prelim_ref_exit(); ordered_data_exit(); extent_map_exit(); extent_io_exit(); diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h new file mode 100644 index 000000000000..580877625776 --- /dev/null +++ b/fs/btrfs/tests/btrfs-tests.h @@ -0,0 +1,34 @@ +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_TESTS +#define __BTRFS_TESTS + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + +#define test_msg(fmt, ...) pr_info("btrfs: selftest: " fmt, ##__VA_ARGS__) + +int btrfs_test_free_space_cache(void); +#else +static inline int btrfs_test_free_space_cache(void) +{ + return 0; +} +#endif + +#endif diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c new file mode 100644 index 000000000000..6fc82010dc15 --- /dev/null +++ b/fs/btrfs/tests/free-space-tests.c @@ -0,0 +1,395 @@ +/* + * Copyright (C) 2013 Fusion IO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/slab.h> +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../free-space-cache.h" + +#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) +static struct btrfs_block_group_cache *init_test_block_group(void) +{ + struct btrfs_block_group_cache *cache; + + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) + return NULL; + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_NOFS); + if (!cache->free_space_ctl) { + kfree(cache); + return NULL; + } + + cache->key.objectid = 0; + cache->key.offset = 1024 * 1024 * 1024; + cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + cache->sectorsize = 4096; + + spin_lock_init(&cache->lock); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + INIT_LIST_HEAD(&cache->new_bg_list); + + btrfs_init_free_space_ctl(cache); + + return cache; +} + +/* + * This test just does basic sanity checking, making sure we can add an exten + * entry and remove space from either end and the middle, and make sure we can + * remove space that covers adjacent extent entries. + */ +static int test_extents(struct btrfs_block_group_cache *cache) +{ + int ret = 0; + + test_msg("Running extent only tests\n"); + + /* First just make sure we can remove an entire entry */ + ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); + if (ret) { + test_msg("Error adding initial extents %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); + if (ret) { + test_msg("Error removing extent %d\n", ret); + return ret; + } + + if (test_check_exists(cache, 0, 4 * 1024 * 1024)) { + test_msg("Full remove left some lingering space\n"); + return -1; + } + + /* Ok edge and middle cases now */ + ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); + if (ret) { + test_msg("Error adding half extent %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024); + if (ret) { + test_msg("Error removing tail end %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); + if (ret) { + test_msg("Error removing front end %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096); + if (ret) { + test_msg("Error removing middle peice %d\n", ret); + return ret; + } + + if (test_check_exists(cache, 0, 1 * 1024 * 1024)) { + test_msg("Still have space at the front\n"); + return -1; + } + + if (test_check_exists(cache, 2 * 1024 * 1024, 4096)) { + test_msg("Still have space in the middle\n"); + return -1; + } + + if (test_check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) { + test_msg("Still have space at the end\n"); + return -1; + } + + /* Cleanup */ + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + return 0; +} + +static int test_bitmaps(struct btrfs_block_group_cache *cache) +{ + u64 next_bitmap_offset; + int ret; + + test_msg("Running bitmap only tests\n"); + + ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't create a bitmap entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); + if (ret) { + test_msg("Error removing bitmap full range %d\n", ret); + return ret; + } + + if (test_check_exists(cache, 0, 4 * 1024 * 1024)) { + test_msg("Left some space in bitmap\n"); + return -1; + } + + ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add to our bitmap entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024); + if (ret) { + test_msg("Couldn't remove middle chunk %d\n", ret); + return ret; + } + + /* + * The first bitmap we have starts at offset 0 so the next one is just + * at the end of the first bitmap. + */ + next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); + + /* Test a bit straddling two bitmaps */ + ret = test_add_free_space_entry(cache, next_bitmap_offset - + (2 * 1024 * 1024), 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add space that straddles two bitmaps %d\n", + ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, next_bitmap_offset - + (1 * 1024 * 1024), 2 * 1024 * 1024); + if (ret) { + test_msg("Couldn't remove overlapping space %d\n", ret); + return ret; + } + + if (test_check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024), + 2 * 1024 * 1024)) { + test_msg("Left some space when removing overlapping\n"); + return -1; + } + + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + return 0; +} + +/* This is the high grade jackassery */ +static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) +{ + u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); + int ret; + + test_msg("Running bitmap and extent tests\n"); + + /* + * First let's do something simple, an extent at the same offset as the + * bitmap, but the free space completely in the extent and then + * completely in the bitmap. + */ + ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't create bitmap entry %d\n", ret); + return ret; + } + + ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); + if (ret) { + test_msg("Couldn't add extent entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); + if (ret) { + test_msg("Couldn't remove extent entry %d\n", ret); + return ret; + } + + if (test_check_exists(cache, 0, 1 * 1024 * 1024)) { + test_msg("Left remnants after our remove\n"); + return -1; + } + + /* Now to add back the extent entry and remove from the bitmap */ + ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); + if (ret) { + test_msg("Couldn't re-add extent entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024); + if (ret) { + test_msg("Couldn't remove from bitmap %d\n", ret); + return ret; + } + + if (test_check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) { + test_msg("Left remnants in the bitmap\n"); + return -1; + } + + /* + * Ok so a little more evil, extent entry and bitmap at the same offset, + * removing an overlapping chunk. + */ + ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add to a bitmap %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024); + if (ret) { + test_msg("Couldn't remove overlapping space %d\n", ret); + return ret; + } + + if (test_check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) { + test_msg("Left over peices after removing overlapping\n"); + return -1; + } + + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + /* Now with the extent entry offset into the bitmap */ + ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add space to the bitmap %d\n", ret); + return ret; + } + + ret = test_add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0); + if (ret) { + test_msg("Couldn't add extent to the cache %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024); + if (ret) { + test_msg("Problem removing overlapping space %d\n", ret); + return ret; + } + + if (test_check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) { + test_msg("Left something behind when removing space"); + return -1; + } + + /* + * This has blown up in the past, the extent entry starts before the + * bitmap entry, but we're trying to remove an offset that falls + * completely within the bitmap range and is in both the extent entry + * and the bitmap entry, looks like this + * + * [ extent ] + * [ bitmap ] + * [ del ] + */ + __btrfs_remove_free_space_cache(cache->free_space_ctl); + ret = test_add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024, + 4 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add bitmap %d\n", ret); + return ret; + } + + ret = test_add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024, + 5 * 1024 * 1024, 0); + if (ret) { + test_msg("Couldn't add extent entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024, + 5 * 1024 * 1024); + if (ret) { + test_msg("Failed to free our space %d\n", ret); + return ret; + } + + if (test_check_exists(cache, bitmap_offset + 1 * 1024 * 1024, + 5 * 1024 * 1024)) { + test_msg("Left stuff over\n"); + return -1; + } + + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + /* + * This blew up before, we have part of the free space in a bitmap and + * then the entirety of the rest of the space in an extent. This used + * to return -EAGAIN back from btrfs_remove_extent, make sure this + * doesn't happen. + */ + ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1); + if (ret) { + test_msg("Couldn't add bitmap entry %d\n", ret); + return ret; + } + + ret = test_add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0); + if (ret) { + test_msg("Couldn't add extent entry %d\n", ret); + return ret; + } + + ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024); + if (ret) { + test_msg("Error removing bitmap and extent overlapping %d\n", ret); + return ret; + } + + __btrfs_remove_free_space_cache(cache->free_space_ctl); + return 0; +} + +int btrfs_test_free_space_cache(void) +{ + struct btrfs_block_group_cache *cache; + int ret; + + test_msg("Running btrfs free space cache tests\n"); + + cache = init_test_block_group(); + if (!cache) { + test_msg("Couldn't run the tests\n"); + return 0; + } + + ret = test_extents(cache); + if (ret) + goto out; + ret = test_bitmaps(cache); + if (ret) + goto out; + ret = test_bitmaps_and_extents(cache); + if (ret) + goto out; +out: + __btrfs_remove_free_space_cache(cache->free_space_ctl); + kfree(cache->free_space_ctl); + kfree(cache); + test_msg("Free space cache tests finished\n"); + return ret; +} diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index af1931a5960d..cac4a3f76323 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -837,7 +837,7 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, * them in one of two extent_io trees. This is used to make sure all of * those extents are on disk for transaction or log commit */ -int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, +static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark) { int ret; @@ -1225,8 +1225,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_set_root_stransid(new_root_item, 0); btrfs_set_root_rtransid(new_root_item, 0); } - new_root_item->otime.sec = cpu_to_le64(cur_time.tv_sec); - new_root_item->otime.nsec = cpu_to_le32(cur_time.tv_nsec); + btrfs_set_stack_timespec_sec(&new_root_item->otime, cur_time.tv_sec); + btrfs_set_stack_timespec_nsec(&new_root_item->otime, cur_time.tv_nsec); btrfs_set_root_otransid(new_root_item, trans->transid); old = btrfs_lock_root_node(root); @@ -1311,8 +1311,26 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, dentry->d_name.len * 2); parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, new_uuid.b, + BTRFS_UUID_KEY_SUBVOL, objectid); + if (ret) { btrfs_abort_transaction(trans, root, ret); + goto fail; + } + if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) { + ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, + new_root_item->received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + objectid); + if (ret && ret != -EEXIST) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + } fail: pending->error = ret; dir_item_existed: @@ -1362,6 +1380,8 @@ static void update_super_roots(struct btrfs_root *root) super->root_level = root_item->level; if (btrfs_test_opt(root, SPACE_CACHE)) super->cache_generation = root_item->generation; + if (root->fs_info->update_uuid_tree_gen) + super->uuid_tree_generation = root_item->generation; } int btrfs_transaction_in_commit(struct btrfs_fs_info *info) @@ -1928,8 +1948,7 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) list_del_init(&root->root_list); spin_unlock(&fs_info->trans_lock); - pr_debug("btrfs: cleaner removing %llu\n", - (unsigned long long)root->objectid); + pr_debug("btrfs: cleaner removing %llu\n", root->objectid); btrfs_kill_all_delayed_nodes(root); @@ -1942,6 +1961,5 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) * If we encounter a transaction abort during snapshot cleaning, we * don't want to crash here */ - BUG_ON(ret < 0 && ret != -EAGAIN && ret != -EROFS); - return 1; + return (ret < 0) ? 0 : 1; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index defbc4269897..5c2af8491621 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -160,8 +160,6 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, void btrfs_throttle(struct btrfs_root *root); int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages, int mark); int btrfs_write_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark); int btrfs_wait_marked_extents(struct btrfs_root *root, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ff60d8978ae2..0d9613c3f5e5 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -747,7 +747,8 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); if (ret) goto out; - btrfs_run_delayed_items(trans, root); + else + ret = btrfs_run_delayed_items(trans, root); out: kfree(name); iput(inode); @@ -923,7 +924,9 @@ again: kfree(victim_name); if (ret) return ret; - btrfs_run_delayed_items(trans, root); + ret = btrfs_run_delayed_items(trans, root); + if (ret) + return ret; *search_done = 1; goto again; } @@ -990,7 +993,9 @@ again: inode, victim_name, victim_name_len); - btrfs_run_delayed_items(trans, root); + if (!ret) + ret = btrfs_run_delayed_items( + trans, root); } iput(victim_parent); kfree(victim_name); @@ -1536,8 +1541,10 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, name_len = btrfs_dir_name_len(eb, di); name = kmalloc(name_len, GFP_NOFS); - if (!name) - return -ENOMEM; + if (!name) { + ret = -ENOMEM; + goto out; + } log_type = btrfs_dir_type(eb, di); read_extent_buffer(eb, name, (unsigned long)(di + 1), @@ -1810,7 +1817,7 @@ again: ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); if (!ret) - btrfs_run_delayed_items(trans, root); + ret = btrfs_run_delayed_items(trans, root); kfree(name); iput(inode); if (ret) diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c new file mode 100644 index 000000000000..dd0dea3766f7 --- /dev/null +++ b/fs/btrfs/uuid-tree.c @@ -0,0 +1,358 @@ +/* + * Copyright (C) STRATO AG 2013. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include <linux/uuid.h> +#include <asm/unaligned.h> +#include "ctree.h" +#include "transaction.h" +#include "disk-io.h" +#include "print-tree.h" + + +static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key) +{ + key->type = type; + key->objectid = get_unaligned_le64(uuid); + key->offset = get_unaligned_le64(uuid + sizeof(u64)); +} + +/* return -ENOENT for !found, < 0 for errors, or 0 if an item was found */ +static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid, + u8 type, u64 subid) +{ + int ret; + struct btrfs_path *path = NULL; + struct extent_buffer *eb; + int slot; + u32 item_size; + unsigned long offset; + struct btrfs_key key; + + if (WARN_ON_ONCE(!uuid_root)) { + ret = -ENOENT; + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + btrfs_uuid_to_key(uuid, type, &key); + ret = btrfs_search_slot(NULL, uuid_root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = -ENOENT; + goto out; + } + + eb = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size_nr(eb, slot); + offset = btrfs_item_ptr_offset(eb, slot); + ret = -ENOENT; + + if (!IS_ALIGNED(item_size, sizeof(u64))) { + pr_warn("btrfs: uuid item with illegal size %lu!\n", + (unsigned long)item_size); + goto out; + } + while (item_size) { + __le64 data; + + read_extent_buffer(eb, &data, offset, sizeof(data)); + if (le64_to_cpu(data) == subid) { + ret = 0; + break; + } + offset += sizeof(data); + item_size -= sizeof(data); + } + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, + struct btrfs_root *uuid_root, u8 *uuid, u8 type, + u64 subid_cpu) +{ + int ret; + struct btrfs_path *path = NULL; + struct btrfs_key key; + struct extent_buffer *eb; + int slot; + unsigned long offset; + __le64 subid_le; + + ret = btrfs_uuid_tree_lookup(uuid_root, uuid, type, subid_cpu); + if (ret != -ENOENT) + return ret; + + if (WARN_ON_ONCE(!uuid_root)) { + ret = -EINVAL; + goto out; + } + + btrfs_uuid_to_key(uuid, type, &key); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + ret = btrfs_insert_empty_item(trans, uuid_root, path, &key, + sizeof(subid_le)); + if (ret >= 0) { + /* Add an item for the type for the first time */ + eb = path->nodes[0]; + slot = path->slots[0]; + offset = btrfs_item_ptr_offset(eb, slot); + } else if (ret == -EEXIST) { + /* + * An item with that type already exists. + * Extend the item and store the new subid at the end. + */ + btrfs_extend_item(uuid_root, path, sizeof(subid_le)); + eb = path->nodes[0]; + slot = path->slots[0]; + offset = btrfs_item_ptr_offset(eb, slot); + offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le); + } else if (ret < 0) { + pr_warn("btrfs: insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!\n", + ret, (unsigned long long)key.objectid, + (unsigned long long)key.offset, type); + goto out; + } + + ret = 0; + subid_le = cpu_to_le64(subid_cpu); + write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le)); + btrfs_mark_buffer_dirty(eb); + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans, + struct btrfs_root *uuid_root, u8 *uuid, u8 type, + u64 subid) +{ + int ret; + struct btrfs_path *path = NULL; + struct btrfs_key key; + struct extent_buffer *eb; + int slot; + unsigned long offset; + u32 item_size; + unsigned long move_dst; + unsigned long move_src; + unsigned long move_len; + + if (WARN_ON_ONCE(!uuid_root)) { + ret = -EINVAL; + goto out; + } + + btrfs_uuid_to_key(uuid, type, &key); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + ret = btrfs_search_slot(trans, uuid_root, &key, path, -1, 1); + if (ret < 0) { + pr_warn("btrfs: error %d while searching for uuid item!\n", + ret); + goto out; + } + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + eb = path->nodes[0]; + slot = path->slots[0]; + offset = btrfs_item_ptr_offset(eb, slot); + item_size = btrfs_item_size_nr(eb, slot); + if (!IS_ALIGNED(item_size, sizeof(u64))) { + pr_warn("btrfs: uuid item with illegal size %lu!\n", + (unsigned long)item_size); + ret = -ENOENT; + goto out; + } + while (item_size) { + __le64 read_subid; + + read_extent_buffer(eb, &read_subid, offset, sizeof(read_subid)); + if (le64_to_cpu(read_subid) == subid) + break; + offset += sizeof(read_subid); + item_size -= sizeof(read_subid); + } + + if (!item_size) { + ret = -ENOENT; + goto out; + } + + item_size = btrfs_item_size_nr(eb, slot); + if (item_size == sizeof(subid)) { + ret = btrfs_del_item(trans, uuid_root, path); + goto out; + } + + move_dst = offset; + move_src = offset + sizeof(subid); + move_len = item_size - (move_src - btrfs_item_ptr_offset(eb, slot)); + memmove_extent_buffer(eb, move_dst, move_src, move_len); + btrfs_truncate_item(uuid_root, path, item_size - sizeof(subid), 1); + +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_uuid_iter_rem(struct btrfs_root *uuid_root, u8 *uuid, u8 type, + u64 subid) +{ + struct btrfs_trans_handle *trans; + int ret; + + /* 1 - for the uuid item */ + trans = btrfs_start_transaction(uuid_root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + ret = btrfs_uuid_tree_rem(trans, uuid_root, uuid, type, subid); + btrfs_end_transaction(trans, uuid_root); + +out: + return ret; +} + +int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, + int (*check_func)(struct btrfs_fs_info *, u8 *, u8, + u64)) +{ + struct btrfs_root *root = fs_info->uuid_root; + struct btrfs_key key; + struct btrfs_key max_key; + struct btrfs_path *path; + int ret = 0; + struct extent_buffer *leaf; + int slot; + u32 item_size; + unsigned long offset; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = 0; + key.type = 0; + key.offset = 0; + max_key.objectid = (u64)-1; + max_key.type = (u8)-1; + max_key.offset = (u64)-1; + +again_search_slot: + path->keep_locks = 1; + ret = btrfs_search_forward(root, &key, &max_key, path, 0); + if (ret) { + if (ret > 0) + ret = 0; + goto out; + } + + while (1) { + cond_resched(); + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + + if (key.type != BTRFS_UUID_KEY_SUBVOL && + key.type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) + goto skip; + + offset = btrfs_item_ptr_offset(leaf, slot); + item_size = btrfs_item_size_nr(leaf, slot); + if (!IS_ALIGNED(item_size, sizeof(u64))) { + pr_warn("btrfs: uuid item with illegal size %lu!\n", + (unsigned long)item_size); + goto skip; + } + while (item_size) { + u8 uuid[BTRFS_UUID_SIZE]; + __le64 subid_le; + u64 subid_cpu; + + put_unaligned_le64(key.objectid, uuid); + put_unaligned_le64(key.offset, uuid + sizeof(u64)); + read_extent_buffer(leaf, &subid_le, offset, + sizeof(subid_le)); + subid_cpu = le64_to_cpu(subid_le); + ret = check_func(fs_info, uuid, key.type, subid_cpu); + if (ret < 0) + goto out; + if (ret > 0) { + btrfs_release_path(path); + ret = btrfs_uuid_iter_rem(root, uuid, key.type, + subid_cpu); + if (ret == 0) { + /* + * this might look inefficient, but the + * justification is that it is an + * exception that check_func returns 1, + * and that in the regular case only one + * entry per UUID exists. + */ + goto again_search_slot; + } + if (ret < 0 && ret != -ENOENT) + goto out; + } + item_size -= sizeof(subid_le); + offset += sizeof(subid_le); + } + +skip: + ret = btrfs_next_item(root, path); + if (ret == 0) + continue; + else if (ret > 0) + ret = 0; + break; + } + +out: + btrfs_free_path(path); + if (ret) + pr_warn("btrfs: btrfs_uuid_tree_iterate failed %d\n", ret); + return 0; +} diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 67a085381845..0052ca8264d9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -26,6 +26,7 @@ #include <linux/ratelimit.h> #include <linux/kthread.h> #include <linux/raid/pq.h> +#include <linux/semaphore.h> #include <asm/div64.h> #include "compat.h" #include "ctree.h" @@ -62,6 +63,48 @@ static void unlock_chunks(struct btrfs_root *root) mutex_unlock(&root->fs_info->chunk_mutex); } +static struct btrfs_fs_devices *__alloc_fs_devices(void) +{ + struct btrfs_fs_devices *fs_devs; + + fs_devs = kzalloc(sizeof(*fs_devs), GFP_NOFS); + if (!fs_devs) + return ERR_PTR(-ENOMEM); + + mutex_init(&fs_devs->device_list_mutex); + + INIT_LIST_HEAD(&fs_devs->devices); + INIT_LIST_HEAD(&fs_devs->alloc_list); + INIT_LIST_HEAD(&fs_devs->list); + + return fs_devs; +} + +/** + * alloc_fs_devices - allocate struct btrfs_fs_devices + * @fsid: a pointer to UUID for this FS. If NULL a new UUID is + * generated. + * + * Return: a pointer to a new &struct btrfs_fs_devices on success; + * ERR_PTR() on error. Returned struct is not linked onto any lists and + * can be destroyed with kfree() right away. + */ +static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) +{ + struct btrfs_fs_devices *fs_devs; + + fs_devs = __alloc_fs_devices(); + if (IS_ERR(fs_devs)) + return fs_devs; + + if (fsid) + memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); + else + generate_random_uuid(fs_devs->fsid); + + return fs_devs; +} + static void free_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device; @@ -101,6 +144,27 @@ void btrfs_cleanup_fs_uuids(void) } } +static struct btrfs_device *__alloc_device(void) +{ + struct btrfs_device *dev; + + dev = kzalloc(sizeof(*dev), GFP_NOFS); + if (!dev) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&dev->dev_list); + INIT_LIST_HEAD(&dev->dev_alloc_list); + + spin_lock_init(&dev->io_lock); + + spin_lock_init(&dev->reada_lock); + atomic_set(&dev->reada_in_flight, 0); + INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT); + INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT); + + return dev; +} + static noinline struct btrfs_device *__find_device(struct list_head *head, u64 devid, u8 *uuid) { @@ -395,16 +459,14 @@ static noinline int device_list_add(const char *path, fs_devices = find_fsid(disk_super->fsid); if (!fs_devices) { - fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); - if (!fs_devices) - return -ENOMEM; - INIT_LIST_HEAD(&fs_devices->devices); - INIT_LIST_HEAD(&fs_devices->alloc_list); + fs_devices = alloc_fs_devices(disk_super->fsid); + if (IS_ERR(fs_devices)) + return PTR_ERR(fs_devices); + list_add(&fs_devices->list, &fs_uuids); - memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); fs_devices->latest_devid = devid; fs_devices->latest_trans = found_transid; - mutex_init(&fs_devices->device_list_mutex); + device = NULL; } else { device = __find_device(&fs_devices->devices, devid, @@ -414,17 +476,12 @@ static noinline int device_list_add(const char *path, if (fs_devices->opened) return -EBUSY; - device = kzalloc(sizeof(*device), GFP_NOFS); - if (!device) { + device = btrfs_alloc_device(NULL, &devid, + disk_super->dev_item.uuid); + if (IS_ERR(device)) { /* we can safely leave the fs_devices entry around */ - return -ENOMEM; + return PTR_ERR(device); } - device->devid = devid; - device->dev_stats_valid = 0; - device->work.func = pending_bios_fn; - memcpy(device->uuid, disk_super->dev_item.uuid, - BTRFS_UUID_SIZE); - spin_lock_init(&device->io_lock); name = rcu_string_strdup(path, GFP_NOFS); if (!name) { @@ -432,22 +489,13 @@ static noinline int device_list_add(const char *path, return -ENOMEM; } rcu_assign_pointer(device->name, name); - INIT_LIST_HEAD(&device->dev_alloc_list); - - /* init readahead state */ - spin_lock_init(&device->reada_lock); - device->reada_curr_zone = NULL; - atomic_set(&device->reada_in_flight, 0); - device->reada_next = 0; - INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT); - INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT); mutex_lock(&fs_devices->device_list_mutex); list_add_rcu(&device->dev_list, &fs_devices->devices); + fs_devices->num_devices++; mutex_unlock(&fs_devices->device_list_mutex); device->fs_devices = fs_devices; - fs_devices->num_devices++; } else if (!device->name || strcmp(device->name->str, path)) { name = rcu_string_strdup(path, GFP_NOFS); if (!name) @@ -474,25 +522,21 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) struct btrfs_device *device; struct btrfs_device *orig_dev; - fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); - if (!fs_devices) - return ERR_PTR(-ENOMEM); + fs_devices = alloc_fs_devices(orig->fsid); + if (IS_ERR(fs_devices)) + return fs_devices; - INIT_LIST_HEAD(&fs_devices->devices); - INIT_LIST_HEAD(&fs_devices->alloc_list); - INIT_LIST_HEAD(&fs_devices->list); - mutex_init(&fs_devices->device_list_mutex); fs_devices->latest_devid = orig->latest_devid; fs_devices->latest_trans = orig->latest_trans; fs_devices->total_devices = orig->total_devices; - memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); /* We have held the volume lock, it is safe to get the devices. */ list_for_each_entry(orig_dev, &orig->devices, dev_list) { struct rcu_string *name; - device = kzalloc(sizeof(*device), GFP_NOFS); - if (!device) + device = btrfs_alloc_device(NULL, &orig_dev->devid, + orig_dev->uuid); + if (IS_ERR(device)) goto error; /* @@ -506,13 +550,6 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) } rcu_assign_pointer(device->name, name); - device->devid = orig_dev->devid; - device->work.func = pending_bios_fn; - memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); - spin_lock_init(&device->io_lock); - INIT_LIST_HEAD(&device->dev_list); - INIT_LIST_HEAD(&device->dev_alloc_list); - list_add(&device->dev_list, &fs_devices->devices); device->fs_devices = fs_devices; fs_devices->num_devices++; @@ -636,23 +673,22 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) if (device->can_discard) fs_devices->num_can_discard--; + if (device->missing) + fs_devices->missing_devices--; - new_device = kmalloc(sizeof(*new_device), GFP_NOFS); - BUG_ON(!new_device); /* -ENOMEM */ - memcpy(new_device, device, sizeof(*new_device)); + new_device = btrfs_alloc_device(NULL, &device->devid, + device->uuid); + BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ /* Safe because we are under uuid_mutex */ if (device->name) { name = rcu_string_strdup(device->name->str, GFP_NOFS); - BUG_ON(device->name && !name); /* -ENOMEM */ + BUG_ON(!name); /* -ENOMEM */ rcu_assign_pointer(new_device->name, name); } - new_device->bdev = NULL; - new_device->writeable = 0; - new_device->in_fs_metadata = 0; - new_device->can_discard = 0; - spin_lock_init(&new_device->io_lock); + list_replace_rcu(&device->dev_list, &new_device->dev_list); + new_device->fs_devices = device->fs_devices; call_rcu(&device->rcu, free_device); } @@ -865,7 +901,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, disk_super = p + (bytenr & ~PAGE_CACHE_MASK); if (btrfs_super_bytenr(disk_super) != bytenr || - disk_super->magic != cpu_to_le64(BTRFS_MAGIC)) + btrfs_super_magic(disk_super) != BTRFS_MAGIC) goto error_unmap; devid = btrfs_stack_device_id(&disk_super->dev_item); @@ -880,8 +916,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, printk(KERN_INFO "device fsid %pU ", disk_super->fsid); } - printk(KERN_CONT "devid %llu transid %llu %s\n", - (unsigned long long)devid, (unsigned long long)transid, path); + printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path); ret = device_list_add(path, disk_super, devid, fs_devices_ret); if (!ret && fs_devices_ret) @@ -1278,8 +1313,7 @@ static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, - (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent), - BTRFS_UUID_SIZE); + btrfs_dev_extent_chunk_tree_uuid(extent), BTRFS_UUID_SIZE); btrfs_set_dev_extent_length(leaf, extent, num_bytes); btrfs_mark_buffer_dirty(leaf); @@ -1307,15 +1341,14 @@ static u64 find_next_chunk(struct btrfs_fs_info *fs_info) return ret; } -static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid) +static noinline int find_next_devid(struct btrfs_fs_info *fs_info, + u64 *devid_ret) { int ret; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_path *path; - root = root->fs_info->chunk_root; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -1324,20 +1357,21 @@ static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid) key.type = BTRFS_DEV_ITEM_KEY; key.offset = (u64)-1; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0); if (ret < 0) goto error; BUG_ON(ret == 0); /* Corruption */ - ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID, + ret = btrfs_previous_item(fs_info->chunk_root, path, + BTRFS_DEV_ITEMS_OBJECTID, BTRFS_DEV_ITEM_KEY); if (ret) { - *objectid = 1; + *devid_ret = 1; } else { btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); - *objectid = found_key.offset + 1; + *devid_ret = found_key.offset + 1; } ret = 0; error: @@ -1391,9 +1425,9 @@ static int btrfs_add_device(struct btrfs_trans_handle *trans, btrfs_set_device_bandwidth(leaf, dev_item, 0); btrfs_set_device_start_offset(leaf, dev_item, 0); - ptr = (unsigned long)btrfs_device_uuid(dev_item); + ptr = btrfs_device_uuid(dev_item); write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); - ptr = (unsigned long)btrfs_device_fsid(dev_item); + ptr = btrfs_device_fsid(dev_item); write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); btrfs_mark_buffer_dirty(leaf); @@ -1562,7 +1596,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) clear_super = true; } + mutex_unlock(&uuid_mutex); ret = btrfs_shrink_device(device, 0); + mutex_lock(&uuid_mutex); if (ret) goto error_undo; @@ -1586,7 +1622,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) /* * the device list mutex makes sure that we don't change * the device list while someone else is writing out all - * the device supers. + * the device supers. Whoever is writing all supers, should + * lock the device list mutex before getting the number of + * devices in the super block (super_copy). Conversely, + * whoever updates the number of devices in the super block + * (super_copy) should hold the device list mutex. */ cur_devices = device->fs_devices; @@ -1610,10 +1650,10 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) device->fs_devices->open_devices--; call_rcu(&device->rcu, free_device); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (cur_devices->open_devices == 0) { struct btrfs_fs_devices *fs_devices; @@ -1793,9 +1833,9 @@ static int btrfs_prepare_sprout(struct btrfs_root *root) if (!fs_devices->seeding) return -EINVAL; - seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); - if (!seed_devices) - return -ENOMEM; + seed_devices = __alloc_fs_devices(); + if (IS_ERR(seed_devices)) + return PTR_ERR(seed_devices); old_devices = clone_fs_devices(fs_devices); if (IS_ERR(old_devices)) { @@ -1814,7 +1854,6 @@ static int btrfs_prepare_sprout(struct btrfs_root *root) mutex_lock(&root->fs_info->fs_devices->device_list_mutex); list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, synchronize_rcu); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); list_for_each_entry(device, &seed_devices->devices, dev_list) { @@ -1830,6 +1869,8 @@ static int btrfs_prepare_sprout(struct btrfs_root *root) generate_random_uuid(fs_devices->fsid); memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + super_flags = btrfs_super_flags(disk_super) & ~BTRFS_SUPER_FLAG_SEEDING; btrfs_set_super_flags(disk_super, super_flags); @@ -1889,11 +1930,9 @@ next_slot: dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); devid = btrfs_device_id(leaf, dev_item); - read_extent_buffer(leaf, dev_uuid, - (unsigned long)btrfs_device_uuid(dev_item), + read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), BTRFS_UUID_SIZE); - read_extent_buffer(leaf, fs_uuid, - (unsigned long)btrfs_device_fsid(dev_item), + read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_UUID_SIZE); device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid); @@ -1956,10 +1995,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) } mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - device = kzalloc(sizeof(*device), GFP_NOFS); - if (!device) { + device = btrfs_alloc_device(root->fs_info, NULL, NULL); + if (IS_ERR(device)) { /* we can safely leave the fs_devices entry around */ - ret = -ENOMEM; + ret = PTR_ERR(device); goto error; } @@ -1971,13 +2010,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) } rcu_assign_pointer(device->name, name); - ret = find_next_devid(root, &device->devid); - if (ret) { - rcu_string_free(device->name); - kfree(device); - goto error; - } - trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { rcu_string_free(device->name); @@ -1992,9 +2024,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if (blk_queue_discard(q)) device->can_discard = 1; device->writeable = 1; - device->work.func = pending_bios_fn; - generate_random_uuid(device->uuid); - spin_lock_init(&device->io_lock); device->generation = trans->transid; device->io_width = root->sectorsize; device->io_align = root->sectorsize; @@ -2121,6 +2150,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, struct btrfs_fs_info *fs_info = root->fs_info; struct list_head *devices; struct rcu_string *name; + u64 devid = BTRFS_DEV_REPLACE_DEVID; int ret = 0; *device_out = NULL; @@ -2142,9 +2172,9 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, } } - device = kzalloc(sizeof(*device), GFP_NOFS); - if (!device) { - ret = -ENOMEM; + device = btrfs_alloc_device(NULL, &devid, NULL); + if (IS_ERR(device)) { + ret = PTR_ERR(device); goto error; } @@ -2161,10 +2191,6 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, device->can_discard = 1; mutex_lock(&root->fs_info->fs_devices->device_list_mutex); device->writeable = 1; - device->work.func = pending_bios_fn; - generate_random_uuid(device->uuid); - device->devid = BTRFS_DEV_REPLACE_DEVID; - spin_lock_init(&device->io_lock); device->generation = 0; device->io_width = root->sectorsize; device->io_align = root->sectorsize; @@ -2971,10 +2997,6 @@ again: if (found_key.objectid != key.objectid) break; - /* chunk zero is special */ - if (found_key.offset == 0) - break; - chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); if (!counting) { @@ -3010,6 +3032,8 @@ again: spin_unlock(&fs_info->balance_lock); } loop: + if (found_key.offset == 0) + break; key.offset = found_key.offset - 1; } @@ -3074,9 +3098,6 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) atomic_set(&fs_info->mutually_exclusive_operation_running, 0); } -void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, - struct btrfs_ioctl_balance_args *bargs); - /* * Should be called with both balance and volume mutexes held */ @@ -3139,7 +3160,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, (bctl->data.target & ~allowed))) { printk(KERN_ERR "btrfs: unable to start balance with target " "data profile %llu\n", - (unsigned long long)bctl->data.target); + bctl->data.target); ret = -EINVAL; goto out; } @@ -3148,7 +3169,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, (bctl->meta.target & ~allowed))) { printk(KERN_ERR "btrfs: unable to start balance with target " "metadata profile %llu\n", - (unsigned long long)bctl->meta.target); + bctl->meta.target); ret = -EINVAL; goto out; } @@ -3157,7 +3178,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, (bctl->sys.target & ~allowed))) { printk(KERN_ERR "btrfs: unable to start balance with target " "system profile %llu\n", - (unsigned long long)bctl->sys.target); + bctl->sys.target); ret = -EINVAL; goto out; } @@ -3430,6 +3451,264 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) return 0; } +static int btrfs_uuid_scan_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = data; + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_key key; + struct btrfs_key max_key; + struct btrfs_path *path = NULL; + int ret = 0; + struct extent_buffer *eb; + int slot; + struct btrfs_root_item root_item; + u32 item_size; + struct btrfs_trans_handle *trans = NULL; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = 0; + + max_key.objectid = (u64)-1; + max_key.type = BTRFS_ROOT_ITEM_KEY; + max_key.offset = (u64)-1; + + path->keep_locks = 1; + + while (1) { + ret = btrfs_search_forward(root, &key, &max_key, path, 0); + if (ret) { + if (ret > 0) + ret = 0; + break; + } + + if (key.type != BTRFS_ROOT_ITEM_KEY || + (key.objectid < BTRFS_FIRST_FREE_OBJECTID && + key.objectid != BTRFS_FS_TREE_OBJECTID) || + key.objectid > BTRFS_LAST_FREE_OBJECTID) + goto skip; + + eb = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size_nr(eb, slot); + if (item_size < sizeof(root_item)) + goto skip; + + read_extent_buffer(eb, &root_item, + btrfs_item_ptr_offset(eb, slot), + (int)sizeof(root_item)); + if (btrfs_root_refs(&root_item) == 0) + goto skip; + + if (!btrfs_is_empty_uuid(root_item.uuid) || + !btrfs_is_empty_uuid(root_item.received_uuid)) { + if (trans) + goto update_tree; + + btrfs_release_path(path); + /* + * 1 - subvol uuid item + * 1 - received_subvol uuid item + */ + trans = btrfs_start_transaction(fs_info->uuid_root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + continue; + } else { + goto skip; + } +update_tree: + if (!btrfs_is_empty_uuid(root_item.uuid)) { + ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, + root_item.uuid, + BTRFS_UUID_KEY_SUBVOL, + key.objectid); + if (ret < 0) { + pr_warn("btrfs: uuid_tree_add failed %d\n", + ret); + break; + } + } + + if (!btrfs_is_empty_uuid(root_item.received_uuid)) { + ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, + root_item.received_uuid, + BTRFS_UUID_KEY_RECEIVED_SUBVOL, + key.objectid); + if (ret < 0) { + pr_warn("btrfs: uuid_tree_add failed %d\n", + ret); + break; + } + } + +skip: + if (trans) { + ret = btrfs_end_transaction(trans, fs_info->uuid_root); + trans = NULL; + if (ret) + break; + } + + btrfs_release_path(path); + if (key.offset < (u64)-1) { + key.offset++; + } else if (key.type < BTRFS_ROOT_ITEM_KEY) { + key.offset = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + } else if (key.objectid < (u64)-1) { + key.offset = 0; + key.type = BTRFS_ROOT_ITEM_KEY; + key.objectid++; + } else { + break; + } + cond_resched(); + } + +out: + btrfs_free_path(path); + if (trans && !IS_ERR(trans)) + btrfs_end_transaction(trans, fs_info->uuid_root); + if (ret) + pr_warn("btrfs: btrfs_uuid_scan_kthread failed %d\n", ret); + else + fs_info->update_uuid_tree_gen = 1; + up(&fs_info->uuid_tree_rescan_sem); + return 0; +} + +/* + * Callback for btrfs_uuid_tree_iterate(). + * returns: + * 0 check succeeded, the entry is not outdated. + * < 0 if an error occured. + * > 0 if the check failed, which means the caller shall remove the entry. + */ +static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info, + u8 *uuid, u8 type, u64 subid) +{ + struct btrfs_key key; + int ret = 0; + struct btrfs_root *subvol_root; + + if (type != BTRFS_UUID_KEY_SUBVOL && + type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) + goto out; + + key.objectid = subid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + subvol_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(subvol_root)) { + ret = PTR_ERR(subvol_root); + if (ret == -ENOENT) + ret = 1; + goto out; + } + + switch (type) { + case BTRFS_UUID_KEY_SUBVOL: + if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE)) + ret = 1; + break; + case BTRFS_UUID_KEY_RECEIVED_SUBVOL: + if (memcmp(uuid, subvol_root->root_item.received_uuid, + BTRFS_UUID_SIZE)) + ret = 1; + break; + } + +out: + return ret; +} + +static int btrfs_uuid_rescan_kthread(void *data) +{ + struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; + int ret; + + /* + * 1st step is to iterate through the existing UUID tree and + * to delete all entries that contain outdated data. + * 2nd step is to add all missing entries to the UUID tree. + */ + ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry); + if (ret < 0) { + pr_warn("btrfs: iterating uuid_tree failed %d\n", ret); + up(&fs_info->uuid_tree_rescan_sem); + return ret; + } + return btrfs_uuid_scan_kthread(data); +} + +int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *uuid_root; + struct task_struct *task; + int ret; + + /* + * 1 - root node + * 1 - root item + */ + trans = btrfs_start_transaction(tree_root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + uuid_root = btrfs_create_tree(trans, fs_info, + BTRFS_UUID_TREE_OBJECTID); + if (IS_ERR(uuid_root)) { + btrfs_abort_transaction(trans, tree_root, + PTR_ERR(uuid_root)); + return PTR_ERR(uuid_root); + } + + fs_info->uuid_root = uuid_root; + + ret = btrfs_commit_transaction(trans, tree_root); + if (ret) + return ret; + + down(&fs_info->uuid_tree_rescan_sem); + task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); + if (IS_ERR(task)) { + /* fs_info->update_uuid_tree_gen remains 0 in all error case */ + pr_warn("btrfs: failed to start uuid_scan task\n"); + up(&fs_info->uuid_tree_rescan_sem); + return PTR_ERR(task); + } + + return 0; +} + +int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) +{ + struct task_struct *task; + + down(&fs_info->uuid_tree_rescan_sem); + task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid"); + if (IS_ERR(task)) { + /* fs_info->update_uuid_tree_gen remains 0 in all error case */ + pr_warn("btrfs: failed to start uuid_rescan task\n"); + up(&fs_info->uuid_tree_rescan_sem); + return PTR_ERR(task); + } + + return 0; +} + /* * shrinking a device means finding all of the device extents past * the new size, and then following the back refs to the chunks. @@ -4194,13 +4473,13 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) * and exit, so return 1 so the callers don't try to use other copies. */ if (!em) { - btrfs_emerg(fs_info, "No mapping for %Lu-%Lu\n", logical, + btrfs_crit(fs_info, "No mapping for %Lu-%Lu\n", logical, logical+len); return 1; } if (em->start > logical || em->start + em->len < logical) { - btrfs_emerg(fs_info, "Invalid mapping for %Lu-%Lu, got " + btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got " "%Lu-%Lu\n", logical, logical+len, em->start, em->start + em->len); return 1; @@ -4375,8 +4654,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (!em) { btrfs_crit(fs_info, "unable to find logical %llu len %llu", - (unsigned long long)logical, - (unsigned long long)*length); + logical, *length); return -EINVAL; } @@ -4671,6 +4949,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS); if (!bbio) { + kfree(raid_map); ret = -ENOMEM; goto out; } @@ -5246,9 +5525,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, if (map_length < length) { btrfs_crit(root->fs_info, "mapping failed logical %llu bio len %llu len %llu", - (unsigned long long)logical, - (unsigned long long)length, - (unsigned long long)map_length); + logical, length, map_length); BUG(); } @@ -5314,23 +5591,72 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root, struct btrfs_device *device; struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; - device = kzalloc(sizeof(*device), GFP_NOFS); - if (!device) + device = btrfs_alloc_device(NULL, &devid, dev_uuid); + if (IS_ERR(device)) return NULL; - list_add(&device->dev_list, - &fs_devices->devices); - device->devid = devid; - device->work.func = pending_bios_fn; + + list_add(&device->dev_list, &fs_devices->devices); device->fs_devices = fs_devices; - device->missing = 1; fs_devices->num_devices++; + + device->missing = 1; fs_devices->missing_devices++; - spin_lock_init(&device->io_lock); - INIT_LIST_HEAD(&device->dev_alloc_list); - memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); + return device; } +/** + * btrfs_alloc_device - allocate struct btrfs_device + * @fs_info: used only for generating a new devid, can be NULL if + * devid is provided (i.e. @devid != NULL). + * @devid: a pointer to devid for this device. If NULL a new devid + * is generated. + * @uuid: a pointer to UUID for this device. If NULL a new UUID + * is generated. + * + * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() + * on error. Returned struct is not linked onto any lists and can be + * destroyed with kfree() right away. + */ +struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, + const u64 *devid, + const u8 *uuid) +{ + struct btrfs_device *dev; + u64 tmp; + + if (!devid && !fs_info) { + WARN_ON(1); + return ERR_PTR(-EINVAL); + } + + dev = __alloc_device(); + if (IS_ERR(dev)) + return dev; + + if (devid) + tmp = *devid; + else { + int ret; + + ret = find_next_devid(fs_info, &tmp); + if (ret) { + kfree(dev); + return ERR_PTR(ret); + } + } + dev->devid = tmp; + + if (uuid) + memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); + else + generate_random_uuid(dev->uuid); + + dev->work.func = pending_bios_fn; + + return dev; +} + static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, struct extent_buffer *leaf, struct btrfs_chunk *chunk) @@ -5437,7 +5763,7 @@ static void fill_device_from_item(struct extent_buffer *leaf, WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); device->is_tgtdev_for_dev_replace = 0; - ptr = (unsigned long)btrfs_device_uuid(dev_item); + ptr = btrfs_device_uuid(dev_item); read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); } @@ -5500,11 +5826,9 @@ static int read_one_dev(struct btrfs_root *root, u8 dev_uuid[BTRFS_UUID_SIZE]; devid = btrfs_device_id(leaf, dev_item); - read_extent_buffer(leaf, dev_uuid, - (unsigned long)btrfs_device_uuid(dev_item), + read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), BTRFS_UUID_SIZE); - read_extent_buffer(leaf, fs_uuid, - (unsigned long)btrfs_device_fsid(dev_item), + read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_UUID_SIZE); if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { @@ -5519,8 +5843,7 @@ static int read_one_dev(struct btrfs_root *root, return -EIO; if (!device) { - btrfs_warn(root->fs_info, "devid %llu missing", - (unsigned long long)devid); + btrfs_warn(root->fs_info, "devid %llu missing", devid); device = add_missing_dev(root, devid, dev_uuid); if (!device) return -ENOMEM; @@ -5644,14 +5967,15 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) mutex_lock(&uuid_mutex); lock_chunks(root); - /* first we search for all of the device items, and then we - * read in all of the chunk items. This way we can create chunk - * mappings that reference all of the devices that are afound + /* + * Read all device items, and then all the chunk items. All + * device items are found before any chunk item (their object id + * is smaller than the lowest possible object id for a chunk + * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). */ key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.offset = 0; key.type = 0; -again: ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto error; @@ -5667,17 +5991,13 @@ again: break; } btrfs_item_key_to_cpu(leaf, &found_key, slot); - if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { - if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID) - break; - if (found_key.type == BTRFS_DEV_ITEM_KEY) { - struct btrfs_dev_item *dev_item; - dev_item = btrfs_item_ptr(leaf, slot, + if (found_key.type == BTRFS_DEV_ITEM_KEY) { + struct btrfs_dev_item *dev_item; + dev_item = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item); - ret = read_one_dev(root, leaf, dev_item); - if (ret) - goto error; - } + ret = read_one_dev(root, leaf, dev_item); + if (ret) + goto error; } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { struct btrfs_chunk *chunk; chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); @@ -5687,11 +6007,6 @@ again: } path->slots[0]++; } - if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { - key.objectid = 0; - btrfs_release_path(path); - goto again; - } ret = 0; error: unlock_chunks(root); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 86705583480d..b72f540c8b29 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -152,6 +152,8 @@ struct btrfs_fs_devices { int rotating; }; +#define BTRFS_BIO_INLINE_CSUM_SIZE 64 + /* * we need the mirror number and stripe index to be passed around * the call chain while we are processing end_io (especially errors). @@ -161,9 +163,14 @@ struct btrfs_fs_devices { * we allocate are actually btrfs_io_bios. We'll cram as much of * struct btrfs_bio as we can into this over time. */ +typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err); struct btrfs_io_bio { unsigned long mirror_num; unsigned long stripe_index; + u8 *csum; + u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; + u8 *csum_allocated; + btrfs_io_bio_end_io_t *end_io; struct bio bio; }; @@ -298,6 +305,9 @@ void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, char *device_path, struct btrfs_device **device); +struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, + const u64 *devid, + const u8 *uuid); int btrfs_rm_device(struct btrfs_root *root, char *device_path); void btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); @@ -315,6 +325,8 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); int btrfs_recover_balance(struct btrfs_fs_info *fs_info); int btrfs_pause_balance(struct btrfs_fs_info *fs_info); int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); +int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); +int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 num_bytes, diff --git a/fs/cifs/AUTHORS b/fs/cifs/AUTHORS deleted file mode 100644 index ea940b1db77b..000000000000 --- a/fs/cifs/AUTHORS +++ /dev/null @@ -1,55 +0,0 @@ -Original Author -=============== -Steve French (sfrench@samba.org) - -The author wishes to express his appreciation and thanks to: -Andrew Tridgell (Samba team) for his early suggestions about smb/cifs VFS -improvements. Thanks to IBM for allowing me time and test resources to pursue -this project, to Jim McDonough from IBM (and the Samba Team) for his help, to -the IBM Linux JFS team for explaining many esoteric Linux filesystem features. -Jeremy Allison of the Samba team has done invaluable work in adding the server -side of the original CIFS Unix extensions and reviewing and implementing -portions of the newer CIFS POSIX extensions into the Samba 3 file server. Thank -Dave Boutcher of IBM Rochester (author of the OS/400 smb/cifs filesystem client) -for proving years ago that very good smb/cifs clients could be done on Unix-like -operating systems. Volker Lendecke, Andrew Tridgell, Urban Widmark, John -Newbigin and others for their work on the Linux smbfs module. Thanks to -the other members of the Storage Network Industry Association CIFS Technical -Workgroup for their work specifying this highly complex protocol and finally -thanks to the Samba team for their technical advice and encouragement. - -Patch Contributors ------------------- -Zwane Mwaikambo -Andi Kleen -Amrut Joshi -Shobhit Dayal -Sergey Vlasov -Richard Hughes -Yury Umanets -Mark Hamzy (for some of the early cifs IPv6 work) -Domen Puncer -Jesper Juhl (in particular for lots of whitespace/formatting cleanup) -Vince Negri and Dave Stahl (for finding an important caching bug) -Adrian Bunk (kcalloc cleanups) -Miklos Szeredi -Kazeon team for various fixes especially for 2.4 version. -Asser Ferno (Change Notify support) -Shaggy (Dave Kleikamp) for innumerable small fs suggestions and some good cleanup -Gunter Kukkukk (testing and suggestions for support of old servers) -Igor Mammedov (DFS support) -Jeff Layton (many, many fixes, as well as great work on the cifs Kerberos code) - -Test case and Bug Report contributors -------------------------------------- -Thanks to those in the community who have submitted detailed bug reports -and debug of problems they have found: Jochen Dolze, David Blaine, -Rene Scharfe, Martin Josefsson, Alexander Wild, Anthony Liguori, -Lars Muller, Urban Widmark, Massimiliano Ferrero, Howard Owen, -Olaf Kirch, Kieron Briggs, Nick Millington and others. Also special -mention to the Stanford Checker (SWAT) which pointed out many minor -bugs in error paths. Valuable suggestions also have come from Al Viro -and Dave Miller. - -And thanks to the IBM LTC and Power test teams and SuSE testers for -finding multiple bugs during excellent stress test runs. diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES deleted file mode 100644 index bc0025cdd1c9..000000000000 --- a/fs/cifs/CHANGES +++ /dev/null @@ -1,1065 +0,0 @@ -Version 1.62 ------------- -Add sockopt=TCP_NODELAY mount option. EA (xattr) routines hardened -to more strictly handle corrupt frames. - -Version 1.61 ------------- -Fix append problem to Samba servers (files opened with O_APPEND could -have duplicated data). Fix oops in cifs_lookup. Workaround problem -mounting to OS/400 Netserve. Fix oops in cifs_get_tcp_session. -Disable use of server inode numbers when server only -partially supports them (e.g. for one server querying inode numbers on -FindFirst fails but QPathInfo queries works). Fix oops with dfs in -cifs_put_smb_ses. Fix mmap to work on directio mounts (needed -for OpenOffice when on forcedirectio mount e.g.) - -Version 1.60 -------------- -Fix memory leak in reconnect. Fix oops in DFS mount error path. -Set s_maxbytes to smaller (the max that vfs can handle) so that -sendfile will now work over cifs mounts again. Add noforcegid -and noforceuid mount parameters. Fix small mem leak when using -ntlmv2. Fix 2nd mount to same server but with different port to -be allowed (rather than reusing the 1st port) - only when the -user explicitly overrides the port on the 2nd mount. - -Version 1.59 ------------- -Client uses server inode numbers (which are persistent) rather than -client generated ones by default (mount option "serverino" turned -on by default if server supports it). Add forceuid and forcegid -mount options (so that when negotiating unix extensions specifying -which uid mounted does not immediately force the server's reported -uids to be overridden). Add support for scope mount parm. Improve -hard link detection to use same inode for both. Do not set -read-only dos attribute on directories (for chmod) since Windows -explorer special cases this attribute bit for directories for -a different purpose. - -Version 1.58 ------------- -Guard against buffer overruns in various UCS-2 to UTF-8 string conversions -when the UTF-8 string is composed of unusually long (more than 4 byte) converted -characters. Add support for mounting root of a share which redirects immediately -to DFS target. Convert string conversion functions from Unicode to more -accurately mark string length before allocating memory (which may help the -rare cases where a UTF-8 string is much larger than the UCS2 string that -we converted from). Fix endianness of the vcnum field used during -session setup to distinguish multiple mounts to same server from different -userids. Raw NTLMSSP fixed (it requires /proc/fs/cifs/experimental -flag to be set to 2, and mount must enable krb5 to turn on extended security). -Performance of file create to Samba improved (posix create on lookup -removes 1 of 2 network requests sent on file create) - -Version 1.57 ------------- -Improve support for multiple security contexts to the same server. We -used to use the same "vcnumber" for all connections which could cause -the server to treat subsequent connections, especially those that -are authenticated as guest, as reconnections, invalidating the earlier -user's smb session. This fix allows cifs to mount multiple times to the -same server with different userids without risking invalidating earlier -established security contexts. fsync now sends SMB Flush operation -to better ensure that we wait for server to write all of the data to -server disk (not just write it over the network). Add new mount -parameter to allow user to disable sending the (slow) SMB flush on -fsync if desired (fsync still flushes all cached write data to the server). -Posix file open support added (turned off after one attempt if server -fails to support it properly, as with Samba server versions prior to 3.3.2) -Fix "redzone overwritten" bug in cifs_put_tcon (CIFSTcon may allocate too -little memory for the "nativeFileSystem" field returned by the server -during mount). Endian convert inode numbers if necessary (makes it easier -to compare inode numbers on network files from big endian systems). - -Version 1.56 ------------- -Add "forcemandatorylock" mount option to allow user to use mandatory -rather than posix (advisory) byte range locks, even though server would -support posix byte range locks. Fix query of root inode when prefixpath -specified and user does not have access to query information about the -top of the share. Fix problem in 2.6.28 resolving DFS paths to -Samba servers (worked to Windows). Fix rmdir so that pending search -(readdir) requests do not get invalid results which include the now -removed directory. Fix oops in cifs_dfs_ref.c when prefixpath is not reachable -when using DFS. Add better file create support to servers which support -the CIFS POSIX protocol extensions (this adds support for new flags -on create, and improves semantics for write of locked ranges). - -Version 1.55 ------------- -Various fixes to make delete of open files behavior more predictable -(when delete of an open file fails we mark the file as "delete-on-close" -in a way that more servers accept, but only if we can first rename the -file to a temporary name). Add experimental support for more safely -handling fcntl(F_SETLEASE). Convert cifs to using blocking tcp -sends, and also let tcp autotune the socket send and receive buffers. -This reduces the number of EAGAIN errors returned by TCP/IP in -high stress workloads (and the number of retries on socket writes -when sending large SMBWriteX requests). Fix case in which a portion of -data can in some cases not get written to the file on the server before the -file is closed. Fix DFS parsing to properly handle path consumed field, -and to handle certain codepage conversions better. Fix mount and -umount race that can cause oops in mount or umount or reconnect. - -Version 1.54 ------------- -Fix premature write failure on congested networks (we would give up -on EAGAIN from the socket too quickly on large writes). -Cifs_mkdir and cifs_create now respect the setgid bit on parent dir. -Fix endian problems in acl (mode from/to cifs acl) on bigendian -architectures. Fix problems with preserving timestamps on copying open -files (e.g. "cp -a") to Windows servers. For mkdir and create honor setgid bit -on parent directory when server supports Unix Extensions but not POSIX -create. Update cifs.upcall version to handle new Kerberos sec flags -(this requires update of cifs.upcall program from Samba). Fix memory leak -on dns_upcall (resolving DFS referralls). Fix plain text password -authentication (requires setting SecurityFlags to 0x30030 to enable -lanman and plain text though). Fix writes to be at correct offset when -file is open with O_APPEND and file is on a directio (forcediretio) mount. -Fix bug in rewinding readdir directory searches. Add nodfs mount option. - -Version 1.53 ------------- -DFS support added (Microsoft Distributed File System client support needed -for referrals which enable a hierarchical name space among servers). -Disable temporary caching of mode bits to servers which do not support -storing of mode (e.g. Windows servers, when client mounts without cifsacl -mount option) and add new "dynperm" mount option to enable temporary caching -of mode (enable old behavior). Fix hang on mount caused when server crashes -tcp session during negotiate protocol. - -Version 1.52 ------------- -Fix oops on second mount to server when null auth is used. -Enable experimental Kerberos support. Return writebehind errors on flush -and sync so that events like out of disk space get reported properly on -cached files. Fix setxattr failure to certain Samba versions. Fix mount -of second share to disconnected server session (autoreconnect on this). -Add ability to modify cifs acls for handling chmod (when mounted with -cifsacl flag). Fix prefixpath path separator so we can handle mounts -with prefixpaths longer than one directory (one path component) when -mounted to Windows servers. Fix slow file open when cifsacl -enabled. Fix memory leak in FindNext when the SMB call returns -EBADF. - - -Version 1.51 ------------- -Fix memory leak in statfs when mounted to very old servers (e.g. -Windows 9x). Add new feature "POSIX open" which allows servers -which support the current POSIX Extensions to provide better semantics -(e.g. delete for open files opened with posix open). Take into -account umask on posix mkdir not just older style mkdir. Add -ability to mount to IPC$ share (which allows CIFS named pipes to be -opened, read and written as if they were files). When 1st tree -connect fails (e.g. due to signing negotiation failure) fix -leak that causes cifsd not to stop and rmmod to fail to cleanup -cifs_request_buffers pool. Fix problem with POSIX Open/Mkdir on -bigendian architectures. Fix possible memory corruption when -EAGAIN returned on kern_recvmsg. Return better error if server -requires packet signing but client has disabled it. When mounted -with cifsacl mount option - mode bits are approximated based -on the contents of the ACL of the file or directory. When cifs -mount helper is missing convert make sure that UNC name -has backslash (not forward slash) between ip address of server -and the share name. - -Version 1.50 ------------- -Fix NTLMv2 signing. NFS server mounted over cifs works (if cifs mount is -done with "serverino" mount option). Add support for POSIX Unlink -(helps with certain sharing violation cases when server such as -Samba supports newer POSIX CIFS Protocol Extensions). Add "nounix" -mount option to allow disabling the CIFS Unix Extensions for just -that mount. Fix hang on spinlock in find_writable_file (race when -reopening file after session crash). Byte range unlock request to -windows server could unlock more bytes (on server copy of file) -than intended if start of unlock request is well before start of -a previous byte range lock that we issued. - -Version 1.49 ------------- -IPv6 support. Enable ipv6 addresses to be passed on mount (put the ipv6 -address after the "ip=" mount option, at least until mount.cifs is fixed to -handle DNS host to ipv6 name translation). Accept override of uid or gid -on mount even when Unix Extensions are negotiated (it used to be ignored -when Unix Extensions were ignored). This allows users to override the -default uid and gid for files when they are certain that the uids or -gids on the server do not match those of the client. Make "sec=none" -mount override username (so that null user connection is attempted) -to match what documentation said. Support for very large reads, over 127K, -available to some newer servers (such as Samba 3.0.26 and later but -note that it also requires setting CIFSMaxBufSize at module install -time to a larger value which may hurt performance in some cases). -Make sign option force signing (or fail if server does not support it). - -Version 1.48 ------------- -Fix mtime bouncing around from local idea of last write times to remote time. -Fix hang (in i_size_read) when simultaneous size update of same remote file -on smp system corrupts sequence number. Do not reread unnecessarily partial page -(which we are about to overwrite anyway) when writing out file opened rw. -When DOS attribute of file on non-Unix server's file changes on the server side -from read-only back to read-write, reflect this change in default file mode -(we had been leaving a file's mode read-only until the inode were reloaded). -Allow setting of attribute back to ATTR_NORMAL (removing readonly dos attribute -when archive dos attribute not set and we are changing mode back to writeable -on server which does not support the Unix Extensions). Remove read only dos -attribute on chmod when adding any write permission (ie on any of -user/group/other (not all of user/group/other ie 0222) when -mounted to windows. Add support for POSIX MkDir (slight performance -enhancement and eliminates the network race between the mkdir and set -path info of the mode). - - -Version 1.47 ------------- -Fix oops in list_del during mount caused by unaligned string. -Fix file corruption which could occur on some large file -copies caused by writepages page i/o completion bug. -Seek to SEEK_END forces check for update of file size for non-cached -files. Allow file size to be updated on remote extend of locally open, -non-cached file. Fix reconnect to newer Samba servers (or other servers -which support the CIFS Unix/POSIX extensions) so that we again tell the -server the Unix/POSIX cifs capabilities which we support (SetFSInfo). -Add experimental support for new POSIX Open/Mkdir (which returns -stat information on the open, and allows setting the mode). - -Version 1.46 ------------- -Support deep tree mounts. Better support OS/2, Win9x (DOS) time stamps. -Allow null user to be specified on mount ("username="). Do not return -EINVAL on readdir when filldir fails due to overwritten blocksize -(fixes FC problem). Return error in rename 2nd attempt retry (ie report -if rename by handle also fails, after rename by path fails, we were -not reporting whether the retry worked or not). Fix NTLMv2 to -work to Windows servers (mount with option "sec=ntlmv2"). - -Version 1.45 ------------- -Do not time out lockw calls when using posix extensions. Do not -time out requests if server still responding reasonably fast -on requests on other threads. Improve POSIX locking emulation, -(lock cancel now works, and unlock of merged range works even -to Windows servers now). Fix oops on mount to lanman servers -(win9x, os/2 etc.) when null password. Do not send listxattr -(SMB to query all EAs) if nouser_xattr specified. Fix SE Linux -problem (instantiate inodes/dentries in right order for readdir). - -Version 1.44 ------------- -Rewritten sessionsetup support, including support for legacy SMB -session setup needed for OS/2 and older servers such as Windows 95 and 98. -Fix oops on ls to OS/2 servers. Add support for level 1 FindFirst -so we can do search (ls etc.) to OS/2. Do not send NTCreateX -or recent levels of FindFirst unless server says it supports NT SMBs -(instead use legacy equivalents from LANMAN dialect). Fix to allow -NTLMv2 authentication support (now can use stronger password hashing -on mount if corresponding /proc/fs/cifs/SecurityFlags is set (0x4004). -Allow override of global cifs security flags on mount via "sec=" option(s). - -Version 1.43 ------------- -POSIX locking to servers which support CIFS POSIX Extensions -(disabled by default controlled by proc/fs/cifs/Experimental). -Handle conversion of long share names (especially Asian languages) -to Unicode during mount. Fix memory leak in sess struct on reconnect. -Fix rare oops after acpi suspend. Fix O_TRUNC opens to overwrite on -cifs open which helps rare case when setpathinfo fails or server does -not support it. - -Version 1.42 ------------- -Fix slow oplock break when mounted to different servers at the same time and -the tids match and we try to find matching fid on wrong server. Fix read -looping when signing required by server (2.6.16 kernel only). Fix readdir -vs. rename race which could cause each to hang. Return . and .. even -if server does not. Allow searches to skip first three entries and -begin at any location. Fix oops in find_writeable_file. - -Version 1.41 ------------- -Fix NTLMv2 security (can be enabled in /proc/fs/cifs) so customers can -configure stronger authentication. Fix sfu symlinks so they can -be followed (not just recognized). Fix wraparound of bcc on -read responses when buffer size over 64K and also fix wrap of -max smb buffer size when CIFSMaxBufSize over 64K. Fix oops in -cifs_user_read and cifs_readpages (when EAGAIN on send of smb -on socket is returned over and over). Add POSIX (advisory) byte range -locking support (requires server with newest CIFS UNIX Extensions -to the protocol implemented). Slow down negprot slightly in port 139 -RFC1001 case to give session_init time on buggy servers. - -Version 1.40 ------------- -Use fsuid (fsgid) more consistently instead of uid (gid). Improve performance -of readpages by eliminating one extra memcpy. Allow update of file size -from remote server even if file is open for write as long as mount is -directio. Recognize share mode security and send NTLM encrypted password -on tree connect if share mode negotiated. - -Version 1.39 ------------- -Defer close of a file handle slightly if pending writes depend on that handle -(this reduces the EBADF bad file handle errors that can be logged under heavy -stress on writes). Modify cifs Kconfig options to expose CONFIG_CIFS_STATS2 -Fix SFU style symlinks and mknod needed for servers which do not support the -CIFS Unix Extensions. Fix setfacl/getfacl on bigendian. Timeout negative -dentries so files that the client sees as deleted but that later get created -on the server will be recognized. Add client side permission check on setattr. -Timeout stuck requests better (where server has never responded or sent corrupt -responses) - -Version 1.38 ------------- -Fix tcp socket retransmission timeouts (e.g. on ENOSPACE from the socket) -to be smaller at first (but increasing) so large write performance performance -over GigE is better. Do not hang thread on illegal byte range lock response -from Windows (Windows can send an RFC1001 size which does not match smb size) by -allowing an SMBs TCP length to be up to a few bytes longer than it should be. -wsize and rsize can now be larger than negotiated buffer size if server -supports large readx/writex, even when directio mount flag not specified. -Write size will in many cases now be 16K instead of 4K which greatly helps -file copy performance on lightly loaded networks. Fix oops in dnotify -when experimental config flag enabled. Make cifsFYI more granular. - -Version 1.37 ------------- -Fix readdir caching when unlink removes file in current search buffer, -and this is followed by a rewind search to just before the deleted entry. -Do not attempt to set ctime unless atime and/or mtime change requested -(most servers throw it away anyway). Fix length check of received smbs -to be more accurate. Fix big endian problem with mapchars mount option, -and with a field returned by statfs. - -Version 1.36 ------------- -Add support for mounting to older pre-CIFS servers such as Windows9x and ME. -For these older servers, add option for passing netbios name of server in -on mount (servernetbiosname). Add suspend support for power management, to -avoid cifsd thread preventing software suspend from working. -Add mount option for disabling the default behavior of sending byte range lock -requests to the server (necessary for certain applications which break with -mandatory lock behavior such as Evolution), and also mount option for -requesting case insensitive matching for path based requests (requesting -case sensitive is the default). - -Version 1.35 ------------- -Add writepage performance improvements. Fix path name conversions -for long filenames on mounts which were done with "mapchars" mount option -specified. Ensure multiplex ids do not collide. Fix case in which -rmmod can oops if done soon after last unmount. Fix truncated -search (readdir) output when resume filename was a long filename. -Fix filename conversion when mapchars mount option was specified and -filename was a long filename. - -Version 1.34 ------------- -Fix error mapping of the TOO_MANY_LINKS (hardlinks) case. -Do not oops if root user kills cifs oplock kernel thread or -kills the cifsd thread (NB: killing the cifs kernel threads is not -recommended, unmount and rmmod cifs will kill them when they are -no longer needed). Fix readdir to ASCII servers (ie older servers -which do not support Unicode) and also require asterisk. -Fix out of memory case in which data could be written one page -off in the page cache. - -Version 1.33 ------------- -Fix caching problem, in which readdir of directory containing a file -which was cached could cause the file's time stamp to be updated -without invalidating the readahead data (so we could get stale -file data on the client for that file even as the server copy changed). -Cleanup response processing so cifsd can not loop when abnormally -terminated. - - -Version 1.32 ------------- -Fix oops in ls when Transact2 FindFirst (or FindNext) returns more than one -transact response for an SMB request and search entry split across two frames. -Add support for lsattr (getting ext2/ext3/reiserfs attr flags from the server) -as new protocol extensions. Do not send Get/Set calls for POSIX ACLs -unless server explicitly claims to support them in CIFS Unix extensions -POSIX ACL capability bit. Fix packet signing when multiuser mounting with -different users from the same client to the same server. Fix oops in -cifs_close. Add mount option for remapping reserved characters in -filenames (also allow recognizing files with created by SFU which have any -of these seven reserved characters, except backslash, to be recognized). -Fix invalid transact2 message (we were sometimes trying to interpret -oplock breaks as SMB responses). Add ioctl for checking that the -current uid matches the uid of the mounter (needed by umount.cifs). -Reduce the number of large buffer allocations in cifs response processing -(significantly reduces memory pressure under heavy stress with multiple -processes accessing the same server at the same time). - -Version 1.31 ------------- -Fix updates of DOS attributes and time fields so that files on NT4 servers -do not get marked delete on close. Display sizes of cifs buffer pools in -cifs stats. Fix oops in unmount when cifsd thread being killed by -shutdown. Add generic readv/writev and aio support. Report inode numbers -consistently in readdir and lookup (when serverino mount option is -specified use the inode number that the server reports - for both lookup -and readdir, otherwise by default the locally generated inode number is used -for inodes created in either path since servers are not always able to -provide unique inode numbers when exporting multiple volumes from under one -sharename). - -Version 1.30 ------------- -Allow new nouser_xattr mount parm to disable xattr support for user namespace. -Do not flag user_xattr mount parm in dmesg. Retry failures setting file time -(mostly affects NT4 servers) by retry with handle based network operation. -Add new POSIX Query FS Info for returning statfs info more accurately. -Handle passwords with multiple commas in them. - -Version 1.29 ------------- -Fix default mode in sysfs of cifs module parms. Remove old readdir routine. -Fix capabilities flags for large readx so as to allow reads larger than 64K. - -Version 1.28 ------------- -Add module init parm for large SMB buffer size (to allow it to be changed -from its default of 16K) which is especially useful for large file copy -when mounting with the directio mount option. Fix oops after -returning from mount when experimental ExtendedSecurity enabled and -SpnegoNegotiated returning invalid error. Fix case to retry better when -peek returns from 1 to 3 bytes on socket which should have more data. -Fixed path based calls (such as cifs lookup) to handle path names -longer than 530 (now can handle PATH_MAX). Fix pass through authentication -from Samba server to DC (Samba required dummy LM password). - -Version 1.27 ------------- -Turn off DNOTIFY (directory change notification support) by default -(unless built with the experimental flag) to fix hang with KDE -file browser. Fix DNOTIFY flag mappings. Fix hang (in wait_event -waiting on an SMB response) in SendReceive when session dies but -reconnects quickly from another task. Add module init parms for -minimum number of large and small network buffers in the buffer pools, -and for the maximum number of simultaneous requests. - -Version 1.26 ------------- -Add setfacl support to allow setting of ACLs remotely to Samba 3.10 and later -and other POSIX CIFS compliant servers. Fix error mapping for getfacl -to EOPNOTSUPP when server does not support posix acls on the wire. Fix -improperly zeroed buffer in CIFS Unix extensions set times call. - -Version 1.25 ------------- -Fix internationalization problem in cifs readdir with filenames that map to -longer UTF-8 strings than the string on the wire was in Unicode. Add workaround -for readdir to netapp servers. Fix search rewind (seek into readdir to return -non-consecutive entries). Do not do readdir when server negotiates -buffer size to small to fit filename. Add support for reading POSIX ACLs from -the server (add also acl and noacl mount options). - -Version 1.24 ------------- -Optionally allow using server side inode numbers, rather than client generated -ones by specifying mount option "serverino" - this is required for some apps -to work which double check hardlinked files and have persistent inode numbers. - -Version 1.23 ------------- -Multiple bigendian fixes. On little endian systems (for reconnect after -network failure) fix tcp session reconnect code so we do not try first -to reconnect on reverse of port 445. Treat reparse points (NTFS junctions) -as directories rather than symlinks because we can do follow link on them. - -Version 1.22 ------------- -Add config option to enable XATTR (extended attribute) support, mapping -xattr names in the "user." namespace space to SMB/CIFS EAs. Lots of -minor fixes pointed out by the Stanford SWAT checker (mostly missing -or out of order NULL pointer checks in little used error paths). - -Version 1.21 ------------- -Add new mount parm to control whether mode check (generic_permission) is done -on the client. If Unix extensions are enabled and the uids on the client -and server do not match, client permission checks are meaningless on -server uids that do not exist on the client (this does not affect the -normal ACL check which occurs on the server). Fix default uid -on mknod to match create and mkdir. Add optional mount parm to allow -override of the default uid behavior (in which the server sets the uid -and gid of newly created files). Normally for network filesystem mounts -user want the server to set the uid/gid on newly created files (rather than -using uid of the client processes you would in a local filesystem). - -Version 1.20 ------------- -Make transaction counts more consistent. Merge /proc/fs/cifs/SimultaneousOps -info into /proc/fs/cifs/DebugData. Fix oops in rare oops in readdir -(in build_wildcard_path_from_dentry). Fix mknod to pass type field -(block/char/fifo) properly. Remove spurious mount warning log entry when -credentials passed as mount argument. Set major/minor device number in -inode for block and char devices when unix extensions enabled. - -Version 1.19 ------------- -Fix /proc/fs/cifs/Stats and DebugData display to handle larger -amounts of return data. Properly limit requests to MAX_REQ (50 -is the usual maximum active multiplex SMB/CIFS requests per server). -Do not kill cifsd (and thus hurt the other SMB session) when more than one -session to the same server (but with different userids) exists and one -of the two user's smb sessions is being removed while leaving the other. -Do not loop reconnecting in cifsd demultiplex thread when admin -kills the thread without going through unmount. - -Version 1.18 ------------- -Do not rename hardlinked files (since that should be a noop). Flush -cached write behind data when reopening a file after session abend, -except when already in write. Grab per socket sem during reconnect -to avoid oops in sendmsg if overlapping with reconnect. Do not -reset cached inode file size on readdir for files open for write on -client. - - -Version 1.17 ------------- -Update number of blocks in file so du command is happier (in Linux a fake -blocksize of 512 is required for calculating number of blocks in inode). -Fix prepare write of partial pages to read in data from server if possible. -Fix race on tcpStatus field between unmount and reconnection code, causing -cifsd process sometimes to hang around forever. Improve out of memory -checks in cifs_filldir - -Version 1.16 ------------- -Fix incorrect file size in file handle based setattr on big endian hardware. -Fix oops in build_path_from_dentry when out of memory. Add checks for invalid -and closing file structs in writepage/partialpagewrite. Add statistics -for each mounted share (new menuconfig option). Fix endianness problem in -volume information displayed in /proc/fs/cifs/DebugData (only affects -affects big endian architectures). Prevent renames while constructing -path names for open, mkdir and rmdir. - -Version 1.15 ------------- -Change to mempools for alloc smb request buffers and multiplex structs -to better handle low memory problems (and potential deadlocks). - -Version 1.14 ------------- -Fix incomplete listings of large directories on Samba servers when Unix -extensions enabled. Fix oops when smb_buffer can not be allocated. Fix -rename deadlock when writing out dirty pages at same time. - -Version 1.13 ------------- -Fix open of files in which O_CREATE can cause the mode to change in -some cases. Fix case in which retry of write overlaps file close. -Fix PPC64 build error. Reduce excessive stack usage in smb password -hashing. Fix overwrite of Linux user's view of file mode to Windows servers. - -Version 1.12 ------------- -Fixes for large file copy, signal handling, socket retry, buffer -allocation and low memory situations. - -Version 1.11 ------------- -Better port 139 support to Windows servers (RFC1001/RFC1002 Session_Initialize) -also now allowing support for specifying client netbiosname. NT4 support added. - -Version 1.10 ------------- -Fix reconnection (and certain failed mounts) to properly wake up the -blocked users thread so it does not seem hung (in some cases was blocked -until the cifs receive timeout expired). Fix spurious error logging -to kernel log when application with open network files killed. - -Version 1.09 ------------- -Fix /proc/fs module unload warning message (that could be logged -to the kernel log). Fix intermittent failure in connectathon -test7 (hardlink count not immediately refreshed in case in which -inode metadata can be incorrectly kept cached when time near zero) - -Version 1.08 ------------- -Allow file_mode and dir_mode (specified at mount time) to be enforced -locally (the server already enforced its own ACLs too) for servers -that do not report the correct mode (do not support the -CIFS Unix Extensions). - -Version 1.07 ------------- -Fix some small memory leaks in some unmount error paths. Fix major leak -of cache pages in readpages causing multiple read oriented stress -testcases (including fsx, and even large file copy) to fail over time. - -Version 1.06 ------------- -Send NTCreateX with ATTR_POSIX if Linux/Unix extensions negotiated with server. -This allows files that differ only in case and improves performance of file -creation and file open to such servers. Fix semaphore conflict which causes -slow delete of open file to Samba (which unfortunately can cause an oplock -break to self while vfs_unlink held i_sem) which can hang for 20 seconds. - -Version 1.05 ------------- -fixes to cifs_readpages for fsx test case - -Version 1.04 ------------- -Fix caching data integrity bug when extending file size especially when no -oplock on file. Fix spurious logging of valid already parsed mount options -that are parsed outside of the cifs vfs such as nosuid. - - -Version 1.03 ------------- -Connect to server when port number override not specified, and tcp port -unitialized. Reset search to restart at correct file when kernel routine -filldir returns error during large directory searches (readdir). - -Version 1.02 ------------- -Fix caching problem when files opened by multiple clients in which -page cache could contain stale data, and write through did -not occur often enough while file was still open when read ahead -(read oplock) not allowed. Treat "sep=" when first mount option -as an override of comma as the default separator between mount -options. - -Version 1.01 ------------- -Allow passwords longer than 16 bytes. Allow null password string. - -Version 1.00 ------------- -Gracefully clean up failed mounts when attempting to mount to servers such as -Windows 98 that terminate tcp sessions during protocol negotiation. Handle -embedded commas in mount parsing of passwords. - -Version 0.99 ------------- -Invalidate local inode cached pages on oplock break and when last file -instance is closed so that the client does not continue using stale local -copy rather than later modified server copy of file. Do not reconnect -when server drops the tcp session prematurely before negotiate -protocol response. Fix oops in reopen_file when dentry freed. Allow -the support for CIFS Unix Extensions to be disabled via proc interface. - -Version 0.98 ------------- -Fix hang in commit_write during reconnection of open files under heavy load. -Fix unload_nls oops in a mount failure path. Serialize writes to same socket -which also fixes any possible races when cifs signatures are enabled in SMBs -being sent out of signature sequence number order. - -Version 0.97 ------------- -Fix byte range locking bug (endian problem) causing bad offset and -length. - -Version 0.96 ------------- -Fix oops (in send_sig) caused by CIFS unmount code trying to -wake up the demultiplex thread after it had exited. Do not log -error on harmless oplock release of closed handle. - -Version 0.95 ------------- -Fix unsafe global variable usage and password hash failure on gcc 3.3.1 -Fix problem reconnecting secondary mounts to same server after session -failure. Fix invalid dentry - race in mkdir when directory gets created -by another client between the lookup and mkdir. - -Version 0.94 ------------- -Fix to list processing in reopen_files. Fix reconnection when server hung -but tcpip session still alive. Set proper timeout on socket read. - -Version 0.93 ------------- -Add missing mount options including iocharset. SMP fixes in write and open. -Fix errors in reconnecting after TCP session failure. Fix module unloading -of default nls codepage - -Version 0.92 ------------- -Active smb transactions should never go negative (fix double FreeXid). Fix -list processing in file routines. Check return code on kmalloc in open. -Fix spinlock usage for SMP. - -Version 0.91 ------------- -Fix oops in reopen_files when invalid dentry. drop dentry on server rename -and on revalidate errors. Fix cases where pid is now tgid. Fix return code -on create hard link when server does not support them. - -Version 0.90 ------------- -Fix scheduling while atomic error in getting inode info on newly created file. -Fix truncate of existing files opened with O_CREAT but not O_TRUNC set. - -Version 0.89 ------------- -Fix oops on write to dead tcp session. Remove error log write for case when file open -O_CREAT but not O_EXCL - -Version 0.88 ------------- -Fix non-POSIX behavior on rename of open file and delete of open file by taking -advantage of trans2 SetFileInfo rename facility if available on target server. -Retry on ENOSPC and EAGAIN socket errors. - -Version 0.87 ------------- -Fix oops on big endian readdir. Set blksize to be even power of two (2**blkbits) to fix -allocation size miscalculation. After oplock token lost do not read through -cache. - -Version 0.86 ------------- -Fix oops on empty file readahead. Fix for file size handling for locally cached files. - -Version 0.85 ------------- -Fix oops in mkdir when server fails to return inode info. Fix oops in reopen_files -during auto reconnection to server after server recovered from failure. - -Version 0.84 ------------- -Finish support for Linux 2.5 open/create changes, which removes the -redundant NTCreate/QPathInfo/close that was sent during file create. -Enable oplock by default. Enable packet signing by default (needed to -access many recent Windows servers) - -Version 0.83 ------------- -Fix oops when mounting to long server names caused by inverted parms to kmalloc. -Fix MultiuserMount (/proc/fs/cifs configuration setting) so that when enabled -we will choose a cifs user session (smb uid) that better matches the local -uid if a) the mount uid does not match the current uid and b) we have another -session to the same server (ip address) for a different mount which -matches the current local uid. - -Version 0.82 ------------- -Add support for mknod of block or character devices. Fix oplock -code (distributed caching) to properly send response to oplock -break from server. - -Version 0.81 ------------- -Finish up CIFS packet digital signing for the default -NTLM security case. This should help Windows 2003 -network interoperability since it is common for -packet signing to be required now. Fix statfs (stat -f) -which recently started returning errors due to -invalid value (-1 instead of 0) being set in the -struct kstatfs f_ffiles field. - -Version 0.80 ------------ -Fix oops on stopping oplock thread when removing cifs when -built as module. - -Version 0.79 ------------- -Fix mount options for ro (readonly), uid, gid and file and directory mode. - -Version 0.78 ------------- -Fix errors displayed on failed mounts to be more understandable. -Fixed various incorrect or misleading smb to posix error code mappings. - -Version 0.77 ------------- -Fix display of NTFS DFS junctions to display as symlinks. -They are the network equivalent. Fix oops in -cifs_partialpagewrite caused by missing spinlock protection -of openfile linked list. Allow writebehind caching errors to -be returned to the application at file close. - -Version 0.76 ------------- -Clean up options displayed in /proc/mounts by show_options to -be more consistent with other filesystems. - -Version 0.75 ------------- -Fix delete of readonly file to Windows servers. Reflect -presence or absence of read only dos attribute in mode -bits for servers that do not support CIFS Unix extensions. -Fix shortened results on readdir of large directories to -servers supporting CIFS Unix extensions (caused by -incorrect resume key). - -Version 0.74 ------------- -Fix truncate bug (set file size) that could cause hangs e.g. running fsx - -Version 0.73 ------------- -unload nls if mount fails. - -Version 0.72 ------------- -Add resume key support to search (readdir) code to workaround -Windows bug. Add /proc/fs/cifs/LookupCacheEnable which -allows disabling caching of attribute information for -lookups. - -Version 0.71 ------------- -Add more oplock handling (distributed caching code). Remove -dead code. Remove excessive stack space utilization from -symlink routines. - -Version 0.70 ------------- -Fix oops in get dfs referral (triggered when null path sent in to -mount). Add support for overriding rsize at mount time. - -Version 0.69 ------------- -Fix buffer overrun in readdir which caused intermittent kernel oopses. -Fix writepage code to release kmap on write data. Allow "-ip=" new -mount option to be passed in on parameter distinct from the first part -(server name portion of) the UNC name. Allow override of the -tcp port of the target server via new mount option "-port=" - -Version 0.68 ------------- -Fix search handle leak on rewind. Fix setuid and gid so that they are -reflected in the local inode immediately. Cleanup of whitespace -to make 2.4 and 2.5 versions more consistent. - - -Version 0.67 ------------- -Fix signal sending so that captive thread (cifsd) exits on umount -(which was causing the warning in kmem_cache_free of the request buffers -at rmmod time). This had broken as a sideeffect of the recent global -kernel change to daemonize. Fix memory leak in readdir code which -showed up in "ls -R" (and applications that did search rewinding). - -Version 0.66 ------------- -Reconnect tids and fids after session reconnection (still do not -reconnect byte range locks though). Fix problem caching -lookup information for directory inodes, improving performance, -especially in deep directory trees. Fix various build warnings. - -Version 0.65 ------------- -Finish fixes to commit write for caching/readahead consistency. fsx -now works to Samba servers. Fix oops caused when readahead -was interrupted by a signal. - -Version 0.64 ------------- -Fix data corruption (in partial page after truncate) that caused fsx to -fail to Windows servers. Cleaned up some extraneous error logging in -common error paths. Add generic sendfile support. - -Version 0.63 ------------- -Fix memory leak in AllocMidQEntry. -Finish reconnection logic, so connection with server can be dropped -(or server rebooted) and the cifs client will reconnect. - -Version 0.62 ------------- -Fix temporary socket leak when bad userid or password specified -(or other SMBSessSetup failure). Increase maximum buffer size to slightly -over 16K to allow negotiation of up to Samba and Windows server default read -sizes. Add support for readpages - -Version 0.61 ------------- -Fix oops when username not passed in on mount. Extensive fixes and improvements -to error logging (strip redundant newlines, change debug macros to ensure newline -passed in and to be more consistent). Fix writepage wrong file handle problem, -a readonly file handle could be incorrectly used to attempt to write out -file updates through the page cache to multiply open files. This could cause -the iozone benchmark to fail on the fwrite test. Fix bug mounting two different -shares to the same Windows server when using different usernames -(doing this to Samba servers worked but Windows was rejecting it) - now it is -possible to use different userids when connecting to the same server from a -Linux client. Fix oops when treeDisconnect called during unmount on -previously freed socket. - -Version 0.60 ------------- -Fix oops in readpages caused by not setting address space operations in inode in -rare code path. - -Version 0.59 ------------- -Includes support for deleting of open files and renaming over existing files (per POSIX -requirement). Add readlink support for Windows junction points (directory symlinks). - -Version 0.58 ------------- -Changed read and write to go through pagecache. Added additional address space operations. -Memory mapped operations now working. - -Version 0.57 ------------- -Added writepage code for additional memory mapping support. Fixed leak in xids causing -the simultaneous operations counter (/proc/fs/cifs/SimultaneousOps) to increase on -every stat call. Additional formatting cleanup. - -Version 0.56 ------------- -Fix bigendian bug in order of time conversion. Merge 2.5 to 2.4 version. Formatting cleanup. - -Version 0.55 ------------- -Fixes from Zwane Mwaikambo for adding missing return code checking in a few places. -Also included a modified version of his fix to protect global list manipulation of -the smb session and tree connection and mid related global variables. - -Version 0.54 ------------- -Fix problem with captive thread hanging around at unmount time. Adjust to 2.5.42-pre -changes to superblock layout. Remove wasteful allocation of smb buffers (now the send -buffer is reused for responses). Add more oplock handling. Additional minor cleanup. - -Version 0.53 ------------- -More stylistic updates to better match kernel style. Add additional statistics -for filesystem which can be viewed via /proc/fs/cifs. Add more pieces of NTLMv2 -and CIFS Packet Signing enablement. - -Version 0.52 ------------- -Replace call to sleep_on with safer wait_on_event. -Make stylistic changes to better match kernel style recommendations. -Remove most typedef usage (except for the PDUs themselves). - -Version 0.51 ------------- -Update mount so the -unc mount option is no longer required (the ip address can be specified -in a UNC style device name. Implementation of readpage/writepage started. - -Version 0.50 ------------- -Fix intermittent problem with incorrect smb header checking on badly -fragmented tcp responses - -Version 0.49 ------------- -Fixes to setting of allocation size and file size. - -Version 0.48 ------------- -Various 2.5.38 fixes. Now works on 2.5.38 - -Version 0.47 ------------- -Prepare for 2.5 kernel merge. Remove ifdefs. - -Version 0.46 ------------- -Socket buffer management fixes. Fix dual free. - -Version 0.45 ------------- -Various big endian fixes for hardlinks and symlinks and also for dfs. - -Version 0.44 ------------- -Various big endian fixes for servers with Unix extensions such as Samba - -Version 0.43 ------------- -Various FindNext fixes for incorrect filenames on large directory searches on big endian -clients. basic posix file i/o tests now work on big endian machines, not just le - -Version 0.42 ------------- -SessionSetup and NegotiateProtocol now work from Big Endian machines. -Various Big Endian fixes found during testing on the Linux on 390. Various fixes for compatibility with older -versions of 2.4 kernel (now builds and works again on kernels at least as early as 2.4.7). - -Version 0.41 ------------- -Various minor fixes for Connectathon Posix "basic" file i/o test suite. Directory caching fixed so hardlinked -files now return the correct number of links on fstat as they are repeatedly linked and unlinked. - -Version 0.40 ------------- -Implemented "Raw" (i.e. not encapsulated in SPNEGO) NTLMSSP (i.e. the Security Provider Interface used to negotiate -session advanced session authentication). Raw NTLMSSP is preferred by Windows 2000 Professional and Windows XP. -Began implementing support for SPNEGO encapsulation of NTLMSSP based session authentication blobs -(which is the mechanism preferred by Windows 2000 server in the absence of Kerberos). - -Version 0.38 ------------- -Introduced optional mount helper utility mount.cifs and made coreq changes to cifs vfs to enable -it. Fixed a few bugs in the DFS code (e.g. bcc two bytes too short and incorrect uid in PDU). - -Version 0.37 ------------- -Rewrote much of connection and mount/unmount logic to handle bugs with -multiple uses to same share, multiple users to same server etc. - -Version 0.36 ------------- -Fixed major problem with dentry corruption (missing call to dput) - -Version 0.35 ------------- -Rewrite of readdir code to fix bug. Various fixes for bigendian machines. -Begin adding oplock support. Multiusermount and oplockEnabled flags added to /proc/fs/cifs -although corresponding function not fully implemented in the vfs yet - -Version 0.34 ------------- -Fixed dentry caching bug, misc. cleanup - -Version 0.33 ------------- -Fixed 2.5 support to handle build and configure changes as well as misc. 2.5 changes. Now can build -on current 2.5 beta version (2.5.24) of the Linux kernel as well as on 2.4 Linux kernels. -Support for STATUS codes (newer 32 bit NT error codes) added. DFS support begun to be added. - -Version 0.32 ------------- -Unix extensions (symlink, readlink, hardlink, chmod and some chgrp and chown) implemented -and tested against Samba 2.2.5 - - -Version 0.31 ------------- -1) Fixed lockrange to be correct (it was one byte too short) - -2) Fixed GETLK (i.e. the fcntl call to test a range of bytes in a file to see if locked) to correctly -show range as locked when there is a conflict with an existing lock. - -3) default file perms are now 2767 (indicating support for mandatory locks) instead of 777 for directories -in most cases. Eventually will offer optional ability to query server for the correct perms. - -3) Fixed eventual trap when mounting twice to different shares on the same server when the first succeeded -but the second one was invalid and failed (the second one was incorrectly disconnecting the tcp and smb -session) - -4) Fixed error logging of valid mount options - -5) Removed logging of password field. - -6) Moved negotiate, treeDisconnect and uloggoffX (only tConx and SessSetup remain in connect.c) to cifssmb.c -and cleaned them up and made them more consistent with other cifs functions. - -7) Server support for Unix extensions is now fully detected and FindFirst is implemented both ways -(with or without Unix extensions) but FindNext and QueryPathInfo with the Unix extensions are not completed, -nor is the symlink support using the Unix extensions - -8) Started adding the readlink and follow_link code - -Version 0.3 ------------ -Initial drop - diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index aa0d68b086eb..1964d212ab08 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_CIFS) += cifs.o cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \ link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \ cifs_unicode.o nterr.o xattr.o cifsencrypt.o \ - readdir.o ioctl.o sess.o export.o smb1ops.o + readdir.o ioctl.o sess.o export.o smb1ops.o winucase.o cifs-$(CONFIG_CIFS_ACL) += cifsacl.o diff --git a/fs/cifs/README b/fs/cifs/README deleted file mode 100644 index 2d5622f60e11..000000000000 --- a/fs/cifs/README +++ /dev/null @@ -1,753 +0,0 @@ -The CIFS VFS support for Linux supports many advanced network filesystem -features such as hierarchical dfs like namespace, hardlinks, locking and more. -It was designed to comply with the SNIA CIFS Technical Reference (which -supersedes the 1992 X/Open SMB Standard) as well as to perform best practice -practical interoperability with Windows 2000, Windows XP, Samba and equivalent -servers. This code was developed in participation with the Protocol Freedom -Information Foundation. - -Please see - http://protocolfreedom.org/ and - http://samba.org/samba/PFIF/ -for more details. - - -For questions or bug reports please contact: - sfrench@samba.org (sfrench@us.ibm.com) - -Build instructions: -================== -For Linux 2.4: -1) Get the kernel source (e.g.from http://www.kernel.org) -and download the cifs vfs source (see the project page -at http://us1.samba.org/samba/Linux_CIFS_client.html) -and change directory into the top of the kernel directory -then patch the kernel (e.g. "patch -p1 < cifs_24.patch") -to add the cifs vfs to your kernel configure options if -it has not already been added (e.g. current SuSE and UL -users do not need to apply the cifs_24.patch since the cifs vfs is -already in the kernel configure menu) and then -mkdir linux/fs/cifs and then copy the current cifs vfs files from -the cifs download to your kernel build directory e.g. - - cp <cifs_download_dir>/fs/cifs/* to <kernel_download_dir>/fs/cifs - -2) make menuconfig (or make xconfig) -3) select cifs from within the network filesystem choices -4) save and exit -5) make dep -6) make modules (or "make" if CIFS VFS not to be built as a module) - -For Linux 2.6: -1) Download the kernel (e.g. from http://www.kernel.org) -and change directory into the top of the kernel directory tree -(e.g. /usr/src/linux-2.5.73) -2) make menuconfig (or make xconfig) -3) select cifs from within the network filesystem choices -4) save and exit -5) make - - -Installation instructions: -========================= -If you have built the CIFS vfs as module (successfully) simply -type "make modules_install" (or if you prefer, manually copy the file to -the modules directory e.g. /lib/modules/2.4.10-4GB/kernel/fs/cifs/cifs.o). - -If you have built the CIFS vfs into the kernel itself, follow the instructions -for your distribution on how to install a new kernel (usually you -would simply type "make install"). - -If you do not have the utility mount.cifs (in the Samba 3.0 source tree and on -the CIFS VFS web site) copy it to the same directory in which mount.smbfs and -similar files reside (usually /sbin). Although the helper software is not -required, mount.cifs is recommended. Eventually the Samba 3.0 utility program -"net" may also be helpful since it may someday provide easier mount syntax for -users who are used to Windows e.g. - net use <mount point> <UNC name or cifs URL> -Note that running the Winbind pam/nss module (logon service) on all of your -Linux clients is useful in mapping Uids and Gids consistently across the -domain to the proper network user. The mount.cifs mount helper can be -trivially built from Samba 3.0 or later source e.g. by executing: - - gcc samba/source/client/mount.cifs.c -o mount.cifs - -If cifs is built as a module, then the size and number of network buffers -and maximum number of simultaneous requests to one server can be configured. -Changing these from their defaults is not recommended. By executing modinfo - modinfo kernel/fs/cifs/cifs.ko -on kernel/fs/cifs/cifs.ko the list of configuration changes that can be made -at module initialization time (by running insmod cifs.ko) can be seen. - -Allowing User Mounts -==================== -To permit users to mount and unmount over directories they own is possible -with the cifs vfs. A way to enable such mounting is to mark the mount.cifs -utility as suid (e.g. "chmod +s /sbin/mount.cifs). To enable users to -umount shares they mount requires -1) mount.cifs version 1.4 or later -2) an entry for the share in /etc/fstab indicating that a user may -unmount it e.g. -//server/usersharename /mnt/username cifs user 0 0 - -Note that when the mount.cifs utility is run suid (allowing user mounts), -in order to reduce risks, the "nosuid" mount flag is passed in on mount to -disallow execution of an suid program mounted on the remote target. -When mount is executed as root, nosuid is not passed in by default, -and execution of suid programs on the remote target would be enabled -by default. This can be changed, as with nfs and other filesystems, -by simply specifying "nosuid" among the mount options. For user mounts -though to be able to pass the suid flag to mount requires rebuilding -mount.cifs with the following flag: - - gcc samba/source/client/mount.cifs.c -DCIFS_ALLOW_USR_SUID -o mount.cifs - -There is a corresponding manual page for cifs mounting in the Samba 3.0 and -later source tree in docs/manpages/mount.cifs.8 - -Allowing User Unmounts -====================== -To permit users to ummount directories that they have user mounted (see above), -the utility umount.cifs may be used. It may be invoked directly, or if -umount.cifs is placed in /sbin, umount can invoke the cifs umount helper -(at least for most versions of the umount utility) for umount of cifs -mounts, unless umount is invoked with -i (which will avoid invoking a umount -helper). As with mount.cifs, to enable user unmounts umount.cifs must be marked -as suid (e.g. "chmod +s /sbin/umount.cifs") or equivalent (some distributions -allow adding entries to a file to the /etc/permissions file to achieve the -equivalent suid effect). For this utility to succeed the target path -must be a cifs mount, and the uid of the current user must match the uid -of the user who mounted the resource. - -Also note that the customary way of allowing user mounts and unmounts is -(instead of using mount.cifs and unmount.cifs as suid) to add a line -to the file /etc/fstab for each //server/share you wish to mount, but -this can become unwieldy when potential mount targets include many -or unpredictable UNC names. - -Samba Considerations -==================== -To get the maximum benefit from the CIFS VFS, we recommend using a server that -supports the SNIA CIFS Unix Extensions standard (e.g. Samba 2.2.5 or later or -Samba 3.0) but the CIFS vfs works fine with a wide variety of CIFS servers. -Note that uid, gid and file permissions will display default values if you do -not have a server that supports the Unix extensions for CIFS (such as Samba -2.2.5 or later). To enable the Unix CIFS Extensions in the Samba server, add -the line: - - unix extensions = yes - -to your smb.conf file on the server. Note that the following smb.conf settings -are also useful (on the Samba server) when the majority of clients are Unix or -Linux: - - case sensitive = yes - delete readonly = yes - ea support = yes - -Note that server ea support is required for supporting xattrs from the Linux -cifs client, and that EA support is present in later versions of Samba (e.g. -3.0.6 and later (also EA support works in all versions of Windows, at least to -shares on NTFS filesystems). Extended Attribute (xattr) support is an optional -feature of most Linux filesystems which may require enabling via -make menuconfig. Client support for extended attributes (user xattr) can be -disabled on a per-mount basis by specifying "nouser_xattr" on mount. - -The CIFS client can get and set POSIX ACLs (getfacl, setfacl) to Samba servers -version 3.10 and later. Setting POSIX ACLs requires enabling both XATTR and -then POSIX support in the CIFS configuration options when building the cifs -module. POSIX ACL support can be disabled on a per mount basic by specifying -"noacl" on mount. - -Some administrators may want to change Samba's smb.conf "map archive" and -"create mask" parameters from the default. Unless the create mask is changed -newly created files can end up with an unnecessarily restrictive default mode, -which may not be what you want, although if the CIFS Unix extensions are -enabled on the server and client, subsequent setattr calls (e.g. chmod) can -fix the mode. Note that creating special devices (mknod) remotely -may require specifying a mkdev function to Samba if you are not using -Samba 3.0.6 or later. For more information on these see the manual pages -("man smb.conf") on the Samba server system. Note that the cifs vfs, -unlike the smbfs vfs, does not read the smb.conf on the client system -(the few optional settings are passed in on mount via -o parameters instead). -Note that Samba 2.2.7 or later includes a fix that allows the CIFS VFS to delete -open files (required for strict POSIX compliance). Windows Servers already -supported this feature. Samba server does not allow symlinks that refer to files -outside of the share, so in Samba versions prior to 3.0.6, most symlinks to -files with absolute paths (ie beginning with slash) such as: - ln -s /mnt/foo bar -would be forbidden. Samba 3.0.6 server or later includes the ability to create -such symlinks safely by converting unsafe symlinks (ie symlinks to server -files that are outside of the share) to a samba specific format on the server -that is ignored by local server applications and non-cifs clients and that will -not be traversed by the Samba server). This is opaque to the Linux client -application using the cifs vfs. Absolute symlinks will work to Samba 3.0.5 or -later, but only for remote clients using the CIFS Unix extensions, and will -be invisbile to Windows clients and typically will not affect local -applications running on the same server as Samba. - -Use instructions: -================ -Once the CIFS VFS support is built into the kernel or installed as a module -(cifs.o), you can use mount syntax like the following to access Samba or Windows -servers: - - mount -t cifs //9.53.216.11/e$ /mnt -o user=myname,pass=mypassword - -Before -o the option -v may be specified to make the mount.cifs -mount helper display the mount steps more verbosely. -After -o the following commonly used cifs vfs specific options -are supported: - - user=<username> - pass=<password> - domain=<domain name> - -Other cifs mount options are described below. Use of TCP names (in addition to -ip addresses) is available if the mount helper (mount.cifs) is installed. If -you do not trust the server to which are mounted, or if you do not have -cifs signing enabled (and the physical network is insecure), consider use -of the standard mount options "noexec" and "nosuid" to reduce the risk of -running an altered binary on your local system (downloaded from a hostile server -or altered by a hostile router). - -Although mounting using format corresponding to the CIFS URL specification is -not possible in mount.cifs yet, it is possible to use an alternate format -for the server and sharename (which is somewhat similar to NFS style mount -syntax) instead of the more widely used UNC format (i.e. \\server\share): - mount -t cifs tcp_name_of_server:share_name /mnt -o user=myname,pass=mypasswd - -When using the mount helper mount.cifs, passwords may be specified via alternate -mechanisms, instead of specifying it after -o using the normal "pass=" syntax -on the command line: -1) By including it in a credential file. Specify credentials=filename as one -of the mount options. Credential files contain two lines - username=someuser - password=your_password -2) By specifying the password in the PASSWD environment variable (similarly -the user name can be taken from the USER environment variable). -3) By specifying the password in a file by name via PASSWD_FILE -4) By specifying the password in a file by file descriptor via PASSWD_FD - -If no password is provided, mount.cifs will prompt for password entry - -Restrictions -============ -Servers must support either "pure-TCP" (port 445 TCP/IP CIFS connections) or RFC -1001/1002 support for "Netbios-Over-TCP/IP." This is not likely to be a -problem as most servers support this. - -Valid filenames differ between Windows and Linux. Windows typically restricts -filenames which contain certain reserved characters (e.g.the character : -which is used to delimit the beginning of a stream name by Windows), while -Linux allows a slightly wider set of valid characters in filenames. Windows -servers can remap such characters when an explicit mapping is specified in -the Server's registry. Samba starting with version 3.10 will allow such -filenames (ie those which contain valid Linux characters, which normally -would be forbidden for Windows/CIFS semantics) as long as the server is -configured for Unix Extensions (and the client has not disabled -/proc/fs/cifs/LinuxExtensionsEnabled). - - -CIFS VFS Mount Options -====================== -A partial list of the supported mount options follows: - user The user name to use when trying to establish - the CIFS session. - password The user password. If the mount helper is - installed, the user will be prompted for password - if not supplied. - ip The ip address of the target server - unc The target server Universal Network Name (export) to - mount. - domain Set the SMB/CIFS workgroup name prepended to the - username during CIFS session establishment - forceuid Set the default uid for inodes to the uid - passed in on mount. For mounts to servers - which do support the CIFS Unix extensions, such as a - properly configured Samba server, the server provides - the uid, gid and mode so this parameter should not be - specified unless the server and clients uid and gid - numbering differ. If the server and client are in the - same domain (e.g. running winbind or nss_ldap) and - the server supports the Unix Extensions then the uid - and gid can be retrieved from the server (and uid - and gid would not have to be specifed on the mount. - For servers which do not support the CIFS Unix - extensions, the default uid (and gid) returned on lookup - of existing files will be the uid (gid) of the person - who executed the mount (root, except when mount.cifs - is configured setuid for user mounts) unless the "uid=" - (gid) mount option is specified. Also note that permission - checks (authorization checks) on accesses to a file occur - at the server, but there are cases in which an administrator - may want to restrict at the client as well. For those - servers which do not report a uid/gid owner - (such as Windows), permissions can also be checked at the - client, and a crude form of client side permission checking - can be enabled by specifying file_mode and dir_mode on - the client. (default) - forcegid (similar to above but for the groupid instead of uid) (default) - noforceuid Fill in file owner information (uid) by requesting it from - the server if possible. With this option, the value given in - the uid= option (on mount) will only be used if the server - can not support returning uids on inodes. - noforcegid (similar to above but for the group owner, gid, instead of uid) - uid Set the default uid for inodes, and indicate to the - cifs kernel driver which local user mounted. If the server - supports the unix extensions the default uid is - not used to fill in the owner fields of inodes (files) - unless the "forceuid" parameter is specified. - gid Set the default gid for inodes (similar to above). - file_mode If CIFS Unix extensions are not supported by the server - this overrides the default mode for file inodes. - fsc Enable local disk caching using FS-Cache (off by default). This - option could be useful to improve performance on a slow link, - heavily loaded server and/or network where reading from the - disk is faster than reading from the server (over the network). - This could also impact scalability positively as the - number of calls to the server are reduced. However, local - caching is not suitable for all workloads for e.g. read-once - type workloads. So, you need to consider carefully your - workload/scenario before using this option. Currently, local - disk caching is functional for CIFS files opened as read-only. - dir_mode If CIFS Unix extensions are not supported by the server - this overrides the default mode for directory inodes. - port attempt to contact the server on this tcp port, before - trying the usual ports (port 445, then 139). - iocharset Codepage used to convert local path names to and from - Unicode. Unicode is used by default for network path - names if the server supports it. If iocharset is - not specified then the nls_default specified - during the local client kernel build will be used. - If server does not support Unicode, this parameter is - unused. - rsize default read size (usually 16K). The client currently - can not use rsize larger than CIFSMaxBufSize. CIFSMaxBufSize - defaults to 16K and may be changed (from 8K to the maximum - kmalloc size allowed by your kernel) at module install time - for cifs.ko. Setting CIFSMaxBufSize to a very large value - will cause cifs to use more memory and may reduce performance - in some cases. To use rsize greater than 127K (the original - cifs protocol maximum) also requires that the server support - a new Unix Capability flag (for very large read) which some - newer servers (e.g. Samba 3.0.26 or later) do. rsize can be - set from a minimum of 2048 to a maximum of 130048 (127K or - CIFSMaxBufSize, whichever is smaller) - wsize default write size (default 57344) - maximum wsize currently allowed by CIFS is 57344 (fourteen - 4096 byte pages) - actimeo=n attribute cache timeout in seconds (default 1 second). - After this timeout, the cifs client requests fresh attribute - information from the server. This option allows to tune the - attribute cache timeout to suit the workload needs. Shorter - timeouts mean better the cache coherency, but increased number - of calls to the server. Longer timeouts mean reduced number - of calls to the server at the expense of less stricter cache - coherency checks (i.e. incorrect attribute cache for a short - period of time). - rw mount the network share read-write (note that the - server may still consider the share read-only) - ro mount network share read-only - version used to distinguish different versions of the - mount helper utility (not typically needed) - sep if first mount option (after the -o), overrides - the comma as the separator between the mount - parms. e.g. - -o user=myname,password=mypassword,domain=mydom - could be passed instead with period as the separator by - -o sep=.user=myname.password=mypassword.domain=mydom - this might be useful when comma is contained within username - or password or domain. This option is less important - when the cifs mount helper cifs.mount (version 1.1 or later) - is used. - nosuid Do not allow remote executables with the suid bit - program to be executed. This is only meaningful for mounts - to servers such as Samba which support the CIFS Unix Extensions. - If you do not trust the servers in your network (your mount - targets) it is recommended that you specify this option for - greater security. - exec Permit execution of binaries on the mount. - noexec Do not permit execution of binaries on the mount. - dev Recognize block devices on the remote mount. - nodev Do not recognize devices on the remote mount. - suid Allow remote files on this mountpoint with suid enabled to - be executed (default for mounts when executed as root, - nosuid is default for user mounts). - credentials Although ignored by the cifs kernel component, it is used by - the mount helper, mount.cifs. When mount.cifs is installed it - opens and reads the credential file specified in order - to obtain the userid and password arguments which are passed to - the cifs vfs. - guest Although ignored by the kernel component, the mount.cifs - mount helper will not prompt the user for a password - if guest is specified on the mount options. If no - password is specified a null password will be used. - perm Client does permission checks (vfs_permission check of uid - and gid of the file against the mode and desired operation), - Note that this is in addition to the normal ACL check on the - target machine done by the server software. - Client permission checking is enabled by default. - noperm Client does not do permission checks. This can expose - files on this mount to access by other users on the local - client system. It is typically only needed when the server - supports the CIFS Unix Extensions but the UIDs/GIDs on the - client and server system do not match closely enough to allow - access by the user doing the mount, but it may be useful with - non CIFS Unix Extension mounts for cases in which the default - mode is specified on the mount but is not to be enforced on the - client (e.g. perhaps when MultiUserMount is enabled) - Note that this does not affect the normal ACL check on the - target machine done by the server software (of the server - ACL against the user name provided at mount time). - serverino Use server's inode numbers instead of generating automatically - incrementing inode numbers on the client. Although this will - make it easier to spot hardlinked files (as they will have - the same inode numbers) and inode numbers may be persistent, - note that the server does not guarantee that the inode numbers - are unique if multiple server side mounts are exported under a - single share (since inode numbers on the servers might not - be unique if multiple filesystems are mounted under the same - shared higher level directory). Note that some older - (e.g. pre-Windows 2000) do not support returning UniqueIDs - or the CIFS Unix Extensions equivalent and for those - this mount option will have no effect. Exporting cifs mounts - under nfsd requires this mount option on the cifs mount. - This is now the default if server supports the - required network operation. - noserverino Client generates inode numbers (rather than using the actual one - from the server). These inode numbers will vary after - unmount or reboot which can confuse some applications, - but not all server filesystems support unique inode - numbers. - setuids If the CIFS Unix extensions are negotiated with the server - the client will attempt to set the effective uid and gid of - the local process on newly created files, directories, and - devices (create, mkdir, mknod). If the CIFS Unix Extensions - are not negotiated, for newly created files and directories - instead of using the default uid and gid specified on - the mount, cache the new file's uid and gid locally which means - that the uid for the file can change when the inode is - reloaded (or the user remounts the share). - nosetuids The client will not attempt to set the uid and gid on - on newly created files, directories, and devices (create, - mkdir, mknod) which will result in the server setting the - uid and gid to the default (usually the server uid of the - user who mounted the share). Letting the server (rather than - the client) set the uid and gid is the default. If the CIFS - Unix Extensions are not negotiated then the uid and gid for - new files will appear to be the uid (gid) of the mounter or the - uid (gid) parameter specified on the mount. - netbiosname When mounting to servers via port 139, specifies the RFC1001 - source name to use to represent the client netbios machine - name when doing the RFC1001 netbios session initialize. - direct Do not do inode data caching on files opened on this mount. - This precludes mmapping files on this mount. In some cases - with fast networks and little or no caching benefits on the - client (e.g. when the application is doing large sequential - reads bigger than page size without rereading the same data) - this can provide better performance than the default - behavior which caches reads (readahead) and writes - (writebehind) through the local Linux client pagecache - if oplock (caching token) is granted and held. Note that - direct allows write operations larger than page size - to be sent to the server. - strictcache Use for switching on strict cache mode. In this mode the - client read from the cache all the time it has Oplock Level II, - otherwise - read from the server. All written data are stored - in the cache, but if the client doesn't have Exclusive Oplock, - it writes the data to the server. - rwpidforward Forward pid of a process who opened a file to any read or write - operation on that file. This prevent applications like WINE - from failing on read and write if we use mandatory brlock style. - acl Allow setfacl and getfacl to manage posix ACLs if server - supports them. (default) - noacl Do not allow setfacl and getfacl calls on this mount - user_xattr Allow getting and setting user xattrs (those attributes whose - name begins with "user." or "os2.") as OS/2 EAs (extended - attributes) to the server. This allows support of the - setfattr and getfattr utilities. (default) - nouser_xattr Do not allow getfattr/setfattr to get/set/list xattrs - mapchars Translate six of the seven reserved characters (not backslash) - *?<>|: - to the remap range (above 0xF000), which also - allows the CIFS client to recognize files created with - such characters by Windows's POSIX emulation. This can - also be useful when mounting to most versions of Samba - (which also forbids creating and opening files - whose names contain any of these seven characters). - This has no effect if the server does not support - Unicode on the wire. - nomapchars Do not translate any of these seven characters (default). - nocase Request case insensitive path name matching (case - sensitive is the default if the server supports it). - (mount option "ignorecase" is identical to "nocase") - posixpaths If CIFS Unix extensions are supported, attempt to - negotiate posix path name support which allows certain - characters forbidden in typical CIFS filenames, without - requiring remapping. (default) - noposixpaths If CIFS Unix extensions are supported, do not request - posix path name support (this may cause servers to - reject creatingfile with certain reserved characters). - nounix Disable the CIFS Unix Extensions for this mount (tree - connection). This is rarely needed, but it may be useful - in order to turn off multiple settings all at once (ie - posix acls, posix locks, posix paths, symlink support - and retrieving uids/gids/mode from the server) or to - work around a bug in server which implement the Unix - Extensions. - nobrl Do not send byte range lock requests to the server. - This is necessary for certain applications that break - with cifs style mandatory byte range locks (and most - cifs servers do not yet support requesting advisory - byte range locks). - forcemandatorylock Even if the server supports posix (advisory) byte range - locking, send only mandatory lock requests. For some - (presumably rare) applications, originally coded for - DOS/Windows, which require Windows style mandatory byte range - locking, they may be able to take advantage of this option, - forcing the cifs client to only send mandatory locks - even if the cifs server would support posix advisory locks. - "forcemand" is accepted as a shorter form of this mount - option. - nostrictsync If this mount option is set, when an application does an - fsync call then the cifs client does not send an SMB Flush - to the server (to force the server to write all dirty data - for this file immediately to disk), although cifs still sends - all dirty (cached) file data to the server and waits for the - server to respond to the write. Since SMB Flush can be - very slow, and some servers may be reliable enough (to risk - delaying slightly flushing the data to disk on the server), - turning on this option may be useful to improve performance for - applications that fsync too much, at a small risk of server - crash. If this mount option is not set, by default cifs will - send an SMB flush request (and wait for a response) on every - fsync call. - nodfs Disable DFS (global name space support) even if the - server claims to support it. This can help work around - a problem with parsing of DFS paths with Samba server - versions 3.0.24 and 3.0.25. - remount remount the share (often used to change from ro to rw mounts - or vice versa) - cifsacl Report mode bits (e.g. on stat) based on the Windows ACL for - the file. (EXPERIMENTAL) - servern Specify the server 's netbios name (RFC1001 name) to use - when attempting to setup a session to the server. - This is needed for mounting to some older servers (such - as OS/2 or Windows 98 and Windows ME) since they do not - support a default server name. A server name can be up - to 15 characters long and is usually uppercased. - sfu When the CIFS Unix Extensions are not negotiated, attempt to - create device files and fifos in a format compatible with - Services for Unix (SFU). In addition retrieve bits 10-12 - of the mode via the SETFILEBITS extended attribute (as - SFU does). In the future the bottom 9 bits of the - mode also will be emulated using queries of the security - descriptor (ACL). - mfsymlinks Enable support for Minshall+French symlinks - (see http://wiki.samba.org/index.php/UNIX_Extensions#Minshall.2BFrench_symlinks) - This option is ignored when specified together with the - 'sfu' option. Minshall+French symlinks are used even if - the server supports the CIFS Unix Extensions. - sign Must use packet signing (helps avoid unwanted data modification - by intermediate systems in the route). Note that signing - does not work with lanman or plaintext authentication. - seal Must seal (encrypt) all data on this mounted share before - sending on the network. Requires support for Unix Extensions. - Note that this differs from the sign mount option in that it - causes encryption of data sent over this mounted share but other - shares mounted to the same server are unaffected. - locallease This option is rarely needed. Fcntl F_SETLEASE is - used by some applications such as Samba and NFSv4 server to - check to see whether a file is cacheable. CIFS has no way - to explicitly request a lease, but can check whether a file - is cacheable (oplocked). Unfortunately, even if a file - is not oplocked, it could still be cacheable (ie cifs client - could grant fcntl leases if no other local processes are using - the file) for cases for example such as when the server does not - support oplocks and the user is sure that the only updates to - the file will be from this client. Specifying this mount option - will allow the cifs client to check for leases (only) locally - for files which are not oplocked instead of denying leases - in that case. (EXPERIMENTAL) - sec Security mode. Allowed values are: - none attempt to connection as a null user (no name) - krb5 Use Kerberos version 5 authentication - krb5i Use Kerberos authentication and packet signing - ntlm Use NTLM password hashing (default) - ntlmi Use NTLM password hashing with signing (if - /proc/fs/cifs/PacketSigningEnabled on or if - server requires signing also can be the default) - ntlmv2 Use NTLMv2 password hashing - ntlmv2i Use NTLMv2 password hashing with packet signing - lanman (if configured in kernel config) use older - lanman hash -hard Retry file operations if server is not responding -soft Limit retries to unresponsive servers (usually only - one retry) before returning an error. (default) - -The mount.cifs mount helper also accepts a few mount options before -o -including: - - -S take password from stdin (equivalent to setting the environment - variable "PASSWD_FD=0" - -V print mount.cifs version - -? display simple usage information - -With most 2.6 kernel versions of modutils, the version of the cifs kernel -module can be displayed via modinfo. - -Misc /proc/fs/cifs Flags and Debug Info -======================================= -Informational pseudo-files: -DebugData Displays information about active CIFS sessions and - shares, features enabled as well as the cifs.ko - version. -Stats Lists summary resource usage information as well as per - share statistics, if CONFIG_CIFS_STATS in enabled - in the kernel configuration. - -Configuration pseudo-files: -PacketSigningEnabled If set to one, cifs packet signing is enabled - and will be used if the server requires - it. If set to two, cifs packet signing is - required even if the server considers packet - signing optional. (default 1) -SecurityFlags Flags which control security negotiation and - also packet signing. Authentication (may/must) - flags (e.g. for NTLM and/or NTLMv2) may be combined with - the signing flags. Specifying two different password - hashing mechanisms (as "must use") on the other hand - does not make much sense. Default flags are - 0x07007 - (NTLM, NTLMv2 and packet signing allowed). The maximum - allowable flags if you want to allow mounts to servers - using weaker password hashes is 0x37037 (lanman, - plaintext, ntlm, ntlmv2, signing allowed). Some - SecurityFlags require the corresponding menuconfig - options to be enabled (lanman and plaintext require - CONFIG_CIFS_WEAK_PW_HASH for example). Enabling - plaintext authentication currently requires also - enabling lanman authentication in the security flags - because the cifs module only supports sending - laintext passwords using the older lanman dialect - form of the session setup SMB. (e.g. for authentication - using plain text passwords, set the SecurityFlags - to 0x30030): - - may use packet signing 0x00001 - must use packet signing 0x01001 - may use NTLM (most common password hash) 0x00002 - must use NTLM 0x02002 - may use NTLMv2 0x00004 - must use NTLMv2 0x04004 - may use Kerberos security 0x00008 - must use Kerberos 0x08008 - may use lanman (weak) password hash 0x00010 - must use lanman password hash 0x10010 - may use plaintext passwords 0x00020 - must use plaintext passwords 0x20020 - (reserved for future packet encryption) 0x00040 - -cifsFYI If set to non-zero value, additional debug information - will be logged to the system error log. This field - contains three flags controlling different classes of - debugging entries. The maximum value it can be set - to is 7 which enables all debugging points (default 0). - Some debugging statements are not compiled into the - cifs kernel unless CONFIG_CIFS_DEBUG2 is enabled in the - kernel configuration. cifsFYI may be set to one or - nore of the following flags (7 sets them all): - - log cifs informational messages 0x01 - log return codes from cifs entry points 0x02 - log slow responses (ie which take longer than 1 second) - CONFIG_CIFS_STATS2 must be enabled in .config 0x04 - - -traceSMB If set to one, debug information is logged to the - system error log with the start of smb requests - and responses (default 0) -LookupCacheEnable If set to one, inode information is kept cached - for one second improving performance of lookups - (default 1) -OplockEnabled If set to one, safe distributed caching enabled. - (default 1) -LinuxExtensionsEnabled If set to one then the client will attempt to - use the CIFS "UNIX" extensions which are optional - protocol enhancements that allow CIFS servers - to return accurate UID/GID information as well - as support symbolic links. If you use servers - such as Samba that support the CIFS Unix - extensions but do not want to use symbolic link - support and want to map the uid and gid fields - to values supplied at mount (rather than the - actual values, then set this to zero. (default 1) - -These experimental features and tracing can be enabled by changing flags in -/proc/fs/cifs (after the cifs module has been installed or built into the -kernel, e.g. insmod cifs). To enable a feature set it to 1 e.g. to enable -tracing to the kernel message log type: - - echo 7 > /proc/fs/cifs/cifsFYI - -cifsFYI functions as a bit mask. Setting it to 1 enables additional kernel -logging of various informational messages. 2 enables logging of non-zero -SMB return codes while 4 enables logging of requests that take longer -than one second to complete (except for byte range lock requests). -Setting it to 4 requires defining CONFIG_CIFS_STATS2 manually in the -source code (typically by setting it in the beginning of cifsglob.h), -and setting it to seven enables all three. Finally, tracing -the start of smb requests and responses can be enabled via: - - echo 1 > /proc/fs/cifs/traceSMB - -Per share (per client mount) statistics are available in /proc/fs/cifs/Stats -if the kernel was configured with cifs statistics enabled. The statistics -represent the number of successful (ie non-zero return code from the server) -SMB responses to some of the more common commands (open, delete, mkdir etc.). -Also recorded is the total bytes read and bytes written to the server for -that share. Note that due to client caching effects this can be less than the -number of bytes read and written by the application running on the client. -The statistics for the number of total SMBs and oplock breaks are different in -that they represent all for that share, not just those for which the server -returned success. - -Also note that "cat /proc/fs/cifs/DebugData" will display information about -the active sessions and the shares that are mounted. - -Enabling Kerberos (extended security) works but requires version 1.2 or later -of the helper program cifs.upcall to be present and to be configured in the -/etc/request-key.conf file. The cifs.upcall helper program is from the Samba -project(http://www.samba.org). NTLM and NTLMv2 and LANMAN support do not -require this helper. Note that NTLMv2 security (which does not require the -cifs.upcall helper program), instead of using Kerberos, is sufficient for -some use cases. - -DFS support allows transparent redirection to shares in an MS-DFS name space. -In addition, DFS support for target shares which are specified as UNC -names which begin with host names (rather than IP addresses) requires -a user space helper (such as cifs.upcall) to be present in order to -translate host names to ip address, and the user space helper must also -be configured in the file /etc/request-key.conf. Samba, Windows servers and -many NAS appliances support DFS as a way of constructing a global name -space to ease network configuration and improve reliability. - -To use cifs Kerberos and DFS support, the Linux keyutils package should be -installed and something like the following lines should be added to the -/etc/request-key.conf file: - -create cifs.spnego * * /usr/local/sbin/cifs.upcall %k -create dns_resolver * * /usr/local/sbin/cifs.upcall %k - -CIFS kernel module parameters -============================= -These module parameters can be specified or modified either during the time of -module loading or during the runtime by using the interface - /proc/module/cifs/parameters/<param> - -i.e. echo "value" > /sys/module/cifs/parameters/<param> - -1. enable_oplocks - Enable or disable oplocks. Oplocks are enabled by default. - [Y/y/1]. To disable use any of [N/n/0]. - diff --git a/fs/cifs/TODO b/fs/cifs/TODO deleted file mode 100644 index 355abcdcda98..000000000000 --- a/fs/cifs/TODO +++ /dev/null @@ -1,129 +0,0 @@ -Version 1.53 May 20, 2008 - -A Partial List of Missing Features -================================== - -Contributions are welcome. There are plenty of opportunities -for visible, important contributions to this module. Here -is a partial list of the known problems and missing features: - -a) Support for SecurityDescriptors(Windows/CIFS ACLs) for chmod/chgrp/chown -so that these operations can be supported to Windows servers - -b) Mapping POSIX ACLs (and eventually NFSv4 ACLs) to CIFS -SecurityDescriptors - -c) Better pam/winbind integration (e.g. to handle uid mapping -better) - -d) Cleanup now unneeded SessSetup code in -fs/cifs/connect.c and add back in NTLMSSP code if any servers -need it - -e) fix NTLMv2 signing when two mounts with different users to same -server. - -f) Directory entry caching relies on a 1 second timer, rather than -using FindNotify or equivalent. - (started) - -g) quota support (needs minor kernel change since quota calls -to make it to network filesystems or deviceless filesystems) - -h) investigate sync behavior (including syncpage) and check -for proper behavior of intr/nointr - -i) improve support for very old servers (OS/2 and Win9x for example) -Including support for changing the time remotely (utimes command). - -j) hook lower into the sockets api (as NFS/SunRPC does) to avoid the -extra copy in/out of the socket buffers in some cases. - -k) Better optimize open (and pathbased setfilesize) to reduce the -oplock breaks coming from windows srv. Piggyback identical file -opens on top of each other by incrementing reference count rather -than resending (helps reduce server resource utilization and avoid -spurious oplock breaks). - -l) Improve performance of readpages by sending more than one read -at a time when 8 pages or more are requested. In conjuntion -add support for async_cifs_readpages. - -m) Add support for storing symlink info to Windows servers -in the Extended Attribute format their SFU clients would recognize. - -n) Finish fcntl D_NOTIFY support so kde and gnome file list windows -will autorefresh (partially complete by Asser). Needs minor kernel -vfs change to support removing D_NOTIFY on a file. - -o) Add GUI tool to configure /proc/fs/cifs settings and for display of -the CIFS statistics (started) - -p) implement support for security and trusted categories of xattrs -(requires minor protocol extension) to enable better support for SELINUX - -q) Implement O_DIRECT flag on open (already supported on mount) - -r) Create UID mapping facility so server UIDs can be mapped on a per -mount or a per server basis to client UIDs or nobody if no mapping -exists. This is helpful when Unix extensions are negotiated to -allow better permission checking when UIDs differ on the server -and client. Add new protocol request to the CIFS protocol -standard for asking the server for the corresponding name of a -particular uid. - -s) Add support for CIFS Unix and also the newer POSIX extensions to the -server side for Samba 4. - -t) In support for OS/2 (LANMAN 1.2 and LANMAN2.1 based SMB servers) -need to add ability to set time to server (utimes command) - -u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for this too) - -v) mount check for unmatched uids - -w) Add support for new vfs entry point for fallocate - -x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of -processes can proceed better in parallel (on the server) - -y) Fix Samba 3 to handle reads/writes over 127K (and remove the cifs mount -restriction of wsize max being 127K) - -KNOWN BUGS (updated April 24, 2007) -==================================== -See http://bugzilla.samba.org - search on product "CifsVFS" for -current bug list. - -1) existing symbolic links (Windows reparse points) are recognized but -can not be created remotely. They are implemented for Samba and those that -support the CIFS Unix extensions, although earlier versions of Samba -overly restrict the pathnames. -2) follow_link and readdir code does not follow dfs junctions -but recognizes them -3) create of new files to FAT partitions on Windows servers can -succeed but still return access denied (appears to be Windows -server not cifs client problem) and has not been reproduced recently. -NTFS partitions do not have this problem. -4) Unix/POSIX capabilities are reset after reconnection, and affect -a few fields in the tree connection but we do do not know which -superblocks to apply these changes to. We should probably walk -the list of superblocks to set these. Also need to check the -flags on the second mount to the same share, and see if we -can do the same trick that NFS does to remount duplicate shares. - -Misc testing to do -================== -1) check out max path names and max path name components against various server -types. Try nested symlinks (8 deep). Return max path name in stat -f information - -2) Modify file portion of ltp so it can run against a mounted network -share and run it against cifs vfs in automated fashion. - -3) Additional performance testing and optimization using iozone and similar - -there are some easy changes that can be done to parallelize sequential writes, -and when signing is disabled to request larger read sizes (larger than -negotiated size) and send larger write sizes to modern servers. - -4) More exhaustively test against less common servers. More testing -against Windows 9x, Windows ME servers. - diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h index fe8d6276410a..d8eac3b6cefb 100644 --- a/fs/cifs/cifs_unicode.h +++ b/fs/cifs/cifs_unicode.h @@ -91,6 +91,8 @@ extern __le16 *cifs_strndup_to_utf16(const char *src, const int maxlen, #endif /* CONFIG_CIFS_SMB2 */ #endif +wchar_t cifs_toupper(wchar_t in); + /* * UniStrcat: Concatenate the second string to the first * diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 85ea98d139fc..a16b4e58bcc6 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -255,6 +255,7 @@ cifs_alloc_inode(struct super_block *sb) cifs_inode->server_eof = 0; cifs_inode->uniqueid = 0; cifs_inode->createtime = 0; + cifs_inode->epoch = 0; #ifdef CONFIG_CIFS_SMB2 get_random_bytes(cifs_inode->lease_key, SMB2_LEASE_KEY_SIZE); #endif @@ -357,6 +358,18 @@ cifs_show_cache_flavor(struct seq_file *s, struct cifs_sb_info *cifs_sb) seq_printf(s, "loose"); } +static void +cifs_show_nls(struct seq_file *s, struct nls_table *cur) +{ + struct nls_table *def; + + /* Display iocharset= option if it's not default charset */ + def = load_nls_default(); + if (def != cur) + seq_printf(s, ",iocharset=%s", cur->charset); + unload_nls(def); +} + /* * cifs_show_options() is for displaying mount options in /proc/mounts. * Not all settable options are displayed but most of the important @@ -418,6 +431,9 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",file_mode=0%ho,dir_mode=0%ho", cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode); + + cifs_show_nls(s, cifs_sb->local_nls); + if (tcon->seal) seq_printf(s, ",seal"); if (tcon->nocase) @@ -718,7 +734,7 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, written = generic_file_aio_write(iocb, iov, nr_segs, pos); - if (CIFS_I(inode)->clientCanCacheAll) + if (CIFS_CACHE_WRITE(CIFS_I(inode))) return written; rc = filemap_fdatawrite(inode->i_mapping); @@ -743,7 +759,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence) * We need to be sure that all dirty pages are written and the * server has the newest file length. */ - if (!CIFS_I(inode)->clientCanCacheRead && inode->i_mapping && + if (!CIFS_CACHE_READ(CIFS_I(inode)) && inode->i_mapping && inode->i_mapping->nrpages != 0) { rc = filemap_fdatawait(inode->i_mapping); if (rc) { @@ -767,8 +783,10 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence) static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) { - /* note that this is called by vfs setlease with i_lock held - to protect *lease from going away */ + /* + * Note that this is called by vfs setlease with i_lock held to + * protect *lease from going away. + */ struct inode *inode = file_inode(file); struct cifsFileInfo *cfile = file->private_data; @@ -776,20 +794,19 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) return -EINVAL; /* check if file is oplocked */ - if (((arg == F_RDLCK) && - (CIFS_I(inode)->clientCanCacheRead)) || - ((arg == F_WRLCK) && - (CIFS_I(inode)->clientCanCacheAll))) + if (((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) || + ((arg == F_WRLCK) && CIFS_CACHE_WRITE(CIFS_I(inode)))) return generic_setlease(file, arg, lease); else if (tlink_tcon(cfile->tlink)->local_lease && - !CIFS_I(inode)->clientCanCacheRead) - /* If the server claims to support oplock on this - file, then we still need to check oplock even - if the local_lease mount option is set, but there - are servers which do not support oplock for which - this mount option may be useful if the user - knows that the file won't be changed on the server - by anyone else */ + !CIFS_CACHE_READ(CIFS_I(inode))) + /* + * If the server claims to support oplock on this file, then we + * still need to check oplock even if the local_lease mount + * option is set, but there are servers which do not support + * oplock for which this mount option may be useful if the user + * knows that the file won't be changed on the server by anyone + * else. + */ return generic_setlease(file, arg, lease); else return -EAGAIN; diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 52ca861ed35e..cfa14c80ef3b 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -28,6 +28,7 @@ #include "cifsacl.h" #include <crypto/internal/hash.h> #include <linux/scatterlist.h> +#include <uapi/linux/cifs/cifs_mount.h> #ifdef CONFIG_CIFS_SMB2 #include "smb2pdu.h" #endif @@ -41,12 +42,7 @@ #define MAX_SES_INFO 2 #define MAX_TCON_INFO 4 -#define MAX_TREE_SIZE (2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1) -#define MAX_SERVER_SIZE 15 -#define MAX_SHARE_SIZE 80 -#define CIFS_MAX_DOMAINNAME_LEN 256 /* max domain name length */ -#define MAX_USERNAME_SIZE 256 /* reasonable maximum for current servers */ -#define MAX_PASSWORD_SIZE 512 /* max for windows seems to be 256 wide chars */ +#define MAX_TREE_SIZE (2 + CIFS_NI_MAXHOST + 1 + CIFS_MAX_SHARE_LEN + 1) #define CIFS_MIN_RCV_POOL 4 @@ -135,6 +131,7 @@ struct cifs_secmech { /* per smb session structure/fields */ struct ntlmssp_auth { + bool sesskey_per_smbsess; /* whether session key is per smb session */ __u32 client_flags; /* sent by client in type 1 ntlmsssp exchange */ __u32 server_flags; /* sent by server in type 2 ntlmssp exchange */ unsigned char ciphertext[CIFS_CPHTXT_SIZE]; /* sent to server */ @@ -308,6 +305,9 @@ struct smb_version_operations { int (*create_hardlink)(const unsigned int, struct cifs_tcon *, const char *, const char *, struct cifs_sb_info *); + /* query symlink target */ + int (*query_symlink)(const unsigned int, struct cifs_tcon *, + const char *, char **, struct cifs_sb_info *); /* open a file for non-posix mounts */ int (*open)(const unsigned int, struct cifs_open_parms *, __u32 *, FILE_ALL_INFO *); @@ -361,18 +361,24 @@ struct smb_version_operations { /* push brlocks from the cache to the server */ int (*push_mand_locks)(struct cifsFileInfo *); /* get lease key of the inode */ - void (*get_lease_key)(struct inode *, struct cifs_fid *fid); + void (*get_lease_key)(struct inode *, struct cifs_fid *); /* set lease key of the inode */ - void (*set_lease_key)(struct inode *, struct cifs_fid *fid); + void (*set_lease_key)(struct inode *, struct cifs_fid *); /* generate new lease key */ - void (*new_lease_key)(struct cifs_fid *fid); - /* The next two functions will need to be changed to per smb session */ - void (*generate_signingkey)(struct TCP_Server_Info *server); - int (*calc_signature)(struct smb_rqst *rqst, - struct TCP_Server_Info *server); - int (*query_mf_symlink)(const unsigned char *path, char *pbuf, - unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb, - unsigned int xid); + void (*new_lease_key)(struct cifs_fid *); + int (*generate_signingkey)(struct cifs_ses *); + int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *); + int (*query_mf_symlink)(const unsigned char *, char *, unsigned int *, + struct cifs_sb_info *, unsigned int); + /* if we can do cache read operations */ + bool (*is_read_op)(__u32); + /* set oplock level for the inode */ + void (*set_oplock_level)(struct cifsInodeInfo *, __u32, unsigned int, + bool *); + /* create lease context buffer for CREATE request */ + char * (*create_lease_buf)(u8 *, u8); + /* parse lease context buffer and return oplock/epoch info */ + __u8 (*parse_lease_buf)(void *, unsigned int *); }; struct smb_version_values { @@ -390,9 +396,9 @@ struct smb_version_values { unsigned int cap_unix; unsigned int cap_nt_find; unsigned int cap_large_files; - unsigned int oplock_read; __u16 signing_enabled; __u16 signing_required; + size_t create_lease_size; }; #define HEADER_SIZE(server) (server->vals->header_size) @@ -548,7 +554,6 @@ struct TCP_Server_Info { int timeAdj; /* Adjust for difference in server time zone in sec */ __u64 CurrentMid; /* multiplex id - rotating counter */ char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */ - char smb3signingkey[SMB3_SIGN_KEY_SIZE]; /* for signing smb3 packets */ /* 16th byte of RFC1001 workstation name is always null */ char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; __u32 sequence_number; /* for signing, protected by srv_mutex */ @@ -731,6 +736,7 @@ struct cifs_ses { bool need_reconnect:1; /* connection reset, uid now invalid */ #ifdef CONFIG_CIFS_SMB2 __u16 session_flags; + char smb3signingkey[SMB3_SIGN_KEY_SIZE]; /* for signing smb3 packets */ #endif /* CONFIG_CIFS_SMB2 */ }; @@ -935,6 +941,8 @@ struct cifs_fid { __u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for smb2 */ #endif struct cifs_pending_open *pending_open; + unsigned int epoch; + bool purge_cache; }; struct cifs_fid_locks { @@ -1032,6 +1040,17 @@ cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file) struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file); void cifsFileInfo_put(struct cifsFileInfo *cifs_file); +#define CIFS_CACHE_READ_FLG 1 +#define CIFS_CACHE_HANDLE_FLG 2 +#define CIFS_CACHE_RH_FLG (CIFS_CACHE_READ_FLG | CIFS_CACHE_HANDLE_FLG) +#define CIFS_CACHE_WRITE_FLG 4 +#define CIFS_CACHE_RW_FLG (CIFS_CACHE_READ_FLG | CIFS_CACHE_WRITE_FLG) +#define CIFS_CACHE_RHW_FLG (CIFS_CACHE_RW_FLG | CIFS_CACHE_HANDLE_FLG) + +#define CIFS_CACHE_READ(cinode) (cinode->oplock & CIFS_CACHE_READ_FLG) +#define CIFS_CACHE_HANDLE(cinode) (cinode->oplock & CIFS_CACHE_HANDLE_FLG) +#define CIFS_CACHE_WRITE(cinode) (cinode->oplock & CIFS_CACHE_WRITE_FLG) + /* * One of these for each file inode */ @@ -1043,8 +1062,8 @@ struct cifsInodeInfo { /* BB add in lists for dirty pages i.e. write caching info for oplock */ struct list_head openFileList; __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ - bool clientCanCacheRead; /* read oplock */ - bool clientCanCacheAll; /* read and writebehind oplock */ + unsigned int oplock; /* oplock/lease level we have */ + unsigned int epoch; /* used to track lease state changes */ bool delete_pending; /* DELETE_ON_CLOSE is set */ bool invalid_mapping; /* pagecache is invalid */ unsigned long time; /* jiffies of last update of inode */ @@ -1502,7 +1521,7 @@ extern mempool_t *cifs_mid_poolp; extern struct smb_version_operations smb1_operations; extern struct smb_version_values smb1_values; #define SMB20_VERSION_STRING "2.0" -/*extern struct smb_version_operations smb20_operations; */ /* not needed yet */ +extern struct smb_version_operations smb20_operations; extern struct smb_version_values smb20_values; #define SMB21_VERSION_STRING "2.1" extern struct smb_version_operations smb21_operations; diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 11ca24a8e054..948676db8e2e 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -1495,11 +1495,12 @@ struct reparse_data { __u32 ReparseTag; __u16 ReparseDataLength; __u16 Reserved; - __u16 AltNameOffset; - __u16 AltNameLen; - __u16 TargetNameOffset; - __u16 TargetNameLen; - char LinkNamesBuf[1]; + __u16 SubstituteNameOffset; + __u16 SubstituteNameLength; + __u16 PrintNameOffset; + __u16 PrintNameLength; + __u32 Flags; + char PathBuffer[0]; } __attribute__((packed)); struct cifs_quota_data { diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index b29a012bed33..b5ec2a268f56 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -357,13 +357,9 @@ extern int CIFSSMBUnixQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *searchName, char **syminfo, const struct nls_table *nls_codepage); -#ifdef CONFIG_CIFS_SYMLINK_EXPERIMENTAL -extern int CIFSSMBQueryReparseLinkInfo(const unsigned int xid, - struct cifs_tcon *tcon, - const unsigned char *searchName, - char *symlinkinfo, const int buflen, __u16 fid, - const struct nls_table *nls_codepage); -#endif /* temporarily unused until cifs_symlink fixed */ +extern int CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, + __u16 fid, char **symlinkinfo, + const struct nls_table *nls_codepage); extern int CIFSSMBOpen(const unsigned int xid, struct cifs_tcon *tcon, const char *fileName, const int disposition, const int access_flags, const int omode, @@ -435,7 +431,7 @@ extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *); extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *); extern void cifs_crypto_shash_release(struct TCP_Server_Info *); extern int calc_seckey(struct cifs_ses *); -extern void generate_smb3signingkey(struct TCP_Server_Info *); +extern int generate_smb3signingkey(struct cifs_ses *); #ifdef CONFIG_CIFS_WEAK_PW_HASH extern int calc_lanman_hash(const char *password, const char *cryptkey, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index a89c4cb4e6cf..a3d74fea1623 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -3067,7 +3067,6 @@ querySymLinkRetry: return rc; } -#ifdef CONFIG_CIFS_SYMLINK_EXPERIMENTAL /* * Recent Windows versions now create symlinks more frequently * and they use the "reparse point" mechanism below. We can of course @@ -3079,18 +3078,22 @@ querySymLinkRetry: * it is not compiled in by default until callers fixed up and more tested. */ int -CIFSSMBQueryReparseLinkInfo(const unsigned int xid, struct cifs_tcon *tcon, - const unsigned char *searchName, - char *symlinkinfo, const int buflen, __u16 fid, - const struct nls_table *nls_codepage) +CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, + __u16 fid, char **symlinkinfo, + const struct nls_table *nls_codepage) { int rc = 0; int bytes_returned; struct smb_com_transaction_ioctl_req *pSMB; struct smb_com_transaction_ioctl_rsp *pSMBr; + bool is_unicode; + unsigned int sub_len; + char *sub_start; + struct reparse_data *reparse_buf; + __u32 data_offset, data_count; + char *end_of_smb; - cifs_dbg(FYI, "In Windows reparse style QueryLink for path %s\n", - searchName); + cifs_dbg(FYI, "In Windows reparse style QueryLink for fid %u\n", fid); rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB, (void **) &pSMBr); if (rc) @@ -3119,66 +3122,55 @@ CIFSSMBQueryReparseLinkInfo(const unsigned int xid, struct cifs_tcon *tcon, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { cifs_dbg(FYI, "Send error in QueryReparseLinkInfo = %d\n", rc); - } else { /* decode response */ - __u32 data_offset = le32_to_cpu(pSMBr->DataOffset); - __u32 data_count = le32_to_cpu(pSMBr->DataCount); - if (get_bcc(&pSMBr->hdr) < 2 || data_offset > 512) { - /* BB also check enough total bytes returned */ - rc = -EIO; /* bad smb */ - goto qreparse_out; - } - if (data_count && (data_count < 2048)) { - char *end_of_smb = 2 /* sizeof byte count */ + - get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount; - - struct reparse_data *reparse_buf = - (struct reparse_data *) - ((char *)&pSMBr->hdr.Protocol - + data_offset); - if ((char *)reparse_buf >= end_of_smb) { - rc = -EIO; - goto qreparse_out; - } - if ((reparse_buf->LinkNamesBuf + - reparse_buf->TargetNameOffset + - reparse_buf->TargetNameLen) > end_of_smb) { - cifs_dbg(FYI, "reparse buf beyond SMB\n"); - rc = -EIO; - goto qreparse_out; - } + goto qreparse_out; + } - if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) { - cifs_from_ucs2(symlinkinfo, (__le16 *) - (reparse_buf->LinkNamesBuf + - reparse_buf->TargetNameOffset), - buflen, - reparse_buf->TargetNameLen, - nls_codepage, 0); - } else { /* ASCII names */ - strncpy(symlinkinfo, - reparse_buf->LinkNamesBuf + - reparse_buf->TargetNameOffset, - min_t(const int, buflen, - reparse_buf->TargetNameLen)); - } - } else { - rc = -EIO; - cifs_dbg(FYI, "Invalid return data count on get reparse info ioctl\n"); - } - symlinkinfo[buflen] = 0; /* just in case so the caller - does not go off the end of the buffer */ - cifs_dbg(FYI, "readlink result - %s\n", symlinkinfo); + data_offset = le32_to_cpu(pSMBr->DataOffset); + data_count = le32_to_cpu(pSMBr->DataCount); + if (get_bcc(&pSMBr->hdr) < 2 || data_offset > 512) { + /* BB also check enough total bytes returned */ + rc = -EIO; /* bad smb */ + goto qreparse_out; + } + if (!data_count || (data_count > 2048)) { + rc = -EIO; + cifs_dbg(FYI, "Invalid return data count on get reparse info ioctl\n"); + goto qreparse_out; + } + end_of_smb = 2 + get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount; + reparse_buf = (struct reparse_data *) + ((char *)&pSMBr->hdr.Protocol + data_offset); + if ((char *)reparse_buf >= end_of_smb) { + rc = -EIO; + goto qreparse_out; } + if ((reparse_buf->PathBuffer + reparse_buf->PrintNameOffset + + reparse_buf->PrintNameLength) > end_of_smb) { + cifs_dbg(FYI, "reparse buf beyond SMB\n"); + rc = -EIO; + goto qreparse_out; + } + sub_start = reparse_buf->SubstituteNameOffset + reparse_buf->PathBuffer; + sub_len = reparse_buf->SubstituteNameLength; + if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) + is_unicode = true; + else + is_unicode = false; + /* BB FIXME investigate remapping reserved chars here */ + *symlinkinfo = cifs_strndup_from_utf16(sub_start, sub_len, is_unicode, + nls_codepage); + if (!*symlinkinfo) + rc = -ENOMEM; qreparse_out: cifs_buf_release(pSMB); - /* Note: On -EAGAIN error only caller can retry on handle based calls - since file handle passed in no longer valid */ - + /* + * Note: On -EAGAIN error only caller can retry on handle based calls + * since file handle passed in no longer valid. + */ return rc; } -#endif /* CIFS_SYMLINK_EXPERIMENTAL */ /* BB temporarily unused */ #ifdef CONFIG_CIFS_POSIX diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index d67c550c4980..a279ffc0bc29 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -379,6 +379,7 @@ cifs_reconnect(struct TCP_Server_Info *server) try_to_freeze(); /* we should try only the port we connected to before */ + mutex_lock(&server->srv_mutex); rc = generic_ip_connect(server); if (rc) { cifs_dbg(FYI, "reconnect error %d\n", rc); @@ -390,6 +391,7 @@ cifs_reconnect(struct TCP_Server_Info *server) server->tcpStatus = CifsNeedNegotiate; spin_unlock(&GlobalMid_Lock); } + mutex_unlock(&server->srv_mutex); } while (server->tcpStatus == CifsNeedReconnect); return rc; @@ -1114,7 +1116,7 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol) break; #ifdef CONFIG_CIFS_SMB2 case Smb_20: - vol->ops = &smb21_operations; /* currently identical with 2.1 */ + vol->ops = &smb20_operations; vol->vals = &smb20_values; break; case Smb_21: @@ -1575,8 +1577,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (string == NULL) goto out_nomem; - if (strnlen(string, MAX_USERNAME_SIZE) > - MAX_USERNAME_SIZE) { + if (strnlen(string, CIFS_MAX_USERNAME_LEN) > + CIFS_MAX_USERNAME_LEN) { printk(KERN_WARNING "CIFS: username too long\n"); goto cifs_parse_mount_err; } @@ -2221,13 +2223,13 @@ static int match_session(struct cifs_ses *ses, struct smb_vol *vol) /* anything else takes username/password */ if (strncmp(ses->user_name, vol->username ? vol->username : "", - MAX_USERNAME_SIZE)) + CIFS_MAX_USERNAME_LEN)) return 0; if (strlen(vol->username) != 0 && ses->password != NULL && strncmp(ses->password, vol->password ? vol->password : "", - MAX_PASSWORD_SIZE)) + CIFS_MAX_PASSWORD_LEN)) return 0; } return 1; @@ -2352,7 +2354,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) } len = delim - payload; - if (len > MAX_USERNAME_SIZE || len <= 0) { + if (len > CIFS_MAX_USERNAME_LEN || len <= 0) { cifs_dbg(FYI, "Bad value from username search (len=%zd)\n", len); rc = -EINVAL; @@ -2369,7 +2371,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) cifs_dbg(FYI, "%s: username=%s\n", __func__, vol->username); len = key->datalen - (len + 1); - if (len > MAX_PASSWORD_SIZE || len <= 0) { + if (len > CIFS_MAX_PASSWORD_LEN || len <= 0) { cifs_dbg(FYI, "Bad len for password search (len=%zd)\n", len); rc = -EINVAL; kfree(vol->username); @@ -3826,33 +3828,8 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, if (server->ops->sess_setup) rc = server->ops->sess_setup(xid, ses, nls_info); - if (rc) { + if (rc) cifs_dbg(VFS, "Send error in SessSetup = %d\n", rc); - } else { - mutex_lock(&server->srv_mutex); - if (!server->session_estab) { - server->session_key.response = ses->auth_key.response; - server->session_key.len = ses->auth_key.len; - server->sequence_number = 0x2; - server->session_estab = true; - ses->auth_key.response = NULL; - if (server->ops->generate_signingkey) - server->ops->generate_signingkey(server); - } - mutex_unlock(&server->srv_mutex); - - cifs_dbg(FYI, "CIFS Session Established successfully\n"); - spin_lock(&GlobalMid_Lock); - ses->status = CifsGood; - ses->need_reconnect = false; - spin_unlock(&GlobalMid_Lock); - } - - kfree(ses->auth_key.response); - ses->auth_key.response = NULL; - ses->auth_key.len = 0; - kfree(ses->ntlmssp); - ses->ntlmssp = NULL; return rc; } diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index d62ce0d48141..5384c2a640ca 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -32,6 +32,7 @@ #include "cifsproto.h" #include "cifs_debug.h" #include "cifs_fs_sb.h" +#include "cifs_unicode.h" static void renew_parental_timestamps(struct dentry *direntry) @@ -499,6 +500,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, if (server->ops->close) server->ops->close(xid, tcon, &fid); cifs_del_pending_open(&open); + fput(file); rc = -ENOMEM; } @@ -834,12 +836,17 @@ static int cifs_ci_hash(const struct dentry *dentry, struct qstr *q) { struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls; unsigned long hash; - int i; + wchar_t c; + int i, charlen; hash = init_name_hash(); - for (i = 0; i < q->len; i++) - hash = partial_name_hash(nls_tolower(codepage, q->name[i]), - hash); + for (i = 0; i < q->len; i += charlen) { + charlen = codepage->char2uni(&q->name[i], q->len - i, &c); + /* error out if we can't convert the character */ + if (unlikely(charlen < 0)) + return charlen; + hash = partial_name_hash(cifs_toupper(c), hash); + } q->hash = end_name_hash(hash); return 0; @@ -849,11 +856,47 @@ static int cifs_ci_compare(const struct dentry *parent, const struct dentry *den unsigned int len, const char *str, const struct qstr *name) { struct nls_table *codepage = CIFS_SB(parent->d_sb)->local_nls; + wchar_t c1, c2; + int i, l1, l2; - if ((name->len == len) && - (nls_strnicmp(codepage, name->name, str, len) == 0)) - return 0; - return 1; + /* + * We make the assumption here that uppercase characters in the local + * codepage are always the same length as their lowercase counterparts. + * + * If that's ever not the case, then this will fail to match it. + */ + if (name->len != len) + return 1; + + for (i = 0; i < len; i += l1) { + /* Convert characters in both strings to UTF-16. */ + l1 = codepage->char2uni(&str[i], len - i, &c1); + l2 = codepage->char2uni(&name->name[i], name->len - i, &c2); + + /* + * If we can't convert either character, just declare it to + * be 1 byte long and compare the original byte. + */ + if (unlikely(l1 < 0 && l2 < 0)) { + if (str[i] != name->name[i]) + return 1; + l1 = 1; + continue; + } + + /* + * Here, we again ass|u|me that upper/lowercase versions of + * a character are the same length in the local NLS. + */ + if (l1 != l2) + return 1; + + /* Now compare uppercase versions of these characters */ + if (cifs_toupper(c1) != cifs_toupper(c2)) + return 1; + } + + return 0; } const struct dentry_operations cifs_ci_dentry_ops = { diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 9d0dd952ad79..eb955b525e55 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -313,8 +313,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, * If the server returned a read oplock and we have mandatory brlocks, * set oplock level to None. */ - if (oplock == server->vals->oplock_read && - cifs_has_mand_locks(cinode)) { + if (server->ops->is_read_op(oplock) && cifs_has_mand_locks(cinode)) { cifs_dbg(FYI, "Reset oplock val from read to None due to mand locks\n"); oplock = 0; } @@ -324,6 +323,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, oplock = fid->pending_open->oplock; list_del(&fid->pending_open->olist); + fid->purge_cache = false; server->ops->set_fid(cfile, fid, oplock); list_add(&cfile->tlist, &tcon->openFileList); @@ -334,6 +334,9 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, list_add_tail(&cfile->flist, &cinode->openFileList); spin_unlock(&cifs_file_list_lock); + if (fid->purge_cache) + cifs_invalidate_mapping(inode); + file->private_data = cfile; return cfile; } @@ -1524,12 +1527,12 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, * read won't conflict with non-overlapted locks due to * pagereading. */ - if (!CIFS_I(inode)->clientCanCacheAll && - CIFS_I(inode)->clientCanCacheRead) { + if (!CIFS_CACHE_WRITE(CIFS_I(inode)) && + CIFS_CACHE_READ(CIFS_I(inode))) { cifs_invalidate_mapping(inode); cifs_dbg(FYI, "Set no oplock for inode=%p due to mand locks\n", inode); - CIFS_I(inode)->clientCanCacheRead = false; + CIFS_I(inode)->oplock = 0; } rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, @@ -2213,7 +2216,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, cifs_dbg(FYI, "Sync file - name: %s datasync: 0x%x\n", file->f_path.dentry->d_name.name, datasync); - if (!CIFS_I(inode)->clientCanCacheRead) { + if (!CIFS_CACHE_READ(CIFS_I(inode))) { rc = cifs_invalidate_mapping(inode); if (rc) { cifs_dbg(FYI, "rc: %d during invalidate phase\n", rc); @@ -2577,7 +2580,7 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); ssize_t written; - if (cinode->clientCanCacheAll) { + if (CIFS_CACHE_WRITE(cinode)) { if (cap_unix(tcon->ses) && (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) @@ -2591,7 +2594,7 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, * these pages but not on the region from pos to ppos+len-1. */ written = cifs_user_writev(iocb, iov, nr_segs, pos); - if (written > 0 && cinode->clientCanCacheRead) { + if (written > 0 && CIFS_CACHE_READ(cinode)) { /* * Windows 7 server can delay breaking level2 oplock if a write * request comes - break it on the client to prevent reading @@ -2600,7 +2603,7 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, cifs_invalidate_mapping(inode); cifs_dbg(FYI, "Set no oplock for inode=%p after a write operation\n", inode); - cinode->clientCanCacheRead = false; + cinode->oplock = 0; } return written; } @@ -2957,7 +2960,7 @@ cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, * on pages affected by this read but not on the region from pos to * pos+len-1. */ - if (!cinode->clientCanCacheRead) + if (!CIFS_CACHE_READ(cinode)) return cifs_user_readv(iocb, iov, nr_segs, pos); if (cap_unix(tcon->ses) && @@ -3093,7 +3096,7 @@ int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) xid = get_xid(); - if (!CIFS_I(inode)->clientCanCacheRead) { + if (!CIFS_CACHE_READ(CIFS_I(inode))) { rc = cifs_invalidate_mapping(inode); if (rc) return rc; @@ -3376,6 +3379,9 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, return rc; } +/* + * cifs_readpage_worker must be called with the page pinned + */ static int cifs_readpage_worker(struct file *file, struct page *page, loff_t *poffset) { @@ -3387,7 +3393,6 @@ static int cifs_readpage_worker(struct file *file, struct page *page, if (rc == 0) goto read_complete; - page_cache_get(page); read_data = kmap(page); /* for reads over a certain size could initiate async read ahead */ @@ -3414,7 +3419,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page, io_error: kunmap(page); - page_cache_release(page); + unlock_page(page); read_complete: return rc; @@ -3439,8 +3444,6 @@ static int cifs_readpage(struct file *file, struct page *page) rc = cifs_readpage_worker(file, page, &offset); - unlock_page(page); - free_xid(xid); return rc; } @@ -3494,6 +3497,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { + int oncethru = 0; pgoff_t index = pos >> PAGE_CACHE_SHIFT; loff_t offset = pos & (PAGE_CACHE_SIZE - 1); loff_t page_start = pos & PAGE_MASK; @@ -3503,6 +3507,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping, cifs_dbg(FYI, "write_begin from %lld len %d\n", (long long)pos, len); +start: page = grab_cache_page_write_begin(mapping, index, flags); if (!page) { rc = -ENOMEM; @@ -3526,7 +3531,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping, * is, when the page lies beyond the EOF, or straddles the EOF * and the write will cover all of the existing data. */ - if (CIFS_I(mapping->host)->clientCanCacheRead) { + if (CIFS_CACHE_READ(CIFS_I(mapping->host))) { i_size = i_size_read(mapping->host); if (page_start >= i_size || (offset == 0 && (pos + len) >= i_size)) { @@ -3544,13 +3549,16 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping, } } - if ((file->f_flags & O_ACCMODE) != O_WRONLY) { + if ((file->f_flags & O_ACCMODE) != O_WRONLY && !oncethru) { /* * might as well read a page, it is fast enough. If we get * an error, we don't need to return it. cifs_write_end will * do a sync write instead since PG_uptodate isn't set. */ cifs_readpage_worker(file, page, &page_start); + page_cache_release(page); + oncethru = 1; + goto start; } else { /* we could try using another file handle if there is one - but how would we lock it to prevent close of that handle @@ -3609,20 +3617,20 @@ void cifs_oplock_break(struct work_struct *work) struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); int rc = 0; - if (!cinode->clientCanCacheAll && cinode->clientCanCacheRead && + if (!CIFS_CACHE_WRITE(cinode) && CIFS_CACHE_READ(cinode) && cifs_has_mand_locks(cinode)) { cifs_dbg(FYI, "Reset oplock to None for inode=%p due to mand locks\n", inode); - cinode->clientCanCacheRead = false; + cinode->oplock = 0; } if (inode && S_ISREG(inode->i_mode)) { - if (cinode->clientCanCacheRead) + if (CIFS_CACHE_READ(cinode)) break_lease(inode, O_RDONLY); else break_lease(inode, O_WRONLY); rc = filemap_fdatawrite(inode->i_mapping); - if (cinode->clientCanCacheRead == 0) { + if (!CIFS_CACHE_READ(cinode)) { rc = filemap_fdatawait(inode->i_mapping); mapping_set_error(inode->i_mapping, rc); cifs_invalidate_mapping(inode); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 449b6cf09b09..f9ff9c173f78 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -101,7 +101,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) } /* don't bother with revalidation if we have an oplock */ - if (cifs_i->clientCanCacheRead) { + if (CIFS_CACHE_READ(cifs_i)) { cifs_dbg(FYI, "%s: inode %llu is oplocked\n", __func__, cifs_i->uniqueid); return; @@ -549,6 +549,10 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, * when Unix extensions are disabled - fake it. */ fattr->cf_nlink = 2; + } else if (fattr->cf_cifsattrs & ATTR_REPARSE) { + fattr->cf_mode = S_IFLNK; + fattr->cf_dtype = DT_LNK; + fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks); } else { fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode; fattr->cf_dtype = DT_REG; @@ -646,7 +650,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, cifs_dbg(FYI, "Getting info on %s\n", full_path); if ((data == NULL) && (*inode != NULL)) { - if (CIFS_I(*inode)->clientCanCacheRead) { + if (CIFS_CACHE_READ(CIFS_I(*inode))) { cifs_dbg(FYI, "No need to revalidate cached inode sizes\n"); goto cgii_exit; } @@ -1657,7 +1661,7 @@ cifs_inode_needs_reval(struct inode *inode) struct cifsInodeInfo *cifs_i = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - if (cifs_i->clientCanCacheRead) + if (CIFS_CACHE_READ(cifs_i)) return false; if (!lookupCacheEnabled) @@ -1800,7 +1804,7 @@ int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, * We need to be sure that all dirty pages are written and the server * has actual ctime, mtime and file length. */ - if (!CIFS_I(inode)->clientCanCacheRead && inode->i_mapping && + if (!CIFS_CACHE_READ(CIFS_I(inode)) && inode->i_mapping && inode->i_mapping->nrpages != 0) { rc = filemap_fdatawait(inode->i_mapping); if (rc) { @@ -1852,14 +1856,11 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from) static void cifs_setsize(struct inode *inode, loff_t offset) { - loff_t oldsize; - spin_lock(&inode->i_lock); - oldsize = inode->i_size; i_size_write(inode, offset); spin_unlock(&inode->i_lock); - truncate_pagecache(inode, oldsize, offset); + truncate_pagecache(inode, offset); } static int diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 562044f700e5..7e36ceba0c7a 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -509,6 +509,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd) struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = NULL; struct cifs_tcon *tcon; + struct TCP_Server_Info *server; xid = get_xid(); @@ -519,25 +520,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd) goto out; } tcon = tlink_tcon(tlink); - - /* - * For now, we just handle symlinks with unix extensions enabled. - * Eventually we should handle NTFS reparse points, and MacOS - * symlink support. For instance... - * - * rc = CIFSSMBQueryReparseLinkInfo(...) - * - * For now, just return -EACCES when the server doesn't support posix - * extensions. Note that we still allow querying symlinks when posix - * extensions are manually disabled. We could disable these as well - * but there doesn't seem to be any harm in allowing the client to - * read them. - */ - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) && - !cap_unix(tcon->ses)) { - rc = -EACCES; - goto out; - } + server = tcon->ses->server; full_path = build_path_from_dentry(direntry); if (!full_path) @@ -559,6 +542,9 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd) if ((rc != 0) && cap_unix(tcon->ses)) rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path, cifs_sb->local_nls); + else if (rc != 0 && server->ops->query_symlink) + rc = server->ops->query_symlink(xid, tcon, full_path, + &target_path, cifs_sb); kfree(full_path); out: diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index f7d4b2285efe..138a011633fe 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -105,6 +105,7 @@ sesInfoFree(struct cifs_ses *buf_to_free) } kfree(buf_to_free->user_name); kfree(buf_to_free->domainName); + kfree(buf_to_free->auth_key.response); kfree(buf_to_free); } @@ -545,19 +546,15 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) oplock &= 0xF; if (oplock == OPLOCK_EXCLUSIVE) { - cinode->clientCanCacheAll = true; - cinode->clientCanCacheRead = true; + cinode->oplock = CIFS_CACHE_WRITE_FLG | CIFS_CACHE_READ_FLG; cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n", &cinode->vfs_inode); } else if (oplock == OPLOCK_READ) { - cinode->clientCanCacheAll = false; - cinode->clientCanCacheRead = true; + cinode->oplock = CIFS_CACHE_READ_FLG; cifs_dbg(FYI, "Level II Oplock granted on inode %p\n", &cinode->vfs_inode); - } else { - cinode->clientCanCacheAll = false; - cinode->clientCanCacheRead = false; - } + } else + cinode->oplock = 0; } bool diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 69d2c826a23b..42ef03be089f 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -172,6 +172,9 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) if (cifs_dfs_is_possible(cifs_sb) && (fattr->cf_cifsattrs & ATTR_REPARSE)) fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; + } else if (fattr->cf_cifsattrs & ATTR_REPARSE) { + fattr->cf_mode = S_IFLNK; + fattr->cf_dtype = DT_LNK; } else { fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode; fattr->cf_dtype = DT_REG; diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 08dd37bb23aa..5f99b7f19e78 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -226,7 +226,7 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, *(bcc_ptr+1) = 0; } else { bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->user_name, - MAX_USERNAME_SIZE, nls_cp); + CIFS_MAX_USERNAME_LEN, nls_cp); } bcc_ptr += 2 * bytes_ret; bcc_ptr += 2; /* account for null termination */ @@ -246,8 +246,8 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, /* BB what about null user mounts - check that we do this BB */ /* copy user */ if (ses->user_name != NULL) { - strncpy(bcc_ptr, ses->user_name, MAX_USERNAME_SIZE); - bcc_ptr += strnlen(ses->user_name, MAX_USERNAME_SIZE); + strncpy(bcc_ptr, ses->user_name, CIFS_MAX_USERNAME_LEN); + bcc_ptr += strnlen(ses->user_name, CIFS_MAX_USERNAME_LEN); } /* else null user mount */ *bcc_ptr = 0; @@ -428,7 +428,8 @@ void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC; if (ses->server->sign) { flags |= NTLMSSP_NEGOTIATE_SIGN; - if (!ses->server->session_estab) + if (!ses->server->session_estab || + ses->ntlmssp->sesskey_per_smbsess) flags |= NTLMSSP_NEGOTIATE_KEY_XCH; } @@ -466,7 +467,8 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer, NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC; if (ses->server->sign) { flags |= NTLMSSP_NEGOTIATE_SIGN; - if (!ses->server->session_estab) + if (!ses->server->session_estab || + ses->ntlmssp->sesskey_per_smbsess) flags |= NTLMSSP_NEGOTIATE_KEY_XCH; } @@ -501,7 +503,7 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer, } else { int len; len = cifs_strtoUTF16((__le16 *)tmp, ses->domainName, - MAX_USERNAME_SIZE, nls_cp); + CIFS_MAX_USERNAME_LEN, nls_cp); len *= 2; /* unicode is 2 bytes each */ sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); sec_blob->DomainName.Length = cpu_to_le16(len); @@ -517,7 +519,7 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer, } else { int len; len = cifs_strtoUTF16((__le16 *)tmp, ses->user_name, - MAX_USERNAME_SIZE, nls_cp); + CIFS_MAX_USERNAME_LEN, nls_cp); len *= 2; /* unicode is 2 bytes each */ sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); sec_blob->UserName.Length = cpu_to_le16(len); @@ -629,7 +631,8 @@ CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, type = select_sectype(ses->server, ses->sectype); cifs_dbg(FYI, "sess setup type %d\n", type); if (type == Unspecified) { - cifs_dbg(VFS, "Unable to select appropriate authentication method!"); + cifs_dbg(VFS, + "Unable to select appropriate authentication method!"); return -EINVAL; } @@ -640,6 +643,8 @@ CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); if (!ses->ntlmssp) return -ENOMEM; + ses->ntlmssp->sesskey_per_smbsess = false; + } ssetup_ntlmssp_authenticate: @@ -815,8 +820,9 @@ ssetup_ntlmssp_authenticate: ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, GFP_KERNEL); if (!ses->auth_key.response) { - cifs_dbg(VFS, "Kerberos can't allocate (%u bytes) memory", - msg->sesskey_len); + cifs_dbg(VFS, + "Kerberos can't allocate (%u bytes) memory", + msg->sesskey_len); rc = -ENOMEM; goto ssetup_exit; } @@ -1005,5 +1011,37 @@ ssetup_exit: if ((phase == NtLmChallenge) && (rc == 0)) goto ssetup_ntlmssp_authenticate; + if (!rc) { + mutex_lock(&ses->server->srv_mutex); + if (!ses->server->session_estab) { + if (ses->server->sign) { + ses->server->session_key.response = + kmemdup(ses->auth_key.response, + ses->auth_key.len, GFP_KERNEL); + if (!ses->server->session_key.response) { + rc = -ENOMEM; + mutex_unlock(&ses->server->srv_mutex); + goto keycp_exit; + } + ses->server->session_key.len = + ses->auth_key.len; + } + ses->server->sequence_number = 0x2; + ses->server->session_estab = true; + } + mutex_unlock(&ses->server->srv_mutex); + + cifs_dbg(FYI, "CIFS session established successfully\n"); + spin_lock(&GlobalMid_Lock); + ses->status = CifsGood; + ses->need_reconnect = false; + spin_unlock(&GlobalMid_Lock); + } + +keycp_exit: + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + kfree(ses->ntlmssp); + return rc; } diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 60943978aec3..8233b174de3d 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -700,7 +700,7 @@ cifs_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock) struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); cfile->fid.netfid = fid->netfid; cifs_set_oplock_level(cinode, oplock); - cinode->can_cache_brlcks = cinode->clientCanCacheAll; + cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode); } static void @@ -837,7 +837,7 @@ cifs_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid, { return CIFSSMBLock(0, tcon, fid->netfid, current->tgid, 0, 0, 0, 0, LOCKING_ANDX_OPLOCK_RELEASE, false, - cinode->clientCanCacheRead ? 1 : 0); + CIFS_CACHE_READ(cinode) ? 1 : 0); } static int @@ -881,6 +881,43 @@ cifs_mand_lock(const unsigned int xid, struct cifsFileInfo *cfile, __u64 offset, (__u8)type, wait, 0); } +static int +cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, + const char *full_path, char **target_path, + struct cifs_sb_info *cifs_sb) +{ + int rc; + int oplock = 0; + __u16 netfid; + + cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); + + rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, + FILE_READ_ATTRIBUTES, OPEN_REPARSE_POINT, &netfid, + &oplock, NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc) + return rc; + + rc = CIFSSMBQuerySymLink(xid, tcon, netfid, target_path, + cifs_sb->local_nls); + if (rc) { + CIFSSMBClose(xid, tcon, netfid); + return rc; + } + + convert_delimiter(*target_path, '/'); + CIFSSMBClose(xid, tcon, netfid); + cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path); + return rc; +} + +static bool +cifs_is_read_op(__u32 oplock) +{ + return oplock == OPLOCK_READ; +} + struct smb_version_operations smb1_operations = { .send_cancel = send_nt_cancel, .compare_fids = cifs_compare_fids, @@ -927,6 +964,7 @@ struct smb_version_operations smb1_operations = { .rename_pending_delete = cifs_rename_pending_delete, .rename = CIFSSMBRename, .create_hardlink = CIFSCreateHardLink, + .query_symlink = cifs_query_symlink, .open = cifs_open_file, .set_fid = cifs_set_fid, .close = cifs_close_file, @@ -945,6 +983,7 @@ struct smb_version_operations smb1_operations = { .mand_unlock_range = cifs_unlock_range, .push_mand_locks = cifs_push_mandatory_locks, .query_mf_symlink = open_query_close_cifs_symlink, + .is_read_op = cifs_is_read_op, }; struct smb_version_values smb1_values = { @@ -960,7 +999,6 @@ struct smb_version_values smb1_values = { .cap_unix = CAP_UNIX, .cap_nt_find = CAP_NT_SMBS | CAP_NT_FIND, .cap_large_files = CAP_LARGE_FILES, - .oplock_read = OPLOCK_READ, .signing_enabled = SECMODE_SIGN_ENABLED, .signing_required = SECMODE_SIGN_REQUIRED, }; diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 04a81a4142c3..3f17b4550831 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -34,29 +34,6 @@ #include "fscache.h" #include "smb2proto.h" -void -smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) -{ - oplock &= 0xFF; - if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE) - return; - if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE || - oplock == SMB2_OPLOCK_LEVEL_BATCH) { - cinode->clientCanCacheAll = true; - cinode->clientCanCacheRead = true; - cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n", - &cinode->vfs_inode); - } else if (oplock == SMB2_OPLOCK_LEVEL_II) { - cinode->clientCanCacheAll = false; - cinode->clientCanCacheRead = true; - cifs_dbg(FYI, "Level II Oplock granted on inode %p\n", - &cinode->vfs_inode); - } else { - cinode->clientCanCacheAll = false; - cinode->clientCanCacheRead = false; - } -} - int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock, FILE_ALL_INFO *buf) @@ -86,7 +63,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, if (oparms->tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING) memcpy(smb2_oplock + 1, fid->lease_key, SMB2_LEASE_KEY_SIZE); - rc = SMB2_open(xid, oparms, smb2_path, smb2_oplock, smb2_data); + rc = SMB2_open(xid, oparms, smb2_path, smb2_oplock, smb2_data, NULL); if (rc) goto out; diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index c6ec1633309a..78ff88c467b9 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -60,7 +60,7 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = &fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL); + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); if (rc) { kfree(utf16_path); return rc; @@ -136,7 +136,8 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, return -ENOMEM; rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, 0, smb2_data, + FILE_READ_ATTRIBUTES, FILE_OPEN, + OPEN_REPARSE_POINT, smb2_data, SMB2_OP_QUERY_INFO); if (rc) goto out; @@ -191,8 +192,8 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, - CREATE_DELETE_ON_CLOSE, NULL, - SMB2_OP_DELETE); + CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, + NULL, SMB2_OP_DELETE); } static int diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index b0c43345cd98..fb3966265b6e 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -171,6 +171,10 @@ smb2_check_message(char *buf, unsigned int length) if (4 + len != clc_len) { cifs_dbg(FYI, "Calculated size %u length %u mismatch mid %llu\n", clc_len, 4 + len, mid); + /* create failed on symlink */ + if (command == SMB2_CREATE_HE && + hdr->Status == STATUS_STOPPED_ON_SYMLINK) + return 0; /* Windows 7 server returns 24 bytes more */ if (clc_len + 20 == len && command == SMB2_OPLOCK_BREAK_HE) return 0; @@ -376,23 +380,15 @@ cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb) __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode) { - if (cinode->clientCanCacheAll) - return SMB2_LEASE_WRITE_CACHING | SMB2_LEASE_READ_CACHING; - else if (cinode->clientCanCacheRead) - return SMB2_LEASE_READ_CACHING; - return 0; -} - -__u8 smb2_map_lease_to_oplock(__le32 lease_state) -{ - if (lease_state & SMB2_LEASE_WRITE_CACHING) { - if (lease_state & SMB2_LEASE_HANDLE_CACHING) - return SMB2_OPLOCK_LEVEL_BATCH; - else - return SMB2_OPLOCK_LEVEL_EXCLUSIVE; - } else if (lease_state & SMB2_LEASE_READ_CACHING) - return SMB2_OPLOCK_LEVEL_II; - return 0; + __le32 lease = 0; + + if (CIFS_CACHE_WRITE(cinode)) + lease |= SMB2_LEASE_WRITE_CACHING; + if (CIFS_CACHE_HANDLE(cinode)) + lease |= SMB2_LEASE_HANDLE_CACHING; + if (CIFS_CACHE_READ(cinode)) + lease |= SMB2_LEASE_READ_CACHING; + return lease; } struct smb2_lease_break_work { @@ -417,96 +413,109 @@ cifs_ses_oplock_break(struct work_struct *work) } static bool -smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server) +smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, + struct smb2_lease_break_work *lw) { - struct smb2_lease_break *rsp = (struct smb2_lease_break *)buffer; - struct list_head *tmp, *tmp1, *tmp2; - struct cifs_ses *ses; - struct cifs_tcon *tcon; - struct cifsInodeInfo *cinode; + bool found; + __u8 lease_state; + struct list_head *tmp; struct cifsFileInfo *cfile; + struct TCP_Server_Info *server = tcon->ses->server; struct cifs_pending_open *open; - struct smb2_lease_break_work *lw; - bool found; + struct cifsInodeInfo *cinode; int ack_req = le32_to_cpu(rsp->Flags & SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED); - lw = kmalloc(sizeof(struct smb2_lease_break_work), GFP_KERNEL); - if (!lw) - return false; + lease_state = le32_to_cpu(rsp->NewLeaseState); - INIT_WORK(&lw->lease_break, cifs_ses_oplock_break); - lw->lease_state = rsp->NewLeaseState; + list_for_each(tmp, &tcon->openFileList) { + cfile = list_entry(tmp, struct cifsFileInfo, tlist); + cinode = CIFS_I(cfile->dentry->d_inode); - cifs_dbg(FYI, "Checking for lease break\n"); + if (memcmp(cinode->lease_key, rsp->LeaseKey, + SMB2_LEASE_KEY_SIZE)) + continue; - /* look up tcon based on tid & uid */ - spin_lock(&cifs_tcp_ses_lock); - list_for_each(tmp, &server->smb_ses_list) { - ses = list_entry(tmp, struct cifs_ses, smb_ses_list); + cifs_dbg(FYI, "found in the open list\n"); + cifs_dbg(FYI, "lease key match, lease break 0x%d\n", + le32_to_cpu(rsp->NewLeaseState)); - spin_lock(&cifs_file_list_lock); - list_for_each(tmp1, &ses->tcon_list) { - tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); + server->ops->set_oplock_level(cinode, lease_state, 0, NULL); - cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks); - list_for_each(tmp2, &tcon->openFileList) { - cfile = list_entry(tmp2, struct cifsFileInfo, - tlist); - cinode = CIFS_I(cfile->dentry->d_inode); + if (ack_req) + cfile->oplock_break_cancelled = false; + else + cfile->oplock_break_cancelled = true; - if (memcmp(cinode->lease_key, rsp->LeaseKey, - SMB2_LEASE_KEY_SIZE)) - continue; + queue_work(cifsiod_wq, &cfile->oplock_break); + kfree(lw); + return true; + } - cifs_dbg(FYI, "found in the open list\n"); - cifs_dbg(FYI, "lease key match, lease break 0x%d\n", - le32_to_cpu(rsp->NewLeaseState)); + found = false; + list_for_each_entry(open, &tcon->pending_opens, olist) { + if (memcmp(open->lease_key, rsp->LeaseKey, + SMB2_LEASE_KEY_SIZE)) + continue; + + if (!found && ack_req) { + found = true; + memcpy(lw->lease_key, open->lease_key, + SMB2_LEASE_KEY_SIZE); + lw->tlink = cifs_get_tlink(open->tlink); + queue_work(cifsiod_wq, &lw->lease_break); + } - smb2_set_oplock_level(cinode, - smb2_map_lease_to_oplock(rsp->NewLeaseState)); + cifs_dbg(FYI, "found in the pending open list\n"); + cifs_dbg(FYI, "lease key match, lease break 0x%d\n", + le32_to_cpu(rsp->NewLeaseState)); - if (ack_req) - cfile->oplock_break_cancelled = false; - else - cfile->oplock_break_cancelled = true; + open->oplock = lease_state; + } + return found; +} - queue_work(cifsiod_wq, &cfile->oplock_break); +static bool +smb2_is_valid_lease_break(char *buffer) +{ + struct smb2_lease_break *rsp = (struct smb2_lease_break *)buffer; + struct list_head *tmp, *tmp1, *tmp2; + struct TCP_Server_Info *server; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + struct smb2_lease_break_work *lw; - spin_unlock(&cifs_file_list_lock); - spin_unlock(&cifs_tcp_ses_lock); - return true; - } + lw = kmalloc(sizeof(struct smb2_lease_break_work), GFP_KERNEL); + if (!lw) + return false; - found = false; - list_for_each_entry(open, &tcon->pending_opens, olist) { - if (memcmp(open->lease_key, rsp->LeaseKey, - SMB2_LEASE_KEY_SIZE)) - continue; + INIT_WORK(&lw->lease_break, cifs_ses_oplock_break); + lw->lease_state = rsp->NewLeaseState; - if (!found && ack_req) { - found = true; - memcpy(lw->lease_key, open->lease_key, - SMB2_LEASE_KEY_SIZE); - lw->tlink = cifs_get_tlink(open->tlink); - queue_work(cifsiod_wq, - &lw->lease_break); - } + cifs_dbg(FYI, "Checking for lease break\n"); - cifs_dbg(FYI, "found in the pending open list\n"); - cifs_dbg(FYI, "lease key match, lease break 0x%d\n", - le32_to_cpu(rsp->NewLeaseState)); + /* look up tcon based on tid & uid */ + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp, &cifs_tcp_ses_list) { + server = list_entry(tmp, struct TCP_Server_Info, tcp_ses_list); - open->oplock = - smb2_map_lease_to_oplock(rsp->NewLeaseState); - } - if (found) { - spin_unlock(&cifs_file_list_lock); - spin_unlock(&cifs_tcp_ses_lock); - return true; + list_for_each(tmp1, &server->smb_ses_list) { + ses = list_entry(tmp1, struct cifs_ses, smb_ses_list); + + spin_lock(&cifs_file_list_lock); + list_for_each(tmp2, &ses->tcon_list) { + tcon = list_entry(tmp2, struct cifs_tcon, + tcon_list); + cifs_stats_inc( + &tcon->stats.cifs_stats.num_oplock_brks); + if (smb2_tcon_has_lease(tcon, rsp, lw)) { + spin_unlock(&cifs_file_list_lock); + spin_unlock(&cifs_tcp_ses_lock); + return true; + } } + spin_unlock(&cifs_file_list_lock); } - spin_unlock(&cifs_file_list_lock); } spin_unlock(&cifs_tcp_ses_lock); kfree(lw); @@ -532,7 +541,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) if (rsp->StructureSize != smb2_rsp_struct_sizes[SMB2_OPLOCK_BREAK_HE]) { if (le16_to_cpu(rsp->StructureSize) == 44) - return smb2_is_valid_lease_break(buffer, server); + return smb2_is_valid_lease_break(buffer); else return false; } @@ -560,14 +569,15 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) cifs_dbg(FYI, "file id match, oplock break\n"); cinode = CIFS_I(cfile->dentry->d_inode); - if (!cinode->clientCanCacheAll && + if (!CIFS_CACHE_WRITE(cinode) && rsp->OplockLevel == SMB2_OPLOCK_LEVEL_NONE) cfile->oplock_break_cancelled = true; else cfile->oplock_break_cancelled = false; - smb2_set_oplock_level(cinode, - rsp->OplockLevel ? SMB2_OPLOCK_LEVEL_II : 0); + server->ops->set_oplock_level(cinode, + rsp->OplockLevel ? SMB2_OPLOCK_LEVEL_II : 0, + 0, NULL); queue_work(cifsiod_wq, &cfile->oplock_break); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index f259e6cc8357..861b33214144 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -24,6 +24,7 @@ #include "smb2proto.h" #include "cifsproto.h" #include "cifs_debug.h" +#include "cifs_unicode.h" #include "smb2status.h" #include "smb2glob.h" @@ -229,7 +230,7 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = &fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL); + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); if (rc) { kfree(utf16_path); return rc; @@ -376,10 +377,13 @@ static void smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock) { struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; + cfile->fid.persistent_fid = fid->persistent_fid; cfile->fid.volatile_fid = fid->volatile_fid; - smb2_set_oplock_level(cinode, oplock); - cinode->can_cache_brlcks = cinode->clientCanCacheAll; + server->ops->set_oplock_level(cinode, oplock, fid->epoch, + &fid->purge_cache); + cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode); } static void @@ -463,7 +467,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL); + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); kfree(utf16_path); if (rc) { cifs_dbg(VFS, "open dir failed\n"); @@ -530,7 +534,7 @@ smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid, return SMB2_oplock_break(0, tcon, fid->persistent_fid, fid->volatile_fid, - cinode->clientCanCacheRead ? 1 : 0); + CIFS_CACHE_READ(cinode) ? 1 : 0); } static int @@ -550,7 +554,7 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = &fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL); + rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL); if (rc) return rc; buf->f_type = SMB2_MAGIC_NUMBER; @@ -596,7 +600,245 @@ smb2_new_lease_key(struct cifs_fid *fid) get_random_bytes(fid->lease_key, SMB2_LEASE_KEY_SIZE); } -struct smb_version_operations smb21_operations = { +static int +smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, + const char *full_path, char **target_path, + struct cifs_sb_info *cifs_sb) +{ + int rc; + __le16 *utf16_path; + __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + struct cifs_open_parms oparms; + struct cifs_fid fid; + struct smb2_err_rsp *err_buf = NULL; + struct smb2_symlink_err_rsp *symlink; + unsigned int sub_len, sub_offset; + + cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); + + utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); + if (!utf16_path) + return -ENOMEM; + + oparms.tcon = tcon; + oparms.desired_access = FILE_READ_ATTRIBUTES; + oparms.disposition = FILE_OPEN; + oparms.create_options = 0; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, &err_buf); + + if (!rc || !err_buf) { + kfree(utf16_path); + return -ENOENT; + } + /* open must fail on symlink - reset rc */ + rc = 0; + symlink = (struct smb2_symlink_err_rsp *)err_buf->ErrorData; + sub_len = le16_to_cpu(symlink->SubstituteNameLength); + sub_offset = le16_to_cpu(symlink->SubstituteNameOffset); + *target_path = cifs_strndup_from_utf16( + (char *)symlink->PathBuffer + sub_offset, + sub_len, true, cifs_sb->local_nls); + if (!(*target_path)) { + kfree(utf16_path); + return -ENOMEM; + } + convert_delimiter(*target_path, '/'); + cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path); + kfree(utf16_path); + return rc; +} + +static void +smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, + unsigned int epoch, bool *purge_cache) +{ + oplock &= 0xFF; + if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE) + return; + if (oplock == SMB2_OPLOCK_LEVEL_BATCH) { + cinode->oplock = CIFS_CACHE_RHW_FLG; + cifs_dbg(FYI, "Batch Oplock granted on inode %p\n", + &cinode->vfs_inode); + } else if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) { + cinode->oplock = CIFS_CACHE_RW_FLG; + cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n", + &cinode->vfs_inode); + } else if (oplock == SMB2_OPLOCK_LEVEL_II) { + cinode->oplock = CIFS_CACHE_READ_FLG; + cifs_dbg(FYI, "Level II Oplock granted on inode %p\n", + &cinode->vfs_inode); + } else + cinode->oplock = 0; +} + +static void +smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, + unsigned int epoch, bool *purge_cache) +{ + char message[5] = {0}; + + oplock &= 0xFF; + if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE) + return; + + cinode->oplock = 0; + if (oplock & SMB2_LEASE_READ_CACHING_HE) { + cinode->oplock |= CIFS_CACHE_READ_FLG; + strcat(message, "R"); + } + if (oplock & SMB2_LEASE_HANDLE_CACHING_HE) { + cinode->oplock |= CIFS_CACHE_HANDLE_FLG; + strcat(message, "H"); + } + if (oplock & SMB2_LEASE_WRITE_CACHING_HE) { + cinode->oplock |= CIFS_CACHE_WRITE_FLG; + strcat(message, "W"); + } + if (!cinode->oplock) + strcat(message, "None"); + cifs_dbg(FYI, "%s Lease granted on inode %p\n", message, + &cinode->vfs_inode); +} + +static void +smb3_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, + unsigned int epoch, bool *purge_cache) +{ + unsigned int old_oplock = cinode->oplock; + + smb21_set_oplock_level(cinode, oplock, epoch, purge_cache); + + if (purge_cache) { + *purge_cache = false; + if (old_oplock == CIFS_CACHE_READ_FLG) { + if (cinode->oplock == CIFS_CACHE_READ_FLG && + (epoch - cinode->epoch > 0)) + *purge_cache = true; + else if (cinode->oplock == CIFS_CACHE_RH_FLG && + (epoch - cinode->epoch > 1)) + *purge_cache = true; + else if (cinode->oplock == CIFS_CACHE_RHW_FLG && + (epoch - cinode->epoch > 1)) + *purge_cache = true; + else if (cinode->oplock == 0 && + (epoch - cinode->epoch > 0)) + *purge_cache = true; + } else if (old_oplock == CIFS_CACHE_RH_FLG) { + if (cinode->oplock == CIFS_CACHE_RH_FLG && + (epoch - cinode->epoch > 0)) + *purge_cache = true; + else if (cinode->oplock == CIFS_CACHE_RHW_FLG && + (epoch - cinode->epoch > 1)) + *purge_cache = true; + } + cinode->epoch = epoch; + } +} + +static bool +smb2_is_read_op(__u32 oplock) +{ + return oplock == SMB2_OPLOCK_LEVEL_II; +} + +static bool +smb21_is_read_op(__u32 oplock) +{ + return (oplock & SMB2_LEASE_READ_CACHING_HE) && + !(oplock & SMB2_LEASE_WRITE_CACHING_HE); +} + +static __le32 +map_oplock_to_lease(u8 oplock) +{ + if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) + return SMB2_LEASE_WRITE_CACHING | SMB2_LEASE_READ_CACHING; + else if (oplock == SMB2_OPLOCK_LEVEL_II) + return SMB2_LEASE_READ_CACHING; + else if (oplock == SMB2_OPLOCK_LEVEL_BATCH) + return SMB2_LEASE_HANDLE_CACHING | SMB2_LEASE_READ_CACHING | + SMB2_LEASE_WRITE_CACHING; + return 0; +} + +static char * +smb2_create_lease_buf(u8 *lease_key, u8 oplock) +{ + struct create_lease *buf; + + buf = kzalloc(sizeof(struct create_lease), GFP_KERNEL); + if (!buf) + return NULL; + + buf->lcontext.LeaseKeyLow = cpu_to_le64(*((u64 *)lease_key)); + buf->lcontext.LeaseKeyHigh = cpu_to_le64(*((u64 *)(lease_key + 8))); + buf->lcontext.LeaseState = map_oplock_to_lease(oplock); + + buf->ccontext.DataOffset = cpu_to_le16(offsetof + (struct create_lease, lcontext)); + buf->ccontext.DataLength = cpu_to_le32(sizeof(struct lease_context)); + buf->ccontext.NameOffset = cpu_to_le16(offsetof + (struct create_lease, Name)); + buf->ccontext.NameLength = cpu_to_le16(4); + buf->Name[0] = 'R'; + buf->Name[1] = 'q'; + buf->Name[2] = 'L'; + buf->Name[3] = 's'; + return (char *)buf; +} + +static char * +smb3_create_lease_buf(u8 *lease_key, u8 oplock) +{ + struct create_lease_v2 *buf; + + buf = kzalloc(sizeof(struct create_lease_v2), GFP_KERNEL); + if (!buf) + return NULL; + + buf->lcontext.LeaseKeyLow = cpu_to_le64(*((u64 *)lease_key)); + buf->lcontext.LeaseKeyHigh = cpu_to_le64(*((u64 *)(lease_key + 8))); + buf->lcontext.LeaseState = map_oplock_to_lease(oplock); + + buf->ccontext.DataOffset = cpu_to_le16(offsetof + (struct create_lease_v2, lcontext)); + buf->ccontext.DataLength = cpu_to_le32(sizeof(struct lease_context_v2)); + buf->ccontext.NameOffset = cpu_to_le16(offsetof + (struct create_lease_v2, Name)); + buf->ccontext.NameLength = cpu_to_le16(4); + buf->Name[0] = 'R'; + buf->Name[1] = 'q'; + buf->Name[2] = 'L'; + buf->Name[3] = 's'; + return (char *)buf; +} + +static __u8 +smb2_parse_lease_buf(void *buf, unsigned int *epoch) +{ + struct create_lease *lc = (struct create_lease *)buf; + + *epoch = 0; /* not used */ + if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS) + return SMB2_OPLOCK_LEVEL_NOCHANGE; + return le32_to_cpu(lc->lcontext.LeaseState); +} + +static __u8 +smb3_parse_lease_buf(void *buf, unsigned int *epoch) +{ + struct create_lease_v2 *lc = (struct create_lease_v2 *)buf; + + *epoch = le16_to_cpu(lc->lcontext.Epoch); + if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS) + return SMB2_OPLOCK_LEVEL_NOCHANGE; + return le32_to_cpu(lc->lcontext.LeaseState); +} + +struct smb_version_operations smb20_operations = { .compare_fids = smb2_compare_fids, .setup_request = smb2_setup_request, .setup_async_request = smb2_setup_async_request, @@ -638,6 +880,7 @@ struct smb_version_operations smb21_operations = { .unlink = smb2_unlink, .rename = smb2_rename_path, .create_hardlink = smb2_create_hardlink, + .query_symlink = smb2_query_symlink, .open = smb2_open_file, .set_fid = smb2_set_fid, .close = smb2_close_file, @@ -660,8 +903,82 @@ struct smb_version_operations smb21_operations = { .set_lease_key = smb2_set_lease_key, .new_lease_key = smb2_new_lease_key, .calc_signature = smb2_calc_signature, + .is_read_op = smb2_is_read_op, + .set_oplock_level = smb2_set_oplock_level, + .create_lease_buf = smb2_create_lease_buf, + .parse_lease_buf = smb2_parse_lease_buf, }; +struct smb_version_operations smb21_operations = { + .compare_fids = smb2_compare_fids, + .setup_request = smb2_setup_request, + .setup_async_request = smb2_setup_async_request, + .check_receive = smb2_check_receive, + .add_credits = smb2_add_credits, + .set_credits = smb2_set_credits, + .get_credits_field = smb2_get_credits_field, + .get_credits = smb2_get_credits, + .get_next_mid = smb2_get_next_mid, + .read_data_offset = smb2_read_data_offset, + .read_data_length = smb2_read_data_length, + .map_error = map_smb2_to_linux_error, + .find_mid = smb2_find_mid, + .check_message = smb2_check_message, + .dump_detail = smb2_dump_detail, + .clear_stats = smb2_clear_stats, + .print_stats = smb2_print_stats, + .is_oplock_break = smb2_is_valid_oplock_break, + .need_neg = smb2_need_neg, + .negotiate = smb2_negotiate, + .negotiate_wsize = smb2_negotiate_wsize, + .negotiate_rsize = smb2_negotiate_rsize, + .sess_setup = SMB2_sess_setup, + .logoff = SMB2_logoff, + .tree_connect = SMB2_tcon, + .tree_disconnect = SMB2_tdis, + .is_path_accessible = smb2_is_path_accessible, + .can_echo = smb2_can_echo, + .echo = SMB2_echo, + .query_path_info = smb2_query_path_info, + .get_srv_inum = smb2_get_srv_inum, + .query_file_info = smb2_query_file_info, + .set_path_size = smb2_set_path_size, + .set_file_size = smb2_set_file_size, + .set_file_info = smb2_set_file_info, + .mkdir = smb2_mkdir, + .mkdir_setinfo = smb2_mkdir_setinfo, + .rmdir = smb2_rmdir, + .unlink = smb2_unlink, + .rename = smb2_rename_path, + .create_hardlink = smb2_create_hardlink, + .query_symlink = smb2_query_symlink, + .open = smb2_open_file, + .set_fid = smb2_set_fid, + .close = smb2_close_file, + .flush = smb2_flush_file, + .async_readv = smb2_async_readv, + .async_writev = smb2_async_writev, + .sync_read = smb2_sync_read, + .sync_write = smb2_sync_write, + .query_dir_first = smb2_query_dir_first, + .query_dir_next = smb2_query_dir_next, + .close_dir = smb2_close_dir, + .calc_smb_size = smb2_calc_size, + .is_status_pending = smb2_is_status_pending, + .oplock_response = smb2_oplock_response, + .queryfs = smb2_queryfs, + .mand_lock = smb2_mand_lock, + .mand_unlock_range = smb2_unlock_range, + .push_mand_locks = smb2_push_mandatory_locks, + .get_lease_key = smb2_get_lease_key, + .set_lease_key = smb2_set_lease_key, + .new_lease_key = smb2_new_lease_key, + .calc_signature = smb2_calc_signature, + .is_read_op = smb21_is_read_op, + .set_oplock_level = smb21_set_oplock_level, + .create_lease_buf = smb2_create_lease_buf, + .parse_lease_buf = smb2_parse_lease_buf, +}; struct smb_version_operations smb30_operations = { .compare_fids = smb2_compare_fids, @@ -706,6 +1023,7 @@ struct smb_version_operations smb30_operations = { .unlink = smb2_unlink, .rename = smb2_rename_path, .create_hardlink = smb2_create_hardlink, + .query_symlink = smb2_query_symlink, .open = smb2_open_file, .set_fid = smb2_set_fid, .close = smb2_close_file, @@ -729,6 +1047,10 @@ struct smb_version_operations smb30_operations = { .new_lease_key = smb2_new_lease_key, .generate_signingkey = generate_smb3signingkey, .calc_signature = smb3_calc_signature, + .is_read_op = smb21_is_read_op, + .set_oplock_level = smb3_set_oplock_level, + .create_lease_buf = smb3_create_lease_buf, + .parse_lease_buf = smb3_parse_lease_buf, }; struct smb_version_values smb20_values = { @@ -746,9 +1068,9 @@ struct smb_version_values smb20_values = { .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, .cap_large_files = SMB2_LARGE_FILES, - .oplock_read = SMB2_OPLOCK_LEVEL_II, .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, + .create_lease_size = sizeof(struct create_lease), }; struct smb_version_values smb21_values = { @@ -766,9 +1088,9 @@ struct smb_version_values smb21_values = { .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, .cap_large_files = SMB2_LARGE_FILES, - .oplock_read = SMB2_OPLOCK_LEVEL_II, .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, + .create_lease_size = sizeof(struct create_lease), }; struct smb_version_values smb30_values = { @@ -786,9 +1108,9 @@ struct smb_version_values smb30_values = { .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, .cap_large_files = SMB2_LARGE_FILES, - .oplock_read = SMB2_OPLOCK_LEVEL_II, .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, + .create_lease_size = sizeof(struct create_lease_v2), }; struct smb_version_values smb302_values = { @@ -806,7 +1128,7 @@ struct smb_version_values smb302_values = { .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, .cap_large_files = SMB2_LARGE_FILES, - .oplock_read = SMB2_OPLOCK_LEVEL_II, .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, + .create_lease_size = sizeof(struct create_lease_v2), }; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index abc9c2809b51..eba0efde66d7 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -478,12 +478,20 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, } /* + * If we are here due to reconnect, free per-smb session key + * in case signing was required. + */ + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + + /* * If memory allocation is successful, caller of this function * frees it. */ ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); if (!ses->ntlmssp) return -ENOMEM; + ses->ntlmssp->sesskey_per_smbsess = true; /* FIXME: allow for other auth types besides NTLMSSP (e.g. krb5) */ ses->sectype = RawNTLMSSP; @@ -628,6 +636,40 @@ ssetup_exit: /* if ntlmssp, and negotiate succeeded, proceed to authenticate phase */ if ((phase == NtLmChallenge) && (rc == 0)) goto ssetup_ntlmssp_authenticate; + + if (!rc) { + mutex_lock(&server->srv_mutex); + if (server->sign && server->ops->generate_signingkey) { + rc = server->ops->generate_signingkey(ses); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + if (rc) { + cifs_dbg(FYI, + "SMB3 session key generation failed\n"); + mutex_unlock(&server->srv_mutex); + goto keygen_exit; + } + } + if (!server->session_estab) { + server->sequence_number = 0x2; + server->session_estab = true; + } + mutex_unlock(&server->srv_mutex); + + cifs_dbg(FYI, "SMB2/3 session established successfully\n"); + spin_lock(&GlobalMid_Lock); + ses->status = CifsGood; + ses->need_reconnect = false; + spin_unlock(&GlobalMid_Lock); + } + +keygen_exit: + if (!server->sign) { + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + } + kfree(ses->ntlmssp); + return rc; } @@ -813,39 +855,6 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) return rc; } -static struct create_lease * -create_lease_buf(u8 *lease_key, u8 oplock) -{ - struct create_lease *buf; - - buf = kzalloc(sizeof(struct create_lease), GFP_KERNEL); - if (!buf) - return NULL; - - buf->lcontext.LeaseKeyLow = cpu_to_le64(*((u64 *)lease_key)); - buf->lcontext.LeaseKeyHigh = cpu_to_le64(*((u64 *)(lease_key + 8))); - if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) - buf->lcontext.LeaseState = SMB2_LEASE_WRITE_CACHING | - SMB2_LEASE_READ_CACHING; - else if (oplock == SMB2_OPLOCK_LEVEL_II) - buf->lcontext.LeaseState = SMB2_LEASE_READ_CACHING; - else if (oplock == SMB2_OPLOCK_LEVEL_BATCH) - buf->lcontext.LeaseState = SMB2_LEASE_HANDLE_CACHING | - SMB2_LEASE_READ_CACHING | - SMB2_LEASE_WRITE_CACHING; - - buf->ccontext.DataOffset = cpu_to_le16(offsetof - (struct create_lease, lcontext)); - buf->ccontext.DataLength = cpu_to_le32(sizeof(struct lease_context)); - buf->ccontext.NameOffset = cpu_to_le16(offsetof - (struct create_lease, Name)); - buf->ccontext.NameLength = cpu_to_le16(4); - buf->Name[0] = 'R'; - buf->Name[1] = 'q'; - buf->Name[2] = 'L'; - buf->Name[3] = 's'; - return buf; -} static struct create_durable * create_durable_buf(void) @@ -894,55 +903,49 @@ create_reconnect_durable_buf(struct cifs_fid *fid) } static __u8 -parse_lease_state(struct smb2_create_rsp *rsp) +parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp, + unsigned int *epoch) { char *data_offset; - struct create_lease *lc; - bool found = false; + struct create_context *cc; unsigned int next = 0; char *name; data_offset = (char *)rsp + 4 + le32_to_cpu(rsp->CreateContextsOffset); - lc = (struct create_lease *)data_offset; + cc = (struct create_context *)data_offset; do { - lc = (struct create_lease *)((char *)lc + next); - name = le16_to_cpu(lc->ccontext.NameOffset) + (char *)lc; - if (le16_to_cpu(lc->ccontext.NameLength) != 4 || + cc = (struct create_context *)((char *)cc + next); + name = le16_to_cpu(cc->NameOffset) + (char *)cc; + if (le16_to_cpu(cc->NameLength) != 4 || strncmp(name, "RqLs", 4)) { - next = le32_to_cpu(lc->ccontext.Next); + next = le32_to_cpu(cc->Next); continue; } - if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS) - return SMB2_OPLOCK_LEVEL_NOCHANGE; - found = true; - break; + return server->ops->parse_lease_buf(cc, epoch); } while (next != 0); - if (!found) - return 0; - - return smb2_map_lease_to_oplock(lc->lcontext.LeaseState); + return 0; } static int -add_lease_context(struct kvec *iov, unsigned int *num_iovec, __u8 *oplock) +add_lease_context(struct TCP_Server_Info *server, struct kvec *iov, + unsigned int *num_iovec, __u8 *oplock) { struct smb2_create_req *req = iov[0].iov_base; unsigned int num = *num_iovec; - iov[num].iov_base = create_lease_buf(oplock+1, *oplock); + iov[num].iov_base = server->ops->create_lease_buf(oplock+1, *oplock); if (iov[num].iov_base == NULL) return -ENOMEM; - iov[num].iov_len = sizeof(struct create_lease); + iov[num].iov_len = server->vals->create_lease_size; req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_LEASE; if (!req->CreateContextsOffset) req->CreateContextsOffset = cpu_to_le32( sizeof(struct smb2_create_req) - 4 + iov[num - 1].iov_len); - req->CreateContextsLength = cpu_to_le32( - le32_to_cpu(req->CreateContextsLength) + - sizeof(struct create_lease)); - inc_rfc1001_len(&req->hdr, sizeof(struct create_lease)); + le32_add_cpu(&req->CreateContextsLength, + server->vals->create_lease_size); + inc_rfc1001_len(&req->hdr, server->vals->create_lease_size); *num_iovec = num + 1; return 0; } @@ -967,9 +970,7 @@ add_durable_context(struct kvec *iov, unsigned int *num_iovec, req->CreateContextsOffset = cpu_to_le32(sizeof(struct smb2_create_req) - 4 + iov[1].iov_len); - req->CreateContextsLength = - cpu_to_le32(le32_to_cpu(req->CreateContextsLength) + - sizeof(struct create_durable)); + le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_durable)); inc_rfc1001_len(&req->hdr, sizeof(struct create_durable)); *num_iovec = num + 1; return 0; @@ -977,7 +978,8 @@ add_durable_context(struct kvec *iov, unsigned int *num_iovec, int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, - __u8 *oplock, struct smb2_file_all_info *buf) + __u8 *oplock, struct smb2_file_all_info *buf, + struct smb2_err_rsp **err_buf) { struct smb2_create_req *req; struct smb2_create_rsp *rsp; @@ -1048,11 +1050,11 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, if (!server->oplocks) *oplock = SMB2_OPLOCK_LEVEL_NONE; - if (!(tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING) || + if (!(server->capabilities & SMB2_GLOBAL_CAP_LEASING) || *oplock == SMB2_OPLOCK_LEVEL_NONE) req->RequestedOplockLevel = *oplock; else { - rc = add_lease_context(iov, &num_iovecs, oplock); + rc = add_lease_context(server, iov, &num_iovecs, oplock); if (rc) { cifs_small_buf_release(req); kfree(copy_path); @@ -1062,11 +1064,11 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, if (*oplock == SMB2_OPLOCK_LEVEL_BATCH) { /* need to set Next field of lease context if we request it */ - if (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING) { + if (server->capabilities & SMB2_GLOBAL_CAP_LEASING) { struct create_context *ccontext = (struct create_context *)iov[num_iovecs-1].iov_base; ccontext->Next = - cpu_to_le32(sizeof(struct create_lease)); + cpu_to_le32(server->vals->create_lease_size); } rc = add_durable_context(iov, &num_iovecs, oparms); if (rc) { @@ -1082,6 +1084,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, if (rc != 0) { cifs_stats_fail_inc(tcon, SMB2_CREATE_HE); + if (err_buf) + *err_buf = kmemdup(rsp, get_rfc1002_length(rsp) + 4, + GFP_KERNEL); goto creat_exit; } @@ -1098,7 +1103,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, } if (rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) - *oplock = parse_lease_state(rsp); + *oplock = parse_lease_state(server, rsp, &oparms->fid->epoch); else *oplock = rsp->OplockLevel; creat_exit: diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 36b0d37ea69b..b83d0118a757 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -150,6 +150,20 @@ struct smb2_err_rsp { __u8 ErrorData[1]; /* variable length */ } __packed; +struct smb2_symlink_err_rsp { + __le32 SymLinkLength; + __le32 SymLinkErrorTag; + __le32 ReparseTag; + __le16 ReparseDataLength; + __le16 UnparsedPathLength; + __le16 SubstituteNameOffset; + __le16 SubstituteNameLength; + __le16 PrintNameOffset; + __le16 PrintNameLength; + __le32 Flags; + __u8 PathBuffer[0]; +} __packed; + #define SMB2_CLIENT_GUID_SIZE 16 extern __u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE]; @@ -462,6 +476,10 @@ struct create_context { __u8 Buffer[0]; } __packed; +#define SMB2_LEASE_READ_CACHING_HE 0x01 +#define SMB2_LEASE_HANDLE_CACHING_HE 0x02 +#define SMB2_LEASE_WRITE_CACHING_HE 0x04 + #define SMB2_LEASE_NONE __constant_cpu_to_le32(0x00) #define SMB2_LEASE_READ_CACHING __constant_cpu_to_le32(0x01) #define SMB2_LEASE_HANDLE_CACHING __constant_cpu_to_le32(0x02) @@ -479,12 +497,31 @@ struct lease_context { __le64 LeaseDuration; } __packed; +struct lease_context_v2 { + __le64 LeaseKeyLow; + __le64 LeaseKeyHigh; + __le32 LeaseState; + __le32 LeaseFlags; + __le64 LeaseDuration; + __le64 ParentLeaseKeyLow; + __le64 ParentLeaseKeyHigh; + __le16 Epoch; + __le16 Reserved; +} __packed; + struct create_lease { struct create_context ccontext; __u8 Name[8]; struct lease_context lcontext; } __packed; +struct create_lease_v2 { + struct create_context ccontext; + __u8 Name[8]; + struct lease_context_v2 lcontext; + __u8 Pad[4]; +} __packed; + struct create_durable { struct create_context ccontext; __u8 Name[8]; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 1a5ecbed40ed..e3fb4801ee96 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -53,7 +53,6 @@ extern int smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server); extern void smb2_echo_request(struct work_struct *work); extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode); -extern __u8 smb2_map_lease_to_oplock(__le32 lease_state); extern bool smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv); @@ -87,7 +86,6 @@ extern int smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon, extern int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock, FILE_ALL_INFO *buf); -extern void smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock); extern int smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, const unsigned int xid); extern int smb2_push_mandatory_locks(struct cifsFileInfo *cfile); @@ -106,7 +104,8 @@ extern int SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, extern int SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon); extern int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, __u8 *oplock, - struct smb2_file_all_info *buf); + struct smb2_file_all_info *buf, + struct smb2_err_rsp **err_buf); extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, u32 opcode, bool is_fsctl, char *in_data, u32 indatalen, diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 4f2300d020c7..340abca3aa52 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -114,6 +114,23 @@ smb3_crypto_shash_allocate(struct TCP_Server_Info *server) return 0; } +static struct cifs_ses * +smb2_find_smb_ses(struct smb2_hdr *smb2hdr, struct TCP_Server_Info *server) +{ + struct cifs_ses *ses; + + spin_lock(&cifs_tcp_ses_lock); + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + if (ses->Suid != smb2hdr->SessionId) + continue; + spin_unlock(&cifs_tcp_ses_lock); + return ses; + } + spin_unlock(&cifs_tcp_ses_lock); + + return NULL; +} + int smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) @@ -124,6 +141,13 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) struct kvec *iov = rqst->rq_iov; int n_vec = rqst->rq_nvec; struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base; + struct cifs_ses *ses; + + ses = smb2_find_smb_ses(smb2_pdu, server); + if (!ses) { + cifs_dbg(VFS, "%s: Could not find session\n", __func__); + return 0; + } memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE); memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE); @@ -135,7 +159,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) } rc = crypto_shash_setkey(server->secmech.hmacsha256, - server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE); + ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not update with response\n", __func__); return rc; @@ -198,8 +222,8 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) return rc; } -void -generate_smb3signingkey(struct TCP_Server_Info *server) +int +generate_smb3signingkey(struct cifs_ses *ses) { unsigned char zero = 0x0; __u8 i[4] = {0, 0, 0, 1}; @@ -209,90 +233,99 @@ generate_smb3signingkey(struct TCP_Server_Info *server) unsigned char *hashptr = prfhash; memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE); - memset(server->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE); + memset(ses->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE); - rc = smb3_crypto_shash_allocate(server); + rc = smb3_crypto_shash_allocate(ses->server); if (rc) { cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_setkey(server->secmech.hmacsha256, - server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE); + rc = crypto_shash_setkey(ses->server->secmech.hmacsha256, + ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not set with session key\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash); + rc = crypto_shash_init(&ses->server->secmech.sdeschmacsha256->shash); if (rc) { cifs_dbg(VFS, "%s: Could not init sign hmac\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, + rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, i, 4); if (rc) { cifs_dbg(VFS, "%s: Could not update with n\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, + rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, "SMB2AESCMAC", 12); if (rc) { cifs_dbg(VFS, "%s: Could not update with label\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, + rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, &zero, 1); if (rc) { cifs_dbg(VFS, "%s: Could not update with zero\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, + rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, "SmbSign", 8); if (rc) { cifs_dbg(VFS, "%s: Could not update with context\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, + rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, L, 4); if (rc) { cifs_dbg(VFS, "%s: Could not update with L\n", __func__); goto smb3signkey_ret; } - rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash, + rc = crypto_shash_final(&ses->server->secmech.sdeschmacsha256->shash, hashptr); if (rc) { cifs_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__); goto smb3signkey_ret; } - memcpy(server->smb3signingkey, hashptr, SMB3_SIGNKEY_SIZE); + memcpy(ses->smb3signingkey, hashptr, SMB3_SIGNKEY_SIZE); smb3signkey_ret: - return; + return rc; } int smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) { - int i, rc; + int i; + int rc = 0; unsigned char smb3_signature[SMB2_CMACAES_SIZE]; unsigned char *sigptr = smb3_signature; struct kvec *iov = rqst->rq_iov; int n_vec = rqst->rq_nvec; struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base; + struct cifs_ses *ses; + + ses = smb2_find_smb_ses(smb2_pdu, server); + if (!ses) { + cifs_dbg(VFS, "%s: Could not find session\n", __func__); + return 0; + } memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE); memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE); rc = crypto_shash_setkey(server->secmech.cmacaes, - server->smb3signingkey, SMB2_CMACAES_SIZE); + ses->smb3signingkey, SMB2_CMACAES_SIZE); + if (rc) { cifs_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__); return rc; @@ -389,6 +422,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)rqst->rq_iov[0].iov_base; if ((smb2_pdu->Command == SMB2_NEGOTIATE) || + (smb2_pdu->Command == SMB2_SESSION_SETUP) || (smb2_pdu->Command == SMB2_OPLOCK_BREAK) || (!server->session_estab)) return 0; diff --git a/fs/cifs/winucase.c b/fs/cifs/winucase.c new file mode 100644 index 000000000000..1506d4fddb2c --- /dev/null +++ b/fs/cifs/winucase.c @@ -0,0 +1,663 @@ +/* + * fs/cifs/winucase.c + * + * Copyright (c) Jeffrey Layton <jlayton@redhat.com>, 2013 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * The const tables in this file were converted from the following info + * provided by Microsoft: + * + * 3.1.5.3 Mapping UTF-16 Strings to Upper Case: + * + * http://msdn.microsoft.com/en-us/library/hh877830.aspx + * http://www.microsoft.com/en-us/download/details.aspx?displaylang=en&id=10921 + * + * In particular, the table in "Windows 8 Upper Case Mapping Table.txt" was + * post-processed using the winucase_convert.pl script. + */ + +#include <linux/nls.h> + +wchar_t cifs_toupper(wchar_t in); /* quiet sparse */ + +static const wchar_t t2_00[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, + 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, + 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x0000, + 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x0178, +}; + +static const wchar_t t2_01[256] = { + 0x0000, 0x0100, 0x0000, 0x0102, 0x0000, 0x0104, 0x0000, 0x0106, + 0x0000, 0x0108, 0x0000, 0x010a, 0x0000, 0x010c, 0x0000, 0x010e, + 0x0000, 0x0110, 0x0000, 0x0112, 0x0000, 0x0114, 0x0000, 0x0116, + 0x0000, 0x0118, 0x0000, 0x011a, 0x0000, 0x011c, 0x0000, 0x011e, + 0x0000, 0x0120, 0x0000, 0x0122, 0x0000, 0x0124, 0x0000, 0x0126, + 0x0000, 0x0128, 0x0000, 0x012a, 0x0000, 0x012c, 0x0000, 0x012e, + 0x0000, 0x0000, 0x0000, 0x0132, 0x0000, 0x0134, 0x0000, 0x0136, + 0x0000, 0x0000, 0x0139, 0x0000, 0x013b, 0x0000, 0x013d, 0x0000, + 0x013f, 0x0000, 0x0141, 0x0000, 0x0143, 0x0000, 0x0145, 0x0000, + 0x0147, 0x0000, 0x0000, 0x014a, 0x0000, 0x014c, 0x0000, 0x014e, + 0x0000, 0x0150, 0x0000, 0x0152, 0x0000, 0x0154, 0x0000, 0x0156, + 0x0000, 0x0158, 0x0000, 0x015a, 0x0000, 0x015c, 0x0000, 0x015e, + 0x0000, 0x0160, 0x0000, 0x0162, 0x0000, 0x0164, 0x0000, 0x0166, + 0x0000, 0x0168, 0x0000, 0x016a, 0x0000, 0x016c, 0x0000, 0x016e, + 0x0000, 0x0170, 0x0000, 0x0172, 0x0000, 0x0174, 0x0000, 0x0176, + 0x0000, 0x0000, 0x0179, 0x0000, 0x017b, 0x0000, 0x017d, 0x0000, + 0x0243, 0x0000, 0x0000, 0x0182, 0x0000, 0x0184, 0x0000, 0x0000, + 0x0187, 0x0000, 0x0000, 0x0000, 0x018b, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0191, 0x0000, 0x0000, 0x01f6, 0x0000, 0x0000, + 0x0000, 0x0198, 0x023d, 0x0000, 0x0000, 0x0000, 0x0220, 0x0000, + 0x0000, 0x01a0, 0x0000, 0x01a2, 0x0000, 0x01a4, 0x0000, 0x0000, + 0x01a7, 0x0000, 0x0000, 0x0000, 0x0000, 0x01ac, 0x0000, 0x0000, + 0x01af, 0x0000, 0x0000, 0x0000, 0x01b3, 0x0000, 0x01b5, 0x0000, + 0x0000, 0x01b8, 0x0000, 0x0000, 0x0000, 0x01bc, 0x0000, 0x01f7, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x01c4, 0x0000, + 0x0000, 0x01c7, 0x0000, 0x0000, 0x01ca, 0x0000, 0x01cd, 0x0000, + 0x01cf, 0x0000, 0x01d1, 0x0000, 0x01d3, 0x0000, 0x01d5, 0x0000, + 0x01d7, 0x0000, 0x01d9, 0x0000, 0x01db, 0x018e, 0x0000, 0x01de, + 0x0000, 0x01e0, 0x0000, 0x01e2, 0x0000, 0x01e4, 0x0000, 0x01e6, + 0x0000, 0x01e8, 0x0000, 0x01ea, 0x0000, 0x01ec, 0x0000, 0x01ee, + 0x0000, 0x0000, 0x0000, 0x01f1, 0x0000, 0x01f4, 0x0000, 0x0000, + 0x0000, 0x01f8, 0x0000, 0x01fa, 0x0000, 0x01fc, 0x0000, 0x01fe, +}; + +static const wchar_t t2_02[256] = { + 0x0000, 0x0200, 0x0000, 0x0202, 0x0000, 0x0204, 0x0000, 0x0206, + 0x0000, 0x0208, 0x0000, 0x020a, 0x0000, 0x020c, 0x0000, 0x020e, + 0x0000, 0x0210, 0x0000, 0x0212, 0x0000, 0x0214, 0x0000, 0x0216, + 0x0000, 0x0218, 0x0000, 0x021a, 0x0000, 0x021c, 0x0000, 0x021e, + 0x0000, 0x0000, 0x0000, 0x0222, 0x0000, 0x0224, 0x0000, 0x0226, + 0x0000, 0x0228, 0x0000, 0x022a, 0x0000, 0x022c, 0x0000, 0x022e, + 0x0000, 0x0230, 0x0000, 0x0232, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x023b, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0241, 0x0000, 0x0000, 0x0000, 0x0000, 0x0246, + 0x0000, 0x0248, 0x0000, 0x024a, 0x0000, 0x024c, 0x0000, 0x024e, + 0x2c6f, 0x2c6d, 0x0000, 0x0181, 0x0186, 0x0000, 0x0189, 0x018a, + 0x0000, 0x018f, 0x0000, 0x0190, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0193, 0x0000, 0x0000, 0x0194, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0197, 0x0196, 0x0000, 0x2c62, 0x0000, 0x0000, 0x0000, 0x019c, + 0x0000, 0x2c6e, 0x019d, 0x0000, 0x0000, 0x019f, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2c64, 0x0000, 0x0000, + 0x01a6, 0x0000, 0x0000, 0x01a9, 0x0000, 0x0000, 0x0000, 0x0000, + 0x01ae, 0x0244, 0x01b1, 0x01b2, 0x0245, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x01b7, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_03[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0370, 0x0000, 0x0372, 0x0000, 0x0000, 0x0000, 0x0376, + 0x0000, 0x0000, 0x0000, 0x03fd, 0x03fe, 0x03ff, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0386, 0x0388, 0x0389, 0x038a, + 0x0000, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, + 0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f, + 0x03a0, 0x03a1, 0x0000, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7, + 0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x038c, 0x038e, 0x038f, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x03cf, + 0x0000, 0x03d8, 0x0000, 0x03da, 0x0000, 0x03dc, 0x0000, 0x03de, + 0x0000, 0x03e0, 0x0000, 0x03e2, 0x0000, 0x03e4, 0x0000, 0x03e6, + 0x0000, 0x03e8, 0x0000, 0x03ea, 0x0000, 0x03ec, 0x0000, 0x03ee, + 0x0000, 0x0000, 0x03f9, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x03f7, 0x0000, 0x0000, 0x03fa, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_04[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, + 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f, + 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, + 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f, + 0x0400, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407, + 0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x040d, 0x040e, 0x040f, + 0x0000, 0x0460, 0x0000, 0x0462, 0x0000, 0x0464, 0x0000, 0x0466, + 0x0000, 0x0468, 0x0000, 0x046a, 0x0000, 0x046c, 0x0000, 0x046e, + 0x0000, 0x0470, 0x0000, 0x0472, 0x0000, 0x0474, 0x0000, 0x0476, + 0x0000, 0x0478, 0x0000, 0x047a, 0x0000, 0x047c, 0x0000, 0x047e, + 0x0000, 0x0480, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x048a, 0x0000, 0x048c, 0x0000, 0x048e, + 0x0000, 0x0490, 0x0000, 0x0492, 0x0000, 0x0494, 0x0000, 0x0496, + 0x0000, 0x0498, 0x0000, 0x049a, 0x0000, 0x049c, 0x0000, 0x049e, + 0x0000, 0x04a0, 0x0000, 0x04a2, 0x0000, 0x04a4, 0x0000, 0x04a6, + 0x0000, 0x04a8, 0x0000, 0x04aa, 0x0000, 0x04ac, 0x0000, 0x04ae, + 0x0000, 0x04b0, 0x0000, 0x04b2, 0x0000, 0x04b4, 0x0000, 0x04b6, + 0x0000, 0x04b8, 0x0000, 0x04ba, 0x0000, 0x04bc, 0x0000, 0x04be, + 0x0000, 0x0000, 0x04c1, 0x0000, 0x04c3, 0x0000, 0x04c5, 0x0000, + 0x04c7, 0x0000, 0x04c9, 0x0000, 0x04cb, 0x0000, 0x04cd, 0x04c0, + 0x0000, 0x04d0, 0x0000, 0x04d2, 0x0000, 0x04d4, 0x0000, 0x04d6, + 0x0000, 0x04d8, 0x0000, 0x04da, 0x0000, 0x04dc, 0x0000, 0x04de, + 0x0000, 0x04e0, 0x0000, 0x04e2, 0x0000, 0x04e4, 0x0000, 0x04e6, + 0x0000, 0x04e8, 0x0000, 0x04ea, 0x0000, 0x04ec, 0x0000, 0x04ee, + 0x0000, 0x04f0, 0x0000, 0x04f2, 0x0000, 0x04f4, 0x0000, 0x04f6, + 0x0000, 0x04f8, 0x0000, 0x04fa, 0x0000, 0x04fc, 0x0000, 0x04fe, +}; + +static const wchar_t t2_05[256] = { + 0x0000, 0x0500, 0x0000, 0x0502, 0x0000, 0x0504, 0x0000, 0x0506, + 0x0000, 0x0508, 0x0000, 0x050a, 0x0000, 0x050c, 0x0000, 0x050e, + 0x0000, 0x0510, 0x0000, 0x0512, 0x0000, 0x0514, 0x0000, 0x0516, + 0x0000, 0x0518, 0x0000, 0x051a, 0x0000, 0x051c, 0x0000, 0x051e, + 0x0000, 0x0520, 0x0000, 0x0522, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0531, 0x0532, 0x0533, 0x0534, 0x0535, 0x0536, 0x0537, + 0x0538, 0x0539, 0x053a, 0x053b, 0x053c, 0x053d, 0x053e, 0x053f, + 0x0540, 0x0541, 0x0542, 0x0543, 0x0544, 0x0545, 0x0546, 0x0547, + 0x0548, 0x0549, 0x054a, 0x054b, 0x054c, 0x054d, 0x054e, 0x054f, + 0x0550, 0x0551, 0x0552, 0x0553, 0x0554, 0x0555, 0x0556, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_1d[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0xa77d, 0x0000, 0x0000, 0x0000, 0x2c63, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_1e[256] = { + 0x0000, 0x1e00, 0x0000, 0x1e02, 0x0000, 0x1e04, 0x0000, 0x1e06, + 0x0000, 0x1e08, 0x0000, 0x1e0a, 0x0000, 0x1e0c, 0x0000, 0x1e0e, + 0x0000, 0x1e10, 0x0000, 0x1e12, 0x0000, 0x1e14, 0x0000, 0x1e16, + 0x0000, 0x1e18, 0x0000, 0x1e1a, 0x0000, 0x1e1c, 0x0000, 0x1e1e, + 0x0000, 0x1e20, 0x0000, 0x1e22, 0x0000, 0x1e24, 0x0000, 0x1e26, + 0x0000, 0x1e28, 0x0000, 0x1e2a, 0x0000, 0x1e2c, 0x0000, 0x1e2e, + 0x0000, 0x1e30, 0x0000, 0x1e32, 0x0000, 0x1e34, 0x0000, 0x1e36, + 0x0000, 0x1e38, 0x0000, 0x1e3a, 0x0000, 0x1e3c, 0x0000, 0x1e3e, + 0x0000, 0x1e40, 0x0000, 0x1e42, 0x0000, 0x1e44, 0x0000, 0x1e46, + 0x0000, 0x1e48, 0x0000, 0x1e4a, 0x0000, 0x1e4c, 0x0000, 0x1e4e, + 0x0000, 0x1e50, 0x0000, 0x1e52, 0x0000, 0x1e54, 0x0000, 0x1e56, + 0x0000, 0x1e58, 0x0000, 0x1e5a, 0x0000, 0x1e5c, 0x0000, 0x1e5e, + 0x0000, 0x1e60, 0x0000, 0x1e62, 0x0000, 0x1e64, 0x0000, 0x1e66, + 0x0000, 0x1e68, 0x0000, 0x1e6a, 0x0000, 0x1e6c, 0x0000, 0x1e6e, + 0x0000, 0x1e70, 0x0000, 0x1e72, 0x0000, 0x1e74, 0x0000, 0x1e76, + 0x0000, 0x1e78, 0x0000, 0x1e7a, 0x0000, 0x1e7c, 0x0000, 0x1e7e, + 0x0000, 0x1e80, 0x0000, 0x1e82, 0x0000, 0x1e84, 0x0000, 0x1e86, + 0x0000, 0x1e88, 0x0000, 0x1e8a, 0x0000, 0x1e8c, 0x0000, 0x1e8e, + 0x0000, 0x1e90, 0x0000, 0x1e92, 0x0000, 0x1e94, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x1ea0, 0x0000, 0x1ea2, 0x0000, 0x1ea4, 0x0000, 0x1ea6, + 0x0000, 0x1ea8, 0x0000, 0x1eaa, 0x0000, 0x1eac, 0x0000, 0x1eae, + 0x0000, 0x1eb0, 0x0000, 0x1eb2, 0x0000, 0x1eb4, 0x0000, 0x1eb6, + 0x0000, 0x1eb8, 0x0000, 0x1eba, 0x0000, 0x1ebc, 0x0000, 0x1ebe, + 0x0000, 0x1ec0, 0x0000, 0x1ec2, 0x0000, 0x1ec4, 0x0000, 0x1ec6, + 0x0000, 0x1ec8, 0x0000, 0x1eca, 0x0000, 0x1ecc, 0x0000, 0x1ece, + 0x0000, 0x1ed0, 0x0000, 0x1ed2, 0x0000, 0x1ed4, 0x0000, 0x1ed6, + 0x0000, 0x1ed8, 0x0000, 0x1eda, 0x0000, 0x1edc, 0x0000, 0x1ede, + 0x0000, 0x1ee0, 0x0000, 0x1ee2, 0x0000, 0x1ee4, 0x0000, 0x1ee6, + 0x0000, 0x1ee8, 0x0000, 0x1eea, 0x0000, 0x1eec, 0x0000, 0x1eee, + 0x0000, 0x1ef0, 0x0000, 0x1ef2, 0x0000, 0x1ef4, 0x0000, 0x1ef6, + 0x0000, 0x1ef8, 0x0000, 0x1efa, 0x0000, 0x1efc, 0x0000, 0x1efe, +}; + +static const wchar_t t2_1f[256] = { + 0x1f08, 0x1f09, 0x1f0a, 0x1f0b, 0x1f0c, 0x1f0d, 0x1f0e, 0x1f0f, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1f18, 0x1f19, 0x1f1a, 0x1f1b, 0x1f1c, 0x1f1d, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1f28, 0x1f29, 0x1f2a, 0x1f2b, 0x1f2c, 0x1f2d, 0x1f2e, 0x1f2f, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1f38, 0x1f39, 0x1f3a, 0x1f3b, 0x1f3c, 0x1f3d, 0x1f3e, 0x1f3f, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1f48, 0x1f49, 0x1f4a, 0x1f4b, 0x1f4c, 0x1f4d, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x1f59, 0x0000, 0x1f5b, 0x0000, 0x1f5d, 0x0000, 0x1f5f, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1f68, 0x1f69, 0x1f6a, 0x1f6b, 0x1f6c, 0x1f6d, 0x1f6e, 0x1f6f, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1fba, 0x1fbb, 0x1fc8, 0x1fc9, 0x1fca, 0x1fcb, 0x1fda, 0x1fdb, + 0x1ff8, 0x1ff9, 0x1fea, 0x1feb, 0x1ffa, 0x1ffb, 0x0000, 0x0000, + 0x1f88, 0x1f89, 0x1f8a, 0x1f8b, 0x1f8c, 0x1f8d, 0x1f8e, 0x1f8f, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1f98, 0x1f99, 0x1f9a, 0x1f9b, 0x1f9c, 0x1f9d, 0x1f9e, 0x1f9f, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1fa8, 0x1fa9, 0x1faa, 0x1fab, 0x1fac, 0x1fad, 0x1fae, 0x1faf, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1fb8, 0x1fb9, 0x0000, 0x1fbc, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x1fcc, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1fd8, 0x1fd9, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x1fe8, 0x1fe9, 0x0000, 0x0000, 0x0000, 0x1fec, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x1ffc, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_21[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2132, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x2165, 0x2166, 0x2167, + 0x2168, 0x2169, 0x216a, 0x216b, 0x216c, 0x216d, 0x216e, 0x216f, + 0x0000, 0x0000, 0x0000, 0x0000, 0x2183, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_24[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x24b6, 0x24b7, 0x24b8, 0x24b9, 0x24ba, 0x24bb, 0x24bc, 0x24bd, + 0x24be, 0x24bf, 0x24c0, 0x24c1, 0x24c2, 0x24c3, 0x24c4, 0x24c5, + 0x24c6, 0x24c7, 0x24c8, 0x24c9, 0x24ca, 0x24cb, 0x24cc, 0x24cd, + 0x24ce, 0x24cf, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_2c[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x2c00, 0x2c01, 0x2c02, 0x2c03, 0x2c04, 0x2c05, 0x2c06, 0x2c07, + 0x2c08, 0x2c09, 0x2c0a, 0x2c0b, 0x2c0c, 0x2c0d, 0x2c0e, 0x2c0f, + 0x2c10, 0x2c11, 0x2c12, 0x2c13, 0x2c14, 0x2c15, 0x2c16, 0x2c17, + 0x2c18, 0x2c19, 0x2c1a, 0x2c1b, 0x2c1c, 0x2c1d, 0x2c1e, 0x2c1f, + 0x2c20, 0x2c21, 0x2c22, 0x2c23, 0x2c24, 0x2c25, 0x2c26, 0x2c27, + 0x2c28, 0x2c29, 0x2c2a, 0x2c2b, 0x2c2c, 0x2c2d, 0x2c2e, 0x0000, + 0x0000, 0x2c60, 0x0000, 0x0000, 0x0000, 0x023a, 0x023e, 0x0000, + 0x2c67, 0x0000, 0x2c69, 0x0000, 0x2c6b, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x2c72, 0x0000, 0x0000, 0x2c75, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x2c80, 0x0000, 0x2c82, 0x0000, 0x2c84, 0x0000, 0x2c86, + 0x0000, 0x2c88, 0x0000, 0x2c8a, 0x0000, 0x2c8c, 0x0000, 0x2c8e, + 0x0000, 0x2c90, 0x0000, 0x2c92, 0x0000, 0x2c94, 0x0000, 0x2c96, + 0x0000, 0x2c98, 0x0000, 0x2c9a, 0x0000, 0x2c9c, 0x0000, 0x2c9e, + 0x0000, 0x2ca0, 0x0000, 0x2ca2, 0x0000, 0x2ca4, 0x0000, 0x2ca6, + 0x0000, 0x2ca8, 0x0000, 0x2caa, 0x0000, 0x2cac, 0x0000, 0x2cae, + 0x0000, 0x2cb0, 0x0000, 0x2cb2, 0x0000, 0x2cb4, 0x0000, 0x2cb6, + 0x0000, 0x2cb8, 0x0000, 0x2cba, 0x0000, 0x2cbc, 0x0000, 0x2cbe, + 0x0000, 0x2cc0, 0x0000, 0x2cc2, 0x0000, 0x2cc4, 0x0000, 0x2cc6, + 0x0000, 0x2cc8, 0x0000, 0x2cca, 0x0000, 0x2ccc, 0x0000, 0x2cce, + 0x0000, 0x2cd0, 0x0000, 0x2cd2, 0x0000, 0x2cd4, 0x0000, 0x2cd6, + 0x0000, 0x2cd8, 0x0000, 0x2cda, 0x0000, 0x2cdc, 0x0000, 0x2cde, + 0x0000, 0x2ce0, 0x0000, 0x2ce2, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_2d[256] = { + 0x10a0, 0x10a1, 0x10a2, 0x10a3, 0x10a4, 0x10a5, 0x10a6, 0x10a7, + 0x10a8, 0x10a9, 0x10aa, 0x10ab, 0x10ac, 0x10ad, 0x10ae, 0x10af, + 0x10b0, 0x10b1, 0x10b2, 0x10b3, 0x10b4, 0x10b5, 0x10b6, 0x10b7, + 0x10b8, 0x10b9, 0x10ba, 0x10bb, 0x10bc, 0x10bd, 0x10be, 0x10bf, + 0x10c0, 0x10c1, 0x10c2, 0x10c3, 0x10c4, 0x10c5, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_a6[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0xa640, 0x0000, 0xa642, 0x0000, 0xa644, 0x0000, 0xa646, + 0x0000, 0xa648, 0x0000, 0xa64a, 0x0000, 0xa64c, 0x0000, 0xa64e, + 0x0000, 0xa650, 0x0000, 0xa652, 0x0000, 0xa654, 0x0000, 0xa656, + 0x0000, 0xa658, 0x0000, 0xa65a, 0x0000, 0xa65c, 0x0000, 0xa65e, + 0x0000, 0x0000, 0x0000, 0xa662, 0x0000, 0xa664, 0x0000, 0xa666, + 0x0000, 0xa668, 0x0000, 0xa66a, 0x0000, 0xa66c, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0xa680, 0x0000, 0xa682, 0x0000, 0xa684, 0x0000, 0xa686, + 0x0000, 0xa688, 0x0000, 0xa68a, 0x0000, 0xa68c, 0x0000, 0xa68e, + 0x0000, 0xa690, 0x0000, 0xa692, 0x0000, 0xa694, 0x0000, 0xa696, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_a7[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0xa722, 0x0000, 0xa724, 0x0000, 0xa726, + 0x0000, 0xa728, 0x0000, 0xa72a, 0x0000, 0xa72c, 0x0000, 0xa72e, + 0x0000, 0x0000, 0x0000, 0xa732, 0x0000, 0xa734, 0x0000, 0xa736, + 0x0000, 0xa738, 0x0000, 0xa73a, 0x0000, 0xa73c, 0x0000, 0xa73e, + 0x0000, 0xa740, 0x0000, 0xa742, 0x0000, 0xa744, 0x0000, 0xa746, + 0x0000, 0xa748, 0x0000, 0xa74a, 0x0000, 0xa74c, 0x0000, 0xa74e, + 0x0000, 0xa750, 0x0000, 0xa752, 0x0000, 0xa754, 0x0000, 0xa756, + 0x0000, 0xa758, 0x0000, 0xa75a, 0x0000, 0xa75c, 0x0000, 0xa75e, + 0x0000, 0xa760, 0x0000, 0xa762, 0x0000, 0xa764, 0x0000, 0xa766, + 0x0000, 0xa768, 0x0000, 0xa76a, 0x0000, 0xa76c, 0x0000, 0xa76e, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0xa779, 0x0000, 0xa77b, 0x0000, 0x0000, 0xa77e, + 0x0000, 0xa780, 0x0000, 0xa782, 0x0000, 0xa784, 0x0000, 0xa786, + 0x0000, 0x0000, 0x0000, 0x0000, 0xa78b, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t t2_ff[256] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0xff21, 0xff22, 0xff23, 0xff24, 0xff25, 0xff26, 0xff27, + 0xff28, 0xff29, 0xff2a, 0xff2b, 0xff2c, 0xff2d, 0xff2e, 0xff2f, + 0xff30, 0xff31, 0xff32, 0xff33, 0xff34, 0xff35, 0xff36, 0xff37, + 0xff38, 0xff39, 0xff3a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, +}; + +static const wchar_t *const toplevel[256] = { + t2_00, t2_01, t2_02, t2_03, t2_04, t2_05, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, t2_1d, t2_1e, t2_1f, + NULL, t2_21, NULL, NULL, t2_24, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, t2_2c, t2_2d, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, t2_a6, t2_a7, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, t2_ff, +}; + +/** + * cifs_toupper - convert a wchar_t from lower to uppercase + * @in: character to convert from lower to uppercase + * + * This function consults the static tables above to convert a wchar_t from + * lower to uppercase. In the event that there is no mapping, the original + * "in" character is returned. + */ +wchar_t +cifs_toupper(wchar_t in) +{ + unsigned char idx; + const wchar_t *tbl; + wchar_t out; + + /* grab upper byte */ + idx = (in & 0xff00) >> 8; + + /* find pointer to 2nd layer table */ + tbl = toplevel[idx]; + if (!tbl) + return in; + + /* grab lower byte */ + idx = in & 0xff; + + /* look up character in table */ + out = tbl[idx]; + if (out) + return out; + + return in; +} diff --git a/fs/dcache.c b/fs/dcache.c index 4d9df3c940e6..41000305d716 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -37,6 +37,7 @@ #include <linux/rculist_bl.h> #include <linux/prefetch.h> #include <linux/ratelimit.h> +#include <linux/list_lru.h> #include "internal.h" #include "mount.h" @@ -48,7 +49,7 @@ * - the dcache hash table * s_anon bl list spinlock protects: * - the s_anon list (see __d_drop) - * dcache_lru_lock protects: + * dentry->d_sb->s_dentry_lru_lock protects: * - the dcache lru lists and counters * d_lock protects: * - d_flags @@ -63,7 +64,7 @@ * Ordering: * dentry->d_inode->i_lock * dentry->d_lock - * dcache_lru_lock + * dentry->d_sb->s_dentry_lru_lock * dcache_hash_bucket lock * s_anon lock * @@ -81,7 +82,6 @@ int sysctl_vfs_cache_pressure __read_mostly = 100; EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock); __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); EXPORT_SYMBOL(rename_lock); @@ -90,8 +90,8 @@ static struct kmem_cache *dentry_cache __read_mostly; /** * read_seqbegin_or_lock - begin a sequence number check or locking block - * lock: sequence lock - * seq : sequence number to be checked + * @lock: sequence lock + * @seq : sequence number to be checked * * First try it once optimistically without taking the lock. If that fails, * take the lock. The sequence number is also used as a marker for deciding @@ -103,7 +103,7 @@ static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq) if (!(*seq & 1)) /* Even */ *seq = read_seqbegin(lock); else /* Odd */ - write_seqlock(lock); + read_seqlock_excl(lock); } static inline int need_seqretry(seqlock_t *lock, int seq) @@ -114,7 +114,7 @@ static inline int need_seqretry(seqlock_t *lock, int seq) static inline void done_seqretry(seqlock_t *lock, int seq) { if (seq & 1) - write_sequnlock(lock); + read_sequnlock_excl(lock); } /* @@ -146,23 +146,47 @@ struct dentry_stat_t dentry_stat = { .age_limit = 45, }; -static DEFINE_PER_CPU(unsigned int, nr_dentry); +static DEFINE_PER_CPU(long, nr_dentry); +static DEFINE_PER_CPU(long, nr_dentry_unused); #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) -static int get_nr_dentry(void) + +/* + * Here we resort to our own counters instead of using generic per-cpu counters + * for consistency with what the vfs inode code does. We are expected to harvest + * better code and performance by having our own specialized counters. + * + * Please note that the loop is done over all possible CPUs, not over all online + * CPUs. The reason for this is that we don't want to play games with CPUs going + * on and off. If one of them goes off, we will just keep their counters. + * + * glommer: See cffbc8a for details, and if you ever intend to change this, + * please update all vfs counters to match. + */ +static long get_nr_dentry(void) { int i; - int sum = 0; + long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry, i); return sum < 0 ? 0 : sum; } +static long get_nr_dentry_unused(void) +{ + int i; + long sum = 0; + for_each_possible_cpu(i) + sum += per_cpu(nr_dentry_unused, i); + return sum < 0 ? 0 : sum; +} + int proc_nr_dentry(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); - return proc_dointvec(table, write, buffer, lenp, ppos); + dentry_stat.nr_unused = get_nr_dentry_unused(); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #endif @@ -333,52 +357,96 @@ static void dentry_unlink_inode(struct dentry * dentry) } /* - * dentry_lru_(add|del|prune|move_tail) must be called with d_lock held. + * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry + * is in use - which includes both the "real" per-superblock + * LRU list _and_ the DCACHE_SHRINK_LIST use. + * + * The DCACHE_SHRINK_LIST bit is set whenever the dentry is + * on the shrink list (ie not on the superblock LRU list). + * + * The per-cpu "nr_dentry_unused" counters are updated with + * the DCACHE_LRU_LIST bit. + * + * These helper functions make sure we always follow the + * rules. d_lock must be held by the caller. */ -static void dentry_lru_add(struct dentry *dentry) +#define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x)) +static void d_lru_add(struct dentry *dentry) { - if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) { - spin_lock(&dcache_lru_lock); - dentry->d_flags |= DCACHE_LRU_LIST; - list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru); - dentry->d_sb->s_nr_dentry_unused++; - dentry_stat.nr_unused++; - spin_unlock(&dcache_lru_lock); - } + D_FLAG_VERIFY(dentry, 0); + dentry->d_flags |= DCACHE_LRU_LIST; + this_cpu_inc(nr_dentry_unused); + WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } -static void __dentry_lru_del(struct dentry *dentry) +static void d_lru_del(struct dentry *dentry) { + D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); + dentry->d_flags &= ~DCACHE_LRU_LIST; + this_cpu_dec(nr_dentry_unused); + WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); +} + +static void d_shrink_del(struct dentry *dentry) +{ + D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); list_del_init(&dentry->d_lru); dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); - dentry->d_sb->s_nr_dentry_unused--; - dentry_stat.nr_unused--; + this_cpu_dec(nr_dentry_unused); +} + +static void d_shrink_add(struct dentry *dentry, struct list_head *list) +{ + D_FLAG_VERIFY(dentry, 0); + list_add(&dentry->d_lru, list); + dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST; + this_cpu_inc(nr_dentry_unused); } /* - * Remove a dentry with references from the LRU. + * These can only be called under the global LRU lock, ie during the + * callback for freeing the LRU list. "isolate" removes it from the + * LRU lists entirely, while shrink_move moves it to the indicated + * private list. */ -static void dentry_lru_del(struct dentry *dentry) +static void d_lru_isolate(struct dentry *dentry) { - if (!list_empty(&dentry->d_lru)) { - spin_lock(&dcache_lru_lock); - __dentry_lru_del(dentry); - spin_unlock(&dcache_lru_lock); - } + D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); + dentry->d_flags &= ~DCACHE_LRU_LIST; + this_cpu_dec(nr_dentry_unused); + list_del_init(&dentry->d_lru); } -static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list) +static void d_lru_shrink_move(struct dentry *dentry, struct list_head *list) { - spin_lock(&dcache_lru_lock); - if (list_empty(&dentry->d_lru)) { - dentry->d_flags |= DCACHE_LRU_LIST; - list_add_tail(&dentry->d_lru, list); - dentry->d_sb->s_nr_dentry_unused++; - dentry_stat.nr_unused++; - } else { - list_move_tail(&dentry->d_lru, list); + D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); + dentry->d_flags |= DCACHE_SHRINK_LIST; + list_move_tail(&dentry->d_lru, list); +} + +/* + * dentry_lru_(add|del)_list) must be called with d_lock held. + */ +static void dentry_lru_add(struct dentry *dentry) +{ + if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) + d_lru_add(dentry); +} + +/* + * Remove a dentry with references from the LRU. + * + * If we are on the shrink list, then we can get to try_prune_one_dentry() and + * lose our last reference through the parent walk. In this case, we need to + * remove ourselves from the shrink list, not the LRU. + */ +static void dentry_lru_del(struct dentry *dentry) +{ + if (dentry->d_flags & DCACHE_LRU_LIST) { + if (dentry->d_flags & DCACHE_SHRINK_LIST) + return d_shrink_del(dentry); + d_lru_del(dentry); } - spin_unlock(&dcache_lru_lock); } /** @@ -474,7 +542,8 @@ EXPORT_SYMBOL(d_drop); * If ref is non-zero, then decrement the refcount too. * Returns dentry requiring refcount drop, or NULL if we're done. */ -static inline struct dentry *dentry_kill(struct dentry *dentry) +static inline struct dentry * +dentry_kill(struct dentry *dentry, int unlock_on_failure) __releases(dentry->d_lock) { struct inode *inode; @@ -483,8 +552,10 @@ static inline struct dentry *dentry_kill(struct dentry *dentry) inode = dentry->d_inode; if (inode && !spin_trylock(&inode->i_lock)) { relock: - spin_unlock(&dentry->d_lock); - cpu_relax(); + if (unlock_on_failure) { + spin_unlock(&dentry->d_lock); + cpu_relax(); + } return dentry; /* try again with same dentry */ } if (IS_ROOT(dentry)) @@ -567,7 +638,7 @@ repeat: return; kill_it: - dentry = dentry_kill(dentry); + dentry = dentry_kill(dentry, 1); if (dentry) goto repeat; } @@ -787,12 +858,12 @@ EXPORT_SYMBOL(d_prune_aliases); * * This may fail if locks cannot be acquired no problem, just try again. */ -static void try_prune_one_dentry(struct dentry *dentry) +static struct dentry * try_prune_one_dentry(struct dentry *dentry) __releases(dentry->d_lock) { struct dentry *parent; - parent = dentry_kill(dentry); + parent = dentry_kill(dentry, 0); /* * If dentry_kill returns NULL, we have nothing more to do. * if it returns the same dentry, trylocks failed. In either @@ -804,17 +875,18 @@ static void try_prune_one_dentry(struct dentry *dentry) * fragmentation. */ if (!parent) - return; + return NULL; if (parent == dentry) - return; + return dentry; /* Prune ancestors. */ dentry = parent; while (dentry) { if (lockref_put_or_lock(&dentry->d_lockref)) - return; - dentry = dentry_kill(dentry); + return NULL; + dentry = dentry_kill(dentry, 1); } + return NULL; } static void shrink_dentry_list(struct list_head *list) @@ -826,6 +898,12 @@ static void shrink_dentry_list(struct list_head *list) dentry = list_entry_rcu(list->prev, struct dentry, d_lru); if (&dentry->d_lru == list) break; /* empty */ + + /* + * Get the dentry lock, and re-verify that the dentry is + * this on the shrinking list. If it is, we know that + * DCACHE_SHRINK_LIST and DCACHE_LRU_LIST are set. + */ spin_lock(&dentry->d_lock); if (dentry != list_entry(list->prev, struct dentry, d_lru)) { spin_unlock(&dentry->d_lock); @@ -833,76 +911,146 @@ static void shrink_dentry_list(struct list_head *list) } /* + * The dispose list is isolated and dentries are not accounted + * to the LRU here, so we can simply remove it from the list + * here regardless of whether it is referenced or not. + */ + d_shrink_del(dentry); + + /* * We found an inuse dentry which was not removed from - * the LRU because of laziness during lookup. Do not free - * it - just keep it off the LRU list. + * the LRU because of laziness during lookup. Do not free it. */ if (dentry->d_lockref.count) { - dentry_lru_del(dentry); spin_unlock(&dentry->d_lock); continue; } - rcu_read_unlock(); - try_prune_one_dentry(dentry); + /* + * If 'try_to_prune()' returns a dentry, it will + * be the same one we passed in, and d_lock will + * have been held the whole time, so it will not + * have been added to any other lists. We failed + * to get the inode lock. + * + * We just add it back to the shrink list. + */ + dentry = try_prune_one_dentry(dentry); rcu_read_lock(); + if (dentry) { + d_shrink_add(dentry, list); + spin_unlock(&dentry->d_lock); + } } rcu_read_unlock(); } +static enum lru_status +dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) +{ + struct list_head *freeable = arg; + struct dentry *dentry = container_of(item, struct dentry, d_lru); + + + /* + * we are inverting the lru lock/dentry->d_lock here, + * so use a trylock. If we fail to get the lock, just skip + * it + */ + if (!spin_trylock(&dentry->d_lock)) + return LRU_SKIP; + + /* + * Referenced dentries are still in use. If they have active + * counts, just remove them from the LRU. Otherwise give them + * another pass through the LRU. + */ + if (dentry->d_lockref.count) { + d_lru_isolate(dentry); + spin_unlock(&dentry->d_lock); + return LRU_REMOVED; + } + + if (dentry->d_flags & DCACHE_REFERENCED) { + dentry->d_flags &= ~DCACHE_REFERENCED; + spin_unlock(&dentry->d_lock); + + /* + * The list move itself will be made by the common LRU code. At + * this point, we've dropped the dentry->d_lock but keep the + * lru lock. This is safe to do, since every list movement is + * protected by the lru lock even if both locks are held. + * + * This is guaranteed by the fact that all LRU management + * functions are intermediated by the LRU API calls like + * list_lru_add and list_lru_del. List movement in this file + * only ever occur through this functions or through callbacks + * like this one, that are called from the LRU API. + * + * The only exceptions to this are functions like + * shrink_dentry_list, and code that first checks for the + * DCACHE_SHRINK_LIST flag. Those are guaranteed to be + * operating only with stack provided lists after they are + * properly isolated from the main list. It is thus, always a + * local access. + */ + return LRU_ROTATE; + } + + d_lru_shrink_move(dentry, freeable); + spin_unlock(&dentry->d_lock); + + return LRU_REMOVED; +} + /** * prune_dcache_sb - shrink the dcache * @sb: superblock - * @count: number of entries to try to free + * @nr_to_scan : number of entries to try to free + * @nid: which node to scan for freeable entities * - * Attempt to shrink the superblock dcache LRU by @count entries. This is + * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is * done when we need more memory an called from the superblock shrinker * function. * * This function may fail to free any resources if all the dentries are in * use. */ -void prune_dcache_sb(struct super_block *sb, int count) +long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, + int nid) { - struct dentry *dentry; - LIST_HEAD(referenced); - LIST_HEAD(tmp); + LIST_HEAD(dispose); + long freed; -relock: - spin_lock(&dcache_lru_lock); - while (!list_empty(&sb->s_dentry_lru)) { - dentry = list_entry(sb->s_dentry_lru.prev, - struct dentry, d_lru); - BUG_ON(dentry->d_sb != sb); - - if (!spin_trylock(&dentry->d_lock)) { - spin_unlock(&dcache_lru_lock); - cpu_relax(); - goto relock; - } + freed = list_lru_walk_node(&sb->s_dentry_lru, nid, dentry_lru_isolate, + &dispose, &nr_to_scan); + shrink_dentry_list(&dispose); + return freed; +} - if (dentry->d_flags & DCACHE_REFERENCED) { - dentry->d_flags &= ~DCACHE_REFERENCED; - list_move(&dentry->d_lru, &referenced); - spin_unlock(&dentry->d_lock); - } else { - list_move_tail(&dentry->d_lru, &tmp); - dentry->d_flags |= DCACHE_SHRINK_LIST; - spin_unlock(&dentry->d_lock); - if (!--count) - break; - } - cond_resched_lock(&dcache_lru_lock); - } - if (!list_empty(&referenced)) - list_splice(&referenced, &sb->s_dentry_lru); - spin_unlock(&dcache_lru_lock); +static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, + spinlock_t *lru_lock, void *arg) +{ + struct list_head *freeable = arg; + struct dentry *dentry = container_of(item, struct dentry, d_lru); + + /* + * we are inverting the lru lock/dentry->d_lock here, + * so use a trylock. If we fail to get the lock, just skip + * it + */ + if (!spin_trylock(&dentry->d_lock)) + return LRU_SKIP; + + d_lru_shrink_move(dentry, freeable); + spin_unlock(&dentry->d_lock); - shrink_dentry_list(&tmp); + return LRU_REMOVED; } + /** * shrink_dcache_sb - shrink dcache for a superblock * @sb: superblock @@ -912,16 +1060,17 @@ relock: */ void shrink_dcache_sb(struct super_block *sb) { - LIST_HEAD(tmp); + long freed; - spin_lock(&dcache_lru_lock); - while (!list_empty(&sb->s_dentry_lru)) { - list_splice_init(&sb->s_dentry_lru, &tmp); - spin_unlock(&dcache_lru_lock); - shrink_dentry_list(&tmp); - spin_lock(&dcache_lru_lock); - } - spin_unlock(&dcache_lru_lock); + do { + LIST_HEAD(dispose); + + freed = list_lru_walk(&sb->s_dentry_lru, + dentry_lru_isolate_shrink, &dispose, UINT_MAX); + + this_cpu_sub(nr_dentry_unused, freed); + shrink_dentry_list(&dispose); + } while (freed > 0); } EXPORT_SYMBOL(shrink_dcache_sb); @@ -1283,8 +1432,13 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) if (dentry->d_lockref.count) { dentry_lru_del(dentry); } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { - dentry_lru_move_list(dentry, &data->dispose); - dentry->d_flags |= DCACHE_SHRINK_LIST; + /* + * We can't use d_lru_shrink_move() because we + * need to get the global LRU lock and do the + * LRU accounting. + */ + d_lru_del(dentry); + d_shrink_add(dentry, &data->dispose); data->found++; ret = D_WALK_NORETRY; } @@ -2673,9 +2827,9 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen) /** * prepend_name - prepend a pathname in front of current buffer pointer - * buffer: buffer pointer - * buflen: allocated length of the buffer - * name: name string and length qstr structure + * @buffer: buffer pointer + * @buflen: allocated length of the buffer + * @name: name string and length qstr structure * * With RCU path tracing, it may race with d_move(). Use ACCESS_ONCE() to * make sure that either the old or the new name pointer and length are @@ -2713,14 +2867,15 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name) * @buffer: pointer to the end of the buffer * @buflen: pointer to buffer length * - * The function tries to write out the pathname without taking any lock other - * than the RCU read lock to make sure that dentries won't go away. It only - * checks the sequence number of the global rename_lock as any change in the - * dentry's d_seq will be preceded by changes in the rename_lock sequence - * number. If the sequence number had been change, it will restart the whole - * pathname back-tracing sequence again. It performs a total of 3 trials of - * lockless back-tracing sequences before falling back to take the - * rename_lock. + * The function will first try to write out the pathname without taking any + * lock other than the RCU read lock to make sure that dentries won't go away. + * It only checks the sequence number of the global rename_lock as any change + * in the dentry's d_seq will be preceded by changes in the rename_lock + * sequence number. If the sequence number had been changed, it will restart + * the whole pathname back-tracing sequence again by taking the rename_lock. + * In this case, there is no need to take the RCU read lock as the recursive + * parent pointer references will keep the dentry chain alive as long as no + * rename operation is performed. */ static int prepend_path(const struct path *path, const struct path *root, @@ -2868,6 +3023,16 @@ static int prepend_unreachable(char **buffer, int *buflen) return prepend(buffer, buflen, "(unreachable)", 13); } +static void get_fs_root_rcu(struct fs_struct *fs, struct path *root) +{ + unsigned seq; + + do { + seq = read_seqcount_begin(&fs->seq); + *root = fs->root; + } while (read_seqcount_retry(&fs->seq, seq)); +} + /** * d_path - return the path of a dentry * @path: path to report @@ -2900,13 +3065,15 @@ char *d_path(const struct path *path, char *buf, int buflen) if (path->dentry->d_op && path->dentry->d_op->d_dname) return path->dentry->d_op->d_dname(path->dentry, buf, buflen); - get_fs_root(current->fs, &root); + rcu_read_lock(); + get_fs_root_rcu(current->fs, &root); br_read_lock(&vfsmount_lock); error = path_with_deleted(path, &root, &res, &buflen); br_read_unlock(&vfsmount_lock); + rcu_read_unlock(); + if (error < 0) res = ERR_PTR(error); - path_put(&root); return res; } EXPORT_SYMBOL(d_path); @@ -3014,6 +3181,18 @@ Elong: return ERR_PTR(-ENAMETOOLONG); } +static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root, + struct path *pwd) +{ + unsigned seq; + + do { + seq = read_seqcount_begin(&fs->seq); + *root = fs->root; + *pwd = fs->pwd; + } while (read_seqcount_retry(&fs->seq, seq)); +} + /* * NOTE! The user-level library version returns a * character pointer. The kernel system call just @@ -3036,23 +3215,25 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) { int error; struct path pwd, root; - char *page = (char *) __get_free_page(GFP_USER); + char *page = __getname(); if (!page) return -ENOMEM; - get_fs_root_and_pwd(current->fs, &root, &pwd); + rcu_read_lock(); + get_fs_root_and_pwd_rcu(current->fs, &root, &pwd); error = -ENOENT; br_read_lock(&vfsmount_lock); if (!d_unlinked(pwd.dentry)) { unsigned long len; - char *cwd = page + PAGE_SIZE; - int buflen = PAGE_SIZE; + char *cwd = page + PATH_MAX; + int buflen = PATH_MAX; prepend(&cwd, &buflen, "\0", 1); error = prepend_path(&pwd, &root, &cwd, &buflen); br_read_unlock(&vfsmount_lock); + rcu_read_unlock(); if (error < 0) goto out; @@ -3065,7 +3246,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) } error = -ERANGE; - len = PAGE_SIZE + page - cwd; + len = PATH_MAX + page - cwd; if (len <= size) { error = len; if (copy_to_user(buf, cwd, len)) @@ -3073,12 +3254,11 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) } } else { br_read_unlock(&vfsmount_lock); + rcu_read_unlock(); } out: - path_put(&pwd); - path_put(&root); - free_page((unsigned long) page); + __putname(page); return error; } diff --git a/fs/drop_caches.c b/fs/drop_caches.c index c00e055b6282..9fd702f5bfb2 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -44,6 +44,7 @@ static void drop_slab(void) .gfp_mask = GFP_KERNEL, }; + nodes_setall(shrink.nodes_to_scan); do { nr_objects = shrink_slab(&shrink, 1000, 1000); } while (nr_objects > 10); diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 2ec8eb1ab269..a52a5d23c30b 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -861,7 +861,7 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc) static void _write_failed(struct inode *inode, loff_t to) { if (to > inode->i_size) - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); } int exofs_write_begin(struct file *file, struct address_space *mapping, diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 0a87bb10998d..c260de6d7b6d 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -58,7 +58,7 @@ static void ext2_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) { - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); ext2_truncate_blocks(inode, inode->i_size); } } diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 2d1bdbe78c04..3981ff783950 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -931,13 +931,15 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, struct ext4_inode_info *ei; struct list_head *cur, *tmp; LIST_HEAD(skipped); - int ret, nr_shrunk = 0; + int nr_shrunk = 0; int retried = 0, skip_precached = 1, nr_skipped = 0; spin_lock(&sbi->s_es_lru_lock); retry: list_for_each_safe(cur, tmp, &sbi->s_es_lru) { + int shrunk; + /* * If we have already reclaimed all extents from extent * status tree, just stop the loop immediately. @@ -964,13 +966,13 @@ retry: continue; write_lock(&ei->i_es_lock); - ret = __es_try_to_reclaim_extents(ei, nr_to_scan); + shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); if (ei->i_es_lru_nr == 0) list_del_init(&ei->i_es_lru); write_unlock(&ei->i_es_lock); - nr_shrunk += ret; - nr_to_scan -= ret; + nr_shrunk += shrunk; + nr_to_scan -= shrunk; if (nr_to_scan == 0) break; } @@ -1007,7 +1009,20 @@ retry: return nr_shrunk; } -static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) +static unsigned long ext4_es_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + unsigned long nr; + struct ext4_sb_info *sbi; + + sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); + nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); + trace_ext4_es_shrink_enter(sbi->s_sb, sc->nr_to_scan, nr); + return nr; +} + +static unsigned long ext4_es_scan(struct shrinker *shrink, + struct shrink_control *sc) { struct ext4_sb_info *sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); @@ -1022,9 +1037,8 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL); - ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret); - return ret; + return nr_shrunk; } void ext4_es_register_shrinker(struct ext4_sb_info *sbi) @@ -1032,7 +1046,8 @@ void ext4_es_register_shrinker(struct ext4_sb_info *sbi) INIT_LIST_HEAD(&sbi->s_es_lru); spin_lock_init(&sbi->s_es_lru_lock); sbi->s_es_last_sorted = 0; - sbi->s_es_shrinker.shrink = ext4_es_shrink; + sbi->s_es_shrinker.scan_objects = ext4_es_scan; + sbi->s_es_shrinker.count_objects = ext4_es_count; sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; register_shrinker(&sbi->s_es_shrinker); } @@ -1076,7 +1091,7 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, struct ext4_es_tree *tree = &ei->i_es_tree; struct rb_node *node; struct extent_status *es; - int nr_shrunk = 0; + unsigned long nr_shrunk = 0; static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c79fd7dabe79..0d424d7ac02b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4587,7 +4587,6 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { handle_t *handle; - loff_t oldsize = inode->i_size; if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -4650,7 +4649,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) * Truncate pagecache after we've waited for commit * in data=journal mode to make pages freeable. */ - truncate_pagecache(inode, oldsize, inode->i_size); + truncate_pagecache(inode, inode->i_size); } /* * We want to call ext4_truncate() even if attr->ia_size == diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 11b51bb55b42..0062da21dd8b 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -147,7 +147,7 @@ static void fat_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) { - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); fat_truncate_blocks(inode, inode->i_size); } } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 30f6f27d5a59..9f4935b8f208 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -69,7 +69,7 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) { struct super_block *sb = inode->i_sb; - if (strcmp(sb->s_type->name, "bdev") == 0) + if (sb_is_blkdev_sb(sb)) return inode->i_mapping->backing_dev_info; return sb->s_bdi; @@ -251,11 +251,13 @@ static int move_expired_inodes(struct list_head *delaying_queue, if (work->older_than_this && inode_dirtied_after(inode, *work->older_than_this)) break; + list_move(&inode->i_wb_list, &tmp); + moved++; + if (sb_is_blkdev_sb(inode->i_sb)) + continue; if (sb && sb != inode->i_sb) do_sb_sort = 1; sb = inode->i_sb; - list_move(&inode->i_wb_list, &tmp); - moved++; } /* just one sb in list, splice to dispatch_queue and we're done */ diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 318e8433527c..b2a86e324aac 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -586,7 +586,8 @@ int __fscache_check_consistency(struct fscache_cookie *cookie) fscache_operation_init(op, NULL, NULL); op->flags = FSCACHE_OP_MYTHREAD | - (1 << FSCACHE_OP_WAITING); + (1 << FSCACHE_OP_WAITING) | + (1 << FSCACHE_OP_UNUSE_COOKIE); spin_lock(&cookie->lock); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 3ac91086f41f..62b43b577bfc 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1678,7 +1678,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock. */ if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { - truncate_pagecache(inode, oldsize, outarg.attr.size); + truncate_pagecache(inode, outarg.attr.size); invalidate_inode_pages2(inode->i_mapping); } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 84434594e80e..a8ce6dab60a0 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -218,7 +218,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, bool inval = false; if (oldsize != attr->size) { - truncate_pagecache(inode, oldsize, attr->size); + truncate_pagecache(inode, attr->size); inval = true; } else if (fc->auto_inval_data) { struct timespec new_mtime = { diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 5e2f56fccf6b..62a65fc448dc 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1016,7 +1016,7 @@ static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize chunk = oldsize - newsize; if (chunk > max_chunk) chunk = max_chunk; - truncate_pagecache(inode, oldsize, oldsize - chunk); + truncate_pagecache(inode, oldsize - chunk); oldsize -= chunk; gfs2_trans_end(sdp); error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES); @@ -1067,7 +1067,7 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) if (journaled) error = gfs2_journaled_truncate(inode, oldsize, newsize); else - truncate_pagecache(inode, oldsize, newsize); + truncate_pagecache(inode, newsize); if (error) { brelse(dibh); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 722329cac98f..c2f41b4d00b9 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1427,21 +1427,22 @@ __acquires(&lru_lock) * gfs2_dispose_glock_lru() above. */ -static void gfs2_scan_glock_lru(int nr) +static long gfs2_scan_glock_lru(int nr) { struct gfs2_glock *gl; LIST_HEAD(skipped); LIST_HEAD(dispose); + long freed = 0; spin_lock(&lru_lock); - while(nr && !list_empty(&lru_list)) { + while ((nr-- >= 0) && !list_empty(&lru_list)) { gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru); /* Test for being demotable */ if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { list_move(&gl->gl_lru, &dispose); atomic_dec(&lru_count); - nr--; + freed++; continue; } @@ -1451,23 +1452,28 @@ static void gfs2_scan_glock_lru(int nr) if (!list_empty(&dispose)) gfs2_dispose_glock_lru(&dispose); spin_unlock(&lru_lock); + + return freed; } -static int gfs2_shrink_glock_memory(struct shrinker *shrink, - struct shrink_control *sc) +static unsigned long gfs2_glock_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) { - if (sc->nr_to_scan) { - if (!(sc->gfp_mask & __GFP_FS)) - return -1; - gfs2_scan_glock_lru(sc->nr_to_scan); - } + if (!(sc->gfp_mask & __GFP_FS)) + return SHRINK_STOP; + return gfs2_scan_glock_lru(sc->nr_to_scan); +} - return (atomic_read(&lru_count) / 100) * sysctl_vfs_cache_pressure; +static unsigned long gfs2_glock_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + return vfs_pressure_ratio(atomic_read(&lru_count)); } static struct shrinker glock_shrinker = { - .shrink = gfs2_shrink_glock_memory, .seeks = DEFAULT_SEEKS, + .count_objects = gfs2_glock_shrink_count, + .scan_objects = gfs2_glock_shrink_scan, }; /** diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 64915eeae5a7..ced3257f06e8 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -694,8 +694,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, mark_inode_dirty(inode); d_instantiate(dentry, inode); - if (file) + if (file) { + *opened |= FILE_CREATED; error = finish_open(file, dentry, gfs2_open_common, opened); + } gfs2_glock_dq_uninit(ghs); gfs2_glock_dq_uninit(ghs + 1); return error; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 7b0f5043cf24..351586e24e30 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -32,7 +32,8 @@ struct workqueue_struct *gfs2_control_wq; static struct shrinker qd_shrinker = { - .shrink = gfs2_shrink_qd_memory, + .count_objects = gfs2_qd_shrink_count, + .scan_objects = gfs2_qd_shrink_scan, .seeks = DEFAULT_SEEKS, }; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 3768c2f40e43..db441359ee8c 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -75,17 +75,16 @@ static LIST_HEAD(qd_lru_list); static atomic_t qd_lru_count = ATOMIC_INIT(0); static DEFINE_SPINLOCK(qd_lru_lock); -int gfs2_shrink_qd_memory(struct shrinker *shrink, struct shrink_control *sc) +unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) { struct gfs2_quota_data *qd; struct gfs2_sbd *sdp; int nr_to_scan = sc->nr_to_scan; - - if (nr_to_scan == 0) - goto out; + long freed = 0; if (!(sc->gfp_mask & __GFP_FS)) - return -1; + return SHRINK_STOP; spin_lock(&qd_lru_lock); while (nr_to_scan && !list_empty(&qd_lru_list)) { @@ -110,11 +109,16 @@ int gfs2_shrink_qd_memory(struct shrinker *shrink, struct shrink_control *sc) kmem_cache_free(gfs2_quotad_cachep, qd); spin_lock(&qd_lru_lock); nr_to_scan--; + freed++; } spin_unlock(&qd_lru_lock); + return freed; +} -out: - return (atomic_read(&qd_lru_count) * sysctl_vfs_cache_pressure) / 100; +unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + return vfs_pressure_ratio(atomic_read(&qd_lru_count)); } static u64 qd2index(struct gfs2_quota_data *qd) diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 4f5e6e44ed83..0f64d9deb1b0 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -53,8 +53,10 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) return ret; } -extern int gfs2_shrink_qd_memory(struct shrinker *shrink, - struct shrink_control *sc); +extern unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, + struct shrink_control *sc); +extern unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc); extern const struct quotactl_ops gfs2_quotactl_ops; #endif /* __QUOTA_DOT_H__ */ diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index f9299d8a64e3..380ab31b5e0f 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -41,7 +41,7 @@ static void hfs_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) { - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); hfs_file_truncate(inode); } } diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 4d2edaea891c..37213d075f3c 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -36,7 +36,7 @@ static void hfsplus_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) { - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); hfsplus_file_truncate(inode); } } diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 4e9dabcf1f4c..67c1a61e0955 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -138,7 +138,7 @@ static void hpfs_write_failed(struct address_space *mapping, loff_t to) hpfs_lock(inode->i_sb); if (to > inode->i_size) { - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); hpfs_truncate(inode); } diff --git a/fs/inode.c b/fs/inode.c index 93a0625b46e4..b33ba8e021cc 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -17,6 +17,7 @@ #include <linux/prefetch.h> #include <linux/buffer_head.h> /* for inode_has_buffers */ #include <linux/ratelimit.h> +#include <linux/list_lru.h> #include "internal.h" /* @@ -24,7 +25,7 @@ * * inode->i_lock protects: * inode->i_state, inode->i_hash, __iget() - * inode->i_sb->s_inode_lru_lock protects: + * Inode LRU list locks protect: * inode->i_sb->s_inode_lru, inode->i_lru * inode_sb_list_lock protects: * sb->s_inodes, inode->i_sb_list @@ -37,7 +38,7 @@ * * inode_sb_list_lock * inode->i_lock - * inode->i_sb->s_inode_lru_lock + * Inode LRU list locks * * bdi->wb.list_lock * inode->i_lock @@ -70,33 +71,33 @@ EXPORT_SYMBOL(empty_aops); */ struct inodes_stat_t inodes_stat; -static DEFINE_PER_CPU(unsigned int, nr_inodes); -static DEFINE_PER_CPU(unsigned int, nr_unused); +static DEFINE_PER_CPU(unsigned long, nr_inodes); +static DEFINE_PER_CPU(unsigned long, nr_unused); static struct kmem_cache *inode_cachep __read_mostly; -static int get_nr_inodes(void) +static long get_nr_inodes(void) { int i; - int sum = 0; + long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_inodes, i); return sum < 0 ? 0 : sum; } -static inline int get_nr_inodes_unused(void) +static inline long get_nr_inodes_unused(void) { int i; - int sum = 0; + long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_unused, i); return sum < 0 ? 0 : sum; } -int get_nr_dirty_inodes(void) +long get_nr_dirty_inodes(void) { /* not actually dirty inodes, but a wild approximation */ - int nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); + long nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); return nr_dirty > 0 ? nr_dirty : 0; } @@ -109,7 +110,7 @@ int proc_nr_inodes(ctl_table *table, int write, { inodes_stat.nr_inodes = get_nr_inodes(); inodes_stat.nr_unused = get_nr_inodes_unused(); - return proc_dointvec(table, write, buffer, lenp, ppos); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #endif @@ -401,13 +402,8 @@ EXPORT_SYMBOL(ihold); static void inode_lru_list_add(struct inode *inode) { - spin_lock(&inode->i_sb->s_inode_lru_lock); - if (list_empty(&inode->i_lru)) { - list_add(&inode->i_lru, &inode->i_sb->s_inode_lru); - inode->i_sb->s_nr_inodes_unused++; + if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_inc(nr_unused); - } - spin_unlock(&inode->i_sb->s_inode_lru_lock); } /* @@ -425,13 +421,9 @@ void inode_add_lru(struct inode *inode) static void inode_lru_list_del(struct inode *inode) { - spin_lock(&inode->i_sb->s_inode_lru_lock); - if (!list_empty(&inode->i_lru)) { - list_del_init(&inode->i_lru); - inode->i_sb->s_nr_inodes_unused--; + + if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_dec(nr_unused); - } - spin_unlock(&inode->i_sb->s_inode_lru_lock); } /** @@ -675,24 +667,8 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) return busy; } -static int can_unuse(struct inode *inode) -{ - if (inode->i_state & ~I_REFERENCED) - return 0; - if (inode_has_buffers(inode)) - return 0; - if (atomic_read(&inode->i_count)) - return 0; - if (inode->i_data.nrpages) - return 0; - return 1; -} - /* - * Walk the superblock inode LRU for freeable inodes and attempt to free them. - * This is called from the superblock shrinker function with a number of inodes - * to trim from the LRU. Inodes to be freed are moved to a temporary list and - * then are freed outside inode_lock by dispose_list(). + * Isolate the inode from the LRU in preparation for freeing it. * * Any inodes which are pinned purely because of attached pagecache have their * pagecache removed. If the inode has metadata buffers attached to @@ -706,89 +682,82 @@ static int can_unuse(struct inode *inode) * LRU does not have strict ordering. Hence we don't want to reclaim inodes * with this flag set because they are the inodes that are out of order. */ -void prune_icache_sb(struct super_block *sb, int nr_to_scan) +static enum lru_status +inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) { - LIST_HEAD(freeable); - int nr_scanned; - unsigned long reap = 0; + struct list_head *freeable = arg; + struct inode *inode = container_of(item, struct inode, i_lru); - spin_lock(&sb->s_inode_lru_lock); - for (nr_scanned = nr_to_scan; nr_scanned >= 0; nr_scanned--) { - struct inode *inode; + /* + * we are inverting the lru lock/inode->i_lock here, so use a trylock. + * If we fail to get the lock, just skip it. + */ + if (!spin_trylock(&inode->i_lock)) + return LRU_SKIP; - if (list_empty(&sb->s_inode_lru)) - break; + /* + * Referenced or dirty inodes are still in use. Give them another pass + * through the LRU as we canot reclaim them now. + */ + if (atomic_read(&inode->i_count) || + (inode->i_state & ~I_REFERENCED)) { + list_del_init(&inode->i_lru); + spin_unlock(&inode->i_lock); + this_cpu_dec(nr_unused); + return LRU_REMOVED; + } - inode = list_entry(sb->s_inode_lru.prev, struct inode, i_lru); + /* recently referenced inodes get one more pass */ + if (inode->i_state & I_REFERENCED) { + inode->i_state &= ~I_REFERENCED; + spin_unlock(&inode->i_lock); + return LRU_ROTATE; + } - /* - * we are inverting the sb->s_inode_lru_lock/inode->i_lock here, - * so use a trylock. If we fail to get the lock, just move the - * inode to the back of the list so we don't spin on it. - */ - if (!spin_trylock(&inode->i_lock)) { - list_move(&inode->i_lru, &sb->s_inode_lru); - continue; + if (inode_has_buffers(inode) || inode->i_data.nrpages) { + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(lru_lock); + if (remove_inode_buffers(inode)) { + unsigned long reap; + reap = invalidate_mapping_pages(&inode->i_data, 0, -1); + if (current_is_kswapd()) + __count_vm_events(KSWAPD_INODESTEAL, reap); + else + __count_vm_events(PGINODESTEAL, reap); + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += reap; } + iput(inode); + spin_lock(lru_lock); + return LRU_RETRY; + } - /* - * Referenced or dirty inodes are still in use. Give them - * another pass through the LRU as we canot reclaim them now. - */ - if (atomic_read(&inode->i_count) || - (inode->i_state & ~I_REFERENCED)) { - list_del_init(&inode->i_lru); - spin_unlock(&inode->i_lock); - sb->s_nr_inodes_unused--; - this_cpu_dec(nr_unused); - continue; - } + WARN_ON(inode->i_state & I_NEW); + inode->i_state |= I_FREEING; + list_move(&inode->i_lru, freeable); + spin_unlock(&inode->i_lock); - /* recently referenced inodes get one more pass */ - if (inode->i_state & I_REFERENCED) { - inode->i_state &= ~I_REFERENCED; - list_move(&inode->i_lru, &sb->s_inode_lru); - spin_unlock(&inode->i_lock); - continue; - } - if (inode_has_buffers(inode) || inode->i_data.nrpages) { - __iget(inode); - spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_lru_lock); - if (remove_inode_buffers(inode)) - reap += invalidate_mapping_pages(&inode->i_data, - 0, -1); - iput(inode); - spin_lock(&sb->s_inode_lru_lock); - - if (inode != list_entry(sb->s_inode_lru.next, - struct inode, i_lru)) - continue; /* wrong inode or list_empty */ - /* avoid lock inversions with trylock */ - if (!spin_trylock(&inode->i_lock)) - continue; - if (!can_unuse(inode)) { - spin_unlock(&inode->i_lock); - continue; - } - } - WARN_ON(inode->i_state & I_NEW); - inode->i_state |= I_FREEING; - spin_unlock(&inode->i_lock); + this_cpu_dec(nr_unused); + return LRU_REMOVED; +} - list_move(&inode->i_lru, &freeable); - sb->s_nr_inodes_unused--; - this_cpu_dec(nr_unused); - } - if (current_is_kswapd()) - __count_vm_events(KSWAPD_INODESTEAL, reap); - else - __count_vm_events(PGINODESTEAL, reap); - spin_unlock(&sb->s_inode_lru_lock); - if (current->reclaim_state) - current->reclaim_state->reclaimed_slab += reap; +/* + * Walk the superblock inode LRU for freeable inodes and attempt to free them. + * This is called from the superblock shrinker function with a number of inodes + * to trim from the LRU. Inodes to be freed are moved to a temporary list and + * then are freed outside inode_lock by dispose_list(). + */ +long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan, + int nid) +{ + LIST_HEAD(freeable); + long freed; + freed = list_lru_walk_node(&sb->s_inode_lru, nid, inode_lru_isolate, + &freeable, &nr_to_scan); dispose_list(&freeable); + return freed; } static void __wait_on_freeing_inode(struct inode *inode); diff --git a/fs/internal.h b/fs/internal.h index 2be46ea5dd0b..513e0d859a6c 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -114,6 +114,8 @@ extern int open_check_o_direct(struct file *f); * inode.c */ extern spinlock_t inode_sb_list_lock; +extern long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan, + int nid); extern void inode_add_lru(struct inode *inode); /* @@ -121,7 +123,7 @@ extern void inode_add_lru(struct inode *inode); */ extern void inode_wb_list_del(struct inode *inode); -extern int get_nr_dirty_inodes(void); +extern long get_nr_dirty_inodes(void); extern void evict_inodes(struct super_block *); extern int invalidate_inodes(struct super_block *, bool); @@ -130,6 +132,8 @@ extern int invalidate_inodes(struct super_block *, bool); */ extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); extern int d_set_mounted(struct dentry *dentry); +extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, + int nid); /* * read_write.c diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 730f24e282a6..f4aab719add5 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -306,7 +306,7 @@ static void jfs_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) { - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); jfs_truncate(inode); } } diff --git a/fs/mbcache.c b/fs/mbcache.c index 8c32ef3ba88e..e519e45bf673 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -86,18 +86,6 @@ static LIST_HEAD(mb_cache_list); static LIST_HEAD(mb_cache_lru_list); static DEFINE_SPINLOCK(mb_cache_spinlock); -/* - * What the mbcache registers as to get shrunk dynamically. - */ - -static int mb_cache_shrink_fn(struct shrinker *shrink, - struct shrink_control *sc); - -static struct shrinker mb_cache_shrinker = { - .shrink = mb_cache_shrink_fn, - .seeks = DEFAULT_SEEKS, -}; - static inline int __mb_cache_entry_is_hashed(struct mb_cache_entry *ce) { @@ -151,7 +139,7 @@ forget: /* - * mb_cache_shrink_fn() memory pressure callback + * mb_cache_shrink_scan() memory pressure callback * * This function is called by the kernel memory management when memory * gets low. @@ -159,17 +147,16 @@ forget: * @shrink: (ignored) * @sc: shrink_control passed from reclaim * - * Returns the number of objects which are present in the cache. + * Returns the number of objects freed. */ -static int -mb_cache_shrink_fn(struct shrinker *shrink, struct shrink_control *sc) +static unsigned long +mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { LIST_HEAD(free_list); - struct mb_cache *cache; struct mb_cache_entry *entry, *tmp; - int count = 0; int nr_to_scan = sc->nr_to_scan; gfp_t gfp_mask = sc->gfp_mask; + unsigned long freed = 0; mb_debug("trying to free %d entries", nr_to_scan); spin_lock(&mb_cache_spinlock); @@ -179,19 +166,37 @@ mb_cache_shrink_fn(struct shrinker *shrink, struct shrink_control *sc) struct mb_cache_entry, e_lru_list); list_move_tail(&ce->e_lru_list, &free_list); __mb_cache_entry_unhash(ce); + freed++; + } + spin_unlock(&mb_cache_spinlock); + list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) { + __mb_cache_entry_forget(entry, gfp_mask); } + return freed; +} + +static unsigned long +mb_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +{ + struct mb_cache *cache; + unsigned long count = 0; + + spin_lock(&mb_cache_spinlock); list_for_each_entry(cache, &mb_cache_list, c_cache_list) { mb_debug("cache %s (%d)", cache->c_name, atomic_read(&cache->c_entry_count)); count += atomic_read(&cache->c_entry_count); } spin_unlock(&mb_cache_spinlock); - list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) { - __mb_cache_entry_forget(entry, gfp_mask); - } - return (count / 100) * sysctl_vfs_cache_pressure; + + return vfs_pressure_ratio(count); } +static struct shrinker mb_cache_shrinker = { + .count_objects = mb_cache_shrink_count, + .scan_objects = mb_cache_shrink_scan, + .seeks = DEFAULT_SEEKS, +}; /* * mb_cache_create() create a new cache diff --git a/fs/minix/inode.c b/fs/minix/inode.c index df122496f328..0332109162a5 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -400,7 +400,7 @@ static void minix_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) { - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); minix_truncate(inode); } } diff --git a/fs/namei.c b/fs/namei.c index 409a441ba2ae..645268f23eb6 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -660,29 +660,6 @@ static __always_inline void set_root_rcu(struct nameidata *nd) } } -static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) -{ - int ret; - - if (IS_ERR(link)) - goto fail; - - if (*link == '/') { - set_root(nd); - path_put(&nd->path); - nd->path = nd->root; - path_get(&nd->root); - nd->flags |= LOOKUP_JUMPED; - } - nd->inode = nd->path.dentry->d_inode; - - ret = link_path_walk(link, nd); - return ret; -fail: - path_put(&nd->path); - return PTR_ERR(link); -} - static void path_put_conditional(struct path *path, struct nameidata *nd) { dput(path->dentry); @@ -874,7 +851,20 @@ follow_link(struct path *link, struct nameidata *nd, void **p) error = 0; s = nd_get_link(nd); if (s) { - error = __vfs_follow_link(nd, s); + if (unlikely(IS_ERR(s))) { + path_put(&nd->path); + put_link(nd, link, *p); + return PTR_ERR(s); + } + if (*s == '/') { + set_root(nd); + path_put(&nd->path); + nd->path = nd->root; + path_get(&nd->root); + nd->flags |= LOOKUP_JUMPED; + } + nd->inode = nd->path.dentry->d_inode; + error = link_path_walk(s, nd); if (unlikely(error)) put_link(nd, link, *p); } @@ -2271,12 +2261,15 @@ mountpoint_last(struct nameidata *nd, struct path *path) dentry = d_alloc(dir, &nd->last); if (!dentry) { error = -ENOMEM; + mutex_unlock(&dir->d_inode->i_mutex); goto out; } dentry = lookup_real(dir->d_inode, dentry, nd->flags); error = PTR_ERR(dentry); - if (IS_ERR(dentry)) + if (IS_ERR(dentry)) { + mutex_unlock(&dir->d_inode->i_mutex); goto out; + } } mutex_unlock(&dir->d_inode->i_mutex); @@ -2663,6 +2656,7 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, int acc_mode; int create_error = 0; struct dentry *const DENTRY_NOT_SET = (void *) -1UL; + bool excl; BUG_ON(dentry->d_inode); @@ -2676,10 +2670,9 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, if ((open_flag & O_CREAT) && !IS_POSIXACL(dir)) mode &= ~current_umask(); - if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) { + excl = (open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT); + if (excl) open_flag &= ~O_TRUNC; - *opened |= FILE_CREATED; - } /* * Checking write permission is tricky, bacuse we don't know if we are @@ -2732,12 +2725,6 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, goto out; } - acc_mode = op->acc_mode; - if (*opened & FILE_CREATED) { - fsnotify_create(dir, dentry); - acc_mode = MAY_OPEN; - } - if (error) { /* returned 1, that is */ if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) { error = -EIO; @@ -2747,9 +2734,19 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, dput(dentry); dentry = file->f_path.dentry; } - if (create_error && dentry->d_inode == NULL) { - error = create_error; - goto out; + if (*opened & FILE_CREATED) + fsnotify_create(dir, dentry); + if (!dentry->d_inode) { + WARN_ON(*opened & FILE_CREATED); + if (create_error) { + error = create_error; + goto out; + } + } else { + if (excl && !(*opened & FILE_CREATED)) { + error = -EEXIST; + goto out; + } } goto looked_up; } @@ -2758,6 +2755,12 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, * We didn't have the inode before the open, so check open permission * here. */ + acc_mode = op->acc_mode; + if (*opened & FILE_CREATED) { + WARN_ON(!(open_flag & O_CREAT)); + fsnotify_create(dir, dentry); + acc_mode = MAY_OPEN; + } error = may_open(&file->f_path, acc_mode, open_flag); if (error) fput(file); @@ -4236,11 +4239,6 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) return res; } -int vfs_follow_link(struct nameidata *nd, const char *link) -{ - return __vfs_follow_link(nd, link); -} - /* get the link contents into pagecache */ static char *page_getlink(struct dentry * dentry, struct page **ppage) { @@ -4352,7 +4350,6 @@ EXPORT_SYMBOL(vfs_path_lookup); EXPORT_SYMBOL(inode_permission); EXPORT_SYMBOL(unlock_rename); EXPORT_SYMBOL(vfs_create); -EXPORT_SYMBOL(vfs_follow_link); EXPORT_SYMBOL(vfs_link); EXPORT_SYMBOL(vfs_mkdir); EXPORT_SYMBOL(vfs_mknod); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e79bc6ce828e..854a8f05a610 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1392,6 +1392,9 @@ static int nfs_finish_open(struct nfs_open_context *ctx, { int err; + if ((open_flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) + *opened |= FILE_CREATED; + err = finish_open(file, dentry, do_open, opened); if (err) goto out; @@ -2006,17 +2009,18 @@ static void nfs_access_free_list(struct list_head *head) } } -int nfs_access_cache_shrinker(struct shrinker *shrink, - struct shrink_control *sc) +unsigned long +nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { LIST_HEAD(head); struct nfs_inode *nfsi, *next; struct nfs_access_entry *cache; int nr_to_scan = sc->nr_to_scan; gfp_t gfp_mask = sc->gfp_mask; + long freed = 0; if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) - return (nr_to_scan == 0) ? 0 : -1; + return SHRINK_STOP; spin_lock(&nfs_access_lru_lock); list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) { @@ -2032,6 +2036,7 @@ int nfs_access_cache_shrinker(struct shrinker *shrink, struct nfs_access_entry, lru); list_move(&cache->lru, &head); rb_erase(&cache->rb_node, &nfsi->access_cache); + freed++; if (!list_empty(&nfsi->access_cache_entry_lru)) list_move_tail(&nfsi->access_cache_inode_lru, &nfs_access_lru_list); @@ -2046,7 +2051,13 @@ remove_lru_entry: } spin_unlock(&nfs_access_lru_lock); nfs_access_free_list(&head); - return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure; + return freed; +} + +unsigned long +nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc) +{ + return vfs_pressure_ratio(atomic_long_read(&nfs_access_nr_entries)); } static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 0bd7a55a5f07..91ff089d3412 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -130,7 +130,6 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_ return -EINVAL; #else - VM_BUG_ON(iocb->ki_left != PAGE_SIZE); VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); if (rw == READ || rw == KERNEL_READ) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 87e797640828..eda8879171c4 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -541,7 +541,6 @@ EXPORT_SYMBOL_GPL(nfs_setattr); */ static int nfs_vmtruncate(struct inode * inode, loff_t offset) { - loff_t oldsize; int err; err = inode_newsize_ok(inode, offset); @@ -549,11 +548,10 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset) goto out; spin_lock(&inode->i_lock); - oldsize = inode->i_size; i_size_write(inode, offset); spin_unlock(&inode->i_lock); - truncate_pagecache(inode, oldsize, offset); + truncate_pagecache(inode, offset); out: return err; } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index d388302c005f..38da8c2b81ac 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -273,8 +273,10 @@ extern struct nfs_client *nfs_init_client(struct nfs_client *clp, const char *ip_addr); /* dir.c */ -extern int nfs_access_cache_shrinker(struct shrinker *shrink, - struct shrink_control *sc); +extern unsigned long nfs_access_cache_count(struct shrinker *shrink, + struct shrink_control *sc); +extern unsigned long nfs_access_cache_scan(struct shrinker *shrink, + struct shrink_control *sc); struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int); int nfs_create(struct inode *, struct dentry *, umode_t, bool); int nfs_mkdir(struct inode *, struct dentry *, umode_t); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index f520a1113b38..28842abafab4 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -279,15 +279,15 @@ _nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode, if (test_bit(sp4_mode, &clp->cl_sp4_flags)) { spin_lock(&clp->cl_lock); if (clp->cl_machine_cred != NULL) - newcred = get_rpccred(clp->cl_machine_cred); + /* don't call get_rpccred on the machine cred - + * a reference will be held for life of clp */ + newcred = clp->cl_machine_cred; spin_unlock(&clp->cl_lock); - if (msg->rpc_cred) - put_rpccred(msg->rpc_cred); msg->rpc_cred = newcred; flavor = clp->cl_rpcclient->cl_auth->au_flavor; - WARN_ON(flavor != RPC_AUTH_GSS_KRB5I && - flavor != RPC_AUTH_GSS_KRB5P); + WARN_ON_ONCE(flavor != RPC_AUTH_GSS_KRB5I && + flavor != RPC_AUTH_GSS_KRB5P); *clntp = clp->cl_rpcclient; return true; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 39b6cf2d1683..989bb9d3074d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6001,10 +6001,12 @@ static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct .rpc_resp = &res, }; struct rpc_clnt *clnt = NFS_SERVER(dir)->client; + struct rpc_cred *cred = NULL; if (use_integrity) { clnt = NFS_SERVER(dir)->nfs_client->cl_rpcclient; - msg.rpc_cred = nfs4_get_clid_cred(NFS_SERVER(dir)->nfs_client); + cred = nfs4_get_clid_cred(NFS_SERVER(dir)->nfs_client); + msg.rpc_cred = cred; } dprintk("NFS call secinfo %s\n", name->name); @@ -6016,8 +6018,8 @@ static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct &res.seq_res, 0); dprintk("NFS reply secinfo: %d\n", status); - if (msg.rpc_cred) - put_rpccred(msg.rpc_cred); + if (cred) + put_rpccred(cred); return status; } @@ -6151,11 +6153,13 @@ static const struct nfs41_state_protection nfs4_sp4_mach_cred_request = { }, .allow.u.words = { [0] = 1 << (OP_CLOSE) | - 1 << (OP_LOCKU), + 1 << (OP_LOCKU) | + 1 << (OP_COMMIT), [1] = 1 << (OP_SECINFO - 32) | 1 << (OP_SECINFO_NO_NAME - 32) | 1 << (OP_TEST_STATEID - 32) | - 1 << (OP_FREE_STATEID - 32) + 1 << (OP_FREE_STATEID - 32) | + 1 << (OP_WRITE - 32) } }; @@ -7496,11 +7500,13 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_resp = &res, }; struct rpc_clnt *clnt = server->client; + struct rpc_cred *cred = NULL; int status; if (use_integrity) { clnt = server->nfs_client->cl_rpcclient; - msg.rpc_cred = nfs4_get_clid_cred(server->nfs_client); + cred = nfs4_get_clid_cred(server->nfs_client); + msg.rpc_cred = cred; } dprintk("--> %s\n", __func__); @@ -7508,8 +7514,8 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, &res.seq_res, 0); dprintk("<-- %s status=%d\n", __func__, status); - if (msg.rpc_cred) - put_rpccred(msg.rpc_cred); + if (cred) + put_rpccred(cred); return status; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index fbdad9e1719f..79210d23f607 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -414,7 +414,7 @@ static int nfs4_stat_to_errno(int); #define decode_test_stateid_maxsz (op_decode_hdr_maxsz + 2 + 1) #define encode_free_stateid_maxsz (op_encode_hdr_maxsz + 1 + \ XDR_QUADLEN(NFS4_STATEID_SIZE)) -#define decode_free_stateid_maxsz (op_decode_hdr_maxsz + 1) +#define decode_free_stateid_maxsz (op_decode_hdr_maxsz) #else /* CONFIG_NFS_V4_1 */ #define encode_sequence_maxsz 0 #define decode_sequence_maxsz 0 @@ -5966,21 +5966,8 @@ out: static int decode_free_stateid(struct xdr_stream *xdr, struct nfs41_free_stateid_res *res) { - __be32 *p; - int status; - - status = decode_op_hdr(xdr, OP_FREE_STATEID); - if (status) - return status; - - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_overflow; - res->status = be32_to_cpup(p++); + res->status = decode_op_hdr(xdr, OP_FREE_STATEID); return res->status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 5793f24613c8..a03b9c6f9489 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -360,7 +360,8 @@ static void unregister_nfs4_fs(void) #endif static struct shrinker acl_shrinker = { - .shrink = nfs_access_cache_shrinker, + .count_objects = nfs_access_cache_count, + .scan_objects = nfs_access_cache_scan, .seeks = DEFAULT_SEEKS, }; diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index e76244edd748..9186c7ce0b14 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -59,11 +59,14 @@ static unsigned int longest_chain_cachesize; static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec); static void cache_cleaner_func(struct work_struct *unused); -static int nfsd_reply_cache_shrink(struct shrinker *shrink, - struct shrink_control *sc); +static unsigned long nfsd_reply_cache_count(struct shrinker *shrink, + struct shrink_control *sc); +static unsigned long nfsd_reply_cache_scan(struct shrinker *shrink, + struct shrink_control *sc); static struct shrinker nfsd_reply_cache_shrinker = { - .shrink = nfsd_reply_cache_shrink, + .scan_objects = nfsd_reply_cache_scan, + .count_objects = nfsd_reply_cache_count, .seeks = 1, }; @@ -232,16 +235,18 @@ nfsd_cache_entry_expired(struct svc_cacherep *rp) * Walk the LRU list and prune off entries that are older than RC_EXPIRE. * Also prune the oldest ones when the total exceeds the max number of entries. */ -static void +static long prune_cache_entries(void) { struct svc_cacherep *rp, *tmp; + long freed = 0; list_for_each_entry_safe(rp, tmp, &lru_head, c_lru) { if (!nfsd_cache_entry_expired(rp) && num_drc_entries <= max_drc_entries) break; nfsd_reply_cache_free_locked(rp); + freed++; } /* @@ -254,6 +259,7 @@ prune_cache_entries(void) cancel_delayed_work(&cache_cleaner); else mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE); + return freed; } static void @@ -264,20 +270,28 @@ cache_cleaner_func(struct work_struct *unused) spin_unlock(&cache_lock); } -static int -nfsd_reply_cache_shrink(struct shrinker *shrink, struct shrink_control *sc) +static unsigned long +nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc) { - unsigned int num; + unsigned long num; spin_lock(&cache_lock); - if (sc->nr_to_scan) - prune_cache_entries(); num = num_drc_entries; spin_unlock(&cache_lock); return num; } +static unsigned long +nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + unsigned long freed; + + spin_lock(&cache_lock); + freed = prune_cache_entries(); + spin_unlock(&cache_lock); + return freed; +} /* * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes */ diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index b1a5277cfd18..7e350c562e0e 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -254,7 +254,7 @@ void nilfs_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) { - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); nilfs_truncate(inode); } } diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index c5670b8d198c..ea4ba9daeb47 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1768,7 +1768,7 @@ static void ntfs_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) { - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); ntfs_truncate_vfs(inode); } } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 4f8197caa487..d71903c6068b 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2242,7 +2242,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, file->f_path.dentry->d_name.name, (unsigned int)nr_segs); - if (iocb->ki_left == 0) + if (iocb->ki_nbytes == 0) return 0; appending = file->f_flags & O_APPEND ? 1 : 0; @@ -2293,7 +2293,7 @@ relock: can_do_direct = direct_io; ret = ocfs2_prepare_inode_for_write(file, ppos, - iocb->ki_left, appending, + iocb->ki_nbytes, appending, &can_do_direct, &has_refcount); if (ret < 0) { mlog_errno(ret); @@ -2301,7 +2301,7 @@ relock: } if (direct_io && !is_sync_kiocb(iocb)) - unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left, + unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_nbytes, *ppos); /* diff --git a/fs/omfs/file.c b/fs/omfs/file.c index e0d9b3e722bd..54d57d6ba68d 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -311,7 +311,7 @@ static void omfs_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) { - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); omfs_truncate(inode); } } diff --git a/fs/open.c b/fs/open.c index 2a731b0d08bc..d420331ca32a 100644 --- a/fs/open.c +++ b/fs/open.c @@ -744,14 +744,24 @@ cleanup_file: /** * finish_open - finish opening a file - * @od: opaque open data + * @file: file pointer * @dentry: pointer to dentry * @open: open callback + * @opened: state of open * * This can be used to finish opening a file passed to i_op->atomic_open(). * * If the open callback is set to NULL, then the standard f_op->open() * filesystem callback is substituted. + * + * NB: the dentry reference is _not_ consumed. If, for example, the dentry is + * the return value of d_splice_alias(), then the caller needs to perform dput() + * on it after finish_open(). + * + * On successful return @file is a fully instantiated open file. After this, if + * an error occurs in ->atomic_open(), it needs to clean up with fput(). + * + * Returns zero on success or -errno if the open failed. */ int finish_open(struct file *file, struct dentry *dentry, int (*open)(struct inode *, struct file *), @@ -772,11 +782,16 @@ EXPORT_SYMBOL(finish_open); /** * finish_no_open - finish ->atomic_open() without opening the file * - * @od: opaque open data + * @file: file pointer * @dentry: dentry or NULL (as returned from ->lookup()) * * This can be used to set the result of a successful lookup in ->atomic_open(). - * The filesystem's atomic_open() method shall return NULL after calling this. + * + * NB: unlike finish_open() this function does consume the dentry reference and + * the caller need not dput() it. + * + * Returns "1" which must be the return value of ->atomic_open() after having + * called this function. */ int finish_no_open(struct file *file, struct dentry *dentry) { diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 5aa847a603c0..59d85d608898 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -132,13 +132,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(i.freeswap), K(global_page_state(NR_FILE_DIRTY)), K(global_page_state(NR_WRITEBACK)), -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - K(global_page_state(NR_ANON_PAGES) - + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * - HPAGE_PMD_NR), -#else K(global_page_state(NR_ANON_PAGES)), -#endif K(global_page_state(NR_FILE_MAPPED)), K(global_page_state(NR_SHMEM)), K(global_page_state(NR_SLAB_RECLAIMABLE) + diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 4ffb7ab5e397..b8e93a40a5d3 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -168,7 +168,7 @@ static int pstore_decompress(void *in, void *out, size_t inlen, size_t outlen) int err, ret; ret = -EIO; - err = zlib_inflateInit(&stream); + err = zlib_inflateInit2(&stream, WINDOW_BITS); if (err != Z_OK) goto error; @@ -195,8 +195,29 @@ error: static void allocate_buf_for_compression(void) { size_t size; + size_t cmpr; + + switch (psinfo->bufsize) { + /* buffer range for efivars */ + case 1000 ... 2000: + cmpr = 56; + break; + case 2001 ... 3000: + cmpr = 54; + break; + case 3001 ... 3999: + cmpr = 52; + break; + /* buffer range for nvram, erst */ + case 4000 ... 10000: + cmpr = 45; + break; + default: + cmpr = 60; + break; + } - big_oops_buf_sz = (psinfo->bufsize * 100) / 45; + big_oops_buf_sz = (psinfo->bufsize * 100) / cmpr; big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL); if (big_oops_buf) { size = max(zlib_deflate_workspacesize(WINDOW_BITS, MEM_LEVEL), @@ -295,10 +316,6 @@ static void pstore_dump(struct kmsg_dumper *dumper, compressed = true; total_len = zipped_len; } else { - pr_err("pstore: compression failed for Part %d" - " returned %d\n", part, zipped_len); - pr_err("pstore: Capture uncompressed" - " oops/panic report of Part %d\n", part); compressed = false; total_len = copy_kmsg_to_buffer(hsize, len); } diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 9a702e193538..831d49a4111f 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -687,45 +687,37 @@ int dquot_quota_sync(struct super_block *sb, int type) } EXPORT_SYMBOL(dquot_quota_sync); -/* Free unused dquots from cache */ -static void prune_dqcache(int count) +static unsigned long +dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { struct list_head *head; struct dquot *dquot; + unsigned long freed = 0; head = free_dquots.prev; - while (head != &free_dquots && count) { + while (head != &free_dquots && sc->nr_to_scan) { dquot = list_entry(head, struct dquot, dq_free); remove_dquot_hash(dquot); remove_free_dquot(dquot); remove_inuse(dquot); do_destroy_dquot(dquot); - count--; + sc->nr_to_scan--; + freed++; head = free_dquots.prev; } + return freed; } -/* - * This is called from kswapd when we think we need some - * more memory - */ -static int shrink_dqcache_memory(struct shrinker *shrink, - struct shrink_control *sc) +static unsigned long +dqcache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { - int nr = sc->nr_to_scan; - - if (nr) { - spin_lock(&dq_list_lock); - prune_dqcache(nr); - spin_unlock(&dq_list_lock); - } - return ((unsigned) - percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS]) - /100) * sysctl_vfs_cache_pressure; + return vfs_pressure_ratio( + percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS])); } static struct shrinker dqcache_shrinker = { - .shrink = shrink_dqcache_memory, + .count_objects = dqcache_shrink_count, + .scan_objects = dqcache_shrink_scan, .seeks = DEFAULT_SEEKS, }; diff --git a/fs/read_write.c b/fs/read_write.c index 122a3846d9e1..e3cd280b158c 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -367,7 +367,6 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_left = len; kiocb.ki_nbytes = len; ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); @@ -417,7 +416,6 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_left = len; kiocb.ki_nbytes = len; ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); @@ -599,7 +597,6 @@ static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_left = len; kiocb.ki_nbytes = len; ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); diff --git a/fs/super.c b/fs/super.c index f6961ea84c56..3a96c9783a8b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -53,11 +53,15 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = { * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we * take a passive reference to the superblock to avoid this from occurring. */ -static int prune_super(struct shrinker *shrink, struct shrink_control *sc) +static unsigned long super_cache_scan(struct shrinker *shrink, + struct shrink_control *sc) { struct super_block *sb; - int fs_objects = 0; - int total_objects; + long fs_objects = 0; + long total_objects; + long freed = 0; + long dentries; + long inodes; sb = container_of(shrink, struct super_block, s_shrink); @@ -65,46 +69,62 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc) * Deadlock avoidance. We may hold various FS locks, and we don't want * to recurse into the FS that called us in clear_inode() and friends.. */ - if (sc->nr_to_scan && !(sc->gfp_mask & __GFP_FS)) - return -1; + if (!(sc->gfp_mask & __GFP_FS)) + return SHRINK_STOP; if (!grab_super_passive(sb)) - return -1; + return SHRINK_STOP; if (sb->s_op->nr_cached_objects) - fs_objects = sb->s_op->nr_cached_objects(sb); - - total_objects = sb->s_nr_dentry_unused + - sb->s_nr_inodes_unused + fs_objects + 1; - - if (sc->nr_to_scan) { - int dentries; - int inodes; - - /* proportion the scan between the caches */ - dentries = (sc->nr_to_scan * sb->s_nr_dentry_unused) / - total_objects; - inodes = (sc->nr_to_scan * sb->s_nr_inodes_unused) / - total_objects; - if (fs_objects) - fs_objects = (sc->nr_to_scan * fs_objects) / - total_objects; - /* - * prune the dcache first as the icache is pinned by it, then - * prune the icache, followed by the filesystem specific caches - */ - prune_dcache_sb(sb, dentries); - prune_icache_sb(sb, inodes); + fs_objects = sb->s_op->nr_cached_objects(sb, sc->nid); - if (fs_objects && sb->s_op->free_cached_objects) { - sb->s_op->free_cached_objects(sb, fs_objects); - fs_objects = sb->s_op->nr_cached_objects(sb); - } - total_objects = sb->s_nr_dentry_unused + - sb->s_nr_inodes_unused + fs_objects; + inodes = list_lru_count_node(&sb->s_inode_lru, sc->nid); + dentries = list_lru_count_node(&sb->s_dentry_lru, sc->nid); + total_objects = dentries + inodes + fs_objects + 1; + + /* proportion the scan between the caches */ + dentries = mult_frac(sc->nr_to_scan, dentries, total_objects); + inodes = mult_frac(sc->nr_to_scan, inodes, total_objects); + + /* + * prune the dcache first as the icache is pinned by it, then + * prune the icache, followed by the filesystem specific caches + */ + freed = prune_dcache_sb(sb, dentries, sc->nid); + freed += prune_icache_sb(sb, inodes, sc->nid); + + if (fs_objects) { + fs_objects = mult_frac(sc->nr_to_scan, fs_objects, + total_objects); + freed += sb->s_op->free_cached_objects(sb, fs_objects, + sc->nid); } - total_objects = (total_objects / 100) * sysctl_vfs_cache_pressure; + drop_super(sb); + return freed; +} + +static unsigned long super_cache_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct super_block *sb; + long total_objects = 0; + + sb = container_of(shrink, struct super_block, s_shrink); + + if (!grab_super_passive(sb)) + return 0; + + if (sb->s_op && sb->s_op->nr_cached_objects) + total_objects = sb->s_op->nr_cached_objects(sb, + sc->nid); + + total_objects += list_lru_count_node(&sb->s_dentry_lru, + sc->nid); + total_objects += list_lru_count_node(&sb->s_inode_lru, + sc->nid); + + total_objects = vfs_pressure_ratio(total_objects); drop_super(sb); return total_objects; } @@ -175,9 +195,12 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); - INIT_LIST_HEAD(&s->s_dentry_lru); - INIT_LIST_HEAD(&s->s_inode_lru); - spin_lock_init(&s->s_inode_lru_lock); + + if (list_lru_init(&s->s_dentry_lru)) + goto err_out; + if (list_lru_init(&s->s_inode_lru)) + goto err_out_dentry_lru; + INIT_LIST_HEAD(&s->s_mounts); init_rwsem(&s->s_umount); lockdep_set_class(&s->s_umount, &type->s_umount_key); @@ -210,11 +233,16 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) s->cleancache_poolid = -1; s->s_shrink.seeks = DEFAULT_SEEKS; - s->s_shrink.shrink = prune_super; + s->s_shrink.scan_objects = super_cache_scan; + s->s_shrink.count_objects = super_cache_count; s->s_shrink.batch = 1024; + s->s_shrink.flags = SHRINKER_NUMA_AWARE; } out: return s; + +err_out_dentry_lru: + list_lru_destroy(&s->s_dentry_lru); err_out: security_sb_free(s); #ifdef CONFIG_SMP @@ -295,6 +323,9 @@ void deactivate_locked_super(struct super_block *s) /* caches are now gone, we can safely kill the shrinker now */ unregister_shrinker(&s->s_shrink); + list_lru_destroy(&s->s_dentry_lru); + list_lru_destroy(&s->s_inode_lru); + put_filesystem(fs); put_super(s); } else { diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index c1a591a4725b..66bc316927e8 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -469,7 +469,7 @@ static void sysv_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) { - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); sysv_truncate(inode); } } diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 7f60e900edff..6e025e02ffde 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2587,10 +2587,11 @@ int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf, return -EROFS; failing = power_cut_emulated(c, lnum, 1); - if (failing) + if (failing) { len = corrupt_data(c, buf, len); - ubifs_warn("actually write %d bytes to LEB %d:%d (the buffer was corrupted)", - len, lnum, offs); + ubifs_warn("actually write %d bytes to LEB %d:%d (the buffer was corrupted)", + len, lnum, offs); + } err = ubi_leb_write(c->ubi, lnum, buf, offs, len); if (err) return err; diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c index 9e1d05666fed..f35135e28e96 100644 --- a/fs/ubifs/shrinker.c +++ b/fs/ubifs/shrinker.c @@ -277,18 +277,25 @@ static int kick_a_thread(void) return 0; } -int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc) +unsigned long ubifs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) { - int nr = sc->nr_to_scan; - int freed, contention = 0; long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); - if (nr == 0) - /* - * Due to the way UBIFS updates the clean znode counter it may - * temporarily be negative. - */ - return clean_zn_cnt >= 0 ? clean_zn_cnt : 1; + /* + * Due to the way UBIFS updates the clean znode counter it may + * temporarily be negative. + */ + return clean_zn_cnt >= 0 ? clean_zn_cnt : 1; +} + +unsigned long ubifs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + unsigned long nr = sc->nr_to_scan; + int contention = 0; + unsigned long freed; + long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); if (!clean_zn_cnt) { /* @@ -316,10 +323,10 @@ int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc) if (!freed && contention) { dbg_tnc("freed nothing, but contention"); - return -1; + return SHRINK_STOP; } out: - dbg_tnc("%d znodes were freed, requested %d", freed, nr); + dbg_tnc("%lu znodes were freed, requested %lu", freed, nr); return freed; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 879b9976c12b..3e4aa7281e04 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -49,7 +49,8 @@ struct kmem_cache *ubifs_inode_slab; /* UBIFS TNC shrinker description */ static struct shrinker ubifs_shrinker_info = { - .shrink = ubifs_shrinker, + .scan_objects = ubifs_shrink_scan, + .count_objects = ubifs_shrink_count, .seeks = DEFAULT_SEEKS, }; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index b2babce4d70f..e8c8cfe1435c 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1624,7 +1624,10 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot); int ubifs_tnc_end_commit(struct ubifs_info *c); /* shrinker.c */ -int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc); +unsigned long ubifs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc); +unsigned long ubifs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc); /* commit.c */ int ubifs_bg_thread(void *info); diff --git a/fs/udf/file.c b/fs/udf/file.c index 29569dd08168..c02a27a19c6d 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -141,7 +141,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); int err, pos; - size_t count = iocb->ki_left; + size_t count = iocb->ki_nbytes; struct udf_inode_info *iinfo = UDF_I(inode); down_write(&iinfo->i_data_sem); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index b6d15d349810..062b7925bca0 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -172,7 +172,7 @@ static void udf_write_failed(struct address_space *mapping, loff_t to) loff_t isize = inode->i_size; if (to > isize) { - truncate_pagecache(inode, to, isize); + truncate_pagecache(inode, isize); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { down_write(&iinfo->i_data_sem); udf_clear_extent_cache(inode); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index ff24e4449ece..c8ca96086784 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -531,7 +531,7 @@ static void ufs_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; if (to > inode->i_size) - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); } static int ufs_write_begin(struct file *file, struct address_space *mapping, diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 4a7286c1dc80..a02cfb9e3bce 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -27,8 +27,6 @@ /* * Greedy allocation. May fail and may return vmalloced memory. - * - * Must be freed using kmem_free_large. */ void * kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize) @@ -36,7 +34,7 @@ kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize) void *ptr; size_t kmsize = maxsize; - while (!(ptr = kmem_zalloc_large(kmsize))) { + while (!(ptr = vzalloc(kmsize))) { if ((kmsize >>= 1) <= minsize) kmsize = minsize; } @@ -75,6 +73,17 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags) return ptr; } +void * +kmem_zalloc_large(size_t size, xfs_km_flags_t flags) +{ + void *ptr; + + ptr = kmem_zalloc(size, flags | KM_MAYFAIL); + if (ptr) + return ptr; + return vzalloc(size); +} + void kmem_free(const void *ptr) { diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index b2f2620f9a87..3a7371cab508 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -57,17 +57,10 @@ kmem_flags_convert(xfs_km_flags_t flags) extern void *kmem_alloc(size_t, xfs_km_flags_t); extern void *kmem_zalloc(size_t, xfs_km_flags_t); +extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t); extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t); extern void kmem_free(const void *); -static inline void *kmem_zalloc_large(size_t size) -{ - return vzalloc(size); -} -static inline void kmem_free_large(void *ptr) -{ - vfree(ptr); -} extern void *kmem_zalloc_greedy(size_t *, size_t, size_t); diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 69518960b2ba..0e2f37efedd0 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -152,7 +152,7 @@ xfs_get_acl(struct inode *inode, int type) * go out to the disk. */ len = XFS_ACL_MAX_SIZE(ip->i_mount); - xfs_acl = kzalloc(len, GFP_KERNEL); + xfs_acl = kmem_zalloc_large(len, KM_SLEEP); if (!xfs_acl) return ERR_PTR(-ENOMEM); @@ -175,10 +175,10 @@ xfs_get_acl(struct inode *inode, int type) if (IS_ERR(acl)) goto out; - out_update_cache: +out_update_cache: set_cached_acl(inode, type, acl); - out: - kfree(xfs_acl); +out: + kmem_free(xfs_acl); return acl; } @@ -209,7 +209,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) struct xfs_acl *xfs_acl; int len = XFS_ACL_MAX_SIZE(ip->i_mount); - xfs_acl = kzalloc(len, GFP_KERNEL); + xfs_acl = kmem_zalloc_large(len, KM_SLEEP); if (!xfs_acl) return -ENOMEM; @@ -222,7 +222,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl, len, ATTR_ROOT); - kfree(xfs_acl); + kmem_free(xfs_acl); } else { /* * A NULL ACL argument means we want to remove the ACL. diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 977da0ec6604..e51e581454e9 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1582,7 +1582,7 @@ xfs_vm_write_begin( unlock_page(page); if (pos + len > i_size_read(inode)) - truncate_pagecache(inode, pos + len, i_size_read(inode)); + truncate_pagecache(inode, i_size_read(inode)); page_cache_release(page); page = NULL; @@ -1618,7 +1618,7 @@ xfs_vm_write_end( loff_t to = pos + len; if (to > isize) { - truncate_pagecache(inode, to, isize); + truncate_pagecache(inode, isize); xfs_vm_kill_delalloc_range(inode, isize, to); } } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 92b830901d60..f47e65c30be6 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4450,7 +4450,7 @@ xfs_bmapi_write( { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; - struct xfs_bmalloca bma = { 0 }; /* args for xfs_bmap_alloc */ + struct xfs_bmalloca bma = { NULL }; /* args for xfs_bmap_alloc */ xfs_fileoff_t end; /* end of mapped file region */ int eof; /* after the end of extents */ int error; /* error return */ diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index cf3bc76710c3..bb8de8e399c4 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -925,3 +925,47 @@ xfs_bmdr_maxrecs( return blocklen / sizeof(xfs_bmdr_rec_t); return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t)); } + +/* + * Change the owner of a btree format fork fo the inode passed in. Change it to + * the owner of that is passed in so that we can change owners before or after + * we switch forks between inodes. The operation that the caller is doing will + * determine whether is needs to change owner before or after the switch. + * + * For demand paged transactional modification, the fork switch should be done + * after reading in all the blocks, modifying them and pinning them in the + * transaction. For modification when the buffers are already pinned in memory, + * the fork switch can be done before changing the owner as we won't need to + * validate the owner until the btree buffers are unpinned and writes can occur + * again. + * + * For recovery based ownership change, there is no transactional context and + * so a buffer list must be supplied so that we can record the buffers that we + * modified for the caller to issue IO on. + */ +int +xfs_bmbt_change_owner( + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork, + xfs_ino_t new_owner, + struct list_head *buffer_list) +{ + struct xfs_btree_cur *cur; + int error; + + ASSERT(tp || buffer_list); + ASSERT(!(tp && buffer_list)); + if (whichfork == XFS_DATA_FORK) + ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_BTREE); + else + ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE); + + cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); + if (!cur) + return ENOMEM; + + error = xfs_btree_change_owner(cur, new_owner, buffer_list); + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + return error; +} diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 1b726d626941..e367461a638e 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -236,6 +236,10 @@ extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level); extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf); extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); +extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, xfs_ino_t new_owner, + struct list_head *buffer_list); + extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 541d59f5e658..97f952caea74 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -612,13 +612,9 @@ xfs_getbmap( if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx)) return XFS_ERROR(ENOMEM); - out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL); - if (!out) { - out = kmem_zalloc_large(bmv->bmv_count * - sizeof(struct getbmapx)); - if (!out) - return XFS_ERROR(ENOMEM); - } + out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0); + if (!out) + return XFS_ERROR(ENOMEM); xfs_ilock(ip, XFS_IOLOCK_SHARED); if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { @@ -754,10 +750,7 @@ xfs_getbmap( break; } - if (is_vmalloc_addr(out)) - kmem_free_large(out); - else - kmem_free(out); + kmem_free(out); return error; } @@ -1789,14 +1782,6 @@ xfs_swap_extents( int taforkblks = 0; __uint64_t tmp; - /* - * We have no way of updating owner information in the BMBT blocks for - * each inode on CRC enabled filesystems, so to avoid corrupting the - * this metadata we simply don't allow extent swaps to occur. - */ - if (xfs_sb_version_hascrc(&mp->m_sb)) - return XFS_ERROR(EINVAL); - tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); if (!tempifp) { error = XFS_ERROR(ENOMEM); @@ -1920,6 +1905,42 @@ xfs_swap_extents( goto out_trans_cancel; } + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + + /* + * Before we've swapped the forks, lets set the owners of the forks + * appropriately. We have to do this as we are demand paging the btree + * buffers, and so the validation done on read will expect the owner + * field to be correctly set. Once we change the owners, we can swap the + * inode forks. + * + * Note the trickiness in setting the log flags - we set the owner log + * flag on the opposite inode (i.e. the inode we are setting the new + * owner to be) because once we swap the forks and log that, log + * recovery is going to see the fork as owned by the swapped inode, + * not the pre-swapped inodes. + */ + src_log_flags = XFS_ILOG_CORE; + target_log_flags = XFS_ILOG_CORE; + if (ip->i_d.di_version == 3 && + ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + target_log_flags |= XFS_ILOG_DOWNER; + error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, + tip->i_ino, NULL); + if (error) + goto out_trans_cancel; + } + + if (tip->i_d.di_version == 3 && + tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + src_log_flags |= XFS_ILOG_DOWNER; + error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK, + ip->i_ino, NULL); + if (error) + goto out_trans_cancel; + } + /* * Swap the data forks of the inodes */ @@ -1957,7 +1978,6 @@ xfs_swap_extents( tip->i_delayed_blks = ip->i_delayed_blks; ip->i_delayed_blks = 0; - src_log_flags = XFS_ILOG_CORE; switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: /* If the extents fit in the inode, fix the @@ -1971,11 +1991,12 @@ xfs_swap_extents( src_log_flags |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: + ASSERT(ip->i_d.di_version < 3 || + (src_log_flags & XFS_ILOG_DOWNER)); src_log_flags |= XFS_ILOG_DBROOT; break; } - target_log_flags = XFS_ILOG_CORE; switch (tip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: /* If the extents fit in the inode, fix the @@ -1990,13 +2011,11 @@ xfs_swap_extents( break; case XFS_DINODE_FMT_BTREE: target_log_flags |= XFS_ILOG_DBROOT; + ASSERT(tip->i_d.di_version < 3 || + (target_log_flags & XFS_ILOG_DOWNER)); break; } - - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_log_inode(tp, ip, src_log_flags); xfs_trans_log_inode(tp, tip, target_log_flags); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 7a2b4da3c0db..5690e102243d 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -855,6 +855,41 @@ xfs_btree_readahead( return xfs_btree_readahead_sblock(cur, lr, block); } +STATIC xfs_daddr_t +xfs_btree_ptr_to_daddr( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO)); + + return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); + } else { + ASSERT(cur->bc_private.a.agno != NULLAGNUMBER); + ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK)); + + return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno, + be32_to_cpu(ptr->s)); + } +} + +/* + * Readahead @count btree blocks at the given @ptr location. + * + * We don't need to care about long or short form btrees here as we have a + * method of converting the ptr directly to a daddr available to us. + */ +STATIC void +xfs_btree_readahead_ptr( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + xfs_extlen_t count) +{ + xfs_buf_readahead(cur->bc_mp->m_ddev_targp, + xfs_btree_ptr_to_daddr(cur, ptr), + cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops); +} + /* * Set the buffer for level "lev" in the cursor to bp, releasing * any previous buffer. @@ -1073,24 +1108,6 @@ xfs_btree_buf_to_ptr( } } -STATIC xfs_daddr_t -xfs_btree_ptr_to_daddr( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr) -{ - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { - ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO)); - - return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); - } else { - ASSERT(cur->bc_private.a.agno != NULLAGNUMBER); - ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK)); - - return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno, - be32_to_cpu(ptr->s)); - } -} - STATIC void xfs_btree_set_refs( struct xfs_btree_cur *cur, @@ -3869,3 +3886,120 @@ xfs_btree_get_rec( *stat = 1; return 0; } + +/* + * Change the owner of a btree. + * + * The mechanism we use here is ordered buffer logging. Because we don't know + * how many buffers were are going to need to modify, we don't really want to + * have to make transaction reservations for the worst case of every buffer in a + * full size btree as that may be more space that we can fit in the log.... + * + * We do the btree walk in the most optimal manner possible - we have sibling + * pointers so we can just walk all the blocks on each level from left to right + * in a single pass, and then move to the next level and do the same. We can + * also do readahead on the sibling pointers to get IO moving more quickly, + * though for slow disks this is unlikely to make much difference to performance + * as the amount of CPU work we have to do before moving to the next block is + * relatively small. + * + * For each btree block that we load, modify the owner appropriately, set the + * buffer as an ordered buffer and log it appropriately. We need to ensure that + * we mark the region we change dirty so that if the buffer is relogged in + * a subsequent transaction the changes we make here as an ordered buffer are + * correctly relogged in that transaction. If we are in recovery context, then + * just queue the modified buffer as delayed write buffer so the transaction + * recovery completion writes the changes to disk. + */ +static int +xfs_btree_block_change_owner( + struct xfs_btree_cur *cur, + int level, + __uint64_t new_owner, + struct list_head *buffer_list) +{ + struct xfs_btree_block *block; + struct xfs_buf *bp; + union xfs_btree_ptr rptr; + + /* do right sibling readahead */ + xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); + + /* modify the owner */ + block = xfs_btree_get_block(cur, level, &bp); + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + block->bb_u.l.bb_owner = cpu_to_be64(new_owner); + else + block->bb_u.s.bb_owner = cpu_to_be32(new_owner); + + /* + * If the block is a root block hosted in an inode, we might not have a + * buffer pointer here and we shouldn't attempt to log the change as the + * information is already held in the inode and discarded when the root + * block is formatted into the on-disk inode fork. We still change it, + * though, so everything is consistent in memory. + */ + if (bp) { + if (cur->bc_tp) { + xfs_trans_ordered_buf(cur->bc_tp, bp); + xfs_btree_log_block(cur, bp, XFS_BB_OWNER); + } else { + xfs_buf_delwri_queue(bp, buffer_list); + } + } else { + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(level == cur->bc_nlevels - 1); + } + + /* now read rh sibling block for next iteration */ + xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); + if (xfs_btree_ptr_is_null(cur, &rptr)) + return ENOENT; + + return xfs_btree_lookup_get_block(cur, level, &rptr, &block); +} + +int +xfs_btree_change_owner( + struct xfs_btree_cur *cur, + __uint64_t new_owner, + struct list_head *buffer_list) +{ + union xfs_btree_ptr lptr; + int level; + struct xfs_btree_block *block = NULL; + int error = 0; + + cur->bc_ops->init_ptr_from_cur(cur, &lptr); + + /* for each level */ + for (level = cur->bc_nlevels - 1; level >= 0; level--) { + /* grab the left hand block */ + error = xfs_btree_lookup_get_block(cur, level, &lptr, &block); + if (error) + return error; + + /* readahead the left most block for the next level down */ + if (level > 0) { + union xfs_btree_ptr *ptr; + + ptr = xfs_btree_ptr_addr(cur, 1, block); + xfs_btree_readahead_ptr(cur, ptr, 1); + + /* save for the next iteration of the loop */ + lptr = *ptr; + } + + /* for each buffer in the level */ + do { + error = xfs_btree_block_change_owner(cur, level, + new_owner, + buffer_list); + } while (!error); + + if (error != ENOENT) + return error; + } + + return 0; +} diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index c8473c7ef45e..06729b67ad58 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -121,15 +121,18 @@ union xfs_btree_rec { /* * For logging record fields. */ -#define XFS_BB_MAGIC 0x01 -#define XFS_BB_LEVEL 0x02 -#define XFS_BB_NUMRECS 0x04 -#define XFS_BB_LEFTSIB 0x08 -#define XFS_BB_RIGHTSIB 0x10 -#define XFS_BB_BLKNO 0x20 +#define XFS_BB_MAGIC (1 << 0) +#define XFS_BB_LEVEL (1 << 1) +#define XFS_BB_NUMRECS (1 << 2) +#define XFS_BB_LEFTSIB (1 << 3) +#define XFS_BB_RIGHTSIB (1 << 4) +#define XFS_BB_BLKNO (1 << 5) +#define XFS_BB_LSN (1 << 6) +#define XFS_BB_UUID (1 << 7) +#define XFS_BB_OWNER (1 << 8) #define XFS_BB_NUM_BITS 5 #define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) -#define XFS_BB_NUM_BITS_CRC 8 +#define XFS_BB_NUM_BITS_CRC 9 #define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1) /* @@ -442,6 +445,8 @@ int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *); int xfs_btree_insert(struct xfs_btree_cur *, int *); int xfs_btree_delete(struct xfs_btree_cur *, int *); int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); +int xfs_btree_change_owner(struct xfs_btree_cur *cur, __uint64_t new_owner, + struct list_head *buffer_list); /* * btree block CRC helpers diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index c06823fe10d3..263470075ea2 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -81,54 +81,6 @@ xfs_buf_vmap_len( } /* - * xfs_buf_lru_add - add a buffer to the LRU. - * - * The LRU takes a new reference to the buffer so that it will only be freed - * once the shrinker takes the buffer off the LRU. - */ -STATIC void -xfs_buf_lru_add( - struct xfs_buf *bp) -{ - struct xfs_buftarg *btp = bp->b_target; - - spin_lock(&btp->bt_lru_lock); - if (list_empty(&bp->b_lru)) { - atomic_inc(&bp->b_hold); - list_add_tail(&bp->b_lru, &btp->bt_lru); - btp->bt_lru_nr++; - bp->b_lru_flags &= ~_XBF_LRU_DISPOSE; - } - spin_unlock(&btp->bt_lru_lock); -} - -/* - * xfs_buf_lru_del - remove a buffer from the LRU - * - * The unlocked check is safe here because it only occurs when there are not - * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there - * to optimise the shrinker removing the buffer from the LRU and calling - * xfs_buf_free(). i.e. it removes an unnecessary round trip on the - * bt_lru_lock. - */ -STATIC void -xfs_buf_lru_del( - struct xfs_buf *bp) -{ - struct xfs_buftarg *btp = bp->b_target; - - if (list_empty(&bp->b_lru)) - return; - - spin_lock(&btp->bt_lru_lock); - if (!list_empty(&bp->b_lru)) { - list_del_init(&bp->b_lru); - btp->bt_lru_nr--; - } - spin_unlock(&btp->bt_lru_lock); -} - -/* * When we mark a buffer stale, we remove the buffer from the LRU and clear the * b_lru_ref count so that the buffer is freed immediately when the buffer * reference count falls to zero. If the buffer is already on the LRU, we need @@ -151,20 +103,14 @@ xfs_buf_stale( */ bp->b_flags &= ~_XBF_DELWRI_Q; - atomic_set(&(bp)->b_lru_ref, 0); - if (!list_empty(&bp->b_lru)) { - struct xfs_buftarg *btp = bp->b_target; + spin_lock(&bp->b_lock); + atomic_set(&bp->b_lru_ref, 0); + if (!(bp->b_state & XFS_BSTATE_DISPOSE) && + (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru))) + atomic_dec(&bp->b_hold); - spin_lock(&btp->bt_lru_lock); - if (!list_empty(&bp->b_lru) && - !(bp->b_lru_flags & _XBF_LRU_DISPOSE)) { - list_del_init(&bp->b_lru); - btp->bt_lru_nr--; - atomic_dec(&bp->b_hold); - } - spin_unlock(&btp->bt_lru_lock); - } ASSERT(atomic_read(&bp->b_hold) >= 1); + spin_unlock(&bp->b_lock); } static int @@ -228,6 +174,7 @@ _xfs_buf_alloc( INIT_LIST_HEAD(&bp->b_list); RB_CLEAR_NODE(&bp->b_rbnode); sema_init(&bp->b_sema, 0); /* held, no waiters */ + spin_lock_init(&bp->b_lock); XB_SET_OWNER(bp); bp->b_target = target; bp->b_flags = flags; @@ -917,12 +864,33 @@ xfs_buf_rele( ASSERT(atomic_read(&bp->b_hold) > 0); if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { - if (!(bp->b_flags & XBF_STALE) && - atomic_read(&bp->b_lru_ref)) { - xfs_buf_lru_add(bp); + spin_lock(&bp->b_lock); + if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { + /* + * If the buffer is added to the LRU take a new + * reference to the buffer for the LRU and clear the + * (now stale) dispose list state flag + */ + if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) { + bp->b_state &= ~XFS_BSTATE_DISPOSE; + atomic_inc(&bp->b_hold); + } + spin_unlock(&bp->b_lock); spin_unlock(&pag->pag_buf_lock); } else { - xfs_buf_lru_del(bp); + /* + * most of the time buffers will already be removed from + * the LRU, so optimise that case by checking for the + * XFS_BSTATE_DISPOSE flag indicating the last list the + * buffer was on was the disposal list + */ + if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { + list_lru_del(&bp->b_target->bt_lru, &bp->b_lru); + } else { + ASSERT(list_empty(&bp->b_lru)); + } + spin_unlock(&bp->b_lock); + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); spin_unlock(&pag->pag_buf_lock); @@ -1502,83 +1470,121 @@ xfs_buf_iomove( * returned. These buffers will have an elevated hold count, so wait on those * while freeing all the buffers only held by the LRU. */ +static enum lru_status +xfs_buftarg_wait_rele( + struct list_head *item, + spinlock_t *lru_lock, + void *arg) + +{ + struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); + struct list_head *dispose = arg; + + if (atomic_read(&bp->b_hold) > 1) { + /* need to wait, so skip it this pass */ + trace_xfs_buf_wait_buftarg(bp, _RET_IP_); + return LRU_SKIP; + } + if (!spin_trylock(&bp->b_lock)) + return LRU_SKIP; + + /* + * clear the LRU reference count so the buffer doesn't get + * ignored in xfs_buf_rele(). + */ + atomic_set(&bp->b_lru_ref, 0); + bp->b_state |= XFS_BSTATE_DISPOSE; + list_move(item, dispose); + spin_unlock(&bp->b_lock); + return LRU_REMOVED; +} + void xfs_wait_buftarg( struct xfs_buftarg *btp) { - struct xfs_buf *bp; + LIST_HEAD(dispose); + int loop = 0; -restart: - spin_lock(&btp->bt_lru_lock); - while (!list_empty(&btp->bt_lru)) { - bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); - if (atomic_read(&bp->b_hold) > 1) { - trace_xfs_buf_wait_buftarg(bp, _RET_IP_); - list_move_tail(&bp->b_lru, &btp->bt_lru); - spin_unlock(&btp->bt_lru_lock); - delay(100); - goto restart; + /* loop until there is nothing left on the lru list. */ + while (list_lru_count(&btp->bt_lru)) { + list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele, + &dispose, LONG_MAX); + + while (!list_empty(&dispose)) { + struct xfs_buf *bp; + bp = list_first_entry(&dispose, struct xfs_buf, b_lru); + list_del_init(&bp->b_lru); + xfs_buf_rele(bp); } - /* - * clear the LRU reference count so the buffer doesn't get - * ignored in xfs_buf_rele(). - */ - atomic_set(&bp->b_lru_ref, 0); - spin_unlock(&btp->bt_lru_lock); - xfs_buf_rele(bp); - spin_lock(&btp->bt_lru_lock); + if (loop++ != 0) + delay(100); } - spin_unlock(&btp->bt_lru_lock); } -int -xfs_buftarg_shrink( +static enum lru_status +xfs_buftarg_isolate( + struct list_head *item, + spinlock_t *lru_lock, + void *arg) +{ + struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); + struct list_head *dispose = arg; + + /* + * we are inverting the lru lock/bp->b_lock here, so use a trylock. + * If we fail to get the lock, just skip it. + */ + if (!spin_trylock(&bp->b_lock)) + return LRU_SKIP; + /* + * Decrement the b_lru_ref count unless the value is already + * zero. If the value is already zero, we need to reclaim the + * buffer, otherwise it gets another trip through the LRU. + */ + if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) { + spin_unlock(&bp->b_lock); + return LRU_ROTATE; + } + + bp->b_state |= XFS_BSTATE_DISPOSE; + list_move(item, dispose); + spin_unlock(&bp->b_lock); + return LRU_REMOVED; +} + +static unsigned long +xfs_buftarg_shrink_scan( struct shrinker *shrink, struct shrink_control *sc) { struct xfs_buftarg *btp = container_of(shrink, struct xfs_buftarg, bt_shrinker); - struct xfs_buf *bp; - int nr_to_scan = sc->nr_to_scan; LIST_HEAD(dispose); + unsigned long freed; + unsigned long nr_to_scan = sc->nr_to_scan; - if (!nr_to_scan) - return btp->bt_lru_nr; - - spin_lock(&btp->bt_lru_lock); - while (!list_empty(&btp->bt_lru)) { - if (nr_to_scan-- <= 0) - break; - - bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); - - /* - * Decrement the b_lru_ref count unless the value is already - * zero. If the value is already zero, we need to reclaim the - * buffer, otherwise it gets another trip through the LRU. - */ - if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) { - list_move_tail(&bp->b_lru, &btp->bt_lru); - continue; - } - - /* - * remove the buffer from the LRU now to avoid needing another - * lock round trip inside xfs_buf_rele(). - */ - list_move(&bp->b_lru, &dispose); - btp->bt_lru_nr--; - bp->b_lru_flags |= _XBF_LRU_DISPOSE; - } - spin_unlock(&btp->bt_lru_lock); + freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate, + &dispose, &nr_to_scan); while (!list_empty(&dispose)) { + struct xfs_buf *bp; bp = list_first_entry(&dispose, struct xfs_buf, b_lru); list_del_init(&bp->b_lru); xfs_buf_rele(bp); } - return btp->bt_lru_nr; + return freed; +} + +static unsigned long +xfs_buftarg_shrink_count( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_buftarg *btp = container_of(shrink, + struct xfs_buftarg, bt_shrinker); + return list_lru_count_node(&btp->bt_lru, sc->nid); } void @@ -1587,6 +1593,7 @@ xfs_free_buftarg( struct xfs_buftarg *btp) { unregister_shrinker(&btp->bt_shrinker); + list_lru_destroy(&btp->bt_lru); if (mp->m_flags & XFS_MOUNT_BARRIER) xfs_blkdev_issue_flush(btp); @@ -1660,12 +1667,16 @@ xfs_alloc_buftarg( if (!btp->bt_bdi) goto error; - INIT_LIST_HEAD(&btp->bt_lru); - spin_lock_init(&btp->bt_lru_lock); if (xfs_setsize_buftarg_early(btp, bdev)) goto error; - btp->bt_shrinker.shrink = xfs_buftarg_shrink; + + if (list_lru_init(&btp->bt_lru)) + goto error; + + btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; + btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; btp->bt_shrinker.seeks = DEFAULT_SEEKS; + btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE; register_shrinker(&btp->bt_shrinker); return btp; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 433a12ed7b17..e65683361017 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -25,6 +25,7 @@ #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/uio.h> +#include <linux/list_lru.h> /* * Base types @@ -59,7 +60,6 @@ typedef enum { #define _XBF_KMEM (1 << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ #define _XBF_COMPOUND (1 << 23)/* compound buffer */ -#define _XBF_LRU_DISPOSE (1 << 24)/* buffer being discarded */ typedef unsigned int xfs_buf_flags_t; @@ -78,8 +78,12 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ - { _XBF_COMPOUND, "COMPOUND" }, \ - { _XBF_LRU_DISPOSE, "LRU_DISPOSE" } + { _XBF_COMPOUND, "COMPOUND" } + +/* + * Internal state flags. + */ +#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ typedef struct xfs_buftarg { dev_t bt_dev; @@ -92,9 +96,7 @@ typedef struct xfs_buftarg { /* LRU control structures */ struct shrinker bt_shrinker; - struct list_head bt_lru; - spinlock_t bt_lru_lock; - unsigned int bt_lru_nr; + struct list_lru bt_lru; } xfs_buftarg_t; struct xfs_buf; @@ -137,7 +139,8 @@ typedef struct xfs_buf { * bt_lru_lock and not by b_sema */ struct list_head b_lru; /* lru list */ - xfs_buf_flags_t b_lru_flags; /* internal lru status flags */ + spinlock_t b_lock; /* internal state lock */ + unsigned int b_state; /* internal state flags */ wait_queue_head_t b_waiters; /* unpin waiters */ struct list_head b_list; struct xfs_perag *b_pag; /* contains rbtree root */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 3a944b198e35..88c5ea75ebf6 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -613,13 +613,27 @@ xfs_buf_item_unlock( } } } - if (clean || aborted) { - if (atomic_dec_and_test(&bip->bli_refcount)) { - ASSERT(!aborted || XFS_FORCED_SHUTDOWN(lip->li_mountp)); + + /* + * Clean buffers, by definition, cannot be in the AIL. However, aborted + * buffers may be dirty and hence in the AIL. Therefore if we are + * aborting a buffer and we've just taken the last refernce away, we + * have to check if it is in the AIL before freeing it. We need to free + * it in this case, because an aborted transaction has already shut the + * filesystem down and this is the last chance we will have to do so. + */ + if (atomic_dec_and_test(&bip->bli_refcount)) { + if (clean) + xfs_buf_item_relse(bp); + else if (aborted) { + ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); + if (lip->li_flags & XFS_LI_IN_AIL) { + xfs_trans_ail_delete(lip->li_ailp, lip, + SHUTDOWN_LOG_IO_ERROR); + } xfs_buf_item_relse(bp); } - } else - atomic_dec(&bip->bli_refcount); + } if (!(flags & XFS_BLI_HOLD)) xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index d4e59a4ff59f..069537c845e5 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -635,6 +635,7 @@ xfs_da3_root_split( xfs_trans_log_buf(tp, bp, 0, size - 1); bp->b_ops = blk1->bp->b_ops; + xfs_trans_buf_copy_type(bp, blk1->bp); blk1->bp = bp; blk1->blkno = blkno; diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 08984eeee159..1021c8356d08 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -180,6 +180,11 @@ xfs_dir3_leaf_check_int( return true; } +/* + * We verify the magic numbers before decoding the leaf header so that on debug + * kernels we don't get assertion failures in xfs_dir3_leaf_hdr_from_disk() due + * to incorrect magic numbers. + */ static bool xfs_dir3_leaf_verify( struct xfs_buf *bp, @@ -191,24 +196,25 @@ xfs_dir3_leaf_verify( ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC); - xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; + __uint16_t magic3; - if ((magic == XFS_DIR2_LEAF1_MAGIC && - leafhdr.magic != XFS_DIR3_LEAF1_MAGIC) || - (magic == XFS_DIR2_LEAFN_MAGIC && - leafhdr.magic != XFS_DIR3_LEAFN_MAGIC)) - return false; + magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC + : XFS_DIR3_LEAFN_MAGIC; + if (leaf3->info.hdr.magic != cpu_to_be16(magic3)) + return false; if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid)) return false; if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) return false; } else { - if (leafhdr.magic != magic) + if (leaf->hdr.info.magic != cpu_to_be16(magic)) return false; } + + xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf); } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 251c66632e5e..71520e6e5d65 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -940,13 +940,8 @@ xfs_qm_dqput_final( trace_xfs_dqput_free(dqp); - mutex_lock(&qi->qi_lru_lock); - if (list_empty(&dqp->q_lru)) { - list_add_tail(&dqp->q_lru, &qi->qi_lru_list); - qi->qi_lru_count++; + if (list_lru_add(&qi->qi_lru, &dqp->q_lru)) XFS_STATS_INC(xs_qm_dquot_unused); - } - mutex_unlock(&qi->qi_lru_lock); /* * If we just added a udquot to the freelist, then we want to release diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 60c6e1f12695..e838d84b4e85 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -142,7 +142,8 @@ xfs_qm_dqunpin_wait( STATIC uint xfs_qm_dquot_logitem_push( struct xfs_log_item *lip, - struct list_head *buffer_list) + struct list_head *buffer_list) __releases(&lip->li_ailp->xa_lock) + __acquires(&lip->li_ailp->xa_lock) { struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; struct xfs_buf *bp = NULL; diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 86f559f6e5d3..e43708e2f080 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -160,7 +160,8 @@ xfs_extent_busy_update_extent( struct xfs_extent_busy *busyp, xfs_agblock_t fbno, xfs_extlen_t flen, - bool userdata) + bool userdata) __releases(&pag->pagb_lock) + __acquires(&pag->pagb_lock) { xfs_agblock_t fend = fbno + flen; xfs_agblock_t bbno = busyp->bno; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 16219b9c6790..193206ba4358 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -48,7 +48,7 @@ STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, /* * Allocate and initialise an xfs_inode. */ -STATIC struct xfs_inode * +struct xfs_inode * xfs_inode_alloc( struct xfs_mount *mp, xfs_ino_t ino) @@ -98,7 +98,7 @@ xfs_inode_free_callback( kmem_zone_free(xfs_inode_zone, ip); } -STATIC void +void xfs_inode_free( struct xfs_inode *ip) { @@ -1167,7 +1167,7 @@ xfs_reclaim_inodes( * them to be cleaned, which we hope will not be very long due to the * background walker having already kicked the IO off on those dirty inodes. */ -void +long xfs_reclaim_inodes_nr( struct xfs_mount *mp, int nr_to_scan) @@ -1176,7 +1176,7 @@ xfs_reclaim_inodes_nr( xfs_reclaim_work_queue(mp); xfs_ail_push_all(mp->m_ail); - xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); + return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); } /* diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 8a89f7d791bd..9ed68bb750f5 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -42,11 +42,15 @@ struct xfs_eofblocks { int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, uint flags, uint lock_flags, xfs_inode_t **ipp); +/* recovery needs direct inode allocation capability */ +struct xfs_inode * xfs_inode_alloc(struct xfs_mount *mp, xfs_ino_t ino); +void xfs_inode_free(struct xfs_inode *ip); + void xfs_reclaim_worker(struct work_struct *work); int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); int xfs_reclaim_inodes_count(struct xfs_mount *mp); -void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); +long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/xfs_inode_buf.c index e011d597f12f..63382d37f565 100644 --- a/fs/xfs/xfs_inode_buf.c +++ b/fs/xfs/xfs_inode_buf.c @@ -53,9 +53,8 @@ xfs_inobp_check( i * mp->m_sb.sb_inodesize); if (!dip->di_next_unlinked) { xfs_alert(mp, - "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.", - bp); - ASSERT(dip->di_next_unlinked); + "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.", + i, (long long)bp->b_bn); } } } @@ -106,11 +105,10 @@ xfs_inode_buf_verify( XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH, mp, dip); #ifdef DEBUG - xfs_emerg(mp, + xfs_alert(mp, "bad inode magic/vsn daddr %lld #%d (magic=%x)", (unsigned long long)bp->b_bn, i, be16_to_cpu(dip->di_magic)); - ASSERT(0); #endif } } @@ -196,7 +194,7 @@ xfs_imap_to_bp( return 0; } -STATIC void +void xfs_dinode_from_disk( xfs_icdinode_t *to, xfs_dinode_t *from) diff --git a/fs/xfs/xfs_inode_buf.h b/fs/xfs/xfs_inode_buf.h index 599e6c0ca2a9..abba0ae8cf2d 100644 --- a/fs/xfs/xfs_inode_buf.h +++ b/fs/xfs/xfs_inode_buf.h @@ -32,17 +32,17 @@ struct xfs_imap { ushort im_boffset; /* inode offset in block in bytes */ }; -int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, - struct xfs_imap *, struct xfs_dinode **, - struct xfs_buf **, uint, uint); -int xfs_iread(struct xfs_mount *, struct xfs_trans *, - struct xfs_inode *, uint); -void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); -void xfs_dinode_to_disk(struct xfs_dinode *, - struct xfs_icdinode *); +int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, + struct xfs_imap *, struct xfs_dinode **, + struct xfs_buf **, uint, uint); +int xfs_iread(struct xfs_mount *, struct xfs_trans *, + struct xfs_inode *, uint); +void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); +void xfs_dinode_to_disk(struct xfs_dinode *to, struct xfs_icdinode *from); +void xfs_dinode_from_disk(struct xfs_icdinode *to, struct xfs_dinode *from); #if defined(DEBUG) -void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); +void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); #else #define xfs_inobp_check(mp, bp) #endif /* DEBUG */ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index bdebc21078d7..668e8f4ccf5e 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -71,7 +71,7 @@ xfs_find_handle( int hsize; xfs_handle_t handle; struct inode *inode; - struct fd f = {0}; + struct fd f = {NULL}; struct path path; int error; struct xfs_inode *ip; @@ -456,12 +456,9 @@ xfs_attrlist_by_handle( if (IS_ERR(dentry)) return PTR_ERR(dentry); - kbuf = kmem_zalloc(al_hreq.buflen, KM_SLEEP | KM_MAYFAIL); - if (!kbuf) { - kbuf = kmem_zalloc_large(al_hreq.buflen); - if (!kbuf) - goto out_dput; - } + kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP); + if (!kbuf) + goto out_dput; cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, @@ -472,12 +469,9 @@ xfs_attrlist_by_handle( if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen)) error = -EFAULT; - out_kfree: - if (is_vmalloc_addr(kbuf)) - kmem_free_large(kbuf); - else - kmem_free(kbuf); - out_dput: +out_kfree: + kmem_free(kbuf); +out_dput: dput(dentry); return error; } @@ -495,12 +489,9 @@ xfs_attrmulti_attr_get( if (*len > XATTR_SIZE_MAX) return EINVAL; - kbuf = kmem_zalloc(*len, KM_SLEEP | KM_MAYFAIL); - if (!kbuf) { - kbuf = kmem_zalloc_large(*len); - if (!kbuf) - return ENOMEM; - } + kbuf = kmem_zalloc_large(*len, KM_SLEEP); + if (!kbuf) + return ENOMEM; error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags); if (error) @@ -509,11 +500,8 @@ xfs_attrmulti_attr_get( if (copy_to_user(ubuf, kbuf, *len)) error = EFAULT; - out_kfree: - if (is_vmalloc_addr(kbuf)) - kmem_free_large(kbuf); - else - kmem_free(kbuf); +out_kfree: + kmem_free(kbuf); return error; } diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index d3ab9534307f..f671f7e472ac 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -371,12 +371,9 @@ xfs_compat_attrlist_by_handle( return PTR_ERR(dentry); error = -ENOMEM; - kbuf = kmem_zalloc(al_hreq.buflen, KM_SLEEP | KM_MAYFAIL); - if (!kbuf) { - kbuf = kmem_zalloc_large(al_hreq.buflen); - if (!kbuf) - goto out_dput; - } + kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP); + if (!kbuf) + goto out_dput; cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, @@ -387,12 +384,9 @@ xfs_compat_attrlist_by_handle( if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen)) error = -EFAULT; - out_kfree: - if (is_vmalloc_addr(kbuf)) - kmem_free_large(kbuf); - else - kmem_free(kbuf); - out_dput: +out_kfree: + kmem_free(kbuf); +out_dput: dput(dentry); return error; } diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index b93e14b86754..084b3e1741fd 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -495,7 +495,7 @@ xfs_bulkstat( /* * Done, we're either out of filesystem or space to put the data. */ - kmem_free_large(irbuf); + kmem_free(irbuf); *ubcountp = ubelem; /* * Found some inodes, return them now and return the error next time. @@ -541,8 +541,9 @@ xfs_bulkstat_single( * at the expense of the error case. */ - ino = (xfs_ino_t)*lastinop; - error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t), 0, &res); + ino = *lastinop; + error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t), + NULL, &res); if (error) { /* * Special case way failed, do it the "long" way diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 5372d58ef93a..a2dea108071a 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -257,7 +257,8 @@ xlog_grant_head_wait( struct xlog *log, struct xlog_grant_head *head, struct xlog_ticket *tic, - int need_bytes) + int need_bytes) __releases(&head->lock) + __acquires(&head->lock) { list_add_tail(&tic->t_queue, &head->waiters); diff --git a/fs/xfs/xfs_log_format.h b/fs/xfs/xfs_log_format.h index 31e3a06c4644..ca7e28a8ed31 100644 --- a/fs/xfs/xfs_log_format.h +++ b/fs/xfs/xfs_log_format.h @@ -474,6 +474,8 @@ typedef struct xfs_inode_log_format_64 { #define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */ #define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */ #define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */ +#define XFS_ILOG_DOWNER 0x200 /* change the data fork owner on replay */ +#define XFS_ILOG_AOWNER 0x400 /* change the attr fork owner on replay */ /* @@ -487,7 +489,8 @@ typedef struct xfs_inode_log_format_64 { #define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ XFS_ILOG_UUID | XFS_ILOG_ADATA | \ - XFS_ILOG_AEXT | XFS_ILOG_ABROOT) + XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \ + XFS_ILOG_DOWNER | XFS_ILOG_AOWNER) #define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ XFS_ILOG_DBROOT) @@ -499,7 +502,8 @@ typedef struct xfs_inode_log_format_64 { XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \ XFS_ILOG_DEV | XFS_ILOG_UUID | \ XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ - XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP) + XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP | \ + XFS_ILOG_DOWNER | XFS_ILOG_AOWNER) static inline int xfs_ilog_fbroot(int w) { diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 7c0c1fdc728b..dabda9521b4b 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2014,7 +2014,7 @@ xlog_recover_get_buf_lsn( case XFS_ATTR3_RMT_MAGIC: return be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn); case XFS_SB_MAGIC: - return be64_to_cpu(((struct xfs_sb *)blk)->sb_lsn); + return be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); default: break; } @@ -2629,6 +2629,82 @@ out_release: return error; } +/* + * Inode fork owner changes + * + * If we have been told that we have to reparent the inode fork, it's because an + * extent swap operation on a CRC enabled filesystem has been done and we are + * replaying it. We need to walk the BMBT of the appropriate fork and change the + * owners of it. + * + * The complexity here is that we don't have an inode context to work with, so + * after we've replayed the inode we need to instantiate one. This is where the + * fun begins. + * + * We are in the middle of log recovery, so we can't run transactions. That + * means we cannot use cache coherent inode instantiation via xfs_iget(), as + * that will result in the corresponding iput() running the inode through + * xfs_inactive(). If we've just replayed an inode core that changes the link + * count to zero (i.e. it's been unlinked), then xfs_inactive() will run + * transactions (bad!). + * + * So, to avoid this, we instantiate an inode directly from the inode core we've + * just recovered. We have the buffer still locked, and all we really need to + * instantiate is the inode core and the forks being modified. We can do this + * manually, then run the inode btree owner change, and then tear down the + * xfs_inode without having to run any transactions at all. + * + * Also, because we don't have a transaction context available here but need to + * gather all the buffers we modify for writeback so we pass the buffer_list + * instead for the operation to use. + */ + +STATIC int +xfs_recover_inode_owner_change( + struct xfs_mount *mp, + struct xfs_dinode *dip, + struct xfs_inode_log_format *in_f, + struct list_head *buffer_list) +{ + struct xfs_inode *ip; + int error; + + ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)); + + ip = xfs_inode_alloc(mp, in_f->ilf_ino); + if (!ip) + return ENOMEM; + + /* instantiate the inode */ + xfs_dinode_from_disk(&ip->i_d, dip); + ASSERT(ip->i_d.di_version >= 3); + + error = xfs_iformat_fork(ip, dip); + if (error) + goto out_free_ip; + + + if (in_f->ilf_fields & XFS_ILOG_DOWNER) { + ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT); + error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK, + ip->i_ino, buffer_list); + if (error) + goto out_free_ip; + } + + if (in_f->ilf_fields & XFS_ILOG_AOWNER) { + ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT); + error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK, + ip->i_ino, buffer_list); + if (error) + goto out_free_ip; + } + +out_free_ip: + xfs_inode_free(ip); + return error; +} + STATIC int xlog_recover_inode_pass2( struct xlog *log, @@ -2681,8 +2757,7 @@ xlog_recover_inode_pass2( error = bp->b_error; if (error) { xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)"); - xfs_buf_relse(bp); - goto error; + goto out_release; } ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); @@ -2692,30 +2767,31 @@ xlog_recover_inode_pass2( * like an inode! */ if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) { - xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld", __func__, dip, bp, in_f->ilf_ino); XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", XFS_ERRLEVEL_LOW, mp); error = EFSCORRUPTED; - goto error; + goto out_release; } dicp = item->ri_buf[1].i_addr; if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { - xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad inode log record, rec ptr 0x%p, ino %Ld", __func__, item, in_f->ilf_ino); XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", XFS_ERRLEVEL_LOW, mp); error = EFSCORRUPTED; - goto error; + goto out_release; } /* * If the inode has an LSN in it, recover the inode only if it's less - * than the lsn of the transaction we are replaying. + * than the lsn of the transaction we are replaying. Note: we still + * need to replay an owner change even though the inode is more recent + * than the transaction as there is no guarantee that all the btree + * blocks are more recent than this transaction, too. */ if (dip->di_version >= 3) { xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn); @@ -2723,7 +2799,7 @@ xlog_recover_inode_pass2( if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { trace_xfs_log_recover_inode_skip(log, in_f); error = 0; - goto out_release; + goto out_owner_change; } } @@ -2745,10 +2821,9 @@ xlog_recover_inode_pass2( dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) { /* do nothing */ } else { - xfs_buf_relse(bp); trace_xfs_log_recover_inode_skip(log, in_f); error = 0; - goto error; + goto out_release; } } @@ -2760,13 +2835,12 @@ xlog_recover_inode_pass2( (dicp->di_format != XFS_DINODE_FMT_BTREE)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad regular inode log record, rec ptr 0x%p, " "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", __func__, item, dip, bp, in_f->ilf_ino); error = EFSCORRUPTED; - goto error; + goto out_release; } } else if (unlikely(S_ISDIR(dicp->di_mode))) { if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && @@ -2774,19 +2848,17 @@ xlog_recover_inode_pass2( (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad dir inode log record, rec ptr 0x%p, " "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", __func__, item, dip, bp, in_f->ilf_ino); error = EFSCORRUPTED; - goto error; + goto out_release; } } if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", @@ -2794,29 +2866,27 @@ xlog_recover_inode_pass2( dicp->di_nextents + dicp->di_anextents, dicp->di_nblocks); error = EFSCORRUPTED; - goto error; + goto out_release; } if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__, item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); error = EFSCORRUPTED; - goto error; + goto out_release; } isize = xfs_icdinode_size(dicp->di_version); if (unlikely(item->ri_buf[1].i_len > isize)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad inode log record length %d, rec ptr 0x%p", __func__, item->ri_buf[1].i_len, item); error = EFSCORRUPTED; - goto error; + goto out_release; } /* The core is in in-core format */ @@ -2842,7 +2912,7 @@ xlog_recover_inode_pass2( } if (in_f->ilf_size == 2) - goto write_inode_buffer; + goto out_owner_change; len = item->ri_buf[2].i_len; src = item->ri_buf[2].i_addr; ASSERT(in_f->ilf_size <= 4); @@ -2903,13 +2973,15 @@ xlog_recover_inode_pass2( default: xfs_warn(log->l_mp, "%s: Invalid flag", __func__); ASSERT(0); - xfs_buf_relse(bp); error = EIO; - goto error; + goto out_release; } } -write_inode_buffer: +out_owner_change: + if (in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) + error = xfs_recover_inode_owner_change(mp, dip, in_f, + buffer_list); /* re-generate the checksum. */ xfs_dinode_calc_crc(log->l_mp, dip); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 6218a0aeeeea..3e6c2e6c9cd2 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -51,8 +51,9 @@ */ STATIC int xfs_qm_init_quotainos(xfs_mount_t *); STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); -STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *); + +STATIC void xfs_qm_dqfree_one(struct xfs_dquot *dqp); /* * We use the batch lookup interface to iterate over the dquots as it * currently is the only interface into the radix tree code that allows @@ -203,12 +204,9 @@ xfs_qm_dqpurge( * We move dquots to the freelist as soon as their reference count * hits zero, so it really should be on the freelist here. */ - mutex_lock(&qi->qi_lru_lock); ASSERT(!list_empty(&dqp->q_lru)); - list_del_init(&dqp->q_lru); - qi->qi_lru_count--; + list_lru_del(&qi->qi_lru, &dqp->q_lru); XFS_STATS_DEC(xs_qm_dquot_unused); - mutex_unlock(&qi->qi_lru_lock); xfs_qm_dqdestroy(dqp); @@ -680,6 +678,143 @@ xfs_qm_calc_dquots_per_chunk( return ndquots; } +struct xfs_qm_isolate { + struct list_head buffers; + struct list_head dispose; +}; + +static enum lru_status +xfs_qm_dquot_isolate( + struct list_head *item, + spinlock_t *lru_lock, + void *arg) +{ + struct xfs_dquot *dqp = container_of(item, + struct xfs_dquot, q_lru); + struct xfs_qm_isolate *isol = arg; + + if (!xfs_dqlock_nowait(dqp)) + goto out_miss_busy; + + /* + * This dquot has acquired a reference in the meantime remove it from + * the freelist and try again. + */ + if (dqp->q_nrefs) { + xfs_dqunlock(dqp); + XFS_STATS_INC(xs_qm_dqwants); + + trace_xfs_dqreclaim_want(dqp); + list_del_init(&dqp->q_lru); + XFS_STATS_DEC(xs_qm_dquot_unused); + return LRU_REMOVED; + } + + /* + * If the dquot is dirty, flush it. If it's already being flushed, just + * skip it so there is time for the IO to complete before we try to + * reclaim it again on the next LRU pass. + */ + if (!xfs_dqflock_nowait(dqp)) { + xfs_dqunlock(dqp); + goto out_miss_busy; + } + + if (XFS_DQ_IS_DIRTY(dqp)) { + struct xfs_buf *bp = NULL; + int error; + + trace_xfs_dqreclaim_dirty(dqp); + + /* we have to drop the LRU lock to flush the dquot */ + spin_unlock(lru_lock); + + error = xfs_qm_dqflush(dqp, &bp); + if (error) { + xfs_warn(dqp->q_mount, "%s: dquot %p flush failed", + __func__, dqp); + goto out_unlock_dirty; + } + + xfs_buf_delwri_queue(bp, &isol->buffers); + xfs_buf_relse(bp); + goto out_unlock_dirty; + } + xfs_dqfunlock(dqp); + + /* + * Prevent lookups now that we are past the point of no return. + */ + dqp->dq_flags |= XFS_DQ_FREEING; + xfs_dqunlock(dqp); + + ASSERT(dqp->q_nrefs == 0); + list_move_tail(&dqp->q_lru, &isol->dispose); + XFS_STATS_DEC(xs_qm_dquot_unused); + trace_xfs_dqreclaim_done(dqp); + XFS_STATS_INC(xs_qm_dqreclaims); + return LRU_REMOVED; + +out_miss_busy: + trace_xfs_dqreclaim_busy(dqp); + XFS_STATS_INC(xs_qm_dqreclaim_misses); + return LRU_SKIP; + +out_unlock_dirty: + trace_xfs_dqreclaim_busy(dqp); + XFS_STATS_INC(xs_qm_dqreclaim_misses); + xfs_dqunlock(dqp); + spin_lock(lru_lock); + return LRU_RETRY; +} + +static unsigned long +xfs_qm_shrink_scan( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_quotainfo *qi = container_of(shrink, + struct xfs_quotainfo, qi_shrinker); + struct xfs_qm_isolate isol; + unsigned long freed; + int error; + unsigned long nr_to_scan = sc->nr_to_scan; + + if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) + return 0; + + INIT_LIST_HEAD(&isol.buffers); + INIT_LIST_HEAD(&isol.dispose); + + freed = list_lru_walk_node(&qi->qi_lru, sc->nid, xfs_qm_dquot_isolate, &isol, + &nr_to_scan); + + error = xfs_buf_delwri_submit(&isol.buffers); + if (error) + xfs_warn(NULL, "%s: dquot reclaim failed", __func__); + + while (!list_empty(&isol.dispose)) { + struct xfs_dquot *dqp; + + dqp = list_first_entry(&isol.dispose, struct xfs_dquot, q_lru); + list_del_init(&dqp->q_lru); + xfs_qm_dqfree_one(dqp); + } + + return freed; +} + +static unsigned long +xfs_qm_shrink_count( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_quotainfo *qi = container_of(shrink, + struct xfs_quotainfo, qi_shrinker); + + return list_lru_count_node(&qi->qi_lru, sc->nid); +} + /* * This initializes all the quota information that's kept in the * mount structure @@ -696,11 +831,18 @@ xfs_qm_init_quotainfo( qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); + if ((error = list_lru_init(&qinf->qi_lru))) { + kmem_free(qinf); + mp->m_quotainfo = NULL; + return error; + } + /* * See if quotainodes are setup, and if not, allocate them, * and change the superblock accordingly. */ if ((error = xfs_qm_init_quotainos(mp))) { + list_lru_destroy(&qinf->qi_lru); kmem_free(qinf); mp->m_quotainfo = NULL; return error; @@ -711,10 +853,6 @@ xfs_qm_init_quotainfo( INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_NOFS); mutex_init(&qinf->qi_tree_lock); - INIT_LIST_HEAD(&qinf->qi_lru_list); - qinf->qi_lru_count = 0; - mutex_init(&qinf->qi_lru_lock); - /* mutex used to serialize quotaoffs */ mutex_init(&qinf->qi_quotaofflock); @@ -779,8 +917,10 @@ xfs_qm_init_quotainfo( qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; } - qinf->qi_shrinker.shrink = xfs_qm_shake; + qinf->qi_shrinker.count_objects = xfs_qm_shrink_count; + qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan; qinf->qi_shrinker.seeks = DEFAULT_SEEKS; + qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; register_shrinker(&qinf->qi_shrinker); return 0; } @@ -801,6 +941,7 @@ xfs_qm_destroy_quotainfo( ASSERT(qi != NULL); unregister_shrinker(&qi->qi_shrinker); + list_lru_destroy(&qi->qi_lru); if (qi->qi_uquotaip) { IRELE(qi->qi_uquotaip); @@ -1599,132 +1740,6 @@ xfs_qm_dqfree_one( xfs_qm_dqdestroy(dqp); } -STATIC void -xfs_qm_dqreclaim_one( - struct xfs_dquot *dqp, - struct list_head *buffer_list, - struct list_head *dispose_list) -{ - struct xfs_mount *mp = dqp->q_mount; - struct xfs_quotainfo *qi = mp->m_quotainfo; - int error; - - if (!xfs_dqlock_nowait(dqp)) - goto out_move_tail; - - /* - * This dquot has acquired a reference in the meantime remove it from - * the freelist and try again. - */ - if (dqp->q_nrefs) { - xfs_dqunlock(dqp); - - trace_xfs_dqreclaim_want(dqp); - XFS_STATS_INC(xs_qm_dqwants); - - list_del_init(&dqp->q_lru); - qi->qi_lru_count--; - XFS_STATS_DEC(xs_qm_dquot_unused); - return; - } - - /* - * Try to grab the flush lock. If this dquot is in the process of - * getting flushed to disk, we don't want to reclaim it. - */ - if (!xfs_dqflock_nowait(dqp)) - goto out_unlock_move_tail; - - if (XFS_DQ_IS_DIRTY(dqp)) { - struct xfs_buf *bp = NULL; - - trace_xfs_dqreclaim_dirty(dqp); - - error = xfs_qm_dqflush(dqp, &bp); - if (error) { - xfs_warn(mp, "%s: dquot %p flush failed", - __func__, dqp); - goto out_unlock_move_tail; - } - - xfs_buf_delwri_queue(bp, buffer_list); - xfs_buf_relse(bp); - /* - * Give the dquot another try on the freelist, as the - * flushing will take some time. - */ - goto out_unlock_move_tail; - } - xfs_dqfunlock(dqp); - - /* - * Prevent lookups now that we are past the point of no return. - */ - dqp->dq_flags |= XFS_DQ_FREEING; - xfs_dqunlock(dqp); - - ASSERT(dqp->q_nrefs == 0); - list_move_tail(&dqp->q_lru, dispose_list); - qi->qi_lru_count--; - XFS_STATS_DEC(xs_qm_dquot_unused); - - trace_xfs_dqreclaim_done(dqp); - XFS_STATS_INC(xs_qm_dqreclaims); - return; - - /* - * Move the dquot to the tail of the list so that we don't spin on it. - */ -out_unlock_move_tail: - xfs_dqunlock(dqp); -out_move_tail: - list_move_tail(&dqp->q_lru, &qi->qi_lru_list); - trace_xfs_dqreclaim_busy(dqp); - XFS_STATS_INC(xs_qm_dqreclaim_misses); -} - -STATIC int -xfs_qm_shake( - struct shrinker *shrink, - struct shrink_control *sc) -{ - struct xfs_quotainfo *qi = - container_of(shrink, struct xfs_quotainfo, qi_shrinker); - int nr_to_scan = sc->nr_to_scan; - LIST_HEAD (buffer_list); - LIST_HEAD (dispose_list); - struct xfs_dquot *dqp; - int error; - - if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) - return 0; - if (!nr_to_scan) - goto out; - - mutex_lock(&qi->qi_lru_lock); - while (!list_empty(&qi->qi_lru_list)) { - if (nr_to_scan-- <= 0) - break; - dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot, - q_lru); - xfs_qm_dqreclaim_one(dqp, &buffer_list, &dispose_list); - } - mutex_unlock(&qi->qi_lru_lock); - - error = xfs_buf_delwri_submit(&buffer_list); - if (error) - xfs_warn(NULL, "%s: dquot reclaim failed", __func__); - - while (!list_empty(&dispose_list)) { - dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru); - list_del_init(&dqp->q_lru); - xfs_qm_dqfree_one(dqp); - } - -out: - return (qi->qi_lru_count / 100) * sysctl_vfs_cache_pressure; -} - /* * Start a transaction and write the incore superblock changes to * disk. flags parameter indicates which fields have changed. diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 670cd4464070..2b602df9c242 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -49,9 +49,7 @@ typedef struct xfs_quotainfo { struct xfs_inode *qi_uquotaip; /* user quota inode */ struct xfs_inode *qi_gquotaip; /* group quota inode */ struct xfs_inode *qi_pquotaip; /* project quota inode */ - struct list_head qi_lru_list; - struct mutex qi_lru_lock; - int qi_lru_count; + struct list_lru qi_lru; int qi_dquots; time_t qi_btimelimit; /* limit for blks timer */ time_t qi_itimelimit; /* limit for inodes timer */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 979a77d4b87d..15188cc99449 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1535,19 +1535,21 @@ xfs_fs_mount( return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super); } -static int +static long xfs_fs_nr_cached_objects( - struct super_block *sb) + struct super_block *sb, + int nid) { return xfs_reclaim_inodes_count(XFS_M(sb)); } -static void +static long xfs_fs_free_cached_objects( struct super_block *sb, - int nr_to_scan) + long nr_to_scan, + int nid) { - xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan); + return xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan); } static const struct super_operations xfs_super_operations = { diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 2f2a7c005be2..f622a97a7e33 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -41,6 +41,7 @@ #include "xfs_trans_space.h" #include "xfs_trace.h" #include "xfs_symlink.h" +#include "xfs_buf_item.h" /* ----- Kernel only functions below ----- */ STATIC int @@ -363,6 +364,7 @@ xfs_symlink( pathlen -= byte_cnt; offset += byte_cnt; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF); xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) - (char *)bp->b_addr); } |