Diffstat (limited to 'fs')
180 files changed, 9023 insertions, 4331 deletions
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 9c5e6b2cd11a..c2183f3917cd 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -22,6 +22,7 @@
 #include <linux/blkdev.h>
 #include <linux/mempool.h>
+#include <linux/export.h>
 #include <linux/bio.h>
 #include <linux/workqueue.h>
 #include <linux/slab.h>
 
@@ -255,7 +255,6 @@ void bio_init(struct bio *bio)
 {
 	memset(bio, 0, sizeof(*bio));
 	bio->bi_flags = 1 << BIO_UPTODATE;
-	bio->bi_comp_cpu = -1;
 	atomic_set(&bio->bi_cnt, 1);
 }
 EXPORT_SYMBOL(bio_init);
@@ -338,7 +337,7 @@ static void bio_fs_destructor(struct bio *bio)
  * RETURNS:
  * Pointer to new bio on success, NULL on failure.
  */
-struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
+struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
 {
 	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
 
@@ -366,7 +365,7 @@ static void bio_kmalloc_destructor(struct bio *bio)
  * %__GFP_WAIT, the allocation is guaranteed to succeed.
  *
  **/
-struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
+struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
 {
 	struct bio *bio;
 
@@ -697,7 +696,8 @@ static void bio_free_map_data(struct bio_map_data *bmd)
 	kfree(bmd);
 }
 
-static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
+static struct bio_map_data *bio_alloc_map_data(int nr_segs,
+					       unsigned int iov_count,
 					       gfp_t gfp_mask)
 {
 	struct bio_map_data *bmd;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 95f786ec7f08..b07f1da1de4e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -971,7 +971,7 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty)
 
 	if (!bdev->bd_disk)
 		return;
-	if (disk_partitionable(bdev->bd_disk))
+	if (disk_part_scan_enabled(bdev->bd_disk))
 		bdev->bd_invalidated = 1;
 }
 
@@ -1085,6 +1085,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 {
 	struct gendisk *disk;
+	struct module *owner;
 	int ret;
 	int partno;
 	int perm = 0;
@@ -1110,6 +1111,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	disk = get_gendisk(bdev->bd_dev, &partno);
 	if (!disk)
 		goto out;
+	owner = disk->fops->owner;
 
 	disk_block_events(disk);
 	mutex_lock_nested(&bdev->bd_mutex, for_part);
@@ -1137,8 +1139,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 			bdev->bd_disk = NULL;
 			mutex_unlock(&bdev->bd_mutex);
 			disk_unblock_events(disk);
-			module_put(disk->fops->owner);
 			put_disk(disk);
+			module_put(owner);
 			goto restart;
 		}
 	}
@@ -1194,8 +1196,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				goto out_unlock_bdev;
 		}
 		/* only one opener holds refs to the module and disk */
-		module_put(disk->fops->owner);
 		put_disk(disk);
+		module_put(owner);
 	}
 	bdev->bd_openers++;
 	if (for_part)
@@ -1215,8 +1217,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 out_unlock_bdev:
 	mutex_unlock(&bdev->bd_mutex);
 	disk_unblock_events(disk);
-	module_put(disk->fops->owner);
 	put_disk(disk);
+	module_put(owner);
 out:
 
 	bdput(bdev);
@@ -1442,14 +1444,15 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 	if (!bdev->bd_openers) {
 		struct module *owner = disk->fops->owner;
 
-		put_disk(disk);
-		module_put(owner);
 		disk_put_part(bdev->bd_part);
 		bdev->bd_part = NULL;
 		bdev->bd_disk = NULL;
 		if (bdev != bdev->bd_contains)
 			victim = bdev->bd_contains;
 		bdev->bd_contains = NULL;
+
+		put_disk(disk);
+		module_put(owner);
 	}
 	mutex_unlock(&bdev->bd_mutex);
bdput(bdev); diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 40e6ac08c21f..c0ddfd29c5e5 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -7,6 +7,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ - compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o + compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ + reada.o backref.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index eb159aaa5a11..89b156d85d63 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -59,22 +59,19 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) if (!value) return ERR_PTR(-ENOMEM); size = __btrfs_getxattr(inode, name, value, size); - if (size > 0) { - acl = posix_acl_from_xattr(value, size); - if (IS_ERR(acl)) { - kfree(value); - return acl; - } - set_cached_acl(inode, type, acl); - } - kfree(value); + } + if (size > 0) { + acl = posix_acl_from_xattr(value, size); } else if (size == -ENOENT || size == -ENODATA || size == 0) { /* FIXME, who returns -ENOENT? I think nobody */ acl = NULL; - set_cached_acl(inode, type, acl); } else { acl = ERR_PTR(-EIO); } + kfree(value); + + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); return acl; } diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 7ec14097fef1..0b394580d860 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -64,6 +64,8 @@ struct btrfs_worker_thread { int idle; }; +static int __btrfs_start_workers(struct btrfs_workers *workers); + /* * btrfs_start_workers uses kthread_run, which can block waiting for memory * for a very long time. It will actually throttle on page writeback, @@ -88,27 +90,10 @@ static void start_new_worker_func(struct btrfs_work *work) { struct worker_start *start; start = container_of(work, struct worker_start, work); - btrfs_start_workers(start->queue, 1); + __btrfs_start_workers(start->queue); kfree(start); } -static int start_new_worker(struct btrfs_workers *queue) -{ - struct worker_start *start; - int ret; - - start = kzalloc(sizeof(*start), GFP_NOFS); - if (!start) - return -ENOMEM; - - start->work.func = start_new_worker_func; - start->queue = queue; - ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work); - if (ret) - kfree(start); - return ret; -} - /* * helper function to move a thread onto the idle list after it * has finished some requests. 
@@ -153,12 +138,20 @@ static void check_busy_worker(struct btrfs_worker_thread *worker) static void check_pending_worker_creates(struct btrfs_worker_thread *worker) { struct btrfs_workers *workers = worker->workers; + struct worker_start *start; unsigned long flags; rmb(); if (!workers->atomic_start_pending) return; + start = kzalloc(sizeof(*start), GFP_NOFS); + if (!start) + return; + + start->work.func = start_new_worker_func; + start->queue = workers; + spin_lock_irqsave(&workers->lock, flags); if (!workers->atomic_start_pending) goto out; @@ -170,10 +163,11 @@ static void check_pending_worker_creates(struct btrfs_worker_thread *worker) workers->num_workers_starting += 1; spin_unlock_irqrestore(&workers->lock, flags); - start_new_worker(workers); + btrfs_queue_worker(workers->atomic_worker_start, &start->work); return; out: + kfree(start); spin_unlock_irqrestore(&workers->lock, flags); } @@ -331,7 +325,7 @@ again: run_ordered_completions(worker->workers, work); check_pending_worker_creates(worker); - + cond_resched(); } spin_lock_irq(&worker->lock); @@ -462,56 +456,55 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, * starts new worker threads. This does not enforce the max worker * count in case you need to temporarily go past it. */ -static int __btrfs_start_workers(struct btrfs_workers *workers, - int num_workers) +static int __btrfs_start_workers(struct btrfs_workers *workers) { struct btrfs_worker_thread *worker; int ret = 0; - int i; - for (i = 0; i < num_workers; i++) { - worker = kzalloc(sizeof(*worker), GFP_NOFS); - if (!worker) { - ret = -ENOMEM; - goto fail; - } + worker = kzalloc(sizeof(*worker), GFP_NOFS); + if (!worker) { + ret = -ENOMEM; + goto fail; + } - INIT_LIST_HEAD(&worker->pending); - INIT_LIST_HEAD(&worker->prio_pending); - INIT_LIST_HEAD(&worker->worker_list); - spin_lock_init(&worker->lock); - - atomic_set(&worker->num_pending, 0); - atomic_set(&worker->refs, 1); - worker->workers = workers; - worker->task = kthread_run(worker_loop, worker, - "btrfs-%s-%d", workers->name, - workers->num_workers + i); - if (IS_ERR(worker->task)) { - ret = PTR_ERR(worker->task); - kfree(worker); - goto fail; - } - spin_lock_irq(&workers->lock); - list_add_tail(&worker->worker_list, &workers->idle_list); - worker->idle = 1; - workers->num_workers++; - workers->num_workers_starting--; - WARN_ON(workers->num_workers_starting < 0); - spin_unlock_irq(&workers->lock); + INIT_LIST_HEAD(&worker->pending); + INIT_LIST_HEAD(&worker->prio_pending); + INIT_LIST_HEAD(&worker->worker_list); + spin_lock_init(&worker->lock); + + atomic_set(&worker->num_pending, 0); + atomic_set(&worker->refs, 1); + worker->workers = workers; + worker->task = kthread_run(worker_loop, worker, + "btrfs-%s-%d", workers->name, + workers->num_workers + 1); + if (IS_ERR(worker->task)) { + ret = PTR_ERR(worker->task); + kfree(worker); + goto fail; } + spin_lock_irq(&workers->lock); + list_add_tail(&worker->worker_list, &workers->idle_list); + worker->idle = 1; + workers->num_workers++; + workers->num_workers_starting--; + WARN_ON(workers->num_workers_starting < 0); + spin_unlock_irq(&workers->lock); + return 0; fail: - btrfs_stop_workers(workers); + spin_lock_irq(&workers->lock); + workers->num_workers_starting--; + spin_unlock_irq(&workers->lock); return ret; } -int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) +int btrfs_start_workers(struct btrfs_workers *workers) { spin_lock_irq(&workers->lock); - workers->num_workers_starting += num_workers; + 
workers->num_workers_starting++; spin_unlock_irq(&workers->lock); - return __btrfs_start_workers(workers, num_workers); + return __btrfs_start_workers(workers); } /* @@ -568,9 +561,10 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) struct btrfs_worker_thread *worker; unsigned long flags; struct list_head *fallback; + int ret; -again: spin_lock_irqsave(&workers->lock, flags); +again: worker = next_worker(workers); if (!worker) { @@ -584,7 +578,10 @@ again: workers->num_workers_starting++; spin_unlock_irqrestore(&workers->lock, flags); /* we're below the limit, start another worker */ - __btrfs_start_workers(workers, 1); + ret = __btrfs_start_workers(workers); + spin_lock_irqsave(&workers->lock, flags); + if (ret) + goto fallback; goto again; } } @@ -665,7 +662,7 @@ void btrfs_set_work_high_prio(struct btrfs_work *work) /* * places a struct btrfs_work into the pending queue of one of the kthreads */ -int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) +void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) { struct btrfs_worker_thread *worker; unsigned long flags; @@ -673,7 +670,7 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) /* don't requeue something already on a list */ if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) - goto out; + return; worker = find_worker(workers); if (workers->ordered) { @@ -712,7 +709,4 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) if (wake) wake_up_process(worker->task); spin_unlock_irqrestore(&worker->lock, flags); - -out: - return 0; } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 5077746cf85e..f34cc31fa3c9 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -109,8 +109,8 @@ struct btrfs_workers { char *name; }; -int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); -int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); +void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); +int btrfs_start_workers(struct btrfs_workers *workers); int btrfs_stop_workers(struct btrfs_workers *workers); void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, struct btrfs_workers *async_starter); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c new file mode 100644 index 000000000000..22c64fff1bd5 --- /dev/null +++ b/fs/btrfs/backref.c @@ -0,0 +1,776 @@ +/* + * Copyright (C) 2011 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include "ctree.h" +#include "disk-io.h" +#include "backref.h" + +struct __data_ref { + struct list_head list; + u64 inum; + u64 root; + u64 extent_data_item_offset; +}; + +struct __shared_ref { + struct list_head list; + u64 disk_byte; +}; + +static int __inode_info(u64 inum, u64 ioff, u8 key_type, + struct btrfs_root *fs_root, struct btrfs_path *path, + struct btrfs_key *found_key) +{ + int ret; + struct btrfs_key key; + struct extent_buffer *eb; + + key.type = key_type; + key.objectid = inum; + key.offset = ioff; + + ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); + if (ret < 0) + return ret; + + eb = path->nodes[0]; + if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(fs_root, path); + if (ret) + return ret; + eb = path->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, found_key, path->slots[0]); + if (found_key->type != key.type || found_key->objectid != key.objectid) + return 1; + + return 0; +} + +/* + * this makes the path point to (inum INODE_ITEM ioff) + */ +int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, + struct btrfs_path *path) +{ + struct btrfs_key key; + return __inode_info(inum, ioff, BTRFS_INODE_ITEM_KEY, fs_root, path, + &key); +} + +static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, + struct btrfs_path *path, + struct btrfs_key *found_key) +{ + return __inode_info(inum, ioff, BTRFS_INODE_REF_KEY, fs_root, path, + found_key); +} + +/* + * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements + * of the path are separated by '/' and the path is guaranteed to be + * 0-terminated. the path is only given within the current file system. + * Therefore, it never starts with a '/'. the caller is responsible to provide + * "size" bytes in "dest". the dest buffer will be filled backwards. finally, + * the start point of the resulting string is returned. this pointer is within + * dest, normally. + * in case the path buffer would overflow, the pointer is decremented further + * as if output was written to the buffer, though no more output is actually + * generated. that way, the caller can determine how much space would be + * required for the path to fit into the buffer. in that case, the returned + * value will be smaller than dest. callers must check this! 
+ */ +static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, + struct btrfs_inode_ref *iref, + struct extent_buffer *eb_in, u64 parent, + char *dest, u32 size) +{ + u32 len; + int slot; + u64 next_inum; + int ret; + s64 bytes_left = size - 1; + struct extent_buffer *eb = eb_in; + struct btrfs_key found_key; + + if (bytes_left >= 0) + dest[bytes_left] = '\0'; + + while (1) { + len = btrfs_inode_ref_name_len(eb, iref); + bytes_left -= len; + if (bytes_left >= 0) + read_extent_buffer(eb, dest + bytes_left, + (unsigned long)(iref + 1), len); + if (eb != eb_in) + free_extent_buffer(eb); + ret = inode_ref_info(parent, 0, fs_root, path, &found_key); + if (ret) + break; + next_inum = found_key.offset; + + /* regular exit ahead */ + if (parent == next_inum) + break; + + slot = path->slots[0]; + eb = path->nodes[0]; + /* make sure we can use eb after releasing the path */ + if (eb != eb_in) + atomic_inc(&eb->refs); + btrfs_release_path(path); + + iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); + parent = next_inum; + --bytes_left; + if (bytes_left >= 0) + dest[bytes_left] = '/'; + } + + btrfs_release_path(path); + + if (ret) + return ERR_PTR(ret); + + return dest + bytes_left; +} + +/* + * this makes the path point to (logical EXTENT_ITEM *) + * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for + * tree blocks and <0 on error. + */ +int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, + struct btrfs_path *path, struct btrfs_key *found_key) +{ + int ret; + u64 flags; + u32 item_size; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_key key; + + key.type = BTRFS_EXTENT_ITEM_KEY; + key.objectid = logical; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); + if (ret < 0) + return ret; + ret = btrfs_previous_item(fs_info->extent_root, path, + 0, BTRFS_EXTENT_ITEM_KEY); + if (ret < 0) + return ret; + + btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); + if (found_key->type != BTRFS_EXTENT_ITEM_KEY || + found_key->objectid > logical || + found_key->objectid + found_key->offset <= logical) + return -ENOENT; + + eb = path->nodes[0]; + item_size = btrfs_item_size_nr(eb, path->slots[0]); + BUG_ON(item_size < sizeof(*ei)); + + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + flags = btrfs_extent_flags(eb, ei); + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) + return BTRFS_EXTENT_FLAG_TREE_BLOCK; + if (flags & BTRFS_EXTENT_FLAG_DATA) + return BTRFS_EXTENT_FLAG_DATA; + + return -EIO; +} + +/* + * helper function to iterate extent inline refs. ptr must point to a 0 value + * for the first call and may be modified. it is used to track state. + * if more refs exist, 0 is returned and the next call to + * __get_extent_inline_ref must pass the modified ptr parameter to get the + * next ref. after the last ref was processed, 1 is returned. 
+ * returns <0 on error + */ +static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb, + struct btrfs_extent_item *ei, u32 item_size, + struct btrfs_extent_inline_ref **out_eiref, + int *out_type) +{ + unsigned long end; + u64 flags; + struct btrfs_tree_block_info *info; + + if (!*ptr) { + /* first call */ + flags = btrfs_extent_flags(eb, ei); + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + info = (struct btrfs_tree_block_info *)(ei + 1); + *out_eiref = + (struct btrfs_extent_inline_ref *)(info + 1); + } else { + *out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1); + } + *ptr = (unsigned long)*out_eiref; + if ((void *)*ptr >= (void *)ei + item_size) + return -ENOENT; + } + + end = (unsigned long)ei + item_size; + *out_eiref = (struct btrfs_extent_inline_ref *)*ptr; + *out_type = btrfs_extent_inline_ref_type(eb, *out_eiref); + + *ptr += btrfs_extent_inline_ref_size(*out_type); + WARN_ON(*ptr > end); + if (*ptr == end) + return 1; /* last */ + + return 0; +} + +/* + * reads the tree block backref for an extent. tree level and root are returned + * through out_level and out_root. ptr must point to a 0 value for the first + * call and may be modified (see __get_extent_inline_ref comment). + * returns 0 if data was provided, 1 if there was no more data to provide or + * <0 on error. + */ +int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, + struct btrfs_extent_item *ei, u32 item_size, + u64 *out_root, u8 *out_level) +{ + int ret; + int type; + struct btrfs_tree_block_info *info; + struct btrfs_extent_inline_ref *eiref; + + if (*ptr == (unsigned long)-1) + return 1; + + while (1) { + ret = __get_extent_inline_ref(ptr, eb, ei, item_size, + &eiref, &type); + if (ret < 0) + return ret; + + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY) + break; + + if (ret == 1) + return 1; + } + + /* we can treat both ref types equally here */ + info = (struct btrfs_tree_block_info *)(ei + 1); + *out_root = btrfs_extent_inline_ref_offset(eb, eiref); + *out_level = btrfs_tree_block_level(eb, info); + + if (ret == 1) + *ptr = (unsigned long)-1; + + return 0; +} + +static int __data_list_add(struct list_head *head, u64 inum, + u64 extent_data_item_offset, u64 root) +{ + struct __data_ref *ref; + + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + ref->inum = inum; + ref->extent_data_item_offset = extent_data_item_offset; + ref->root = root; + list_add_tail(&ref->list, head); + + return 0; +} + +static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb, + struct btrfs_extent_data_ref *dref) +{ + return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref), + btrfs_extent_data_ref_offset(eb, dref), + btrfs_extent_data_ref_root(eb, dref)); +} + +static int __shared_list_add(struct list_head *head, u64 disk_byte) +{ + struct __shared_ref *ref; + + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; + + ref->disk_byte = disk_byte; + list_add_tail(&ref->list, head); + + return 0; +} + +static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info, + u64 logical, u64 inum, + u64 extent_data_item_offset, + u64 extent_offset, + struct btrfs_path *path, + struct list_head *data_refs, + iterate_extent_inodes_t *iterate, + void *ctx) +{ + u64 ref_root; + u32 item_size; + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *eiref; + struct __data_ref *ref; + int ret; + int type; + int last; + unsigned long 
ptr = 0; + + WARN_ON(!list_empty(data_refs)); + ret = extent_from_logical(fs_info, logical, path, &key); + if (ret & BTRFS_EXTENT_FLAG_DATA) + ret = -EIO; + if (ret < 0) + goto out; + + eb = path->nodes[0]; + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + item_size = btrfs_item_size_nr(eb, path->slots[0]); + + ret = 0; + ref_root = 0; + /* + * as done in iterate_extent_inodes, we first build a list of refs to + * iterate, then free the path and then iterate them to avoid deadlocks. + */ + do { + last = __get_extent_inline_ref(&ptr, eb, ei, item_size, + &eiref, &type); + if (last < 0) { + ret = last; + goto out; + } + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY) { + ref_root = btrfs_extent_inline_ref_offset(eb, eiref); + ret = __data_list_add(data_refs, inum, + extent_data_item_offset, + ref_root); + } + } while (!ret && !last); + + btrfs_release_path(path); + + if (ref_root == 0) { + printk(KERN_ERR "btrfs: failed to find tree block ref " + "for shared data backref %llu\n", logical); + WARN_ON(1); + ret = -EIO; + } + +out: + while (!list_empty(data_refs)) { + ref = list_first_entry(data_refs, struct __data_ref, list); + list_del(&ref->list); + if (!ret) + ret = iterate(ref->inum, extent_offset + + ref->extent_data_item_offset, + ref->root, ctx); + kfree(ref); + } + + return ret; +} + +static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info, + u64 logical, u64 orig_extent_item_objectid, + u64 extent_offset, struct btrfs_path *path, + struct list_head *data_refs, + iterate_extent_inodes_t *iterate, + void *ctx) +{ + u64 disk_byte; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + struct extent_buffer *eb; + int slot; + int nritems; + int ret; + int found = 0; + + eb = read_tree_block(fs_info->tree_root, logical, + fs_info->tree_root->leafsize, 0); + if (!eb) + return -EIO; + + /* + * from the shared data ref, we only have the leaf but we need + * the key. thus, we must look into all items and see that we + * find one (some) with a reference to our extent item. + */ + nritems = btrfs_header_nritems(eb); + for (slot = 0; slot < nritems; ++slot) { + btrfs_item_key_to_cpu(eb, &key, slot); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + if (!fi) { + free_extent_buffer(eb); + return -EIO; + } + disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); + if (disk_byte != orig_extent_item_objectid) { + if (found) + break; + else + continue; + } + ++found; + ret = __iter_shared_inline_ref_inodes(fs_info, logical, + key.objectid, + key.offset, + extent_offset, path, + data_refs, + iterate, ctx); + if (ret) + break; + } + + if (!found) { + printk(KERN_ERR "btrfs: failed to follow shared data backref " + "to parent %llu\n", logical); + WARN_ON(1); + ret = -EIO; + } + + free_extent_buffer(eb); + return ret; +} + +/* + * calls iterate() for every inode that references the extent identified by + * the given parameters. will use the path given as a parameter and return it + * released. + * when the iterator function returns a non-zero value, iteration stops. 
+ */ +int iterate_extent_inodes(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + u64 extent_item_objectid, + u64 extent_offset, + iterate_extent_inodes_t *iterate, void *ctx) +{ + unsigned long ptr = 0; + int last; + int ret; + int type; + u64 logical; + u32 item_size; + struct btrfs_extent_inline_ref *eiref; + struct btrfs_extent_data_ref *dref; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_key key; + struct list_head data_refs = LIST_HEAD_INIT(data_refs); + struct list_head shared_refs = LIST_HEAD_INIT(shared_refs); + struct __data_ref *ref_d; + struct __shared_ref *ref_s; + + eb = path->nodes[0]; + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + item_size = btrfs_item_size_nr(eb, path->slots[0]); + + /* first we iterate the inline refs, ... */ + do { + last = __get_extent_inline_ref(&ptr, eb, ei, item_size, + &eiref, &type); + if (last == -ENOENT) { + ret = 0; + break; + } + if (last < 0) { + ret = last; + break; + } + + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = (struct btrfs_extent_data_ref *)(&eiref->offset); + ret = __data_list_add_eb(&data_refs, eb, dref); + } else if (type == BTRFS_SHARED_DATA_REF_KEY) { + logical = btrfs_extent_inline_ref_offset(eb, eiref); + ret = __shared_list_add(&shared_refs, logical); + } + } while (!ret && !last); + + /* ... then we proceed to in-tree references and ... */ + while (!ret) { + ++path->slots[0]; + if (path->slots[0] > btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(fs_info->extent_root, path); + if (ret) { + if (ret == 1) + ret = 0; /* we're done */ + break; + } + eb = path->nodes[0]; + } + btrfs_item_key_to_cpu(eb, &key, path->slots[0]); + if (key.objectid != extent_item_objectid) + break; + if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_extent_data_ref); + ret = __data_list_add_eb(&data_refs, eb, dref); + } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + ret = __shared_list_add(&shared_refs, key.offset); + } + } + + btrfs_release_path(path); + + /* + * ... only at the very end we can process the refs we found. this is + * because the iterator function we call is allowed to make tree lookups + * and we have to avoid deadlocks. additionally, we need more tree + * lookups ourselves for shared data refs. 
+ */ + while (!list_empty(&data_refs)) { + ref_d = list_first_entry(&data_refs, struct __data_ref, list); + list_del(&ref_d->list); + if (!ret) + ret = iterate(ref_d->inum, extent_offset + + ref_d->extent_data_item_offset, + ref_d->root, ctx); + kfree(ref_d); + } + + while (!list_empty(&shared_refs)) { + ref_s = list_first_entry(&shared_refs, struct __shared_ref, + list); + list_del(&ref_s->list); + if (!ret) + ret = __iter_shared_inline_ref(fs_info, + ref_s->disk_byte, + extent_item_objectid, + extent_offset, path, + &data_refs, + iterate, ctx); + kfree(ref_s); + } + + return ret; +} + +int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + iterate_extent_inodes_t *iterate, void *ctx) +{ + int ret; + u64 offset; + struct btrfs_key found_key; + + ret = extent_from_logical(fs_info, logical, path, + &found_key); + if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) + ret = -EINVAL; + if (ret < 0) + return ret; + + offset = logical - found_key.objectid; + ret = iterate_extent_inodes(fs_info, path, found_key.objectid, + offset, iterate, ctx); + + return ret; +} + +static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, + struct btrfs_path *path, + iterate_irefs_t *iterate, void *ctx) +{ + int ret; + int slot; + u32 cur; + u32 len; + u32 name_len; + u64 parent = 0; + int found = 0; + struct extent_buffer *eb; + struct btrfs_item *item; + struct btrfs_inode_ref *iref; + struct btrfs_key found_key; + + while (1) { + ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, + &found_key); + if (ret < 0) + break; + if (ret) { + ret = found ? 0 : -ENOENT; + break; + } + ++found; + + parent = found_key.offset; + slot = path->slots[0]; + eb = path->nodes[0]; + /* make sure we can use eb after releasing the path */ + atomic_inc(&eb->refs); + btrfs_release_path(path); + + item = btrfs_item_nr(eb, slot); + iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); + + for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) { + name_len = btrfs_inode_ref_name_len(eb, iref); + /* path must be released before calling iterate()! */ + ret = iterate(parent, iref, eb, ctx); + if (ret) { + free_extent_buffer(eb); + break; + } + len = sizeof(*iref) + name_len; + iref = (struct btrfs_inode_ref *)((char *)iref + len); + } + free_extent_buffer(eb); + } + + btrfs_release_path(path); + + return ret; +} + +/* + * returns 0 if the path could be dumped (probably truncated) + * returns <0 in case of an error + */ +static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, + struct extent_buffer *eb, void *ctx) +{ + struct inode_fs_paths *ipath = ctx; + char *fspath; + char *fspath_min; + int i = ipath->fspath->elem_cnt; + const int s_ptr = sizeof(char *); + u32 bytes_left; + + bytes_left = ipath->fspath->bytes_left > s_ptr ? + ipath->fspath->bytes_left - s_ptr : 0; + + fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr; + fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb, + inum, fspath_min, bytes_left); + if (IS_ERR(fspath)) + return PTR_ERR(fspath); + + if (fspath > fspath_min) { + ipath->fspath->val[i] = (u64)(unsigned long)fspath; + ++ipath->fspath->elem_cnt; + ipath->fspath->bytes_left = fspath - fspath_min; + } else { + ++ipath->fspath->elem_missed; + ipath->fspath->bytes_missing += fspath_min - fspath; + ipath->fspath->bytes_left = 0; + } + + return 0; +} + +/* + * this dumps all file system paths to the inode into the ipath struct, provided + * is has been created large enough. 
each path is zero-terminated and accessed + * from ipath->fspath->val[i]. + * when it returns, there are ipath->fspath->elem_cnt number of paths available + * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the + * number of missed paths in recored in ipath->fspath->elem_missed, otherwise, + * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would + * have been needed to return all paths. + */ +int paths_from_inode(u64 inum, struct inode_fs_paths *ipath) +{ + return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path, + inode_to_path, ipath); +} + +/* + * allocates space to return multiple file system paths for an inode. + * total_bytes to allocate are passed, note that space usable for actual path + * information will be total_bytes - sizeof(struct inode_fs_paths). + * the returned pointer must be freed with free_ipath() in the end. + */ +struct btrfs_data_container *init_data_container(u32 total_bytes) +{ + struct btrfs_data_container *data; + size_t alloc_bytes; + + alloc_bytes = max_t(size_t, total_bytes, sizeof(*data)); + data = kmalloc(alloc_bytes, GFP_NOFS); + if (!data) + return ERR_PTR(-ENOMEM); + + if (total_bytes >= sizeof(*data)) { + data->bytes_left = total_bytes - sizeof(*data); + data->bytes_missing = 0; + } else { + data->bytes_missing = sizeof(*data) - total_bytes; + data->bytes_left = 0; + } + + data->elem_cnt = 0; + data->elem_missed = 0; + + return data; +} + +/* + * allocates space to return multiple file system paths for an inode. + * total_bytes to allocate are passed, note that space usable for actual path + * information will be total_bytes - sizeof(struct inode_fs_paths). + * the returned pointer must be freed with free_ipath() in the end. + */ +struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, + struct btrfs_path *path) +{ + struct inode_fs_paths *ifp; + struct btrfs_data_container *fspath; + + fspath = init_data_container(total_bytes); + if (IS_ERR(fspath)) + return (void *)fspath; + + ifp = kmalloc(sizeof(*ifp), GFP_NOFS); + if (!ifp) { + kfree(fspath); + return ERR_PTR(-ENOMEM); + } + + ifp->btrfs_path = path; + ifp->fspath = fspath; + ifp->fs_root = fs_root; + + return ifp; +} + +void free_ipath(struct inode_fs_paths *ipath) +{ + kfree(ipath); +} diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h new file mode 100644 index 000000000000..92618837cb8f --- /dev/null +++ b/fs/btrfs/backref.h @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2011 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __BTRFS_BACKREF__ +#define __BTRFS_BACKREF__ + +#include "ioctl.h" + +struct inode_fs_paths { + struct btrfs_path *btrfs_path; + struct btrfs_root *fs_root; + struct btrfs_data_container *fspath; +}; + +typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, + void *ctx); +typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref, + struct extent_buffer *eb, void *ctx); + +int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, + struct btrfs_path *path); + +int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, + struct btrfs_path *path, struct btrfs_key *found_key); + +int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, + struct btrfs_extent_item *ei, u32 item_size, + u64 *out_root, u8 *out_level); + +int iterate_extent_inodes(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + u64 extent_item_objectid, + u64 extent_offset, + iterate_extent_inodes_t *iterate, void *ctx); + +int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + iterate_extent_inodes_t *iterate, void *ctx); + +int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); + +struct btrfs_data_container *init_data_container(u32 total_bytes); +struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, + struct btrfs_path *path); +void free_ipath(struct inode_fs_paths *ipath); + +#endif diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index d9f99a16edd6..634608d2a6d0 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -103,11 +103,6 @@ struct btrfs_inode { */ u64 delalloc_bytes; - /* total number of bytes that may be used for this inode for - * delalloc - */ - u64 reserved_bytes; - /* * the size of the file stored in the metadata on disk. data=ordered * means the in-memory i_size might be larger than the size on disk @@ -115,9 +110,6 @@ struct btrfs_inode { */ u64 disk_i_size; - /* flags field from the on disk inode */ - u32 flags; - /* * if this is a directory then index_cnt is the counter for the index * number for new files that are created @@ -132,6 +124,15 @@ struct btrfs_inode { u64 last_unlink_trans; /* + * Number of bytes outstanding that are going to need csums. This is + * used in ENOSPC accounting. + */ + u64 csum_bytes; + + /* flags field from the on disk inode */ + u32 flags; + + /* * Counters to keep track of the number of extent item's we may use due * to delalloc and such. outstanding_extents is the number of extent * items we think we'll end up using, and reserved_extents is the number @@ -146,14 +147,12 @@ struct btrfs_inode { * the btrfs file release call will add this inode to the * ordered operations list so that we make sure to flush out any * new data the application may have written before commit. - * - * yes, its silly to have a single bitflag, but we might grow more - * of these. 
*/ unsigned ordered_data_close:1; unsigned orphan_meta_reserved:1; unsigned dummy_inode:1; unsigned in_defrag:1; + unsigned delalloc_meta_reserved:1; /* * always compress this one file diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 8ec5d86f1734..14f1c5a0b2d2 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -85,7 +85,8 @@ struct compressed_bio { static inline int compressed_bio_size(struct btrfs_root *root, unsigned long disk_size) { - u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + return sizeof(struct compressed_bio) + ((disk_size + root->sectorsize - 1) / root->sectorsize) * csum_size; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 011cab3aca8d..dede441bdeee 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -514,10 +514,25 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { + /* ensure we can see the force_cow */ + smp_rmb(); + + /* + * We do not need to cow a block if + * 1) this block is not created or changed in this transaction; + * 2) this block does not belong to TREE_RELOC tree; + * 3) the root is not forced COW. + * + * What is forced COW: + * when we create snapshot during commiting the transaction, + * after we've finished coping src root, we must COW the shared + * block to ensure the metadata consistency. + */ if (btrfs_header_generation(buf) == trans->transid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && - btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) + btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && + !root->force_cow) return 0; return 1; } @@ -902,9 +917,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, orig_ptr = btrfs_node_blockptr(mid, orig_slot); - if (level < BTRFS_MAX_LEVEL - 1) + if (level < BTRFS_MAX_LEVEL - 1) { parent = path->nodes[level + 1]; - pslot = path->slots[level + 1]; + pslot = path->slots[level + 1]; + } /* * deal with the case where there is only one pointer in the root @@ -1107,9 +1123,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, mid = path->nodes[level]; WARN_ON(btrfs_header_generation(mid) != trans->transid); - if (level < BTRFS_MAX_LEVEL - 1) + if (level < BTRFS_MAX_LEVEL - 1) { parent = path->nodes[level + 1]; - pslot = path->slots[level + 1]; + pslot = path->slots[level + 1]; + } if (!parent) return 1; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 03912c5c6f49..67385033323d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -30,6 +30,7 @@ #include <linux/kobject.h> #include <trace/events/btrfs.h> #include <asm/kmap_types.h> +#include <linux/pagemap.h> #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" @@ -360,6 +361,47 @@ struct btrfs_header { #define BTRFS_LABEL_SIZE 256 /* + * just in case we somehow lose the roots and are not able to mount, + * we store an array of the roots from previous transactions + * in the super. 
+ */ +#define BTRFS_NUM_BACKUP_ROOTS 4 +struct btrfs_root_backup { + __le64 tree_root; + __le64 tree_root_gen; + + __le64 chunk_root; + __le64 chunk_root_gen; + + __le64 extent_root; + __le64 extent_root_gen; + + __le64 fs_root; + __le64 fs_root_gen; + + __le64 dev_root; + __le64 dev_root_gen; + + __le64 csum_root; + __le64 csum_root_gen; + + __le64 total_bytes; + __le64 bytes_used; + __le64 num_devices; + /* future */ + __le64 unsed_64[4]; + + u8 tree_root_level; + u8 chunk_root_level; + u8 extent_root_level; + u8 fs_root_level; + u8 dev_root_level; + u8 csum_root_level; + /* future and to align */ + u8 unused_8[10]; +} __attribute__ ((__packed__)); + +/* * the super block basically lists the main trees of the FS * it currently lacks any block count etc etc */ @@ -405,6 +447,7 @@ struct btrfs_super_block { /* future expansion */ __le64 reserved[31]; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; + struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; } __attribute__ ((__packed__)); /* @@ -772,14 +815,8 @@ struct btrfs_space_info { struct btrfs_block_rsv { u64 size; u64 reserved; - u64 freed[2]; struct btrfs_space_info *space_info; - struct list_head list; spinlock_t lock; - atomic_t usage; - unsigned int priority:8; - unsigned int durable:1; - unsigned int refill_used:1; unsigned int full:1; }; @@ -811,7 +848,8 @@ struct btrfs_free_cluster { enum btrfs_caching_type { BTRFS_CACHE_NO = 0, BTRFS_CACHE_STARTED = 1, - BTRFS_CACHE_FINISHED = 2, + BTRFS_CACHE_FAST = 2, + BTRFS_CACHE_FINISHED = 3, }; enum btrfs_disk_cache_state { @@ -840,10 +878,10 @@ struct btrfs_block_group_cache { spinlock_t lock; u64 pinned; u64 reserved; - u64 reserved_pinned; u64 bytes_super; u64 flags; u64 sectorsize; + u64 cache_generation; unsigned int ro:1; unsigned int dirty:1; unsigned int iref:1; @@ -899,6 +937,10 @@ struct btrfs_fs_info { spinlock_t block_group_cache_lock; struct rb_root block_group_cache_tree; + /* keep track of unallocated space */ + spinlock_t free_chunk_lock; + u64 free_chunk_space; + struct extent_io_tree freed_extents[2]; struct extent_io_tree *pinned_extents; @@ -916,14 +958,11 @@ struct btrfs_fs_info { struct btrfs_block_rsv trans_block_rsv; /* block reservation for chunk tree */ struct btrfs_block_rsv chunk_block_rsv; + /* block reservation for delayed operations */ + struct btrfs_block_rsv delayed_block_rsv; struct btrfs_block_rsv empty_block_rsv; - /* list of block reservations that cross multiple transactions */ - struct list_head durable_block_rsv_list; - - struct mutex durable_block_rsv_mutex; - u64 generation; u64 last_trans_committed; @@ -942,8 +981,8 @@ struct btrfs_fs_info { wait_queue_head_t transaction_blocked_wait; wait_queue_head_t async_submit_wait; - struct btrfs_super_block super_copy; - struct btrfs_super_block super_for_commit; + struct btrfs_super_block *super_copy; + struct btrfs_super_block *super_for_commit; struct block_device *__bdev; struct super_block *sb; struct inode *btree_inode; @@ -1036,6 +1075,7 @@ struct btrfs_fs_info { struct btrfs_workers endio_freespace_worker; struct btrfs_workers submit_workers; struct btrfs_workers caching_workers; + struct btrfs_workers readahead_workers; /* * fixup workers take dirty pages that didn't properly go through @@ -1119,6 +1159,13 @@ struct btrfs_fs_info { u64 fs_state; struct btrfs_delayed_root *delayed_root; + + /* readahead tree */ + spinlock_t reada_lock; + struct radix_tree_root reada_tree; + + /* next backup root to be overwritten */ + int backup_root_index; }; /* @@ -1225,6 +1272,8 @@ struct btrfs_root { 
* for stat. It may be used for more later */ dev_t anon_dev; + + int force_cow; }; struct btrfs_ioctl_defrag_range_args { @@ -1363,6 +1412,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) #define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) #define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) +#define BTRFS_MOUNT_RECOVERY (1 << 18) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -1978,6 +2028,55 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root) return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY; } +/* struct btrfs_root_backup */ +BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, + tree_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup, + tree_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup, + tree_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup, + chunk_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup, + chunk_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup, + chunk_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup, + extent_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup, + extent_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup, + extent_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup, + fs_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup, + fs_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup, + fs_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup, + dev_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup, + dev_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup, + dev_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup, + csum_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup, + csum_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup, + csum_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, + num_devices, 64); + /* struct btrfs_super_block */ BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); @@ -2129,6 +2228,11 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); } +static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) +{ + return mapping_gfp_mask(mapping) & ~__GFP_FS; +} + /* extent-tree.c */ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, unsigned num_items) @@ -2137,6 +2241,17 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, 3 * num_items; } +/* + * Doing a truncate won't result in new nodes or leaves, just what we need for + * COW. 
+ */ +static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root, + unsigned num_items) +{ + return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * + num_items; +} + void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); @@ -2146,6 +2261,9 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, u64 num_bytes, u64 *refs, u64 *flags); int btrfs_pin_extent(struct btrfs_root *root, u64 bytenr, u64 num, int reserved); +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes); int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 offset, u64 bytenr); @@ -2196,8 +2314,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, u64 root_objectid, u64 owner, u64 offset); int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); -int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve, int sinfo); +int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, + u64 start, u64 len); int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, @@ -2240,25 +2358,26 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv); -void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv); -int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_block_rsv_add(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes); -int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +int btrfs_block_rsv_add_noflush(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes); +int btrfs_block_rsv_check(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int min_factor); +int btrfs_block_rsv_refill(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 min_reserved, int min_factor); + u64 min_reserved); +int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes); void btrfs_block_rsv_release(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes); -int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_block_rsv *rsv); int btrfs_set_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache); int btrfs_set_block_group_rw(struct btrfs_root *root, @@ -2379,6 +2498,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) smp_mb(); return fs_info->closing; } +static inline void free_fs_info(struct btrfs_fs_info *fs_info) +{ + kfree(fs_info->delayed_root); + kfree(fs_info->extent_root); + kfree(fs_info->tree_root); + kfree(fs_info->chunk_root); + kfree(fs_info->dev_root); + kfree(fs_info->csum_root); + kfree(fs_info->super_copy); + kfree(fs_info->super_for_commit); + kfree(fs_info); +} /* root-item.c */ int btrfs_find_root_ref(struct btrfs_root *tree_root, @@ -2561,7 +2692,8 @@ int btrfs_page_mkwrite(struct 
vm_area_struct *vma, struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); -void btrfs_dirty_inode(struct inode *inode, int flags); +int btrfs_dirty_inode(struct inode *inode); +int btrfs_update_time(struct file *file); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); @@ -2579,11 +2711,6 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans, int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_orphan_cleanup(struct btrfs_root *root); -void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending, - u64 *bytes_to_reserve); -void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending); void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); @@ -2697,4 +2824,20 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid); int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, struct btrfs_scrub_progress *progress); +/* reada.c */ +struct reada_control { + struct btrfs_root *root; /* tree to prefetch */ + struct btrfs_key key_start; + struct btrfs_key key_end; /* exclusive */ + atomic_t elems; + struct kref refcnt; + wait_queue_head_t wait; +}; +struct reada_control *btrfs_reada_add(struct btrfs_root *root, + struct btrfs_key *start, struct btrfs_key *end); +int btrfs_reada_wait(void *handle); +void btrfs_reada_detach(void *handle); +int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, + u64 start, int err); + #endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index ae4d9cd10961..9c1eccc2c503 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -591,7 +591,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, return 0; src_rsv = trans->block_rsv; - dst_rsv = &root->fs_info->global_block_rsv; + dst_rsv = &root->fs_info->delayed_block_rsv; num_bytes = btrfs_calc_trans_metadata_size(root, 1); ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); @@ -609,7 +609,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, if (!item->bytes_reserved) return; - rsv = &root->fs_info->global_block_rsv; + rsv = &root->fs_info->delayed_block_rsv; btrfs_block_rsv_release(root, rsv, item->bytes_reserved); } @@ -617,24 +617,102 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, static int btrfs_delayed_inode_reserve_metadata( struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct inode *inode, struct btrfs_delayed_node *node) { struct btrfs_block_rsv *src_rsv; struct btrfs_block_rsv *dst_rsv; u64 num_bytes; int ret; - - if (!trans->bytes_reserved) - return 0; + int release = false; src_rsv = trans->block_rsv; - dst_rsv = &root->fs_info->global_block_rsv; + dst_rsv = &root->fs_info->delayed_block_rsv; num_bytes = btrfs_calc_trans_metadata_size(root, 1); + + /* + * btrfs_dirty_inode will update the inode under btrfs_join_transaction + * which doesn't reserve space for speed. This is a problem since we + * still need to reserve space for this update, so try to reserve the + * space. 
+ * + * Now if src_rsv == delalloc_block_rsv we'll let it just steal since + * we're accounted for. + */ + if (!src_rsv || (!trans->bytes_reserved && + src_rsv != &root->fs_info->delalloc_block_rsv)) { + ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); + /* + * Since we're under a transaction reserve_metadata_bytes could + * try to commit the transaction which will make it return + * EAGAIN to make us stop the transaction we have, so return + * ENOSPC instead so that btrfs_dirty_inode knows what to do. + */ + if (ret == -EAGAIN) + ret = -ENOSPC; + if (!ret) + node->bytes_reserved = num_bytes; + return ret; + } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { + spin_lock(&BTRFS_I(inode)->lock); + if (BTRFS_I(inode)->delalloc_meta_reserved) { + BTRFS_I(inode)->delalloc_meta_reserved = 0; + spin_unlock(&BTRFS_I(inode)->lock); + release = true; + goto migrate; + } + spin_unlock(&BTRFS_I(inode)->lock); + + /* Ok we didn't have space pre-reserved. This shouldn't happen + * too often but it can happen if we do delalloc to an existing + * inode which gets dirtied because of the time update, and then + * isn't touched again until after the transaction commits and + * then we try to write out the data. First try to be nice and + * reserve something strictly for us. If not be a pain and try + * to steal from the delalloc block rsv. + */ + ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); + if (!ret) + goto out; + + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + if (!ret) + goto out; + + /* + * Ok this is a problem, let's just steal from the global rsv + * since this really shouldn't happen that often. + */ + WARN_ON(1); + ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv, + dst_rsv, num_bytes); + goto out; + } + +migrate: ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + +out: + /* + * Migrate only takes a reservation, it doesn't touch the size of the + * block_rsv. This is to simplify people who don't normally have things + * migrated from their block rsv. If they go to release their + * reservation, that will decrease the size as well, so if migrate + * reduced size we'd end up with a negative size. But for the + * delalloc_meta_reserved stuff we will only know to drop 1 reservation, + * but we could in fact do this reserve/migrate dance several times + * between the time we did the original reservation and we'd clean it + * up. So to take care of this, release the space for the meta + * reservation here. I think it may be time for a documentation page on + * how block rsvs. work. 
+ */ if (!ret) node->bytes_reserved = num_bytes; + if (release) + btrfs_block_rsv_release(root, src_rsv, num_bytes); + return ret; } @@ -646,7 +724,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root, if (!node->bytes_reserved) return; - rsv = &root->fs_info->global_block_rsv; + rsv = &root->fs_info->delayed_block_rsv; btrfs_block_rsv_release(root, rsv, node->bytes_reserved); node->bytes_reserved = 0; @@ -1026,7 +1104,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, path->leave_spinning = 1; block_rsv = trans->block_rsv; - trans->block_rsv = &root->fs_info->global_block_rsv; + trans->block_rsv = &root->fs_info->delayed_block_rsv; delayed_root = btrfs_get_delayed_root(root); @@ -1069,7 +1147,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, path->leave_spinning = 1; block_rsv = trans->block_rsv; - trans->block_rsv = &node->root->fs_info->global_block_rsv; + trans->block_rsv = &node->root->fs_info->delayed_block_rsv; ret = btrfs_insert_delayed_items(trans, path, node->root, node); if (!ret) @@ -1149,7 +1227,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) goto free_path; block_rsv = trans->block_rsv; - trans->block_rsv = &root->fs_info->global_block_rsv; + trans->block_rsv = &root->fs_info->delayed_block_rsv; ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); if (!ret) @@ -1685,12 +1763,10 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, goto release_node; } - ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); - /* - * we must reserve enough space when we start a new transaction, - * so reserving metadata failure is impossible - */ - BUG_ON(ret); + ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode, + delayed_node); + if (ret) + goto release_node; fill_stack_inode_item(trans, &delayed_node->inode_item, inode); delayed_node->inode_dirty = 1; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 07ea91879a91..f44b3928dc2d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -256,8 +256,7 @@ void btrfs_csum_final(u32 crc, char *result) static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, int verify) { - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); char *result = NULL; unsigned long len; unsigned long cur_len; @@ -367,7 +366,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; while (1) { - ret = read_extent_buffer_pages(io_tree, eb, start, 1, + ret = read_extent_buffer_pages(io_tree, eb, start, + WAIT_COMPLETE, btree_get_extent, mirror_num); if (!ret && !verify_parent_transid(io_tree, eb, parent_transid)) @@ -608,11 +608,48 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, end = min_t(u64, eb->len, PAGE_CACHE_SIZE); end = eb->start + end - 1; err: + if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { + clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); + btree_readahead_hook(root, eb, eb->start, ret); + } + free_extent_buffer(eb); out: return ret; } +static int btree_io_failed_hook(struct bio *failed_bio, + struct page *page, u64 start, u64 end, + int mirror_num, struct extent_state *state) +{ + struct extent_io_tree *tree; + unsigned long len; + struct extent_buffer *eb; + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + + tree = 
&BTRFS_I(page->mapping->host)->io_tree; + if (page->private == EXTENT_PAGE_PRIVATE) + goto out; + if (!page->private) + goto out; + + len = page->private >> 2; + WARN_ON(len == 0); + + eb = alloc_extent_buffer(tree, start, len, page); + if (eb == NULL) + goto out; + + if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { + clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); + btree_readahead_hook(root, eb, eb->start, -EIO); + } + free_extent_buffer(eb); + +out: + return -EIO; /* we fixed nothing */ +} + static void end_workqueue_bio(struct bio *bio, int err) { struct end_io_wq *end_io_wq = bio->bi_private; @@ -908,7 +945,7 @@ static int btree_readpage(struct file *file, struct page *page) { struct extent_io_tree *tree; tree = &BTRFS_I(page->mapping->host)->io_tree; - return extent_read_full_page(tree, page, btree_get_extent); + return extent_read_full_page(tree, page, btree_get_extent, 0); } static int btree_releasepage(struct page *page, gfp_t gfp_flags) @@ -974,11 +1011,43 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, if (!buf) return 0; read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, - buf, 0, 0, btree_get_extent, 0); + buf, 0, WAIT_NONE, btree_get_extent, 0); free_extent_buffer(buf); return ret; } +int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, + int mirror_num, struct extent_buffer **eb) +{ + struct extent_buffer *buf = NULL; + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree; + int ret; + + buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!buf) + return 0; + + set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); + + ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK, + btree_get_extent, mirror_num); + if (ret) { + free_extent_buffer(buf); + return ret; + } + + if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { + free_extent_buffer(buf); + return -EIO; + } else if (extent_buffer_uptodate(io_tree, buf, NULL)) { + *eb = buf; + } else { + free_extent_buffer(buf); + } + return 0; +} + struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { @@ -1135,10 +1204,12 @@ static int find_and_setup_root(struct btrfs_root *tree_root, generation = btrfs_root_generation(&root->root_item); blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); + root->commit_root = NULL; root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) { free_extent_buffer(root->node); + root->node = NULL; return -EIO; } root->commit_root = btrfs_root_node(root); @@ -1577,6 +1648,235 @@ sleep: return 0; } +/* + * this will find the highest generation in the array of + * root backups. The index of the highest array is returned, + * or -1 if we can't find anything. + * + * We check to make sure the array is valid by comparing the + * generation of the latest root in the array with the generation + * in the super block. If they don't match we pitch it. 
+ */ +static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen) +{ + u64 cur; + int newest_index = -1; + struct btrfs_root_backup *root_backup; + int i; + + for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { + root_backup = info->super_copy->super_roots + i; + cur = btrfs_backup_tree_root_gen(root_backup); + if (cur == newest_gen) + newest_index = i; + } + + /* check to see if we actually wrapped around */ + if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) { + root_backup = info->super_copy->super_roots; + cur = btrfs_backup_tree_root_gen(root_backup); + if (cur == newest_gen) + newest_index = 0; + } + return newest_index; +} + + +/* + * find the oldest backup so we know where to store new entries + * in the backup array. This will set the backup_root_index + * field in the fs_info struct + */ +static void find_oldest_super_backup(struct btrfs_fs_info *info, + u64 newest_gen) +{ + int newest_index = -1; + + newest_index = find_newest_super_backup(info, newest_gen); + /* if there was garbage in there, just move along */ + if (newest_index == -1) { + info->backup_root_index = 0; + } else { + info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS; + } +} + +/* + * copy all the root pointers into the super backup array. + * this will bump the backup pointer by one when it is + * done + */ +static void backup_super_roots(struct btrfs_fs_info *info) +{ + int next_backup; + struct btrfs_root_backup *root_backup; + int last_backup; + + next_backup = info->backup_root_index; + last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) % + BTRFS_NUM_BACKUP_ROOTS; + + /* + * just overwrite the last backup if we're at the same generation + * this happens only at umount + */ + root_backup = info->super_for_commit->super_roots + last_backup; + if (btrfs_backup_tree_root_gen(root_backup) == + btrfs_header_generation(info->tree_root->node)) + next_backup = last_backup; + + root_backup = info->super_for_commit->super_roots + next_backup; + + /* + * make sure all of our padding and empty slots get zero filled + * regardless of which ones we use today + */ + memset(root_backup, 0, sizeof(*root_backup)); + + info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS; + + btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start); + btrfs_set_backup_tree_root_gen(root_backup, + btrfs_header_generation(info->tree_root->node)); + + btrfs_set_backup_tree_root_level(root_backup, + btrfs_header_level(info->tree_root->node)); + + btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start); + btrfs_set_backup_chunk_root_gen(root_backup, + btrfs_header_generation(info->chunk_root->node)); + btrfs_set_backup_chunk_root_level(root_backup, + btrfs_header_level(info->chunk_root->node)); + + btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start); + btrfs_set_backup_extent_root_gen(root_backup, + btrfs_header_generation(info->extent_root->node)); + btrfs_set_backup_extent_root_level(root_backup, + btrfs_header_level(info->extent_root->node)); + + /* + * we might commit during log recovery, which happens before we set + * the fs_root. Make sure it is valid before we fill it in. 
+ */ + if (info->fs_root && info->fs_root->node) { + btrfs_set_backup_fs_root(root_backup, + info->fs_root->node->start); + btrfs_set_backup_fs_root_gen(root_backup, + btrfs_header_generation(info->fs_root->node)); + btrfs_set_backup_fs_root_level(root_backup, + btrfs_header_level(info->fs_root->node)); + } + + btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start); + btrfs_set_backup_dev_root_gen(root_backup, + btrfs_header_generation(info->dev_root->node)); + btrfs_set_backup_dev_root_level(root_backup, + btrfs_header_level(info->dev_root->node)); + + btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start); + btrfs_set_backup_csum_root_gen(root_backup, + btrfs_header_generation(info->csum_root->node)); + btrfs_set_backup_csum_root_level(root_backup, + btrfs_header_level(info->csum_root->node)); + + btrfs_set_backup_total_bytes(root_backup, + btrfs_super_total_bytes(info->super_copy)); + btrfs_set_backup_bytes_used(root_backup, + btrfs_super_bytes_used(info->super_copy)); + btrfs_set_backup_num_devices(root_backup, + btrfs_super_num_devices(info->super_copy)); + + /* + * if we don't copy this out to the super_copy, it won't get remembered + * for the next commit + */ + memcpy(&info->super_copy->super_roots, + &info->super_for_commit->super_roots, + sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS); +} + +/* + * this copies info out of the root backup array and back into + * the in-memory super block. It is meant to help iterate through + * the array, so you send it the number of backups you've already + * tried and the last backup index you used. + * + * this returns -1 when it has tried all the backups + */ +static noinline int next_root_backup(struct btrfs_fs_info *info, + struct btrfs_super_block *super, + int *num_backups_tried, int *backup_index) +{ + struct btrfs_root_backup *root_backup; + int newest = *backup_index; + + if (*num_backups_tried == 0) { + u64 gen = btrfs_super_generation(super); + + newest = find_newest_super_backup(info, gen); + if (newest == -1) + return -1; + + *backup_index = newest; + *num_backups_tried = 1; + } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) { + /* we've tried all the backups, all done */ + return -1; + } else { + /* jump to the next oldest backup */ + newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) % + BTRFS_NUM_BACKUP_ROOTS; + *backup_index = newest; + *num_backups_tried += 1; + } + root_backup = super->super_roots + newest; + + btrfs_set_super_generation(super, + btrfs_backup_tree_root_gen(root_backup)); + btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup)); + btrfs_set_super_root_level(super, + btrfs_backup_tree_root_level(root_backup)); + btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup)); + + /* + * fixme: the total bytes and num_devices need to match or we should + * need a fsck + */ + btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup)); + btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup)); + return 0; +} + +/* helper to cleanup tree roots */ +static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) +{ + free_extent_buffer(info->tree_root->node); + free_extent_buffer(info->tree_root->commit_root); + free_extent_buffer(info->dev_root->node); + free_extent_buffer(info->dev_root->commit_root); + free_extent_buffer(info->extent_root->node); + free_extent_buffer(info->extent_root->commit_root); + free_extent_buffer(info->csum_root->node); + free_extent_buffer(info->csum_root->commit_root); + + 
info->tree_root->node = NULL; + info->tree_root->commit_root = NULL; + info->dev_root->node = NULL; + info->dev_root->commit_root = NULL; + info->extent_root->node = NULL; + info->extent_root->commit_root = NULL; + info->csum_root->node = NULL; + info->csum_root->commit_root = NULL; + + if (chunk_root) { + free_extent_buffer(info->chunk_root->node); + free_extent_buffer(info->chunk_root->commit_root); + info->chunk_root->node = NULL; + info->chunk_root->commit_root = NULL; + } +} + + struct btrfs_root *open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options) @@ -1590,29 +1890,32 @@ struct btrfs_root *open_ctree(struct super_block *sb, u64 features; struct btrfs_key location; struct buffer_head *bh; - struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); - struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); + struct btrfs_super_block *disk_super; struct btrfs_root *tree_root = btrfs_sb(sb); - struct btrfs_fs_info *fs_info = NULL; - struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); - struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); + struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_root *extent_root; + struct btrfs_root *csum_root; + struct btrfs_root *chunk_root; + struct btrfs_root *dev_root; struct btrfs_root *log_tree_root; - int ret; int err = -EINVAL; - - struct btrfs_super_block *disk_super; - - if (!extent_root || !tree_root || !tree_root->fs_info || - !chunk_root || !dev_root || !csum_root) { + int num_backups_tried = 0; + int backup_index = 0; + + extent_root = fs_info->extent_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + csum_root = fs_info->csum_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + chunk_root = fs_info->chunk_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + dev_root = fs_info->dev_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + + if (!extent_root || !csum_root || !chunk_root || !dev_root) { err = -ENOMEM; goto fail; } - fs_info = tree_root->fs_info; ret = init_srcu_struct(&fs_info->subvol_srcu); if (ret) { @@ -1648,15 +1951,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); + spin_lock_init(&fs_info->free_chunk_lock); mutex_init(&fs_info->reloc_mutex); init_completion(&fs_info->kobj_unregister); - fs_info->tree_root = tree_root; - fs_info->extent_root = extent_root; - fs_info->csum_root = csum_root; - fs_info->chunk_root = chunk_root; - fs_info->dev_root = dev_root; - fs_info->fs_devices = fs_devices; INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); btrfs_mapping_init(&fs_info->mapping_tree); @@ -1665,8 +1963,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_init_block_rsv(&fs_info->trans_block_rsv); btrfs_init_block_rsv(&fs_info->chunk_block_rsv); btrfs_init_block_rsv(&fs_info->empty_block_rsv); - INIT_LIST_HEAD(&fs_info->durable_block_rsv_list); - mutex_init(&fs_info->durable_block_rsv_mutex); + btrfs_init_block_rsv(&fs_info->delayed_block_rsv); atomic_set(&fs_info->nr_async_submits, 0); atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->async_submit_draining, 0); @@ -1677,6 +1974,11 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; fs_info->trans_no_join = 0; + fs_info->free_chunk_space = 0; 
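
The backup-root helpers above only ever touch the array through modular arithmetic. The following stand-alone sketch just exercises that arithmetic; it is illustrative only (BTRFS_NUM_BACKUP_ROOTS is assumed to be 4, and the starting slot number is made up):

#include <stdio.h>

#define BTRFS_NUM_BACKUP_ROOTS 4	/* assumed value, not quoted from the diff */

int main(void)
{
	int newest = 2;		/* slot find_newest_super_backup() would have picked */
	int next, i;

	/* find_oldest_super_backup(): new entries go one past the newest slot */
	next = (newest + 1) % BTRFS_NUM_BACKUP_ROOTS;
	printf("backup_root_index starts at %d\n", next);

	/* next_root_backup(): each retry steps backwards through the ring */
	for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
		printf("try %d -> slot %d\n", i, newest);
		newest = (newest + BTRFS_NUM_BACKUP_ROOTS - 1) %
			 BTRFS_NUM_BACKUP_ROOTS;
	}
	return 0;
}

Stepping forward with (i + 1) % N picks the slot the next commit will overwrite; stepping backward with (i + N - 1) % N is how next_root_backup() walks from the newest backup toward progressively older ones on each mount retry.
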
+ + /* readahead state */ + INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); + spin_lock_init(&fs_info->reada_lock); fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); @@ -1766,14 +2068,14 @@ struct btrfs_root *open_ctree(struct super_block *sb, goto fail_alloc; } - memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); - memcpy(&fs_info->super_for_commit, &fs_info->super_copy, - sizeof(fs_info->super_for_commit)); + memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy)); + memcpy(fs_info->super_for_commit, fs_info->super_copy, + sizeof(*fs_info->super_for_commit)); brelse(bh); - memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); + memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE); - disk_super = &fs_info->super_copy; + disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) goto fail_alloc; @@ -1783,6 +2085,13 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); /* + * run through our array of backup supers and setup + * our ring pointer to the oldest one + */ + generation = btrfs_super_generation(disk_super); + find_oldest_super_backup(fs_info, generation); + + /* * In the long term, we'll store the compression type in the super * block, and it'll be used for per file compression control. */ @@ -1870,6 +2179,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta", fs_info->thread_pool_size, &fs_info->generic_worker); + btrfs_init_workers(&fs_info->readahead_workers, "readahead", + fs_info->thread_pool_size, + &fs_info->generic_worker); /* * endios are largely parallel and should have a very @@ -1880,19 +2192,29 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->endio_write_workers.idle_thresh = 2; fs_info->endio_meta_write_workers.idle_thresh = 2; + fs_info->readahead_workers.idle_thresh = 2; - btrfs_start_workers(&fs_info->workers, 1); - btrfs_start_workers(&fs_info->generic_worker, 1); - btrfs_start_workers(&fs_info->submit_workers, 1); - btrfs_start_workers(&fs_info->delalloc_workers, 1); - btrfs_start_workers(&fs_info->fixup_workers, 1); - btrfs_start_workers(&fs_info->endio_workers, 1); - btrfs_start_workers(&fs_info->endio_meta_workers, 1); - btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); - btrfs_start_workers(&fs_info->endio_write_workers, 1); - btrfs_start_workers(&fs_info->endio_freespace_worker, 1); - btrfs_start_workers(&fs_info->delayed_workers, 1); - btrfs_start_workers(&fs_info->caching_workers, 1); + /* + * btrfs_start_workers can really only fail because of ENOMEM so just + * return -ENOMEM if any of these fail. 
+ */ + ret = btrfs_start_workers(&fs_info->workers); + ret |= btrfs_start_workers(&fs_info->generic_worker); + ret |= btrfs_start_workers(&fs_info->submit_workers); + ret |= btrfs_start_workers(&fs_info->delalloc_workers); + ret |= btrfs_start_workers(&fs_info->fixup_workers); + ret |= btrfs_start_workers(&fs_info->endio_workers); + ret |= btrfs_start_workers(&fs_info->endio_meta_workers); + ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); + ret |= btrfs_start_workers(&fs_info->endio_write_workers); + ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); + ret |= btrfs_start_workers(&fs_info->delayed_workers); + ret |= btrfs_start_workers(&fs_info->caching_workers); + ret |= btrfs_start_workers(&fs_info->readahead_workers); + if (ret) { + ret = -ENOMEM; + goto fail_sb_buffer; + } fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, @@ -1939,7 +2261,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", sb->s_id); - goto fail_chunk_root; + goto fail_tree_roots; } btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); chunk_root->commit_root = btrfs_root_node(chunk_root); @@ -1954,11 +2276,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (ret) { printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", sb->s_id); - goto fail_chunk_root; + goto fail_tree_roots; } btrfs_close_extra_devices(fs_devices); +retry_root_backup: blocksize = btrfs_level_size(tree_root, btrfs_super_root_level(disk_super)); generation = btrfs_super_generation(disk_super); @@ -1966,32 +2289,33 @@ struct btrfs_root *open_ctree(struct super_block *sb, tree_root->node = read_tree_block(tree_root, btrfs_super_root(disk_super), blocksize, generation); - if (!tree_root->node) - goto fail_chunk_root; - if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { + if (!tree_root->node || + !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", sb->s_id); - goto fail_tree_root; + + goto recovery_tree_root; } + btrfs_set_root_node(&tree_root->root_item, tree_root->node); tree_root->commit_root = btrfs_root_node(tree_root); ret = find_and_setup_root(tree_root, fs_info, BTRFS_EXTENT_TREE_OBJECTID, extent_root); if (ret) - goto fail_tree_root; + goto recovery_tree_root; extent_root->track_dirty = 1; ret = find_and_setup_root(tree_root, fs_info, BTRFS_DEV_TREE_OBJECTID, dev_root); if (ret) - goto fail_extent_root; + goto recovery_tree_root; dev_root->track_dirty = 1; ret = find_and_setup_root(tree_root, fs_info, BTRFS_CSUM_TREE_OBJECTID, csum_root); if (ret) - goto fail_dev_root; + goto recovery_tree_root; csum_root->track_dirty = 1; @@ -2124,22 +2448,13 @@ fail_cleaner: fail_block_groups: btrfs_free_block_groups(fs_info); - free_extent_buffer(csum_root->node); - free_extent_buffer(csum_root->commit_root); -fail_dev_root: - free_extent_buffer(dev_root->node); - free_extent_buffer(dev_root->commit_root); -fail_extent_root: - free_extent_buffer(extent_root->node); - free_extent_buffer(extent_root->commit_root); -fail_tree_root: - free_extent_buffer(tree_root->node); - free_extent_buffer(tree_root->commit_root); -fail_chunk_root: - free_extent_buffer(chunk_root->node); - free_extent_buffer(chunk_root->commit_root); + +fail_tree_roots: + free_root_pointers(fs_info, 1); + fail_sb_buffer: 
btrfs_stop_workers(&fs_info->generic_worker); + btrfs_stop_workers(&fs_info->readahead_workers); btrfs_stop_workers(&fs_info->fixup_workers); btrfs_stop_workers(&fs_info->delalloc_workers); btrfs_stop_workers(&fs_info->workers); @@ -2152,25 +2467,37 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->delayed_workers); btrfs_stop_workers(&fs_info->caching_workers); fail_alloc: - kfree(fs_info->delayed_root); fail_iput: + btrfs_mapping_tree_free(&fs_info->mapping_tree); + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); - - btrfs_close_devices(fs_info->fs_devices); - btrfs_mapping_tree_free(&fs_info->mapping_tree); fail_bdi: bdi_destroy(&fs_info->bdi); fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: - kfree(extent_root); - kfree(tree_root); - kfree(fs_info); - kfree(chunk_root); - kfree(dev_root); - kfree(csum_root); + btrfs_close_devices(fs_info->fs_devices); + free_fs_info(fs_info); return ERR_PTR(err); + +recovery_tree_root: + if (!btrfs_test_opt(tree_root, RECOVERY)) + goto fail_tree_roots; + + free_root_pointers(fs_info, 0); + + /* don't use the log in recovery mode, it won't be valid */ + btrfs_set_super_log_root(disk_super, 0); + + /* we can't trust the free space cache either */ + btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE); + + ret = next_root_backup(fs_info, fs_info->super_copy, + &num_backups_tried, &backup_index); + if (ret == -1) + goto fail_block_groups; + goto retry_root_backup; } static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) @@ -2254,22 +2581,10 @@ static int write_dev_supers(struct btrfs_device *device, int errors = 0; u32 crc; u64 bytenr; - int last_barrier = 0; if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; - /* make sure only the last submit_bh does a barrier */ - if (do_barriers) { - for (i = 0; i < max_mirrors; i++) { - bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= - device->total_bytes) - break; - last_barrier = i; - } - } - for (i = 0; i < max_mirrors; i++) { bytenr = btrfs_sb_offset(i); if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) @@ -2315,17 +2630,136 @@ static int write_dev_supers(struct btrfs_device *device, bh->b_end_io = btrfs_end_buffer_write_sync; } - if (i == last_barrier && do_barriers) - ret = submit_bh(WRITE_FLUSH_FUA, bh); - else - ret = submit_bh(WRITE_SYNC, bh); - + /* + * we fua the first super. The others we allow + * to go down lazy. + */ + ret = submit_bh(WRITE_FUA, bh); if (ret) errors++; } return errors < i ? 0 : -1; } +/* + * endio for the write_dev_flush, this will wake anyone waiting + * for the barrier when it is done + */ +static void btrfs_end_empty_barrier(struct bio *bio, int err) +{ + if (err) { + if (err == -EOPNOTSUPP) + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + clear_bit(BIO_UPTODATE, &bio->bi_flags); + } + if (bio->bi_private) + complete(bio->bi_private); + bio_put(bio); +} + +/* + * trigger flushes for one the devices. If you pass wait == 0, the flushes are + * sent down. With wait == 1, it waits for the previous flush. 
+ * + * any device where the flush fails with eopnotsupp are flagged as not-barrier + * capable + */ +static int write_dev_flush(struct btrfs_device *device, int wait) +{ + struct bio *bio; + int ret = 0; + + if (device->nobarriers) + return 0; + + if (wait) { + bio = device->flush_bio; + if (!bio) + return 0; + + wait_for_completion(&device->flush_wait); + + if (bio_flagged(bio, BIO_EOPNOTSUPP)) { + printk("btrfs: disabling barriers on dev %s\n", + device->name); + device->nobarriers = 1; + } + if (!bio_flagged(bio, BIO_UPTODATE)) { + ret = -EIO; + } + + /* drop the reference from the wait == 0 run */ + bio_put(bio); + device->flush_bio = NULL; + + return ret; + } + + /* + * one reference for us, and we leave it for the + * caller + */ + device->flush_bio = NULL;; + bio = bio_alloc(GFP_NOFS, 0); + if (!bio) + return -ENOMEM; + + bio->bi_end_io = btrfs_end_empty_barrier; + bio->bi_bdev = device->bdev; + init_completion(&device->flush_wait); + bio->bi_private = &device->flush_wait; + device->flush_bio = bio; + + bio_get(bio); + submit_bio(WRITE_FLUSH, bio); + + return 0; +} + +/* + * send an empty flush down to each device in parallel, + * then wait for them + */ +static int barrier_all_devices(struct btrfs_fs_info *info) +{ + struct list_head *head; + struct btrfs_device *dev; + int errors = 0; + int ret; + + /* send down all the barriers */ + head = &info->fs_devices->devices; + list_for_each_entry_rcu(dev, head, dev_list) { + if (!dev->bdev) { + errors++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_flush(dev, 0); + if (ret) + errors++; + } + + /* wait for all the barriers */ + list_for_each_entry_rcu(dev, head, dev_list) { + if (!dev->bdev) { + errors++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_flush(dev, 1); + if (ret) + errors++; + } + if (errors) + return -EIO; + return 0; +} + int write_all_supers(struct btrfs_root *root, int max_mirrors) { struct list_head *head; @@ -2338,14 +2772,19 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) int total_errors = 0; u64 flags; - max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; + max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1; do_barriers = !btrfs_test_opt(root, NOBARRIER); + backup_super_roots(root->fs_info); - sb = &root->fs_info->super_for_commit; + sb = root->fs_info->super_for_commit; dev_item = &sb->dev_item; mutex_lock(&root->fs_info->fs_devices->device_list_mutex); head = &root->fs_info->fs_devices->devices; + + if (do_barriers) + barrier_all_devices(root->fs_info); + list_for_each_entry_rcu(dev, head, dev_list) { if (!dev->bdev) { total_errors++; @@ -2545,8 +2984,6 @@ int close_ctree(struct btrfs_root *root) /* clear out the rbtree of defraggable inodes */ btrfs_run_defrag_inodes(root->fs_info); - btrfs_put_block_group_cache(fs_info); - /* * Here come 2 situations when btrfs is broken to flip readonly: * @@ -2572,6 +3009,8 @@ int close_ctree(struct btrfs_root *root) printk(KERN_ERR "btrfs: commit super ret %d\n", ret); } + btrfs_put_block_group_cache(fs_info); + kthread_stop(root->fs_info->transaction_kthread); kthread_stop(root->fs_info->cleaner_kthread); @@ -2603,7 +3042,6 @@ int close_ctree(struct btrfs_root *root) del_fs_roots(fs_info); iput(fs_info->btree_inode); - kfree(fs_info->delayed_root); btrfs_stop_workers(&fs_info->generic_worker); btrfs_stop_workers(&fs_info->fixup_workers); @@ -2617,6 +3055,7 @@ int close_ctree(struct btrfs_root *root) 
btrfs_stop_workers(&fs_info->submit_workers); btrfs_stop_workers(&fs_info->delayed_workers); btrfs_stop_workers(&fs_info->caching_workers); + btrfs_stop_workers(&fs_info->readahead_workers); btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); @@ -2624,12 +3063,7 @@ int close_ctree(struct btrfs_root *root) bdi_destroy(&fs_info->bdi); cleanup_srcu_struct(&fs_info->subvol_srcu); - kfree(fs_info->extent_root); - kfree(fs_info->tree_root); - kfree(fs_info->chunk_root); - kfree(fs_info->dev_root); - kfree(fs_info->csum_root); - kfree(fs_info); + free_fs_info(fs_info); return 0; } @@ -2735,7 +3169,8 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) return ret; } -int btree_lock_page_hook(struct page *page) +static int btree_lock_page_hook(struct page *page, void *data, + void (*flush_fn)(void *)) { struct inode *inode = page->mapping->host; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -2752,7 +3187,10 @@ int btree_lock_page_hook(struct page *page) if (!eb) goto out; - btrfs_tree_lock(eb); + if (!btrfs_try_tree_write_lock(eb)) { + flush_fn(data); + btrfs_tree_lock(eb); + } btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { @@ -2767,7 +3205,10 @@ int btree_lock_page_hook(struct page *page) btrfs_tree_unlock(eb); free_extent_buffer(eb); out: - lock_page(page); + if (!trylock_page(page)) { + flush_fn(data); + lock_page(page); + } return 0; } @@ -3123,6 +3564,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) static struct extent_io_ops btree_extent_io_ops = { .write_cache_pages_lock_hook = btree_lock_page_hook, .readpage_end_io_hook = btree_readpage_end_io_hook, + .readpage_io_failed_hook = btree_io_failed_hook, .submit_bio_hook = btree_submit_bio_hook, /* note we're sharing with inode.c for the merge bio hook */ .merge_bio_hook = btrfs_merge_bio_hook, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index bec3ea4bd67f..c99d0a8f13fa 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -40,6 +40,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, u64 parent_transid); int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, u64 parent_transid); +int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, + int mirror_num, struct extent_buffer **eb); struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); int clean_tree_block(struct btrfs_trans_handle *trans, @@ -83,8 +85,6 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btree_lock_page_hook(struct page *page); - #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_init_lockdep(void); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f5be06a2462f..f5fbe576d2ba 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -23,6 +23,7 @@ #include <linux/rcupdate.h> #include <linux/kthread.h> #include <linux/slab.h> +#include <linux/ratelimit.h> #include "compat.h" #include "hash.h" #include "ctree.h" @@ -52,6 +53,21 @@ enum { CHUNK_ALLOC_LIMITED = 2, }; +/* + * Control how reservations are dealt with. + * + * RESERVE_FREE - freeing a reservation. 
+ * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for + * ENOSPC accounting + * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update + * bytes_may_use as the ENOSPC accounting is done elsewhere + */ +enum { + RESERVE_FREE = 0, + RESERVE_ALLOC = 1, + RESERVE_ALLOC_NO_ACCOUNT = 2, +}; + static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc); @@ -81,6 +97,8 @@ static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int dump_block_groups); +static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -104,7 +122,6 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache) if (atomic_dec_and_test(&cache->count)) { WARN_ON(cache->pinned > 0); WARN_ON(cache->reserved > 0); - WARN_ON(cache->reserved_pinned > 0); kfree(cache->free_space_ctl); kfree(cache); } @@ -450,13 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, struct btrfs_root *root, int load_cache_only) { + DEFINE_WAIT(wait); struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_caching_control *caching_ctl; int ret = 0; - smp_mb(); - if (cache->cached != BTRFS_CACHE_NO) + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); + BUG_ON(!caching_ctl); + + INIT_LIST_HEAD(&caching_ctl->list); + mutex_init(&caching_ctl->mutex); + init_waitqueue_head(&caching_ctl->wait); + caching_ctl->block_group = cache; + caching_ctl->progress = cache->key.objectid; + atomic_set(&caching_ctl->count, 1); + caching_ctl->work.func = caching_thread; + + spin_lock(&cache->lock); + /* + * This should be a rare occasion, but this could happen I think in the + * case where one thread starts to load the space cache info, and then + * some other thread starts a transaction commit which tries to do an + * allocation while the other thread is still loading the space cache + * info. The previous loop should have kept us from choosing this block + * group, but if we've moved to the state where we will wait on caching + * block groups we need to first check if we're doing a fast load here, + * so we can wait for it to finish, otherwise we could end up allocating + * from a block group who's cache gets evicted for one reason or + * another. + */ + while (cache->cached == BTRFS_CACHE_FAST) { + struct btrfs_caching_control *ctl; + + ctl = cache->caching_ctl; + atomic_inc(&ctl->count); + prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&cache->lock); + + schedule(); + + finish_wait(&ctl->wait, &wait); + put_caching_control(ctl); + spin_lock(&cache->lock); + } + + if (cache->cached != BTRFS_CACHE_NO) { + spin_unlock(&cache->lock); + kfree(caching_ctl); return 0; + } + WARN_ON(cache->caching_ctl); + cache->caching_ctl = caching_ctl; + cache->cached = BTRFS_CACHE_FAST; + spin_unlock(&cache->lock); /* * We can't do the read from on-disk cache during a commit since we need @@ -465,57 +528,53 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, * we likely hold important locks. 
*/ if (trans && (!trans->transaction->in_commit) && - (root && root != root->fs_info->tree_root)) { - spin_lock(&cache->lock); - if (cache->cached != BTRFS_CACHE_NO) { - spin_unlock(&cache->lock); - return 0; - } - cache->cached = BTRFS_CACHE_STARTED; - spin_unlock(&cache->lock); - + (root && root != root->fs_info->tree_root) && + btrfs_test_opt(root, SPACE_CACHE)) { ret = load_free_space_cache(fs_info, cache); spin_lock(&cache->lock); if (ret == 1) { + cache->caching_ctl = NULL; cache->cached = BTRFS_CACHE_FINISHED; cache->last_byte_to_unpin = (u64)-1; } else { - cache->cached = BTRFS_CACHE_NO; + if (load_cache_only) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_NO; + } else { + cache->cached = BTRFS_CACHE_STARTED; + } } spin_unlock(&cache->lock); + wake_up(&caching_ctl->wait); if (ret == 1) { + put_caching_control(caching_ctl); free_excluded_extents(fs_info->extent_root, cache); return 0; } + } else { + /* + * We are not going to do the fast caching, set cached to the + * appropriate value and wakeup any waiters. + */ + spin_lock(&cache->lock); + if (load_cache_only) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_NO; + } else { + cache->cached = BTRFS_CACHE_STARTED; + } + spin_unlock(&cache->lock); + wake_up(&caching_ctl->wait); } - if (load_cache_only) - return 0; - - caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); - BUG_ON(!caching_ctl); - - INIT_LIST_HEAD(&caching_ctl->list); - mutex_init(&caching_ctl->mutex); - init_waitqueue_head(&caching_ctl->wait); - caching_ctl->block_group = cache; - caching_ctl->progress = cache->key.objectid; - /* one for caching kthread, one for caching block group list */ - atomic_set(&caching_ctl->count, 2); - caching_ctl->work.func = caching_thread; - - spin_lock(&cache->lock); - if (cache->cached != BTRFS_CACHE_NO) { - spin_unlock(&cache->lock); - kfree(caching_ctl); + if (load_cache_only) { + put_caching_control(caching_ctl); return 0; } - cache->caching_ctl = caching_ctl; - cache->cached = BTRFS_CACHE_STARTED; - spin_unlock(&cache->lock); down_write(&fs_info->extent_commit_sem); + atomic_inc(&caching_ctl->count); list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); up_write(&fs_info->extent_commit_sem); @@ -1770,18 +1829,18 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, { int ret; u64 discarded_bytes = 0; - struct btrfs_multi_bio *multi = NULL; + struct btrfs_bio *bbio = NULL; /* Tell the block device(s) that the sectors can be discarded */ ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, - bytenr, &num_bytes, &multi, 0); + bytenr, &num_bytes, &bbio, 0); if (!ret) { - struct btrfs_bio_stripe *stripe = multi->stripes; + struct btrfs_bio_stripe *stripe = bbio->stripes; int i; - for (i = 0; i < multi->num_stripes; i++, stripe++) { + for (i = 0; i < bbio->num_stripes; i++, stripe++) { if (!stripe->dev->can_discard) continue; @@ -1800,7 +1859,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, */ ret = 0; } - kfree(multi); + kfree(bbio); } if (actual_bytes) @@ -2700,6 +2759,13 @@ again: goto again; } + /* We've already setup this transaction, go ahead and exit */ + if (block_group->cache_generation == trans->transid && + i_size_read(inode)) { + dcs = BTRFS_DC_SETUP; + goto out_put; + } + /* * We want to set the generation to 0, that way if anything goes wrong * from here on out we know not to trust this cache when we load up next @@ -2749,12 +2815,15 @@ again: if (!ret) dcs = BTRFS_DC_SETUP; btrfs_free_reserved_data_space(inode, num_pages); + 
out_put: iput(inode); out_free: btrfs_release_path(path); out: spin_lock(&block_group->lock); + if (!ret && dcs == BTRFS_DC_SETUP) + block_group->cache_generation = trans->transid; block_group->disk_cache_state = dcs; spin_unlock(&block_group->lock); @@ -3122,16 +3191,13 @@ commit_trans: return -ENOSPC; } data_sinfo->bytes_may_use += bytes; - BTRFS_I(inode)->reserved_bytes += bytes; spin_unlock(&data_sinfo->lock); return 0; } /* - * called when we are clearing an delalloc extent from the - * inode's io_tree or there was an error for whatever reason - * after calling btrfs_check_data_free_space + * Called if we need to clear a data reservation for this inode. */ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) { @@ -3144,7 +3210,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) data_sinfo = BTRFS_I(inode)->space_info; spin_lock(&data_sinfo->lock); data_sinfo->bytes_may_use -= bytes; - BTRFS_I(inode)->reserved_bytes -= bytes; spin_unlock(&data_sinfo->lock); } @@ -3165,6 +3230,7 @@ static int should_alloc_chunk(struct btrfs_root *root, struct btrfs_space_info *sinfo, u64 alloc_bytes, int force) { + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; u64 thresh; @@ -3173,11 +3239,18 @@ static int should_alloc_chunk(struct btrfs_root *root, return 1; /* + * We need to take into account the global rsv because for all intents + * and purposes it's used space. Don't worry about locking the + * global_rsv, it doesn't change except when the transaction commits. + */ + num_allocated += global_rsv->size; + + /* * in limited mode, we want to have some free space up to * about 1% of the FS size. 
*/ if (force == CHUNK_ALLOC_LIMITED) { - thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); + thresh = btrfs_super_total_bytes(root->fs_info->super_copy); thresh = max_t(u64, 64 * 1024 * 1024, div_factor_fine(thresh, 1)); @@ -3199,7 +3272,7 @@ static int should_alloc_chunk(struct btrfs_root *root, if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) return 0; - thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); + thresh = btrfs_super_total_bytes(root->fs_info->super_copy); /* 256MB or 5% of the FS */ thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); @@ -3302,24 +3375,26 @@ out: /* * shrink metadata reservation for delalloc */ -static int shrink_delalloc(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 to_reclaim, int sync) +static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, + bool wait_ordered) { struct btrfs_block_rsv *block_rsv; struct btrfs_space_info *space_info; + struct btrfs_trans_handle *trans; u64 reserved; u64 max_reclaim; u64 reclaimed = 0; long time_left; - int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; int loops = 0; unsigned long progress; + trans = (struct btrfs_trans_handle *)current->journal_info; block_rsv = &root->fs_info->delalloc_block_rsv; space_info = block_rsv->space_info; smp_mb(); - reserved = space_info->bytes_reserved; + reserved = space_info->bytes_may_use; progress = space_info->reservation_progress; if (reserved == 0) @@ -3334,18 +3409,20 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, } max_reclaim = min(reserved, to_reclaim); - + nr_pages = max_t(unsigned long, nr_pages, + max_reclaim >> PAGE_CACHE_SHIFT); while (loops < 1024) { /* have the flusher threads jump in and do some IO */ smp_mb(); nr_pages = min_t(unsigned long, nr_pages, root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT); - writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); + writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, + WB_REASON_FS_FREE_SPACE); spin_lock(&space_info->lock); - if (reserved > space_info->bytes_reserved) - reclaimed += reserved - space_info->bytes_reserved; - reserved = space_info->bytes_reserved; + if (reserved > space_info->bytes_may_use) + reclaimed += reserved - space_info->bytes_may_use; + reserved = space_info->bytes_may_use; spin_unlock(&space_info->lock); loops++; @@ -3356,11 +3433,15 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, if (trans && trans->transaction->blocked) return -EAGAIN; - time_left = schedule_timeout_interruptible(1); + if (wait_ordered && !trans) { + btrfs_wait_ordered_extents(root, 0, 0); + } else { + time_left = schedule_timeout_interruptible(1); - /* We were interrupted, exit */ - if (time_left) - break; + /* We were interrupted, exit */ + if (time_left) + break; + } /* we've kicked the IO a few times, if anything has been freed, * exit. There is no sense in looping here for a long time @@ -3375,34 +3456,90 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, } } - if (reclaimed >= to_reclaim && !trans) - btrfs_wait_ordered_extents(root, 0, 0); + return reclaimed >= to_reclaim; } -/* - * Retries tells us how many times we've called reserve_metadata_bytes. The - * idea is if this is the first call (retries == 0) then we will add to our - * reserved count if we can't make the allocation in order to hold our place - * while we go and try and free up space. 
That way for retries > 1 we don't try - * and add space, we just check to see if the amount of unused space is >= the - * total space, meaning that our reservation is valid. +/** + * maybe_commit_transaction - possibly commit the transaction if its ok to + * @root - the root we're allocating for + * @bytes - the number of bytes we want to reserve + * @force - force the commit * - * However if we don't intend to retry this reservation, pass -1 as retries so - * that it short circuits this logic. + * This will check to make sure that committing the transaction will actually + * get us somewhere and then commit the transaction if it does. Otherwise it + * will return -ENOSPC. */ -static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, +static int may_commit_transaction(struct btrfs_root *root, + struct btrfs_space_info *space_info, + u64 bytes, int force) +{ + struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv; + struct btrfs_trans_handle *trans; + + trans = (struct btrfs_trans_handle *)current->journal_info; + if (trans) + return -EAGAIN; + + if (force) + goto commit; + + /* See if there is enough pinned space to make this reservation */ + spin_lock(&space_info->lock); + if (space_info->bytes_pinned >= bytes) { + spin_unlock(&space_info->lock); + goto commit; + } + spin_unlock(&space_info->lock); + + /* + * See if there is some space in the delayed insertion reservation for + * this reservation. + */ + if (space_info != delayed_rsv->space_info) + return -ENOSPC; + + spin_lock(&delayed_rsv->lock); + if (delayed_rsv->size < bytes) { + spin_unlock(&delayed_rsv->lock); + return -ENOSPC; + } + spin_unlock(&delayed_rsv->lock); + +commit: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return -ENOSPC; + + return btrfs_commit_transaction(trans, root); +} + +/** + * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space + * @root - the root we're allocating for + * @block_rsv - the block_rsv we're allocating for + * @orig_bytes - the number of bytes we want + * @flush - wether or not we can flush to make our reservation + * + * This will reserve orgi_bytes number of bytes from the space info associated + * with the block_rsv. If there is not enough space it will make an attempt to + * flush out space to make room. It will do this by flushing delalloc if + * possible or committing the transaction. If flush is 0 then no attempts to + * regain reservations will be made and this will fail if there is not enough + * space already. + */ +static int reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 orig_bytes, int flush) { struct btrfs_space_info *space_info = block_rsv->space_info; - u64 unused; + u64 used; u64 num_bytes = orig_bytes; int retries = 0; int ret = 0; bool committed = false; bool flushing = false; + bool wait_ordered = false; again: ret = 0; @@ -3419,7 +3556,7 @@ again: * deadlock since we are waiting for the flusher to finish, but * hold the current transaction open. 
*/ - if (trans) + if (current->journal_info) return -EAGAIN; ret = wait_event_interruptible(space_info->wait, !space_info->flush); @@ -3431,9 +3568,9 @@ again: } ret = -ENOSPC; - unused = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; /* * The idea here is that we've not already over-reserved the block group @@ -3442,10 +3579,9 @@ again: * lets start flushing stuff first and then come back and try to make * our reservation. */ - if (unused <= space_info->total_bytes) { - unused = space_info->total_bytes - unused; - if (unused >= num_bytes) { - space_info->bytes_reserved += orig_bytes; + if (used <= space_info->total_bytes) { + if (used + orig_bytes <= space_info->total_bytes) { + space_info->bytes_may_use += orig_bytes; ret = 0; } else { /* @@ -3461,10 +3597,64 @@ again: * amount plus the amount of bytes that we need for this * reservation. */ - num_bytes = unused - space_info->total_bytes + + wait_ordered = true; + num_bytes = used - space_info->total_bytes + (orig_bytes * (retries + 1)); } + if (ret) { + u64 profile = btrfs_get_alloc_profile(root, 0); + u64 avail; + + /* + * If we have a lot of space that's pinned, don't bother doing + * the overcommit dance yet and just commit the transaction. + */ + avail = (space_info->total_bytes - space_info->bytes_used) * 8; + do_div(avail, 10); + if (space_info->bytes_pinned >= avail && flush && !committed) { + space_info->flush = 1; + flushing = true; + spin_unlock(&space_info->lock); + ret = may_commit_transaction(root, space_info, + orig_bytes, 1); + if (ret) + goto out; + committed = true; + goto again; + } + + spin_lock(&root->fs_info->free_chunk_lock); + avail = root->fs_info->free_chunk_space; + + /* + * If we have dup, raid1 or raid10 then only half of the free + * space is actually useable. + */ + if (profile & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + avail >>= 1; + + /* + * If we aren't flushing don't let us overcommit too much, say + * 1/8th of the space. If we can flush, let it overcommit up to + * 1/2 of the space. + */ + if (flush) + avail >>= 3; + else + avail >>= 1; + spin_unlock(&root->fs_info->free_chunk_lock); + + if (used + num_bytes < space_info->total_bytes + avail) { + space_info->bytes_may_use += orig_bytes; + ret = 0; + } else { + wait_ordered = true; + } + } + /* * Couldn't make our reservation, save our place so while we're trying * to reclaim space we can actually use it instead of somebody else @@ -3484,7 +3674,7 @@ again: * We do synchronous shrinking since we don't actually unreserve * metadata until after the IO is completed. */ - ret = shrink_delalloc(trans, root, num_bytes, 1); + ret = shrink_delalloc(root, num_bytes, wait_ordered); if (ret < 0) goto out; @@ -3496,35 +3686,17 @@ again: * so go back around and try again. */ if (retries < 2) { + wait_ordered = true; retries++; goto again; } - /* - * Not enough space to be reclaimed, don't bother committing the - * transaction. 
- */ - spin_lock(&space_info->lock); - if (space_info->bytes_pinned < orig_bytes) - ret = -ENOSPC; - spin_unlock(&space_info->lock); - if (ret) - goto out; - - ret = -EAGAIN; - if (trans) - goto out; - ret = -ENOSPC; if (committed) goto out; - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - goto out; - ret = btrfs_commit_transaction(trans, root); + ret = may_commit_transaction(root, space_info, orig_bytes, 0); if (!ret) { - trans = NULL; committed = true; goto again; } @@ -3542,10 +3714,12 @@ out: static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct btrfs_block_rsv *block_rsv; - if (root->ref_cows) + struct btrfs_block_rsv *block_rsv = NULL; + + if (root->ref_cows || root == root->fs_info->csum_root) block_rsv = trans->block_rsv; - else + + if (!block_rsv) block_rsv = root->block_rsv; if (!block_rsv) @@ -3616,7 +3790,7 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, } if (num_bytes) { spin_lock(&space_info->lock); - space_info->bytes_reserved -= num_bytes; + space_info->bytes_may_use -= num_bytes; space_info->reservation_progress++; spin_unlock(&space_info->lock); } @@ -3640,9 +3814,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) { memset(rsv, 0, sizeof(*rsv)); spin_lock_init(&rsv->lock); - atomic_set(&rsv->usage, 1); - rsv->priority = 6; - INIT_LIST_HEAD(&rsv->list); } struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) @@ -3663,38 +3834,20 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv) { - if (rsv && atomic_dec_and_test(&rsv->usage)) { - btrfs_block_rsv_release(root, rsv, (u64)-1); - if (!rsv->durable) - kfree(rsv); - } -} - -/* - * make the block_rsv struct be able to capture freed space. 
- * the captured space will re-add to the the block_rsv struct - * after transaction commit - */ -void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv) -{ - block_rsv->durable = 1; - mutex_lock(&fs_info->durable_block_rsv_mutex); - list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list); - mutex_unlock(&fs_info->durable_block_rsv_mutex); + btrfs_block_rsv_release(root, rsv, (u64)-1); + kfree(rsv); } -int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) +static inline int __block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, int flush) { int ret; if (num_bytes == 0) return 0; - ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 1); return 0; @@ -3703,55 +3856,80 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, return ret; } -int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved, int min_factor) +int btrfs_block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + return __block_rsv_add(root, block_rsv, num_bytes, 1); +} + +int btrfs_block_rsv_add_noflush(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + return __block_rsv_add(root, block_rsv, num_bytes, 0); +} + +int btrfs_block_rsv_check(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int min_factor) { u64 num_bytes = 0; - int commit_trans = 0; int ret = -ENOSPC; if (!block_rsv) return 0; spin_lock(&block_rsv->lock); - if (min_factor > 0) - num_bytes = div_factor(block_rsv->size, min_factor); - if (min_reserved > num_bytes) - num_bytes = min_reserved; + num_bytes = div_factor(block_rsv->size, min_factor); + if (block_rsv->reserved >= num_bytes) + ret = 0; + spin_unlock(&block_rsv->lock); - if (block_rsv->reserved >= num_bytes) { + return ret; +} + +static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved, int flush) +{ + u64 num_bytes = 0; + int ret = -ENOSPC; + + if (!block_rsv) + return 0; + + spin_lock(&block_rsv->lock); + num_bytes = min_reserved; + if (block_rsv->reserved >= num_bytes) ret = 0; - } else { + else num_bytes -= block_rsv->reserved; - if (block_rsv->durable && - block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes) - commit_trans = 1; - } spin_unlock(&block_rsv->lock); + if (!ret) return 0; - if (block_rsv->refill_used) { - ret = reserve_metadata_bytes(trans, root, block_rsv, - num_bytes, 0); - if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, 0); - return 0; - } - } - - if (commit_trans) { - if (trans) - return -EAGAIN; - trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); - ret = btrfs_commit_transaction(trans, root); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 0); return 0; } - return -ENOSPC; + return ret; +} + +int btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved) +{ + return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1); +} + +int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved) +{ + return __btrfs_block_rsv_refill(root, 
block_rsv, min_reserved, 0); } int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, @@ -3783,7 +3961,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) u64 num_bytes; u64 meta_used; u64 data_used; - int csum_size = btrfs_super_csum_size(&fs_info->super_copy); + int csum_size = btrfs_super_csum_size(fs_info->super_copy); sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); spin_lock(&sinfo->lock); @@ -3827,12 +4005,12 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) if (sinfo->total_bytes > num_bytes) { num_bytes = sinfo->total_bytes - num_bytes; block_rsv->reserved += num_bytes; - sinfo->bytes_reserved += num_bytes; + sinfo->bytes_may_use += num_bytes; } if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; - sinfo->bytes_reserved -= num_bytes; + sinfo->bytes_may_use -= num_bytes; sinfo->reservation_progress++; block_rsv->reserved = block_rsv->size; block_rsv->full = 1; @@ -3848,16 +4026,13 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); fs_info->chunk_block_rsv.space_info = space_info; - fs_info->chunk_block_rsv.priority = 10; space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); fs_info->global_block_rsv.space_info = space_info; - fs_info->global_block_rsv.priority = 10; - fs_info->global_block_rsv.refill_used = 1; fs_info->delalloc_block_rsv.space_info = space_info; fs_info->trans_block_rsv.space_info = space_info; fs_info->empty_block_rsv.space_info = space_info; - fs_info->empty_block_rsv.priority = 10; + fs_info->delayed_block_rsv.space_info = space_info; fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; @@ -3865,10 +4040,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; - btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv); - - btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv); - update_global_block_rsv(fs_info); } @@ -3881,37 +4052,8 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) WARN_ON(fs_info->trans_block_rsv.reserved > 0); WARN_ON(fs_info->chunk_block_rsv.size > 0); WARN_ON(fs_info->chunk_block_rsv.reserved > 0); -} - -int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_block_rsv *rsv) -{ - struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv; - u64 num_bytes; - int ret; - - /* - * Truncate should be freeing data, but give us 2 items just in case it - * needs to use some space. We may want to be smarter about this in the - * future. - */ - num_bytes = btrfs_calc_trans_metadata_size(root, 2); - - /* We already have enough bytes, just return */ - if (rsv->reserved >= num_bytes) - return 0; - - num_bytes -= rsv->reserved; - - /* - * You should have reserved enough space before hand to do this, so this - * should not fail. 
- */ - ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes); - BUG_ON(ret); - - return 0; + WARN_ON(fs_info->delayed_block_rsv.size > 0); + WARN_ON(fs_info->delayed_block_rsv.reserved > 0); } void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, @@ -3920,9 +4062,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, if (!trans->bytes_reserved) return; - BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv); - btrfs_block_rsv_release(root, trans->block_rsv, - trans->bytes_reserved); + btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); trans->bytes_reserved = 0; } @@ -3964,33 +4104,99 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); } +/** + * drop_outstanding_extent - drop an outstanding extent + * @inode: the inode we're dropping the extent for + * + * This is called when we are freeing up an outstanding extent, either called + * after an error or after an extent is written. This will return the number of + * reserved extents that need to be freed. This must be called with + * BTRFS_I(inode)->lock held. + */ static unsigned drop_outstanding_extent(struct inode *inode) { + unsigned drop_inode_space = 0; unsigned dropped_extents = 0; - spin_lock(&BTRFS_I(inode)->lock); BUG_ON(!BTRFS_I(inode)->outstanding_extents); BTRFS_I(inode)->outstanding_extents--; + if (BTRFS_I(inode)->outstanding_extents == 0 && + BTRFS_I(inode)->delalloc_meta_reserved) { + drop_inode_space = 1; + BTRFS_I(inode)->delalloc_meta_reserved = 0; + } + /* * If we have more or the same amount of outsanding extents than we have * reserved then we need to leave the reserved extents count alone. */ if (BTRFS_I(inode)->outstanding_extents >= BTRFS_I(inode)->reserved_extents) - goto out; + return drop_inode_space; dropped_extents = BTRFS_I(inode)->reserved_extents - BTRFS_I(inode)->outstanding_extents; BTRFS_I(inode)->reserved_extents -= dropped_extents; -out: - spin_unlock(&BTRFS_I(inode)->lock); - return dropped_extents; + return dropped_extents + drop_inode_space; } -static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) +/** + * calc_csum_metadata_size - return the amount of metada space that must be + * reserved/free'd for the given bytes. + * @inode: the inode we're manipulating + * @num_bytes: the number of bytes in question + * @reserve: 1 if we are reserving space, 0 if we are freeing space + * + * This adjusts the number of csum_bytes in the inode and then returns the + * correct amount of metadata that must either be reserved or freed. We + * calculate how many checksums we can fit into one leaf and then divide the + * number of bytes that will need to be checksumed by this value to figure out + * how many checksums will be required. If we are adding bytes then the number + * may go up and we will return the number of additional bytes that must be + * reserved. If it is going down we will return the number of bytes that must + * be freed. + * + * This must be called with BTRFS_I(inode)->lock held. 
+ */ +static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, + int reserve) { - return num_bytes >>= 3; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 csum_size; + int num_csums_per_leaf; + int num_csums; + int old_csums; + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && + BTRFS_I(inode)->csum_bytes == 0) + return 0; + + old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); + if (reserve) + BTRFS_I(inode)->csum_bytes += num_bytes; + else + BTRFS_I(inode)->csum_bytes -= num_bytes; + csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); + num_csums_per_leaf = (int)div64_u64(csum_size, + sizeof(struct btrfs_csum_item) + + sizeof(struct btrfs_disk_key)); + num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); + num_csums = num_csums + num_csums_per_leaf - 1; + num_csums = num_csums / num_csums_per_leaf; + + old_csums = old_csums + num_csums_per_leaf - 1; + old_csums = old_csums / num_csums_per_leaf; + + /* No change, no need to reserve more */ + if (old_csums == num_csums) + return 0; + + if (reserve) + return btrfs_calc_trans_metadata_size(root, + num_csums - old_csums); + + return btrfs_calc_trans_metadata_size(root, old_csums - num_csums); } int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) @@ -3998,10 +4204,19 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; u64 to_reserve = 0; + u64 csum_bytes; unsigned nr_extents = 0; + int extra_reserve = 0; + int flush = 1; int ret; - if (btrfs_transaction_in_commit(root->fs_info)) + /* Need to be holding the i_mutex here if we aren't free space cache */ + if (btrfs_is_free_space_inode(root, inode)) + flush = 0; + else + WARN_ON(!mutex_is_locked(&inode->i_mutex)); + + if (flush && btrfs_transaction_in_commit(root->fs_info)) schedule_timeout(1); num_bytes = ALIGN(num_bytes, root->sectorsize); @@ -4010,33 +4225,74 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) BTRFS_I(inode)->outstanding_extents++; if (BTRFS_I(inode)->outstanding_extents > - BTRFS_I(inode)->reserved_extents) { + BTRFS_I(inode)->reserved_extents) nr_extents = BTRFS_I(inode)->outstanding_extents - BTRFS_I(inode)->reserved_extents; - BTRFS_I(inode)->reserved_extents += nr_extents; - to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); + /* + * Add an item to reserve for updating the inode when we complete the + * delalloc io. + */ + if (!BTRFS_I(inode)->delalloc_meta_reserved) { + nr_extents++; + extra_reserve = 1; } + + to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); + to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); + csum_bytes = BTRFS_I(inode)->csum_bytes; spin_unlock(&BTRFS_I(inode)->lock); - to_reserve += calc_csum_metadata_size(inode, num_bytes); - ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); + ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); if (ret) { + u64 to_free = 0; unsigned dropped; + + spin_lock(&BTRFS_I(inode)->lock); + dropped = drop_outstanding_extent(inode); /* - * We don't need the return value since our reservation failed, - * we just need to clean up our counter. + * If the inodes csum_bytes is the same as the original + * csum_bytes then we know we haven't raced with any free()ers + * so we can just reduce our inodes csum bytes and carry on. 
+ * Otherwise we have to do the normal free thing to account for + * the case that the free side didn't free up its reserve + * because of this outstanding reservation. */ - dropped = drop_outstanding_extent(inode); - WARN_ON(dropped > 1); + if (BTRFS_I(inode)->csum_bytes == csum_bytes) + calc_csum_metadata_size(inode, num_bytes, 0); + else + to_free = calc_csum_metadata_size(inode, num_bytes, 0); + spin_unlock(&BTRFS_I(inode)->lock); + if (dropped) + to_free += btrfs_calc_trans_metadata_size(root, dropped); + + if (to_free) + btrfs_block_rsv_release(root, block_rsv, to_free); return ret; } + spin_lock(&BTRFS_I(inode)->lock); + if (extra_reserve) { + BTRFS_I(inode)->delalloc_meta_reserved = 1; + nr_extents--; + } + BTRFS_I(inode)->reserved_extents += nr_extents; + spin_unlock(&BTRFS_I(inode)->lock); + block_rsv_add_bytes(block_rsv, to_reserve, 1); return 0; } +/** + * btrfs_delalloc_release_metadata - release a metadata reservation for an inode + * @inode: the inode to release the reservation for + * @num_bytes: the number of bytes we're releasing + * + * This will release the metadata reservation for an inode. This can be called + * once we complete IO for a given set of bytes to release their metadata + * reservations. + */ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4044,9 +4300,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) unsigned dropped; num_bytes = ALIGN(num_bytes, root->sectorsize); + spin_lock(&BTRFS_I(inode)->lock); dropped = drop_outstanding_extent(inode); - to_free = calc_csum_metadata_size(inode, num_bytes); + to_free = calc_csum_metadata_size(inode, num_bytes, 0); + spin_unlock(&BTRFS_I(inode)->lock); if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); @@ -4054,6 +4312,21 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) to_free); } +/** + * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc + * @inode: inode we're writing to + * @num_bytes: the number of bytes we want to allocate + * + * This will do the following things + * + * o reserve space in the data space info for num_bytes + * o reserve space in the metadata space info based on number of outstanding + * extents and how much csums will be needed + * o add to the inodes ->delalloc_bytes + * o add it to the fs_info's delalloc inodes list. + * + * This will return 0 for success and -ENOSPC if there is no space left. + */ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) { int ret; @@ -4071,6 +4344,19 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) return 0; } +/** + * btrfs_delalloc_release_space - release data and metadata space for delalloc + * @inode: inode we're releasing space for + * @num_bytes: the number of bytes we want to free up + * + * This must be matched with a call to btrfs_delalloc_reserve_space. This is + * called in the case that we don't need the metadata AND data reservations + * anymore. So if there is an error or we insert an inline extent. + * + * This function will release the metadata space that was not used and will + * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes + * list if there are no delalloc bytes left. 
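[Editor's note] For a sense of scale of the checksum portion of this reservation, the rounding described for calc_csum_metadata_size() can be mimicked in userspace. This is a minimal illustrative sketch, not kernel code: SECTORSIZE, the leaf payload and the item/key sizes are assumed example values standing in for root->sectorsize, BTRFS_LEAF_DATA_SIZE(), struct btrfs_item, struct btrfs_csum_item and struct btrfs_disk_key. Only the old-versus-new leaf-count delta is what gets reserved or freed; when the delta is zero nothing extra is reserved.

	#include <stdio.h>
	#include <stdint.h>

	#define SECTORSIZE     4096ULL  /* assumed root->sectorsize */
	#define LEAF_DATA      3995ULL  /* assumed leaf payload minus item header */
	#define CSUM_ITEM_SIZE 4ULL     /* assumed sizeof(struct btrfs_csum_item) */
	#define DISK_KEY_SIZE  17ULL    /* assumed sizeof(struct btrfs_disk_key) */

	static uint64_t csum_leaves(uint64_t csum_bytes)
	{
		uint64_t per_leaf = LEAF_DATA / (CSUM_ITEM_SIZE + DISK_KEY_SIZE);
		uint64_t csums = csum_bytes / SECTORSIZE;

		return (csums + per_leaf - 1) / per_leaf;  /* round up to whole leaves */
	}

	int main(void)
	{
		uint64_t old_bytes = 8ULL << 20;                /* csum_bytes already pending */
		uint64_t new_bytes = old_bytes + (1ULL << 20);  /* this reservation adds 1MB */
		uint64_t delta = csum_leaves(new_bytes) - csum_leaves(old_bytes);

		/* the kernel then reserves metadata space for 'delta' items */
		printf("extra csum leaves to reserve: %llu\n", (unsigned long long)delta);
		return 0;
	}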
+ */ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) { btrfs_delalloc_release_metadata(inode, num_bytes); @@ -4090,12 +4376,12 @@ static int update_block_group(struct btrfs_trans_handle *trans, /* block accounting for super block */ spin_lock(&info->delalloc_lock); - old_val = btrfs_super_bytes_used(&info->super_copy); + old_val = btrfs_super_bytes_used(info->super_copy); if (alloc) old_val += num_bytes; else old_val -= num_bytes; - btrfs_set_super_bytes_used(&info->super_copy, old_val); + btrfs_set_super_bytes_used(info->super_copy, old_val); spin_unlock(&info->delalloc_lock); while (total) { @@ -4123,7 +4409,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); - if (btrfs_super_cache_generation(&info->super_copy) != 0 && + if (btrfs_test_opt(root, SPACE_CACHE) && cache->disk_cache_state < BTRFS_DC_CLEAR) cache->disk_cache_state = BTRFS_DC_CLEAR; @@ -4135,7 +4421,6 @@ static int update_block_group(struct btrfs_trans_handle *trans, btrfs_set_block_group_used(&cache->item, old_val); cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; - cache->space_info->reservation_progress++; cache->space_info->bytes_used += num_bytes; cache->space_info->disk_used += num_bytes * factor; spin_unlock(&cache->lock); @@ -4187,7 +4472,6 @@ static int pin_down_extent(struct btrfs_root *root, if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; - cache->space_info->reservation_progress++; } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); @@ -4215,45 +4499,82 @@ int btrfs_pin_extent(struct btrfs_root *root, } /* - * update size of reserved extents. this function may return -EAGAIN - * if 'reserve' is true or 'sinfo' is false. + * this function must be called within transaction + */ +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes) +{ + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_block_group(root->fs_info, bytenr); + BUG_ON(!cache); + + /* + * pull in the free space cache (if any) so that our pin + * removes the free space from the cache. We have load_only set + * to one because the slow code to read in the free extents does check + * the pinned extents. + */ + cache_block_group(cache, trans, root, 1); + + pin_down_extent(root, cache, bytenr, num_bytes, 0); + + /* remove us from the free space cache (if we're there at all) */ + btrfs_remove_free_space(cache, bytenr, num_bytes); + btrfs_put_block_group(cache); + return 0; +} + +/** + * btrfs_update_reserved_bytes - update the block_group and space info counters + * @cache: The cache we are manipulating + * @num_bytes: The number of bytes in question + * @reserve: One of the reservation enums + * + * This is called by the allocator when it reserves space, or by somebody who is + * freeing space that was never actually used on disk. For example if you + * reserve some space for a new leaf in transaction A and before transaction A + * commits you free that leaf, you call this with reserve set to 0 in order to + * clear the reservation. + * + * Metadata reservations should be called with RESERVE_ALLOC so we do the proper + * ENOSPC accounting. For data we handle the reservation through clearing the + * delalloc bits in the io_tree. We have to do this since we could end up + * allocating less disk space for the amount of data we have reserved in the + * case of compression. 
+ * + * If this is a reservation and the block group has become read only we cannot + * make the reservation and return -EAGAIN, otherwise this function always + * succeeds. */ -int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve, int sinfo) +static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve) { + struct btrfs_space_info *space_info = cache->space_info; int ret = 0; - if (sinfo) { - struct btrfs_space_info *space_info = cache->space_info; - spin_lock(&space_info->lock); - spin_lock(&cache->lock); - if (reserve) { - if (cache->ro) { - ret = -EAGAIN; - } else { - cache->reserved += num_bytes; - space_info->bytes_reserved += num_bytes; - } - } else { - if (cache->ro) - space_info->bytes_readonly += num_bytes; - cache->reserved -= num_bytes; - space_info->bytes_reserved -= num_bytes; - space_info->reservation_progress++; - } - spin_unlock(&cache->lock); - spin_unlock(&space_info->lock); - } else { - spin_lock(&cache->lock); + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + if (reserve != RESERVE_FREE) { if (cache->ro) { ret = -EAGAIN; } else { - if (reserve) - cache->reserved += num_bytes; - else - cache->reserved -= num_bytes; + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + if (reserve == RESERVE_ALLOC) { + BUG_ON(space_info->bytes_may_use < num_bytes); + space_info->bytes_may_use -= num_bytes; + } } - spin_unlock(&cache->lock); + } else { + if (cache->ro) + space_info->bytes_readonly += num_bytes; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + space_info->reservation_progress++; } + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); return ret; } @@ -4319,13 +4640,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) spin_lock(&cache->lock); cache->pinned -= len; cache->space_info->bytes_pinned -= len; - if (cache->ro) { + if (cache->ro) cache->space_info->bytes_readonly += len; - } else if (cache->reserved_pinned > 0) { - len = min(len, cache->reserved_pinned); - cache->reserved_pinned -= len; - cache->space_info->bytes_reserved += len; - } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); } @@ -4340,11 +4656,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *unpin; - struct btrfs_block_rsv *block_rsv; - struct btrfs_block_rsv *next_rsv; u64 start; u64 end; - int idx; int ret; if (fs_info->pinned_extents == &fs_info->freed_extents[0]) @@ -4367,30 +4680,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, cond_resched(); } - mutex_lock(&fs_info->durable_block_rsv_mutex); - list_for_each_entry_safe(block_rsv, next_rsv, - &fs_info->durable_block_rsv_list, list) { - - idx = trans->transid & 0x1; - if (block_rsv->freed[idx] > 0) { - block_rsv_add_bytes(block_rsv, - block_rsv->freed[idx], 0); - block_rsv->freed[idx] = 0; - } - if (atomic_read(&block_rsv->usage) == 0) { - btrfs_block_rsv_release(root, block_rsv, (u64)-1); - - if (block_rsv->freed[0] == 0 && - block_rsv->freed[1] == 0) { - list_del_init(&block_rsv->list); - kfree(block_rsv); - } - } else { - btrfs_block_rsv_release(root, block_rsv, 0); - } - } - mutex_unlock(&fs_info->durable_block_rsv_mutex); - return 0; } @@ -4668,7 +4957,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf, u64 parent, int last_ref) { - struct btrfs_block_rsv *block_rsv; struct 
btrfs_block_group_cache *cache = NULL; int ret; @@ -4683,64 +4971,24 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (!last_ref) return; - block_rsv = get_block_rsv(trans, root); cache = btrfs_lookup_block_group(root->fs_info, buf->start); - if (block_rsv->space_info != cache->space_info) - goto out; if (btrfs_header_generation(buf) == trans->transid) { if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, root, buf->start); if (!ret) - goto pin; + goto out; } if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { pin_down_extent(root, cache, buf->start, buf->len, 1); - goto pin; + goto out; } WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(cache, buf->start, buf->len); - ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0); - if (ret == -EAGAIN) { - /* block group became read-only */ - btrfs_update_reserved_bytes(cache, buf->len, 0, 1); - goto out; - } - - ret = 1; - spin_lock(&block_rsv->lock); - if (block_rsv->reserved < block_rsv->size) { - block_rsv->reserved += buf->len; - ret = 0; - } - spin_unlock(&block_rsv->lock); - - if (ret) { - spin_lock(&cache->space_info->lock); - cache->space_info->bytes_reserved -= buf->len; - cache->space_info->reservation_progress++; - spin_unlock(&cache->space_info->lock); - } - goto out; - } -pin: - if (block_rsv->durable && !cache->ro) { - ret = 0; - spin_lock(&cache->lock); - if (!cache->ro) { - cache->reserved_pinned += buf->len; - ret = 1; - } - spin_unlock(&cache->lock); - - if (ret) { - spin_lock(&block_rsv->lock); - block_rsv->freed[trans->transid & 0x1] += buf->len; - spin_unlock(&block_rsv->lock); - } + btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); } out: /* @@ -4876,17 +5124,20 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root = orig_root->fs_info->extent_root; struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; + struct btrfs_block_group_cache *used_block_group; int empty_cluster = 2 * 1024 * 1024; int allowed_chunk_alloc = 0; int done_chunk_alloc = 0; struct btrfs_space_info *space_info; - int last_ptr_loop = 0; int loop = 0; int index = 0; + int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? + RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; bool found_uncached_bg = false; bool failed_cluster_refill = false; bool failed_alloc = false; bool use_cluster = true; + bool have_caching_bg = false; u64 ideal_cache_percent = 0; u64 ideal_cache_offset = 0; @@ -4939,6 +5190,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, ideal_cache: block_group = btrfs_lookup_block_group(root->fs_info, search_start); + used_block_group = block_group; /* * we don't want to use the block group if it doesn't match our * allocation bits, or if its not cached. 
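[Editor's note] Between the btrfs_update_reserved_bytes() rewrite above and its callers in btrfs_free_tree_block() and find_free_extent(), the reservation states move the space_info counters in a fixed pattern. The sketch below is a minimal userspace model of that movement; the enum names match the hunk, but the struct, the absent locking, the dropped -EAGAIN path for read-only groups and the per-block-group counters are simplifications for illustration only.

	#include <stdio.h>
	#include <stdint.h>

	enum reserve_type { RESERVE_FREE = 0, RESERVE_ALLOC = 1, RESERVE_ALLOC_NO_ACCOUNT = 2 };

	struct space_info_model {
		uint64_t bytes_reserved;   /* claimed by the allocator, not yet on disk */
		uint64_t bytes_may_use;    /* speculative delalloc/metadata reservations */
		uint64_t bytes_readonly;
	};

	/* Toy version of the counter movement in btrfs_update_reserved_bytes(). */
	static void update_reserved(struct space_info_model *s, uint64_t bytes,
				    enum reserve_type reserve, int group_ro)
	{
		if (reserve != RESERVE_FREE) {
			s->bytes_reserved += bytes;
			if (reserve == RESERVE_ALLOC)
				s->bytes_may_use -= bytes;  /* reservation becomes an allocation */
		} else {
			if (group_ro)
				s->bytes_readonly += bytes;
			s->bytes_reserved -= bytes;
		}
	}

	int main(void)
	{
		struct space_info_model s = { 0, 1 << 20, 0 };

		update_reserved(&s, 1 << 20, RESERVE_ALLOC, 0);  /* metadata allocation */
		update_reserved(&s, 1 << 20, RESERVE_FREE, 0);   /* leaf freed before commit */
		printf("reserved=%llu may_use=%llu\n",
		       (unsigned long long)s.bytes_reserved,
		       (unsigned long long)s.bytes_may_use);
		return 0;
	}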
@@ -4969,12 +5221,14 @@ ideal_cache: } } search: + have_caching_bg = false; down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups[index], list) { u64 offset; int cached; + used_block_group = block_group; btrfs_get_block_group(block_group); search_start = block_group->key.objectid; @@ -4998,13 +5252,15 @@ search: } have_block_group: - if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { + cached = block_group_cache_done(block_group); + if (unlikely(!cached)) { u64 free_percent; + found_uncached_bg = true; ret = cache_block_group(block_group, trans, orig_root, 1); if (block_group->cached == BTRFS_CACHE_FINISHED) - goto have_block_group; + goto alloc; free_percent = btrfs_block_group_used(&block_group->item); free_percent *= 100; @@ -5026,7 +5282,6 @@ have_block_group: orig_root, 0); BUG_ON(ret); } - found_uncached_bg = true; /* * If loop is set for cached only, try the next block @@ -5036,94 +5291,80 @@ have_block_group: goto loop; } - cached = block_group_cache_done(block_group); - if (unlikely(!cached)) - found_uncached_bg = true; - +alloc: if (unlikely(block_group->ro)) goto loop; spin_lock(&block_group->free_space_ctl->tree_lock); if (cached && block_group->free_space_ctl->free_space < - num_bytes + empty_size) { + num_bytes + empty_cluster + empty_size) { spin_unlock(&block_group->free_space_ctl->tree_lock); goto loop; } spin_unlock(&block_group->free_space_ctl->tree_lock); /* - * Ok we want to try and use the cluster allocator, so lets look - * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will - * have tried the cluster allocator plenty of times at this - * point and not have found anything, so we are likely way too - * fragmented for the clustering stuff to find anything, so lets - * just skip it and let the allocator find whatever block it can - * find + * Ok we want to try and use the cluster allocator, so + * lets look there */ - if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { + if (last_ptr) { /* * the refill lock keeps out other * people trying to start a new cluster */ spin_lock(&last_ptr->refill_lock); - if (last_ptr->block_group && - (last_ptr->block_group->ro || - !block_group_bits(last_ptr->block_group, data))) { - offset = 0; + used_block_group = last_ptr->block_group; + if (used_block_group != block_group && + (!used_block_group || + used_block_group->ro || + !block_group_bits(used_block_group, data))) { + used_block_group = block_group; goto refill_cluster; } - offset = btrfs_alloc_from_cluster(block_group, last_ptr, - num_bytes, search_start); + if (used_block_group != block_group) + btrfs_get_block_group(used_block_group); + + offset = btrfs_alloc_from_cluster(used_block_group, + last_ptr, num_bytes, used_block_group->key.objectid); if (offset) { /* we have a block, we're done */ spin_unlock(&last_ptr->refill_lock); goto checks; } - spin_lock(&last_ptr->lock); - /* - * whoops, this cluster doesn't actually point to - * this block group. 
Get a ref on the block - * group is does point to and try again - */ - if (!last_ptr_loop && last_ptr->block_group && - last_ptr->block_group != block_group && - index <= - get_block_group_index(last_ptr->block_group)) { - - btrfs_put_block_group(block_group); - block_group = last_ptr->block_group; - btrfs_get_block_group(block_group); - spin_unlock(&last_ptr->lock); - spin_unlock(&last_ptr->refill_lock); - - last_ptr_loop = 1; - search_start = block_group->key.objectid; - /* - * we know this block group is properly - * in the list because - * btrfs_remove_block_group, drops the - * cluster before it removes the block - * group from the list - */ - goto have_block_group; + WARN_ON(last_ptr->block_group != used_block_group); + if (used_block_group != block_group) { + btrfs_put_block_group(used_block_group); + used_block_group = block_group; } - spin_unlock(&last_ptr->lock); refill_cluster: + BUG_ON(used_block_group != block_group); + /* If we are on LOOP_NO_EMPTY_SIZE, we can't + * set up a new clusters, so lets just skip it + * and let the allocator find whatever block + * it can find. If we reach this point, we + * will have tried the cluster allocator + * plenty of times and not have found + * anything, so we are likely way too + * fragmented for the clustering stuff to find + * anything. */ + if (loop >= LOOP_NO_EMPTY_SIZE) { + spin_unlock(&last_ptr->refill_lock); + goto unclustered_alloc; + } + /* * this cluster didn't work out, free it and * start over */ btrfs_return_cluster_to_free_space(NULL, last_ptr); - last_ptr_loop = 0; - /* allocate a cluster in this block group */ ret = btrfs_find_space_cluster(trans, root, block_group, last_ptr, - offset, num_bytes, + search_start, num_bytes, empty_cluster + empty_size); if (ret == 0) { /* @@ -5159,6 +5400,7 @@ refill_cluster: goto loop; } +unclustered_alloc: offset = btrfs_find_space_for_alloc(block_group, search_start, num_bytes, empty_size); /* @@ -5177,20 +5419,22 @@ refill_cluster: failed_alloc = true; goto have_block_group; } else if (!offset) { + if (!cached) + have_caching_bg = true; goto loop; } checks: search_start = stripe_align(root, offset); /* move on to the next group */ if (search_start + num_bytes >= search_end) { - btrfs_add_free_space(block_group, offset, num_bytes); + btrfs_add_free_space(used_block_group, offset, num_bytes); goto loop; } /* move on to the next group */ if (search_start + num_bytes > - block_group->key.objectid + block_group->key.offset) { - btrfs_add_free_space(block_group, offset, num_bytes); + used_block_group->key.objectid + used_block_group->key.offset) { + btrfs_add_free_space(used_block_group, offset, num_bytes); goto loop; } @@ -5198,14 +5442,14 @@ checks: ins->offset = num_bytes; if (offset < search_start) - btrfs_add_free_space(block_group, offset, + btrfs_add_free_space(used_block_group, offset, search_start - offset); BUG_ON(offset > search_start); - ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1, - (data & BTRFS_BLOCK_GROUP_DATA)); + ret = btrfs_update_reserved_bytes(used_block_group, num_bytes, + alloc_type); if (ret == -EAGAIN) { - btrfs_add_free_space(block_group, offset, num_bytes); + btrfs_add_free_space(used_block_group, offset, num_bytes); goto loop; } @@ -5214,19 +5458,26 @@ checks: ins->offset = num_bytes; if (offset < search_start) - btrfs_add_free_space(block_group, offset, + btrfs_add_free_space(used_block_group, offset, search_start - offset); BUG_ON(offset > search_start); + if (used_block_group != block_group) + btrfs_put_block_group(used_block_group); 
btrfs_put_block_group(block_group); break; loop: failed_cluster_refill = false; failed_alloc = false; BUG_ON(index != get_block_group_index(block_group)); + if (used_block_group != block_group) + btrfs_put_block_group(used_block_group); btrfs_put_block_group(block_group); } up_read(&space_info->groups_sem); + if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) + goto search; + if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) goto search; @@ -5325,7 +5576,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int index = 0; spin_lock(&info->lock); - printk(KERN_INFO "space_info has %llu free, is %sfull\n", + printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n", + (unsigned long long)info->flags, (unsigned long long)(info->total_bytes - info->bytes_used - info->bytes_pinned - info->bytes_reserved - info->bytes_readonly), @@ -5411,7 +5663,8 @@ again: return ret; } -int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) +static int __btrfs_free_reserved_extent(struct btrfs_root *root, + u64 start, u64 len, int pin) { struct btrfs_block_group_cache *cache; int ret = 0; @@ -5426,8 +5679,12 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) if (btrfs_test_opt(root, DISCARD)) ret = btrfs_discard_extent(root, start, len, NULL); - btrfs_add_free_space(cache, start, len); - btrfs_update_reserved_bytes(cache, len, 0, 1); + if (pin) + pin_down_extent(root, cache, start, len, 1); + else { + btrfs_add_free_space(cache, start, len); + btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); + } btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(root, start, len); @@ -5435,6 +5692,18 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) return ret; } +int btrfs_free_reserved_extent(struct btrfs_root *root, + u64 start, u64 len) +{ + return __btrfs_free_reserved_extent(root, start, len, 0); +} + +int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, + u64 start, u64 len) +{ + return __btrfs_free_reserved_extent(root, start, len, 1); +} + static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, @@ -5630,7 +5899,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, put_caching_control(caching_ctl); } - ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1); + ret = btrfs_update_reserved_bytes(block_group, ins->offset, + RESERVE_ALLOC_NO_ACCOUNT); BUG_ON(ret); btrfs_put_block_group(block_group); ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, @@ -5687,8 +5957,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(trans, root, block_rsv, - blocksize, 0); + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); /* * If we couldn't reserve metadata bytes try and use some from * the global reserve. 
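[Editor's note] The next hunk replaces the unconditional WARN_ON in use_block_rsv() with a printk gated by DEFINE_RATELIMIT_STATE/__ratelimit, so a storm of reservation failures logs only a burst of messages per interval. The following is a rough userspace analogue of that interval-plus-burst behaviour with assumed values; it does not reimplement the kernel helpers, only the idea.

	#include <stdio.h>
	#include <time.h>

	struct ratelimit {
		double interval;      /* seconds, like DEFAULT_RATELIMIT_INTERVAL */
		int    burst;         /* messages allowed per interval */
		double window_start;
		int    printed;
	};

	static double now_sec(void)
	{
		struct timespec ts;

		clock_gettime(CLOCK_MONOTONIC, &ts);
		return ts.tv_sec + ts.tv_nsec / 1e9;
	}

	static int ratelimited(struct ratelimit *rs)
	{
		double t = now_sec();

		if (t - rs->window_start > rs->interval) {
			rs->window_start = t;
			rs->printed = 0;
		}
		if (rs->printed >= rs->burst)
			return 0;        /* suppress, like __ratelimit() returning 0 */
		rs->printed++;
		return 1;
	}

	int main(void)
	{
		struct ratelimit rs = { 5.0, 2, 0.0, 0 };
		int i;

		for (i = 0; i < 10; i++)
			if (ratelimited(&rs))
				printf("block rsv reservation failed (message %d)\n", i);
		return 0;
	}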
@@ -5708,13 +5977,15 @@ use_block_rsv(struct btrfs_trans_handle *trans, if (!ret) return block_rsv; if (ret) { - WARN_ON(1); - ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, - 0); + static DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL, + /*DEFAULT_RATELIMIT_BURST*/ 2); + if (__ratelimit(&_rs)) { + printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret); + WARN_ON(1); + } + ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); if (!ret) { - spin_lock(&block_rsv->lock); - block_rsv->size += blocksize; - spin_unlock(&block_rsv->lock); return block_rsv; } else if (ret && block_rsv != global_rsv) { ret = block_rsv_use_bytes(global_rsv, blocksize); @@ -6592,12 +6863,9 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) cache->bytes_super - btrfs_block_group_used(&cache->item); if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + - sinfo->bytes_may_use + sinfo->bytes_readonly + - cache->reserved_pinned + num_bytes + min_allocable_bytes <= - sinfo->total_bytes) { + sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + + min_allocable_bytes <= sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; - sinfo->bytes_reserved += cache->reserved_pinned; - cache->reserved_pinned = 0; cache->ro = 1; ret = 0; } @@ -6964,7 +7232,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) struct btrfs_space_info, list); if (space_info->bytes_pinned > 0 || - space_info->bytes_reserved > 0) { + space_info->bytes_reserved > 0 || + space_info->bytes_may_use > 0) { WARN_ON(1); dump_space_info(space_info, 0, 0); } @@ -7006,14 +7275,12 @@ int btrfs_read_block_groups(struct btrfs_root *root) return -ENOMEM; path->reada = 1; - cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); - if (cache_gen != 0 && - btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) + cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); + if (btrfs_test_opt(root, SPACE_CACHE) && + btrfs_super_generation(root->fs_info->super_copy) != cache_gen) need_clear = 1; if (btrfs_test_opt(root, CLEAR_CACHE)) need_clear = 1; - if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen) - printk(KERN_INFO "btrfs: disk space caching is enabled\n"); while (1) { ret = find_first_block_group(root, path, &key); @@ -7252,7 +7519,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, goto out; } - inode = lookup_free_space_inode(root, block_group, path); + inode = lookup_free_space_inode(tree_root, block_group, path); if (!IS_ERR(inode)) { ret = btrfs_orphan_add(trans, inode); BUG_ON(ret); @@ -7268,7 +7535,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&block_group->lock); } /* One for our lookup ref */ - iput(inode); + btrfs_add_delayed_iput(inode); } key.objectid = BTRFS_FREE_SPACE_OBJECTID; @@ -7339,7 +7606,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) int mixed = 0; int ret; - disk_super = &fs_info->super_copy; + disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) return 1; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d418164a35f1..49f3c9dc09f4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -17,6 +17,7 @@ #include "compat.h" #include "ctree.h" #include "btrfs_inode.h" +#include "volumes.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -894,6 +895,202 @@ search_again: goto again; } +/** + * convert_extent - convert all bits in a given range from one bit to another + * 
@tree: the io tree to search + * @start: the start offset in bytes + * @end: the end offset in bytes (inclusive) + * @bits: the bits to set in this range + * @clear_bits: the bits to clear in this range + * @mask: the allocation mask + * + * This will go through and set bits for the given range. If any states exist + * already in this range they are set with the given bit and cleared of the + * clear_bits. This is only meant to be used by things that are mergeable, ie + * converting from say DELALLOC to DIRTY. This is not meant to be used with + * boundary bits like LOCK. + */ +int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int clear_bits, gfp_t mask) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node *node; + int err = 0; + u64 last_start; + u64 last_end; + +again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); + if (!prealloc) + return -ENOMEM; + } + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + err = insert_state(tree, prealloc, start, end, &bits); + prealloc = NULL; + BUG_ON(err == -EEXIST); + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); +hit_next: + last_start = state->start; + last_end = state->end; + + /* + * | ---- desired range ---- | + * | state | + * + * Just lock what we found and keep going + */ + if (state->start == start && state->end <= end) { + struct rb_node *next_node; + + set_state_bits(tree, state, &bits); + clear_state_bit(tree, state, &clear_bits, 0); + + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + + start = last_end + 1; + next_node = rb_next(&state->rb_node); + if (next_node && start < end && prealloc && !need_resched()) { + state = rb_entry(next_node, struct extent_state, + rb_node); + if (state->start == start) + goto hit_next; + } + goto search_again; + } + + /* + * | ---- desired range ---- | + * | state | + * or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on + * second half. + * + * If the extent we found extends past our + * range, we just split and search again. It'll get split + * again the next time though. + * + * If the extent we found is inside our range, we set the + * desired bit on it. + */ + if (state->start < start) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + err = split_state(tree, state, prealloc, start); + BUG_ON(err == -EEXIST); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + set_state_bits(tree, state, &bits); + clear_state_bit(tree, state, &clear_bits, 0); + merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | or | state | + * + * There's a hole, we need to insert something in it and + * ignore the extent we found. + */ + if (state->start > start) { + u64 this_end; + if (end < last_start) + this_end = end; + else + this_end = last_start - 1; + + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + + /* + * Avoid to free 'prealloc' if it can be merged with + * the later extent. 
+ */ + err = insert_state(tree, prealloc, start, this_end, + &bits); + BUG_ON(err == -EEXIST); + if (err) { + free_extent_state(prealloc); + prealloc = NULL; + goto out; + } + prealloc = NULL; + start = this_end + 1; + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and set the bit + * on the first half + */ + if (state->start <= end && state->end > end) { + prealloc = alloc_extent_state_atomic(prealloc); + if (!prealloc) { + err = -ENOMEM; + goto out; + } + + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); + + set_state_bits(tree, prealloc, &bits); + clear_state_bit(tree, prealloc, &clear_bits, 0); + + merge_state(tree, prealloc); + prealloc = NULL; + goto out; + } + + goto search_again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return err; + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (mask & __GFP_WAIT) + cond_resched(); + goto again; +} + /* wrappers around set/clear extent bit */ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) @@ -919,7 +1116,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, + EXTENT_DELALLOC | EXTENT_UPTODATE, 0, NULL, cached_state, mask); } @@ -1599,6 +1796,368 @@ static int check_page_writeback(struct extent_io_tree *tree, return 0; } +/* + * When IO fails, either with EIO or csum verification fails, we + * try other mirrors that might have a good copy of the data. This + * io_failure_record is used to record state as we go through all the + * mirrors. If another mirror has good data, the page is set up to date + * and things continue. If a good mirror can't be found, the original + * bio end_io callback is called to indicate things have failed. + */ +struct io_failure_record { + struct page *page; + u64 start; + u64 len; + u64 logical; + unsigned long bio_flags; + int this_mirror; + int failed_mirror; + int in_validation; +}; + +static int free_io_failure(struct inode *inode, struct io_failure_record *rec, + int did_repair) +{ + int ret; + int err = 0; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + + set_state_private(failure_tree, rec->start, 0); + ret = clear_extent_bits(failure_tree, rec->start, + rec->start + rec->len - 1, + EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + if (ret) + err = ret; + + if (did_repair) { + ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, + rec->start + rec->len - 1, + EXTENT_DAMAGED, GFP_NOFS); + if (ret && !err) + err = ret; + } + + kfree(rec); + return err; +} + +static void repair_io_failure_callback(struct bio *bio, int err) +{ + complete(bio->bi_private); +} + +/* + * this bypasses the standard btrfs submit functions deliberately, as + * the standard behavior is to write all copies in a raid setup. here we only + * want to write the one bad copy. so we do the mapping for ourselves and issue + * submit_bio directly. + * to avoid any synchonization issues, wait for the data after writing, which + * actually prevents the read that triggered the error from finishing. + * currently, there can be no more than two copies of every data bit. thus, + * exactly one rewrite is required. 
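[Editor's note] repair_io_failure() below performs a one-off synchronous write: it allocates a bio, points bi_end_io at a callback that only fires a completion, submits, then sleeps in wait_for_completion() until the block layer calls back. As a rough userspace analogue of that pattern (pthreads standing in for struct completion, a worker thread standing in for the block layer; nothing here is btrfs code):

	#include <pthread.h>
	#include <stdio.h>

	/* Userspace stand-in for DECLARE_COMPLETION_ONSTACK + wait_for_completion. */
	struct completion {
		pthread_mutex_t lock;
		pthread_cond_t  cond;
		int             done;
	};

	static void complete(struct completion *c)
	{
		pthread_mutex_lock(&c->lock);
		c->done = 1;
		pthread_cond_signal(&c->cond);
		pthread_mutex_unlock(&c->lock);
	}

	static void wait_for_completion(struct completion *c)
	{
		pthread_mutex_lock(&c->lock);
		while (!c->done)
			pthread_cond_wait(&c->cond, &c->lock);
		pthread_mutex_unlock(&c->lock);
	}

	/* Stands in for the block layer finishing the repair write. */
	static void *fake_end_io(void *arg)
	{
		complete(arg);          /* what repair_io_failure_callback() does */
		return NULL;
	}

	int main(void)
	{
		struct completion compl = { PTHREAD_MUTEX_INITIALIZER,
					    PTHREAD_COND_INITIALIZER, 0 };
		pthread_t t;

		pthread_create(&t, NULL, fake_end_io, &compl);  /* "submit_bio" */
		wait_for_completion(&compl);                    /* block until end_io fires */
		pthread_join(t, NULL);
		printf("repair write completed\n");
		return 0;
	}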
+ */ +int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, + u64 length, u64 logical, struct page *page, + int mirror_num) +{ + struct bio *bio; + struct btrfs_device *dev; + DECLARE_COMPLETION_ONSTACK(compl); + u64 map_length = 0; + u64 sector; + struct btrfs_bio *bbio = NULL; + int ret; + + BUG_ON(!mirror_num); + + bio = bio_alloc(GFP_NOFS, 1); + if (!bio) + return -EIO; + bio->bi_private = &compl; + bio->bi_end_io = repair_io_failure_callback; + bio->bi_size = 0; + map_length = length; + + ret = btrfs_map_block(map_tree, WRITE, logical, + &map_length, &bbio, mirror_num); + if (ret) { + bio_put(bio); + return -EIO; + } + BUG_ON(mirror_num != bbio->mirror_num); + sector = bbio->stripes[mirror_num-1].physical >> 9; + bio->bi_sector = sector; + dev = bbio->stripes[mirror_num-1].dev; + kfree(bbio); + if (!dev || !dev->bdev || !dev->writeable) { + bio_put(bio); + return -EIO; + } + bio->bi_bdev = dev->bdev; + bio_add_page(bio, page, length, start-page_offset(page)); + submit_bio(WRITE_SYNC, bio); + wait_for_completion(&compl); + + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { + /* try to remap that extent elsewhere? */ + bio_put(bio); + return -EIO; + } + + printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s " + "sector %llu)\n", page->mapping->host->i_ino, start, + dev->name, sector); + + bio_put(bio); + return 0; +} + +/* + * each time an IO finishes, we do a fast check in the IO failure tree + * to see if we need to process or clean up an io_failure_record + */ +static int clean_io_failure(u64 start, struct page *page) +{ + u64 private; + u64 private_failure; + struct io_failure_record *failrec; + struct btrfs_mapping_tree *map_tree; + struct extent_state *state; + int num_copies; + int did_repair = 0; + int ret; + struct inode *inode = page->mapping->host; + + private = 0; + ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, + (u64)-1, 1, EXTENT_DIRTY, 0); + if (!ret) + return 0; + + ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start, + &private_failure); + if (ret) + return 0; + + failrec = (struct io_failure_record *)(unsigned long) private_failure; + BUG_ON(!failrec->this_mirror); + + if (failrec->in_validation) { + /* there was no real error, just free the record */ + pr_debug("clean_io_failure: freeing dummy error at %llu\n", + failrec->start); + did_repair = 1; + goto out; + } + + spin_lock(&BTRFS_I(inode)->io_tree.lock); + state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, + failrec->start, + EXTENT_LOCKED); + spin_unlock(&BTRFS_I(inode)->io_tree.lock); + + if (state && state->start == failrec->start) { + map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; + num_copies = btrfs_num_copies(map_tree, failrec->logical, + failrec->len); + if (num_copies > 1) { + ret = repair_io_failure(map_tree, start, failrec->len, + failrec->logical, page, + failrec->failed_mirror); + did_repair = !ret; + } + } + +out: + if (!ret) + ret = free_io_failure(inode, failrec, did_repair); + + return ret; +} + +/* + * this is a generic handler for readpage errors (default + * readpage_io_failed_hook). if other copies exist, read those and write back + * good data to the failed position. 
does not investigate in remapping the + * failed extent elsewhere, hoping the device will be smart enough to do this as + * needed + */ + +static int bio_readpage_error(struct bio *failed_bio, struct page *page, + u64 start, u64 end, int failed_mirror, + struct extent_state *state) +{ + struct io_failure_record *failrec = NULL; + u64 private; + struct extent_map *em; + struct inode *inode = page->mapping->host; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct bio *bio; + int num_copies; + int ret; + int read_mode; + u64 logical; + + BUG_ON(failed_bio->bi_rw & REQ_WRITE); + + ret = get_state_private(failure_tree, start, &private); + if (ret) { + failrec = kzalloc(sizeof(*failrec), GFP_NOFS); + if (!failrec) + return -ENOMEM; + failrec->start = start; + failrec->len = end - start + 1; + failrec->this_mirror = 0; + failrec->bio_flags = 0; + failrec->in_validation = 0; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, failrec->len); + if (!em) { + read_unlock(&em_tree->lock); + kfree(failrec); + return -EIO; + } + + if (em->start > start || em->start + em->len < start) { + free_extent_map(em); + em = NULL; + } + read_unlock(&em_tree->lock); + + if (!em || IS_ERR(em)) { + kfree(failrec); + return -EIO; + } + logical = start - em->start; + logical = em->block_start + logical; + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + logical = em->block_start; + failrec->bio_flags = EXTENT_BIO_COMPRESSED; + extent_set_compress_type(&failrec->bio_flags, + em->compress_type); + } + pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, " + "len=%llu\n", logical, start, failrec->len); + failrec->logical = logical; + free_extent_map(em); + + /* set the bits in the private failure tree */ + ret = set_extent_bits(failure_tree, start, end, + EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + if (ret >= 0) + ret = set_state_private(failure_tree, start, + (u64)(unsigned long)failrec); + /* set the bits in the inode's tree */ + if (ret >= 0) + ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED, + GFP_NOFS); + if (ret < 0) { + kfree(failrec); + return ret; + } + } else { + failrec = (struct io_failure_record *)(unsigned long)private; + pr_debug("bio_readpage_error: (found) logical=%llu, " + "start=%llu, len=%llu, validation=%d\n", + failrec->logical, failrec->start, failrec->len, + failrec->in_validation); + /* + * when data can be on disk more than twice, add to failrec here + * (e.g. with a list for failed_mirror) to make + * clean_io_failure() clean all those errors at once. + */ + } + num_copies = btrfs_num_copies( + &BTRFS_I(inode)->root->fs_info->mapping_tree, + failrec->logical, failrec->len); + if (num_copies == 1) { + /* + * we only have a single copy of the data, so don't bother with + * all the retry and error correction code that follows. no + * matter what the error is, it is very likely to persist. + */ + pr_debug("bio_readpage_error: cannot repair, num_copies == 1. 
" + "state=%p, num_copies=%d, next_mirror %d, " + "failed_mirror %d\n", state, num_copies, + failrec->this_mirror, failed_mirror); + free_io_failure(inode, failrec, 0); + return -EIO; + } + + if (!state) { + spin_lock(&tree->lock); + state = find_first_extent_bit_state(tree, failrec->start, + EXTENT_LOCKED); + if (state && state->start != failrec->start) + state = NULL; + spin_unlock(&tree->lock); + } + + /* + * there are two premises: + * a) deliver good data to the caller + * b) correct the bad sectors on disk + */ + if (failed_bio->bi_vcnt > 1) { + /* + * to fulfill b), we need to know the exact failing sectors, as + * we don't want to rewrite any more than the failed ones. thus, + * we need separate read requests for the failed bio + * + * if the following BUG_ON triggers, our validation request got + * merged. we need separate requests for our algorithm to work. + */ + BUG_ON(failrec->in_validation); + failrec->in_validation = 1; + failrec->this_mirror = failed_mirror; + read_mode = READ_SYNC | REQ_FAILFAST_DEV; + } else { + /* + * we're ready to fulfill a) and b) alongside. get a good copy + * of the failed sector and if we succeed, we have setup + * everything for repair_io_failure to do the rest for us. + */ + if (failrec->in_validation) { + BUG_ON(failrec->this_mirror != failed_mirror); + failrec->in_validation = 0; + failrec->this_mirror = 0; + } + failrec->failed_mirror = failed_mirror; + failrec->this_mirror++; + if (failrec->this_mirror == failed_mirror) + failrec->this_mirror++; + read_mode = READ_SYNC; + } + + if (!state || failrec->this_mirror > num_copies) { + pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, " + "next_mirror %d, failed_mirror %d\n", state, + num_copies, failrec->this_mirror, failed_mirror); + free_io_failure(inode, failrec, 0); + return -EIO; + } + + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_private = state; + bio->bi_end_io = failed_bio->bi_end_io; + bio->bi_sector = failrec->logical >> 9; + bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + bio->bi_size = 0; + + bio_add_page(bio, page, failrec->len, start - page_offset(page)); + + pr_debug("bio_readpage_error: submitting new read[%#x] to " + "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode, + failrec->this_mirror, num_copies, failrec->in_validation); + + tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror, + failrec->bio_flags, 0); + return 0; +} + /* lots and lots of room for performance fixes in the end_bio funcs */ /* @@ -1697,6 +2256,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err) struct extent_state *cached = NULL; struct extent_state *state; + pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, " + "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err, + (long int)bio->bi_bdev); tree = &BTRFS_I(page->mapping->host)->io_tree; start = ((u64)page->index << PAGE_CACHE_SHIFT) + @@ -1727,12 +2289,26 @@ static void end_bio_extent_readpage(struct bio *bio, int err) state); if (ret) uptodate = 0; + else + clean_io_failure(start, page); } - if (!uptodate && tree->ops && - tree->ops->readpage_io_failed_hook) { - ret = tree->ops->readpage_io_failed_hook(bio, page, - start, end, NULL); + if (!uptodate) { + int failed_mirror; + failed_mirror = (int)(unsigned long)bio->bi_bdev; + /* + * The generic bio_readpage_error handles errors the + * following way: If possible, new read requests are + * created and submitted and will end up in + * end_bio_extent_readpage as well (if we're lucky, not + * in the !uptodate case). 
In that case it returns 0 and + * we just go on with the next page in our bio. If it + * can't handle the error it will return -EIO and we + * remain responsible for that page. + */ + ret = bio_readpage_error(bio, page, start, end, + failed_mirror, NULL); if (ret == 0) { +error_handled: uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); if (err) @@ -1740,6 +2316,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err) uncache_state(&cached); continue; } + if (tree->ops && tree->ops->readpage_io_failed_hook) { + ret = tree->ops->readpage_io_failed_hook( + bio, page, start, end, + failed_mirror, state); + if (ret == 0) + goto error_handled; + } } if (uptodate) { @@ -1811,6 +2394,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, mirror_num, bio_flags, start); else submit_bio(rw, bio); + if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; bio_put(bio); @@ -2076,16 +2660,16 @@ out: } int extent_read_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent) + get_extent_t *get_extent, int mirror_num) { struct bio *bio = NULL; unsigned long bio_flags = 0; int ret; - ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, + ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, &bio_flags); if (bio) - ret = submit_one_bio(READ, bio, 0, bio_flags); + ret = submit_one_bio(READ, bio, mirror_num, bio_flags); return ret; } @@ -2136,6 +2720,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, int compressed; int write_flags; unsigned long nr_written = 0; + bool fill_delalloc = true; if (wbc->sync_mode == WB_SYNC_ALL) write_flags = WRITE_SYNC; @@ -2145,6 +2730,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, trace___extent_writepage(page, inode, wbc); WARN_ON(!PageLocked(page)); + + ClearPageError(page); + pg_offset = i_size & (PAGE_CACHE_SIZE - 1); if (page->index > end_index || (page->index == end_index && !pg_offset)) { @@ -2166,10 +2754,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, set_page_extent_mapped(page); + if (!tree->ops || !tree->ops->fill_delalloc) + fill_delalloc = false; + delalloc_start = start; delalloc_end = 0; page_started = 0; - if (!epd->extent_locked) { + if (!epd->extent_locked && fill_delalloc) { u64 delalloc_to_write = 0; /* * make sure the wbc mapping index is at least updated @@ -2421,10 +3012,16 @@ retry: * swizzled back from swapper_space to tmpfs file * mapping */ - if (tree->ops && tree->ops->write_cache_pages_lock_hook) - tree->ops->write_cache_pages_lock_hook(page); - else - lock_page(page); + if (tree->ops && + tree->ops->write_cache_pages_lock_hook) { + tree->ops->write_cache_pages_lock_hook(page, + data, flush_fn); + } else { + if (!trylock_page(page)) { + flush_fn(data); + lock_page(page); + } + } if (unlikely(page->mapping != mapping)) { unlock_page(page); @@ -2790,6 +3387,9 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return -ENOMEM; path->leave_spinning = 1; + start = ALIGN(start, BTRFS_I(inode)->root->sectorsize); + len = ALIGN(len, BTRFS_I(inode)->root->sectorsize); + /* * lookup the last file extent. 
We're not using i_size here * because there might be preallocation past i_size @@ -2837,7 +3437,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, &cached_state, GFP_NOFS); - em = get_extent_skip_holes(inode, off, last_for_get_extent, + em = get_extent_skip_holes(inode, start, last_for_get_extent, get_extent); if (!em) goto out; @@ -2926,7 +3526,7 @@ out: return ret; } -static inline struct page *extent_buffer_page(struct extent_buffer *eb, +inline struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i) { struct page *p; @@ -2951,7 +3551,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb, return p; } -static inline unsigned long num_extent_pages(u64 start, u64 len) +inline unsigned long num_extent_pages(u64 start, u64 len) { return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT); @@ -3204,6 +3804,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, PAGECACHE_TAG_DIRTY); } spin_unlock_irq(&page->mapping->tree_lock); + ClearPageError(page); unlock_page(page); } return 0; @@ -3349,8 +3950,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree, } int read_extent_buffer_pages(struct extent_io_tree *tree, - struct extent_buffer *eb, - u64 start, int wait, + struct extent_buffer *eb, u64 start, int wait, get_extent_t *get_extent, int mirror_num) { unsigned long i; @@ -3386,7 +3986,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, num_pages = num_extent_pages(eb->start, eb->len); for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); - if (!wait) { + if (wait == WAIT_NONE) { if (!trylock_page(page)) goto unlock_exit; } else { @@ -3430,7 +4030,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, if (bio) submit_one_bio(READ, bio, mirror_num, bio_flags); - if (ret || !wait) + if (ret || wait != WAIT_COMPLETE) return ret; for (i = start_i; i < num_pages; i++) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 7b2f0c3e7929..7604c3001322 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -17,6 +17,8 @@ #define EXTENT_NODATASUM (1 << 10) #define EXTENT_DO_ACCOUNTING (1 << 11) #define EXTENT_FIRST_DELALLOC (1 << 12) +#define EXTENT_NEED_WAIT (1 << 13) +#define EXTENT_DAMAGED (1 << 14) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) @@ -32,6 +34,7 @@ #define EXTENT_BUFFER_BLOCKING 1 #define EXTENT_BUFFER_DIRTY 2 #define EXTENT_BUFFER_CORRUPT 3 +#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */ /* these are flags for extent_clear_unlock_delalloc */ #define EXTENT_CLEAR_UNLOCK_PAGE 0x1 @@ -67,7 +70,7 @@ struct extent_io_ops { unsigned long bio_flags); int (*readpage_io_hook)(struct page *page, u64 start, u64 end); int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, - u64 start, u64 end, + u64 start, u64 end, int failed_mirror, struct extent_state *state); int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, u64 start, u64 end, @@ -85,7 +88,8 @@ struct extent_io_ops { struct extent_state *other); void (*split_extent_hook)(struct inode *inode, struct extent_state *orig, u64 split); - int (*write_cache_pages_lock_hook)(struct page *page); + int (*write_cache_pages_lock_hook)(struct page *page, void *data, + void (*flush_fn)(void *)); }; struct extent_io_tree { @@ -185,7 +189,7 @@ int unlock_extent_cached(struct 
extent_io_tree *tree, u64 start, u64 end, int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int extent_read_full_page(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent); + get_extent_t *get_extent, int mirror_num); int __init extent_io_init(void); void extent_io_exit(void); @@ -214,6 +218,8 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int clear_bits, gfp_t mask); int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); int find_first_extent_bit(struct extent_io_tree *tree, u64 start, @@ -248,9 +254,14 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len); void free_extent_buffer(struct extent_buffer *eb); +#define WAIT_NONE 0 +#define WAIT_COMPLETE 1 +#define WAIT_PAGE_LOCK 2 int read_extent_buffer_pages(struct extent_io_tree *tree, struct extent_buffer *eb, u64 start, int wait, get_extent_t *get_extent, int mirror_num); +unsigned long num_extent_pages(u64 start, u64 len); +struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i); static inline void extent_buffer_get(struct extent_buffer *eb) { @@ -300,4 +311,10 @@ int extent_clear_unlock_delalloc(struct inode *inode, struct bio * btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, gfp_t gfp_flags); + +struct btrfs_mapping_tree; + +int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, + u64 length, u64 logical, struct page *page, + int mirror_num); #endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index a1cb7821becd..c7fb3a4247d3 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -91,8 +91,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, struct btrfs_csum_item *item; struct extent_buffer *leaf; u64 csum_offset = 0; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); int csums_in_item; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; @@ -162,8 +161,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, u64 item_last_offset = 0; u64 disk_bytenr; u32 diff; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); int ret; struct btrfs_path *path; struct btrfs_csum_item *item = NULL; @@ -290,7 +288,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, int ret; size_t size; u64 csum_end; - u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); path = btrfs_alloc_path(); if (!path) @@ -492,8 +490,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans, u64 bytenr, u64 len) { struct extent_buffer *leaf; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); u64 csum_end; u64 end_byte = bytenr + len; u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; @@ -549,8 +546,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, u64 csum_end; struct extent_buffer *leaf; int ret; - u16 csum_size = - 
btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); int blocksize_bits = root->fs_info->sb->s_blocksize_bits; root = root->fs_info->csum_root; @@ -676,8 +672,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_sector_sum *sector_sum; u32 nritems; u32 ins_size; - u16 csum_size = - btrfs_super_csum_size(&root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); path = btrfs_alloc_path(); if (!path) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1266f6e9cdb2..97fbe939c050 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1069,6 +1069,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, int i; unsigned long index = pos >> PAGE_CACHE_SHIFT; struct inode *inode = fdentry(file)->d_inode; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int err = 0; int faili = 0; u64 start_pos; @@ -1080,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, again: for (i = 0; i < num_pages; i++) { pages[i] = find_or_create_page(inode->i_mapping, index + i, - GFP_NOFS); + mask); if (!pages[i]) { faili = i - 1; err = -ENOMEM; @@ -1166,6 +1167,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / (sizeof(struct page *))); + nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); + nrptrs = max(nrptrs, 8); pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); if (!pages) return -ENOMEM; @@ -1386,7 +1389,11 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, goto out; } - file_update_time(file); + err = btrfs_update_time(file); + if (err) { + mutex_unlock(&inode->i_mutex); + goto out; + } BTRFS_I(inode)->sequence++; start_pos = round_down(pos, root->sectorsize); @@ -1615,10 +1622,6 @@ static long btrfs_fallocate(struct file *file, int mode, goto out; } - ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); - if (ret) - goto out; - locked_end = alloc_end - 1; while (1) { struct btrfs_ordered_extent *ordered; @@ -1664,11 +1667,27 @@ static long btrfs_fallocate(struct file *file, int mode, if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + + /* + * Make sure we have enough space before we do the + * allocation. + */ + ret = btrfs_check_data_free_space(inode, last_byte - + cur_offset); + if (ret) { + free_extent_map(em); + break; + } + ret = btrfs_prealloc_file_range(inode, mode, cur_offset, last_byte - cur_offset, 1 << inode->i_blkbits, offset + len, &alloc_hint); + + /* Let go of our reservation. 
*/ + btrfs_free_reserved_data_space(inode, last_byte - + cur_offset); if (ret < 0) { free_extent_map(em); break; @@ -1694,8 +1713,6 @@ static long btrfs_fallocate(struct file *file, int mode, } unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state, GFP_NOFS); - - btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); out: mutex_unlock(&inode->i_mutex); return ret; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 41ac927401d0..ec23d43d0c35 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -20,6 +20,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/math64.h> +#include <linux/ratelimit.h> #include "ctree.h" #include "free-space-cache.h" #include "transaction.h" @@ -84,6 +85,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, *block_group, struct btrfs_path *path) { struct inode *inode = NULL; + u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; spin_lock(&block_group->lock); if (block_group->inode) @@ -98,13 +100,14 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, return inode; spin_lock(&block_group->lock); - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) { + if (!((BTRFS_I(inode)->flags & flags) == flags)) { printk(KERN_INFO "Old style space inode found, converting.\n"); - BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM; + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | + BTRFS_INODE_NODATACOW; block_group->disk_cache_state = BTRFS_DC_CLEAR; } - if (!btrfs_fs_closing(root->fs_info)) { + if (!block_group->iref) { block_group->inode = igrab(inode); block_group->iref = 1; } @@ -122,12 +125,17 @@ int __create_free_space_inode(struct btrfs_root *root, struct btrfs_free_space_header *header; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; + u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC; int ret; ret = btrfs_insert_empty_inode(trans, root, path, ino); if (ret) return ret; + /* We inline crc's for the free disk space cache */ + if (ino != BTRFS_FREE_INO_OBJECTID) + flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; + leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); @@ -140,8 +148,7 @@ int __create_free_space_inode(struct btrfs_root *root, btrfs_set_inode_uid(leaf, inode_item, 0); btrfs_set_inode_gid(leaf, inode_item, 0); btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); - btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | - BTRFS_INODE_PREALLOC); + btrfs_set_inode_flags(leaf, inode_item, flags); btrfs_set_inode_nlink(leaf, inode_item, 1); btrfs_set_inode_transid(leaf, inode_item, trans->transid); btrfs_set_inode_block_group(leaf, inode_item, offset); @@ -191,16 +198,24 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct inode *inode) { struct btrfs_block_rsv *rsv; + u64 needed_bytes; loff_t oldsize; int ret = 0; rsv = trans->block_rsv; - trans->block_rsv = root->orphan_block_rsv; - ret = btrfs_block_rsv_check(trans, root, - root->orphan_block_rsv, - 0, 5); - if (ret) - return ret; + trans->block_rsv = &root->fs_info->global_block_rsv; + + /* 1 for slack space, 1 for updating the inode */ + needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) + + btrfs_calc_trans_metadata_size(root, 1); + + spin_lock(&trans->block_rsv->lock); + if (trans->block_rsv->reserved < needed_bytes) { + spin_unlock(&trans->block_rsv->lock); + trans->block_rsv = rsv; + return -ENOSPC; + } + spin_unlock(&trans->block_rsv->lock); oldsize = 
i_size_read(inode); btrfs_i_size_write(inode, 0); @@ -213,13 +228,15 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, ret = btrfs_truncate_inode_items(trans, root, inode, 0, BTRFS_EXTENT_DATA_KEY); - trans->block_rsv = rsv; if (ret) { + trans->block_rsv = rsv; WARN_ON(1); return ret; } ret = btrfs_update_inode(trans, root, inode); + trans->block_rsv = rsv; + return ret; } @@ -242,26 +259,348 @@ static int readahead_cache(struct inode *inode) return 0; } +struct io_ctl { + void *cur, *orig; + struct page *page; + struct page **pages; + struct btrfs_root *root; + unsigned long size; + int index; + int num_pages; + unsigned check_crcs:1; +}; + +static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, + struct btrfs_root *root) +{ + memset(io_ctl, 0, sizeof(struct io_ctl)); + io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages, + GFP_NOFS); + if (!io_ctl->pages) + return -ENOMEM; + io_ctl->root = root; + if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) + io_ctl->check_crcs = 1; + return 0; +} + +static void io_ctl_free(struct io_ctl *io_ctl) +{ + kfree(io_ctl->pages); +} + +static void io_ctl_unmap_page(struct io_ctl *io_ctl) +{ + if (io_ctl->cur) { + kunmap(io_ctl->page); + io_ctl->cur = NULL; + io_ctl->orig = NULL; + } +} + +static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) +{ + WARN_ON(io_ctl->cur); + BUG_ON(io_ctl->index >= io_ctl->num_pages); + io_ctl->page = io_ctl->pages[io_ctl->index++]; + io_ctl->cur = kmap(io_ctl->page); + io_ctl->orig = io_ctl->cur; + io_ctl->size = PAGE_CACHE_SIZE; + if (clear) + memset(io_ctl->cur, 0, PAGE_CACHE_SIZE); +} + +static void io_ctl_drop_pages(struct io_ctl *io_ctl) +{ + int i; + + io_ctl_unmap_page(io_ctl); + + for (i = 0; i < io_ctl->num_pages; i++) { + ClearPageChecked(io_ctl->pages[i]); + unlock_page(io_ctl->pages[i]); + page_cache_release(io_ctl->pages[i]); + } +} + +static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, + int uptodate) +{ + struct page *page; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); + int i; + + for (i = 0; i < io_ctl->num_pages; i++) { + page = find_or_create_page(inode->i_mapping, i, mask); + if (!page) { + io_ctl_drop_pages(io_ctl); + return -ENOMEM; + } + io_ctl->pages[i] = page; + if (uptodate && !PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + printk(KERN_ERR "btrfs: error reading free " + "space cache\n"); + io_ctl_drop_pages(io_ctl); + return -EIO; + } + } + } + + for (i = 0; i < io_ctl->num_pages; i++) { + clear_page_dirty_for_io(io_ctl->pages[i]); + set_page_extent_mapped(io_ctl->pages[i]); + } + + return 0; +} + +static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) +{ + u64 *val; + + io_ctl_map_page(io_ctl, 1); + + /* + * Skip the csum areas. If we don't check crcs then we just have a + * 64bit chunk at the front of the first page. + */ + if (io_ctl->check_crcs) { + io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); + io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); + } else { + io_ctl->cur += sizeof(u64); + io_ctl->size -= sizeof(u64) * 2; + } + + val = io_ctl->cur; + *val = cpu_to_le64(generation); + io_ctl->cur += sizeof(u64); +} + +static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) +{ + u64 *gen; + + /* + * Skip the crc area. If we don't check crcs then we just have a 64bit + * chunk at the front of the first page. 
+ */ + if (io_ctl->check_crcs) { + io_ctl->cur += sizeof(u32) * io_ctl->num_pages; + io_ctl->size -= sizeof(u64) + + (sizeof(u32) * io_ctl->num_pages); + } else { + io_ctl->cur += sizeof(u64); + io_ctl->size -= sizeof(u64) * 2; + } + + gen = io_ctl->cur; + if (le64_to_cpu(*gen) != generation) { + printk_ratelimited(KERN_ERR "btrfs: space cache generation " + "(%Lu) does not match inode (%Lu)\n", *gen, + generation); + io_ctl_unmap_page(io_ctl); + return -EIO; + } + io_ctl->cur += sizeof(u64); + return 0; +} + +static void io_ctl_set_crc(struct io_ctl *io_ctl, int index) +{ + u32 *tmp; + u32 crc = ~(u32)0; + unsigned offset = 0; + + if (!io_ctl->check_crcs) { + io_ctl_unmap_page(io_ctl); + return; + } + + if (index == 0) + offset = sizeof(u32) * io_ctl->num_pages;; + + crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc, + PAGE_CACHE_SIZE - offset); + btrfs_csum_final(crc, (char *)&crc); + io_ctl_unmap_page(io_ctl); + tmp = kmap(io_ctl->pages[0]); + tmp += index; + *tmp = crc; + kunmap(io_ctl->pages[0]); +} + +static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) +{ + u32 *tmp, val; + u32 crc = ~(u32)0; + unsigned offset = 0; + + if (!io_ctl->check_crcs) { + io_ctl_map_page(io_ctl, 0); + return 0; + } + + if (index == 0) + offset = sizeof(u32) * io_ctl->num_pages; + + tmp = kmap(io_ctl->pages[0]); + tmp += index; + val = *tmp; + kunmap(io_ctl->pages[0]); + + io_ctl_map_page(io_ctl, 0); + crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc, + PAGE_CACHE_SIZE - offset); + btrfs_csum_final(crc, (char *)&crc); + if (val != crc) { + printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free " + "space cache\n"); + io_ctl_unmap_page(io_ctl); + return -EIO; + } + + return 0; +} + +static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes, + void *bitmap) +{ + struct btrfs_free_space_entry *entry; + + if (!io_ctl->cur) + return -ENOSPC; + + entry = io_ctl->cur; + entry->offset = cpu_to_le64(offset); + entry->bytes = cpu_to_le64(bytes); + entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP : + BTRFS_FREE_SPACE_EXTENT; + io_ctl->cur += sizeof(struct btrfs_free_space_entry); + io_ctl->size -= sizeof(struct btrfs_free_space_entry); + + if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) + return 0; + + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + + /* No more pages to map */ + if (io_ctl->index >= io_ctl->num_pages) + return 0; + + /* map the next page */ + io_ctl_map_page(io_ctl, 1); + return 0; +} + +static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap) +{ + if (!io_ctl->cur) + return -ENOSPC; + + /* + * If we aren't at the start of the current page, unmap this one and + * map the next one if there is any left. + */ + if (io_ctl->cur != io_ctl->orig) { + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + if (io_ctl->index >= io_ctl->num_pages) + return -ENOSPC; + io_ctl_map_page(io_ctl, 0); + } + + memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE); + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + if (io_ctl->index < io_ctl->num_pages) + io_ctl_map_page(io_ctl, 0); + return 0; +} + +static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl) +{ + /* + * If we're not on the boundary we know we've modified the page and we + * need to crc the page. 
+ */ + if (io_ctl->cur != io_ctl->orig) + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + else + io_ctl_unmap_page(io_ctl); + + while (io_ctl->index < io_ctl->num_pages) { + io_ctl_map_page(io_ctl, 1); + io_ctl_set_crc(io_ctl, io_ctl->index - 1); + } +} + +static int io_ctl_read_entry(struct io_ctl *io_ctl, + struct btrfs_free_space *entry, u8 *type) +{ + struct btrfs_free_space_entry *e; + int ret; + + if (!io_ctl->cur) { + ret = io_ctl_check_crc(io_ctl, io_ctl->index); + if (ret) + return ret; + } + + e = io_ctl->cur; + entry->offset = le64_to_cpu(e->offset); + entry->bytes = le64_to_cpu(e->bytes); + *type = e->type; + io_ctl->cur += sizeof(struct btrfs_free_space_entry); + io_ctl->size -= sizeof(struct btrfs_free_space_entry); + + if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) + return 0; + + io_ctl_unmap_page(io_ctl); + + return 0; +} + +static int io_ctl_read_bitmap(struct io_ctl *io_ctl, + struct btrfs_free_space *entry) +{ + int ret; + + ret = io_ctl_check_crc(io_ctl, io_ctl->index); + if (ret) + return ret; + + memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE); + io_ctl_unmap_page(io_ctl); + + return 0; +} + int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_path *path, u64 offset) { struct btrfs_free_space_header *header; struct extent_buffer *leaf; - struct page *page; + struct io_ctl io_ctl; struct btrfs_key key; + struct btrfs_free_space *e, *n; struct list_head bitmaps; u64 num_entries; u64 num_bitmaps; u64 generation; - pgoff_t index = 0; + u8 type; int ret = 0; INIT_LIST_HEAD(&bitmaps); /* Nothing in the space cache, goodbye */ if (!i_size_read(inode)) - goto out; + return 0; key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.offset = offset; @@ -269,11 +608,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; + return 0; else if (ret > 0) { btrfs_release_path(path); - ret = 0; - goto out; + return 0; } ret = -1; @@ -291,169 +629,102 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, " not match free space cache generation (%llu)\n", (unsigned long long)BTRFS_I(inode)->generation, (unsigned long long)generation); - goto out; + return 0; } if (!num_entries) - goto out; + return 0; + io_ctl_init(&io_ctl, inode, root); ret = readahead_cache(inode); if (ret) goto out; - while (1) { - struct btrfs_free_space_entry *entry; - struct btrfs_free_space *e; - void *addr; - unsigned long offset = 0; - int need_loop = 0; + ret = io_ctl_prepare_pages(&io_ctl, inode, 1); + if (ret) + goto out; - if (!num_entries && !num_bitmaps) - break; + ret = io_ctl_check_crc(&io_ctl, 0); + if (ret) + goto free_cache; + + ret = io_ctl_check_generation(&io_ctl, generation); + if (ret) + goto free_cache; - page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); - if (!page) + while (num_entries) { + e = kmem_cache_zalloc(btrfs_free_space_cachep, + GFP_NOFS); + if (!e) goto free_cache; - if (!PageUptodate(page)) { - btrfs_readpage(NULL, page); - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - page_cache_release(page); - printk(KERN_ERR "btrfs: error reading free " - "space cache\n"); - goto free_cache; - } + ret = io_ctl_read_entry(&io_ctl, e, &type); + if (ret) { + kmem_cache_free(btrfs_free_space_cachep, e); + goto free_cache; } - addr = kmap(page); - if (index == 0) { - u64 *gen; + if (!e->bytes) { + kmem_cache_free(btrfs_free_space_cachep, e); + goto free_cache; + } - 
/* - * We put a bogus crc in the front of the first page in - * case old kernels try to mount a fs with the new - * format to make sure they discard the cache. - */ - addr += sizeof(u64); - offset += sizeof(u64); - - gen = addr; - if (*gen != BTRFS_I(inode)->generation) { - printk(KERN_ERR "btrfs: space cache generation" - " (%llu) does not match inode (%llu)\n", - (unsigned long long)*gen, - (unsigned long long) - BTRFS_I(inode)->generation); - kunmap(page); - unlock_page(page); - page_cache_release(page); + if (type == BTRFS_FREE_SPACE_EXTENT) { + spin_lock(&ctl->tree_lock); + ret = link_free_space(ctl, e); + spin_unlock(&ctl->tree_lock); + if (ret) { + printk(KERN_ERR "Duplicate entries in " + "free space cache, dumping\n"); + kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } - addr += sizeof(u64); - offset += sizeof(u64); - } - entry = addr; - - while (1) { - if (!num_entries) - break; - - need_loop = 1; - e = kmem_cache_zalloc(btrfs_free_space_cachep, - GFP_NOFS); - if (!e) { - kunmap(page); - unlock_page(page); - page_cache_release(page); + } else { + BUG_ON(!num_bitmaps); + num_bitmaps--; + e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + if (!e->bitmap) { + kmem_cache_free( + btrfs_free_space_cachep, e); goto free_cache; } - - e->offset = le64_to_cpu(entry->offset); - e->bytes = le64_to_cpu(entry->bytes); - if (!e->bytes) { - kunmap(page); + spin_lock(&ctl->tree_lock); + ret = link_free_space(ctl, e); + ctl->total_bitmaps++; + ctl->op->recalc_thresholds(ctl); + spin_unlock(&ctl->tree_lock); + if (ret) { + printk(KERN_ERR "Duplicate entries in " + "free space cache, dumping\n"); kmem_cache_free(btrfs_free_space_cachep, e); - unlock_page(page); - page_cache_release(page); goto free_cache; } - - if (entry->type == BTRFS_FREE_SPACE_EXTENT) { - spin_lock(&ctl->tree_lock); - ret = link_free_space(ctl, e); - spin_unlock(&ctl->tree_lock); - if (ret) { - printk(KERN_ERR "Duplicate entries in " - "free space cache, dumping\n"); - kunmap(page); - unlock_page(page); - page_cache_release(page); - goto free_cache; - } - } else { - e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); - if (!e->bitmap) { - kunmap(page); - kmem_cache_free( - btrfs_free_space_cachep, e); - unlock_page(page); - page_cache_release(page); - goto free_cache; - } - spin_lock(&ctl->tree_lock); - ret = link_free_space(ctl, e); - ctl->total_bitmaps++; - ctl->op->recalc_thresholds(ctl); - spin_unlock(&ctl->tree_lock); - if (ret) { - printk(KERN_ERR "Duplicate entries in " - "free space cache, dumping\n"); - kunmap(page); - unlock_page(page); - page_cache_release(page); - goto free_cache; - } - list_add_tail(&e->list, &bitmaps); - } - - num_entries--; - offset += sizeof(struct btrfs_free_space_entry); - if (offset + sizeof(struct btrfs_free_space_entry) >= - PAGE_CACHE_SIZE) - break; - entry++; + list_add_tail(&e->list, &bitmaps); } - /* - * We read an entry out of this page, we need to move on to the - * next page. - */ - if (need_loop) { - kunmap(page); - goto next; - } + num_entries--; + } - /* - * We add the bitmaps at the end of the entries in order that - * the bitmap entries are added to the cache. - */ - e = list_entry(bitmaps.next, struct btrfs_free_space, list); + io_ctl_unmap_page(&io_ctl); + + /* + * We add the bitmaps at the end of the entries in order that + * the bitmap entries are added to the cache. 
+ */ + list_for_each_entry_safe(e, n, &bitmaps, list) { list_del_init(&e->list); - memcpy(e->bitmap, addr, PAGE_CACHE_SIZE); - kunmap(page); - num_bitmaps--; -next: - unlock_page(page); - page_cache_release(page); - index++; + ret = io_ctl_read_bitmap(&io_ctl, e); + if (ret) + goto free_cache; } + io_ctl_drop_pages(&io_ctl); ret = 1; out: + io_ctl_free(&io_ctl); return ret; free_cache: + io_ctl_drop_pages(&io_ctl); __btrfs_remove_free_space_cache(ctl); goto out; } @@ -465,7 +736,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root = fs_info->tree_root; struct inode *inode; struct btrfs_path *path; - int ret; + int ret = 0; bool matched; u64 used = btrfs_block_group_used(&block_group->item); @@ -497,6 +768,14 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, return 0; } + /* We may have converted the inode and made the cache invalid. */ + spin_lock(&block_group->lock); + if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { + spin_unlock(&block_group->lock); + goto out; + } + spin_unlock(&block_group->lock); + ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, path, block_group->key.objectid); btrfs_free_path(path); @@ -530,6 +809,19 @@ out: return ret; } +/** + * __btrfs_write_out_cache - write out cached info to an inode + * @root - the root the inode belongs to + * @ctl - the free space cache we are going to write out + * @block_group - the block_group for this cache if it belongs to a block_group + * @trans - the trans handle + * @path - the path to use + * @offset - the offset for the key we'll insert + * + * This function writes out a free space cache struct to disk for quick recovery + * on mount. This will return 0 if it was successful in writing the cache out, + * and -1 if it was not. + */ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_block_group_cache *block_group, @@ -540,42 +832,24 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct extent_buffer *leaf; struct rb_node *node; struct list_head *pos, *n; - struct page **pages; - struct page *page; struct extent_state *cached_state = NULL; struct btrfs_free_cluster *cluster = NULL; struct extent_io_tree *unpin = NULL; + struct io_ctl io_ctl; struct list_head bitmap_list; struct btrfs_key key; u64 start, end, len; - u64 bytes = 0; - u32 crc = ~(u32)0; - int index = 0, num_pages = 0; int entries = 0; int bitmaps = 0; - int ret = -1; - bool next_page = false; - bool out_of_space = false; + int ret; + int err = -1; INIT_LIST_HEAD(&bitmap_list); - node = rb_first(&ctl->free_space_offset); - if (!node) - return 0; - if (!i_size_read(inode)) return -1; - num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; - - filemap_write_and_wait(inode->i_mapping); - btrfs_wait_ordered_range(inode, inode->i_size & - ~(root->sectorsize - 1), (u64)-1); - - pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); - if (!pages) - return -1; + io_ctl_init(&io_ctl, inode, root); /* Get the cluster for this block_group if it exists */ if (block_group && !list_empty(&block_group->cluster_list)) @@ -589,30 +863,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, */ unpin = root->fs_info->pinned_extents; - /* - * Lock all pages first so we can lock the extent safely. 
- * - * NOTE: Because we hold the ref the entire time we're going to write to - * the page find_get_page should never fail, so we don't do a check - * after find_get_page at this point. Just putting this here so people - * know and don't freak out. - */ - while (index < num_pages) { - page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); - if (!page) { - int i; + /* Lock all pages first so we can lock the extent safely. */ + io_ctl_prepare_pages(&io_ctl, inode, 0); - for (i = 0; i < num_pages; i++) { - unlock_page(pages[i]); - page_cache_release(pages[i]); - } - goto out; - } - pages[index] = page; - index++; - } - - index = 0; lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 0, &cached_state, GFP_NOFS); @@ -623,189 +876,111 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, if (block_group) start = block_group->key.objectid; - /* Write out the extent entries */ - do { - struct btrfs_free_space_entry *entry; - void *addr, *orig; - unsigned long offset = 0; + node = rb_first(&ctl->free_space_offset); + if (!node && cluster) { + node = rb_first(&cluster->root); + cluster = NULL; + } - next_page = false; + /* Make sure we can fit our crcs into the first page */ + if (io_ctl.check_crcs && + (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) { + WARN_ON(1); + goto out_nospc; + } - if (index >= num_pages) { - out_of_space = true; - break; - } + io_ctl_set_generation(&io_ctl, trans->transid); - page = pages[index]; + /* Write out the extent entries */ + while (node) { + struct btrfs_free_space *e; - orig = addr = kmap(page); - if (index == 0) { - u64 *gen; + e = rb_entry(node, struct btrfs_free_space, offset_index); + entries++; - /* - * We're going to put in a bogus crc for this page to - * make sure that old kernels who aren't aware of this - * format will be sure to discard the cache. 
- */ - addr += sizeof(u64); - offset += sizeof(u64); + ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes, + e->bitmap); + if (ret) + goto out_nospc; - gen = addr; - *gen = trans->transid; - addr += sizeof(u64); - offset += sizeof(u64); + if (e->bitmap) { + list_add_tail(&e->list, &bitmap_list); + bitmaps++; } - entry = addr; - - memset(addr, 0, PAGE_CACHE_SIZE - offset); - while (node && !next_page) { - struct btrfs_free_space *e; - - e = rb_entry(node, struct btrfs_free_space, offset_index); - entries++; - - entry->offset = cpu_to_le64(e->offset); - entry->bytes = cpu_to_le64(e->bytes); - if (e->bitmap) { - entry->type = BTRFS_FREE_SPACE_BITMAP; - list_add_tail(&e->list, &bitmap_list); - bitmaps++; - } else { - entry->type = BTRFS_FREE_SPACE_EXTENT; - } - node = rb_next(node); - if (!node && cluster) { - node = rb_first(&cluster->root); - cluster = NULL; - } - offset += sizeof(struct btrfs_free_space_entry); - if (offset + sizeof(struct btrfs_free_space_entry) >= - PAGE_CACHE_SIZE) - next_page = true; - entry++; + node = rb_next(node); + if (!node && cluster) { + node = rb_first(&cluster->root); + cluster = NULL; } + } - /* - * We want to add any pinned extents to our free space cache - * so we don't leak the space - */ - while (block_group && !next_page && - (start < block_group->key.objectid + - block_group->key.offset)) { - ret = find_first_extent_bit(unpin, start, &start, &end, - EXTENT_DIRTY); - if (ret) { - ret = 0; - break; - } - - /* This pinned extent is out of our range */ - if (start >= block_group->key.objectid + - block_group->key.offset) - break; - - len = block_group->key.objectid + - block_group->key.offset - start; - len = min(len, end + 1 - start); - - entries++; - entry->offset = cpu_to_le64(start); - entry->bytes = cpu_to_le64(len); - entry->type = BTRFS_FREE_SPACE_EXTENT; - - start = end + 1; - offset += sizeof(struct btrfs_free_space_entry); - if (offset + sizeof(struct btrfs_free_space_entry) >= - PAGE_CACHE_SIZE) - next_page = true; - entry++; + /* + * We want to add any pinned extents to our free space cache + * so we don't leak the space + */ + while (block_group && (start < block_group->key.objectid + + block_group->key.offset)) { + ret = find_first_extent_bit(unpin, start, &start, &end, + EXTENT_DIRTY); + if (ret) { + ret = 0; + break; } - /* Generate bogus crc value */ - if (index == 0) { - u32 *tmp; - crc = btrfs_csum_data(root, orig + sizeof(u64), crc, - PAGE_CACHE_SIZE - sizeof(u64)); - btrfs_csum_final(crc, (char *)&crc); - crc++; - tmp = orig; - *tmp = crc; - } + /* This pinned extent is out of our range */ + if (start >= block_group->key.objectid + + block_group->key.offset) + break; - kunmap(page); + len = block_group->key.objectid + + block_group->key.offset - start; + len = min(len, end + 1 - start); - bytes += PAGE_CACHE_SIZE; + entries++; + ret = io_ctl_add_entry(&io_ctl, start, len, NULL); + if (ret) + goto out_nospc; - index++; - } while (node || next_page); + start = end + 1; + } /* Write out the bitmaps */ list_for_each_safe(pos, n, &bitmap_list) { - void *addr; struct btrfs_free_space *entry = list_entry(pos, struct btrfs_free_space, list); - if (index >= num_pages) { - out_of_space = true; - break; - } - page = pages[index]; - - addr = kmap(page); - memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE); - kunmap(page); - bytes += PAGE_CACHE_SIZE; - + ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap); + if (ret) + goto out_nospc; list_del_init(&entry->list); - index++; - } - - if (out_of_space) { - btrfs_drop_pages(pages, num_pages); - 
unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, - i_size_read(inode) - 1, &cached_state, - GFP_NOFS); - ret = 0; - goto out; } /* Zero out the rest of the pages just to make sure */ - while (index < num_pages) { - void *addr; + io_ctl_zero_remaining_pages(&io_ctl); - page = pages[index]; - addr = kmap(page); - memset(addr, 0, PAGE_CACHE_SIZE); - kunmap(page); - bytes += PAGE_CACHE_SIZE; - index++; - } - - ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0, - bytes, &cached_state); - btrfs_drop_pages(pages, num_pages); + ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages, + 0, i_size_read(inode), &cached_state); + io_ctl_drop_pages(&io_ctl); unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state, GFP_NOFS); - if (ret) { - ret = 0; + if (ret) goto out; - } - BTRFS_I(inode)->generation = trans->transid; - filemap_write_and_wait(inode->i_mapping); + ret = filemap_write_and_wait(inode->i_mapping); + if (ret) + goto out; key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.offset = offset; key.type = 0; - ret = btrfs_search_slot(trans, root, &key, path, 1, 1); + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { - ret = -1; - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, + GFP_NOFS); goto out; } leaf = path->nodes[0]; @@ -816,15 +991,16 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || found_key.offset != offset) { - ret = -1; - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, NULL, - GFP_NOFS); + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, + inode->i_size - 1, + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, + NULL, GFP_NOFS); btrfs_release_path(path); goto out; } } + + BTRFS_I(inode)->generation = trans->transid; header = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_free_space_header); btrfs_set_free_space_entries(leaf, header, entries); @@ -833,16 +1009,26 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - ret = 1; - + err = 0; out: - kfree(pages); - if (ret != 1) { - invalidate_inode_pages2_range(inode->i_mapping, 0, index); + io_ctl_free(&io_ctl); + if (err) { + invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; } btrfs_update_inode(trans, root, inode); - return ret; + return err; + +out_nospc: + list_for_each_safe(pos, n, &bitmap_list) { + struct btrfs_free_space *entry = + list_entry(pos, struct btrfs_free_space, list); + list_del_init(&entry->list); + } + io_ctl_drop_pages(&io_ctl); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, + i_size_read(inode) - 1, &cached_state, GFP_NOFS); + goto out; } int btrfs_write_out_cache(struct btrfs_root *root, @@ -869,14 +1055,15 @@ int btrfs_write_out_cache(struct btrfs_root *root, ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, path, block_group->key.objectid); - if (ret < 0) { + if (ret) { spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_ERROR; spin_unlock(&block_group->lock); ret = 0; - +#ifdef DEBUG printk(KERN_ERR "btrfs: failed to write free space cace " "for block group %llu\n", block_group->key.objectid); +#endif 
} iput(inode); @@ -1283,6 +1470,7 @@ static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, { info->offset = offset_to_bitmap(ctl, offset); info->bytes = 0; + INIT_LIST_HEAD(&info->list); link_free_space(ctl, info); ctl->total_bitmaps++; @@ -1662,7 +1850,13 @@ again: info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!info) { - WARN_ON(1); + /* the tree logging code might be calling us before we + * have fully loaded the free space rbtree for this + * block group. So it is possible the entry won't + * be in the rbtree yet at all. The caching code + * will make sure not to put it in the rbtree if + * the logging code has pinned it. + */ goto out_lock; } } @@ -1701,6 +1895,7 @@ again: ctl->total_bitmaps--; } kmem_cache_free(btrfs_free_space_cachep, info); + ret = 0; goto out_lock; } @@ -1708,7 +1903,8 @@ again: unlink_free_space(ctl, info); info->offset += bytes; info->bytes -= bytes; - link_free_space(ctl, info); + ret = link_free_space(ctl, info); + WARN_ON(ret); goto out_lock; } @@ -2124,6 +2320,7 @@ again: if (!found) { start = i; + cluster->max_size = 0; found = true; } @@ -2267,16 +2464,23 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; - struct rb_node *node; int ret = -ENOSPC; + u64 bitmap_offset = offset_to_bitmap(ctl, offset); if (ctl->total_bitmaps == 0) return -ENOSPC; /* - * First check our cached list of bitmaps and see if there is an entry - * here that will work. + * The bitmap that covers offset won't be in the list unless offset + * is just its start offset. */ + entry = list_first_entry(bitmaps, struct btrfs_free_space, list); + if (entry->offset != bitmap_offset) { + entry = tree_search_offset(ctl, bitmap_offset, 1, 0); + if (entry && list_empty(&entry->list)) + list_add(&entry->list, bitmaps); + } + list_for_each_entry(entry, bitmaps, list) { if (entry->bytes < min_bytes) continue; @@ -2287,38 +2491,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, } /* - * If we do have entries on our list and we are here then we didn't find - * anything, so go ahead and get the next entry after the last entry in - * this list and start the search from there. + * The bitmaps list has all the bitmaps that record free space + * starting after offset, so no more search is required. 
*/ - if (!list_empty(bitmaps)) { - entry = list_entry(bitmaps->prev, struct btrfs_free_space, - list); - node = rb_next(&entry->offset_index); - if (!node) - return -ENOSPC; - entry = rb_entry(node, struct btrfs_free_space, offset_index); - goto search; - } - - entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1); - if (!entry) - return -ENOSPC; - -search: - node = &entry->offset_index; - do { - entry = rb_entry(node, struct btrfs_free_space, offset_index); - node = rb_next(&entry->offset_index); - if (!entry->bitmap) - continue; - if (entry->bytes < min_bytes) - continue; - ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, - bytes, min_bytes); - } while (ret && node); - - return ret; + return -ENOSPC; } /* @@ -2336,8 +2512,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, u64 offset, u64 bytes, u64 empty_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct list_head bitmaps; struct btrfs_free_space *entry, *tmp; + LIST_HEAD(bitmaps); u64 min_bytes; int ret; @@ -2376,7 +2552,6 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, goto out; } - INIT_LIST_HEAD(&bitmaps); ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, bytes, min_bytes); if (ret) @@ -2472,9 +2647,19 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, spin_unlock(&ctl->tree_lock); if (bytes >= minlen) { - int update_ret; - update_ret = btrfs_update_reserved_bytes(block_group, - bytes, 1, 1); + struct btrfs_space_info *space_info; + int update = 0; + + space_info = block_group->space_info; + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (!block_group->ro) { + block_group->reserved += bytes; + space_info->bytes_reserved += bytes; + update = 1; + } + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); ret = btrfs_error_discard_extent(fs_info->extent_root, start, @@ -2482,9 +2667,16 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, &actually_trimmed); btrfs_add_free_space(block_group, start, bytes); - if (!update_ret) - btrfs_update_reserved_bytes(block_group, - bytes, 0, 1); + if (update) { + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (block_group->ro) + space_info->bytes_readonly += bytes; + block_group->reserved -= bytes; + space_info->bytes_reserved -= bytes; + spin_unlock(&space_info->lock); + spin_unlock(&block_group->lock); + } if (ret) break; @@ -2643,9 +2835,13 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, return 0; ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); - if (ret < 0) + if (ret) { + btrfs_delalloc_release_metadata(inode, inode->i_size); +#ifdef DEBUG printk(KERN_ERR "btrfs: failed to write free ino cache " "for root %llu\n", root->root_key.objectid); +#endif + } iput(inode); return ret; diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index b4087e0fa871..f8962a957d65 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -398,6 +398,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root, struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; struct btrfs_path *path; struct inode *inode; + struct btrfs_block_rsv *rsv; + u64 num_bytes; u64 alloc_hint = 0; int ret; int prealloc; @@ -421,11 +423,26 @@ int btrfs_save_ino_cache(struct btrfs_root *root, if (!path) return -ENOMEM; + rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->trans_block_rsv; + + num_bytes = trans->bytes_reserved; + /* + * 1 item for inode item insertion if need + * 3 
items for inode item update (in the worst case) + * 1 item for free space object + * 3 items for pre-allocation + */ + trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8); + ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv, + trans->bytes_reserved); + if (ret) + goto out; again: inode = lookup_free_ino_inode(root, path); if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { ret = PTR_ERR(inode); - goto out; + goto out_release; } if (IS_ERR(inode)) { @@ -434,7 +451,7 @@ again: ret = create_free_ino_inode(root, trans, path); if (ret) - goto out; + goto out_release; goto again; } @@ -465,21 +482,26 @@ again: /* Just to make sure we have enough space */ prealloc += 8 * PAGE_CACHE_SIZE; - ret = btrfs_check_data_free_space(inode, prealloc); + ret = btrfs_delalloc_reserve_space(inode, prealloc); if (ret) goto out_put; ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, prealloc, prealloc, &alloc_hint); - if (ret) + if (ret) { + btrfs_delalloc_release_space(inode, prealloc); goto out_put; + } btrfs_free_reserved_data_space(inode, prealloc); + ret = btrfs_write_out_ino_cache(root, trans, path); out_put: iput(inode); +out_release: + btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); out: - if (ret == 0) - ret = btrfs_write_out_ino_cache(root, trans, path); + trans->block_rsv = rsv; + trans->bytes_reserved = num_bytes; btrfs_free_path(path); return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 75686a61bd45..fd1a06df5bc6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -38,6 +38,7 @@ #include <linux/falloc.h> #include <linux/slab.h> #include <linux/ratelimit.h> +#include <linux/mount.h> #include "compat.h" #include "ctree.h" #include "disk-io.h" @@ -45,10 +46,10 @@ #include "btrfs_inode.h" #include "ioctl.h" #include "print-tree.h" -#include "volumes.h" #include "ordered-data.h" #include "xattr.h" #include "tree-log.h" +#include "volumes.h" #include "compression.h" #include "locking.h" #include "free-space-cache.h" @@ -93,6 +94,8 @@ static noinline int cow_file_range(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, int unlock); +static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, @@ -393,7 +396,10 @@ again: (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { WARN_ON(pages); pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); - BUG_ON(!pages); + if (!pages) { + /* just bail out to the uncompressed code */ + goto cont; + } if (BTRFS_I(inode)->force_compress) compress_type = BTRFS_I(inode)->force_compress; @@ -424,6 +430,7 @@ again: will_compress = 1; } } +cont: if (start == 0) { trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); @@ -820,7 +827,7 @@ static noinline int cow_file_range(struct inode *inode, } BUG_ON(disk_num_bytes > - btrfs_super_total_bytes(&root->fs_info->super_copy)); + btrfs_super_total_bytes(root->fs_info->super_copy)); alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); @@ -1737,7 +1744,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); trans->block_rsv = &root->fs_info->delalloc_block_rsv; - ret = btrfs_update_inode(trans, root, inode); + ret = 
btrfs_update_inode_fallback(trans, root, inode); BUG_ON(ret); } goto out; @@ -1787,17 +1794,17 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode_fallback(trans, root, inode); BUG_ON(ret); } ret = 0; out: - if (nolock) { - if (trans) - btrfs_end_transaction_nolock(trans, root); - } else { + if (root != root->fs_info->tree_root) btrfs_delalloc_release_metadata(inode, ordered_extent->len); - if (trans) + if (trans) { + if (nolock) + btrfs_end_transaction_nolock(trans, root); + else btrfs_end_transaction(trans, root); } @@ -1819,153 +1826,9 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, } /* - * When IO fails, either with EIO or csum verification fails, we - * try other mirrors that might have a good copy of the data. This - * io_failure_record is used to record state as we go through all the - * mirrors. If another mirror has good data, the page is set up to date - * and things continue. If a good mirror can't be found, the original - * bio end_io callback is called to indicate things have failed. - */ -struct io_failure_record { - struct page *page; - u64 start; - u64 len; - u64 logical; - unsigned long bio_flags; - int last_mirror; -}; - -static int btrfs_io_failed_hook(struct bio *failed_bio, - struct page *page, u64 start, u64 end, - struct extent_state *state) -{ - struct io_failure_record *failrec = NULL; - u64 private; - struct extent_map *em; - struct inode *inode = page->mapping->host; - struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - struct bio *bio; - int num_copies; - int ret; - int rw; - u64 logical; - - ret = get_state_private(failure_tree, start, &private); - if (ret) { - failrec = kmalloc(sizeof(*failrec), GFP_NOFS); - if (!failrec) - return -ENOMEM; - failrec->start = start; - failrec->len = end - start + 1; - failrec->last_mirror = 0; - failrec->bio_flags = 0; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, failrec->len); - if (em->start > start || em->start + em->len < start) { - free_extent_map(em); - em = NULL; - } - read_unlock(&em_tree->lock); - - if (IS_ERR_OR_NULL(em)) { - kfree(failrec); - return -EIO; - } - logical = start - em->start; - logical = em->block_start + logical; - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - logical = em->block_start; - failrec->bio_flags = EXTENT_BIO_COMPRESSED; - extent_set_compress_type(&failrec->bio_flags, - em->compress_type); - } - failrec->logical = logical; - free_extent_map(em); - set_extent_bits(failure_tree, start, end, EXTENT_LOCKED | - EXTENT_DIRTY, GFP_NOFS); - set_state_private(failure_tree, start, - (u64)(unsigned long)failrec); - } else { - failrec = (struct io_failure_record *)(unsigned long)private; - } - num_copies = btrfs_num_copies( - &BTRFS_I(inode)->root->fs_info->mapping_tree, - failrec->logical, failrec->len); - failrec->last_mirror++; - if (!state) { - spin_lock(&BTRFS_I(inode)->io_tree.lock); - state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, - failrec->start, - EXTENT_LOCKED); - if (state && state->start != failrec->start) - state = NULL; - spin_unlock(&BTRFS_I(inode)->io_tree.lock); - } - if (!state || failrec->last_mirror > num_copies) { - set_state_private(failure_tree, failrec->start, 0); - 
clear_extent_bits(failure_tree, failrec->start, - failrec->start + failrec->len - 1, - EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); - kfree(failrec); - return -EIO; - } - bio = bio_alloc(GFP_NOFS, 1); - bio->bi_private = state; - bio->bi_end_io = failed_bio->bi_end_io; - bio->bi_sector = failrec->logical >> 9; - bio->bi_bdev = failed_bio->bi_bdev; - bio->bi_size = 0; - - bio_add_page(bio, page, failrec->len, start - page_offset(page)); - if (failed_bio->bi_rw & REQ_WRITE) - rw = WRITE; - else - rw = READ; - - ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, - failrec->last_mirror, - failrec->bio_flags, 0); - return ret; -} - -/* - * each time an IO finishes, we do a fast check in the IO failure tree - * to see if we need to process or clean up an io_failure_record - */ -static int btrfs_clean_io_failures(struct inode *inode, u64 start) -{ - u64 private; - u64 private_failure; - struct io_failure_record *failure; - int ret; - - private = 0; - if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, - (u64)-1, 1, EXTENT_DIRTY, 0)) { - ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, - start, &private_failure); - if (ret == 0) { - failure = (struct io_failure_record *)(unsigned long) - private_failure; - set_state_private(&BTRFS_I(inode)->io_failure_tree, - failure->start, 0); - clear_extent_bits(&BTRFS_I(inode)->io_failure_tree, - failure->start, - failure->start + failure->len - 1, - EXTENT_DIRTY | EXTENT_LOCKED, - GFP_NOFS); - kfree(failure); - } - } - return 0; -} - -/* * when reads are done, we need to check csums to verify the data is correct - * if there's a match, we allow the bio to finish. If not, we go through - * the io_failure_record routines to find good copies + * if there's a match, we allow the bio to finish. If not, the code in + * extent_io.c will try to find good copies for us. */ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state) @@ -2011,10 +1874,6 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, kunmap_atomic(kaddr, KM_USER0); good: - /* if the io failure tree for this inode is non-empty, - * check to see if we've recovered from a failed IO - */ - btrfs_clean_io_failures(inode, start); return 0; zeroit: @@ -2079,89 +1938,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) up_read(&root->fs_info->cleanup_work_sem); } -/* - * calculate extra metadata reservation when snapshotting a subvolume - * contains orphan files. - */ -void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending, - u64 *bytes_to_reserve) -{ - struct btrfs_root *root; - struct btrfs_block_rsv *block_rsv; - u64 num_bytes; - int index; - - root = pending->root; - if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) - return; - - block_rsv = root->orphan_block_rsv; - - /* orphan block reservation for the snapshot */ - num_bytes = block_rsv->size; - - /* - * after the snapshot is created, COWing tree blocks may use more - * space than it frees. So we should make sure there is enough - * reserved space. 
- */ - index = trans->transid & 0x1; - if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { - num_bytes += block_rsv->size - - (block_rsv->reserved + block_rsv->freed[index]); - } - - *bytes_to_reserve += num_bytes; -} - -void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending) -{ - struct btrfs_root *root = pending->root; - struct btrfs_root *snap = pending->snap; - struct btrfs_block_rsv *block_rsv; - u64 num_bytes; - int index; - int ret; - - if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) - return; - - /* refill source subvolume's orphan block reservation */ - block_rsv = root->orphan_block_rsv; - index = trans->transid & 0x1; - if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { - num_bytes = block_rsv->size - - (block_rsv->reserved + block_rsv->freed[index]); - ret = btrfs_block_rsv_migrate(&pending->block_rsv, - root->orphan_block_rsv, - num_bytes); - BUG_ON(ret); - } - - /* setup orphan block reservation for the snapshot */ - block_rsv = btrfs_alloc_block_rsv(snap); - BUG_ON(!block_rsv); - - btrfs_add_durable_block_rsv(root->fs_info, block_rsv); - snap->orphan_block_rsv = block_rsv; - - num_bytes = root->orphan_block_rsv->size; - ret = btrfs_block_rsv_migrate(&pending->block_rsv, - block_rsv, num_bytes); - BUG_ON(ret); - -#if 0 - /* insert orphan item for the snapshot */ - WARN_ON(!root->orphan_item_inserted); - ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, - snap->root_key.objectid); - BUG_ON(ret); - snap->orphan_item_inserted = 1; -#endif -} - enum btrfs_orphan_cleanup_state { ORPHAN_CLEANUP_STARTED = 1, ORPHAN_CLEANUP_DONE = 2, @@ -2247,9 +2023,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) } spin_unlock(&root->orphan_lock); - if (block_rsv) - btrfs_add_durable_block_rsv(root->fs_info, block_rsv); - /* grab metadata reservation from transaction handle */ if (reserve) { ret = btrfs_orphan_reserve_metadata(trans, inode); @@ -2259,7 +2032,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) /* insert an orphan item to track this unlinked/truncated file */ if (insert >= 1) { ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); - BUG_ON(ret); + BUG_ON(ret && ret != -EEXIST); } /* insert an orphan item to track subvolume contains orphan files */ @@ -2316,6 +2089,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) struct btrfs_key key, found_key; struct btrfs_trans_handle *trans; struct inode *inode; + u64 last_objectid = 0; int ret = 0, nr_unlink = 0, nr_truncate = 0; if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) @@ -2367,41 +2141,81 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * crossing root thing. we store the inode number in the * offset of the orphan item. 
*/ + + if (found_key.offset == last_objectid) { + printk(KERN_ERR "btrfs: Error removing orphan entry, " + "stopping orphan cleanup\n"); + ret = -EINVAL; + goto out; + } + + last_objectid = found_key.offset; + found_key.objectid = found_key.offset; found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); + ret = PTR_RET(inode); + if (ret && ret != -ESTALE) goto out; - } - /* - * add this inode to the orphan list so btrfs_orphan_del does - * the proper thing when we hit it - */ - spin_lock(&root->orphan_lock); - list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); - spin_unlock(&root->orphan_lock); + if (ret == -ESTALE && root == root->fs_info->tree_root) { + struct btrfs_root *dead_root; + struct btrfs_fs_info *fs_info = root->fs_info; + int is_dead_root = 0; + /* + * this is an orphan in the tree root. Currently these + * could come from 2 sources: + * a) a snapshot deletion in progress + * b) a free space cache inode + * We need to distinguish those two, as the snapshot + * orphan must not get deleted. + * find_dead_roots already ran before us, so if this + * is a snapshot deletion, we should find the root + * in the dead_roots list + */ + spin_lock(&fs_info->trans_lock); + list_for_each_entry(dead_root, &fs_info->dead_roots, + root_list) { + if (dead_root->root_key.objectid == + found_key.objectid) { + is_dead_root = 1; + break; + } + } + spin_unlock(&fs_info->trans_lock); + if (is_dead_root) { + /* prevent this orphan from being found again */ + key.offset = found_key.objectid - 1; + continue; + } + } /* - * if this is a bad inode, means we actually succeeded in - * removing the inode, but not the orphan record, which means - * we need to manually delete the orphan since iput will just - * do a destroy_inode + * Inode is already gone but the orphan item is still there, + * kill the orphan item. */ - if (is_bad_inode(inode)) { - trans = btrfs_start_transaction(root, 0); + if (ret == -ESTALE) { + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; } - btrfs_orphan_del(trans, inode); + ret = btrfs_del_orphan_item(trans, root, + found_key.objectid); + BUG_ON(ret); btrfs_end_transaction(trans, root); - iput(inode); continue; } + /* + * add this inode to the orphan list so btrfs_orphan_del does + * the proper thing when we hit it + */ + spin_lock(&root->orphan_lock); + list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); + spin_unlock(&root->orphan_lock); + /* if we have links, this was a truncate, lets do that */ if (inode->i_nlink) { if (!S_ISREG(inode->i_mode)) { @@ -2410,7 +2224,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) continue; } nr_truncate++; + /* + * Need to hold the imutex for reservation purposes, not + * a huge deal here but I have a WARN_ON in + * btrfs_delalloc_reserve_space to catch offenders. + */ + mutex_lock(&inode->i_mutex); ret = btrfs_truncate(inode); + mutex_unlock(&inode->i_mutex); } else { nr_unlink++; } @@ -2420,6 +2241,9 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) if (ret) goto out; } + /* release the path since we're done with it */ + btrfs_release_path(path); + root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; if (root->orphan_block_rsv) @@ -2647,7 +2471,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, /* * copy everything in the in-memory inode into the btree. 
*/ -noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, +static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode) { struct btrfs_inode_item *inode_item; @@ -2655,21 +2479,6 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; int ret; - /* - * If the inode is a free space inode, we can deadlock during commit - * if we put it into the delayed code. - * - * The data relocation inode should also be directly updated - * without delay - */ - if (!btrfs_is_free_space_inode(root, inode) - && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { - ret = btrfs_delayed_update_inode(trans, root, inode); - if (!ret) - btrfs_set_inode_last_trans(trans, inode); - return ret; - } - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2698,6 +2507,43 @@ failed: } /* + * copy everything in the in-memory inode into the btree. + */ +noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) +{ + int ret; + + /* + * If the inode is a free space inode, we can deadlock during commit + * if we put it into the delayed code. + * + * The data relocation inode should also be directly updated + * without delay + */ + if (!btrfs_is_free_space_inode(root, inode) + && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { + ret = btrfs_delayed_update_inode(trans, root, inode); + if (!ret) + btrfs_set_inode_last_trans(trans, inode); + return ret; + } + + return btrfs_update_inode_item(trans, root, inode); +} + +static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) +{ + int ret; + + ret = btrfs_update_inode(trans, root, inode); + if (ret == -ENOSPC) + return btrfs_update_inode_item(trans, root, inode); + return ret; +} + +/* * unlink helper that gets used here in inode.c and in the tree logging * recovery code. 
It removes a link in a directory with a given name, and * also drops the back refs in the inode to the directory @@ -2835,7 +2681,16 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, u64 ino = btrfs_ino(inode); u64 dir_ino = btrfs_ino(dir); - trans = btrfs_start_transaction(root, 10); + /* + * 1 for the possible orphan item + * 1 for the dir item + * 1 for the dir index + * 1 for the inode ref + * 1 for the inode ref in the tree log + * 2 for the dir entries in the log + * 1 for the inode + */ + trans = btrfs_start_transaction(root, 8); if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) return trans; @@ -2858,7 +2713,8 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, return ERR_PTR(-ENOMEM); } - trans = btrfs_start_transaction(root, 0); + /* 1 for the orphan item */ + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { btrfs_free_path(path); root->fs_info->enospc_unlink = 0; @@ -2963,6 +2819,12 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, err = 0; out: btrfs_free_path(path); + /* Migrate the orphan reservation over */ + if (!err) + err = btrfs_block_rsv_migrate(trans->block_rsv, + &root->fs_info->global_block_rsv, + trans->bytes_reserved); + if (err) { btrfs_end_transaction(trans, root); root->fs_info->enospc_unlink = 0; @@ -2977,6 +2839,9 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { if (trans->block_rsv == &root->fs_info->global_block_rsv) { + btrfs_block_rsv_release(root, trans->block_rsv, + trans->bytes_reserved); + trans->block_rsv = &root->fs_info->trans_block_rsv; BUG_ON(!root->fs_info->enospc_unlink); root->fs_info->enospc_unlink = 0; } @@ -3368,6 +3233,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) pgoff_t index = from >> PAGE_CACHE_SHIFT; unsigned offset = from & (PAGE_CACHE_SIZE-1); struct page *page; + gfp_t mask = btrfs_alloc_write_mask(mapping); int ret = 0; u64 page_start; u64 page_end; @@ -3380,7 +3246,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) ret = -ENOMEM; again: - page = find_or_create_page(mapping, index, GFP_NOFS); + page = find_or_create_page(mapping, index, mask); if (!page) { btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); goto out; @@ -3501,7 +3367,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) u64 hint_byte = 0; hole_size = last_byte - cur_offset; - trans = btrfs_start_transaction(root, 2); + trans = btrfs_start_transaction(root, 3); if (IS_ERR(trans)) { err = PTR_ERR(trans); break; @@ -3511,6 +3377,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) cur_offset + hole_size, &hint_byte, 1); if (err) { + btrfs_update_inode(trans, root, inode); btrfs_end_transaction(trans, root); break; } @@ -3520,6 +3387,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) 0, hole_size, 0, hole_size, 0, 0, 0); if (err) { + btrfs_update_inode(trans, root, inode); btrfs_end_transaction(trans, root); break; } @@ -3527,6 +3395,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) btrfs_drop_extent_cache(inode, hole_start, last_byte - 1, 0); + btrfs_update_inode(trans, root, inode); btrfs_end_transaction(trans, root); } free_extent_map(em); @@ -3544,6 +3413,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) static int btrfs_setsize(struct inode *inode, loff_t newsize) { + struct btrfs_root *root = BTRFS_I(inode)->root; + struct 
btrfs_trans_handle *trans; loff_t oldsize = i_size_read(inode); int ret; @@ -3551,16 +3422,19 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize) return 0; if (newsize > oldsize) { - i_size_write(inode, newsize); - btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); truncate_pagecache(inode, oldsize, newsize); ret = btrfs_cont_expand(inode, oldsize, newsize); - if (ret) { - btrfs_setsize(inode, oldsize); + if (ret) return ret; - } - mark_inode_dirty(inode); + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + i_size_write(inode, newsize); + btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); + ret = btrfs_update_inode(trans, root, inode); + btrfs_end_transaction_throttle(trans, root); } else { /* @@ -3600,9 +3474,9 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid) { setattr_copy(inode, attr); - mark_inode_dirty(inode); + err = btrfs_dirty_inode(inode); - if (attr->ia_valid & ATTR_MODE) + if (!err && attr->ia_valid & ATTR_MODE) err = btrfs_acl_chmod(inode); } @@ -3613,6 +3487,8 @@ void btrfs_evict_inode(struct inode *inode) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *rsv, *global_rsv; + u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); unsigned long nr; int ret; @@ -3640,22 +3516,55 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } + rsv = btrfs_alloc_block_rsv(root); + if (!rsv) { + btrfs_orphan_del(NULL, inode); + goto no_delete; + } + rsv->size = min_size; + global_rsv = &root->fs_info->global_block_rsv; + btrfs_i_size_write(inode, 0); + /* + * This is a bit simpler than btrfs_truncate since + * + * 1) We've already reserved our space for our orphan item in the + * unlink. + * 2) We're going to delete the inode item, so we don't need to update + * it at all. + * + * So we just need to reserve some slack space in case we add bytes when + * doing the truncate. + */ while (1) { - trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); - trans->block_rsv = root->orphan_block_rsv; + ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); + + /* + * Try and steal from the global reserve since we will + * likely not use this space anyway, we want to try as + * hard as possible to get this to work. 
+ */ + if (ret) + ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size); - ret = btrfs_block_rsv_check(trans, root, - root->orphan_block_rsv, 0, 5); if (ret) { - BUG_ON(ret != -EAGAIN); - ret = btrfs_commit_transaction(trans, root); - BUG_ON(ret); - continue; + printk(KERN_WARNING "Could not get space for a " + "delete, will truncate on mount %d\n", ret); + btrfs_orphan_del(NULL, inode); + btrfs_free_block_rsv(root, rsv); + goto no_delete; + } + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_orphan_del(NULL, inode); + btrfs_free_block_rsv(root, rsv); + goto no_delete; } + trans->block_rsv = rsv; + ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); if (ret != -EAGAIN) break; @@ -3664,14 +3573,17 @@ void btrfs_evict_inode(struct inode *inode) btrfs_end_transaction(trans, root); trans = NULL; btrfs_btree_balance_dirty(root, nr); - } + btrfs_free_block_rsv(root, rsv); + if (ret == 0) { + trans->block_rsv = root->orphan_block_rsv; ret = btrfs_orphan_del(trans, inode); BUG_ON(ret); } + trans->block_rsv = &root->fs_info->trans_block_rsv; if (!(root == root->fs_info->tree_root || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) btrfs_return_ino(root, btrfs_ino(inode)); @@ -4340,42 +4252,80 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) * FIXME, needs more benchmarking...there are no reasons other than performance * to keep or drop this code. */ -void btrfs_dirty_inode(struct inode *inode, int flags) +int btrfs_dirty_inode(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; int ret; if (BTRFS_I(inode)->dummy_inode) - return; + return 0; trans = btrfs_join_transaction(root); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) + return PTR_ERR(trans); ret = btrfs_update_inode(trans, root, inode); if (ret && ret == -ENOSPC) { /* whoops, lets try again with the full transaction */ btrfs_end_transaction(trans, root); trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - printk_ratelimited(KERN_ERR "btrfs: fail to " - "dirty inode %llu error %ld\n", - (unsigned long long)btrfs_ino(inode), - PTR_ERR(trans)); - return; - } + if (IS_ERR(trans)) + return PTR_ERR(trans); ret = btrfs_update_inode(trans, root, inode); - if (ret) { - printk_ratelimited(KERN_ERR "btrfs: fail to " - "dirty inode %llu error %d\n", - (unsigned long long)btrfs_ino(inode), - ret); - } } btrfs_end_transaction(trans, root); if (BTRFS_I(inode)->delayed_node) btrfs_balance_delayed_items(root); + + return ret; +} + +/* + * This is a copy of file_update_time. We need this so we can return error on + * ENOSPC for updating the inode in the case of file write and mmap writes. + */ +int btrfs_update_time(struct file *file) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct timespec now; + int ret; + enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0; + + /* First try to exhaust all avenues to not sync */ + if (IS_NOCMTIME(inode)) + return 0; + + now = current_fs_time(inode->i_sb); + if (!timespec_equal(&inode->i_mtime, &now)) + sync_it = S_MTIME; + + if (!timespec_equal(&inode->i_ctime, &now)) + sync_it |= S_CTIME; + + if (IS_I_VERSION(inode)) + sync_it |= S_VERSION; + + if (!sync_it) + return 0; + + /* Finally allowed to write? Takes lock. 
*/ + if (mnt_want_write_file(file)) + return 0; + + /* Only change inode inside the lock region */ + if (sync_it & S_VERSION) + inode_inc_iversion(inode); + if (sync_it & S_CTIME) + inode->i_ctime = now; + if (sync_it & S_MTIME) + inode->i_mtime = now; + ret = btrfs_dirty_inode(inode); + if (!ret) + mark_inode_dirty_sync(inode); + mnt_drop_write(file->f_path.mnt); + return ret; } /* @@ -4640,10 +4590,6 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans, int err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, dentry->d_name.len, backref, index); - if (!err) { - d_instantiate(dentry, inode); - return 0; - } if (err > 0) err = -EEXIST; return err; @@ -4691,13 +4637,21 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, goto out_unlock; } + /* + * If the active LSM wants to access the inode during + * d_instantiate it needs these. Smack checks to see + * if the filesystem supports xattrs by looking at the + * ops vector. + */ + + inode->i_op = &btrfs_special_inode_operations; err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) drop_inode = 1; else { - inode->i_op = &btrfs_special_inode_operations; init_special_inode(inode, inode->i_mode, rdev); btrfs_update_inode(trans, root, inode); + d_instantiate(dentry, inode); } out_unlock: nr = trans->blocks_used; @@ -4749,15 +4703,23 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, goto out_unlock; } + /* + * If the active LSM wants to access the inode during + * d_instantiate it needs these. Smack checks to see + * if the filesystem supports xattrs by looking at the + * ops vector. + */ + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) drop_inode = 1; else { inode->i_mapping->a_ops = &btrfs_aops; inode->i_mapping->backing_dev_info = &root->fs_info->bdi; - inode->i_fop = &btrfs_file_operations; - inode->i_op = &btrfs_file_inode_operations; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + d_instantiate(dentry, inode); } out_unlock: nr = trans->blocks_used; @@ -4815,6 +4777,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *parent = dentry->d_parent; err = btrfs_update_inode(trans, root, inode); BUG_ON(err); + d_instantiate(dentry, inode); btrfs_log_new_name(trans, inode, NULL, parent); } @@ -5795,8 +5758,7 @@ again: if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { ret = btrfs_ordered_update_i_size(inode, 0, ordered); if (!ret) - ret = btrfs_update_inode(trans, root, inode); - err = ret; + err = btrfs_update_inode_fallback(trans, root, inode); goto out; } @@ -5834,7 +5796,7 @@ again: add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); ret = btrfs_ordered_update_i_size(inode, 0, ordered); if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) - btrfs_update_inode(trans, root, inode); + btrfs_update_inode_fallback(trans, root, inode); ret = 0; out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, @@ -6289,7 +6251,7 @@ int btrfs_readpage(struct file *file, struct page *page) { struct extent_io_tree *tree; tree = &BTRFS_I(page->mapping->host)->io_tree; - return extent_read_full_page(tree, page, btrfs_get_extent); + return extent_read_full_page(tree, page, btrfs_get_extent, 0); } static int btrfs_writepage(struct page *page, struct writeback_control *wbc) @@ -6440,7 +6402,12 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) u64 page_start; u64 page_end; 
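The btrfs_update_time() helper added above follows a cheap-check-first ordering: it first works out which of mtime, ctime and i_version actually differ, returns early when nothing changed, and only then takes the mnt_want_write_file() reference before dirtying the inode (which, unlike file_update_time(), can now report -ENOSPC to the caller). A minimal user-space sketch of that ordering follows; the demo_* names are invented for illustration and are not the kernel interfaces.

/*
 * Illustrative sketch only: decide what needs syncing before paying for
 * the expensive "want write" step, mirroring the ordering used above.
 */
#include <stdio.h>
#include <time.h>

enum { SYNC_MTIME = 1, SYNC_CTIME = 2, SYNC_VERSION = 4 };

struct demo_inode {
	struct timespec mtime;
	struct timespec ctime;
	int has_version;
};

static int demo_timespec_equal(const struct timespec *a, const struct timespec *b)
{
	return a->tv_sec == b->tv_sec && a->tv_nsec == b->tv_nsec;
}

/* Returns a bitmask of what must be written back; 0 means nothing changed. */
static int demo_what_needs_sync(const struct demo_inode *inode, const struct timespec *now)
{
	int sync_it = 0;

	if (!demo_timespec_equal(&inode->mtime, now))
		sync_it |= SYNC_MTIME;
	if (!demo_timespec_equal(&inode->ctime, now))
		sync_it |= SYNC_CTIME;
	if (inode->has_version)
		sync_it |= SYNC_VERSION;
	return sync_it;
}

int main(void)
{
	struct demo_inode inode = { {0, 0}, {0, 0}, 0 };
	struct timespec now;

	clock_gettime(CLOCK_REALTIME, &now);
	/* Cheap check first; only a nonzero mask would justify taking locks. */
	printf("sync mask: %d\n", demo_what_needs_sync(&inode, &now));
	return 0;
}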
+ /* Need this to keep space reservations serialized */ + mutex_lock(&inode->i_mutex); ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + mutex_unlock(&inode->i_mutex); + if (!ret) + ret = btrfs_update_time(vma->vm_file); if (ret) { if (ret == -ENOMEM) ret = VM_FAULT_OOM; @@ -6541,6 +6508,7 @@ static int btrfs_truncate(struct inode *inode) struct btrfs_trans_handle *trans; unsigned long nr; u64 mask = root->sectorsize - 1; + u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); if (ret) @@ -6588,19 +6556,23 @@ static int btrfs_truncate(struct inode *inode) rsv = btrfs_alloc_block_rsv(root); if (!rsv) return -ENOMEM; - btrfs_add_durable_block_rsv(root->fs_info, rsv); + rsv->size = min_size; + /* + * 1 for the truncate slack space + * 1 for the orphan item we're going to add + * 1 for the orphan item deletion + * 1 for updating the inode. + */ trans = btrfs_start_transaction(root, 4); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out; } - /* - * Reserve space for the truncate process. Truncate should be adding - * space, but if there are snapshots it may end up using space. - */ - ret = btrfs_truncate_reserve_metadata(trans, root, rsv); + /* Migrate the slack space for the truncate to our reserve */ + ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, + min_size); BUG_ON(ret); ret = btrfs_orphan_add(trans, inode); @@ -6609,21 +6581,6 @@ static int btrfs_truncate(struct inode *inode) goto out; } - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); - - /* - * Ok so we've already migrated our bytes over for the truncate, so here - * just reserve the one slot we need for updating the inode. - */ - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - err = PTR_ERR(trans); - goto out; - } - trans->block_rsv = rsv; - /* * setattr is responsible for setting the ordered_data_close flag, * but that is only tested during the last file release. That @@ -6645,20 +6602,31 @@ static int btrfs_truncate(struct inode *inode) btrfs_add_ordered_operation(trans, root, inode); while (1) { + ret = btrfs_block_rsv_refill(root, rsv, min_size); + if (ret) { + /* + * This can only happen with the original transaction we + * started above, every other time we shouldn't have a + * transaction started yet. 
+ */ + if (ret == -EAGAIN) + goto end_trans; + err = ret; + break; + } + if (!trans) { - trans = btrfs_start_transaction(root, 3); + /* Just need the 1 for updating the inode */ + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { - err = PTR_ERR(trans); - goto out; + ret = err = PTR_ERR(trans); + trans = NULL; + break; } - - ret = btrfs_truncate_reserve_metadata(trans, root, - rsv); - BUG_ON(ret); - - trans->block_rsv = rsv; } + trans->block_rsv = rsv; + ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); @@ -6673,7 +6641,7 @@ static int btrfs_truncate(struct inode *inode) err = ret; break; } - +end_trans: nr = trans->blocks_used; btrfs_end_transaction(trans, root); trans = NULL; @@ -6693,14 +6661,16 @@ static int btrfs_truncate(struct inode *inode) ret = btrfs_orphan_del(NULL, inode); } - trans->block_rsv = &root->fs_info->trans_block_rsv; - ret = btrfs_update_inode(trans, root, inode); - if (ret && !err) - err = ret; + if (trans) { + trans->block_rsv = &root->fs_info->trans_block_rsv; + ret = btrfs_update_inode(trans, root, inode); + if (ret && !err) + err = ret; - nr = trans->blocks_used; - ret = btrfs_end_transaction_throttle(trans, root); - btrfs_btree_balance_dirty(root, nr); + nr = trans->blocks_used; + ret = btrfs_end_transaction_throttle(trans, root); + btrfs_btree_balance_dirty(root, nr); + } out: btrfs_free_block_rsv(root, rsv); @@ -6755,9 +6725,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->last_sub_trans = 0; ei->logged_trans = 0; ei->delalloc_bytes = 0; - ei->reserved_bytes = 0; ei->disk_i_size = 0; ei->flags = 0; + ei->csum_bytes = 0; ei->index_cnt = (u64)-1; ei->last_unlink_trans = 0; @@ -6769,6 +6739,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->orphan_meta_reserved = 0; ei->dummy_inode = 0; ei->in_defrag = 0; + ei->delalloc_meta_reserved = 0; ei->force_compress = BTRFS_COMPRESS_NONE; ei->delayed_node = NULL; @@ -6803,6 +6774,8 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(inode->i_data.nrpages); WARN_ON(BTRFS_I(inode)->outstanding_extents); WARN_ON(BTRFS_I(inode)->reserved_extents); + WARN_ON(BTRFS_I(inode)->delalloc_bytes); + WARN_ON(BTRFS_I(inode)->csum_bytes); /* * This can happen where we create an inode, but somebody else also @@ -6926,11 +6899,13 @@ static int btrfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; + u32 blocksize = inode->i_sb->s_blocksize; + generic_fillattr(inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; stat->blksize = PAGE_CACHE_SIZE; - stat->blocks = (inode_get_bytes(inode) + - BTRFS_I(inode)->delalloc_bytes) >> 9; + stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + + ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9; return 0; } @@ -7206,14 +7181,21 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, goto out_unlock; } + /* + * If the active LSM wants to access the inode during + * d_instantiate it needs these. Smack checks to see + * if the filesystem supports xattrs by looking at the + * ops vector. 
+ */ + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) drop_inode = 1; else { inode->i_mapping->a_ops = &btrfs_aops; inode->i_mapping->backing_dev_info = &root->fs_info->bdi; - inode->i_fop = &btrfs_file_operations; - inode->i_op = &btrfs_file_inode_operations; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; } if (drop_inode) @@ -7262,6 +7244,8 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, drop_inode = 1; out_unlock: + if (!err) + d_instantiate(dentry, inode); nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); if (drop_inode) { @@ -7420,7 +7404,6 @@ static struct extent_io_ops btrfs_extent_io_ops = { .readpage_end_io_hook = btrfs_readpage_end_io_hook, .writepage_end_io_hook = btrfs_writepage_end_io_hook, .writepage_start_hook = btrfs_writepage_start_hook, - .readpage_io_failed_hook = btrfs_io_failed_hook, .set_bit_hook = btrfs_set_bit_hook, .clear_bit_hook = btrfs_clear_bit_hook, .merge_extent_hook = btrfs_merge_extent_hook, @@ -7484,6 +7467,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = { .follow_link = page_follow_link_light, .put_link = page_put_link, .getattr = btrfs_getattr, + .setattr = btrfs_setattr, .permission = btrfs_permission, .setxattr = btrfs_setxattr, .getxattr = btrfs_getxattr, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index dae5dfe41ba5..c04f02c7d5bb 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -51,6 +51,7 @@ #include "volumes.h" #include "locking.h" #include "inode-map.h" +#include "backref.h" /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) @@ -117,7 +118,7 @@ void btrfs_update_iflags(struct inode *inode) /* * Inherit flags from the parent inode. * - * Unlike extN we don't have any flags we don't want to inherit currently. + * Currently only the compression flags and the cow flags are inherited. 
*/ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) { @@ -128,12 +129,17 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) flags = BTRFS_I(dir)->flags; - if (S_ISREG(inode->i_mode)) - flags &= ~BTRFS_INODE_DIRSYNC; - else if (!S_ISDIR(inode->i_mode)) - flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); + if (flags & BTRFS_INODE_NOCOMPRESS) { + BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + } else if (flags & BTRFS_INODE_COMPRESS) { + BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; + } + + if (flags & BTRFS_INODE_NODATACOW) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; - BTRFS_I(inode)->flags = flags; btrfs_update_iflags(inode); } @@ -246,11 +252,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); + btrfs_update_iflags(inode); + inode->i_ctime = CURRENT_TIME; ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); - btrfs_update_iflags(inode); - inode->i_ctime = CURRENT_TIME; btrfs_end_transaction(trans, root); mnt_drop_write(file->f_path.mnt); @@ -277,6 +283,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) struct fstrim_range range; u64 minlen = ULLONG_MAX; u64 num_devices = 0; + u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); int ret; if (!capable(CAP_SYS_ADMIN)) @@ -295,12 +302,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) } } rcu_read_unlock(); + if (!num_devices) return -EOPNOTSUPP; - if (copy_from_user(&range, arg, sizeof(range))) return -EFAULT; + if (range.start > total_bytes) + return -EINVAL; + range.len = min(range.len, total_bytes - range.start); range.minlen = max(range.minlen, minlen); ret = btrfs_trim_fs(root, &range); if (ret < 0) @@ -760,7 +770,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, int ret = 1; /* - * make sure that once we start defragging and extent, we keep on + * make sure that once we start defragging an extent, we keep on * defragging it */ if (start < *defrag_end) @@ -805,7 +815,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, * extent will force at least part of that big extent to be defragged. 
*/ if (ret) { - *last_len += len; *defrag_end = extent_map_end(em); } else { *last_len = 0; @@ -843,13 +852,16 @@ static int cluster_pages_for_defrag(struct inode *inode, int i_done; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); if (isize == 0) return 0; file_end = (isize - 1) >> PAGE_CACHE_SHIFT; + mutex_lock(&inode->i_mutex); ret = btrfs_delalloc_reserve_space(inode, num_pages << PAGE_CACHE_SHIFT); + mutex_unlock(&inode->i_mutex); if (ret) return ret; again: @@ -860,7 +872,7 @@ again: for (i = 0; i < num_pages; i++) { struct page *page; page = find_or_create_page(inode->i_mapping, - start_index + i, GFP_NOFS); + start_index + i, mask); if (!page) break; @@ -972,18 +984,20 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, struct btrfs_super_block *disk_super; struct file_ra_state *ra = NULL; unsigned long last_index; + u64 isize = i_size_read(inode); u64 features; u64 last_len = 0; u64 skip = 0; u64 defrag_end = 0; u64 newer_off = range->start; - int newer_left = 0; unsigned long i; + unsigned long ra_index = 0; int ret; int defrag_count = 0; int compress_type = BTRFS_COMPRESS_ZLIB; int extent_thresh = range->extent_thresh; - int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; + int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; + int cluster = max_cluster; u64 new_align = ~((u64)128 * 1024 - 1); struct page **pages = NULL; @@ -997,7 +1011,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, compress_type = range->compress_type; } - if (inode->i_size == 0) + if (isize == 0) return 0; /* @@ -1013,7 +1027,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ra = &file->f_ra; } - pages = kmalloc(sizeof(struct page *) * newer_cluster, + pages = kmalloc(sizeof(struct page *) * max_cluster, GFP_NOFS); if (!pages) { ret = -ENOMEM; @@ -1022,10 +1036,10 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, /* find the last page to defrag */ if (range->start + range->len > range->start) { - last_index = min_t(u64, inode->i_size - 1, + last_index = min_t(u64, isize - 1, range->start + range->len - 1) >> PAGE_CACHE_SHIFT; } else { - last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; + last_index = (isize - 1) >> PAGE_CACHE_SHIFT; } if (newer_than) { @@ -1038,14 +1052,13 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, * the extents in the file evenly spaced */ i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; - newer_left = newer_cluster; } else goto out_ra; } else { i = range->start >> PAGE_CACHE_SHIFT; } if (!max_to_defrag) - max_to_defrag = last_index - 1; + max_to_defrag = last_index; /* * make writeback starts from i, so the defrag range can be @@ -1079,18 +1092,31 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, i = max(i + 1, next); continue; } + + if (!newer_than) { + cluster = (PAGE_CACHE_ALIGN(defrag_end) >> + PAGE_CACHE_SHIFT) - i; + cluster = min(cluster, max_cluster); + } else { + cluster = max_cluster; + } + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) BTRFS_I(inode)->force_compress = compress_type; - btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster); + if (i + cluster > ra_index) { + ra_index = max(i, ra_index); + btrfs_force_ra(inode->i_mapping, ra, file, ra_index, + cluster); + ra_index += max_cluster; + } - ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster); + ret = cluster_pages_for_defrag(inode, pages, i, cluster); if (ret < 0) goto out_ra; defrag_count += ret; 
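In the defrag loop above, the per-pass cluster is no longer a fixed 256KiB: when no newer_than filter is given it is clamped so a single cluster_pages_for_defrag() call does not run past the current defrag_end, and readahead is only re-issued once the index catches up with ra_index. A rough user-space model of the clamping is sketched below; the DEMO_* macros and defrag_cluster() stand in for the page-cache macros and are assumptions made for the example.

/* Sketch of the cluster sizing, not btrfs code. */
#include <stdio.h>

#define DEMO_PAGE_SHIFT 12
#define DEMO_PAGE_SIZE  (1UL << DEMO_PAGE_SHIFT)
#define DEMO_PAGE_ALIGN(x) (((x) + DEMO_PAGE_SIZE - 1) & ~(DEMO_PAGE_SIZE - 1))

static unsigned long defrag_cluster(unsigned long defrag_end, unsigned long index,
				    unsigned long max_cluster, int newer_than)
{
	unsigned long cluster;

	if (newer_than)
		return max_cluster;	/* incremental defrag always uses the max */

	/* stop the batch at defrag_end, never exceed the 256KiB cluster */
	cluster = (DEMO_PAGE_ALIGN(defrag_end) >> DEMO_PAGE_SHIFT) - index;
	return cluster < max_cluster ? cluster : max_cluster;
}

int main(void)
{
	unsigned long max_cluster = (256 * 1024) >> DEMO_PAGE_SHIFT; /* 64 pages */

	/* 10 pages left until defrag_end -> batch is capped at 10, not 64 */
	printf("%lu\n", defrag_cluster(10 * DEMO_PAGE_SIZE, 0, max_cluster, 0));
	/* with a newer_than filter the full cluster is used */
	printf("%lu\n", defrag_cluster(10 * DEMO_PAGE_SIZE, 0, max_cluster, 1));
	return 0;
}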
balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); - i += ret; if (newer_than) { if (newer_off == (u64)-1) @@ -1105,12 +1131,17 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (!ret) { range->start = newer_off; i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; - newer_left = newer_cluster; } else { break; } } else { - i++; + if (ret > 0) { + i += ret; + last_len += ret << PAGE_CACHE_SHIFT; + } else { + i++; + last_len = 0; + } } } @@ -1136,16 +1167,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, mutex_unlock(&inode->i_mutex); } - disk_super = &root->fs_info->super_copy; + disk_super = root->fs_info->super_copy; features = btrfs_super_incompat_flags(disk_super); if (range->compress_type == BTRFS_COMPRESS_LZO) { features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; btrfs_set_super_incompat_flags(disk_super, features); } - if (!file) - kfree(ra); - return defrag_count; + ret = defrag_count; out_ra: if (!file) @@ -1189,12 +1218,12 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, *devstr = '\0'; devstr = vol_args->name; devid = simple_strtoull(devstr, &end, 10); - printk(KERN_INFO "resizing devid %llu\n", + printk(KERN_INFO "btrfs: resizing devid %llu\n", (unsigned long long)devid); } device = btrfs_find_device(root, devid, NULL, NULL); if (!device) { - printk(KERN_INFO "resizer unable to find device %llu\n", + printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", (unsigned long long)devid); ret = -EINVAL; goto out_unlock; @@ -1240,7 +1269,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, do_div(new_size, root->sectorsize); new_size *= root->sectorsize; - printk(KERN_INFO "new size for %s is %llu\n", + printk(KERN_INFO "btrfs: new size for %s is %llu\n", device->name, (unsigned long long)new_size); if (new_size > old_size) { @@ -1251,7 +1280,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, } ret = btrfs_grow_device(trans, device, new_size); btrfs_commit_transaction(trans, root); - } else { + } else if (new_size < old_size) { ret = btrfs_shrink_device(device, new_size); } @@ -2587,7 +2616,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) return PTR_ERR(trans); } - dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); + dir_id = btrfs_super_root_dir(root->fs_info->super_copy); di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, dir_id, "default", 7, 1); if (IS_ERR_OR_NULL(di)) { @@ -2603,7 +2632,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - disk_super = &root->fs_info->super_copy; + disk_super = root->fs_info->super_copy; features = btrfs_super_incompat_flags(disk_super); if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) { features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL; @@ -2864,6 +2893,147 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, return ret; } +static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) +{ + int ret = 0; + int i; + u64 rel_ptr; + int size; + struct btrfs_ioctl_ino_path_args *ipa = NULL; + struct inode_fs_paths *ipath = NULL; + struct btrfs_path *path; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + ipa = memdup_user(arg, sizeof(*ipa)); + if (IS_ERR(ipa)) { + ret = PTR_ERR(ipa); + ipa = NULL; + goto out; + } + + size = min_t(u32, ipa->size, 4096); + ipath = init_ipath(size, root, 
path); + if (IS_ERR(ipath)) { + ret = PTR_ERR(ipath); + ipath = NULL; + goto out; + } + + ret = paths_from_inode(ipa->inum, ipath); + if (ret < 0) + goto out; + + for (i = 0; i < ipath->fspath->elem_cnt; ++i) { + rel_ptr = ipath->fspath->val[i] - + (u64)(unsigned long)ipath->fspath->val; + ipath->fspath->val[i] = rel_ptr; + } + + ret = copy_to_user((void *)(unsigned long)ipa->fspath, + (void *)(unsigned long)ipath->fspath, size); + if (ret) { + ret = -EFAULT; + goto out; + } + +out: + btrfs_free_path(path); + free_ipath(ipath); + kfree(ipa); + + return ret; +} + +static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) +{ + struct btrfs_data_container *inodes = ctx; + const size_t c = 3 * sizeof(u64); + + if (inodes->bytes_left >= c) { + inodes->bytes_left -= c; + inodes->val[inodes->elem_cnt] = inum; + inodes->val[inodes->elem_cnt + 1] = offset; + inodes->val[inodes->elem_cnt + 2] = root; + inodes->elem_cnt += 3; + } else { + inodes->bytes_missing += c - inodes->bytes_left; + inodes->bytes_left = 0; + inodes->elem_missed += 3; + } + + return 0; +} + +static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, + void __user *arg) +{ + int ret = 0; + int size; + u64 extent_offset; + struct btrfs_ioctl_logical_ino_args *loi; + struct btrfs_data_container *inodes = NULL; + struct btrfs_path *path = NULL; + struct btrfs_key key; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + loi = memdup_user(arg, sizeof(*loi)); + if (IS_ERR(loi)) { + ret = PTR_ERR(loi); + loi = NULL; + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + size = min_t(u32, loi->size, 4096); + inodes = init_data_container(size); + if (IS_ERR(inodes)) { + ret = PTR_ERR(inodes); + inodes = NULL; + goto out; + } + + ret = extent_from_logical(root->fs_info, loi->logical, path, &key); + + if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) + ret = -ENOENT; + if (ret < 0) + goto out; + + extent_offset = loi->logical - key.objectid; + ret = iterate_extent_inodes(root->fs_info, path, key.objectid, + extent_offset, build_ino_list, inodes); + + if (ret < 0) + goto out; + + ret = copy_to_user((void *)(unsigned long)loi->inodes, + (void *)(unsigned long)inodes, size); + if (ret) + ret = -EFAULT; + +out: + btrfs_free_path(path); + kfree(inodes); + kfree(loi); + + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2921,6 +3091,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_tree_search(file, argp); case BTRFS_IOC_INO_LOOKUP: return btrfs_ioctl_ino_lookup(file, argp); + case BTRFS_IOC_INO_PATHS: + return btrfs_ioctl_ino_to_path(root, argp); + case BTRFS_IOC_LOGICAL_INO: + return btrfs_ioctl_logical_to_ino(root, argp); case BTRFS_IOC_SPACE_INFO: return btrfs_ioctl_space_info(root, argp); case BTRFS_IOC_SYNC: diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index ad1ea789fcb4..252ae9915de8 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -193,6 +193,30 @@ struct btrfs_ioctl_space_args { struct btrfs_ioctl_space_info spaces[0]; }; +struct btrfs_data_container { + __u32 bytes_left; /* out -- bytes not needed to deliver output */ + __u32 bytes_missing; /* out -- additional bytes needed for result */ + __u32 elem_cnt; /* out */ + __u32 elem_missed; /* out */ + __u64 val[0]; /* out */ +}; + +struct btrfs_ioctl_ino_path_args { + __u64 inum; /* in */ + __u32 size; /* in */ + __u64 reserved[4]; + /* struct btrfs_data_container *fspath; out */ + __u64 fspath; /* out */ +}; + +struct btrfs_ioctl_logical_ino_args { + __u64 
logical; /* in */ + __u32 size; /* in */ + __u64 reserved[4]; + /* struct btrfs_data_container *inodes; out */ + __u64 inodes; +}; + #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ struct btrfs_ioctl_vol_args) #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ @@ -248,4 +272,9 @@ struct btrfs_ioctl_space_args { struct btrfs_ioctl_dev_info_args) #define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ struct btrfs_ioctl_fs_info_args) +#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ + struct btrfs_ioctl_ino_path_args) +#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ + struct btrfs_ioctl_ino_path_args) + #endif diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index fb2605d998e9..f38e452486b8 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -158,8 +158,7 @@ static void print_extent_ref_v0(struct extent_buffer *eb, int slot) void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) { int i; - u32 type; - u32 nr = btrfs_header_nritems(l); + u32 type, nr; struct btrfs_item *item; struct btrfs_root_item *ri; struct btrfs_dir_item *di; @@ -172,6 +171,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) struct btrfs_key key; struct btrfs_key found_key; + if (!l) + return; + + nr = btrfs_header_nritems(l); + printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", (unsigned long long)btrfs_header_bytenr(l), nr, btrfs_leaf_free_space(root, l)); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c new file mode 100644 index 000000000000..2373b39a132b --- /dev/null +++ b/fs/btrfs/reada.c @@ -0,0 +1,951 @@ +/* + * Copyright (C) 2011 STRATO. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include <linux/pagemap.h> +#include <linux/writeback.h> +#include <linux/blkdev.h> +#include <linux/rbtree.h> +#include <linux/slab.h> +#include <linux/workqueue.h> +#include "ctree.h" +#include "volumes.h" +#include "disk-io.h" +#include "transaction.h" + +#undef DEBUG + +/* + * This is the implementation for the generic read ahead framework. + * + * To trigger a readahead, btrfs_reada_add must be called. It will start + * a read ahead for the given range [start, end) on tree root. The returned + * handle can either be used to wait on the readahead to finish + * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach). + * + * The read ahead works as follows: + * On btrfs_reada_add, the root of the tree is inserted into a radix_tree. + * reada_start_machine will then search for extents to prefetch and trigger + * some reads. When a read finishes for a node, all contained node/leaf + * pointers that lie in the given range will also be enqueued. The reads will + * be triggered in sequential order, thus giving a big win over a naive + * enumeration. It will also make use of multi-device layouts. 
Each disk + * will have its on read pointer and all disks will by utilized in parallel. + * Also will no two disks read both sides of a mirror simultaneously, as this + * would waste seeking capacity. Instead both disks will read different parts + * of the filesystem. + * Any number of readaheads can be started in parallel. The read order will be + * determined globally, i.e. 2 parallel readaheads will normally finish faster + * than the 2 started one after another. + */ + +#define MAX_MIRRORS 2 +#define MAX_IN_FLIGHT 6 + +struct reada_extctl { + struct list_head list; + struct reada_control *rc; + u64 generation; +}; + +struct reada_extent { + u64 logical; + struct btrfs_key top; + u32 blocksize; + int err; + struct list_head extctl; + struct kref refcnt; + spinlock_t lock; + struct reada_zone *zones[MAX_MIRRORS]; + int nzones; + struct btrfs_device *scheduled_for; +}; + +struct reada_zone { + u64 start; + u64 end; + u64 elems; + struct list_head list; + spinlock_t lock; + int locked; + struct btrfs_device *device; + struct btrfs_device *devs[MAX_MIRRORS]; /* full list, incl self */ + int ndevs; + struct kref refcnt; +}; + +struct reada_machine_work { + struct btrfs_work work; + struct btrfs_fs_info *fs_info; +}; + +static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *); +static void reada_control_release(struct kref *kref); +static void reada_zone_release(struct kref *kref); +static void reada_start_machine(struct btrfs_fs_info *fs_info); +static void __reada_start_machine(struct btrfs_fs_info *fs_info); + +static int reada_add_block(struct reada_control *rc, u64 logical, + struct btrfs_key *top, int level, u64 generation); + +/* recurses */ +/* in case of err, eb might be NULL */ +static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, + u64 start, int err) +{ + int level = 0; + int nritems; + int i; + u64 bytenr; + u64 generation; + struct reada_extent *re; + struct btrfs_fs_info *fs_info = root->fs_info; + struct list_head list; + unsigned long index = start >> PAGE_CACHE_SHIFT; + struct btrfs_device *for_dev; + + if (eb) + level = btrfs_header_level(eb); + + /* find extent */ + spin_lock(&fs_info->reada_lock); + re = radix_tree_lookup(&fs_info->reada_tree, index); + if (re) + kref_get(&re->refcnt); + spin_unlock(&fs_info->reada_lock); + + if (!re) + return -1; + + spin_lock(&re->lock); + /* + * just take the full list from the extent. afterwards we + * don't need the lock anymore + */ + list_replace_init(&re->extctl, &list); + for_dev = re->scheduled_for; + re->scheduled_for = NULL; + spin_unlock(&re->lock); + + if (err == 0) { + nritems = level ? btrfs_header_nritems(eb) : 0; + generation = btrfs_header_generation(eb); + /* + * FIXME: currently we just set nritems to 0 if this is a leaf, + * effectively ignoring the content. In a next step we could + * trigger more readahead depending from the content, e.g. + * fetch the checksums for the extents in the leaf. + */ + } else { + /* + * this is the error case, the extent buffer has not been + * read correctly. We won't access anything from it and + * just cleanup our data structures. Effectively this will + * cut the branch below this node from read ahead. 
+ */ + nritems = 0; + generation = 0; + } + + for (i = 0; i < nritems; i++) { + struct reada_extctl *rec; + u64 n_gen; + struct btrfs_key key; + struct btrfs_key next_key; + + btrfs_node_key_to_cpu(eb, &key, i); + if (i + 1 < nritems) + btrfs_node_key_to_cpu(eb, &next_key, i + 1); + else + next_key = re->top; + bytenr = btrfs_node_blockptr(eb, i); + n_gen = btrfs_node_ptr_generation(eb, i); + + list_for_each_entry(rec, &list, list) { + struct reada_control *rc = rec->rc; + + /* + * if the generation doesn't match, just ignore this + * extctl. This will probably cut off a branch from + * prefetch. Alternatively one could start a new (sub-) + * prefetch for this branch, starting again from root. + * FIXME: move the generation check out of this loop + */ +#ifdef DEBUG + if (rec->generation != generation) { + printk(KERN_DEBUG "generation mismatch for " + "(%llu,%d,%llu) %llu != %llu\n", + key.objectid, key.type, key.offset, + rec->generation, generation); + } +#endif + if (rec->generation == generation && + btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 && + btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0) + reada_add_block(rc, bytenr, &next_key, + level - 1, n_gen); + } + } + /* + * free extctl records + */ + while (!list_empty(&list)) { + struct reada_control *rc; + struct reada_extctl *rec; + + rec = list_first_entry(&list, struct reada_extctl, list); + list_del(&rec->list); + rc = rec->rc; + kfree(rec); + + kref_get(&rc->refcnt); + if (atomic_dec_and_test(&rc->elems)) { + kref_put(&rc->refcnt, reada_control_release); + wake_up(&rc->wait); + } + kref_put(&rc->refcnt, reada_control_release); + + reada_extent_put(fs_info, re); /* one ref for each entry */ + } + reada_extent_put(fs_info, re); /* our ref */ + if (for_dev) + atomic_dec(&for_dev->reada_in_flight); + + return 0; +} + +/* + * start is passed separately in case eb in NULL, which may be the case with + * failed I/O + */ +int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, + u64 start, int err) +{ + int ret; + + ret = __readahead_hook(root, eb, start, err); + + reada_start_machine(root->fs_info); + + return ret; +} + +static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, + struct btrfs_device *dev, u64 logical, + struct btrfs_bio *bbio) +{ + int ret; + int looped = 0; + struct reada_zone *zone; + struct btrfs_block_group_cache *cache = NULL; + u64 start; + u64 end; + int i; + +again: + zone = NULL; + spin_lock(&fs_info->reada_lock); + ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, + logical >> PAGE_CACHE_SHIFT, 1); + if (ret == 1) + kref_get(&zone->refcnt); + spin_unlock(&fs_info->reada_lock); + + if (ret == 1) { + if (logical >= zone->start && logical < zone->end) + return zone; + spin_lock(&fs_info->reada_lock); + kref_put(&zone->refcnt, reada_zone_release); + spin_unlock(&fs_info->reada_lock); + } + + if (looped) + return NULL; + + cache = btrfs_lookup_block_group(fs_info, logical); + if (!cache) + return NULL; + + start = cache->key.objectid; + end = start + cache->key.offset - 1; + btrfs_put_block_group(cache); + + zone = kzalloc(sizeof(*zone), GFP_NOFS); + if (!zone) + return NULL; + + zone->start = start; + zone->end = end; + INIT_LIST_HEAD(&zone->list); + spin_lock_init(&zone->lock); + zone->locked = 0; + kref_init(&zone->refcnt); + zone->elems = 0; + zone->device = dev; /* our device always sits at index 0 */ + for (i = 0; i < bbio->num_stripes; ++i) { + /* bounds have already been checked */ + zone->devs[i] = bbio->stripes[i].dev; + } + zone->ndevs = 
bbio->num_stripes; + + spin_lock(&fs_info->reada_lock); + ret = radix_tree_insert(&dev->reada_zones, + (unsigned long)zone->end >> PAGE_CACHE_SHIFT, + zone); + spin_unlock(&fs_info->reada_lock); + + if (ret) { + kfree(zone); + looped = 1; + goto again; + } + + return zone; +} + +static struct reada_extent *reada_find_extent(struct btrfs_root *root, + u64 logical, + struct btrfs_key *top, int level) +{ + int ret; + int looped = 0; + struct reada_extent *re = NULL; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + struct btrfs_bio *bbio = NULL; + struct btrfs_device *dev; + u32 blocksize; + u64 length; + int nzones = 0; + int i; + unsigned long index = logical >> PAGE_CACHE_SHIFT; + +again: + spin_lock(&fs_info->reada_lock); + re = radix_tree_lookup(&fs_info->reada_tree, index); + if (re) + kref_get(&re->refcnt); + spin_unlock(&fs_info->reada_lock); + + if (re || looped) + return re; + + re = kzalloc(sizeof(*re), GFP_NOFS); + if (!re) + return NULL; + + blocksize = btrfs_level_size(root, level); + re->logical = logical; + re->blocksize = blocksize; + re->top = *top; + INIT_LIST_HEAD(&re->extctl); + spin_lock_init(&re->lock); + kref_init(&re->refcnt); + + /* + * map block + */ + length = blocksize; + ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0); + if (ret || !bbio || length < blocksize) + goto error; + + if (bbio->num_stripes > MAX_MIRRORS) { + printk(KERN_ERR "btrfs readahead: more than %d copies not " + "supported", MAX_MIRRORS); + goto error; + } + + for (nzones = 0; nzones < bbio->num_stripes; ++nzones) { + struct reada_zone *zone; + + dev = bbio->stripes[nzones].dev; + zone = reada_find_zone(fs_info, dev, logical, bbio); + if (!zone) + break; + + re->zones[nzones] = zone; + spin_lock(&zone->lock); + if (!zone->elems) + kref_get(&zone->refcnt); + ++zone->elems; + spin_unlock(&zone->lock); + spin_lock(&fs_info->reada_lock); + kref_put(&zone->refcnt, reada_zone_release); + spin_unlock(&fs_info->reada_lock); + } + re->nzones = nzones; + if (nzones == 0) { + /* not a single zone found, error and out */ + goto error; + } + + /* insert extent in reada_tree + all per-device trees, all or nothing */ + spin_lock(&fs_info->reada_lock); + ret = radix_tree_insert(&fs_info->reada_tree, index, re); + if (ret) { + spin_unlock(&fs_info->reada_lock); + if (ret != -ENOMEM) { + /* someone inserted the extent in the meantime */ + looped = 1; + } + goto error; + } + for (i = 0; i < nzones; ++i) { + dev = bbio->stripes[i].dev; + ret = radix_tree_insert(&dev->reada_extents, index, re); + if (ret) { + while (--i >= 0) { + dev = bbio->stripes[i].dev; + BUG_ON(dev == NULL); + radix_tree_delete(&dev->reada_extents, index); + } + BUG_ON(fs_info == NULL); + radix_tree_delete(&fs_info->reada_tree, index); + spin_unlock(&fs_info->reada_lock); + goto error; + } + } + spin_unlock(&fs_info->reada_lock); + + kfree(bbio); + return re; + +error: + while (nzones) { + struct reada_zone *zone; + + --nzones; + zone = re->zones[nzones]; + kref_get(&zone->refcnt); + spin_lock(&zone->lock); + --zone->elems; + if (zone->elems == 0) { + /* + * no fs_info->reada_lock needed, as this can't be + * the last ref + */ + kref_put(&zone->refcnt, reada_zone_release); + } + spin_unlock(&zone->lock); + + spin_lock(&fs_info->reada_lock); + kref_put(&zone->refcnt, reada_zone_release); + spin_unlock(&fs_info->reada_lock); + } + kfree(bbio); + kfree(re); + if (looped) + goto again; + return NULL; +} + +static void reada_kref_dummy(struct kref *kr) +{ +} + 
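The empty reada_kref_dummy() release above is deliberate: by passing a no-op release to kref_put(), the caller (reada_extent_put(), just below) can detect the final reference drop itself and perform the radix-tree removal and kfree() while still holding fs_info->reada_lock. A small user-space sketch of that idiom follows; the demo_* names are invented for the example and do not exist in the kernel.

/* Sketch of the "no-op release, caller tears down" idiom. */
#include <stdio.h>

struct demo_ref {
	int count;
};

static void demo_release_noop(struct demo_ref *r)
{
	(void)r; /* teardown is handled by the caller, not here */
}

/* Returns 1 when the reference count hit zero, mirroring kref_put(). */
static int demo_put(struct demo_ref *r, void (*release)(struct demo_ref *))
{
	if (--r->count == 0) {
		release(r);
		return 1;
	}
	return 0;
}

int main(void)
{
	struct demo_ref re = { .count = 1 };

	/* lock(&tree_lock);  -- stands in for fs_info->reada_lock */
	if (demo_put(&re, demo_release_noop)) {
		/* last ref gone: remove from the lookup trees, then free */
		printf("last reference dropped, caller cleans up\n");
	}
	/* unlock(&tree_lock); */
	return 0;
}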
+static void reada_extent_put(struct btrfs_fs_info *fs_info, + struct reada_extent *re) +{ + int i; + unsigned long index = re->logical >> PAGE_CACHE_SHIFT; + + spin_lock(&fs_info->reada_lock); + if (!kref_put(&re->refcnt, reada_kref_dummy)) { + spin_unlock(&fs_info->reada_lock); + return; + } + + radix_tree_delete(&fs_info->reada_tree, index); + for (i = 0; i < re->nzones; ++i) { + struct reada_zone *zone = re->zones[i]; + + radix_tree_delete(&zone->device->reada_extents, index); + } + + spin_unlock(&fs_info->reada_lock); + + for (i = 0; i < re->nzones; ++i) { + struct reada_zone *zone = re->zones[i]; + + kref_get(&zone->refcnt); + spin_lock(&zone->lock); + --zone->elems; + if (zone->elems == 0) { + /* no fs_info->reada_lock needed, as this can't be + * the last ref */ + kref_put(&zone->refcnt, reada_zone_release); + } + spin_unlock(&zone->lock); + + spin_lock(&fs_info->reada_lock); + kref_put(&zone->refcnt, reada_zone_release); + spin_unlock(&fs_info->reada_lock); + } + if (re->scheduled_for) + atomic_dec(&re->scheduled_for->reada_in_flight); + + kfree(re); +} + +static void reada_zone_release(struct kref *kref) +{ + struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt); + + radix_tree_delete(&zone->device->reada_zones, + zone->end >> PAGE_CACHE_SHIFT); + + kfree(zone); +} + +static void reada_control_release(struct kref *kref) +{ + struct reada_control *rc = container_of(kref, struct reada_control, + refcnt); + + kfree(rc); +} + +static int reada_add_block(struct reada_control *rc, u64 logical, + struct btrfs_key *top, int level, u64 generation) +{ + struct btrfs_root *root = rc->root; + struct reada_extent *re; + struct reada_extctl *rec; + + re = reada_find_extent(root, logical, top, level); /* takes one ref */ + if (!re) + return -1; + + rec = kzalloc(sizeof(*rec), GFP_NOFS); + if (!rec) { + reada_extent_put(root->fs_info, re); + return -1; + } + + rec->rc = rc; + rec->generation = generation; + atomic_inc(&rc->elems); + + spin_lock(&re->lock); + list_add_tail(&rec->list, &re->extctl); + spin_unlock(&re->lock); + + /* leave the ref on the extent */ + + return 0; +} + +/* + * called with fs_info->reada_lock held + */ +static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock) +{ + int i; + unsigned long index = zone->end >> PAGE_CACHE_SHIFT; + + for (i = 0; i < zone->ndevs; ++i) { + struct reada_zone *peer; + peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index); + if (peer && peer->device != zone->device) + peer->locked = lock; + } +} + +/* + * called with fs_info->reada_lock held + */ +static int reada_pick_zone(struct btrfs_device *dev) +{ + struct reada_zone *top_zone = NULL; + struct reada_zone *top_locked_zone = NULL; + u64 top_elems = 0; + u64 top_locked_elems = 0; + unsigned long index = 0; + int ret; + + if (dev->reada_curr_zone) { + reada_peer_zones_set_lock(dev->reada_curr_zone, 0); + kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release); + dev->reada_curr_zone = NULL; + } + /* pick the zone with the most elements */ + while (1) { + struct reada_zone *zone; + + ret = radix_tree_gang_lookup(&dev->reada_zones, + (void **)&zone, index, 1); + if (ret == 0) + break; + index = (zone->end >> PAGE_CACHE_SHIFT) + 1; + if (zone->locked) { + if (zone->elems > top_locked_elems) { + top_locked_elems = zone->elems; + top_locked_zone = zone; + } + } else { + if (zone->elems > top_elems) { + top_elems = zone->elems; + top_zone = zone; + } + } + } + if (top_zone) + dev->reada_curr_zone = top_zone; + else if (top_locked_zone) + 
dev->reada_curr_zone = top_locked_zone; + else + return 0; + + dev->reada_next = dev->reada_curr_zone->start; + kref_get(&dev->reada_curr_zone->refcnt); + reada_peer_zones_set_lock(dev->reada_curr_zone, 1); + + return 1; +} + +static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, + struct btrfs_device *dev) +{ + struct reada_extent *re = NULL; + int mirror_num = 0; + struct extent_buffer *eb = NULL; + u64 logical; + u32 blocksize; + int ret; + int i; + int need_kick = 0; + + spin_lock(&fs_info->reada_lock); + if (dev->reada_curr_zone == NULL) { + ret = reada_pick_zone(dev); + if (!ret) { + spin_unlock(&fs_info->reada_lock); + return 0; + } + } + /* + * FIXME currently we issue the reads one extent at a time. If we have + * a contiguous block of extents, we could also coagulate them or use + * plugging to speed things up + */ + ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, + dev->reada_next >> PAGE_CACHE_SHIFT, 1); + if (ret == 0 || re->logical >= dev->reada_curr_zone->end) { + ret = reada_pick_zone(dev); + if (!ret) { + spin_unlock(&fs_info->reada_lock); + return 0; + } + re = NULL; + ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, + dev->reada_next >> PAGE_CACHE_SHIFT, 1); + } + if (ret == 0) { + spin_unlock(&fs_info->reada_lock); + return 0; + } + dev->reada_next = re->logical + re->blocksize; + kref_get(&re->refcnt); + + spin_unlock(&fs_info->reada_lock); + + /* + * find mirror num + */ + for (i = 0; i < re->nzones; ++i) { + if (re->zones[i]->device == dev) { + mirror_num = i + 1; + break; + } + } + logical = re->logical; + blocksize = re->blocksize; + + spin_lock(&re->lock); + if (re->scheduled_for == NULL) { + re->scheduled_for = dev; + need_kick = 1; + } + spin_unlock(&re->lock); + + reada_extent_put(fs_info, re); + + if (!need_kick) + return 0; + + atomic_inc(&dev->reada_in_flight); + ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize, + mirror_num, &eb); + if (ret) + __readahead_hook(fs_info->extent_root, NULL, logical, ret); + else if (eb) + __readahead_hook(fs_info->extent_root, eb, eb->start, ret); + + if (eb) + free_extent_buffer(eb); + + return 1; + +} + +static void reada_start_machine_worker(struct btrfs_work *work) +{ + struct reada_machine_work *rmw; + struct btrfs_fs_info *fs_info; + + rmw = container_of(work, struct reada_machine_work, work); + fs_info = rmw->fs_info; + + kfree(rmw); + + __reada_start_machine(fs_info); +} + +static void __reada_start_machine(struct btrfs_fs_info *fs_info) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + u64 enqueued; + u64 total = 0; + int i; + + do { + enqueued = 0; + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (atomic_read(&device->reada_in_flight) < + MAX_IN_FLIGHT) + enqueued += reada_start_machine_dev(fs_info, + device); + } + total += enqueued; + } while (enqueued && total < 10000); + + if (enqueued == 0) + return; + + /* + * If everything is already in the cache, this is effectively single + * threaded. To a) not hold the caller for too long and b) to utilize + * more cores, we broke the loop above after 10000 iterations and now + * enqueue to workers to finish it. This will distribute the load to + * the cores. 
+ */ + for (i = 0; i < 2; ++i) + reada_start_machine(fs_info); +} + +static void reada_start_machine(struct btrfs_fs_info *fs_info) +{ + struct reada_machine_work *rmw; + + rmw = kzalloc(sizeof(*rmw), GFP_NOFS); + if (!rmw) { + /* FIXME we cannot handle this properly right now */ + BUG(); + } + rmw->work.func = reada_start_machine_worker; + rmw->fs_info = fs_info; + + btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work); +} + +#ifdef DEBUG +static void dump_devs(struct btrfs_fs_info *fs_info, int all) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + unsigned long index; + int ret; + int i; + int j; + int cnt; + + spin_lock(&fs_info->reada_lock); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid, + atomic_read(&device->reada_in_flight)); + index = 0; + while (1) { + struct reada_zone *zone; + ret = radix_tree_gang_lookup(&device->reada_zones, + (void **)&zone, index, 1); + if (ret == 0) + break; + printk(KERN_DEBUG " zone %llu-%llu elems %llu locked " + "%d devs", zone->start, zone->end, zone->elems, + zone->locked); + for (j = 0; j < zone->ndevs; ++j) { + printk(KERN_CONT " %lld", + zone->devs[j]->devid); + } + if (device->reada_curr_zone == zone) + printk(KERN_CONT " curr off %llu", + device->reada_next - zone->start); + printk(KERN_CONT "\n"); + index = (zone->end >> PAGE_CACHE_SHIFT) + 1; + } + cnt = 0; + index = 0; + while (all) { + struct reada_extent *re = NULL; + + ret = radix_tree_gang_lookup(&device->reada_extents, + (void **)&re, index, 1); + if (ret == 0) + break; + printk(KERN_DEBUG + " re: logical %llu size %u empty %d for %lld", + re->logical, re->blocksize, + list_empty(&re->extctl), re->scheduled_for ? + re->scheduled_for->devid : -1); + + for (i = 0; i < re->nzones; ++i) { + printk(KERN_CONT " zone %llu-%llu devs", + re->zones[i]->start, + re->zones[i]->end); + for (j = 0; j < re->zones[i]->ndevs; ++j) { + printk(KERN_CONT " %lld", + re->zones[i]->devs[j]->devid); + } + } + printk(KERN_CONT "\n"); + index = (re->logical >> PAGE_CACHE_SHIFT) + 1; + if (++cnt > 15) + break; + } + } + + index = 0; + cnt = 0; + while (all) { + struct reada_extent *re = NULL; + + ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re, + index, 1); + if (ret == 0) + break; + if (!re->scheduled_for) { + index = (re->logical >> PAGE_CACHE_SHIFT) + 1; + continue; + } + printk(KERN_DEBUG + "re: logical %llu size %u list empty %d for %lld", + re->logical, re->blocksize, list_empty(&re->extctl), + re->scheduled_for ? 
re->scheduled_for->devid : -1); + for (i = 0; i < re->nzones; ++i) { + printk(KERN_CONT " zone %llu-%llu devs", + re->zones[i]->start, + re->zones[i]->end); + for (i = 0; i < re->nzones; ++i) { + printk(KERN_CONT " zone %llu-%llu devs", + re->zones[i]->start, + re->zones[i]->end); + for (j = 0; j < re->zones[i]->ndevs; ++j) { + printk(KERN_CONT " %lld", + re->zones[i]->devs[j]->devid); + } + } + } + printk(KERN_CONT "\n"); + index = (re->logical >> PAGE_CACHE_SHIFT) + 1; + } + spin_unlock(&fs_info->reada_lock); +} +#endif + +/* + * interface + */ +struct reada_control *btrfs_reada_add(struct btrfs_root *root, + struct btrfs_key *key_start, struct btrfs_key *key_end) +{ + struct reada_control *rc; + u64 start; + u64 generation; + int level; + struct extent_buffer *node; + static struct btrfs_key max_key = { + .objectid = (u64)-1, + .type = (u8)-1, + .offset = (u64)-1 + }; + + rc = kzalloc(sizeof(*rc), GFP_NOFS); + if (!rc) + return ERR_PTR(-ENOMEM); + + rc->root = root; + rc->key_start = *key_start; + rc->key_end = *key_end; + atomic_set(&rc->elems, 0); + init_waitqueue_head(&rc->wait); + kref_init(&rc->refcnt); + kref_get(&rc->refcnt); /* one ref for having elements */ + + node = btrfs_root_node(root); + start = node->start; + level = btrfs_header_level(node); + generation = btrfs_header_generation(node); + free_extent_buffer(node); + + reada_add_block(rc, start, &max_key, level, generation); + + reada_start_machine(root->fs_info); + + return rc; +} + +#ifdef DEBUG +int btrfs_reada_wait(void *handle) +{ + struct reada_control *rc = handle; + + while (atomic_read(&rc->elems)) { + wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, + 5 * HZ); + dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0); + } + + dump_devs(rc->root->fs_info, rc->elems < 10 ? 
1 : 0); + + kref_put(&rc->refcnt, reada_control_release); + + return 0; +} +#else +int btrfs_reada_wait(void *handle) +{ + struct reada_control *rc = handle; + + while (atomic_read(&rc->elems)) { + wait_event(rc->wait, atomic_read(&rc->elems) == 0); + } + + kref_put(&rc->refcnt, reada_control_release); + + return 0; +} +#endif + +void btrfs_reada_detach(void *handle) +{ + struct reada_control *rc = handle; + + kref_put(&rc->refcnt, reada_control_release); +} diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 59bb1764273d..cfb55434a469 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1174,6 +1174,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans, list_add_tail(&new_edge->list[UPPER], &new_node->lower); } + } else { + list_add_tail(&new_node->lower, &cache->leaves); } rb_node = tree_insert(&cache->rb_root, new_node->bytenr, @@ -2041,8 +2043,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, BUG_ON(IS_ERR(trans)); trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, - min_reserved, 0); + ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved); if (ret) { BUG_ON(ret != -EAGAIN); ret = btrfs_commit_transaction(trans, root); @@ -2152,8 +2153,7 @@ int prepare_to_merge(struct reloc_control *rc, int err) again: if (!err) { num_bytes = rc->merging_rsv_size; - ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, - num_bytes); + ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); if (ret) err = ret; } @@ -2427,7 +2427,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, num_bytes = calcu_metadata_size(rc, node, 1) * 2; trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes); + ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); if (ret) { if (ret == -EAGAIN) rc->commit_transaction = 1; @@ -2922,6 +2922,7 @@ static int relocate_file_extent_cluster(struct inode *inode, unsigned long last_index; struct page *page; struct file_ra_state *ra; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int nr = 0; int ret = 0; @@ -2946,7 +2947,9 @@ static int relocate_file_extent_cluster(struct inode *inode, index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; while (index <= last_index) { + mutex_lock(&inode->i_mutex); ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); + mutex_unlock(&inode->i_mutex); if (ret) goto out; @@ -2956,7 +2959,7 @@ static int relocate_file_extent_cluster(struct inode *inode, ra, NULL, index, last_index + 1 - index); page = find_or_create_page(inode->i_mapping, index, - GFP_NOFS); + mask); if (!page) { btrfs_delalloc_release_metadata(inode, PAGE_CACHE_SIZE); @@ -3323,8 +3326,11 @@ static int find_data_references(struct reloc_control *rc, } key.objectid = ref_objectid; - key.offset = ref_offset; key.type = BTRFS_EXTENT_DATA_KEY; + if (ref_offset > ((u64)-1 << 32)) + key.offset = 0; + else + key.offset = ref_offset; path->search_commit_root = 1; path->skip_locking = 1; @@ -3645,14 +3651,11 @@ int prepare_to_relocate(struct reloc_control *rc) * btrfs_init_reloc_root will use them when there * is no reservation in transaction handle. 
*/ - ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, + ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv, rc->extent_root->nodesize * 256); if (ret) return ret; - rc->block_rsv->refill_used = 1; - btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv); - memset(&rc->cluster, 0, sizeof(rc->cluster)); rc->search_start = rc->block_group->key.objectid; rc->extents_found = 0; @@ -3777,8 +3780,7 @@ restart: } } - ret = btrfs_block_rsv_check(trans, rc->extent_root, - rc->block_rsv, 0, 5); + ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5); if (ret < 0) { if (ret != -EAGAIN) { err = ret; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a8d03d5efb5d..ddf2c90d3fc0 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -17,10 +17,14 @@ */ #include <linux/blkdev.h> +#include <linux/ratelimit.h> #include "ctree.h" #include "volumes.h" #include "disk-io.h" #include "ordered-data.h" +#include "transaction.h" +#include "backref.h" +#include "extent_io.h" /* * This is only the first step towards a full-features scrub. It reads all @@ -29,15 +33,12 @@ * any can be found. * * Future enhancements: - * - To enhance the performance, better read-ahead strategies for the - * extent-tree can be employed. * - In case an unrepairable extent is encountered, track which files are * affected and report them * - In case of a read error on files with nodatasum, map the file and read * the extent to trigger a writeback of the good copy * - track and record media errors, throw out bad devices * - add a mode to also read unallocated space - * - make the prefetch cancellable */ struct scrub_bio; @@ -63,7 +64,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix); struct scrub_page { u64 flags; /* extent flags */ u64 generation; - u64 mirror_num; + int mirror_num; int have_csum; u8 csum[BTRFS_CSUM_SIZE]; }; @@ -87,6 +88,7 @@ struct scrub_dev { int first_free; int curr; atomic_t in_flight; + atomic_t fixup_cnt; spinlock_t list_lock; wait_queue_head_t list_wait; u16 csum_size; @@ -100,6 +102,27 @@ struct scrub_dev { spinlock_t stat_lock; }; +struct scrub_fixup_nodatasum { + struct scrub_dev *sdev; + u64 logical; + struct btrfs_root *root; + struct btrfs_work work; + int mirror_num; +}; + +struct scrub_warning { + struct btrfs_path *path; + u64 extent_item_size; + char *scratch_buf; + char *msg_buf; + const char *errstr; + sector_t sector; + u64 logical; + struct btrfs_device *dev; + int msg_bufsize; + int scratch_bufsize; +}; + static void scrub_free_csums(struct scrub_dev *sdev) { while (!list_empty(&sdev->csum_list)) { @@ -175,14 +198,15 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) if (i != SCRUB_BIOS_PER_DEV-1) sdev->bios[i]->next_free = i + 1; - else + else sdev->bios[i]->next_free = -1; } sdev->first_free = 0; sdev->curr = -1; atomic_set(&sdev->in_flight, 0); + atomic_set(&sdev->fixup_cnt, 0); atomic_set(&sdev->cancel_req, 0); - sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); + sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy); INIT_LIST_HEAD(&sdev->csum_list); spin_lock_init(&sdev->list_lock); @@ -195,24 +219,366 @@ nomem: return ERR_PTR(-ENOMEM); } +static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) +{ + u64 isize; + u32 nlink; + int ret; + int i; + struct extent_buffer *eb; + struct btrfs_inode_item *inode_item; + struct scrub_warning *swarn = ctx; + struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info; + struct inode_fs_paths *ipath = NULL; + struct btrfs_root *local_root; + 
struct btrfs_key root_key; + + root_key.objectid = root; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = (u64)-1; + local_root = btrfs_read_fs_root_no_name(fs_info, &root_key); + if (IS_ERR(local_root)) { + ret = PTR_ERR(local_root); + goto err; + } + + ret = inode_item_info(inum, 0, local_root, swarn->path); + if (ret) { + btrfs_release_path(swarn->path); + goto err; + } + + eb = swarn->path->nodes[0]; + inode_item = btrfs_item_ptr(eb, swarn->path->slots[0], + struct btrfs_inode_item); + isize = btrfs_inode_size(eb, inode_item); + nlink = btrfs_inode_nlink(eb, inode_item); + btrfs_release_path(swarn->path); + + ipath = init_ipath(4096, local_root, swarn->path); + if (IS_ERR(ipath)) { + ret = PTR_ERR(ipath); + ipath = NULL; + goto err; + } + ret = paths_from_inode(inum, ipath); + + if (ret < 0) + goto err; + + /* + * we deliberately ignore the bit ipath might have been too small to + * hold all of the paths here + */ + for (i = 0; i < ipath->fspath->elem_cnt; ++i) + printk(KERN_WARNING "btrfs: %s at logical %llu on dev " + "%s, sector %llu, root %llu, inode %llu, offset %llu, " + "length %llu, links %u (path: %s)\n", swarn->errstr, + swarn->logical, swarn->dev->name, + (unsigned long long)swarn->sector, root, inum, offset, + min(isize - offset, (u64)PAGE_SIZE), nlink, + (char *)(unsigned long)ipath->fspath->val[i]); + + free_ipath(ipath); + return 0; + +err: + printk(KERN_WARNING "btrfs: %s at logical %llu on dev " + "%s, sector %llu, root %llu, inode %llu, offset %llu: path " + "resolving failed with ret=%d\n", swarn->errstr, + swarn->logical, swarn->dev->name, + (unsigned long long)swarn->sector, root, inum, offset, ret); + + free_ipath(ipath); + return 0; +} + +static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, + int ix) +{ + struct btrfs_device *dev = sbio->sdev->dev; + struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; + struct btrfs_path *path; + struct btrfs_key found_key; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct scrub_warning swarn; + u32 item_size; + int ret; + u64 ref_root; + u8 ref_level; + unsigned long ptr = 0; + const int bufsize = 4096; + u64 extent_offset; + + path = btrfs_alloc_path(); + + swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); + swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); + swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9; + swarn.logical = sbio->logical + ix * PAGE_SIZE; + swarn.errstr = errstr; + swarn.dev = dev; + swarn.msg_bufsize = bufsize; + swarn.scratch_bufsize = bufsize; + + if (!path || !swarn.scratch_buf || !swarn.msg_buf) + goto out; + + ret = extent_from_logical(fs_info, swarn.logical, path, &found_key); + if (ret < 0) + goto out; + + extent_offset = swarn.logical - found_key.objectid; + swarn.extent_item_size = found_key.offset; + + eb = path->nodes[0]; + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + item_size = btrfs_item_size_nr(eb, path->slots[0]); + + if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + do { + ret = tree_backref_for_extent(&ptr, eb, ei, item_size, + &ref_root, &ref_level); + printk(KERN_WARNING "%s at logical %llu on dev %s, " + "sector %llu: metadata %s (level %d) in tree " + "%llu\n", errstr, swarn.logical, dev->name, + (unsigned long long)swarn.sector, + ref_level ? "node" : "leaf", + ret < 0 ? -1 : ref_level, + ret < 0 ? 
-1 : ref_root); + } while (ret != 1); + } else { + swarn.path = path; + iterate_extent_inodes(fs_info, path, found_key.objectid, + extent_offset, + scrub_print_warning_inode, &swarn); + } + +out: + btrfs_free_path(path); + kfree(swarn.scratch_buf); + kfree(swarn.msg_buf); +} + +static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) +{ + struct page *page = NULL; + unsigned long index; + struct scrub_fixup_nodatasum *fixup = ctx; + int ret; + int corrected = 0; + struct btrfs_key key; + struct inode *inode = NULL; + u64 end = offset + PAGE_SIZE - 1; + struct btrfs_root *local_root; + + key.objectid = root; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key); + if (IS_ERR(local_root)) + return PTR_ERR(local_root); + + key.type = BTRFS_INODE_ITEM_KEY; + key.objectid = inum; + key.offset = 0; + inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + index = offset >> PAGE_CACHE_SHIFT; + + page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + if (!page) { + ret = -ENOMEM; + goto out; + } + + if (PageUptodate(page)) { + struct btrfs_mapping_tree *map_tree; + if (PageDirty(page)) { + /* + * we need to write the data to the defect sector. the + * data that was in that sector is not in memory, + * because the page was modified. we must not write the + * modified page to that sector. + * + * TODO: what could be done here: wait for the delalloc + * runner to write out that page (might involve + * COW) and see whether the sector is still + * referenced afterwards. + * + * For the meantime, we'll treat this error + * incorrectable, although there is a chance that a + * later scrub will find the bad sector again and that + * there's no dirty page in memory, then. + */ + ret = -EIO; + goto out; + } + map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; + ret = repair_io_failure(map_tree, offset, PAGE_SIZE, + fixup->logical, page, + fixup->mirror_num); + unlock_page(page); + corrected = !ret; + } else { + /* + * we need to get good data first. the general readpage path + * will call repair_io_failure for us, we just have to make + * sure we read the bad mirror. + */ + ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, + EXTENT_DAMAGED, GFP_NOFS); + if (ret) { + /* set_extent_bits should give proper error */ + WARN_ON(ret > 0); + if (ret > 0) + ret = -EFAULT; + goto out; + } + + ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page, + btrfs_get_extent, + fixup->mirror_num); + wait_on_page_locked(page); + + corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset, + end, EXTENT_DAMAGED, 0, NULL); + if (!corrected) + clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, + EXTENT_DAMAGED, GFP_NOFS); + } + +out: + if (page) + put_page(page); + if (inode) + iput(inode); + + if (ret < 0) + return ret; + + if (ret == 0 && corrected) { + /* + * we only need to call readpage for one of the inodes belonging + * to this extent. 
so make iterate_extent_inodes stop + */ + return 1; + } + + return -EIO; +} + +static void scrub_fixup_nodatasum(struct btrfs_work *work) +{ + int ret; + struct scrub_fixup_nodatasum *fixup; + struct scrub_dev *sdev; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_fs_info *fs_info; + struct btrfs_path *path; + int uncorrectable = 0; + + fixup = container_of(work, struct scrub_fixup_nodatasum, work); + sdev = fixup->sdev; + fs_info = fixup->root->fs_info; + + path = btrfs_alloc_path(); + if (!path) { + spin_lock(&sdev->stat_lock); + ++sdev->stat.malloc_errors; + spin_unlock(&sdev->stat_lock); + uncorrectable = 1; + goto out; + } + + trans = btrfs_join_transaction(fixup->root); + if (IS_ERR(trans)) { + uncorrectable = 1; + goto out; + } + + /* + * the idea is to trigger a regular read through the standard path. we + * read a page from the (failed) logical address by specifying the + * corresponding copynum of the failed sector. thus, that readpage is + * expected to fail. + * that is the point where on-the-fly error correction will kick in + * (once it's finished) and rewrite the failed sector if a good copy + * can be found. + */ + ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info, + path, scrub_fixup_readpage, + fixup); + if (ret < 0) { + uncorrectable = 1; + goto out; + } + WARN_ON(ret != 1); + + spin_lock(&sdev->stat_lock); + ++sdev->stat.corrected_errors; + spin_unlock(&sdev->stat_lock); + +out: + if (trans && !IS_ERR(trans)) + btrfs_end_transaction(trans, fixup->root); + if (uncorrectable) { + spin_lock(&sdev->stat_lock); + ++sdev->stat.uncorrectable_errors; + spin_unlock(&sdev->stat_lock); + printk_ratelimited(KERN_ERR "btrfs: unable to fixup " + "(nodatasum) error at logical %llu\n", + fixup->logical); + } + + btrfs_free_path(path); + kfree(fixup); + + /* see caller why we're pretending to be paused in the scrub counters */ + mutex_lock(&fs_info->scrub_lock); + atomic_dec(&fs_info->scrubs_running); + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + atomic_dec(&sdev->fixup_cnt); + wake_up(&fs_info->scrub_pause_wait); + wake_up(&sdev->list_wait); +} + /* * scrub_recheck_error gets called when either verification of the page * failed or the bio failed to read, e.g. with EIO. 
In the latter case, * recheck_error gets called for every page in the bio, even though only * one may be bad */ -static void scrub_recheck_error(struct scrub_bio *sbio, int ix) +static int scrub_recheck_error(struct scrub_bio *sbio, int ix) { + struct scrub_dev *sdev = sbio->sdev; + u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9; + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + if (sbio->err) { - if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, - (sbio->physical + ix * PAGE_SIZE) >> 9, + if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector, sbio->bio->bi_io_vec[ix].bv_page) == 0) { if (scrub_fixup_check(sbio, ix) == 0) - return; + return 0; } + if (__ratelimit(&_rs)) + scrub_print_warning("i/o error", sbio, ix); + } else { + if (__ratelimit(&_rs)) + scrub_print_warning("checksum error", sbio, ix); } + spin_lock(&sdev->stat_lock); + ++sdev->stat.read_errors; + spin_unlock(&sdev->stat_lock); + scrub_fixup(sbio, ix); + return 1; } static int scrub_fixup_check(struct scrub_bio *sbio, int ix) @@ -250,7 +616,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) struct scrub_dev *sdev = sbio->sdev; struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; - struct btrfs_multi_bio *multi = NULL; + struct btrfs_bio *bbio = NULL; + struct scrub_fixup_nodatasum *fixup; u64 logical = sbio->logical + ix * PAGE_SIZE; u64 length; int i; @@ -259,38 +626,57 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && (sbio->spag[ix].have_csum == 0)) { + fixup = kzalloc(sizeof(*fixup), GFP_NOFS); + if (!fixup) + goto uncorrectable; + fixup->sdev = sdev; + fixup->logical = logical; + fixup->root = fs_info->extent_root; + fixup->mirror_num = sbio->spag[ix].mirror_num; /* - * nodatasum, don't try to fix anything - * FIXME: we can do better, open the inode and trigger a - * writeback + * increment scrubs_running to prevent cancel requests from + * completing as long as a fixup worker is running. we must also + * increment scrubs_paused to prevent deadlocking on pause + * requests used for transactions commits (as the worker uses a + * transaction context). it is safe to regard the fixup worker + * as paused for all matters practical. effectively, we only + * avoid cancellation requests from completing. 
*/ - goto uncorrectable; + mutex_lock(&fs_info->scrub_lock); + atomic_inc(&fs_info->scrubs_running); + atomic_inc(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + atomic_inc(&sdev->fixup_cnt); + fixup->work.func = scrub_fixup_nodatasum; + btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work); + return; } length = PAGE_SIZE; ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, - &multi, 0); - if (ret || !multi || length < PAGE_SIZE) { + &bbio, 0); + if (ret || !bbio || length < PAGE_SIZE) { printk(KERN_ERR "scrub_fixup: btrfs_map_block failed us for %llu\n", (unsigned long long)logical); WARN_ON(1); + kfree(bbio); return; } - if (multi->num_stripes == 1) + if (bbio->num_stripes == 1) /* there aren't any replicas */ goto uncorrectable; /* * first find a good copy */ - for (i = 0; i < multi->num_stripes; ++i) { - if (i == sbio->spag[ix].mirror_num) + for (i = 0; i < bbio->num_stripes; ++i) { + if (i + 1 == sbio->spag[ix].mirror_num) continue; - if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev, - multi->stripes[i].physical >> 9, + if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev, + bbio->stripes[i].physical >> 9, sbio->bio->bi_io_vec[ix].bv_page)) { /* I/O-error, this is not a good copy */ continue; @@ -299,7 +685,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) if (scrub_fixup_check(sbio, ix) == 0) break; } - if (i == multi->num_stripes) + if (i == bbio->num_stripes) goto uncorrectable; if (!sdev->readonly) { @@ -314,25 +700,23 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) } } - kfree(multi); + kfree(bbio); spin_lock(&sdev->stat_lock); ++sdev->stat.corrected_errors; spin_unlock(&sdev->stat_lock); - if (printk_ratelimit()) - printk(KERN_ERR "btrfs: fixed up at %llu\n", - (unsigned long long)logical); + printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n", + (unsigned long long)logical); return; uncorrectable: - kfree(multi); + kfree(bbio); spin_lock(&sdev->stat_lock); ++sdev->stat.uncorrectable_errors; spin_unlock(&sdev->stat_lock); - if (printk_ratelimit()) - printk(KERN_ERR "btrfs: unable to fixup at %llu\n", - (unsigned long long)logical); + printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at " + "logical %llu\n", (unsigned long long)logical); } static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, @@ -382,8 +766,14 @@ static void scrub_checksum(struct btrfs_work *work) int ret; if (sbio->err) { + ret = 0; for (i = 0; i < sbio->count; ++i) - scrub_recheck_error(sbio, i); + ret |= scrub_recheck_error(sbio, i); + if (!ret) { + spin_lock(&sdev->stat_lock); + ++sdev->stat.unverified_errors; + spin_unlock(&sdev->stat_lock); + } sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); sbio->bio->bi_flags |= 1 << BIO_UPTODATE; @@ -396,10 +786,6 @@ static void scrub_checksum(struct btrfs_work *work) bi->bv_offset = 0; bi->bv_len = PAGE_SIZE; } - - spin_lock(&sdev->stat_lock); - ++sdev->stat.read_errors; - spin_unlock(&sdev->stat_lock); goto out; } for (i = 0; i < sbio->count; ++i) { @@ -420,8 +806,14 @@ static void scrub_checksum(struct btrfs_work *work) WARN_ON(1); } kunmap_atomic(buffer, KM_USER0); - if (ret) - scrub_recheck_error(sbio, i); + if (ret) { + ret = scrub_recheck_error(sbio, i); + if (!ret) { + spin_lock(&sdev->stat_lock); + ++sdev->stat.unverified_errors; + spin_unlock(&sdev->stat_lock); + } + } } out: @@ -557,57 +949,27 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer) static int scrub_submit(struct scrub_dev *sdev) { struct scrub_bio *sbio; - struct 
bio *bio; - int i; if (sdev->curr == -1) return 0; sbio = sdev->bios[sdev->curr]; - - bio = bio_alloc(GFP_NOFS, sbio->count); - if (!bio) - goto nomem; - - bio->bi_private = sbio; - bio->bi_end_io = scrub_bio_end_io; - bio->bi_bdev = sdev->dev->bdev; - bio->bi_sector = sbio->physical >> 9; - - for (i = 0; i < sbio->count; ++i) { - struct page *page; - int ret; - - page = alloc_page(GFP_NOFS); - if (!page) - goto nomem; - - ret = bio_add_page(bio, page, PAGE_SIZE, 0); - if (!ret) { - __free_page(page); - goto nomem; - } - } - sbio->err = 0; sdev->curr = -1; atomic_inc(&sdev->in_flight); - submit_bio(READ, bio); + submit_bio(READ, sbio->bio); return 0; - -nomem: - scrub_free_bio(bio); - - return -ENOMEM; } static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, u64 mirror_num, + u64 physical, u64 flags, u64 gen, int mirror_num, u8 *csum, int force) { struct scrub_bio *sbio; + struct page *page; + int ret; again: /* @@ -628,12 +990,22 @@ again: } sbio = sdev->bios[sdev->curr]; if (sbio->count == 0) { + struct bio *bio; + sbio->physical = physical; sbio->logical = logical; + bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO); + if (!bio) + return -ENOMEM; + + bio->bi_private = sbio; + bio->bi_end_io = scrub_bio_end_io; + bio->bi_bdev = sdev->dev->bdev; + bio->bi_sector = sbio->physical >> 9; + sbio->err = 0; + sbio->bio = bio; } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || sbio->logical + sbio->count * PAGE_SIZE != logical) { - int ret; - ret = scrub_submit(sdev); if (ret) return ret; @@ -643,6 +1015,20 @@ again: sbio->spag[sbio->count].generation = gen; sbio->spag[sbio->count].have_csum = 0; sbio->spag[sbio->count].mirror_num = mirror_num; + + page = alloc_page(GFP_NOFS); + if (!page) + return -ENOMEM; + + ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0); + if (!ret) { + __free_page(page); + ret = scrub_submit(sdev); + if (ret) + return ret; + goto again; + } + if (csum) { sbio->spag[sbio->count].have_csum = 1; memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); @@ -701,7 +1087,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, /* scrub extent tries to collect up to 64 kB for each bio */ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, - u64 physical, u64 flags, u64 gen, u64 mirror_num) + u64 physical, u64 flags, u64 gen, int mirror_num) { int ret; u8 csum[BTRFS_CSUM_SIZE]; @@ -741,13 +1127,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, int slot; int i; u64 nstripes; - int start_stripe; struct extent_buffer *l; struct btrfs_key key; u64 physical; u64 logical; u64 generation; - u64 mirror_num; + int mirror_num; + struct reada_control *reada1; + struct reada_control *reada2; + struct btrfs_key key_start; + struct btrfs_key key_end; u64 increment = map->stripe_len; u64 offset; @@ -758,102 +1147,88 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, if (map->type & BTRFS_BLOCK_GROUP_RAID0) { offset = map->stripe_len * num; increment = map->stripe_len * map->num_stripes; - mirror_num = 0; + mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { int factor = map->num_stripes / map->sub_stripes; offset = map->stripe_len * (num / map->sub_stripes); increment = map->stripe_len * factor; - mirror_num = num % map->sub_stripes; + mirror_num = num % map->sub_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { increment = map->stripe_len; - mirror_num = num % map->num_stripes; + mirror_num = num % map->num_stripes + 1; } 
else if (map->type & BTRFS_BLOCK_GROUP_DUP) { increment = map->stripe_len; - mirror_num = num % map->num_stripes; + mirror_num = num % map->num_stripes + 1; } else { increment = map->stripe_len; - mirror_num = 0; + mirror_num = 1; } path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 2; path->search_commit_root = 1; path->skip_locking = 1; /* - * find all extents for each stripe and just read them to get - * them into the page cache - * FIXME: we can do better. build a more intelligent prefetching + * trigger the readahead for extent tree csum tree and wait for + * completion. During readahead, the scrub is officially paused + * to not hold off transaction commits */ logical = base + offset; - physical = map->stripes[num].physical; - ret = 0; - for (i = 0; i < nstripes; ++i) { - key.objectid = logical; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = (u64)0; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out_noplug; - /* - * we might miss half an extent here, but that doesn't matter, - * as it's only the prefetch - */ - while (1) { - l = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(l)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto out_noplug; - - break; - } - btrfs_item_key_to_cpu(l, &key, slot); + wait_event(sdev->list_wait, + atomic_read(&sdev->in_flight) == 0); + atomic_inc(&fs_info->scrubs_paused); + wake_up(&fs_info->scrub_pause_wait); - if (key.objectid >= logical + map->stripe_len) - break; + /* FIXME it might be better to start readahead at commit root */ + key_start.objectid = logical; + key_start.type = BTRFS_EXTENT_ITEM_KEY; + key_start.offset = (u64)0; + key_end.objectid = base + offset + nstripes * increment; + key_end.type = BTRFS_EXTENT_ITEM_KEY; + key_end.offset = (u64)0; + reada1 = btrfs_reada_add(root, &key_start, &key_end); + + key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key_start.type = BTRFS_EXTENT_CSUM_KEY; + key_start.offset = logical; + key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key_end.type = BTRFS_EXTENT_CSUM_KEY; + key_end.offset = base + offset + nstripes * increment; + reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); + + if (!IS_ERR(reada1)) + btrfs_reada_wait(reada1); + if (!IS_ERR(reada2)) + btrfs_reada_wait(reada2); - path->slots[0]++; - } - btrfs_release_path(path); - logical += increment; - physical += map->stripe_len; - cond_resched(); + mutex_lock(&fs_info->scrub_lock); + while (atomic_read(&fs_info->scrub_pause_req)) { + mutex_unlock(&fs_info->scrub_lock); + wait_event(fs_info->scrub_pause_wait, + atomic_read(&fs_info->scrub_pause_req) == 0); + mutex_lock(&fs_info->scrub_lock); } + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + wake_up(&fs_info->scrub_pause_wait); /* * collect all data csums for the stripe to avoid seeking during * the scrub. 
This might currently (crc32) end up to be about 1MB */ - start_stripe = 0; blk_start_plug(&plug); -again: - logical = base + offset + start_stripe * increment; - for (i = start_stripe; i < nstripes; ++i) { - ret = btrfs_lookup_csums_range(csum_root, logical, - logical + map->stripe_len - 1, - &sdev->csum_list, 1); - if (ret) - goto out; - logical += increment; - cond_resched(); - } /* * now find all extents for each stripe and scrub them */ - logical = base + offset + start_stripe * increment; - physical = map->stripes[num].physical + start_stripe * map->stripe_len; + logical = base + offset; + physical = map->stripes[num].physical; ret = 0; - for (i = start_stripe; i < nstripes; ++i) { + for (i = 0; i < nstripes; ++i) { /* * canceled? */ @@ -882,11 +1257,14 @@ again: atomic_dec(&fs_info->scrubs_paused); mutex_unlock(&fs_info->scrub_lock); wake_up(&fs_info->scrub_pause_wait); - scrub_free_csums(sdev); - start_stripe = i; - goto again; } + ret = btrfs_lookup_csums_range(csum_root, logical, + logical + map->stripe_len - 1, + &sdev->csum_list, 1); + if (ret) + goto out; + key.objectid = logical; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = (u64)0; @@ -982,7 +1360,6 @@ next: out: blk_finish_plug(&plug); -out_noplug: btrfs_free_path(path); return ret < 0 ? ret : 0; } @@ -1158,18 +1535,22 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; + int ret = 0; mutex_lock(&fs_info->scrub_lock); if (fs_info->scrub_workers_refcnt == 0) { btrfs_init_workers(&fs_info->scrub_workers, "scrub", fs_info->thread_pool_size, &fs_info->generic_worker); fs_info->scrub_workers.idle_thresh = 4; - btrfs_start_workers(&fs_info->scrub_workers, 1); + ret = btrfs_start_workers(&fs_info->scrub_workers); + if (ret) + goto out; } ++fs_info->scrub_workers_refcnt; +out: mutex_unlock(&fs_info->scrub_lock); - return 0; + return ret; } static noinline_for_stack void scrub_workers_put(struct btrfs_root *root) @@ -1253,10 +1634,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, ret = scrub_enumerate_chunks(sdev, start, end); wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); - atomic_dec(&fs_info->scrubs_running); wake_up(&fs_info->scrub_pause_wait); + wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0); + if (progress) memcpy(progress, &sdev->stat, sizeof(*progress)); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 15634d4648d7..200f63bc6675 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -40,6 +40,8 @@ #include <linux/magic.h> #include <linux/slab.h> #include <linux/cleancache.h> +#include <linux/mnt_namespace.h> +#include <linux/ratelimit.h> #include "compat.h" #include "delayed-inode.h" #include "ctree.h" @@ -58,6 +60,7 @@ #include <trace/events/btrfs.h> static const struct super_operations btrfs_super_ops; +static struct file_system_type btrfs_fs_type; static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, char nbuf[16]) @@ -162,7 +165,7 @@ enum { Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, - Opt_inode_cache, Opt_err, + Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err, }; static match_table_t tokens = { @@ -195,6 +198,8 @@ static match_table_t tokens = { {Opt_subvolrootid, "subvolrootid=%d"}, {Opt_defrag, "autodefrag"}, {Opt_inode_cache, "inode_cache"}, + 
{Opt_no_space_cache, "nospace_cache"}, + {Opt_recovery, "recovery"}, {Opt_err, NULL}, }; @@ -206,14 +211,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) { struct btrfs_fs_info *info = root->fs_info; substring_t args[MAX_OPT_ARGS]; - char *p, *num, *orig; + char *p, *num, *orig = NULL; + u64 cache_gen; int intarg; int ret = 0; char *compress_type; bool compress_force = false; + cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); + if (cache_gen) + btrfs_set_opt(info->mount_opt, SPACE_CACHE); + if (!options) - return 0; + goto out; /* * strsep changes the string, duplicate it because parse_options @@ -360,9 +370,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, DISCARD); break; case Opt_space_cache: - printk(KERN_INFO "btrfs: enabling disk space caching\n"); btrfs_set_opt(info->mount_opt, SPACE_CACHE); break; + case Opt_no_space_cache: + printk(KERN_INFO "btrfs: disabling disk space caching\n"); + btrfs_clear_opt(info->mount_opt, SPACE_CACHE); + break; case Opt_inode_cache: printk(KERN_INFO "btrfs: enabling inode map caching\n"); btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); @@ -381,6 +394,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: enabling auto defrag"); btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); break; + case Opt_recovery: + printk(KERN_INFO "btrfs: enabling auto recovery"); + btrfs_set_opt(info->mount_opt, RECOVERY); + break; case Opt_err: printk(KERN_INFO "btrfs: unrecognized mount option " "'%s'\n", p); @@ -391,6 +408,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } } out: + if (!ret && btrfs_test_opt(root, SPACE_CACHE)) + printk(KERN_INFO "btrfs: disk space caching is enabled\n"); kfree(orig); return ret; } @@ -406,12 +425,12 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices) { substring_t args[MAX_OPT_ARGS]; - char *opts, *orig, *p; + char *device_name, *opts, *orig, *p; int error = 0; int intarg; if (!options) - goto out; + return 0; /* * strsep changes the string, duplicate it because parse_options @@ -430,6 +449,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, token = match_token(p, tokens, args); switch (token) { case Opt_subvol: + kfree(*subvol_name); *subvol_name = match_strdup(&args[0]); break; case Opt_subvolid: @@ -457,29 +477,24 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, } break; case Opt_device: - error = btrfs_scan_one_device(match_strdup(&args[0]), + device_name = match_strdup(&args[0]); + if (!device_name) { + error = -ENOMEM; + goto out; + } + error = btrfs_scan_one_device(device_name, flags, holder, fs_devices); + kfree(device_name); if (error) - goto out_free_opts; + goto out; break; default: break; } } - out_free_opts: +out: kfree(orig); - out: - /* - * If no subvolume name is specified we use the default one. Allocate - * a copy of the string "." here so that code later in the - * mount path doesn't care if it's the default volume or another one. 
- */ - if (!*subvol_name) { - *subvol_name = kstrdup(".", GFP_KERNEL); - if (!*subvol_name) - return -ENOMEM; - } return error; } @@ -492,7 +507,6 @@ static struct dentry *get_default_root(struct super_block *sb, struct btrfs_path *path; struct btrfs_key location; struct inode *inode; - struct dentry *dentry; u64 dir_id; int new = 0; @@ -517,7 +531,7 @@ static struct dentry *get_default_root(struct super_block *sb, * will mount by default if we haven't been given a specific subvolume * to mount. */ - dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); + dir_id = btrfs_super_root_dir(root->fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); if (IS_ERR(di)) { btrfs_free_path(path); @@ -566,29 +580,7 @@ setup_root: return dget(sb->s_root); } - if (new) { - const struct qstr name = { .name = "/", .len = 1 }; - - /* - * New inode, we need to make the dentry a sibling of s_root so - * everything gets cleaned up properly on unmount. - */ - dentry = d_alloc(sb->s_root, &name); - if (!dentry) { - iput(inode); - return ERR_PTR(-ENOMEM); - } - d_splice_alias(inode, dentry); - } else { - /* - * We found the inode in cache, just find a dentry for it and - * put the reference to the inode we just got. - */ - dentry = d_find_alias(inode); - iput(inode); - } - - return dentry; + return d_obtain_alias(inode); } static int btrfs_fill_super(struct super_block *sb, @@ -719,6 +711,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",noacl"); if (btrfs_test_opt(root, SPACE_CACHE)) seq_puts(seq, ",space_cache"); + else + seq_puts(seq, ",nospace_cache"); if (btrfs_test_opt(root, CLEAR_CACHE)) seq_puts(seq, ",clear_cache"); if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) @@ -753,6 +747,111 @@ static int btrfs_set_super(struct super_block *s, void *data) return set_anon_super(s, data); } +/* + * subvolumes are identified by ino 256 + */ +static inline int is_subvolume_inode(struct inode *inode) +{ + if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) + return 1; + return 0; +} + +/* + * This will strip out the subvol=%s argument for an argument string and add + * subvolid=0 to make sure we get the actual tree root for path walking to the + * subvol we want. + */ +static char *setup_root_args(char *args) +{ + unsigned copied = 0; + unsigned len = strlen(args) + 2; + char *pos; + char *ret; + + /* + * We need the same args as before, but minus + * + * subvol=a + * + * and add + * + * subvolid=0 + * + * which is a difference of 2 characters, so we allocate strlen(args) + + * 2 characters. + */ + ret = kzalloc(len * sizeof(char), GFP_NOFS); + if (!ret) + return NULL; + pos = strstr(args, "subvol="); + + /* This shouldn't happen, but just in case.. */ + if (!pos) { + kfree(ret); + return NULL; + } + + /* + * The subvol=<> arg is not at the front of the string, copy everybody + * up to that into ret. + */ + if (pos != args) { + *pos = '\0'; + strcpy(ret, args); + copied += strlen(args); + pos++; + } + + strncpy(ret + copied, "subvolid=0", len - copied); + + /* Length of subvolid=0 */ + copied += 10; + + /* + * If there is no , after the subvol= option then we know there's no + * other options and we can just return. 
+ */ + pos = strchr(pos, ','); + if (!pos) + return ret; + + /* Copy the rest of the arguments into our buffer */ + strncpy(ret + copied, pos, len - copied); + copied += strlen(pos); + + return ret; +} + +static struct dentry *mount_subvol(const char *subvol_name, int flags, + const char *device_name, char *data) +{ + struct dentry *root; + struct vfsmount *mnt; + char *newargs; + + newargs = setup_root_args(data); + if (!newargs) + return ERR_PTR(-ENOMEM); + mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, + newargs); + kfree(newargs); + if (IS_ERR(mnt)) + return ERR_CAST(mnt); + + root = mount_subtree(mnt, subvol_name); + + if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) { + struct super_block *s = root->d_sb; + dput(root); + root = ERR_PTR(-EINVAL); + deactivate_locked_super(s); + printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n", + subvol_name); + } + + return root; +} /* * Find a superblock for the given device / mount point. @@ -767,7 +866,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, struct super_block *s; struct dentry *root; struct btrfs_fs_devices *fs_devices = NULL; - struct btrfs_root *tree_root = NULL; struct btrfs_fs_info *fs_info = NULL; fmode_t mode = FMODE_READ; char *subvol_name = NULL; @@ -781,21 +879,20 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error = btrfs_parse_early_options(data, mode, fs_type, &subvol_name, &subvol_objectid, &subvol_rootid, &fs_devices); - if (error) + if (error) { + kfree(subvol_name); return ERR_PTR(error); + } - error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); - if (error) - goto error_free_subvol_name; + if (subvol_name) { + root = mount_subvol(subvol_name, flags, device_name, data); + kfree(subvol_name); + return root; + } - error = btrfs_open_devices(fs_devices, mode, fs_type); + error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); if (error) - goto error_free_subvol_name; - - if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { - error = -EACCES; - goto error_close_devices; - } + return ERR_PTR(error); /* * Setup a dummy root and fs_info for test/set super. This is because @@ -804,19 +901,40 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, * then open_ctree will properly initialize everything later. 
*/ fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); - tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - if (!fs_info || !tree_root) { + if (!fs_info) + return ERR_PTR(-ENOMEM); + + fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + if (!fs_info->tree_root) { error = -ENOMEM; - goto error_close_devices; + goto error_fs_info; } - fs_info->tree_root = tree_root; + fs_info->tree_root->fs_info = fs_info; fs_info->fs_devices = fs_devices; - tree_root->fs_info = fs_info; + + fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); + fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); + if (!fs_info->super_copy || !fs_info->super_for_commit) { + error = -ENOMEM; + goto error_fs_info; + } + + error = btrfs_open_devices(fs_devices, mode, fs_type); + if (error) + goto error_fs_info; + + if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { + error = -EACCES; + goto error_close_devices; + } bdev = fs_devices->latest_bdev; - s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); - if (IS_ERR(s)) - goto error_s; + s = sget(fs_type, btrfs_test_super, btrfs_set_super, + fs_info->tree_root); + if (IS_ERR(s)) { + error = PTR_ERR(s); + goto error_close_devices; + } if (s->s_root) { if ((flags ^ s->s_flags) & MS_RDONLY) { @@ -826,75 +944,35 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } btrfs_close_devices(fs_devices); - kfree(fs_info); - kfree(tree_root); + free_fs_info(fs_info); } else { char b[BDEVNAME_SIZE]; s->s_flags = flags | MS_NOSEC; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + btrfs_sb(s)->fs_info->bdev_holder = fs_type; error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); - goto error_free_subvol_name; + return ERR_PTR(error); } - btrfs_sb(s)->fs_info->bdev_holder = fs_type; s->s_flags |= MS_ACTIVE; } - /* if they gave us a subvolume name bind mount into that */ - if (strcmp(subvol_name, ".")) { - struct dentry *new_root; - - root = get_default_root(s, subvol_rootid); - if (IS_ERR(root)) { - error = PTR_ERR(root); - deactivate_locked_super(s); - goto error_free_subvol_name; - } - - mutex_lock(&root->d_inode->i_mutex); - new_root = lookup_one_len(subvol_name, root, - strlen(subvol_name)); - mutex_unlock(&root->d_inode->i_mutex); - - if (IS_ERR(new_root)) { - dput(root); - deactivate_locked_super(s); - error = PTR_ERR(new_root); - goto error_free_subvol_name; - } - if (!new_root->d_inode) { - dput(root); - dput(new_root); - deactivate_locked_super(s); - error = -ENXIO; - goto error_free_subvol_name; - } - dput(root); - root = new_root; - } else { - root = get_default_root(s, subvol_objectid); - if (IS_ERR(root)) { - error = PTR_ERR(root); - deactivate_locked_super(s); - goto error_free_subvol_name; - } + root = get_default_root(s, subvol_objectid); + if (IS_ERR(root)) { + deactivate_locked_super(s); + return root; } - kfree(subvol_name); return root; -error_s: - error = PTR_ERR(s); error_close_devices: btrfs_close_devices(fs_devices); - kfree(fs_info); - kfree(tree_root); -error_free_subvol_name: - kfree(subvol_name); +error_fs_info: + free_fs_info(fs_info); return ERR_PTR(error); } @@ -919,7 +997,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (root->fs_info->fs_devices->rw_devices == 0) return -EACCES; - if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) + if (btrfs_super_log_root(root->fs_info->super_copy) != 0) return -EINVAL; ret = btrfs_cleanup_fs_roots(root->fs_info); 
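The setup_root_args() helper added above rewrites the mount data so that the temporary vfs_kern_mount() in mount_subvol() names the whole filesystem ("subvolid=0") and mount_subtree() can then walk to the requested subvolume. Below is a minimal userspace sketch of that rewriting, assuming a made-up option string and a hypothetical rewrite_subvol_arg() helper; the in-kernel buffer handling in the patch itself is authoritative, this only illustrates the intended transformation.

/* Illustrative sketch only; not part of the patch above. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *rewrite_subvol_arg(const char *args)
{
	const char *pos = strstr(args, "subvol=");
	const char *rest;
	char *out;
	size_t prefix;

	if (!pos)
		return NULL;

	prefix = pos - args;		/* options before "subvol=" */
	rest = strchr(pos, ',');	/* options after it, if any */

	/* generous upper bound: worst case grows by strlen("subvolid=0") */
	out = calloc(strlen(args) + sizeof("subvolid=0"), 1);
	if (!out)
		return NULL;

	memcpy(out, args, prefix);
	strcat(out, "subvolid=0");
	if (rest)
		strcat(out, rest);
	return out;
}

int main(void)
{
	/* made-up mount data string */
	char *s = rewrite_subvol_arg("noatime,subvol=snap1,compress=lzo");

	if (s) {
		printf("%s\n", s);	/* prints: noatime,subvolid=0,compress=lzo */
		free(s);
	}
	return 0;
}

Either way, "subvol=snap1" is replaced by "subvolid=0" while the remaining options are preserved, which is what the subsequent mount_subtree() call in mount_subvol() relies on.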
@@ -976,11 +1054,11 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) u64 avail_space; u64 used_space; u64 min_stripe_size; - int min_stripes = 1; + int min_stripes = 1, num_stripes = 1; int i = 0, nr_devices; int ret; - nr_devices = fs_info->fs_devices->rw_devices; + nr_devices = fs_info->fs_devices->open_devices; BUG_ON(!nr_devices); devices_info = kmalloc(sizeof(*devices_info) * nr_devices, @@ -990,20 +1068,24 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) /* calc min stripe number for data space alloction */ type = btrfs_get_alloc_profile(root, 1); - if (type & BTRFS_BLOCK_GROUP_RAID0) + if (type & BTRFS_BLOCK_GROUP_RAID0) { min_stripes = 2; - else if (type & BTRFS_BLOCK_GROUP_RAID1) + num_stripes = nr_devices; + } else if (type & BTRFS_BLOCK_GROUP_RAID1) { min_stripes = 2; - else if (type & BTRFS_BLOCK_GROUP_RAID10) + num_stripes = 2; + } else if (type & BTRFS_BLOCK_GROUP_RAID10) { min_stripes = 4; + num_stripes = 4; + } if (type & BTRFS_BLOCK_GROUP_DUP) min_stripe_size = 2 * BTRFS_STRIPE_LEN; else min_stripe_size = BTRFS_STRIPE_LEN; - list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { - if (!device->in_fs_metadata) + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (!device->in_fs_metadata || !device->bdev) continue; avail_space = device->total_bytes - device->bytes_used; @@ -1064,13 +1146,16 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) i = nr_devices - 1; avail_space = 0; while (nr_devices >= min_stripes) { + if (num_stripes > nr_devices) + num_stripes = nr_devices; + if (devices_info[i].max_avail >= min_stripe_size) { int j; u64 alloc_size; - avail_space += devices_info[i].max_avail * min_stripes; + avail_space += devices_info[i].max_avail * num_stripes; alloc_size = devices_info[i].max_avail; - for (j = i + 1 - min_stripes; j <= i; j++) + for (j = i + 1 - num_stripes; j <= i; j++) devices_info[j].max_avail -= alloc_size; } i--; @@ -1085,7 +1170,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct btrfs_root *root = btrfs_sb(dentry->d_sb); - struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct btrfs_super_block *disk_super = root->fs_info->super_copy; struct list_head *head = &root->fs_info->space_info; struct btrfs_space_info *found; u64 total_used = 0; @@ -1187,6 +1272,16 @@ static int btrfs_unfreeze(struct super_block *sb) return 0; } +static void btrfs_fs_dirty_inode(struct inode *inode, int flags) +{ + int ret; + + ret = btrfs_dirty_inode(inode); + if (ret) + printk_ratelimited(KERN_ERR "btrfs: fail to dirty inode %Lu " + "error %d\n", btrfs_ino(inode), ret); +} + static const struct super_operations btrfs_super_ops = { .drop_inode = btrfs_drop_inode, .evict_inode = btrfs_evict_inode, @@ -1194,7 +1289,7 @@ static const struct super_operations btrfs_super_ops = { .sync_fs = btrfs_sync_fs, .show_options = btrfs_show_options, .write_inode = btrfs_write_inode, - .dirty_inode = btrfs_dirty_inode, + .dirty_inode = btrfs_fs_dirty_inode, .alloc_inode = btrfs_alloc_inode, .destroy_inode = btrfs_destroy_inode, .statfs = btrfs_statfs, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e24b7964a155..81376d94cd3c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -55,6 +55,7 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail) struct btrfs_transaction 
*cur_trans; spin_lock(&root->fs_info->trans_lock); +loop: if (root->fs_info->trans_no_join) { if (!nofail) { spin_unlock(&root->fs_info->trans_lock); @@ -75,16 +76,18 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail) cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); if (!cur_trans) return -ENOMEM; + spin_lock(&root->fs_info->trans_lock); if (root->fs_info->running_transaction) { + /* + * someone started a transaction after we unlocked. Make sure + * to redo the trans_no_join checks above + */ kmem_cache_free(btrfs_transaction_cachep, cur_trans); cur_trans = root->fs_info->running_transaction; - atomic_inc(&cur_trans->use_count); - atomic_inc(&cur_trans->num_writers); - cur_trans->num_joined++; - spin_unlock(&root->fs_info->trans_lock); - return 0; + goto loop; } + atomic_set(&cur_trans->num_writers, 1); cur_trans->num_joined = 0; init_waitqueue_head(&cur_trans->writer_wait); @@ -275,7 +278,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, */ if (num_items > 0 && root != root->fs_info->chunk_root) { num_bytes = btrfs_calc_trans_metadata_size(root, num_items); - ret = btrfs_block_rsv_add(NULL, root, + ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv, num_bytes); if (ret) @@ -418,8 +421,8 @@ static int should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int ret; - ret = btrfs_block_rsv_check(trans, root, - &root->fs_info->global_block_rsv, 0, 5); + + ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5); return ret ? 1 : 0; } @@ -427,17 +430,26 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_transaction *cur_trans = trans->transaction; + struct btrfs_block_rsv *rsv = trans->block_rsv; int updates; smp_mb(); if (cur_trans->blocked || cur_trans->delayed_refs.flushing) return 1; + /* + * We need to do this in case we're deleting csums so the global block + * rsv get's used instead of the csum block rsv. 
+ */ + trans->block_rsv = NULL; + updates = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; if (updates) btrfs_run_delayed_refs(trans, root, updates); + trans->block_rsv = rsv; + return should_end_transaction(trans, root); } @@ -453,6 +465,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, return 0; } + btrfs_trans_release_metadata(trans, root); + trans->block_rsv = NULL; while (count < 4) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; @@ -473,8 +487,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, count++; } - btrfs_trans_release_metadata(trans, root); - if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && should_end_transaction(trans, root)) { trans->transaction->blocked = 1; @@ -562,50 +574,21 @@ int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans, int btrfs_write_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark) { - int ret; int err = 0; int werr = 0; - struct page *page; - struct inode *btree_inode = root->fs_info->btree_inode; + struct address_space *mapping = root->fs_info->btree_inode->i_mapping; u64 start = 0; u64 end; - unsigned long index; - - while (1) { - ret = find_first_extent_bit(dirty_pages, start, &start, &end, - mark); - if (ret) - break; - while (start <= end) { - cond_resched(); - - index = start >> PAGE_CACHE_SHIFT; - start = (u64)(index + 1) << PAGE_CACHE_SHIFT; - page = find_get_page(btree_inode->i_mapping, index); - if (!page) - continue; - - btree_lock_page_hook(page); - if (!page->mapping) { - unlock_page(page); - page_cache_release(page); - continue; - } - if (PageWriteback(page)) { - if (PageDirty(page)) - wait_on_page_writeback(page); - else { - unlock_page(page); - page_cache_release(page); - continue; - } - } - err = write_one_page(page, 0); - if (err) - werr = err; - page_cache_release(page); - } + while (!find_first_extent_bit(dirty_pages, start, &start, &end, + mark)) { + convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark, + GFP_NOFS); + err = filemap_fdatawrite_range(mapping, start, end); + if (err) + werr = err; + cond_resched(); + start = end + 1; } if (err) werr = err; @@ -621,39 +604,20 @@ int btrfs_write_marked_extents(struct btrfs_root *root, int btrfs_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark) { - int ret; int err = 0; int werr = 0; - struct page *page; - struct inode *btree_inode = root->fs_info->btree_inode; + struct address_space *mapping = root->fs_info->btree_inode->i_mapping; u64 start = 0; u64 end; - unsigned long index; - while (1) { - ret = find_first_extent_bit(dirty_pages, start, &start, &end, - mark); - if (ret) - break; - - clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); - while (start <= end) { - index = start >> PAGE_CACHE_SHIFT; - start = (u64)(index + 1) << PAGE_CACHE_SHIFT; - page = find_get_page(btree_inode->i_mapping, index); - if (!page) - continue; - if (PageDirty(page)) { - btree_lock_page_hook(page); - wait_on_page_writeback(page); - err = write_one_page(page, 0); - if (err) - werr = err; - } - wait_on_page_writeback(page); - page_cache_release(page); - cond_resched(); - } + while (!find_first_extent_bit(dirty_pages, start, &start, &end, + EXTENT_NEED_WAIT)) { + clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS); + err = filemap_fdatawait_range(mapping, start, end); + if (err) + werr = err; + cond_resched(); + start = end + 1; } if (err) werr = err; @@ -673,7 +637,12 @@ int 
btrfs_write_and_wait_marked_extents(struct btrfs_root *root, ret = btrfs_write_marked_extents(root, dirty_pages, mark); ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); - return ret || ret2; + + if (ret) + return ret; + if (ret2) + return ret2; + return 0; } int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, @@ -816,6 +785,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, btrfs_save_ino_cache(root, trans); + /* see comments in should_cow_block() */ + root->force_cow = 0; + smp_wmb(); + if (root->commit_root != root->node) { mutex_lock(&root->fs_commit_mutex); switch_commit_root(root); @@ -911,11 +884,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); - btrfs_orphan_pre_snapshot(trans, pending, &to_reserve); if (to_reserve > 0) { - ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, - to_reserve); + ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv, + to_reserve); if (ret) { pending->error = ret; goto fail; @@ -979,6 +951,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_tree_unlock(old); free_extent_buffer(old); + /* see comments in should_cow_block() */ + root->force_cow = 1; + smp_wmb(); + btrfs_set_root_node(new_root_item, tmp); /* record when the snapshot was created in key.offset */ key.offset = trans->transid; @@ -1002,7 +978,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, BUG_ON(IS_ERR(pending->snap)); btrfs_reloc_post_snapshot(trans, pending); - btrfs_orphan_post_snapshot(trans, pending); fail: kfree(new_root_item); trans->block_rsv = rsv; @@ -1032,7 +1007,7 @@ static void update_super_roots(struct btrfs_root *root) struct btrfs_root_item *root_item; struct btrfs_super_block *super; - super = &root->fs_info->super_copy; + super = root->fs_info->super_copy; root_item = &root->fs_info->chunk_root->root_item; super->chunk_root = root_item->bytenr; @@ -1043,7 +1018,7 @@ static void update_super_roots(struct btrfs_root *root) super->root = root_item->bytenr; super->generation = root_item->generation; super->root_level = root_item->level; - if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE)) + if (btrfs_test_opt(root, SPACE_CACHE)) super->cache_generation = root_item->generation; } @@ -1168,14 +1143,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_run_ordered_operations(root, 0); + btrfs_trans_release_metadata(trans, root); + trans->block_rsv = NULL; + /* make a pass through all the delayed refs we have so far * any runnings procs may add more while we are here */ ret = btrfs_run_delayed_refs(trans, root, 0); BUG_ON(ret); - btrfs_trans_release_metadata(trans, root); - cur_trans = trans->transaction; /* * set the flushing flag so procs in this transaction have to @@ -1341,12 +1317,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, update_super_roots(root); if (!root->fs_info->log_root_recovering) { - btrfs_set_super_log_root(&root->fs_info->super_copy, 0); - btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); + btrfs_set_super_log_root(root->fs_info->super_copy, 0); + btrfs_set_super_log_root_level(root->fs_info->super_copy, 0); } - memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, - sizeof(root->fs_info->super_copy)); + memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy, + sizeof(*root->fs_info->super_copy)); trans->transaction->blocked = 0; 
spin_lock(&root->fs_info->trans_lock); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 0618aa39740b..3568374d419d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -276,8 +276,9 @@ static int process_one_buffer(struct btrfs_root *log, struct walk_control *wc, u64 gen) { if (wc->pin) - btrfs_pin_extent(log->fs_info->extent_root, - eb->start, eb->len, 0); + btrfs_pin_extent_for_log_replay(wc->trans, + log->fs_info->extent_root, + eb->start, eb->len); if (btrfs_buffer_uptodate(eb, gen)) { if (wc->write) @@ -1760,7 +1761,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_reserved_extent(root, + ret = btrfs_free_and_pin_reserved_extent(root, bytenr, blocksize); BUG_ON(ret); } @@ -1828,7 +1829,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, btrfs_tree_unlock(next); WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_reserved_extent(root, + ret = btrfs_free_and_pin_reserved_extent(root, path->nodes[*level]->start, path->nodes[*level]->len); BUG_ON(ret); @@ -1897,7 +1898,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, WARN_ON(log->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_reserved_extent(log, next->start, + ret = btrfs_free_and_pin_reserved_extent(log, next->start, next->len); BUG_ON(ret); } @@ -2013,10 +2014,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, /* wait for previous tree log sync to complete */ if (atomic_read(&root->log_commit[(index1 + 1) % 2])) wait_log_commit(trans, root, root->log_transid - 1); - while (1) { unsigned long batch = root->log_batch; - if (root->log_multiple_pids) { + /* when we're on an ssd, just kick the log commit out */ + if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) { mutex_unlock(&root->log_mutex); schedule_timeout_uninterruptible(1); mutex_lock(&root->log_mutex); @@ -2117,9 +2118,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, BUG_ON(ret); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); - btrfs_set_super_log_root(&root->fs_info->super_for_commit, + btrfs_set_super_log_root(root->fs_info->super_for_commit, log_root_tree->node->start); - btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, + btrfs_set_super_log_root_level(root->fs_info->super_for_commit, btrfs_header_level(log_root_tree->node)); log_root_tree->log_batch = 0; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f2a4cc79da61..f4b839fd3c9d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -295,6 +295,12 @@ loop_lock: btrfs_requeue_work(&device->work); goto done; } + /* unplug every 64 requests just for good measure */ + if (batch_run % 64 == 0) { + blk_finish_plug(&plug); + blk_start_plug(&plug); + sync_pending = 0; + } } cond_resched(); @@ -366,6 +372,14 @@ static noinline int device_list_add(const char *path, } INIT_LIST_HEAD(&device->dev_alloc_list); + /* init readahead state */ + spin_lock_init(&device->reada_lock); + device->reada_curr_zone = NULL; + atomic_set(&device->reada_in_flight, 0); + device->reada_next = 0; + INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT); + INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT); + mutex_lock(&fs_devices->device_list_mutex); list_add_rcu(&device->dev_list, &fs_devices->devices); mutex_unlock(&fs_devices->device_list_mutex); @@ -597,10 +611,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, set_blocksize(bdev, 4096); bh = 
btrfs_read_dev_super(bdev); - if (!bh) { - ret = -EINVAL; + if (!bh) goto error_close; - } disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); @@ -655,7 +667,7 @@ error: continue; } if (fs_devices->open_devices == 0) { - ret = -EIO; + ret = -EINVAL; goto out; } fs_devices->seeding = seeding; @@ -993,7 +1005,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, key.objectid = device->devid; key.offset = start; key.type = BTRFS_DEV_EXTENT_KEY; - +again: ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { ret = btrfs_previous_item(root, path, key.objectid, @@ -1006,6 +1018,9 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_dev_extent); BUG_ON(found_key.offset > start || found_key.offset + btrfs_dev_extent_length(leaf, extent) < start); + key = found_key; + btrfs_release_path(path); + goto again; } else if (ret == 0) { leaf = path->nodes[0]; extent = btrfs_item_ptr(leaf, path->slots[0], @@ -1013,8 +1028,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, } BUG_ON(ret); - if (device->bytes_used > 0) - device->bytes_used -= btrfs_dev_extent_length(leaf, extent); + if (device->bytes_used > 0) { + u64 len = btrfs_dev_extent_length(leaf, extent); + device->bytes_used -= len; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += len; + spin_unlock(&root->fs_info->free_chunk_lock); + } ret = btrfs_del_item(trans, root, path); out: @@ -1356,6 +1376,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (ret) goto error_undo; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space = device->total_bytes - + device->bytes_used; + spin_unlock(&root->fs_info->free_chunk_lock); + device->in_fs_metadata = 0; btrfs_scrub_cancel_dev(root, device); @@ -1387,8 +1412,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) call_rcu(&device->rcu, free_device); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); - num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; - btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); + num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; + btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices); if (cur_devices->open_devices == 0) { struct btrfs_fs_devices *fs_devices; @@ -1450,7 +1475,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; struct btrfs_fs_devices *old_devices; struct btrfs_fs_devices *seed_devices; - struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct btrfs_super_block *disk_super = root->fs_info->super_copy; struct btrfs_device *device; u64 super_flags; @@ -1592,7 +1617,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) return -EINVAL; - bdev = blkdev_get_by_path(device_path, FMODE_EXCL, + bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, root->fs_info->bdev_holder); if (IS_ERR(bdev)) return PTR_ERR(bdev); @@ -1691,15 +1716,19 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) root->fs_info->fs_devices->num_can_discard++; root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += device->total_bytes; + 
spin_unlock(&root->fs_info->free_chunk_lock); + if (!blk_queue_nonrot(bdev_get_queue(bdev))) root->fs_info->fs_devices->rotating = 1; - total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); - btrfs_set_super_total_bytes(&root->fs_info->super_copy, + total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); + btrfs_set_super_total_bytes(root->fs_info->super_copy, total_bytes + device->total_bytes); - total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); - btrfs_set_super_num_devices(&root->fs_info->super_copy, + total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); + btrfs_set_super_num_devices(root->fs_info->super_copy, total_bytes + 1); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); @@ -1790,7 +1819,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size) { struct btrfs_super_block *super_copy = - &device->dev_root->fs_info->super_copy; + device->dev_root->fs_info->super_copy; u64 old_total = btrfs_super_total_bytes(super_copy); u64 diff = new_size - device->total_bytes; @@ -1849,7 +1878,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 chunk_offset) { - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; struct btrfs_disk_key *disk_key; struct btrfs_chunk *chunk; u8 *ptr; @@ -2175,7 +2204,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) bool retried = false; struct extent_buffer *l; struct btrfs_key key; - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; u64 old_total = btrfs_super_total_bytes(super_copy); u64 old_size = device->total_bytes; u64 diff = device->total_bytes - new_size; @@ -2192,8 +2221,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) lock_chunks(root); device->total_bytes = new_size; - if (device->writeable) + if (device->writeable) { device->fs_devices->total_rw_bytes -= diff; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space -= diff; + spin_unlock(&root->fs_info->free_chunk_lock); + } unlock_chunks(root); again: @@ -2257,6 +2290,9 @@ again: device->total_bytes = old_size; if (device->writeable) device->fs_devices->total_rw_bytes += diff; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += diff; + spin_unlock(&root->fs_info->free_chunk_lock); unlock_chunks(root); goto done; } @@ -2292,7 +2328,7 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, struct btrfs_key *key, struct btrfs_chunk *chunk, int item_size) { - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; struct btrfs_disk_key disk_key; u32 array_size; u8 *ptr; @@ -2615,6 +2651,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, index++; } + spin_lock(&extent_root->fs_info->free_chunk_lock); + extent_root->fs_info->free_chunk_space -= (stripe_size * + map->num_stripes); + spin_unlock(&extent_root->fs_info->free_chunk_lock); + index = 0; stripe = &chunk->stripe; while (index < map->num_stripes) { @@ -2848,7 +2889,7 @@ static int find_live_mirror(struct map_lookup *map, int first, int num, static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, - struct btrfs_multi_bio 
**multi_ret, + struct btrfs_bio **bbio_ret, int mirror_num) { struct extent_map *em; @@ -2866,18 +2907,18 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, int i; int num_stripes; int max_errors = 0; - struct btrfs_multi_bio *multi = NULL; + struct btrfs_bio *bbio = NULL; - if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) + if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) stripes_allocated = 1; again: - if (multi_ret) { - multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), + if (bbio_ret) { + bbio = kzalloc(btrfs_bio_size(stripes_allocated), GFP_NOFS); - if (!multi) + if (!bbio) return -ENOMEM; - atomic_set(&multi->error, 0); + atomic_set(&bbio->error, 0); } read_lock(&em_tree->lock); @@ -2898,7 +2939,7 @@ again: if (mirror_num > map->num_stripes) mirror_num = 0; - /* if our multi bio struct is too small, back off and try again */ + /* if our btrfs_bio struct is too small, back off and try again */ if (rw & REQ_WRITE) { if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP)) { @@ -2917,11 +2958,11 @@ again: stripes_required = map->num_stripes; } } - if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && + if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && stripes_allocated < stripes_required) { stripes_allocated = map->num_stripes; free_extent_map(em); - kfree(multi); + kfree(bbio); goto again; } stripe_nr = offset; @@ -2950,7 +2991,7 @@ again: *length = em->len - offset; } - if (!multi_ret) + if (!bbio_ret) goto out; num_stripes = 1; @@ -2975,13 +3016,17 @@ again: stripe_index = find_live_mirror(map, 0, map->num_stripes, current->pid % map->num_stripes); + mirror_num = stripe_index + 1; } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (rw & (REQ_WRITE | REQ_DISCARD)) + if (rw & (REQ_WRITE | REQ_DISCARD)) { num_stripes = map->num_stripes; - else if (mirror_num) + } else if (mirror_num) { stripe_index = mirror_num - 1; + } else { + mirror_num = 1; + } } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { int factor = map->num_stripes / map->sub_stripes; @@ -3001,6 +3046,7 @@ again: stripe_index = find_live_mirror(map, stripe_index, map->sub_stripes, stripe_index + current->pid % map->sub_stripes); + mirror_num = stripe_index + 1; } } else { /* @@ -3009,15 +3055,16 @@ again: * stripe_index is the number of our device in the stripe array */ stripe_index = do_div(stripe_nr, map->num_stripes); + mirror_num = stripe_index + 1; } BUG_ON(stripe_index >= map->num_stripes); if (rw & REQ_DISCARD) { for (i = 0; i < num_stripes; i++) { - multi->stripes[i].physical = + bbio->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; - multi->stripes[i].dev = map->stripes[stripe_index].dev; + bbio->stripes[i].dev = map->stripes[stripe_index].dev; if (map->type & BTRFS_BLOCK_GROUP_RAID0) { u64 stripes; @@ -3038,16 +3085,16 @@ again: } stripes = stripe_nr_end - 1 - j; do_div(stripes, map->num_stripes); - multi->stripes[i].length = map->stripe_len * + bbio->stripes[i].length = map->stripe_len * (stripes - stripe_nr + 1); if (i == 0) { - multi->stripes[i].length -= + bbio->stripes[i].length -= stripe_offset; stripe_offset = 0; } if (stripe_index == last_stripe) - multi->stripes[i].length -= + bbio->stripes[i].length -= stripe_end_offset; } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { u64 stripes; @@ -3072,11 +3119,11 @@ again: } stripes = stripe_nr_end - 1 - j; do_div(stripes, factor); - multi->stripes[i].length = map->stripe_len * + bbio->stripes[i].length = map->stripe_len * (stripes - stripe_nr + 1); if (i 
< map->sub_stripes) { - multi->stripes[i].length -= + bbio->stripes[i].length -= stripe_offset; if (i == map->sub_stripes - 1) stripe_offset = 0; @@ -3084,11 +3131,11 @@ again: if (stripe_index >= last_stripe && stripe_index <= (last_stripe + map->sub_stripes - 1)) { - multi->stripes[i].length -= + bbio->stripes[i].length -= stripe_end_offset; } } else - multi->stripes[i].length = *length; + bbio->stripes[i].length = *length; stripe_index++; if (stripe_index == map->num_stripes) { @@ -3099,19 +3146,20 @@ again: } } else { for (i = 0; i < num_stripes; i++) { - multi->stripes[i].physical = + bbio->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; - multi->stripes[i].dev = + bbio->stripes[i].dev = map->stripes[stripe_index].dev; stripe_index++; } } - if (multi_ret) { - *multi_ret = multi; - multi->num_stripes = num_stripes; - multi->max_errors = max_errors; + if (bbio_ret) { + *bbio_ret = bbio; + bbio->num_stripes = num_stripes; + bbio->max_errors = max_errors; + bbio->mirror_num = mirror_num; } out: free_extent_map(em); @@ -3120,9 +3168,9 @@ out: int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, - struct btrfs_multi_bio **multi_ret, int mirror_num) + struct btrfs_bio **bbio_ret, int mirror_num) { - return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, + return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret, mirror_num); } @@ -3191,30 +3239,32 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, return 0; } -static void end_bio_multi_stripe(struct bio *bio, int err) +static void btrfs_end_bio(struct bio *bio, int err) { - struct btrfs_multi_bio *multi = bio->bi_private; + struct btrfs_bio *bbio = bio->bi_private; int is_orig_bio = 0; if (err) - atomic_inc(&multi->error); + atomic_inc(&bbio->error); - if (bio == multi->orig_bio) + if (bio == bbio->orig_bio) is_orig_bio = 1; - if (atomic_dec_and_test(&multi->stripes_pending)) { + if (atomic_dec_and_test(&bbio->stripes_pending)) { if (!is_orig_bio) { bio_put(bio); - bio = multi->orig_bio; + bio = bbio->orig_bio; } - bio->bi_private = multi->private; - bio->bi_end_io = multi->end_io; + bio->bi_private = bbio->private; + bio->bi_end_io = bbio->end_io; + bio->bi_bdev = (struct block_device *) + (unsigned long)bbio->mirror_num; /* only send an error to the higher layers if it is * beyond the tolerance of the multi-bio */ - if (atomic_read(&multi->error) > multi->max_errors) { + if (atomic_read(&bbio->error) > bbio->max_errors) { err = -EIO; - } else if (err) { + } else { /* * this bio is actually up to date, we didn't * go over the max number of errors @@ -3222,7 +3272,7 @@ static void end_bio_multi_stripe(struct bio *bio, int err) set_bit(BIO_UPTODATE, &bio->bi_flags); err = 0; } - kfree(multi); + kfree(bbio); bio_endio(bio, err); } else if (!is_orig_bio) { @@ -3302,20 +3352,20 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, u64 logical = (u64)bio->bi_sector << 9; u64 length = 0; u64 map_length; - struct btrfs_multi_bio *multi = NULL; int ret; int dev_nr = 0; int total_devs = 1; + struct btrfs_bio *bbio = NULL; length = bio->bi_size; map_tree = &root->fs_info->mapping_tree; map_length = length; - ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, + ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio, mirror_num); BUG_ON(ret); - total_devs = multi->num_stripes; + total_devs = bbio->num_stripes; if (map_length < length) { printk(KERN_CRIT "mapping failed logical %llu bio len 
%llu " "len %llu\n", (unsigned long long)logical, @@ -3323,25 +3373,28 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, (unsigned long long)map_length); BUG(); } - multi->end_io = first_bio->bi_end_io; - multi->private = first_bio->bi_private; - multi->orig_bio = first_bio; - atomic_set(&multi->stripes_pending, multi->num_stripes); + + bbio->orig_bio = first_bio; + bbio->private = first_bio->bi_private; + bbio->end_io = first_bio->bi_end_io; + atomic_set(&bbio->stripes_pending, bbio->num_stripes); while (dev_nr < total_devs) { - if (total_devs > 1) { - if (dev_nr < total_devs - 1) { - bio = bio_clone(first_bio, GFP_NOFS); - BUG_ON(!bio); - } else { - bio = first_bio; - } - bio->bi_private = multi; - bio->bi_end_io = end_bio_multi_stripe; + if (dev_nr < total_devs - 1) { + bio = bio_clone(first_bio, GFP_NOFS); + BUG_ON(!bio); + } else { + bio = first_bio; } - bio->bi_sector = multi->stripes[dev_nr].physical >> 9; - dev = multi->stripes[dev_nr].dev; + bio->bi_private = bbio; + bio->bi_end_io = btrfs_end_bio; + bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; + dev = bbio->stripes[dev_nr].dev; if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { + pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu " + "(%s id %llu), size=%u\n", rw, + (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, + dev->name, dev->devid, bio->bi_size); bio->bi_bdev = dev->bdev; if (async_submit) schedule_bio(root, dev, rw, bio); @@ -3354,8 +3407,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, } dev_nr++; } - if (total_devs == 1) - kfree(multi); return 0; } @@ -3616,15 +3667,20 @@ static int read_one_dev(struct btrfs_root *root, fill_device_from_item(leaf, dev_item, device); device->dev_root = root->fs_info->dev_root; device->in_fs_metadata = 1; - if (device->writeable) + if (device->writeable) { device->fs_devices->total_rw_bytes += device->total_bytes; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += device->total_bytes - + device->bytes_used; + spin_unlock(&root->fs_info->free_chunk_lock); + } ret = 0; return ret; } int btrfs_read_sys_array(struct btrfs_root *root) { - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; struct extent_buffer *sb; struct btrfs_disk_key *disk_key; struct btrfs_chunk *chunk; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6d866db4e177..78f2d4d4f37f 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -92,6 +92,20 @@ struct btrfs_device { struct btrfs_work work; struct rcu_head rcu; struct work_struct rcu_work; + + /* readahead state */ + spinlock_t reada_lock; + atomic_t reada_in_flight; + u64 reada_next; + struct reada_zone *reada_curr_zone; + struct radix_tree_root reada_zones; + struct radix_tree_root reada_extents; + + /* for sending down flush barriers */ + struct bio *flush_bio; + struct completion flush_wait; + int nobarriers; + }; struct btrfs_fs_devices { @@ -136,7 +150,10 @@ struct btrfs_bio_stripe { u64 length; /* only used for discard mappings */ }; -struct btrfs_multi_bio { +struct btrfs_bio; +typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); + +struct btrfs_bio { atomic_t stripes_pending; bio_end_io_t *end_io; struct bio *orig_bio; @@ -144,6 +161,7 @@ struct btrfs_multi_bio { atomic_t error; int max_errors; int num_stripes; + int mirror_num; struct btrfs_bio_stripe stripes[]; }; @@ -171,7 +189,7 @@ struct map_lookup { int btrfs_account_dev_extents_size(struct btrfs_device 
*device, u64 start, u64 end, u64 *length); -#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ +#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \ (sizeof(struct btrfs_bio_stripe) * (n))) int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, @@ -180,7 +198,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, u64 chunk_offset, u64 start, u64 num_bytes); int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, - struct btrfs_multi_bio **multi_ret, int mirror_num); + struct btrfs_bio **bbio_ret, int mirror_num); int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, u64 chunk_start, u64 physical, u64 devid, u64 **logical, int *naddrs, int *stripe_len); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 426aa464f1af..3848b04e310e 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -127,6 +127,17 @@ static int do_setxattr(struct btrfs_trans_handle *trans, again: ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), name, name_len, value, size); + /* + * If we're setting an xattr to a new value but the new value is say + * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting + * back from split_leaf. This is because it thinks we'll be extending + * the existing item size, but we're asking for enough space to add the + * item itself. So if we get EOVERFLOW just set ret to EEXIST and let + * the rest of the function figure it out. + */ + if (ret == -EOVERFLOW) + ret = -EEXIST; + if (ret == -EEXIST) { if (flags & XATTR_CREATE) goto out; diff --git a/fs/buffer.c b/fs/buffer.c index 70a19745cb61..19d8eb7fdc81 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -288,7 +288,7 @@ static void free_more_memory(void) struct zone *zone; int nid; - wakeup_flusher_threads(1024); + wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM); yield(); for_each_online_node(nid) { diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 2cfb695d1f89..5d9b9acc5fce 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -204,7 +204,7 @@ int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov, } /* first calculate 24 bytes ntlm response and then 16 byte session key */ -int setup_ntlm_response(struct cifs_ses *ses) +int setup_ntlm_response(struct cifs_ses *ses, const struct nls_table *nls_cp) { int rc = 0; unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE; @@ -221,14 +221,14 @@ int setup_ntlm_response(struct cifs_ses *ses) ses->auth_key.len = temp_len; rc = SMBNTencrypt(ses->password, ses->server->cryptkey, - ses->auth_key.response + CIFS_SESS_KEY_SIZE); + ses->auth_key.response + CIFS_SESS_KEY_SIZE, nls_cp); if (rc) { cFYI(1, "%s Can't generate NTLM response, error: %d", __func__, rc); return rc; } - rc = E_md4hash(ses->password, temp_key); + rc = E_md4hash(ses->password, temp_key, nls_cp); if (rc) { cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); return rc; @@ -404,7 +404,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, } /* calculate md4 hash of password */ - E_md4hash(ses->password, nt_hash); + E_md4hash(ses->password, nt_hash, nls_cp); rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, CIFS_NTHASH_SIZE); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index d9dbaf869cd1..30ff56005d8f 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -125,5 +125,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* 
CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "1.75" +#define CIFS_VERSION "1.76" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index ef4f631e4c01..6f4e243e0f62 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -395,8 +395,9 @@ extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *, extern int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov, struct TCP_Server_Info *server, __u32 expected_sequence_number); -extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *); -extern int setup_ntlm_response(struct cifs_ses *); +extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *, + const struct nls_table *); +extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *); extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *); extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *); extern void cifs_crypto_shash_release(struct TCP_Server_Info *); @@ -448,7 +449,8 @@ extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr, const unsigned char *path, struct cifs_sb_info *cifs_sb, int xid); extern int mdfour(unsigned char *, unsigned char *, int); -extern int E_md4hash(const unsigned char *passwd, unsigned char *p16); +extern int E_md4hash(const unsigned char *passwd, unsigned char *p16, + const struct nls_table *codepage); extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index d545a95c30ed..8cd4b52d4217 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -37,6 +37,7 @@ #include <asm/uaccess.h> #include <asm/processor.h> #include <linux/inet.h> +#include <linux/module.h> #include <net/ipv6.h> #include "cifspdu.h" #include "cifsglob.h" @@ -440,6 +441,8 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, smb_msg.msg_controllen = 0; for (total_read = 0; to_read; total_read += length, to_read -= length) { + try_to_freeze(); + if (server_unresponsive(server)) { total_read = -EAGAIN; break; @@ -3452,7 +3455,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses, else #endif /* CIFS_WEAK_PW_HASH */ rc = SMBNTencrypt(tcon->password, ses->server->cryptkey, - bcc_ptr); + bcc_ptr, nls_codepage); bcc_ptr += CIFS_AUTH_RESP_SIZE; if (ses->capabilities & CAP_UNICODE) { diff --git a/fs/cifs/file.c b/fs/cifs/file.c index ea096ce5d4f7..4dd9283885e7 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -645,20 +645,20 @@ int cifs_closedir(struct inode *inode, struct file *file) } static struct cifsLockInfo * -cifs_lock_init(__u64 len, __u64 offset, __u8 type, __u16 netfid) +cifs_lock_init(__u64 offset, __u64 length, __u8 type, __u16 netfid) { - struct cifsLockInfo *li = + struct cifsLockInfo *lock = kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL); - if (!li) - return li; - li->netfid = netfid; - li->offset = offset; - li->length = len; - li->type = type; - li->pid = current->tgid; - INIT_LIST_HEAD(&li->blist); - init_waitqueue_head(&li->block_q); - return li; + if (!lock) + return lock; + lock->offset = offset; + lock->length = length; + lock->type = type; + lock->netfid = netfid; + lock->pid = current->tgid; + INIT_LIST_HEAD(&lock->blist); + init_waitqueue_head(&lock->block_q); + return lock; } static void @@ -672,7 +672,7 @@ cifs_del_lock_waiters(struct cifsLockInfo *lock) } static bool -cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset, +__cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, 
__u8 type, __u16 netfid, struct cifsLockInfo **conf_lock) { @@ -694,6 +694,21 @@ cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset, return false; } +static bool +cifs_find_lock_conflict(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock, + struct cifsLockInfo **conf_lock) +{ + return __cifs_find_lock_conflict(cinode, lock->offset, lock->length, + lock->type, lock->netfid, conf_lock); +} + +/* + * Check if there is another lock that prevents us to set the lock (mandatory + * style). If such a lock exists, update the flock structure with its + * properties. Otherwise, set the flock type to F_UNLCK if we can cache brlocks + * or leave it the same if we can't. Returns 0 if we don't need to request to + * the server or 1 otherwise. + */ static int cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, __u8 type, __u16 netfid, struct file_lock *flock) @@ -704,8 +719,8 @@ cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, mutex_lock(&cinode->lock_mutex); - exist = cifs_find_lock_conflict(cinode, offset, length, type, netfid, - &conf_lock); + exist = __cifs_find_lock_conflict(cinode, offset, length, type, netfid, + &conf_lock); if (exist) { flock->fl_start = conf_lock->offset; flock->fl_end = conf_lock->offset + conf_lock->length - 1; @@ -723,40 +738,33 @@ cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, return rc; } -static int -cifs_lock_add(struct cifsInodeInfo *cinode, __u64 len, __u64 offset, - __u8 type, __u16 netfid) +static void +cifs_lock_add(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock) { - struct cifsLockInfo *li; - - li = cifs_lock_init(len, offset, type, netfid); - if (!li) - return -ENOMEM; - mutex_lock(&cinode->lock_mutex); - list_add_tail(&li->llist, &cinode->llist); + list_add_tail(&lock->llist, &cinode->llist); mutex_unlock(&cinode->lock_mutex); - return 0; } +/* + * Set the byte-range lock (mandatory style). Returns: + * 1) 0, if we set the lock and don't need to request to the server; + * 2) 1, if no locks prevent us but we need to request to the server; + * 3) -EACCESS, if there is a lock that prevents us and wait is false. + */ static int -cifs_lock_add_if(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, - __u8 type, __u16 netfid, bool wait) +cifs_lock_add_if(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock, + bool wait) { - struct cifsLockInfo *lock, *conf_lock; + struct cifsLockInfo *conf_lock; bool exist; int rc = 0; - lock = cifs_lock_init(length, offset, type, netfid); - if (!lock) - return -ENOMEM; - try_again: exist = false; mutex_lock(&cinode->lock_mutex); - exist = cifs_find_lock_conflict(cinode, offset, length, type, netfid, - &conf_lock); + exist = cifs_find_lock_conflict(cinode, lock, &conf_lock); if (!exist && cinode->can_cache_brlcks) { list_add_tail(&lock->llist, &cinode->llist); mutex_unlock(&cinode->lock_mutex); @@ -775,18 +783,21 @@ try_again: (lock->blist.next == &lock->blist)); if (!rc) goto try_again; - else { - mutex_lock(&cinode->lock_mutex); - list_del_init(&lock->blist); - mutex_unlock(&cinode->lock_mutex); - } + mutex_lock(&cinode->lock_mutex); + list_del_init(&lock->blist); } - kfree(lock); mutex_unlock(&cinode->lock_mutex); return rc; } +/* + * Check if there is another lock that prevents us to set the lock (posix + * style). If such a lock exists, update the flock structure with its + * properties. Otherwise, set the flock type to F_UNLCK if we can cache brlocks + * or leave it the same if we can't. 
Returns 0 if we don't need to request to + * the server or 1 otherwise. + */ static int cifs_posix_lock_test(struct file *file, struct file_lock *flock) { @@ -794,6 +805,9 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock) struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); unsigned char saved_type = flock->fl_type; + if ((flock->fl_flags & FL_POSIX) == 0) + return 1; + mutex_lock(&cinode->lock_mutex); posix_test_lock(file, flock); @@ -806,16 +820,25 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock) return rc; } +/* + * Set the byte-range lock (posix style). Returns: + * 1) 0, if we set the lock and don't need to request to the server; + * 2) 1, if we need to request to the server; + * 3) <0, if the error occurs while setting the lock. + */ static int cifs_posix_lock_set(struct file *file, struct file_lock *flock) { struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); - int rc; + int rc = 1; + + if ((flock->fl_flags & FL_POSIX) == 0) + return rc; mutex_lock(&cinode->lock_mutex); if (!cinode->can_cache_brlcks) { mutex_unlock(&cinode->lock_mutex); - return 1; + return rc; } rc = posix_lock_file_wait(file, flock); mutex_unlock(&cinode->lock_mutex); @@ -928,7 +951,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) else type = CIFS_WRLCK; - lck = cifs_lock_init(length, flock->fl_start, type, + lck = cifs_lock_init(flock->fl_start, length, type, cfile->netfid); if (!lck) { rc = -ENOMEM; @@ -1065,14 +1088,12 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u8 type, if (rc != 0) cERROR(1, "Error unlocking previously locked " "range %d during test of lock", rc); - rc = 0; - return rc; + return 0; } if (type & LOCKING_ANDX_SHARED_LOCK) { flock->fl_type = F_WRLCK; - rc = 0; - return rc; + return 0; } rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, @@ -1090,8 +1111,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u8 type, } else flock->fl_type = F_WRLCK; - rc = 0; - return rc; + return 0; } static void @@ -1249,20 +1269,26 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, } if (lock) { - rc = cifs_lock_add_if(cinode, flock->fl_start, length, - type, netfid, wait_flag); + struct cifsLockInfo *lock; + + lock = cifs_lock_init(flock->fl_start, length, type, netfid); + if (!lock) + return -ENOMEM; + + rc = cifs_lock_add_if(cinode, lock, wait_flag); if (rc < 0) - return rc; - else if (!rc) + kfree(lock); + if (rc <= 0) goto out; rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, flock->fl_start, 0, 1, type, wait_flag, 0); - if (rc == 0) { - /* For Windows locks we must store them. 
*/ - rc = cifs_lock_add(cinode, length, flock->fl_start, - type, netfid); + if (rc) { + kfree(lock); + goto out; } + + cifs_lock_add(cinode, lock); } else if (unlock) rc = cifs_unlock_range(cfile, flock, xid); diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 5de03ec20144..a090bbe6ee29 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -554,7 +554,10 @@ static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon, rc); return rc; } - cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); + /* FindFirst/Next set last_entry to NULL on malformed reply */ + if (cifsFile->srch_inf.last_entry) + cifs_save_resume_key(cifsFile->srch_inf.last_entry, + cifsFile); } while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) && @@ -562,7 +565,10 @@ static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon, cFYI(1, "calling findnext2"); rc = CIFSFindNext(xid, pTcon, cifsFile->netfid, &cifsFile->srch_inf); - cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); + /* FindFirst/Next set last_entry to NULL on malformed reply */ + if (cifsFile->srch_inf.last_entry) + cifs_save_resume_key(cifsFile->srch_inf.last_entry, + cifsFile); if (rc) return -ENOENT; } diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index c7d80e24f24e..4ec3ee9d72cc 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -683,7 +683,7 @@ ssetup_ntlmssp_authenticate: cpu_to_le16(CIFS_AUTH_RESP_SIZE); /* calculate ntlm response and session key */ - rc = setup_ntlm_response(ses); + rc = setup_ntlm_response(ses, nls_cp); if (rc) { cERROR(1, "Error %d during NTLM authentication", rc); goto ssetup_exit; diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index ac1221d969d6..80d850881938 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -199,75 +199,36 @@ SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24) return rc; } -/* Routines for Windows NT MD4 Hash functions. */ -static int -_my_wcslen(__u16 *str) -{ - int len = 0; - while (*str++ != 0) - len++; - return len; -} - -/* - * Convert a string into an NT UNICODE string. - * Note that regardless of processor type - * this must be in intel (little-endian) - * format. - */ - -static int -_my_mbstowcs(__u16 *dst, const unsigned char *src, int len) -{ /* BB not a very good conversion routine - change/fix */ - int i; - __u16 val; - - for (i = 0; i < len; i++) { - val = *src; - SSVAL(dst, 0, val); - dst++; - src++; - if (val == 0) - break; - } - return i; -} - /* * Creates the MD4 Hash of the users password in NT UNICODE. */ int -E_md4hash(const unsigned char *passwd, unsigned char *p16) +E_md4hash(const unsigned char *passwd, unsigned char *p16, + const struct nls_table *codepage) { int rc; int len; - __u16 wpwd[129]; + __le16 wpwd[129]; /* Password cannot be longer than 128 characters */ - if (passwd) { - len = strlen((char *) passwd); - if (len > 128) - len = 128; - - /* Password must be converted to NT unicode */ - _my_mbstowcs(wpwd, passwd, len); - } else + if (passwd) /* Password must be converted to NT unicode */ + len = cifs_strtoUCS(wpwd, passwd, 128, codepage); + else { len = 0; + *wpwd = 0; /* Ensure string is null terminated */ + } - wpwd[len] = 0; /* Ensure string is null terminated */ - /* Calculate length in bytes */ - len = _my_wcslen(wpwd) * sizeof(__u16); - - rc = mdfour(p16, (unsigned char *) wpwd, len); - memset(wpwd, 0, 129 * 2); + rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__le16)); + memset(wpwd, 0, 129 * sizeof(__le16)); return rc; } /* Does the NT MD4 hash then des encryption. 
*/ int -SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) +SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24, + const struct nls_table *codepage) { int rc; unsigned char p16[16], p21[21]; @@ -275,7 +236,7 @@ SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) memset(p16, '\0', 16); memset(p21, '\0', 21); - rc = E_md4hash(passwd, p16); + rc = E_md4hash(passwd, p16, codepage); if (rc) { cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); return rc; diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index ca418aaf6352..9d8715c45f25 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -292,7 +292,7 @@ int __init configfs_inode_init(void) return bdi_init(&configfs_backing_dev_info); } -void __exit configfs_inode_exit(void) +void configfs_inode_exit(void) { bdi_destroy(&configfs_backing_dev_info); } diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index ecc62178beda..276e15cafd58 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -143,28 +143,26 @@ static int __init configfs_init(void) goto out; config_kobj = kobject_create_and_add("config", kernel_kobj); - if (!config_kobj) { - kmem_cache_destroy(configfs_dir_cachep); - configfs_dir_cachep = NULL; - goto out; - } + if (!config_kobj) + goto out2; + + err = configfs_inode_init(); + if (err) + goto out3; err = register_filesystem(&configfs_fs_type); - if (err) { - printk(KERN_ERR "configfs: Unable to register filesystem!\n"); - kobject_put(config_kobj); - kmem_cache_destroy(configfs_dir_cachep); - configfs_dir_cachep = NULL; - goto out; - } + if (err) + goto out4; - err = configfs_inode_init(); - if (err) { - unregister_filesystem(&configfs_fs_type); - kobject_put(config_kobj); - kmem_cache_destroy(configfs_dir_cachep); - configfs_dir_cachep = NULL; - } + return 0; +out4: + printk(KERN_ERR "configfs: Unable to register filesystem!\n"); + configfs_inode_exit(); +out3: + kobject_put(config_kobj); +out2: + kmem_cache_destroy(configfs_dir_cachep); + configfs_dir_cachep = NULL; out: return err; } diff --git a/fs/dcache.c b/fs/dcache.c index 274f13e2f094..89509b5a090e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -36,6 +36,7 @@ #include <linux/bit_spinlock.h> #include <linux/rculist_bl.h> #include <linux/prefetch.h> +#include <linux/ratelimit.h> #include "internal.h" /* @@ -546,9 +547,11 @@ int d_invalidate(struct dentry * dentry) * would make it unreachable from the root, * we might still populate it if it was a * working directory or similar). + * We also need to leave mountpoints alone, + * directory or not. 
*/ - if (dentry->d_count > 1) { - if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) { + if (dentry->d_count > 1 && dentry->d_inode) { + if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry)) { spin_unlock(&dentry->d_lock); return -EBUSY; } @@ -2381,8 +2384,16 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) actual = __d_unalias(inode, dentry, alias); } write_sequnlock(&rename_lock); - if (IS_ERR(actual)) + if (IS_ERR(actual)) { + if (PTR_ERR(actual) == -ELOOP) + pr_warn_ratelimited( + "VFS: Lookup of '%s' in %s %s" + " would have caused loop\n", + dentry->d_name.name, + inode->i_sb->s_type->name, + inode->i_sb->s_id); dput(alias); + } goto out_nolock; } } @@ -2428,16 +2439,14 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name) /** * prepend_path - Prepend path string to a buffer * @path: the dentry/vfsmount to report - * @root: root vfsmnt/dentry (may be modified by this function) + * @root: root vfsmnt/dentry * @buffer: pointer to the end of the buffer * @buflen: pointer to buffer length * * Caller holds the rename_lock. - * - * If path is not reachable from the supplied root, then the value of - * root is changed (without modifying refcounts). */ -static int prepend_path(const struct path *path, struct path *root, +static int prepend_path(const struct path *path, + const struct path *root, char **buffer, int *buflen) { struct dentry *dentry = path->dentry; @@ -2472,10 +2481,10 @@ static int prepend_path(const struct path *path, struct path *root, dentry = parent; } -out: if (!error && !slash) error = prepend(buffer, buflen, "/", 1); +out: br_read_unlock(vfsmount_lock); return error; @@ -2489,15 +2498,17 @@ global_root: WARN(1, "Root dentry has weird name <%.*s>\n", (int) dentry->d_name.len, dentry->d_name.name); } - root->mnt = vfsmnt; - root->dentry = dentry; + if (!slash) + error = prepend(buffer, buflen, "/", 1); + if (!error) + error = vfsmnt->mnt_ns ? 1 : 2; goto out; } /** * __d_path - return the path of a dentry * @path: the dentry/vfsmount to report - * @root: root vfsmnt/dentry (may be modified by this function) + * @root: root vfsmnt/dentry * @buf: buffer to return value in * @buflen: buffer length * @@ -2508,10 +2519,10 @@ global_root: * * "buflen" should be positive. * - * If path is not reachable from the supplied root, then the value of - * root is changed (without modifying refcounts). + * If the path is not reachable from the supplied root, return %NULL. */ -char *__d_path(const struct path *path, struct path *root, +char *__d_path(const struct path *path, + const struct path *root, char *buf, int buflen) { char *res = buf + buflen; @@ -2522,7 +2533,28 @@ char *__d_path(const struct path *path, struct path *root, error = prepend_path(path, root, &res, &buflen); write_sequnlock(&rename_lock); - if (error) + if (error < 0) + return ERR_PTR(error); + if (error > 0) + return NULL; + return res; +} + +char *d_absolute_path(const struct path *path, + char *buf, int buflen) +{ + struct path root = {}; + char *res = buf + buflen; + int error; + + prepend(&res, &buflen, "\0", 1); + write_seqlock(&rename_lock); + error = prepend_path(path, &root, &res, &buflen); + write_sequnlock(&rename_lock); + + if (error > 1) + error = -EINVAL; + if (error < 0) return ERR_PTR(error); return res; } @@ -2530,8 +2562,9 @@ char *__d_path(const struct path *path, struct path *root, /* * same as __d_path but appends "(deleted)" for unlinked files. 
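For context on the __d_path()/d_absolute_path() rework in the fs/dcache.c hunks above: both routines assemble the path backwards with prepend(), writing each component from the end of the caller's buffer toward the front so nothing is copied twice. The fragment below is a small stand-alone sketch of that buffer-filling technique only; build_path() and the fixed component array are illustrative assumptions, not the kernel's dentry/vfsmount walk.

/* User-space model of the prepend() technique used by __d_path() above:
 * the string grows from the end of the buffer toward the front.
 * build_path() and the component array are illustrative, not kernel code. */
#include <errno.h>
#include <stdio.h>
#include <string.h>

static int prepend(char **buffer, int *buflen, const char *str, int namelen)
{
	*buflen -= namelen;
	if (*buflen < 0)
		return -ENAMETOOLONG;
	*buffer -= namelen;
	memcpy(*buffer, str, namelen);
	return 0;
}

/* Walk components leaf-to-root, prepending "/<name>" for each one. */
static char *build_path(const char *const *names, int n, char *buf, int buflen)
{
	char *res = buf + buflen;
	int i;

	if (prepend(&res, &buflen, "\0", 1))
		return NULL;
	for (i = 0; i < n; i++) {
		if (prepend(&res, &buflen, names[i], (int)strlen(names[i])) ||
		    prepend(&res, &buflen, "/", 1))
			return NULL;
	}
	return res;
}

int main(void)
{
	/* leaf first, as a walk from dentry toward the root would produce */
	const char *const comps[] = { "file.txt", "dir", "home" };
	char buf[64];
	char *p = build_path(comps, 3, buf, sizeof(buf));

	printf("%s\n", p ? p : "(too long)");	/* prints /home/dir/file.txt */
	return 0;
}
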
*/ -static int path_with_deleted(const struct path *path, struct path *root, - char **buf, int *buflen) +static int path_with_deleted(const struct path *path, + const struct path *root, + char **buf, int *buflen) { prepend(buf, buflen, "\0", 1); if (d_unlinked(path->dentry)) { @@ -2568,7 +2601,6 @@ char *d_path(const struct path *path, char *buf, int buflen) { char *res = buf + buflen; struct path root; - struct path tmp; int error; /* @@ -2583,9 +2615,8 @@ char *d_path(const struct path *path, char *buf, int buflen) get_fs_root(current->fs, &root); write_seqlock(&rename_lock); - tmp = root; - error = path_with_deleted(path, &tmp, &res, &buflen); - if (error) + error = path_with_deleted(path, &root, &res, &buflen); + if (error < 0) res = ERR_PTR(error); write_sequnlock(&rename_lock); path_put(&root); @@ -2606,7 +2637,6 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen) { char *res = buf + buflen; struct path root; - struct path tmp; int error; if (path->dentry->d_op && path->dentry->d_op->d_dname) @@ -2614,9 +2644,8 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen) get_fs_root(current->fs, &root); write_seqlock(&rename_lock); - tmp = root; - error = path_with_deleted(path, &tmp, &res, &buflen); - if (!error && !path_equal(&tmp, &root)) + error = path_with_deleted(path, &root, &res, &buflen); + if (error > 0) error = prepend_unreachable(&res, &buflen); write_sequnlock(&rename_lock); path_put(&root); @@ -2747,19 +2776,18 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) write_seqlock(&rename_lock); if (!d_unlinked(pwd.dentry)) { unsigned long len; - struct path tmp = root; char *cwd = page + PAGE_SIZE; int buflen = PAGE_SIZE; prepend(&cwd, &buflen, "\0", 1); - error = prepend_path(&pwd, &tmp, &cwd, &buflen); + error = prepend_path(&pwd, &root, &cwd, &buflen); write_sequnlock(&rename_lock); - if (error) + if (error < 0) goto out; /* Unreachable from current root */ - if (!path_equal(&tmp, &root)) { + if (error > 0) { error = prepend_unreachable(&cwd, &buflen); if (error) goto out; diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 58609bde3b9f..2a834255c75d 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -967,7 +967,7 @@ static void ecryptfs_set_default_crypt_stat_vals( /** * ecryptfs_new_file_context - * @ecryptfs_dentry: The eCryptfs dentry + * @ecryptfs_inode: The eCryptfs inode * * If the crypto context for the file has not yet been established, * this is where we do that. 
Establishing a new crypto context @@ -984,13 +984,13 @@ static void ecryptfs_set_default_crypt_stat_vals( * * Returns zero on success; non-zero otherwise */ -int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry) +int ecryptfs_new_file_context(struct inode *ecryptfs_inode) { struct ecryptfs_crypt_stat *crypt_stat = - &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; + &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = &ecryptfs_superblock_to_private( - ecryptfs_dentry->d_sb)->mount_crypt_stat; + ecryptfs_inode->i_sb)->mount_crypt_stat; int cipher_name_len; int rc = 0; @@ -1299,12 +1299,12 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max, } static int -ecryptfs_write_metadata_to_contents(struct dentry *ecryptfs_dentry, +ecryptfs_write_metadata_to_contents(struct inode *ecryptfs_inode, char *virt, size_t virt_len) { int rc; - rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt, + rc = ecryptfs_write_lower(ecryptfs_inode, virt, 0, virt_len); if (rc < 0) printk(KERN_ERR "%s: Error attempting to write header " @@ -1338,7 +1338,8 @@ static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask, /** * ecryptfs_write_metadata - * @ecryptfs_dentry: The eCryptfs dentry + * @ecryptfs_dentry: The eCryptfs dentry, which should be negative + * @ecryptfs_inode: The newly created eCryptfs inode * * Write the file headers out. This will likely involve a userspace * callout, in which the session key is encrypted with one or more @@ -1348,10 +1349,11 @@ static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask, * * Returns zero on success; non-zero on error */ -int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) +int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry, + struct inode *ecryptfs_inode) { struct ecryptfs_crypt_stat *crypt_stat = - &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; + &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; unsigned int order; char *virt; size_t virt_len; @@ -1391,7 +1393,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt, size); else - rc = ecryptfs_write_metadata_to_contents(ecryptfs_dentry, virt, + rc = ecryptfs_write_metadata_to_contents(ecryptfs_inode, virt, virt_len); if (rc) { printk(KERN_ERR "%s: Error writing metadata out to lower file; " @@ -1943,7 +1945,7 @@ static unsigned char *portable_filename_chars = ("-.0123456789ABCD" /* We could either offset on every reverse map or just pad some 0x00's * at the front here */ -static const unsigned char filename_rev_map[] = { +static const unsigned char filename_rev_map[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */ @@ -1959,7 +1961,7 @@ static const unsigned char filename_rev_map[] = { 0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */ 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */ 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */ - 0x3D, 0x3E, 0x3F + 0x3D, 0x3E, 0x3F /* 123 - 255 initialized to 0x00 */ }; /** diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 54481a3b2c79..a9f29b12fbf2 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -584,9 +584,10 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat); int ecryptfs_write_inode_size_to_metadata(struct inode 
*ecryptfs_inode); int ecryptfs_encrypt_page(struct page *page); int ecryptfs_decrypt_page(struct page *page); -int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry); +int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry, + struct inode *ecryptfs_inode); int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry); -int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry); +int ecryptfs_new_file_context(struct inode *ecryptfs_inode); void ecryptfs_write_crypt_stat_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat, size_t *written); diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index c6ac98cf9baa..d3f95f941c47 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -139,6 +139,27 @@ out: return rc; } +static void ecryptfs_vma_close(struct vm_area_struct *vma) +{ + filemap_write_and_wait(vma->vm_file->f_mapping); +} + +static const struct vm_operations_struct ecryptfs_file_vm_ops = { + .close = ecryptfs_vma_close, + .fault = filemap_fault, +}; + +static int ecryptfs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + int rc; + + rc = generic_file_mmap(file, vma); + if (!rc) + vma->vm_ops = &ecryptfs_file_vm_ops; + + return rc; +} + struct kmem_cache *ecryptfs_file_info_cache; /** @@ -349,7 +370,7 @@ const struct file_operations ecryptfs_main_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = ecryptfs_file_mmap, .open = ecryptfs_open, .flush = ecryptfs_flush, .release = ecryptfs_release, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index a36d327f1521..32f90a3ae63e 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -172,22 +172,23 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode, * it. It will also update the eCryptfs directory inode to mimic the * stat of the lower directory inode. 
* - * Returns zero on success; non-zero on error condition + * Returns the new eCryptfs inode on success; an ERR_PTR on error condition */ -static int +static struct inode * ecryptfs_do_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, int mode) { int rc; struct dentry *lower_dentry; struct dentry *lower_dir_dentry; + struct inode *inode; lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); lower_dir_dentry = lock_parent(lower_dentry); if (IS_ERR(lower_dir_dentry)) { ecryptfs_printk(KERN_ERR, "Error locking directory of " "dentry\n"); - rc = PTR_ERR(lower_dir_dentry); + inode = ERR_CAST(lower_dir_dentry); goto out; } rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode, @@ -195,20 +196,19 @@ ecryptfs_do_create(struct inode *directory_inode, if (rc) { printk(KERN_ERR "%s: Failure to create dentry in lower fs; " "rc = [%d]\n", __func__, rc); + inode = ERR_PTR(rc); goto out_lock; } - rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, - directory_inode->i_sb); - if (rc) { - ecryptfs_printk(KERN_ERR, "Failure in ecryptfs_interpose\n"); + inode = __ecryptfs_get_inode(lower_dentry->d_inode, + directory_inode->i_sb); + if (IS_ERR(inode)) goto out_lock; - } fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode); fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode); out_lock: unlock_dir(lower_dir_dentry); out: - return rc; + return inode; } /** @@ -219,26 +219,26 @@ out: * * Returns zero on success */ -static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) +static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry, + struct inode *ecryptfs_inode) { struct ecryptfs_crypt_stat *crypt_stat = - &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; + &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; int rc = 0; - if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { + if (S_ISDIR(ecryptfs_inode->i_mode)) { ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); goto out; } ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n"); - rc = ecryptfs_new_file_context(ecryptfs_dentry); + rc = ecryptfs_new_file_context(ecryptfs_inode); if (rc) { ecryptfs_printk(KERN_ERR, "Error creating new file " "context; rc = [%d]\n", rc); goto out; } - rc = ecryptfs_get_lower_file(ecryptfs_dentry, - ecryptfs_dentry->d_inode); + rc = ecryptfs_get_lower_file(ecryptfs_dentry, ecryptfs_inode); if (rc) { printk(KERN_ERR "%s: Error attempting to initialize " "the lower file for the dentry with name " @@ -246,10 +246,10 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) ecryptfs_dentry->d_name.name, rc); goto out; } - rc = ecryptfs_write_metadata(ecryptfs_dentry); + rc = ecryptfs_write_metadata(ecryptfs_dentry, ecryptfs_inode); if (rc) printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc); - ecryptfs_put_lower_file(ecryptfs_dentry->d_inode); + ecryptfs_put_lower_file(ecryptfs_inode); out: return rc; } @@ -269,18 +269,28 @@ static int ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, int mode, struct nameidata *nd) { + struct inode *ecryptfs_inode; int rc; - /* ecryptfs_do_create() calls ecryptfs_interpose() */ - rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode); - if (unlikely(rc)) { + ecryptfs_inode = ecryptfs_do_create(directory_inode, ecryptfs_dentry, + mode); + if (unlikely(IS_ERR(ecryptfs_inode))) { ecryptfs_printk(KERN_WARNING, "Failed to create file in" "lower filesystem\n"); + rc = PTR_ERR(ecryptfs_inode); goto 
out; } /* At this point, a file exists on "disk"; we need to make sure * that this on disk file is prepared to be an ecryptfs file */ - rc = ecryptfs_initialize_file(ecryptfs_dentry); + rc = ecryptfs_initialize_file(ecryptfs_dentry, ecryptfs_inode); + if (rc) { + drop_nlink(ecryptfs_inode); + unlock_new_inode(ecryptfs_inode); + iput(ecryptfs_inode); + goto out; + } + d_instantiate(ecryptfs_dentry, ecryptfs_inode); + unlock_new_inode(ecryptfs_inode); out: return rc; } diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig index fa9a286c8771..da42f32c49be 100644 --- a/fs/exofs/Kconfig +++ b/fs/exofs/Kconfig @@ -5,7 +5,7 @@ # selected by any of the users. config ORE tristate - depends on EXOFS_FS + depends on EXOFS_FS || PNFS_OBJLAYOUT select ASYNC_XOR default SCSI_OSD_ULD diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index fcfa86ae6faf..d271ad837202 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -23,6 +23,7 @@ */ #include <linux/slab.h> +#include <linux/module.h> #include <asm/div64.h> #include <linux/lcm.h> diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 057b237b8b69..e6085ec192d6 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -35,6 +35,7 @@ #include <linux/parser.h> #include <linux/vfs.h> #include <linux/random.h> +#include <linux/module.h> #include <linux/exportfs.h> #include <linux/slab.h> diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index f6dba4505f1c..12ccacda44e0 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -565,7 +565,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) brelse(bitmap_bh); printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu" ", computed = %llu, %llu\n", - EXT4_B2C(sbi, ext4_free_blocks_count(es)), + EXT4_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)), desc_count, bitmap_count); return bitmap_count; #else diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 61fa9e1614af..607b1557d292 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1095,7 +1095,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), ext4_idx_pblock(EXT_FIRST_INDEX(neh))); - neh->eh_depth = cpu_to_le16(neh->eh_depth + 1); + neh->eh_depth = cpu_to_le16(le16_to_cpu(neh->eh_depth) + 1); ext4_mark_inode_dirty(handle, inode); out: brelse(bh); @@ -2955,7 +2955,6 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, /* Pre-conditions */ BUG_ON(!ext4_ext_is_uninitialized(ex)); BUG_ON(!in_range(map->m_lblk, ee_block, ee_len)); - BUG_ON(map->m_lblk + map->m_len > ee_block + ee_len); /* * Attempt to transfer newly initialized blocks from the currently diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cc5a6da030a1..92655fd89657 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1339,8 +1339,11 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, clear_buffer_unwritten(bh); } - /* skip page if block allocation undone */ - if (buffer_delay(bh) || buffer_unwritten(bh)) + /* + * skip page if block allocation undone and + * block is dirty + */ + if (ext4_bh_delay_or_unwritten(NULL, bh)) skip_page = 1; bh = bh->b_this_page; block_start += bh->b_size; @@ -2270,6 +2273,7 @@ retry: ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " "%ld pages, ino %lu; err %d", __func__, wbc->nr_to_write, inode->i_ino, ret); + blk_finish_plug(&plug); goto out_writepages; } @@ -2372,7 +2376,7 @@ static int ext4_nonda_switch(struct super_block *sb) * start pushing delalloc when 1/2 of free blocks are dirty. 
*/ if (free_blocks < 2 * dirty_blocks) - writeback_inodes_sb_if_idle(sb); + writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE); return 0; } @@ -2386,7 +2390,6 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; struct inode *inode = mapping->host; handle_t *handle; - loff_t page_len; index = pos >> PAGE_CACHE_SHIFT; @@ -2433,13 +2436,6 @@ retry: */ if (pos + len > inode->i_size) ext4_truncate_failed_write(inode); - } else { - page_len = pos & (PAGE_CACHE_SIZE - 1); - if (page_len > 0) { - ret = ext4_discard_partial_page_buffers_no_lock(handle, - inode, page, pos - page_len, page_len, - EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED); - } } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) @@ -2482,7 +2478,6 @@ static int ext4_da_write_end(struct file *file, loff_t new_i_size; unsigned long start, end; int write_mode = (int)(unsigned long)fsdata; - loff_t page_len; if (write_mode == FALL_BACK_TO_NONDELALLOC) { if (ext4_should_order_data(inode)) { @@ -2507,7 +2502,7 @@ static int ext4_da_write_end(struct file *file, */ new_i_size = pos + copied; - if (new_i_size > EXT4_I(inode)->i_disksize) { + if (copied && new_i_size > EXT4_I(inode)->i_disksize) { if (ext4_da_should_update_i_disksize(page, end)) { down_write(&EXT4_I(inode)->i_data_sem); if (new_i_size > EXT4_I(inode)->i_disksize) { @@ -2531,16 +2526,6 @@ static int ext4_da_write_end(struct file *file, } ret2 = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - - page_len = PAGE_CACHE_SIZE - - ((pos + copied - 1) & (PAGE_CACHE_SIZE - 1)); - - if (page_len > 0) { - ret = ext4_discard_partial_page_buffers_no_lock(handle, - inode, page, pos + copied - 1, page_len, - EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED); - } - copied = ret2; if (ret2 < 0) ret = ret2; @@ -2780,10 +2765,11 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, iocb->private, io_end->inode->i_ino, iocb, offset, size); + iocb->private = NULL; + /* if not aio dio with unwritten extents, just free io and return */ if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { ext4_free_io_end(io_end); - iocb->private = NULL; out: if (is_async) aio_complete(iocb, ret, 0); @@ -2807,7 +2793,6 @@ out: /* queue the work to convert unwritten extents to written */ queue_work(wq, &io_end->work); - iocb->private = NULL; /* XXX: probably should move into the real I/O completion handler */ inode_dio_done(inode); @@ -3202,26 +3187,8 @@ int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); - if (!page_has_buffers(page)) { - /* - * If the range to be discarded covers a partial block - * we need to get the page buffers. This is because - * partial blocks cannot be released and the page needs - * to be updated with the contents of the block before - * we write the zeros on top of it. 
- */ - if ((from & (blocksize - 1)) || - ((from + length) & (blocksize - 1))) { - create_empty_buffers(page, blocksize, 0); - } else { - /* - * If there are no partial blocks, - * there is nothing to update, - * so we can return now - */ - return 0; - } - } + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); /* Find the buffer that contains "offset" */ bh = page_buffers(page); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 7ce1d0b19c94..7e106c810c62 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -385,6 +385,18 @@ int ext4_bio_write_page(struct ext4_io_submit *io, block_end = block_start + blocksize; if (block_start >= len) { + /* + * Comments copied from block_write_full_page_endio: + * + * The page straddles i_size. It must be zeroed out on + * each and every writepage invocation because it may + * be mmapped. "A file is mapped in multiples of the + * page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when + * mapped, and writes to that region are not written + * out to the file." + */ + zero_user_segment(page, block_start, block_end); clear_buffer_dirty(bh); set_buffer_uptodate(bh); continue; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9953d80145ad..3e1329e2f826 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1155,9 +1155,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",block_validity"); if (!test_opt(sb, INIT_INODE_TABLE)) - seq_puts(seq, ",noinit_inode_table"); + seq_puts(seq, ",noinit_itable"); else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT) - seq_printf(seq, ",init_inode_table=%u", + seq_printf(seq, ",init_itable=%u", (unsigned) sbi->s_li_wait_mult); ext4_show_quota_options(seq, sb); @@ -1333,8 +1333,7 @@ enum { Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, - Opt_discard, Opt_nodiscard, - Opt_init_inode_table, Opt_noinit_inode_table, + Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, }; static const match_table_t tokens = { @@ -1407,9 +1406,9 @@ static const match_table_t tokens = { {Opt_dioread_lock, "dioread_lock"}, {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, - {Opt_init_inode_table, "init_itable=%u"}, - {Opt_init_inode_table, "init_itable"}, - {Opt_noinit_inode_table, "noinit_itable"}, + {Opt_init_itable, "init_itable=%u"}, + {Opt_init_itable, "init_itable"}, + {Opt_noinit_itable, "noinit_itable"}, {Opt_err, NULL}, }; @@ -1683,7 +1682,9 @@ static int parse_options(char *options, struct super_block *sb, data_opt = EXT4_MOUNT_WRITEBACK_DATA; datacheck: if (is_remount) { - if (test_opt(sb, DATA_FLAGS) != data_opt) { + if (!sbi->s_journal) + ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); + else if (test_opt(sb, DATA_FLAGS) != data_opt) { ext4_msg(sb, KERN_ERR, "Cannot change data mode on remount"); return 0; @@ -1890,7 +1891,7 @@ set_qf_format: case Opt_dioread_lock: clear_opt(sb, DIOREAD_NOLOCK); break; - case Opt_init_inode_table: + case Opt_init_itable: set_opt(sb, INIT_INODE_TABLE); if (args[0].from) { if (match_int(&args[0], &option)) @@ -1901,7 +1902,7 @@ set_qf_format: return 0; sbi->s_li_wait_mult = option; break; - case Opt_noinit_inode_table: + case Opt_noinit_itable: clear_opt(sb, INIT_INODE_TABLE); break; default: @@ -3099,8 +3100,6 @@ static void ext4_destroy_lazyinit_thread(void) } static int ext4_fill_super(struct super_block *sb, 
void *data, int silent) - __releases(kernel_lock) - __acquires(kernel_lock) { char *orig_data = kstrdup(data, GFP_KERNEL); struct buffer_head *bh; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 04cf3b91e501..517f211a3bd4 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -41,6 +41,7 @@ struct wb_writeback_work { unsigned int for_kupdate:1; unsigned int range_cyclic:1; unsigned int for_background:1; + enum wb_reason reason; /* why was writeback initiated? */ struct list_head list; /* pending work list */ struct completion *done; /* set if the caller waits */ @@ -115,7 +116,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi, static void __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, - bool range_cyclic) + bool range_cyclic, enum wb_reason reason) { struct wb_writeback_work *work; @@ -135,6 +136,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, work->sync_mode = WB_SYNC_NONE; work->nr_pages = nr_pages; work->range_cyclic = range_cyclic; + work->reason = reason; bdi_queue_work(bdi, work); } @@ -143,6 +145,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, * bdi_start_writeback - start writeback * @bdi: the backing device to write from * @nr_pages: the number of pages to write + * @reason: reason why some writeback work was initiated * * Description: * This does WB_SYNC_NONE opportunistic writeback. The IO is only @@ -150,9 +153,10 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, * completion. Caller need not hold sb s_umount semaphore. * */ -void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) +void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, + enum wb_reason reason) { - __bdi_start_writeback(bdi, nr_pages, true); + __bdi_start_writeback(bdi, nr_pages, true, reason); } /** @@ -251,7 +255,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) */ static int move_expired_inodes(struct list_head *delaying_queue, struct list_head *dispatch_queue, - unsigned long *older_than_this) + struct wb_writeback_work *work) { LIST_HEAD(tmp); struct list_head *pos, *node; @@ -262,8 +266,8 @@ static int move_expired_inodes(struct list_head *delaying_queue, while (!list_empty(delaying_queue)) { inode = wb_inode(delaying_queue->prev); - if (older_than_this && - inode_dirtied_after(inode, *older_than_this)) + if (work->older_than_this && + inode_dirtied_after(inode, *work->older_than_this)) break; if (sb && sb != inode->i_sb) do_sb_sort = 1; @@ -302,13 +306,13 @@ out: * | * +--> dequeue for IO */ -static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) +static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) { int moved; assert_spin_locked(&wb->list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); - moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); - trace_writeback_queue_io(wb, older_than_this, moved); + moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work); + trace_writeback_queue_io(wb, work, moved); } static int write_inode(struct inode *inode, struct writeback_control *wbc) @@ -641,31 +645,40 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb, return wrote; } -long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages) +long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, + enum wb_reason reason) { struct wb_writeback_work work = { .nr_pages = nr_pages, .sync_mode = WB_SYNC_NONE, .range_cyclic = 1, + .reason = reason, }; 
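/*
 * Editor's sketch, not part of the patch above: every wb_writeback_work item
 * now records why writeback was started, so the writeback tracepoints can
 * report it.  A hypothetical caller-side equivalent of the initializer just
 * above, using one of the wb_reason values this series introduces, would be:
 *
 *	struct wb_writeback_work work = {
 *		.nr_pages	= 1024,
 *		.sync_mode	= WB_SYNC_NONE,
 *		.range_cyclic	= 1,
 *		.reason		= WB_REASON_TRY_TO_FREE_PAGES,
 *	};
 *
 * External callers simply pass the reason through the exported helpers, e.g.
 * bdi_start_writeback(bdi, nr_pages, WB_REASON_TRY_TO_FREE_PAGES); the page
 * count and reason value here are purely illustrative.
 */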
spin_lock(&wb->list_lock); if (list_empty(&wb->b_io)) - queue_io(wb, NULL); + queue_io(wb, &work); __writeback_inodes_wb(wb, &work); spin_unlock(&wb->list_lock); return nr_pages - work.nr_pages; } -static inline bool over_bground_thresh(void) +static bool over_bground_thresh(struct backing_dev_info *bdi) { unsigned long background_thresh, dirty_thresh; global_dirty_limits(&background_thresh, &dirty_thresh); - return (global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) > background_thresh); + if (global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) > background_thresh) + return true; + + if (bdi_stat(bdi, BDI_RECLAIMABLE) > + bdi_dirty_limit(bdi, background_thresh)) + return true; + + return false; } /* @@ -675,7 +688,7 @@ static inline bool over_bground_thresh(void) static void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time) { - __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time); + __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time); } /* @@ -727,7 +740,7 @@ static long wb_writeback(struct bdi_writeback *wb, * For background writeout, stop when we are below the * background dirty threshold */ - if (work->for_background && !over_bground_thresh()) + if (work->for_background && !over_bground_thresh(wb->bdi)) break; if (work->for_kupdate) { @@ -738,7 +751,7 @@ static long wb_writeback(struct bdi_writeback *wb, trace_writeback_start(wb->bdi, work); if (list_empty(&wb->b_io)) - queue_io(wb, work->older_than_this); + queue_io(wb, work); if (work->sb) progress = writeback_sb_inodes(work->sb, wb, work); else @@ -811,13 +824,14 @@ static unsigned long get_nr_dirty_pages(void) static long wb_check_background_flush(struct bdi_writeback *wb) { - if (over_bground_thresh()) { + if (over_bground_thresh(wb->bdi)) { struct wb_writeback_work work = { .nr_pages = LONG_MAX, .sync_mode = WB_SYNC_NONE, .for_background = 1, .range_cyclic = 1, + .reason = WB_REASON_BACKGROUND, }; return wb_writeback(wb, &work); @@ -851,6 +865,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) .sync_mode = WB_SYNC_NONE, .for_kupdate = 1, .range_cyclic = 1, + .reason = WB_REASON_PERIODIC, }; return wb_writeback(wb, &work); @@ -969,7 +984,7 @@ int bdi_writeback_thread(void *data) * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back * the whole world. */ -void wakeup_flusher_threads(long nr_pages) +void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) { struct backing_dev_info *bdi; @@ -982,7 +997,7 @@ void wakeup_flusher_threads(long nr_pages) list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { if (!bdi_has_dirty_io(bdi)) continue; - __bdi_start_writeback(bdi, nr_pages, false); + __bdi_start_writeback(bdi, nr_pages, false, reason); } rcu_read_unlock(); } @@ -1198,12 +1213,15 @@ static void wait_sb_inodes(struct super_block *sb) * writeback_inodes_sb_nr - writeback dirty inodes from given super_block * @sb: the superblock * @nr: the number of pages to write + * @reason: reason why some writeback work initiated * * Start writeback on some inodes on this super_block. No guarantees are made * on how many (if any) will be written, and this function does not wait * for IO completion of submitted IO. 
*/ -void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) +void writeback_inodes_sb_nr(struct super_block *sb, + unsigned long nr, + enum wb_reason reason) { DECLARE_COMPLETION_ONSTACK(done); struct wb_writeback_work work = { @@ -1212,6 +1230,7 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) .tagged_writepages = 1, .done = &done, .nr_pages = nr, + .reason = reason, }; WARN_ON(!rwsem_is_locked(&sb->s_umount)); @@ -1223,29 +1242,31 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr); /** * writeback_inodes_sb - writeback dirty inodes from given super_block * @sb: the superblock + * @reason: reason why some writeback work was initiated * * Start writeback on some inodes on this super_block. No guarantees are made * on how many (if any) will be written, and this function does not wait * for IO completion of submitted IO. */ -void writeback_inodes_sb(struct super_block *sb) +void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) { - return writeback_inodes_sb_nr(sb, get_nr_dirty_pages()); + return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); } EXPORT_SYMBOL(writeback_inodes_sb); /** * writeback_inodes_sb_if_idle - start writeback if none underway * @sb: the superblock + * @reason: reason why some writeback work was initiated * * Invoke writeback_inodes_sb if no writeback is currently underway. * Returns 1 if writeback was started, 0 if not. */ -int writeback_inodes_sb_if_idle(struct super_block *sb) +int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason) { if (!writeback_in_progress(sb->s_bdi)) { down_read(&sb->s_umount); - writeback_inodes_sb(sb); + writeback_inodes_sb(sb, reason); up_read(&sb->s_umount); return 1; } else @@ -1257,16 +1278,18 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle); * writeback_inodes_sb_if_idle - start writeback if none underway * @sb: the superblock * @nr: the number of pages to write + * @reason: reason why some writeback work was initiated * * Invoke writeback_inodes_sb if no writeback is currently underway. * Returns 1 if writeback was started, 0 if not. 
*/ int writeback_inodes_sb_nr_if_idle(struct super_block *sb, - unsigned long nr) + unsigned long nr, + enum wb_reason reason) { if (!writeback_in_progress(sb->s_bdi)) { down_read(&sb->s_umount); - writeback_inodes_sb_nr(sb, nr); + writeback_inodes_sb_nr(sb, nr, reason); up_read(&sb->s_umount); return 1; } else @@ -1290,6 +1313,7 @@ void sync_inodes_sb(struct super_block *sb) .nr_pages = LONG_MAX, .range_cyclic = 0, .done = &done, + .reason = WB_REASON_SYNC, }; WARN_ON(!rwsem_is_locked(&sb->s_umount)); diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index b6cca47f7b07..3426521f3205 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -47,6 +47,7 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/stat.h> +#include <linux/module.h> #include "fuse_i.h" diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 5cb8614508c3..2aaf3eaaf13d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1512,7 +1512,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, else if (outarg->offset + num > file_size) num = file_size - outarg->offset; - while (num) { + while (num && req->num_pages < FUSE_MAX_PAGES_PER_REQ) { struct page *page; unsigned int this_num; @@ -1526,6 +1526,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, num -= this_num; total_len += this_num; + index++; } req->misc.retrieve_in.offset = outarg->offset; req->misc.retrieve_in.size = total_len; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 594f07a81c28..0c84100acd44 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1556,7 +1556,7 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin) struct inode *inode = file->f_path.dentry->d_inode; mutex_lock(&inode->i_mutex); - if (origin != SEEK_CUR || origin != SEEK_SET) { + if (origin != SEEK_CUR && origin != SEEK_SET) { retval = fuse_update_attributes(inode, NULL, file, NULL); if (retval) goto exit; @@ -1567,6 +1567,10 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin) offset += i_size_read(inode); break; case SEEK_CUR: + if (offset == 0) { + retval = file->f_pos; + goto exit; + } offset += file->f_pos; break; case SEEK_DATA: diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 3e6d72756479..aa83109b9431 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1138,28 +1138,28 @@ static int __init fuse_fs_init(void) { int err; - err = register_filesystem(&fuse_fs_type); - if (err) - goto out; - - err = register_fuseblk(); - if (err) - goto out_unreg; - fuse_inode_cachep = kmem_cache_create("fuse_inode", sizeof(struct fuse_inode), 0, SLAB_HWCACHE_ALIGN, fuse_inode_init_once); err = -ENOMEM; if (!fuse_inode_cachep) - goto out_unreg2; + goto out; + + err = register_fuseblk(); + if (err) + goto out2; + + err = register_filesystem(&fuse_fs_type); + if (err) + goto out3; return 0; - out_unreg2: + out3: unregister_fuseblk(); - out_unreg: - unregister_filesystem(&fuse_fs_type); + out2: + kmem_cache_destroy(fuse_inode_cachep); out: return err; } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 7e823bbd2453..cb23c2be731a 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -14,6 +14,7 @@ #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/kthread.h> +#include <linux/export.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/gfs2_ondisk.h> diff --git a/fs/hfs/trans.c b/fs/hfs/trans.c index e673a88b8ae7..b1ce4c7ad3fb 100644 --- a/fs/hfs/trans.c +++ b/fs/hfs/trans.c @@ -40,6 +40,8 @@ int hfs_mac2asc(struct super_block *sb, char *out, const 
struct hfs_name *in) src = in->name; srclen = in->len; + if (srclen > HFS_NAMELEN) + srclen = HFS_NAMELEN; dst = out; dstlen = HFS_MAX_NAMELEN; if (nls_io) { diff --git a/fs/ioprio.c b/fs/ioprio.c index 7da2a06508e5..f79dab83e17b 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -21,6 +21,7 @@ */ #include <linux/gfp.h> #include <linux/kernel.h> +#include <linux/export.h> #include <linux/ioprio.h> #include <linux/blkdev.h> #include <linux/capability.h> diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c index de4247021d25..5b6c9d1a2fb9 100644 --- a/fs/jffs2/compr.c +++ b/fs/jffs2/compr.c @@ -53,6 +53,78 @@ static int jffs2_is_best_compression(struct jffs2_compressor *this, return 0; } +/* + * jffs2_selected_compress: + * @compr: Explicit compression type to use (ie, JFFS2_COMPR_ZLIB). + * If 0, just take the first available compression mode. + * @data_in: Pointer to uncompressed data + * @cpage_out: Pointer to returned pointer to buffer for compressed data + * @datalen: On entry, holds the amount of data available for compression. + * On exit, expected to hold the amount of data actually compressed. + * @cdatalen: On entry, holds the amount of space available for compressed + * data. On exit, expected to hold the actual size of the compressed + * data. + * + * Returns: the compression type used. Zero is used to show that the data + * could not be compressed; probably because we couldn't find the requested + * compression mode. + */ +static int jffs2_selected_compress(u8 compr, unsigned char *data_in, + unsigned char **cpage_out, u32 *datalen, u32 *cdatalen) +{ + struct jffs2_compressor *this; + int err, ret = JFFS2_COMPR_NONE; + uint32_t orig_slen, orig_dlen; + char *output_buf; + + output_buf = kmalloc(*cdatalen, GFP_KERNEL); + if (!output_buf) { + printk(KERN_WARNING "JFFS2: No memory for compressor allocation. Compression failed.\n"); + return ret; + } + orig_slen = *datalen; + orig_dlen = *cdatalen; + spin_lock(&jffs2_compressor_list_lock); + list_for_each_entry(this, &jffs2_compressor_list, list) { + /* Skip decompress-only and disabled modules */ + if (!this->compress || this->disabled) + continue; + + /* Skip if not the desired compression type */ + if (compr && (compr != this->compr)) + continue; + + /* + * Either compression type was unspecified, or we found our + * compressor; either way, we're good to go. 
+ */ + this->usecount++; + spin_unlock(&jffs2_compressor_list_lock); + + *datalen = orig_slen; + *cdatalen = orig_dlen; + err = this->compress(data_in, output_buf, datalen, cdatalen); + + spin_lock(&jffs2_compressor_list_lock); + this->usecount--; + if (!err) { + /* Success */ + ret = this->compr; + this->stat_compr_blocks++; + this->stat_compr_orig_size += *datalen; + this->stat_compr_new_size += *cdatalen; + break; + } + } + spin_unlock(&jffs2_compressor_list_lock); + if (ret == JFFS2_COMPR_NONE) + kfree(output_buf); + else + *cpage_out = output_buf; + + return ret; +} + /* jffs2_compress: * @data_in: Pointer to uncompressed data * @cpage_out: Pointer to returned pointer to buffer for compressed data @@ -76,47 +148,23 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, uint32_t *datalen, uint32_t *cdatalen) { int ret = JFFS2_COMPR_NONE; - int compr_ret; + int mode, compr_ret; struct jffs2_compressor *this, *best=NULL; unsigned char *output_buf = NULL, *tmp_buf; uint32_t orig_slen, orig_dlen; uint32_t best_slen=0, best_dlen=0; - switch (jffs2_compression_mode) { + if (c->mount_opts.override_compr) + mode = c->mount_opts.compr; + else + mode = jffs2_compression_mode; + + switch (mode) { case JFFS2_COMPR_MODE_NONE: break; case JFFS2_COMPR_MODE_PRIORITY: - output_buf = kmalloc(*cdatalen,GFP_KERNEL); - if (!output_buf) { - printk(KERN_WARNING "JFFS2: No memory for compressor allocation. Compression failed.\n"); - goto out; - } - orig_slen = *datalen; - orig_dlen = *cdatalen; - spin_lock(&jffs2_compressor_list_lock); - list_for_each_entry(this, &jffs2_compressor_list, list) { - /* Skip decompress-only backwards-compatibility and disabled modules */ - if ((!this->compress)||(this->disabled)) - continue; - - this->usecount++; - spin_unlock(&jffs2_compressor_list_lock); - *datalen = orig_slen; - *cdatalen = orig_dlen; - compr_ret = this->compress(data_in, output_buf, datalen, cdatalen); - spin_lock(&jffs2_compressor_list_lock); - this->usecount--; - if (!compr_ret) { - ret = this->compr; - this->stat_compr_blocks++; - this->stat_compr_orig_size += *datalen; - this->stat_compr_new_size += *cdatalen; - break; - } - } - spin_unlock(&jffs2_compressor_list_lock); - if (ret == JFFS2_COMPR_NONE) - kfree(output_buf); + ret = jffs2_selected_compress(0, data_in, cpage_out, datalen, + cdatalen); break; case JFFS2_COMPR_MODE_SIZE: case JFFS2_COMPR_MODE_FAVOURLZO: @@ -174,22 +222,28 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, best->stat_compr_orig_size += best_slen; best->stat_compr_new_size += best_dlen; ret = best->compr; + *cpage_out = output_buf; } spin_unlock(&jffs2_compressor_list_lock); break; + case JFFS2_COMPR_MODE_FORCELZO: + ret = jffs2_selected_compress(JFFS2_COMPR_LZO, data_in, + cpage_out, datalen, cdatalen); + break; + case JFFS2_COMPR_MODE_FORCEZLIB: + ret = jffs2_selected_compress(JFFS2_COMPR_ZLIB, data_in, + cpage_out, datalen, cdatalen); + break; default: printk(KERN_ERR "JFFS2: unknown compression mode.\n"); } - out: + if (ret == JFFS2_COMPR_NONE) { *cpage_out = data_in; *datalen = *cdatalen; none_stat_compr_blocks++; none_stat_compr_size += *datalen; } - else { - *cpage_out = output_buf; - } return ret; } diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h index 13bb7597ab39..5e91d578f4ed 100644 --- a/fs/jffs2/compr.h +++ b/fs/jffs2/compr.h @@ -40,6 +40,8 @@ #define JFFS2_COMPR_MODE_PRIORITY 1 #define JFFS2_COMPR_MODE_SIZE 2 #define JFFS2_COMPR_MODE_FAVOURLZO 3 +#define JFFS2_COMPR_MODE_FORCELZO 4 +#define 
JFFS2_COMPR_MODE_FORCEZLIB 5 #define FAVOUR_LZO_PERCENT 80 diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 7286e44ac665..4b8afe39a87f 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -379,7 +379,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags) jffs2_do_setattr(inode, &iattr); } -int jffs2_remount_fs (struct super_block *sb, int *flags, char *data) +int jffs2_do_remount_fs(struct super_block *sb, int *flags, char *data) { struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h index 0bc6a6c80a56..55a0c1dceadf 100644 --- a/fs/jffs2/jffs2_fs_sb.h +++ b/fs/jffs2/jffs2_fs_sb.h @@ -29,6 +29,11 @@ struct jffs2_inodirty; +struct jffs2_mount_opts { + bool override_compr; + unsigned int compr; +}; + /* A struct for the overall file system control. Pointers to jffs2_sb_info structs are named `c' in the source code. Nee jffs_control @@ -126,6 +131,7 @@ struct jffs2_sb_info { #endif struct jffs2_summary *summary; /* Summary information */ + struct jffs2_mount_opts mount_opts; #ifdef CONFIG_JFFS2_FS_XATTR #define XATTRINDEX_HASHSIZE (57) diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 6c1755c59c0f..ab65ee3ec858 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -176,7 +176,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags); struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_raw_inode *ri); int jffs2_statfs (struct dentry *, struct kstatfs *); -int jffs2_remount_fs (struct super_block *, int *, char *); +int jffs2_do_remount_fs(struct super_block *, int *, char *); int jffs2_do_fill_super(struct super_block *sb, void *data, int silent); void jffs2_gc_release_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f); diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index 8d8cd3419d02..28107ca136e4 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -275,9 +275,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) else c->mtd->unpoint(c->mtd, 0, c->mtd->size); #endif - if (s) - kfree(s); - + kfree(s); return ret; } diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 853b8e300084..e7e974454115 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -17,11 +17,13 @@ #include <linux/fs.h> #include <linux/err.h> #include <linux/mount.h> +#include <linux/parser.h> #include <linux/jffs2.h> #include <linux/pagemap.h> #include <linux/mtd/super.h> #include <linux/ctype.h> #include <linux/namei.h> +#include <linux/seq_file.h> #include <linux/exportfs.h> #include "compr.h" #include "nodelist.h" @@ -75,6 +77,37 @@ static void jffs2_write_super(struct super_block *sb) unlock_super(sb); } +static const char *jffs2_compr_name(unsigned int compr) +{ + switch (compr) { + case JFFS2_COMPR_MODE_NONE: + return "none"; +#ifdef CONFIG_JFFS2_LZO + case JFFS2_COMPR_MODE_FORCELZO: + return "lzo"; +#endif +#ifdef CONFIG_JFFS2_ZLIB + case JFFS2_COMPR_MODE_FORCEZLIB: + return "zlib"; +#endif + default: + /* should never happen; programmer error */ + WARN_ON(1); + return ""; + } +} + +static int jffs2_show_options(struct seq_file *s, struct vfsmount *mnt) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(mnt->mnt_sb); + struct jffs2_mount_opts *opts = &c->mount_opts; + + if (opts->override_compr) + seq_printf(s, ",compr=%s", jffs2_compr_name(opts->compr)); + + return 0; +} + static int jffs2_sync_fs(struct super_block *sb, int wait) { struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); @@ -133,6 +166,85 @@ static const struct export_operations jffs2_export_ops = { .fh_to_parent = jffs2_fh_to_parent, }; +/* + * JFFS2 mount 
options. + * + * Opt_override_compr: override default compressor + * Opt_err: just end of array marker + */ +enum { + Opt_override_compr, + Opt_err, +}; + +static const match_table_t tokens = { + {Opt_override_compr, "compr=%s"}, + {Opt_err, NULL}, +}; + +static int jffs2_parse_options(struct jffs2_sb_info *c, char *data) +{ + substring_t args[MAX_OPT_ARGS]; + char *p, *name; + + if (!data) + return 0; + + while ((p = strsep(&data, ","))) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_override_compr: + name = match_strdup(&args[0]); + + if (!name) + return -ENOMEM; + if (!strcmp(name, "none")) + c->mount_opts.compr = JFFS2_COMPR_MODE_NONE; +#ifdef CONFIG_JFFS2_LZO + else if (!strcmp(name, "lzo")) + c->mount_opts.compr = JFFS2_COMPR_MODE_FORCELZO; +#endif +#ifdef CONFIG_JFFS2_ZLIB + else if (!strcmp(name, "zlib")) + c->mount_opts.compr = + JFFS2_COMPR_MODE_FORCEZLIB; +#endif + else { + printk(KERN_ERR "JFFS2 Error: unknown compressor \"%s\"", + name); + kfree(name); + return -EINVAL; + } + kfree(name); + c->mount_opts.override_compr = true; + break; + default: + printk(KERN_ERR "JFFS2 Error: unrecognized mount option '%s' or missing value\n", + p); + return -EINVAL; + } + } + + return 0; +} + +static int jffs2_remount_fs(struct super_block *sb, int *flags, char *data) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); + int err; + + err = jffs2_parse_options(c, data); + if (err) + return -EINVAL; + + return jffs2_do_remount_fs(sb, flags, data); +} + static const struct super_operations jffs2_super_operations = { .alloc_inode = jffs2_alloc_inode, @@ -143,6 +255,7 @@ static const struct super_operations jffs2_super_operations = .remount_fs = jffs2_remount_fs, .evict_inode = jffs2_evict_inode, .dirty_inode = jffs2_dirty_inode, + .show_options = jffs2_show_options, .sync_fs = jffs2_sync_fs, }; @@ -166,6 +279,12 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent) c->os_priv = sb; sb->s_fs_info = c; + ret = jffs2_parse_options(c, data); + if (ret) { + kfree(c); + return -EINVAL; + } + /* Initialize JFFS2 superblock locks, the further initialization will * be done later */ mutex_init(&c->alloc_sem); diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 4515bea0268f..b09e51d2f81f 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -578,8 +578,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad) if (!jffs2_is_writebuffered(c)) return 0; - if (mutex_trylock(&c->alloc_sem)) { - mutex_unlock(&c->alloc_sem); + if (!mutex_is_locked(&c->alloc_sem)) { printk(KERN_CRIT "jffs2_flush_wbuf() called with alloc_sem not locked!\n"); BUG(); } @@ -1026,7 +1025,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c, int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); struct mtd_oob_ops ops; - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail; ops.oobbuf = c->oobbuf; ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; @@ -1069,7 +1068,7 @@ int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c, struct mtd_oob_ops ops; int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.ooblen = cmlen; ops.oobbuf = c->oobbuf; ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; @@ -1095,7 +1094,7 @@ int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, struct mtd_oob_ops ops; int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); - ops.mode = MTD_OOB_AUTO; + ops.mode = MTD_OPS_AUTO_OOB; ops.ooblen = cmlen; 
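/*
 * Editor's sketch, not part of the patch: the ops.mode hunks in this file
 * follow the MTD API rename of MTD_OOB_AUTO to MTD_OPS_AUTO_OOB.  For
 * reference, a minimal OOB-only request under the new name looks roughly
 * like:
 *
 *	struct mtd_oob_ops ops = {
 *		.mode   = MTD_OPS_AUTO_OOB,
 *		.ooblen = cmlen,
 *		.oobbuf = c->oobbuf,
 *	};
 *
 * with .datbuf left NULL so only the out-of-band area is transferred.  The
 * driver chooses the OOB placement in this mode; the field values shown are
 * illustrative, borrowed from the surrounding function.
 */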
ops.oobbuf = (uint8_t *)&oob_cleanmarker; ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 583636f745e5..cc5f811ed383 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -67,6 +67,7 @@ #include <linux/buffer_head.h> /* for sync_blockdev() */ #include <linux/bio.h> #include <linux/freezer.h> +#include <linux/export.h> #include <linux/delay.h> #include <linux/mutex.h> #include <linux/seq_file.h> diff --git a/fs/locks.c b/fs/locks.c index 3b0d05dcd7c1..637694bf3a03 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1205,6 +1205,8 @@ int __break_lease(struct inode *inode, unsigned int mode) int want_write = (mode & O_ACCMODE) != O_RDONLY; new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK); + if (IS_ERR(new_fl)) + return PTR_ERR(new_fl); lock_flocks(); @@ -1221,12 +1223,6 @@ int __break_lease(struct inode *inode, unsigned int mode) if (fl->fl_owner == current->files) i_have_this_lease = 1; - if (IS_ERR(new_fl) && !i_have_this_lease - && ((mode & O_NONBLOCK) == 0)) { - error = PTR_ERR(new_fl); - goto out; - } - break_time = 0; if (lease_break_time > 0) { break_time = jiffies + lease_break_time * HZ; @@ -1284,8 +1280,7 @@ restart: out: unlock_flocks(); - if (!IS_ERR(new_fl)) - locks_free_lock(new_fl); + locks_free_lock(new_fl); return error; } diff --git a/fs/logfs/super.c b/fs/logfs/super.c index f2697e4df109..e795c234ea33 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -13,6 +13,7 @@ #include <linux/bio.h> #include <linux/slab.h> #include <linux/blkdev.h> +#include <linux/module.h> #include <linux/mtd/mtd.h> #include <linux/statfs.h> #include <linux/buffer_head.h> diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index 3f32bcb0d9bd..ef175cb8cfd8 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c @@ -16,38 +16,26 @@ #include <linux/bitops.h> #include <linux/sched.h> -static const int nibblemap[] = { 4,3,3,2,3,2,2,1,3,2,2,1,2,1,1,0 }; - static DEFINE_SPINLOCK(bitmap_lock); -static unsigned long count_free(struct buffer_head *map[], unsigned numblocks, __u32 numbits) +/* + * bitmap consists of blocks filled with 16bit words + * bit set == busy, bit clear == free + * endianness is a mess, but for counting zero bits it really doesn't matter... 
+ */ +static __u32 count_free(struct buffer_head *map[], unsigned blocksize, __u32 numbits) { - unsigned i, j, sum = 0; - struct buffer_head *bh; - - for (i=0; i<numblocks-1; i++) { - if (!(bh=map[i])) - return(0); - for (j=0; j<bh->b_size; j++) - sum += nibblemap[bh->b_data[j] & 0xf] - + nibblemap[(bh->b_data[j]>>4) & 0xf]; - } + __u32 sum = 0; + unsigned blocks = DIV_ROUND_UP(numbits, blocksize * 8); - if (numblocks==0 || !(bh=map[numblocks-1])) - return(0); - i = ((numbits - (numblocks-1) * bh->b_size * 8) / 16) * 2; - for (j=0; j<i; j++) { - sum += nibblemap[bh->b_data[j] & 0xf] - + nibblemap[(bh->b_data[j]>>4) & 0xf]; + while (blocks--) { + unsigned words = blocksize / 2; + __u16 *p = (__u16 *)(*map++)->b_data; + while (words--) + sum += 16 - hweight16(*p++); } - i = numbits%16; - if (i!=0) { - i = *(__u16 *)(&bh->b_data[j]) | ~((1<<i) - 1); - sum += nibblemap[i & 0xf] + nibblemap[(i>>4) & 0xf]; - sum += nibblemap[(i>>8) & 0xf] + nibblemap[(i>>12) & 0xf]; - } - return(sum); + return sum; } void minix_free_block(struct inode *inode, unsigned long block) @@ -105,10 +93,12 @@ int minix_new_block(struct inode * inode) return 0; } -unsigned long minix_count_free_blocks(struct minix_sb_info *sbi) +unsigned long minix_count_free_blocks(struct super_block *sb) { - return (count_free(sbi->s_zmap, sbi->s_zmap_blocks, - sbi->s_nzones - sbi->s_firstdatazone + 1) + struct minix_sb_info *sbi = minix_sb(sb); + u32 bits = sbi->s_nzones - (sbi->s_firstdatazone + 1); + + return (count_free(sbi->s_zmap, sb->s_blocksize, bits) << sbi->s_log_zone_size); } @@ -273,7 +263,10 @@ struct inode *minix_new_inode(const struct inode *dir, int mode, int *error) return inode; } -unsigned long minix_count_free_inodes(struct minix_sb_info *sbi) +unsigned long minix_count_free_inodes(struct super_block *sb) { - return count_free(sbi->s_imap, sbi->s_imap_blocks, sbi->s_ninodes + 1); + struct minix_sb_info *sbi = minix_sb(sb); + u32 bits = sbi->s_ninodes + 1; + + return count_free(sbi->s_imap, sb->s_blocksize, bits); } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 64cdcd662ffc..1d9e33966db0 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -279,6 +279,27 @@ static int minix_fill_super(struct super_block *s, void *data, int silent) else if (sbi->s_mount_state & MINIX_ERROR_FS) printk("MINIX-fs: mounting file system with errors, " "running fsck is recommended\n"); + + /* Apparently minix can create filesystems that allocate more blocks for + * the bitmaps than needed. We simply ignore that, but verify it didn't + * create one with not enough blocks and bail out if so. + */ + block = minix_blocks_needed(sbi->s_ninodes, s->s_blocksize); + if (sbi->s_imap_blocks < block) { + printk("MINIX-fs: file system does not have enough " + "imap blocks allocated. Refusing to mount\n"); + goto out_iput; + } + + block = minix_blocks_needed( + (sbi->s_nzones - (sbi->s_firstdatazone + 1)), + s->s_blocksize); + if (sbi->s_zmap_blocks < block) { + printk("MINIX-fs: file system does not have enough " + "zmap blocks allocated. 
Refusing to mount.\n"); + goto out_iput; + } + return 0; out_iput: @@ -339,10 +360,10 @@ static int minix_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = sb->s_magic; buf->f_bsize = sb->s_blocksize; buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size; - buf->f_bfree = minix_count_free_blocks(sbi); + buf->f_bfree = minix_count_free_blocks(sb); buf->f_bavail = buf->f_bfree; buf->f_files = sbi->s_ninodes; - buf->f_ffree = minix_count_free_inodes(sbi); + buf->f_ffree = minix_count_free_inodes(sb); buf->f_namelen = sbi->s_namelen; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 341e2122879a..26bbd55e82ea 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -48,10 +48,10 @@ extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, stru extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); extern struct inode * minix_new_inode(const struct inode *, int, int *); extern void minix_free_inode(struct inode * inode); -extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi); +extern unsigned long minix_count_free_inodes(struct super_block *sb); extern int minix_new_block(struct inode * inode); extern void minix_free_block(struct inode *inode, unsigned long block); -extern unsigned long minix_count_free_blocks(struct minix_sb_info *sbi); +extern unsigned long minix_count_free_blocks(struct super_block *sb); extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *); extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len); @@ -88,6 +88,11 @@ static inline struct minix_inode_info *minix_i(struct inode *inode) return list_entry(inode, struct minix_inode_info, vfs_inode); } +static inline unsigned minix_blocks_needed(unsigned bits, unsigned blocksize) +{ + return DIV_ROUND_UP(bits, blocksize * 8); +} + #if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \ defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED) @@ -125,7 +130,7 @@ static inline int minix_find_first_zero_bit(const void *vaddr, unsigned size) if (!size) return 0; - size = (size >> 4) + ((size & 15) > 0); + size >>= 4; while (*p++ == 0xffff) { if (--size == 0) return (p - addr) << 4; diff --git a/fs/namei.c b/fs/namei.c index ac6d214da827..5008f01787f5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -852,7 +852,7 @@ static int follow_managed(struct path *path, unsigned flags) mntput(path->mnt); if (ret == -EISDIR) ret = 0; - return ret; + return ret < 0 ? ret : need_mntput; } int follow_down_one(struct path *path) @@ -900,6 +900,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, break; path->mnt = mounted; path->dentry = mounted->mnt_root; + nd->flags |= LOOKUP_JUMPED; nd->seq = read_seqcount_begin(&path->dentry->d_seq); /* * Update the inode too. 
We don't need to re-check the @@ -1213,6 +1214,8 @@ retry: path_put_conditional(path, nd); return err; } + if (err) + nd->flags |= LOOKUP_JUMPED; *inode = path->dentry->d_inode; return 0; } @@ -2146,6 +2149,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path, } /* create side of things */ + /* + * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED has been + * cleared when we got to the last component we are about to look up + */ error = complete_walk(nd); if (error) return ERR_PTR(error); @@ -2214,6 +2221,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (error < 0) goto exit_dput; + if (error) + nd->flags |= LOOKUP_JUMPED; + error = -ENOENT; if (!path->dentry->d_inode) goto exit_dput; @@ -2223,6 +2233,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path, path_to_nameidata(path, nd); nd->inode = path->dentry->d_inode; + /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ + error = complete_walk(nd); + if (error) + goto exit; error = -EISDIR; if (S_ISDIR(nd->inode->i_mode)) goto exit; diff --git a/fs/namespace.c b/fs/namespace.c index e5e1c7d1839b..cfc6d4448aa5 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1048,15 +1048,12 @@ static int show_mountinfo(struct seq_file *m, void *v) if (err) goto out; seq_putc(m, ' '); - seq_path_root(m, &mnt_path, &root, " \t\n\\"); - if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) { - /* - * Mountpoint is outside root, discard that one. Ugly, - * but less so than trying to do that in iterator in a - * race-free way (due to renames). - */ - return SEQ_SKIP; - } + + /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ + err = seq_path_root(m, &mnt_path, &root, " \t\n\\"); + if (err) + goto out; + seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); show_mnt_opts(m, mnt); @@ -2483,11 +2480,43 @@ struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt) __mnt_make_longterm(mnt); new_ns->root = mnt; list_add(&new_ns->list, &new_ns->root->mnt_list); + } else { + mntput(mnt); } return new_ns; } EXPORT_SYMBOL(create_mnt_ns); +struct dentry *mount_subtree(struct vfsmount *mnt, const char *name) +{ + struct mnt_namespace *ns; + struct super_block *s; + struct path path; + int err; + + ns = create_mnt_ns(mnt); + if (IS_ERR(ns)) + return ERR_CAST(ns); + + err = vfs_path_lookup(mnt->mnt_root, mnt, + name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); + + put_mnt_ns(ns); + + if (err) + return ERR_PTR(err); + + /* trade a vfsmount reference for active sb one */ + s = path.mnt->mnt_sb; + atomic_inc(&s->s_active); + mntput(path.mnt); + /* lock the sucker */ + down_write(&s->s_umount); + /* ... 
and return the root of (sub)tree on it */ + return path.dentry; +} +EXPORT_SYMBOL(mount_subtree); + SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, char __user *, type, unsigned long, flags, void __user *, data) { @@ -2744,3 +2773,8 @@ void kern_unmount(struct vfsmount *mnt) } } EXPORT_SYMBOL(kern_unmount); + +bool our_mnt(struct vfsmount *mnt) +{ + return check_mnt(mnt); +} diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 5b5fa33b6b9d..cbd1a61c110a 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -548,7 +548,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY); if (error) - goto out_bdi; + goto out_fput; server->ncp_filp = ncp_filp; server->ncp_sock = sock; @@ -559,7 +559,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) error = -EBADF; server->info_filp = fget(data.info_fd); if (!server->info_filp) - goto out_fput; + goto out_bdi; error = -ENOTSOCK; sock_inode = server->info_filp->f_path.dentry->d_inode; if (!S_ISSOCK(sock_inode->i_mode)) @@ -746,9 +746,9 @@ out_nls: out_fput2: if (server->info_filp) fput(server->info_filp); -out_fput: - bdi_destroy(&server->bdi); out_bdi: + bdi_destroy(&server->bdi); +out_fput: /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>: * * The previously used put_filp(ncp_filp); was bogus, since diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 918ad647afea..726e59a9e50f 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -488,17 +488,18 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallanyargs *args) { - __be32 *p; + uint32_t bitmap[2]; + __be32 *p, status; args->craa_addr = svc_addr(rqstp); p = read_buf(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); args->craa_objs_to_keep = ntohl(*p++); - p = read_buf(xdr, 4); - if (unlikely(p == NULL)) - return htonl(NFS4ERR_BADXDR); - args->craa_type_mask = ntohl(*p); + status = decode_bitmap(xdr, bitmap); + if (unlikely(status)) + return status; + args->craa_type_mask = bitmap[0]; return 0; } @@ -986,4 +987,5 @@ struct svc_version nfs4_callback_version4 = { .vs_proc = nfs4_callback_procedures1, .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, .vs_dispatch = NULL, + .vs_hidden = 1, }; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index b238d95ac48c..ac2899098147 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1468,12 +1468,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry res = NULL; goto out; /* This turned out not to be a regular file */ + case -EISDIR: case -ENOTDIR: goto no_open; case -ELOOP: if (!(nd->intent.open.flags & O_NOFOLLOW)) goto no_open; - /* case -EISDIR: */ /* case -EINVAL: */ default: res = ERR_CAST(inode); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 91c01f0a4c3b..606ef0f20aed 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -40,48 +40,8 @@ #define NFSDBG_FACILITY NFSDBG_FILE -static int nfs_file_open(struct inode *, struct file *); -static int nfs_file_release(struct inode *, struct file *); -static loff_t nfs_file_llseek(struct file *file, loff_t offset, int origin); -static int nfs_file_mmap(struct file *, struct vm_area_struct *); -static ssize_t nfs_file_splice_read(struct file *filp, loff_t *ppos, - struct pipe_inode_info *pipe, - size_t count, unsigned int flags); -static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov, - unsigned long nr_segs, loff_t pos); -static ssize_t 
nfs_file_splice_write(struct pipe_inode_info *pipe, - struct file *filp, loff_t *ppos, - size_t count, unsigned int flags); -static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, - unsigned long nr_segs, loff_t pos); -static int nfs_file_flush(struct file *, fl_owner_t id); -static int nfs_file_fsync(struct file *, loff_t, loff_t, int datasync); -static int nfs_check_flags(int flags); -static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); -static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); -static int nfs_setlease(struct file *file, long arg, struct file_lock **fl); - static const struct vm_operations_struct nfs_file_vm_ops; -const struct file_operations nfs_file_operations = { - .llseek = nfs_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = nfs_file_read, - .aio_write = nfs_file_write, - .mmap = nfs_file_mmap, - .open = nfs_file_open, - .flush = nfs_file_flush, - .release = nfs_file_release, - .fsync = nfs_file_fsync, - .lock = nfs_lock, - .flock = nfs_flock, - .splice_read = nfs_file_splice_read, - .splice_write = nfs_file_splice_write, - .check_flags = nfs_check_flags, - .setlease = nfs_setlease, -}; - const struct inode_operations nfs_file_inode_operations = { .permission = nfs_permission, .getattr = nfs_getattr, @@ -137,11 +97,9 @@ nfs_file_open(struct inode *inode, struct file *filp) static int nfs_file_release(struct inode *inode, struct file *filp) { - struct dentry *dentry = filp->f_path.dentry; - dprintk("NFS: release(%s/%s)\n", - dentry->d_parent->d_name.name, - dentry->d_name.name); + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name); nfs_inc_stats(inode, NFSIOS_VFSRELEASE); return nfs_release(inode, filp); @@ -189,7 +147,7 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) * origin == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate * the cached file length */ - if (origin != SEEK_SET || origin != SEEK_CUR) { + if (origin != SEEK_SET && origin != SEEK_CUR) { struct inode *inode = filp->f_mapping->host; int retval = nfs_revalidate_file_size(inode, filp); @@ -228,14 +186,13 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov, struct dentry * dentry = iocb->ki_filp->f_path.dentry; struct inode * inode = dentry->d_inode; ssize_t result; - size_t count = iov_length(iov, nr_segs); if (iocb->ki_filp->f_flags & O_DIRECT) return nfs_file_direct_read(iocb, iov, nr_segs, pos); dprintk("NFS: read(%s/%s, %lu@%lu)\n", dentry->d_parent->d_name.name, dentry->d_name.name, - (unsigned long) count, (unsigned long) pos); + (unsigned long) iov_length(iov, nr_segs), (unsigned long) pos); result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); if (!result) { @@ -889,3 +846,54 @@ static int nfs_setlease(struct file *file, long arg, struct file_lock **fl) file->f_path.dentry->d_name.name, arg); return -EINVAL; } + +const struct file_operations nfs_file_operations = { + .llseek = nfs_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = nfs_file_read, + .aio_write = nfs_file_write, + .mmap = nfs_file_mmap, + .open = nfs_file_open, + .flush = nfs_file_flush, + .release = nfs_file_release, + .fsync = nfs_file_fsync, + .lock = nfs_lock, + .flock = nfs_flock, + .splice_read = nfs_file_splice_read, + .splice_write = nfs_file_splice_write, + .check_flags = nfs_check_flags, + .setlease = nfs_setlease, +}; + +#ifdef CONFIG_NFS_V4 +static int +nfs4_file_open(struct inode *inode, struct file *filp) +{ + /* + * NFSv4 opens 
are handled in d_lookup and d_revalidate. If we get to + * this point, then something is very wrong + */ + dprintk("NFS: %s called! inode=%p filp=%p\n", __func__, inode, filp); + return -ENOTDIR; +} + +const struct file_operations nfs4_file_operations = { + .llseek = nfs_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = nfs_file_read, + .aio_write = nfs_file_write, + .mmap = nfs_file_mmap, + .open = nfs4_file_open, + .flush = nfs_file_flush, + .release = nfs_file_release, + .fsync = nfs_file_fsync, + .lock = nfs_lock, + .flock = nfs_flock, + .splice_read = nfs_file_splice_read, + .splice_write = nfs_file_splice_write, + .check_flags = nfs_check_flags, + .setlease = nfs_setlease, +}; +#endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index c07a55aec838..50a15fa8cf98 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -291,7 +291,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) */ inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops; if (S_ISREG(inode->i_mode)) { - inode->i_fop = &nfs_file_operations; + inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops; inode->i_data.a_ops = &nfs_file_aops; inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; } else if (S_ISDIR(inode->i_mode)) { diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index c1a1bd8ddf1c..3f4d95751d52 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -299,6 +299,8 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata); extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, struct list_head *head); +extern void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, + struct inode *inode); extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); extern void nfs_readdata_release(struct nfs_read_data *rdata); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 85f1690ca08c..d4bc9ed91748 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -853,6 +853,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .dentry_ops = &nfs_dentry_operations, .dir_inode_ops = &nfs3_dir_inode_operations, .file_inode_ops = &nfs3_file_inode_operations, + .file_ops = &nfs_file_operations, .getroot = nfs3_proc_get_root, .getattr = nfs3_proc_getattr, .setattr = nfs3_proc_setattr, diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 09119418402f..a62d36b9a99e 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -31,6 +31,7 @@ #include <linux/nfs_fs.h> #include <linux/nfs_page.h> +#include <linux/module.h> #include "internal.h" #include "nfs4filelayout.h" @@ -449,9 +450,8 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, fl->dsaddr = dsaddr; - if (fl->first_stripe_index < 0 || - fl->first_stripe_index >= dsaddr->stripe_count) { - dprintk("%s Bad first_stripe_index %d\n", + if (fl->first_stripe_index >= dsaddr->stripe_count) { + dprintk("%s Bad first_stripe_index %u\n", __func__, fl->first_stripe_index); goto out_put; } @@ -552,7 +552,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, /* Note that a zero value for num_fh is legal for STRIPE_SPARSE. 
* Futher checking is done in filelayout_check_layout */ - if (fl->num_fh < 0 || fl->num_fh > + if (fl->num_fh > max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT)) goto out_err; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d2ae413c986a..d9f4d78c3413 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -39,6 +39,8 @@ #include <linux/delay.h> #include <linux/errno.h> #include <linux/string.h> +#include <linux/ratelimit.h> +#include <linux/printk.h> #include <linux/slab.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/gss_api.h> @@ -894,6 +896,8 @@ out: static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode) { + if (delegation == NULL) + return 0; if ((delegation->type & fmode) != fmode) return 0; if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags)) @@ -1036,8 +1040,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) } rcu_read_lock(); delegation = rcu_dereference(nfsi->delegation); - if (delegation == NULL || - !can_open_delegated(delegation, fmode)) { + if (!can_open_delegated(delegation, fmode)) { rcu_read_unlock(); break; } @@ -1091,7 +1094,12 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data if (delegation) delegation_flags = delegation->flags; rcu_read_unlock(); - if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0) + if (data->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR) { + pr_err_ratelimited("NFS: Broken NFSv4 server %s is " + "returning a delegation for " + "OPEN(CLAIM_DELEGATE_CUR)\n", + NFS_CLIENT(inode)->cl_server); + } else if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0) nfs_inode_set_delegation(state->inode, data->owner->so_cred, &data->o_res); @@ -1423,11 +1431,9 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) goto out_no_action; rcu_read_lock(); delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); - if (delegation != NULL && - test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) { - rcu_read_unlock(); - goto out_no_action; - } + if (data->o_arg.claim != NFS4_OPEN_CLAIM_DELEGATE_CUR && + can_open_delegated(delegation, data->o_arg.fmode)) + goto unlock_no_action; rcu_read_unlock(); } /* Update sequence id. 
*/ @@ -1444,6 +1450,8 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) return; rpc_call_start(task); return; +unlock_no_action: + rcu_read_unlock(); out_no_action: task->tk_action = NULL; @@ -2464,8 +2472,7 @@ static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qst case -NFS4ERR_BADNAME: return -ENOENT; case -NFS4ERR_MOVED: - err = nfs4_get_referral(dir, name, fattr, fhandle); - break; + return nfs4_get_referral(dir, name, fattr, fhandle); case -NFS4ERR_WRONGSEC: nfs_fixup_secinfo_attributes(fattr, fhandle); } @@ -5950,6 +5957,7 @@ static void nfs4_layoutcommit_release(void *calldata) { struct nfs4_layoutcommit_data *data = calldata; struct pnfs_layout_segment *lseg, *tmp; + unsigned long *bitlock = &NFS_I(data->args.inode)->flags; pnfs_cleanup_layoutcommit(data); /* Matched by references in pnfs_set_layoutcommit */ @@ -5959,6 +5967,11 @@ static void nfs4_layoutcommit_release(void *calldata) &lseg->pls_flags)) put_lseg(lseg); } + + clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); + smp_mb__after_clear_bit(); + wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); + put_rpccred(data->cred); kfree(data); } @@ -6247,6 +6260,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .dentry_ops = &nfs4_dentry_operations, .dir_inode_ops = &nfs4_dir_inode_operations, .file_inode_ops = &nfs4_file_inode_operations, + .file_ops = &nfs4_file_operations, .getroot = nfs4_proc_get_root, .getattr = nfs4_proc_getattr, .setattr = nfs4_proc_setattr, diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 39914be40b03..6a7107ae6b72 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1156,11 +1156,13 @@ restart: if (status >= 0) { status = nfs4_reclaim_locks(state, ops); if (status >= 0) { + spin_lock(&state->state_lock); list_for_each_entry(lock, &state->lock_states, ls_locks) { if (!(lock->ls_flags & NFS_LOCK_INITIALIZED)) printk("%s: Lock reclaim failed!\n", __func__); } + spin_unlock(&state->state_lock); nfs4_put_open_state(state); goto restart; } @@ -1224,10 +1226,12 @@ static void nfs4_clear_open_state(struct nfs4_state *state) clear_bit(NFS_O_RDONLY_STATE, &state->flags); clear_bit(NFS_O_WRONLY_STATE, &state->flags); clear_bit(NFS_O_RDWR_STATE, &state->flags); + spin_lock(&state->state_lock); list_for_each_entry(lock, &state->lock_states, ls_locks) { lock->ls_seqid.flags = 0; lock->ls_flags &= ~NFS_LOCK_INITIALIZED; } + spin_unlock(&state->state_lock); } static void nfs4_reset_seqids(struct nfs_server *server, @@ -1350,12 +1354,14 @@ static void nfs4_warn_keyexpired(const char *s) static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) { switch (error) { + case 0: + break; case -NFS4ERR_CB_PATH_DOWN: nfs_handle_cb_pathdown(clp); - return 0; + break; case -NFS4ERR_NO_GRACE: nfs4_state_end_reclaim_reboot(clp); - return 0; + break; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_LEASE_MOVED: set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); @@ -1375,13 +1381,15 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) case -NFS4ERR_SEQ_MISORDERED: set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); /* Zero session reset errors */ - return 0; + break; case -EKEYEXPIRED: /* Nothing we can do */ nfs4_warn_keyexpired(clp->cl_hostname); - return 0; + break; + default: + return error; } - return error; + return 0; } static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops) @@ -1428,7 +1436,7 @@ static int nfs4_check_lease(struct nfs_client *clp) struct rpc_cred *cred; const struct 
nfs4_state_maintenance_ops *ops = clp->cl_mvops->state_renewal_ops; - int status = -NFS4ERR_EXPIRED; + int status; /* Is the client already known to have an expired lease? */ if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) @@ -1438,6 +1446,7 @@ static int nfs4_check_lease(struct nfs_client *clp) spin_unlock(&clp->cl_lock); if (cred == NULL) { cred = nfs4_get_setclientid_cred(clp); + status = -ENOKEY; if (cred == NULL) goto out; } @@ -1525,16 +1534,16 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) { if (!flags) return; - else if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) + if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) nfs41_handle_server_reboot(clp); - else if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED | + if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED | SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | SEQ4_STATUS_ADMIN_STATE_REVOKED | SEQ4_STATUS_LEASE_MOVED)) nfs41_handle_state_revoked(clp); - else if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) + if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) nfs41_handle_recallable_state_revoked(clp); - else if (flags & (SEQ4_STATUS_CB_PATH_DOWN | + if (flags & (SEQ4_STATUS_CB_PATH_DOWN | SEQ4_STATUS_BACKCHANNEL_FAULT | SEQ4_STATUS_CB_PATH_DOWN_SESSION)) nfs41_handle_cb_path_down(clp); @@ -1662,10 +1671,10 @@ static void nfs4_state_manager(struct nfs_client *clp) if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { status = nfs4_check_lease(clp); + if (status < 0) + goto out_error; if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) continue; - if (status < 0 && status != -NFS4ERR_CB_PATH_DOWN) - goto out_error; } /* Initialize or reset the session */ diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 1dce12f41a4f..e6161b213ed1 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -6602,8 +6602,6 @@ static int nfs4_xdr_dec_secinfo(struct rpc_rqst *rqstp, if (status) goto out; status = decode_secinfo(xdr, res); - if (status) - goto out; out: return status; } diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index d0cda12fddc3..c807ab93140e 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -38,21 +38,15 @@ */ #include <linux/module.h> -#include <scsi/osd_initiator.h> +#include <scsi/osd_ore.h> #include "objlayout.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD -#define _LLU(x) ((unsigned long long)x) - -enum { BIO_MAX_PAGES_KMALLOC = - (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), -}; - struct objio_dev_ent { struct nfs4_deviceid_node id_node; - struct osd_dev *od; + struct ore_dev od; }; static void @@ -60,8 +54,8 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d) { struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node); - dprintk("%s: free od=%p\n", __func__, de->od); - osduld_put_device(de->od); + dprintk("%s: free od=%p\n", __func__, de->od.od); + osduld_put_device(de->od.od); kfree(de); } @@ -98,12 +92,12 @@ _dev_list_add(const struct nfs_server *nfss, nfss->pnfs_curr_ld, nfss->nfs_client, d_id); - de->od = od; + de->od.od = od; d = nfs4_insert_deviceid_node(&de->id_node); n = container_of(d, struct objio_dev_ent, id_node); if (n != de) { - dprintk("%s: Race with other n->od=%p\n", __func__, n->od); + dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od); objio_free_deviceid_node(&de->id_node); de = n; } @@ -111,28 +105,11 @@ _dev_list_add(const struct nfs_server *nfss, return de; } -struct caps_buffers { - u8 caps_key[OSD_CRYPTO_KEYID_SIZE]; - u8 creds[OSD_CAP_LEN]; -}; - struct objio_segment 
{ struct pnfs_layout_segment lseg; - struct pnfs_osd_object_cred *comps; - - unsigned mirrors_p1; - unsigned stripe_unit; - unsigned group_width; /* Data stripe_units without integrity comps */ - u64 group_depth; - unsigned group_count; - - unsigned max_io_size; - - unsigned comps_index; - unsigned num_comps; - /* variable length */ - struct objio_dev_ent *ods[]; + struct ore_layout layout; + struct ore_components oc; }; static inline struct objio_segment * @@ -141,59 +118,44 @@ OBJIO_LSEG(struct pnfs_layout_segment *lseg) return container_of(lseg, struct objio_segment, lseg); } -struct objio_state; -typedef ssize_t (*objio_done_fn)(struct objio_state *ios); - struct objio_state { /* Generic layer */ - struct objlayout_io_state ol_state; - - struct objio_segment *layout; - - struct kref kref; - objio_done_fn done; - void *private; - - unsigned long length; - unsigned numdevs; /* Actually used devs in this IO */ - /* A per-device variable array of size numdevs */ - struct _objio_per_comp { - struct bio *bio; - struct osd_request *or; - unsigned long length; - u64 offset; - unsigned dev; - } per_dev[]; + struct objlayout_io_res oir; + + bool sync; + /*FIXME: Support for extra_bytes at ore_get_rw_state() */ + struct ore_io_state *ios; }; /* Send and wait for a get_device_info of devices in the layout, then look them up with the osd_initiator library */ -static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay, - struct objio_segment *objio_seg, unsigned comp, - gfp_t gfp_flags) +static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, + struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id, + gfp_t gfp_flags) { struct pnfs_osd_deviceaddr *deviceaddr; - struct nfs4_deviceid *d_id; struct objio_dev_ent *ode; struct osd_dev *od; struct osd_dev_info odi; int err; - d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id; - ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); - if (ode) - return ode; + if (ode) { + objio_seg->oc.ods[c] = &ode->od; /* must use container_of */ + return 0; + } err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags); if (unlikely(err)) { dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n", __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err); - return ERR_PTR(err); + return err; } odi.systemid_len = deviceaddr->oda_systemid.len; if (odi.systemid_len > sizeof(odi.systemid)) { + dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n", + __func__, sizeof(odi.systemid)); err = -EINVAL; goto out; } else if (odi.systemid_len) @@ -218,96 +180,53 @@ static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay, ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od, gfp_flags); - + objio_seg->oc.ods[c] = &ode->od; /* must use container_of */ + dprintk("Adding new dev_id(%llx:%llx)\n", + _DEVID_LO(d_id), _DEVID_HI(d_id)); out: - dprintk("%s: return=%d\n", __func__, err); objlayout_put_deviceinfo(deviceaddr); - return err ? 
ERR_PTR(err) : ode; + return err; } -static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, - struct objio_segment *objio_seg, - gfp_t gfp_flags) +static void copy_single_comp(struct ore_components *oc, unsigned c, + struct pnfs_osd_object_cred *src_comp) { - unsigned i; - int err; + struct ore_comp *ocomp = &oc->comps[c]; - /* lookup all devices */ - for (i = 0; i < objio_seg->num_comps; i++) { - struct objio_dev_ent *ode; + WARN_ON(src_comp->oc_cap_key.cred_len > 0); /* libosd is NO_SEC only */ + WARN_ON(src_comp->oc_cap.cred_len > sizeof(ocomp->cred)); - ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags); - if (unlikely(IS_ERR(ode))) { - err = PTR_ERR(ode); - goto out; - } - objio_seg->ods[i] = ode; - } - err = 0; + ocomp->obj.partition = src_comp->oc_object_id.oid_partition_id; + ocomp->obj.id = src_comp->oc_object_id.oid_object_id; -out: - dprintk("%s: return=%d\n", __func__, err); - return err; + memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred)); } -static int _verify_data_map(struct pnfs_osd_layout *layout) +int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags, + struct objio_segment **pseg) { - struct pnfs_osd_data_map *data_map = &layout->olo_map; - u64 stripe_length; - u32 group_width; - -/* FIXME: Only raid0 for now. if not go through MDS */ - if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) { - printk(KERN_ERR "Only RAID_0 for now\n"); - return -ENOTSUPP; - } - if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) { - printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n", - data_map->odm_num_comps, data_map->odm_mirror_cnt); - return -EINVAL; - } + struct __alloc_objio_segment { + struct objio_segment olseg; + struct ore_dev *ods[numdevs]; + struct ore_comp comps[numdevs]; + } *aolseg; - if (data_map->odm_group_width) - group_width = data_map->odm_group_width; - else - group_width = data_map->odm_num_comps / - (data_map->odm_mirror_cnt + 1); - - stripe_length = (u64)data_map->odm_stripe_unit * group_width; - if (stripe_length >= (1ULL << 32)) { - printk(KERN_ERR "Total Stripe length(0x%llx)" - " >= 32bit is not supported\n", _LLU(stripe_length)); - return -ENOTSUPP; + aolseg = kzalloc(sizeof(*aolseg), gfp_flags); + if (unlikely(!aolseg)) { + dprintk("%s: Faild allocation numdevs=%d size=%zd\n", __func__, + numdevs, sizeof(*aolseg)); + return -ENOMEM; } - if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) { - printk(KERN_ERR "Stripe Unit(0x%llx)" - " must be Multples of PAGE_SIZE(0x%lx)\n", - _LLU(data_map->odm_stripe_unit), PAGE_SIZE); - return -ENOTSUPP; - } + aolseg->olseg.oc.numdevs = numdevs; + aolseg->olseg.oc.single_comp = EC_MULTPLE_COMPS; + aolseg->olseg.oc.comps = aolseg->comps; + aolseg->olseg.oc.ods = aolseg->ods; + *pseg = &aolseg->olseg; return 0; } -static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp, - struct pnfs_osd_object_cred *src_comp, - struct caps_buffers *caps_p) -{ - WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key)); - WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds)); - - *cur_comp = *src_comp; - - memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred, - sizeof(caps_p->caps_key)); - cur_comp->oc_cap_key.cred = caps_p->caps_key; - - memcpy(caps_p->creds, src_comp->oc_cap.cred, - sizeof(caps_p->creds)); - cur_comp->oc_cap.cred = caps_p->creds; -} - int objio_alloc_lseg(struct pnfs_layout_segment **outp, struct pnfs_layout_hdr *pnfslay, struct pnfs_layout_range *range, @@ -317,59 +236,43 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp, struct 
objio_segment *objio_seg; struct pnfs_osd_xdr_decode_layout_iter iter; struct pnfs_osd_layout layout; - struct pnfs_osd_object_cred *cur_comp, src_comp; - struct caps_buffers *caps_p; + struct pnfs_osd_object_cred src_comp; + unsigned cur_comp; int err; err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr); if (unlikely(err)) return err; - err = _verify_data_map(&layout); + err = __alloc_objio_seg(layout.olo_num_comps, gfp_flags, &objio_seg); if (unlikely(err)) return err; - objio_seg = kzalloc(sizeof(*objio_seg) + - sizeof(objio_seg->ods[0]) * layout.olo_num_comps + - sizeof(*objio_seg->comps) * layout.olo_num_comps + - sizeof(struct caps_buffers) * layout.olo_num_comps, - gfp_flags); - if (!objio_seg) - return -ENOMEM; + objio_seg->layout.stripe_unit = layout.olo_map.odm_stripe_unit; + objio_seg->layout.group_width = layout.olo_map.odm_group_width; + objio_seg->layout.group_depth = layout.olo_map.odm_group_depth; + objio_seg->layout.mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; + objio_seg->layout.raid_algorithm = layout.olo_map.odm_raid_algorithm; - objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps); - cur_comp = objio_seg->comps; - caps_p = (void *)(cur_comp + layout.olo_num_comps); - while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) - copy_single_comp(cur_comp++, &src_comp, caps_p++); + err = ore_verify_layout(layout.olo_map.odm_num_comps, + &objio_seg->layout); if (unlikely(err)) goto err; - objio_seg->num_comps = layout.olo_num_comps; - objio_seg->comps_index = layout.olo_comps_index; - err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags); - if (err) - goto err; - - objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; - objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit; - if (layout.olo_map.odm_group_width) { - objio_seg->group_width = layout.olo_map.odm_group_width; - objio_seg->group_depth = layout.olo_map.odm_group_depth; - objio_seg->group_count = layout.olo_map.odm_num_comps / - objio_seg->mirrors_p1 / - objio_seg->group_width; - } else { - objio_seg->group_width = layout.olo_map.odm_num_comps / - objio_seg->mirrors_p1; - objio_seg->group_depth = -1; - objio_seg->group_count = 1; + objio_seg->oc.first_dev = layout.olo_comps_index; + cur_comp = 0; + while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) { + copy_single_comp(&objio_seg->oc, cur_comp, &src_comp); + err = objio_devices_lookup(pnfslay, objio_seg, cur_comp, + &src_comp.oc_object_id.oid_device_id, + gfp_flags); + if (err) + goto err; + ++cur_comp; } - - /* Cache this calculation it will hit for every page */ - objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - - objio_seg->stripe_unit) * - objio_seg->group_width; + /* pnfs_osd_xdr_decode_layout_comp returns false on error */ + if (unlikely(err)) + goto err; *outp = &objio_seg->lseg; return 0; @@ -386,43 +289,63 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg) int i; struct objio_segment *objio_seg = OBJIO_LSEG(lseg); - for (i = 0; i < objio_seg->num_comps; i++) { - if (!objio_seg->ods[i]) + for (i = 0; i < objio_seg->oc.numdevs; i++) { + struct ore_dev *od = objio_seg->oc.ods[i]; + struct objio_dev_ent *ode; + + if (!od) break; - nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node); + ode = container_of(od, typeof(*ode), od); + nfs4_put_deviceid_node(&ode->id_node); } kfree(objio_seg); } -int objio_alloc_io_state(struct pnfs_layout_segment *lseg, - struct objlayout_io_state **outp, - gfp_t gfp_flags) +static int +objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, 
bool is_reading, + struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase, + loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags, + struct objio_state **outp) { struct objio_segment *objio_seg = OBJIO_LSEG(lseg); - struct objio_state *ios; - const unsigned first_size = sizeof(*ios) + - objio_seg->num_comps * sizeof(ios->per_dev[0]); - const unsigned sec_size = objio_seg->num_comps * - sizeof(ios->ol_state.ioerrs[0]); - - ios = kzalloc(first_size + sec_size, gfp_flags); - if (unlikely(!ios)) + struct ore_io_state *ios; + int ret; + struct __alloc_objio_state { + struct objio_state objios; + struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs]; + } *aos; + + aos = kzalloc(sizeof(*aos), gfp_flags); + if (unlikely(!aos)) return -ENOMEM; - ios->layout = objio_seg; - ios->ol_state.ioerrs = ((void *)ios) + first_size; - ios->ol_state.num_comps = objio_seg->num_comps; + objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs, + aos->ioerrs, rpcdata, pnfs_layout_type); - *outp = &ios->ol_state; + ret = ore_get_rw_state(&objio_seg->layout, &objio_seg->oc, is_reading, + offset, count, &ios); + if (unlikely(ret)) { + kfree(aos); + return ret; + } + + ios->pages = pages; + ios->pgbase = pgbase; + ios->private = aos; + BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT); + + aos->objios.sync = 0; + aos->objios.ios = ios; + *outp = &aos->objios; return 0; } -void objio_free_io_state(struct objlayout_io_state *ol_state) +void objio_free_result(struct objlayout_io_res *oir) { - struct objio_state *ios = container_of(ol_state, struct objio_state, - ol_state); + struct objio_state *objios = container_of(oir, struct objio_state, oir); - kfree(ios); + ore_put_io_state(objios->ios); + kfree(objios); } enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) @@ -455,539 +378,152 @@ enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) } } -static void _clear_bio(struct bio *bio) +static void __on_dev_error(struct ore_io_state *ios, + struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep, + u64 dev_offset, u64 dev_len) { - struct bio_vec *bv; - unsigned i; - - __bio_for_each_segment(bv, bio, i, 0) { - unsigned this_count = bv->bv_len; - - if (likely(PAGE_SIZE == this_count)) - clear_highpage(bv->bv_page); - else - zero_user(bv->bv_page, bv->bv_offset, this_count); - } -} - -static int _io_check(struct objio_state *ios, bool is_write) -{ - enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR; - int lin_ret = 0; - int i; - - for (i = 0; i < ios->numdevs; i++) { - struct osd_sense_info osi; - struct osd_request *or = ios->per_dev[i].or; - int ret; - - if (!or) - continue; + struct objio_state *objios = ios->private; + struct pnfs_osd_objid pooid; + struct objio_dev_ent *ode = container_of(od, typeof(*ode), od); + /* FIXME: what to do with more-then-one-group layouts. 
We need to + * translate from ore_io_state index to oc->comps index + */ + unsigned comp = dev_index; - ret = osd_req_decode_sense(or, &osi); - if (likely(!ret)) - continue; + pooid.oid_device_id = ode->id_node.deviceid; + pooid.oid_partition_id = ios->oc->comps[comp].obj.partition; + pooid.oid_object_id = ios->oc->comps[comp].obj.id; - if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { - /* start read offset passed endof file */ - BUG_ON(is_write); - _clear_bio(ios->per_dev[i].bio); - dprintk("%s: start read offset passed end of file " - "offset=0x%llx, length=0x%lx\n", __func__, - _LLU(ios->per_dev[i].offset), - ios->per_dev[i].length); - - continue; /* we recovered */ - } - objlayout_io_set_result(&ios->ol_state, i, - &ios->layout->comps[i].oc_object_id, - osd_pri_2_pnfs_err(osi.osd_err_pri), - ios->per_dev[i].offset, - ios->per_dev[i].length, - is_write); - - if (osi.osd_err_pri >= oep) { - oep = osi.osd_err_pri; - lin_ret = ret; - } - } - - return lin_ret; -} - -/* - * Common IO state helpers. - */ -static void _io_free(struct objio_state *ios) -{ - unsigned i; - - for (i = 0; i < ios->numdevs; i++) { - struct _objio_per_comp *per_dev = &ios->per_dev[i]; - - if (per_dev->or) { - osd_end_request(per_dev->or); - per_dev->or = NULL; - } - - if (per_dev->bio) { - bio_put(per_dev->bio); - per_dev->bio = NULL; - } - } -} - -struct osd_dev *_io_od(struct objio_state *ios, unsigned dev) -{ - unsigned min_dev = ios->layout->comps_index; - unsigned max_dev = min_dev + ios->layout->num_comps; - - BUG_ON(dev < min_dev || max_dev <= dev); - return ios->layout->ods[dev - min_dev]->od; -} - -struct _striping_info { - u64 obj_offset; - u64 group_length; - unsigned dev; - unsigned unit_off; -}; - -static void _calc_stripe_info(struct objio_state *ios, u64 file_offset, - struct _striping_info *si) -{ - u32 stripe_unit = ios->layout->stripe_unit; - u32 group_width = ios->layout->group_width; - u64 group_depth = ios->layout->group_depth; - u32 U = stripe_unit * group_width; - - u64 T = U * group_depth; - u64 S = T * ios->layout->group_count; - u64 M = div64_u64(file_offset, S); - - /* - G = (L - (M * S)) / T - H = (L - (M * S)) % T - */ - u64 LmodU = file_offset - M * S; - u32 G = div64_u64(LmodU, T); - u64 H = LmodU - G * T; - - u32 N = div_u64(H, U); - - div_u64_rem(file_offset, stripe_unit, &si->unit_off); - si->obj_offset = si->unit_off + (N * stripe_unit) + - (M * group_depth * stripe_unit); - - /* "H - (N * U)" is just "H % U" so it's bound to u32 */ - si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; - si->dev *= ios->layout->mirrors_p1; - - si->group_length = T - H; -} - -static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, - unsigned pgbase, struct _objio_per_comp *per_dev, int len, - gfp_t gfp_flags) -{ - unsigned pg = *cur_pg; - int cur_len = len; - struct request_queue *q = - osd_request_queue(_io_od(ios, per_dev->dev)); - - if (per_dev->bio == NULL) { - unsigned pages_in_stripe = ios->layout->group_width * - (ios->layout->stripe_unit / PAGE_SIZE); - unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / - ios->layout->group_width; - - if (BIO_MAX_PAGES_KMALLOC < bio_size) - bio_size = BIO_MAX_PAGES_KMALLOC; - - per_dev->bio = bio_kmalloc(gfp_flags, bio_size); - if (unlikely(!per_dev->bio)) { - dprintk("Faild to allocate BIO size=%u\n", bio_size); - return -ENOMEM; - } - } - - while (cur_len > 0) { - unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); - unsigned added_len; - - BUG_ON(ios->ol_state.nr_pages <= pg); - cur_len -= pglen; - - 
added_len = bio_add_pc_page(q, per_dev->bio, - ios->ol_state.pages[pg], pglen, pgbase); - if (unlikely(pglen != added_len)) - return -ENOMEM; - pgbase = 0; - ++pg; - } - BUG_ON(cur_len); - - per_dev->length += len; - *cur_pg = pg; - return 0; -} - -static int _prepare_one_group(struct objio_state *ios, u64 length, - struct _striping_info *si, unsigned *last_pg, - gfp_t gfp_flags) -{ - unsigned stripe_unit = ios->layout->stripe_unit; - unsigned mirrors_p1 = ios->layout->mirrors_p1; - unsigned devs_in_group = ios->layout->group_width * mirrors_p1; - unsigned dev = si->dev; - unsigned first_dev = dev - (dev % devs_in_group); - unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; - unsigned cur_pg = *last_pg; - int ret = 0; - - while (length) { - struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev]; - unsigned cur_len, page_off = 0; - - if (!per_dev->length) { - per_dev->dev = dev; - if (dev < si->dev) { - per_dev->offset = si->obj_offset + stripe_unit - - si->unit_off; - cur_len = stripe_unit; - } else if (dev == si->dev) { - per_dev->offset = si->obj_offset; - cur_len = stripe_unit - si->unit_off; - page_off = si->unit_off & ~PAGE_MASK; - BUG_ON(page_off && - (page_off != ios->ol_state.pgbase)); - } else { /* dev > si->dev */ - per_dev->offset = si->obj_offset - si->unit_off; - cur_len = stripe_unit; - } - - if (max_comp < dev - first_dev) - max_comp = dev - first_dev; - } else { - cur_len = stripe_unit; - } - if (cur_len >= length) - cur_len = length; - - ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, - cur_len, gfp_flags); - if (unlikely(ret)) - goto out; - - dev += mirrors_p1; - dev = (dev % devs_in_group) + first_dev; - - length -= cur_len; - ios->length += cur_len; - } -out: - ios->numdevs = max_comp + mirrors_p1; - *last_pg = cur_pg; - return ret; -} - -static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags) -{ - u64 length = ios->ol_state.count; - u64 offset = ios->ol_state.offset; - struct _striping_info si; - unsigned last_pg = 0; - int ret = 0; - - while (length) { - _calc_stripe_info(ios, offset, &si); - - if (length < si.group_length) - si.group_length = length; - - ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags); - if (unlikely(ret)) - goto out; - - offset += si.group_length; - length -= si.group_length; - } - -out: - if (!ios->length) - return ret; - - return 0; -} - -static ssize_t _sync_done(struct objio_state *ios) -{ - struct completion *waiting = ios->private; - - complete(waiting); - return 0; -} - -static void _last_io(struct kref *kref) -{ - struct objio_state *ios = container_of(kref, struct objio_state, kref); - - ios->done(ios); -} - -static void _done_io(struct osd_request *or, void *p) -{ - struct objio_state *ios = p; - - kref_put(&ios->kref, _last_io); -} - -static ssize_t _io_exec(struct objio_state *ios) -{ - DECLARE_COMPLETION_ONSTACK(wait); - ssize_t status = 0; /* sync status */ - unsigned i; - objio_done_fn saved_done_fn = ios->done; - bool sync = ios->ol_state.sync; - - if (sync) { - ios->done = _sync_done; - ios->private = &wait; - } - - kref_init(&ios->kref); - - for (i = 0; i < ios->numdevs; i++) { - struct osd_request *or = ios->per_dev[i].or; - - if (!or) - continue; - - kref_get(&ios->kref); - osd_execute_request_async(or, _done_io, ios); - } - - kref_put(&ios->kref, _last_io); - - if (sync) { - wait_for_completion(&wait); - status = saved_done_fn(ios); - } - - return status; + objlayout_io_set_result(&objios->oir, comp, + &pooid, osd_pri_2_pnfs_err(oep), + dev_offset, dev_len, 
!ios->reading); } /* * read */ -static ssize_t _read_done(struct objio_state *ios) +static void _read_done(struct ore_io_state *ios, void *private) { + struct objio_state *objios = private; ssize_t status; - int ret = _io_check(ios, false); + int ret = ore_check_io(ios, &__on_dev_error); - _io_free(ios); + /* FIXME: _io_free(ios) can we dealocate the libosd resources; */ if (likely(!ret)) status = ios->length; else status = ret; - objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); - return status; + objlayout_read_done(&objios->oir, status, objios->sync); } -static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) +int objio_read_pagelist(struct nfs_read_data *rdata) { - struct osd_request *or = NULL; - struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; - unsigned dev = per_dev->dev; - struct pnfs_osd_object_cred *cred = - &ios->layout->comps[cur_comp]; - struct osd_obj_id obj = { - .partition = cred->oc_object_id.oid_partition_id, - .id = cred->oc_object_id.oid_object_id, - }; + struct objio_state *objios; int ret; - or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); - if (unlikely(!or)) { - ret = -ENOMEM; - goto err; - } - per_dev->or = or; - - osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length); - - ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); - if (ret) { - dprintk("%s: Faild to osd_finalize_request() => %d\n", - __func__, ret); - goto err; - } - - dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", - __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), - per_dev->length); - -err: - return ret; -} - -static ssize_t _read_exec(struct objio_state *ios) -{ - unsigned i; - int ret; - - for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { - if (!ios->per_dev[i].length) - continue; - ret = _read_mirrors(ios, i); - if (unlikely(ret)) - goto err; - } - - ios->done = _read_done; - return _io_exec(ios); /* In sync mode exec returns the io status */ - -err: - _io_free(ios); - return ret; -} - -ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state) -{ - struct objio_state *ios = container_of(ol_state, struct objio_state, - ol_state); - int ret; - - ret = _io_rw_pagelist(ios, GFP_KERNEL); + ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout, true, + rdata->lseg, rdata->args.pages, rdata->args.pgbase, + rdata->args.offset, rdata->args.count, rdata, + GFP_KERNEL, &objios); if (unlikely(ret)) return ret; - return _read_exec(ios); + objios->ios->done = _read_done; + dprintk("%s: offset=0x%llx length=0x%x\n", __func__, + rdata->args.offset, rdata->args.count); + return ore_read(objios->ios); } /* * write */ -static ssize_t _write_done(struct objio_state *ios) +static void _write_done(struct ore_io_state *ios, void *private) { + struct objio_state *objios = private; ssize_t status; - int ret = _io_check(ios, true); + int ret = ore_check_io(ios, &__on_dev_error); - _io_free(ios); + /* FIXME: _io_free(ios) can we dealocate the libosd resources; */ if (likely(!ret)) { /* FIXME: should be based on the OSD's persistence model * See OSD2r05 Section 4.13 Data persistence model */ - ios->ol_state.committed = NFS_FILE_SYNC; + objios->oir.committed = NFS_FILE_SYNC; status = ios->length; } else { status = ret; } - objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); - return status; + objlayout_write_done(&objios->oir, status, objios->sync); } -static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) +static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) { - 
struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp]; - unsigned dev = ios->per_dev[cur_comp].dev; - unsigned last_comp = cur_comp + ios->layout->mirrors_p1; - int ret; - - for (; cur_comp < last_comp; ++cur_comp, ++dev) { - struct osd_request *or = NULL; - struct pnfs_osd_object_cred *cred = - &ios->layout->comps[cur_comp]; - struct osd_obj_id obj = { - .partition = cred->oc_object_id.oid_partition_id, - .id = cred->oc_object_id.oid_object_id, - }; - struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; - struct bio *bio; - - or = osd_start_request(_io_od(ios, dev), GFP_NOFS); - if (unlikely(!or)) { - ret = -ENOMEM; - goto err; - } - per_dev->or = or; - - if (per_dev != master_dev) { - bio = bio_kmalloc(GFP_NOFS, - master_dev->bio->bi_max_vecs); - if (unlikely(!bio)) { - dprintk("Faild to allocate BIO size=%u\n", - master_dev->bio->bi_max_vecs); - ret = -ENOMEM; - goto err; - } - - __bio_clone(bio, master_dev->bio); - bio->bi_bdev = NULL; - bio->bi_next = NULL; - per_dev->bio = bio; - per_dev->dev = dev; - per_dev->length = master_dev->length; - per_dev->offset = master_dev->offset; - } else { - bio = master_dev->bio; - bio->bi_rw |= REQ_WRITE; - } - - osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length); + struct objio_state *objios = priv; + struct nfs_write_data *wdata = objios->oir.rpcdata; + pgoff_t index = offset / PAGE_SIZE; + struct page *page = find_get_page(wdata->inode->i_mapping, index); - ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); - if (ret) { - dprintk("%s: Faild to osd_finalize_request() => %d\n", - __func__, ret); - goto err; + if (!page) { + page = find_or_create_page(wdata->inode->i_mapping, + index, GFP_NOFS); + if (unlikely(!page)) { + dprintk("%s: grab_cache_page Failed index=0x%lx\n", + __func__, index); + return NULL; } - - dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", - __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), - per_dev->length); + unlock_page(page); } + if (PageDirty(page) || PageWriteback(page)) + *uptodate = true; + else + *uptodate = PageUptodate(page); + dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate); + return page; +} -err: - return ret; +static void __r4w_put_page(void *priv, struct page *page) +{ + dprintk("%s: index=0x%lx\n", __func__, page->index); + page_cache_release(page); + return; } -static ssize_t _write_exec(struct objio_state *ios) +static const struct _ore_r4w_op _r4w_op = { + .get_page = &__r4w_get_page, + .put_page = &__r4w_put_page, +}; + +int objio_write_pagelist(struct nfs_write_data *wdata, int how) { - unsigned i; + struct objio_state *objios; int ret; - for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { - if (!ios->per_dev[i].length) - continue; - ret = _write_mirrors(ios, i); - if (unlikely(ret)) - goto err; - } - - ios->done = _write_done; - return _io_exec(ios); /* In sync mode exec returns the io->status */ + ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout, false, + wdata->lseg, wdata->args.pages, wdata->args.pgbase, + wdata->args.offset, wdata->args.count, wdata, GFP_NOFS, + &objios); + if (unlikely(ret)) + return ret; -err: - _io_free(ios); - return ret; -} + objios->sync = 0 != (how & FLUSH_SYNC); + objios->ios->r4w = &_r4w_op; -ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) -{ - struct objio_state *ios = container_of(ol_state, struct objio_state, - ol_state); - int ret; + if (!objios->sync) + objios->ios->done = _write_done; - /* TODO: ios->stable = stable; */ - ret = 
_io_rw_pagelist(ios, GFP_NOFS); + dprintk("%s: offset=0x%llx length=0x%x\n", __func__, + wdata->args.offset, wdata->args.count); + ret = ore_write(objios->ios); if (unlikely(ret)) return ret; - return _write_exec(ios); + if (objios->sync) + _write_done(objios->ios, objios); + + return 0; } static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, @@ -997,7 +533,7 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, return false; return pgio->pg_count + req->wb_bytes <= - OBJIO_LSEG(pgio->pg_lseg)->max_io_size; + OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length; } static const struct nfs_pageio_ops objio_pg_read_ops = { diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 1d06f8e2adea..72074e3a04f9 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -156,77 +156,39 @@ last_byte_offset(u64 start, u64 len) return end > start ? end - 1 : NFS4_MAX_UINT64; } -static struct objlayout_io_state * -objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, - struct page **pages, - unsigned pgbase, - loff_t offset, - size_t count, - struct pnfs_layout_segment *lseg, - void *rpcdata, - gfp_t gfp_flags) +void _fix_verify_io_params(struct pnfs_layout_segment *lseg, + struct page ***p_pages, unsigned *p_pgbase, + u64 offset, unsigned long count) { - struct objlayout_io_state *state; u64 lseg_end_offset; - dprintk("%s: allocating io_state\n", __func__); - if (objio_alloc_io_state(lseg, &state, gfp_flags)) - return NULL; - BUG_ON(offset < lseg->pls_range.offset); lseg_end_offset = end_offset(lseg->pls_range.offset, lseg->pls_range.length); BUG_ON(offset >= lseg_end_offset); - if (offset + count > lseg_end_offset) { - count = lseg->pls_range.length - - (offset - lseg->pls_range.offset); - dprintk("%s: truncated count %Zd\n", __func__, count); - } + WARN_ON(offset + count > lseg_end_offset); - if (pgbase > PAGE_SIZE) { - pages += pgbase >> PAGE_SHIFT; - pgbase &= ~PAGE_MASK; + if (*p_pgbase > PAGE_SIZE) { + dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__, *p_pgbase); + *p_pages += *p_pgbase >> PAGE_SHIFT; + *p_pgbase &= ~PAGE_MASK; } - - INIT_LIST_HEAD(&state->err_list); - state->lseg = lseg; - state->rpcdata = rpcdata; - state->pages = pages; - state->pgbase = pgbase; - state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT; - state->offset = offset; - state->count = count; - state->sync = 0; - - return state; -} - -static void -objlayout_free_io_state(struct objlayout_io_state *state) -{ - dprintk("%s: freeing io_state\n", __func__); - if (unlikely(!state)) - return; - - objio_free_io_state(state); } /* * I/O done common code */ static void -objlayout_iodone(struct objlayout_io_state *state) +objlayout_iodone(struct objlayout_io_res *oir) { - dprintk("%s: state %p status\n", __func__, state); - - if (likely(state->status >= 0)) { - objlayout_free_io_state(state); + if (likely(oir->status >= 0)) { + objio_free_result(oir); } else { - struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout); + struct objlayout *objlay = oir->objlay; spin_lock(&objlay->lock); objlay->delta_space_valid = OBJ_DSU_INVALID; - list_add(&objlay->err_list, &state->err_list); + list_add(&objlay->err_list, &oir->err_list); spin_unlock(&objlay->lock); } } @@ -238,13 +200,13 @@ objlayout_iodone(struct objlayout_io_state *state) * the error for later reporting at layout-return. 
*/ void -objlayout_io_set_result(struct objlayout_io_state *state, unsigned index, +objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index, struct pnfs_osd_objid *pooid, int osd_error, u64 offset, u64 length, bool is_write) { - struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index]; + struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[index]; - BUG_ON(index >= state->num_comps); + BUG_ON(index >= oir->num_comps); if (osd_error) { ioerr->oer_component = *pooid; ioerr->oer_comp_offset = offset; @@ -285,21 +247,18 @@ static void _rpc_read_complete(struct work_struct *work) } void -objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) +objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) { - int eof = state->eof; - struct nfs_read_data *rdata; + struct nfs_read_data *rdata = oir->rpcdata; - state->status = status; - dprintk("%s: Begin status=%zd eof=%d\n", __func__, status, eof); - rdata = state->rpcdata; - rdata->task.tk_status = status; - if (status >= 0) { + oir->status = rdata->task.tk_status = status; + if (status >= 0) rdata->res.count = status; - rdata->res.eof = eof; - } - objlayout_iodone(state); - /* must not use state after this point */ + objlayout_iodone(oir); + /* must not use oir after this point */ + + dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__, + status, rdata->res.eof, sync); if (sync) pnfs_ld_read_done(rdata); @@ -317,40 +276,36 @@ objlayout_read_pagelist(struct nfs_read_data *rdata) { loff_t offset = rdata->args.offset; size_t count = rdata->args.count; - struct objlayout_io_state *state; - ssize_t status = 0; + int err; loff_t eof; - dprintk("%s: Begin inode %p offset %llu count %d\n", - __func__, rdata->inode, offset, (int)count); - eof = i_size_read(rdata->inode); if (unlikely(offset + count > eof)) { if (offset >= eof) { - status = 0; + err = 0; rdata->res.count = 0; rdata->res.eof = 1; + /*FIXME: do we need to call pnfs_ld_read_done() */ goto out; } count = eof - offset; } - state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, - rdata->args.pages, rdata->args.pgbase, - offset, count, - rdata->lseg, rdata, - GFP_KERNEL); - if (unlikely(!state)) { - status = -ENOMEM; - goto out; - } + rdata->res.eof = (offset + count) >= eof; + _fix_verify_io_params(rdata->lseg, &rdata->args.pages, + &rdata->args.pgbase, + rdata->args.offset, rdata->args.count); - state->eof = state->offset + state->count >= eof; + dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n", + __func__, rdata->inode->i_ino, offset, count, rdata->res.eof); - status = objio_read_pagelist(state); + err = objio_read_pagelist(rdata); out: - dprintk("%s: Return status %Zd\n", __func__, status); - rdata->pnfs_error = status; + if (unlikely(err)) { + rdata->pnfs_error = err; + dprintk("%s: Returned Error %d\n", __func__, err); + return PNFS_NOT_ATTEMPTED; + } return PNFS_ATTEMPTED; } @@ -371,26 +326,20 @@ static void _rpc_write_complete(struct work_struct *work) } void -objlayout_write_done(struct objlayout_io_state *state, ssize_t status, - bool sync) +objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) { - struct nfs_write_data *wdata; + struct nfs_write_data *wdata = oir->rpcdata; - dprintk("%s: Begin\n", __func__); - wdata = state->rpcdata; - state->status = status; - wdata->task.tk_status = status; + oir->status = wdata->task.tk_status = status; if (status >= 0) { wdata->res.count = status; - wdata->verf.committed = state->committed; - dprintk("%s: Return status %d committed %d\n", - __func__, 
wdata->task.tk_status, - wdata->verf.committed); - } else - dprintk("%s: Return status %d\n", - __func__, wdata->task.tk_status); - objlayout_iodone(state); - /* must not use state after this point */ + wdata->verf.committed = oir->committed; + } + objlayout_iodone(oir); + /* must not use oir after this point */ + + dprintk("%s: Return status %zd committed %d sync=%d\n", __func__, + status, wdata->verf.committed, sync); if (sync) pnfs_ld_write_done(wdata); @@ -407,30 +356,18 @@ enum pnfs_try_status objlayout_write_pagelist(struct nfs_write_data *wdata, int how) { - struct objlayout_io_state *state; - ssize_t status; - - dprintk("%s: Begin inode %p offset %llu count %u\n", - __func__, wdata->inode, wdata->args.offset, wdata->args.count); - - state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout, - wdata->args.pages, - wdata->args.pgbase, - wdata->args.offset, - wdata->args.count, - wdata->lseg, wdata, - GFP_NOFS); - if (unlikely(!state)) { - status = -ENOMEM; - goto out; - } + int err; - state->sync = how & FLUSH_SYNC; + _fix_verify_io_params(wdata->lseg, &wdata->args.pages, + &wdata->args.pgbase, + wdata->args.offset, wdata->args.count); - status = objio_write_pagelist(state, how & FLUSH_STABLE); - out: - dprintk("%s: Return status %Zd\n", __func__, status); - wdata->pnfs_error = status; + err = objio_write_pagelist(wdata, how); + if (unlikely(err)) { + wdata->pnfs_error = err; + dprintk("%s: Returned Error %d\n", __func__, err); + return PNFS_NOT_ATTEMPTED; + } return PNFS_ATTEMPTED; } @@ -537,14 +474,14 @@ merge_ioerr(struct pnfs_osd_ioerr *dest_err, static void encode_accumulated_error(struct objlayout *objlay, __be32 *p) { - struct objlayout_io_state *state, *tmp; + struct objlayout_io_res *oir, *tmp; struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; - list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { + list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) { unsigned i; - for (i = 0; i < state->num_comps; i++) { - struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; + for (i = 0; i < oir->num_comps; i++) { + struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i]; if (!ioerr->oer_errno) continue; @@ -563,8 +500,8 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p) merge_ioerr(&accumulated_err, ioerr); } - list_del(&state->err_list); - objlayout_free_io_state(state); + list_del(&oir->err_list); + objio_free_result(oir); } pnfs_osd_xdr_encode_ioerr(p, &accumulated_err); @@ -576,7 +513,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, const struct nfs4_layoutreturn_args *args) { struct objlayout *objlay = OBJLAYOUT(pnfslay); - struct objlayout_io_state *state, *tmp; + struct objlayout_io_res *oir, *tmp; __be32 *start; dprintk("%s: Begin\n", __func__); @@ -585,13 +522,13 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, spin_lock(&objlay->lock); - list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { + list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) { __be32 *last_xdr = NULL, *p; unsigned i; int res = 0; - for (i = 0; i < state->num_comps; i++) { - struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; + for (i = 0; i < oir->num_comps; i++) { + struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i]; if (!ioerr->oer_errno) continue; @@ -615,7 +552,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, } last_xdr = p; - pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]); + pnfs_osd_xdr_encode_ioerr(p, &oir->ioerrs[i]); } /* TODO: use xdr_write_pages */ @@ -631,8 +568,8 @@ 
objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, encode_accumulated_error(objlay, last_xdr); goto loop_done; } - list_del(&state->err_list); - objlayout_free_io_state(state); + list_del(&oir->err_list); + objio_free_result(oir); } loop_done: spin_unlock(&objlay->lock); diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index a8244c8e042d..8ec34727ed21 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -74,19 +74,11 @@ OBJLAYOUT(struct pnfs_layout_hdr *lo) * per-I/O operation state * embedded in objects provider io_state data structure */ -struct objlayout_io_state { - struct pnfs_layout_segment *lseg; - - struct page **pages; - unsigned pgbase; - unsigned nr_pages; - unsigned long count; - loff_t offset; - bool sync; +struct objlayout_io_res { + struct objlayout *objlay; void *rpcdata; int status; /* res */ - int eof; /* res */ int committed; /* res */ /* Error reporting (layout_return) */ @@ -100,6 +92,18 @@ struct objlayout_io_state { struct pnfs_osd_ioerr *ioerrs; }; +static inline +void objlayout_init_ioerrs(struct objlayout_io_res *oir, unsigned num_comps, + struct pnfs_osd_ioerr *ioerrs, void *rpcdata, + struct pnfs_layout_hdr *pnfs_layout_type) +{ + oir->objlay = OBJLAYOUT(pnfs_layout_type); + oir->rpcdata = rpcdata; + INIT_LIST_HEAD(&oir->err_list); + oir->num_comps = num_comps; + oir->ioerrs = ioerrs; +} + /* * Raid engine I/O API */ @@ -110,28 +114,24 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp, gfp_t gfp_flags); extern void objio_free_lseg(struct pnfs_layout_segment *lseg); -extern int objio_alloc_io_state( - struct pnfs_layout_segment *lseg, - struct objlayout_io_state **outp, - gfp_t gfp_flags); -extern void objio_free_io_state(struct objlayout_io_state *state); +/* objio_free_result will free these @oir structs recieved from + * objlayout_{read,write}_done + */ +extern void objio_free_result(struct objlayout_io_res *oir); -extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state); -extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, - bool stable); +extern int objio_read_pagelist(struct nfs_read_data *rdata); +extern int objio_write_pagelist(struct nfs_write_data *wdata, int how); /* * callback API */ -extern void objlayout_io_set_result(struct objlayout_io_state *state, +extern void objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index, struct pnfs_osd_objid *pooid, int osd_error, u64 offset, u64 length, bool is_write); static inline void -objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used) +objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used) { - struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout); - /* If one of the I/Os errored out and the delta_space_used was * invalid we render the complete report as invalid. Protocol mandate * the DSU be accurate or not reported. 
@@ -144,9 +144,9 @@ objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used) spin_unlock(&objlay->lock); } -extern void objlayout_read_done(struct objlayout_io_state *state, +extern void objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync); -extern void objlayout_write_done(struct objlayout_io_state *state, +extern void objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync); extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index b60970cc7f1f..5668f7c54c41 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -18,6 +18,7 @@ #include <linux/nfs_page.h> #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> +#include <linux/export.h> #include "internal.h" #include "pnfs.h" @@ -41,7 +42,7 @@ nfs_page_free(struct nfs_page *p) /** * nfs_create_request - Create an NFS read/write request. - * @file: file descriptor to use + * @ctx: open context to use * @inode: inode to which the request is attached * @page: page to write * @offset: starting offset within the page for the write diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index ee73d9a4f700..8e672a2b2d69 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -29,6 +29,7 @@ #include <linux/nfs_fs.h> #include <linux/nfs_page.h> +#include <linux/module.h> #include "internal.h" #include "pnfs.h" #include "iostat.h" @@ -1259,6 +1260,25 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) } EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); +static void pnfs_ld_handle_read_error(struct nfs_read_data *data) +{ + struct nfs_pageio_descriptor pgio; + + put_lseg(data->lseg); + data->lseg = NULL; + dprintk("pnfs write error = %d\n", data->pnfs_error); + + nfs_pageio_init_read_mds(&pgio, data->inode); + + while (!list_empty(&data->pages)) { + struct nfs_page *req = nfs_list_entry(data->pages.next); + + nfs_list_remove_request(req); + nfs_pageio_add_request(&pgio, req); + } + nfs_pageio_complete(&pgio); +} + /* * Called by non rpc-based layout drivers */ @@ -1267,11 +1287,8 @@ void pnfs_ld_read_done(struct nfs_read_data *data) if (likely(!data->pnfs_error)) { __nfs4_read_done_cb(data); data->mds_ops->rpc_call_done(&data->task, data); - } else { - put_lseg(data->lseg); - data->lseg = NULL; - dprintk("pnfs write error = %d\n", data->pnfs_error); - } + } else + pnfs_ld_handle_read_error(data); data->mds_ops->rpc_release(data); } EXPORT_SYMBOL_GPL(pnfs_ld_read_done); @@ -1443,17 +1460,31 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ data = kzalloc(sizeof(*data), GFP_NOFS); if (!data) { - mark_inode_dirty_sync(inode); status = -ENOMEM; goto out; } + if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + goto out_free; + + if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) { + if (!sync) { + status = -EAGAIN; + goto out_free; + } + status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (status) + goto out_free; + } + INIT_LIST_HEAD(&data->lseg_list); spin_lock(&inode->i_lock); if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { + clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags); spin_unlock(&inode->i_lock); - kfree(data); - goto out; + wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING); + goto out_free; } pnfs_list_write_lseg(inode, &data->lseg_list); @@ -1475,6 +1506,11 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) status = nfs4_proc_layoutcommit(data, 
sync); out: + if (status) + mark_inode_dirty_sync(inode); dprintk("<-- %s status %d\n", __func__, status); return status; +out_free: + kfree(data); + goto out; } diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index 6fda5228ef56..4f359d2a26eb 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -28,6 +28,7 @@ * such damages. */ +#include <linux/export.h> #include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PNFS diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index ac40b8535d7e..f48125da198a 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -710,6 +710,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .dentry_ops = &nfs_dentry_operations, .dir_inode_ops = &nfs_dir_inode_operations, .file_inode_ops = &nfs_file_inode_operations, + .file_ops = &nfs_file_operations, .getroot = nfs_proc_get_root, .getattr = nfs_proc_getattr, .setattr = nfs_proc_setattr, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 8b48ec63f722..cfa175c223dc 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -109,7 +109,7 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) } } -static void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, +void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, struct inode *inode) { nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops, @@ -534,23 +534,13 @@ static void nfs_readpage_result_full(struct rpc_task *task, void *calldata) static void nfs_readpage_release_full(void *calldata) { struct nfs_read_data *data = calldata; - struct nfs_pageio_descriptor pgio; - if (data->pnfs_error) { - nfs_pageio_init_read_mds(&pgio, data->inode); - pgio.pg_recoalesce = 1; - } while (!list_empty(&data->pages)) { struct nfs_page *req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); - if (!data->pnfs_error) - nfs_readpage_release(req); - else - nfs_pageio_add_request(&pgio, req); + nfs_readpage_release(req); } - if (data->pnfs_error) - nfs_pageio_complete(&pgio); nfs_readdata_release(calldata); } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 480b3b6bf71e..134777406ee3 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2787,43 +2787,18 @@ static void nfs_referral_loop_unprotect(void) static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt, const char *export_path) { - struct mnt_namespace *ns_private; - struct super_block *s; struct dentry *dentry; - struct path path; - int ret; - - ns_private = create_mnt_ns(root_mnt); - ret = PTR_ERR(ns_private); - if (IS_ERR(ns_private)) - goto out_mntput; - - ret = nfs_referral_loop_protect(); - if (ret != 0) - goto out_put_mnt_ns; + int ret = nfs_referral_loop_protect(); - ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, - export_path, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); + if (ret) { + mntput(root_mnt); + return ERR_PTR(ret); + } + dentry = mount_subtree(root_mnt, export_path); nfs_referral_loop_unprotect(); - put_mnt_ns(ns_private); - - if (ret != 0) - goto out_err; - - s = path.mnt->mnt_sb; - atomic_inc(&s->s_active); - dentry = dget(path.dentry); - path_put(&path); - down_write(&s->s_umount); return dentry; -out_put_mnt_ns: - put_mnt_ns(ns_private); -out_mntput: - mntput(root_mnt); -out_err: - return ERR_PTR(ret); } static struct dentry *nfs4_try_mount(int flags, const char *dev_name, diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 2219c88d96b2..1dda78db6a73 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -20,6 +20,7 @@ #include <linux/nfs_mount.h> #include <linux/nfs_page.h> #include <linux/backing-dev.h> +#include <linux/export.h> #include <asm/uaccess.h> @@ 
-1243,7 +1244,6 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) { struct nfs_writeargs *argp = &data->args; struct nfs_writeres *resp = &data->res; - struct nfs_server *server = NFS_SERVER(data->inode); int status; dprintk("NFS: %5u nfs_writeback_done (status %d)\n", @@ -1277,7 +1277,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) if (time_before(complain, jiffies)) { dprintk("NFS: faulty NFS server %s:" " (committed = %d) != (stable = %d)\n", - server->nfs_client->cl_hostname, + NFS_SERVER(data->inode)->nfs_client->cl_hostname, resp->verf->committed, argp->stable); complain = jiffies + 300 * HZ; } diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index ad88f1c0a4c3..9c51aff02ae2 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -36,6 +36,7 @@ #include <linux/slab.h> #include <linux/nfs_fs.h> +#include <linux/export.h> #include "acl.h" diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index db34a585e112..c45a2ea4a090 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -13,6 +13,7 @@ #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/gss_api.h> #include <linux/sunrpc/gss_krb5_enctypes.h> +#include <linux/module.h> #include "idmap.h" #include "nfsd.h" diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index dc5a1bf476b1..eda7d7e55e05 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -8,6 +8,7 @@ #include <linux/sched.h> #include <linux/freezer.h> +#include <linux/module.h> #include <linux/fs_struct.h> #include <linux/swap.h> @@ -256,6 +257,8 @@ static void nfsd_last_thread(struct svc_serv *serv) nfsd_serv = NULL; nfsd_shutdown(); + svc_rpcb_cleanup(serv); + printk(KERN_WARNING "nfsd: last server has exited, flushing export " "cache\n"); nfsd_export_flush(); diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 41d6743d303c..ac258beeda3c 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -625,6 +625,9 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, if (argv[n].v_nmembs > nsegs * nilfs->ns_blocks_per_segment) goto out_free; + if (argv[n].v_nmembs >= UINT_MAX / argv[n].v_size) + goto out_free; + len = argv[n].v_size * argv[n].v_nmembs; base = (void __user *)(unsigned long)argv[n].v_base; if (len == 0) { @@ -842,6 +845,19 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case FS_IOC32_GETVERSION: cmd = FS_IOC_GETVERSION; break; + case NILFS_IOCTL_CHANGE_CPMODE: + case NILFS_IOCTL_DELETE_CHECKPOINT: + case NILFS_IOCTL_GET_CPINFO: + case NILFS_IOCTL_GET_CPSTAT: + case NILFS_IOCTL_GET_SUINFO: + case NILFS_IOCTL_GET_SUSTAT: + case NILFS_IOCTL_GET_VINFO: + case NILFS_IOCTL_GET_BDESCS: + case NILFS_IOCTL_CLEAN_SEGMENTS: + case NILFS_IOCTL_SYNC: + case NILFS_IOCTL_RESIZE: + case NILFS_IOCTL_SET_ALLOC_RANGE: + break; default: return -ENOIOCTLCMD; } diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index ed553c60de82..3165aebb43c8 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5699,7 +5699,7 @@ int ocfs2_remove_btree_range(struct inode *inode, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); - goto out; + goto out_commit; } dquot_free_space_nodirty(inode, diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index c1efe939c774..78b68af3b0e3 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -290,7 +290,15 @@ static int ocfs2_readpage(struct file *file, struct page *page) } if (down_read_trylock(&oi->ip_alloc_sem) == 0) { + /* + * Unlock the page and cycle ip_alloc_sem so that we don't + * busyloop waiting for ip_alloc_sem to unlock + 
*/ ret = AOP_TRUNCATED_PAGE; + unlock_page(page); + unlock = 0; + down_read(&oi->ip_alloc_sem); + up_read(&oi->ip_alloc_sem); goto out_inode_unlock; } @@ -563,6 +571,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, { struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; int level; + wait_queue_head_t *wq = ocfs2_ioend_wq(inode); /* this io's submitter should not have unlocked this before we could */ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); @@ -570,6 +579,15 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, if (ocfs2_iocb_is_sem_locked(iocb)) ocfs2_iocb_clear_sem_locked(iocb); + if (ocfs2_iocb_is_unaligned_aio(iocb)) { + ocfs2_iocb_clear_unaligned_aio(iocb); + + if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) && + waitqueue_active(wq)) { + wake_up_all(wq); + } + } + ocfs2_iocb_clear_rw_locked(iocb); level = ocfs2_iocb_rw_locked_level(iocb); @@ -863,6 +881,12 @@ struct ocfs2_write_ctxt { struct page *w_target_page; /* + * w_target_locked is used for page_mkwrite path indicating no unlocking + * against w_target_page in ocfs2_write_end_nolock. + */ + unsigned int w_target_locked:1; + + /* * ocfs2_write_end() uses this to know what the real range to * write in the target should be. */ @@ -895,6 +919,24 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) { + int i; + + /* + * w_target_locked is only set to true in the page_mkwrite() case. + * The intent is to allow us to lock the target page from write_begin() + * to write_end(). The caller must hold a ref on w_target_page. + */ + if (wc->w_target_locked) { + BUG_ON(!wc->w_target_page); + for (i = 0; i < wc->w_num_pages; i++) { + if (wc->w_target_page == wc->w_pages[i]) { + wc->w_pages[i] = NULL; + break; + } + } + mark_page_accessed(wc->w_target_page); + page_cache_release(wc->w_target_page); + } ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); brelse(wc->w_di_bh); @@ -1132,20 +1174,17 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, */ lock_page(mmap_page); + /* Exit and let the caller retry */ if (mmap_page->mapping != mapping) { + WARN_ON(mmap_page->mapping); unlock_page(mmap_page); - /* - * Sanity check - the locking in - * ocfs2_pagemkwrite() should ensure - * that this code doesn't trigger. - */ - ret = -EINVAL; - mlog_errno(ret); + ret = -EAGAIN; goto out; } page_cache_get(mmap_page); wc->w_pages[i] = mmap_page; + wc->w_target_locked = true; } else { wc->w_pages[i] = find_or_create_page(mapping, index, GFP_NOFS); @@ -1160,6 +1199,8 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, wc->w_target_page = wc->w_pages[i]; } out: + if (ret) + wc->w_target_locked = false; return ret; } @@ -1817,11 +1858,23 @@ try_again: */ ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, cluster_of_pages, mmap_page); - if (ret) { + if (ret && ret != -EAGAIN) { mlog_errno(ret); goto out_quota; } + /* + * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock + * the target page. In this case, we exit with no error and no target + * page. This will trigger the caller, page_mkwrite(), to re-try + * the operation. 
+ */ + if (ret == -EAGAIN) { + BUG_ON(wc->w_target_page); + ret = 0; + goto out_quota; + } + ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos, len); if (ret) { diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 75cf3ad987a6..ffb2da370a99 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -78,6 +78,7 @@ enum ocfs2_iocb_lock_bits { OCFS2_IOCB_RW_LOCK = 0, OCFS2_IOCB_RW_LOCK_LEVEL, OCFS2_IOCB_SEM, + OCFS2_IOCB_UNALIGNED_IO, OCFS2_IOCB_NUM_LOCKS }; @@ -91,4 +92,17 @@ enum ocfs2_iocb_lock_bits { clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) #define ocfs2_iocb_is_sem_locked(iocb) \ test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) + +#define ocfs2_iocb_set_unaligned_aio(iocb) \ + set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) +#define ocfs2_iocb_clear_unaligned_aio(iocb) \ + clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) +#define ocfs2_iocb_is_unaligned_aio(iocb) \ + test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) + +#define OCFS2_IOEND_WQ_HASH_SZ 37 +#define ocfs2_ioend_wq(v) (&ocfs2__ioend_wq[((unsigned long)(v)) %\ + OCFS2_IOEND_WQ_HASH_SZ]) +extern wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ]; + #endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 9a3e6bbff27b..a4e855e3690e 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -216,6 +216,7 @@ struct o2hb_region { struct list_head hr_all_item; unsigned hr_unclean_stop:1, + hr_aborted_start:1, hr_item_pinned:1, hr_item_dropped:1; @@ -254,6 +255,10 @@ struct o2hb_region { * a more complete api that doesn't lead to this sort of fragility. */ atomic_t hr_steady_iterations; + /* terminate o2hb thread if it does not reach steady state + * (hr_steady_iterations == 0) within hr_unsteady_iterations */ + atomic_t hr_unsteady_iterations; + char hr_dev_name[BDEVNAME_SIZE]; unsigned int hr_timeout_ms; @@ -324,6 +329,10 @@ static void o2hb_write_timeout(struct work_struct *work) static void o2hb_arm_write_timeout(struct o2hb_region *reg) { + /* Arm writeout only after thread reaches steady state */ + if (atomic_read(®->hr_steady_iterations) != 0) + return; + mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", O2HB_MAX_WRITE_TIMEOUT_MS); @@ -537,9 +546,14 @@ static int o2hb_verify_crc(struct o2hb_region *reg, return read == computed; } -/* We want to make sure that nobody is heartbeating on top of us -- - * this will help detect an invalid configuration. */ -static void o2hb_check_last_timestamp(struct o2hb_region *reg) +/* + * Compare the slot data with what we wrote in the last iteration. + * If the match fails, print an appropriate error message. This is to + * detect errors like... another node hearting on the same slot, + * flaky device that is losing writes, etc. + * Returns 1 if check succeeds, 0 otherwise. 
+ */ +static int o2hb_check_own_slot(struct o2hb_region *reg) { struct o2hb_disk_slot *slot; struct o2hb_disk_heartbeat_block *hb_block; @@ -548,13 +562,13 @@ static void o2hb_check_last_timestamp(struct o2hb_region *reg) slot = ®->hr_slots[o2nm_this_node()]; /* Don't check on our 1st timestamp */ if (!slot->ds_last_time) - return; + return 0; hb_block = slot->ds_raw_block; if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time && le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation && hb_block->hb_node == slot->ds_node_num) - return; + return 1; #define ERRSTR1 "Another node is heartbeating on device" #define ERRSTR2 "Heartbeat generation mismatch on device" @@ -574,6 +588,8 @@ static void o2hb_check_last_timestamp(struct o2hb_region *reg) (unsigned long long)slot->ds_last_time, hb_block->hb_node, (unsigned long long)le64_to_cpu(hb_block->hb_generation), (unsigned long long)le64_to_cpu(hb_block->hb_seq)); + + return 0; } static inline void o2hb_prepare_block(struct o2hb_region *reg, @@ -719,17 +735,24 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot) o2nm_node_put(node); } -static void o2hb_set_quorum_device(struct o2hb_region *reg, - struct o2hb_disk_slot *slot) +static void o2hb_set_quorum_device(struct o2hb_region *reg) { - assert_spin_locked(&o2hb_live_lock); - if (!o2hb_global_heartbeat_active()) return; - if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) + /* Prevent race with o2hb_heartbeat_group_drop_item() */ + if (kthread_should_stop()) + return; + + /* Tag region as quorum only after thread reaches steady state */ + if (atomic_read(®->hr_steady_iterations) != 0) return; + spin_lock(&o2hb_live_lock); + + if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) + goto unlock; + /* * A region can be added to the quorum only when it sees all * live nodes heartbeat on it. 
In other words, the region has been @@ -737,13 +760,10 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg, */ if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, sizeof(o2hb_live_node_bitmap))) - return; - - if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD) - return; + goto unlock; - printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n", - config_item_name(®->hr_item)); + printk(KERN_NOTICE "o2hb: Region %s (%s) is now a quorum device\n", + config_item_name(®->hr_item), reg->hr_dev_name); set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); @@ -754,6 +774,8 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg, if (o2hb_pop_count(&o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF) o2hb_region_unpin(NULL); +unlock: + spin_unlock(&o2hb_live_lock); } static int o2hb_check_slot(struct o2hb_region *reg, @@ -925,8 +947,6 @@ fire_callbacks: slot->ds_equal_samples = 0; } out: - o2hb_set_quorum_device(reg, slot); - spin_unlock(&o2hb_live_lock); o2hb_run_event_list(&event); @@ -957,7 +977,8 @@ static int o2hb_highest_node(unsigned long *nodes, static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) { - int i, ret, highest_node, change = 0; + int i, ret, highest_node; + int membership_change = 0, own_slot_ok = 0; unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; struct o2hb_bio_wait_ctxt write_wc; @@ -966,7 +987,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) sizeof(configured_nodes)); if (ret) { mlog_errno(ret); - return ret; + goto bail; } /* @@ -982,8 +1003,9 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); if (highest_node >= O2NM_MAX_NODES) { - mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); - return -EINVAL; + mlog(ML_NOTICE, "o2hb: No configured nodes found!\n"); + ret = -EINVAL; + goto bail; } /* No sense in reading the slots of nodes that don't exist @@ -993,29 +1015,27 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) ret = o2hb_read_slots(reg, highest_node + 1); if (ret < 0) { mlog_errno(ret); - return ret; + goto bail; } /* With an up to date view of the slots, we can check that no * other node has been improperly configured to heartbeat in * our slot. */ - o2hb_check_last_timestamp(reg); + own_slot_ok = o2hb_check_own_slot(reg); /* fill in the proper info for our next heartbeat */ o2hb_prepare_block(reg, reg->hr_generation); - /* And fire off the write. Note that we don't wait on this I/O - * until later. 
*/ ret = o2hb_issue_node_write(reg, &write_wc); if (ret < 0) { mlog_errno(ret); - return ret; + goto bail; } i = -1; while((i = find_next_bit(configured_nodes, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { - change |= o2hb_check_slot(reg, ®->hr_slots[i]); + membership_change |= o2hb_check_slot(reg, ®->hr_slots[i]); } /* @@ -1030,18 +1050,39 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) * disk */ mlog(ML_ERROR, "Write error %d on device \"%s\"\n", write_wc.wc_error, reg->hr_dev_name); - return write_wc.wc_error; + ret = write_wc.wc_error; + goto bail; } - o2hb_arm_write_timeout(reg); + /* Skip disarming the timeout if own slot has stale/bad data */ + if (own_slot_ok) { + o2hb_set_quorum_device(reg); + o2hb_arm_write_timeout(reg); + } +bail: /* let the person who launched us know when things are steady */ - if (!change && (atomic_read(®->hr_steady_iterations) != 0)) { - if (atomic_dec_and_test(®->hr_steady_iterations)) + if (atomic_read(®->hr_steady_iterations) != 0) { + if (!ret && own_slot_ok && !membership_change) { + if (atomic_dec_and_test(®->hr_steady_iterations)) + wake_up(&o2hb_steady_queue); + } + } + + if (atomic_read(®->hr_steady_iterations) != 0) { + if (atomic_dec_and_test(®->hr_unsteady_iterations)) { + printk(KERN_NOTICE "o2hb: Unable to stabilize " + "heartbeart on region %s (%s)\n", + config_item_name(®->hr_item), + reg->hr_dev_name); + atomic_set(®->hr_steady_iterations, 0); + reg->hr_aborted_start = 1; wake_up(&o2hb_steady_queue); + ret = -EIO; + } } - return 0; + return ret; } /* Subtract b from a, storing the result in a. a *must* have a larger @@ -1095,7 +1136,8 @@ static int o2hb_thread(void *data) /* Pin node */ o2nm_depend_this_node(); - while (!kthread_should_stop() && !reg->hr_unclean_stop) { + while (!kthread_should_stop() && + !reg->hr_unclean_stop && !reg->hr_aborted_start) { /* We track the time spent inside * o2hb_do_disk_heartbeat so that we avoid more than * hr_timeout_ms between disk writes. On busy systems @@ -1103,10 +1145,7 @@ static int o2hb_thread(void *data) * likely to time itself out. */ do_gettimeofday(&before_hb); - i = 0; - do { - ret = o2hb_do_disk_heartbeat(reg); - } while (ret && ++i < 2); + ret = o2hb_do_disk_heartbeat(reg); do_gettimeofday(&after_hb); elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); @@ -1117,7 +1156,8 @@ static int o2hb_thread(void *data) after_hb.tv_sec, (unsigned long) after_hb.tv_usec, elapsed_msec); - if (elapsed_msec < reg->hr_timeout_ms) { + if (!kthread_should_stop() && + elapsed_msec < reg->hr_timeout_ms) { /* the kthread api has blocked signals for us so no * need to record the return value. */ msleep_interruptible(reg->hr_timeout_ms - elapsed_msec); @@ -1134,20 +1174,20 @@ static int o2hb_thread(void *data) * to timeout on this region when we could just as easily * write a clear generation - thus indicating to them that * this node has left this region. - * - * XXX: Should we skip this on unclean_stop? 
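The reworked bail path keeps two countdowns: hr_steady_iterations is decremented only on a clean pass (no I/O error, own slot intact, no membership change), while hr_unsteady_iterations is decremented on every pass until steady state is reached; if the latter runs out first, the start is aborted with -EIO. A compact model of that policy, with plain ints standing in for the atomics:

/*
 * Model of the steady/unsteady countdown: a start succeeds only if
 * 'steady' clean passes happen before the 'unsteady' budget is spent.
 * Returns 0 once stable, -1 if the budget runs out, 1 while still settling.
 */
static int stabilize(const int *pass_ok, int passes, int steady, int unsteady)
{
        for (int i = 0; i < passes; i++) {
                if (pass_ok[i] && --steady == 0)
                        return 0;               /* reached steady state */
                if (--unsteady == 0)
                        return -1;              /* give up, as with -EIO */
        }
        return 1;                               /* still settling */
}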
*/ - o2hb_prepare_block(reg, 0); - ret = o2hb_issue_node_write(reg, &write_wc); - if (ret == 0) { - o2hb_wait_on_io(reg, &write_wc); - } else { - mlog_errno(ret); + */ + if (!reg->hr_unclean_stop && !reg->hr_aborted_start) { + o2hb_prepare_block(reg, 0); + ret = o2hb_issue_node_write(reg, &write_wc); + if (ret == 0) + o2hb_wait_on_io(reg, &write_wc); + else + mlog_errno(ret); } /* Unpin node */ o2nm_undepend_this_node(); - mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n"); + mlog(ML_HEARTBEAT|ML_KTHREAD, "o2hb thread exiting\n"); return 0; } @@ -1158,6 +1198,7 @@ static int o2hb_debug_open(struct inode *inode, struct file *file) struct o2hb_debug_buf *db = inode->i_private; struct o2hb_region *reg; unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long lts; char *buf = NULL; int i = -1; int out = 0; @@ -1194,9 +1235,11 @@ static int o2hb_debug_open(struct inode *inode, struct file *file) case O2HB_DB_TYPE_REGION_ELAPSED_TIME: reg = (struct o2hb_region *)db->db_data; - out += snprintf(buf + out, PAGE_SIZE - out, "%u\n", - jiffies_to_msecs(jiffies - - reg->hr_last_timeout_start)); + lts = reg->hr_last_timeout_start; + /* If 0, it has never been set before */ + if (lts) + lts = jiffies_to_msecs(jiffies - lts); + out += snprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts); goto done; case O2HB_DB_TYPE_REGION_PINNED: @@ -1426,6 +1469,8 @@ static void o2hb_region_release(struct config_item *item) struct page *page; struct o2hb_region *reg = to_o2hb_region(item); + mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name); + if (reg->hr_tmp_block) kfree(reg->hr_tmp_block); @@ -1792,7 +1837,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, live_threshold <<= 1; spin_unlock(&o2hb_live_lock); } - atomic_set(®->hr_steady_iterations, live_threshold + 1); + ++live_threshold; + atomic_set(®->hr_steady_iterations, live_threshold); + /* unsteady_iterations is double the steady_iterations */ + atomic_set(®->hr_unsteady_iterations, (live_threshold << 1)); hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s", reg->hr_item.ci_name); @@ -1809,14 +1857,12 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, ret = wait_event_interruptible(o2hb_steady_queue, atomic_read(®->hr_steady_iterations) == 0); if (ret) { - /* We got interrupted (hello ptrace!). 
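o2hb_region_dev_write() above now sizes both budgets from the same live threshold: the steady count is the threshold plus one, and the unsteady cap is double that. A trivial sketch of the derivation (names are illustrative):

/*
 * Derive the start-up budget from the live-node threshold: one extra
 * steady pass on top of the threshold, and twice that many passes in
 * total before the start is declared failed.
 */
static void heartbeat_budget(unsigned int live_threshold,
                             unsigned int *steady, unsigned int *unsteady)
{
        *steady = live_threshold + 1;
        *unsteady = *steady * 2;
}

The same region of the patch also guards the elapsed-time debugfs report, so a never-armed hr_last_timeout_start reads back as 0 instead of a bogus jiffies delta.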
Clean up */ - spin_lock(&o2hb_live_lock); - hb_task = reg->hr_task; - reg->hr_task = NULL; - spin_unlock(&o2hb_live_lock); + atomic_set(®->hr_steady_iterations, 0); + reg->hr_aborted_start = 1; + } - if (hb_task) - kthread_stop(hb_task); + if (reg->hr_aborted_start) { + ret = -EIO; goto out; } @@ -1833,8 +1879,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, ret = -EIO; if (hb_task && o2hb_global_heartbeat_active()) - printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n", - config_item_name(®->hr_item)); + printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n", + config_item_name(®->hr_item), reg->hr_dev_name); out: if (filp) @@ -2092,13 +2138,6 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, /* stop the thread when the user removes the region dir */ spin_lock(&o2hb_live_lock); - if (o2hb_global_heartbeat_active()) { - clear_bit(reg->hr_region_num, o2hb_region_bitmap); - clear_bit(reg->hr_region_num, o2hb_live_region_bitmap); - if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) - quorum_region = 1; - clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); - } hb_task = reg->hr_task; reg->hr_task = NULL; reg->hr_item_dropped = 1; @@ -2107,19 +2146,30 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, if (hb_task) kthread_stop(hb_task); + if (o2hb_global_heartbeat_active()) { + spin_lock(&o2hb_live_lock); + clear_bit(reg->hr_region_num, o2hb_region_bitmap); + clear_bit(reg->hr_region_num, o2hb_live_region_bitmap); + if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) + quorum_region = 1; + clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); + spin_unlock(&o2hb_live_lock); + printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%s)\n", + ((atomic_read(®->hr_steady_iterations) == 0) ? + "stopped" : "start aborted"), config_item_name(item), + reg->hr_dev_name); + } + /* * If we're racing a dev_write(), we need to wake them. 
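The teardown path now stops the kthread first and only then, under o2hb_live_lock, clears the region from the global bitmaps; a configfs dev_write() still parked waiting for steady state is aborted by zeroing the countdown and waking the queue with hr_aborted_start set. A userspace pthread model of that wake-up (condition variables stand in for wait_event/wake_up; names only loosely mirror the o2hb ones):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  steady_queue = PTHREAD_COND_INITIALIZER;
static int steady_iterations = 3;
static int aborted_start;

static void *dev_write_waiter(void *arg)
{
        pthread_mutex_lock(&lock);
        while (steady_iterations != 0)          /* wait_event(o2hb_steady_queue, ...) */
                pthread_cond_wait(&steady_queue, &lock);
        printf("waiter resumed, aborted=%d\n", aborted_start);
        pthread_mutex_unlock(&lock);
        return NULL;
}

static void drop_item(void)
{
        pthread_mutex_lock(&lock);
        if (steady_iterations != 0) {           /* racing a dev_write() */
                aborted_start = 1;
                steady_iterations = 0;
                pthread_cond_broadcast(&steady_queue);  /* wake_up() */
        }
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, dev_write_waiter, NULL);
        drop_item();
        pthread_join(t, NULL);
        return 0;
}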
They will * check reg->hr_task */ if (atomic_read(®->hr_steady_iterations) != 0) { + reg->hr_aborted_start = 1; atomic_set(®->hr_steady_iterations, 0); wake_up(&o2hb_steady_queue); } - if (o2hb_global_heartbeat_active()) - printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n", - config_item_name(®->hr_item)); - config_item_put(item); if (!o2hb_global_heartbeat_active() || !quorum_region) diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 3a5835904b3d..dc45deb19e68 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -47,6 +47,7 @@ #define SC_DEBUG_NAME "sock_containers" #define NST_DEBUG_NAME "send_tracking" #define STATS_DEBUG_NAME "stats" +#define NODES_DEBUG_NAME "connected_nodes" #define SHOW_SOCK_CONTAINERS 0 #define SHOW_SOCK_STATS 1 @@ -55,6 +56,7 @@ static struct dentry *o2net_dentry; static struct dentry *sc_dentry; static struct dentry *nst_dentry; static struct dentry *stats_dentry; +static struct dentry *nodes_dentry; static DEFINE_SPINLOCK(o2net_debug_lock); @@ -491,53 +493,87 @@ static const struct file_operations sc_seq_fops = { .release = sc_fop_release, }; -int o2net_debugfs_init(void) +static int o2net_fill_bitmap(char *buf, int len) { - o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL); - if (!o2net_dentry) { - mlog_errno(-ENOMEM); - goto bail; - } + unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + int i = -1, out = 0; - nst_dentry = debugfs_create_file(NST_DEBUG_NAME, S_IFREG|S_IRUSR, - o2net_dentry, NULL, - &nst_seq_fops); - if (!nst_dentry) { - mlog_errno(-ENOMEM); - goto bail; - } + o2net_fill_node_map(map, sizeof(map)); - sc_dentry = debugfs_create_file(SC_DEBUG_NAME, S_IFREG|S_IRUSR, - o2net_dentry, NULL, - &sc_seq_fops); - if (!sc_dentry) { - mlog_errno(-ENOMEM); - goto bail; - } + while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) + out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); + out += snprintf(buf + out, PAGE_SIZE - out, "\n"); - stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, S_IFREG|S_IRUSR, - o2net_dentry, NULL, - &stats_seq_fops); - if (!stats_dentry) { - mlog_errno(-ENOMEM); - goto bail; - } + return out; +} + +static int nodes_fop_open(struct inode *inode, struct file *file) +{ + char *buf; + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + i_size_write(inode, o2net_fill_bitmap(buf, PAGE_SIZE)); + + file->private_data = buf; return 0; -bail: - debugfs_remove(stats_dentry); - debugfs_remove(sc_dentry); - debugfs_remove(nst_dentry); - debugfs_remove(o2net_dentry); - return -ENOMEM; } +static int o2net_debug_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static ssize_t o2net_debug_read(struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + return simple_read_from_buffer(buf, nbytes, ppos, file->private_data, + i_size_read(file->f_mapping->host)); +} + +static const struct file_operations nodes_fops = { + .open = nodes_fop_open, + .release = o2net_debug_release, + .read = o2net_debug_read, + .llseek = generic_file_llseek, +}; + void o2net_debugfs_exit(void) { + debugfs_remove(nodes_dentry); debugfs_remove(stats_dentry); debugfs_remove(sc_dentry); debugfs_remove(nst_dentry); debugfs_remove(o2net_dentry); } +int o2net_debugfs_init(void) +{ + mode_t mode = S_IFREG|S_IRUSR; + + o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL); + if (o2net_dentry) + nst_dentry = debugfs_create_file(NST_DEBUG_NAME, mode, + o2net_dentry, NULL, &nst_seq_fops); + if (nst_dentry) + 
sc_dentry = debugfs_create_file(SC_DEBUG_NAME, mode, + o2net_dentry, NULL, &sc_seq_fops); + if (sc_dentry) + stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, mode, + o2net_dentry, NULL, &stats_seq_fops); + if (stats_dentry) + nodes_dentry = debugfs_create_file(NODES_DEBUG_NAME, mode, + o2net_dentry, NULL, &nodes_fops); + if (nodes_dentry) + return 0; + + o2net_debugfs_exit(); + mlog_errno(-ENOMEM); + return -ENOMEM; +} + #endif /* CONFIG_DEBUG_FS */ diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index db5ee4b4f47a..044e7b58d31c 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -59,6 +59,7 @@ #include <linux/idr.h> #include <linux/kref.h> #include <linux/net.h> +#include <linux/export.h> #include <net/tcp.h> #include <asm/uaccess.h> @@ -545,7 +546,7 @@ static void o2net_set_nn_state(struct o2net_node *nn, } if (was_valid && !valid) { - printk(KERN_NOTICE "o2net: no longer connected to " + printk(KERN_NOTICE "o2net: No longer connected to " SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); o2net_complete_nodes_nsw(nn); } @@ -555,7 +556,7 @@ static void o2net_set_nn_state(struct o2net_node *nn, cancel_delayed_work(&nn->nn_connect_expired); printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n", o2nm_this_node() > sc->sc_node->nd_num ? - "connected to" : "accepted connection from", + "Connected to" : "Accepted connection from", SC_NODEF_ARGS(sc)); } @@ -643,7 +644,7 @@ static void o2net_state_change(struct sock *sk) o2net_sc_queue_work(sc, &sc->sc_connect_work); break; default: - printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT + printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT " shutdown, state %d\n", SC_NODEF_ARGS(sc), sk->sk_state); o2net_sc_queue_work(sc, &sc->sc_shutdown_work); @@ -1034,6 +1035,25 @@ static int o2net_tx_can_proceed(struct o2net_node *nn, return ret; } +/* Get a map of all nodes to which this node is currently connected to */ +void o2net_fill_node_map(unsigned long *map, unsigned bytes) +{ + struct o2net_sock_container *sc; + int node, ret; + + BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))); + + memset(map, 0, bytes); + for (node = 0; node < O2NM_MAX_NODES; ++node) { + o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret); + if (!ret) { + set_bit(node, map); + sc_put(sc); + } + } +} +EXPORT_SYMBOL_GPL(o2net_fill_node_map); + int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, size_t caller_veclen, u8 target_node, int *status) { @@ -1284,11 +1304,11 @@ static int o2net_check_handshake(struct o2net_sock_container *sc) struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) { - mlog(ML_NOTICE, SC_NODEF_FMT " advertised net protocol " - "version %llu but %llu is required, disconnecting\n", - SC_NODEF_ARGS(sc), - (unsigned long long)be64_to_cpu(hand->protocol_version), - O2NET_PROTOCOL_VERSION); + printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " Advertised net " + "protocol version %llu but %llu is required. " + "Disconnecting.\n", SC_NODEF_ARGS(sc), + (unsigned long long)be64_to_cpu(hand->protocol_version), + O2NET_PROTOCOL_VERSION); /* don't bother reconnecting if its the wrong version. */ o2net_ensure_shutdown(nn, sc, -ENOTCONN); @@ -1302,33 +1322,33 @@ static int o2net_check_handshake(struct o2net_sock_container *sc) */ if (be32_to_cpu(hand->o2net_idle_timeout_ms) != o2net_idle_timeout()) { - mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of " - "%u ms, but we use %u ms locally. 
disconnecting\n", - SC_NODEF_ARGS(sc), - be32_to_cpu(hand->o2net_idle_timeout_ms), - o2net_idle_timeout()); + printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a network " + "idle timeout of %u ms, but we use %u ms locally. " + "Disconnecting.\n", SC_NODEF_ARGS(sc), + be32_to_cpu(hand->o2net_idle_timeout_ms), + o2net_idle_timeout()); o2net_ensure_shutdown(nn, sc, -ENOTCONN); return -1; } if (be32_to_cpu(hand->o2net_keepalive_delay_ms) != o2net_keepalive_delay()) { - mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of " - "%u ms, but we use %u ms locally. disconnecting\n", - SC_NODEF_ARGS(sc), - be32_to_cpu(hand->o2net_keepalive_delay_ms), - o2net_keepalive_delay()); + printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a keepalive " + "delay of %u ms, but we use %u ms locally. " + "Disconnecting.\n", SC_NODEF_ARGS(sc), + be32_to_cpu(hand->o2net_keepalive_delay_ms), + o2net_keepalive_delay()); o2net_ensure_shutdown(nn, sc, -ENOTCONN); return -1; } if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) != O2HB_MAX_WRITE_TIMEOUT_MS) { - mlog(ML_NOTICE, SC_NODEF_FMT " uses a heartbeat timeout of " - "%u ms, but we use %u ms locally. disconnecting\n", - SC_NODEF_ARGS(sc), - be32_to_cpu(hand->o2hb_heartbeat_timeout_ms), - O2HB_MAX_WRITE_TIMEOUT_MS); + printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a heartbeat " + "timeout of %u ms, but we use %u ms locally. " + "Disconnecting.\n", SC_NODEF_ARGS(sc), + be32_to_cpu(hand->o2hb_heartbeat_timeout_ms), + O2HB_MAX_WRITE_TIMEOUT_MS); o2net_ensure_shutdown(nn, sc, -ENOTCONN); return -1; } @@ -1539,28 +1559,16 @@ static void o2net_idle_timer(unsigned long data) { struct o2net_sock_container *sc = (struct o2net_sock_container *)data; struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); - #ifdef CONFIG_DEBUG_FS - ktime_t now = ktime_get(); + unsigned long msecs = ktime_to_ms(ktime_get()) - + ktime_to_ms(sc->sc_tv_timer); +#else + unsigned long msecs = o2net_idle_timeout(); #endif - printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " - "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), - o2net_idle_timeout() / 1000, - o2net_idle_timeout() % 1000); - -#ifdef CONFIG_DEBUG_FS - mlog(ML_NOTICE, "Here are some times that might help debug the " - "situation: (Timer: %lld, Now %lld, DataReady %lld, Advance %lld-%lld, " - "Key 0x%08x, Func %u, FuncTime %lld-%lld)\n", - (long long)ktime_to_us(sc->sc_tv_timer), (long long)ktime_to_us(now), - (long long)ktime_to_us(sc->sc_tv_data_ready), - (long long)ktime_to_us(sc->sc_tv_advance_start), - (long long)ktime_to_us(sc->sc_tv_advance_stop), - sc->sc_msg_key, sc->sc_msg_type, - (long long)ktime_to_us(sc->sc_tv_func_start), - (long long)ktime_to_us(sc->sc_tv_func_stop)); -#endif + printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been " + "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc), + msecs / 1000, msecs % 1000); /* * Initialize the nn_timeout so that the next connection attempt @@ -1693,8 +1701,8 @@ static void o2net_start_connect(struct work_struct *work) out: if (ret) { - mlog(ML_NOTICE, "connect attempt to " SC_NODEF_FMT " failed " - "with errno %d\n", SC_NODEF_ARGS(sc), ret); + printk(KERN_NOTICE "o2net: Connect attempt to " SC_NODEF_FMT + " failed with errno %d\n", SC_NODEF_ARGS(sc), ret); /* 0 err so that another will be queued and attempted * from set_nn_state */ if (sc) @@ -1717,8 +1725,8 @@ static void o2net_connect_expired(struct work_struct *work) spin_lock(&nn->nn_lock); if (!nn->nn_sc_valid) { - mlog(ML_ERROR, "no connection established 
with node %u after " - "%u.%u seconds, giving up and returning errors.\n", + printk(KERN_NOTICE "o2net: No connection established with " + "node %u after %u.%u seconds, giving up.\n", o2net_num_from_nn(nn), o2net_idle_timeout() / 1000, o2net_idle_timeout() % 1000); @@ -1861,21 +1869,21 @@ static int o2net_accept_one(struct socket *sock) node = o2nm_get_node_by_ip(sin.sin_addr.s_addr); if (node == NULL) { - mlog(ML_NOTICE, "attempt to connect from unknown node at %pI4:%d\n", - &sin.sin_addr.s_addr, ntohs(sin.sin_port)); + printk(KERN_NOTICE "o2net: Attempt to connect from unknown " + "node at %pI4:%d\n", &sin.sin_addr.s_addr, + ntohs(sin.sin_port)); ret = -EINVAL; goto out; } if (o2nm_this_node() >= node->nd_num) { local_node = o2nm_get_node_by_num(o2nm_this_node()); - mlog(ML_NOTICE, "unexpected connect attempt seen at node '%s' (" - "%u, %pI4:%d) from node '%s' (%u, %pI4:%d)\n", - local_node->nd_name, local_node->nd_num, - &(local_node->nd_ipv4_address), - ntohs(local_node->nd_ipv4_port), - node->nd_name, node->nd_num, &sin.sin_addr.s_addr, - ntohs(sin.sin_port)); + printk(KERN_NOTICE "o2net: Unexpected connect attempt seen " + "at node '%s' (%u, %pI4:%d) from node '%s' (%u, " + "%pI4:%d)\n", local_node->nd_name, local_node->nd_num, + &(local_node->nd_ipv4_address), + ntohs(local_node->nd_ipv4_port), node->nd_name, + node->nd_num, &sin.sin_addr.s_addr, ntohs(sin.sin_port)); ret = -EINVAL; goto out; } @@ -1900,10 +1908,10 @@ static int o2net_accept_one(struct socket *sock) ret = 0; spin_unlock(&nn->nn_lock); if (ret) { - mlog(ML_NOTICE, "attempt to connect from node '%s' at " - "%pI4:%d but it already has an open connection\n", - node->nd_name, &sin.sin_addr.s_addr, - ntohs(sin.sin_port)); + printk(KERN_NOTICE "o2net: Attempt to connect from node '%s' " + "at %pI4:%d but it already has an open connection\n", + node->nd_name, &sin.sin_addr.s_addr, + ntohs(sin.sin_port)); goto out; } @@ -1983,7 +1991,7 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port) ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); if (ret < 0) { - mlog(ML_ERROR, "unable to create socket, ret=%d\n", ret); + printk(KERN_ERR "o2net: Error %d while creating socket\n", ret); goto out; } @@ -2000,16 +2008,15 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port) sock->sk->sk_reuse = 1; ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); if (ret < 0) { - mlog(ML_ERROR, "unable to bind socket at %pI4:%u, " - "ret=%d\n", &addr, ntohs(port), ret); + printk(KERN_ERR "o2net: Error %d while binding socket at " + "%pI4:%u\n", ret, &addr, ntohs(port)); goto out; } ret = sock->ops->listen(sock, 64); - if (ret < 0) { - mlog(ML_ERROR, "unable to listen on %pI4:%u, ret=%d\n", - &addr, ntohs(port), ret); - } + if (ret < 0) + printk(KERN_ERR "o2net: Error %d while listening on %pI4:%u\n", + ret, &addr, ntohs(port)); out: if (ret) { diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index fd6179eb26d4..5bada2a69b50 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h @@ -106,6 +106,8 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, struct list_head *unreg_list); void o2net_unregister_handler_list(struct list_head *list); +void o2net_fill_node_map(unsigned long *map, unsigned bytes); + struct o2nm_node; int o2net_register_hb_callbacks(void); void o2net_unregister_hb_callbacks(void); diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index e2878b5895fb..8fe4e2892ab9 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1184,8 +1184,7 @@ static int 
__ocfs2_delete_entry(handle_t *handle, struct inode *dir, if (pde) le16_add_cpu(&pde->rec_len, le16_to_cpu(de->rec_len)); - else - de->inode = 0; + de->inode = 0; dir->i_version++; ocfs2_journal_dirty(handle, bh); goto bail; diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index d602abb51b61..a5952ceecba5 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -859,8 +859,8 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm); void dlm_wait_for_recovery(struct dlm_ctxt *dlm); void dlm_kick_recovery_thread(struct dlm_ctxt *dlm); int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node); -int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout); -int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout); +void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout); +void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout); void dlm_put(struct dlm_ctxt *dlm); struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); @@ -877,9 +877,8 @@ static inline void dlm_lockres_get(struct dlm_lock_resource *res) kref_get(&res->refs); } void dlm_lockres_put(struct dlm_lock_resource *res); -void __dlm_unhash_lockres(struct dlm_lock_resource *res); -void __dlm_insert_lockres(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res); +void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); +void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, const char *name, unsigned int len, @@ -902,46 +901,15 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, const char *name, unsigned int namelen); -#define dlm_lockres_set_refmap_bit(bit,res) \ - __dlm_lockres_set_refmap_bit(bit,res,__FILE__,__LINE__) -#define dlm_lockres_clear_refmap_bit(bit,res) \ - __dlm_lockres_clear_refmap_bit(bit,res,__FILE__,__LINE__) +void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, int bit); +void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, int bit); -static inline void __dlm_lockres_set_refmap_bit(int bit, - struct dlm_lock_resource *res, - const char *file, - int line) -{ - //printk("%s:%d:%.*s: setting bit %d\n", file, line, - // res->lockname.len, res->lockname.name, bit); - set_bit(bit, res->refmap); -} - -static inline void __dlm_lockres_clear_refmap_bit(int bit, - struct dlm_lock_resource *res, - const char *file, - int line) -{ - //printk("%s:%d:%.*s: clearing bit %d\n", file, line, - // res->lockname.len, res->lockname.name, bit); - clear_bit(bit, res->refmap); -} - -void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res, - const char *file, - int line); -void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res, - int new_lockres, - const char *file, - int line); -#define dlm_lockres_drop_inflight_ref(d,r) \ - __dlm_lockres_drop_inflight_ref(d,r,__FILE__,__LINE__) -#define dlm_lockres_grab_inflight_ref(d,r) \ - __dlm_lockres_grab_inflight_ref(d,r,0,__FILE__,__LINE__) -#define dlm_lockres_grab_inflight_ref_new(d,r) \ - __dlm_lockres_grab_inflight_ref(d,r,1,__FILE__,__LINE__) +void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void dlm_queue_bast(struct dlm_ctxt *dlm, struct 
dlm_lock *lock); diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 56f82cb912e3..0e28e242226d 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -30,6 +30,7 @@ #include <linux/sysctl.h> #include <linux/spinlock.h> #include <linux/debugfs.h> +#include <linux/export.h> #include "cluster/heartbeat.h" #include "cluster/nodemanager.h" diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 6ed6b95dcf93..92f2ead0fab6 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -157,16 +157,18 @@ static int dlm_protocol_compare(struct dlm_protocol_version *existing, static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm); -void __dlm_unhash_lockres(struct dlm_lock_resource *lockres) +void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { - if (!hlist_unhashed(&lockres->hash_node)) { - hlist_del_init(&lockres->hash_node); - dlm_lockres_put(lockres); - } + if (hlist_unhashed(&res->hash_node)) + return; + + mlog(0, "%s: Unhash res %.*s\n", dlm->name, res->lockname.len, + res->lockname.name); + hlist_del_init(&res->hash_node); + dlm_lockres_put(res); } -void __dlm_insert_lockres(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res) +void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { struct hlist_head *bucket; struct qstr *q; @@ -180,6 +182,9 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm, dlm_lockres_get(res); hlist_add_head(&res->hash_node, bucket); + + mlog(0, "%s: Hash res %.*s\n", dlm->name, res->lockname.len, + res->lockname.name); } struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, @@ -539,17 +544,17 @@ again: static void __dlm_print_nodes(struct dlm_ctxt *dlm) { - int node = -1; + int node = -1, num = 0; assert_spin_locked(&dlm->spinlock); - printk(KERN_NOTICE "o2dlm: Nodes in domain %s: ", dlm->name); - + printk("( "); while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, node + 1)) < O2NM_MAX_NODES) { printk("%d ", node); + ++num; } - printk("\n"); + printk(") %u nodes\n", num); } static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, @@ -566,11 +571,10 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, node = exit_msg->node_idx; - printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s\n", node, dlm->name); - spin_lock(&dlm->spinlock); clear_bit(node, dlm->domain_map); clear_bit(node, dlm->exit_domain_map); + printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s ", node, dlm->name); __dlm_print_nodes(dlm); /* notify anything attached to the heartbeat events */ @@ -755,6 +759,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm) dlm_mark_domain_leaving(dlm); dlm_leave_domain(dlm); + printk(KERN_NOTICE "o2dlm: Leaving domain %s\n", dlm->name); dlm_force_free_mles(dlm); dlm_complete_dlm_shutdown(dlm); } @@ -970,7 +975,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, clear_bit(assert->node_idx, dlm->exit_domain_map); __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); - printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n", + printk(KERN_NOTICE "o2dlm: Node %u joins domain %s ", assert->node_idx, dlm->name); __dlm_print_nodes(dlm); @@ -1701,8 +1706,10 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) bail: spin_lock(&dlm->spinlock); __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); - if (!status) + if (!status) { + printk(KERN_NOTICE "o2dlm: Joining domain %s ", dlm->name); __dlm_print_nodes(dlm); + } 
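__dlm_print_nodes() now prints the membership as a parenthesised list followed by a count, and the callers emit the "joins/leaves domain" prefix without a newline so the whole line comes out in one piece. A small self-contained version of that bitmap walk (a plain test-bit helper replaces find_next_bit):

#include <stdio.h>

#define MAX_NODES       255

static int node_is_set(const unsigned long *map, int node)
{
        return (map[node / (8 * sizeof(unsigned long))] >>
                (node % (8 * sizeof(unsigned long)))) & 1UL;
}

/* Print the "( 0 3 7 ) 3 nodes" style membership line the new code uses. */
static void print_nodes(const unsigned long *domain_map)
{
        int node, num = 0;

        printf("( ");
        for (node = 0; node < MAX_NODES; node++) {
                if (!node_is_set(domain_map, node))
                        continue;
                printf("%d ", node);
                num++;
        }
        printf(") %d nodes\n", num);
}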
spin_unlock(&dlm->spinlock); if (ctxt) { @@ -2131,13 +2138,6 @@ struct dlm_ctxt * dlm_register_domain(const char *domain, goto leave; } - if (!o2hb_check_local_node_heartbeating()) { - mlog(ML_ERROR, "the local node has not been configured, or is " - "not heartbeating\n"); - ret = -EPROTO; - goto leave; - } - mlog(0, "register called for domain \"%s\"\n", domain); retry: diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 8d39e0fd66f7..975810b98492 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -183,10 +183,6 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm, kick_thread = 1; } } - /* reduce the inflight count, this may result in the lockres - * being purged below during calc_usage */ - if (lock->ml.node == dlm->node_num) - dlm_lockres_drop_inflight_ref(dlm, res); spin_unlock(&res->spinlock); wake_up(&res->wq); @@ -231,10 +227,16 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm, lock->ml.type, res->lockname.len, res->lockname.name, flags); + /* + * Wait if resource is getting recovered, remastered, etc. + * If the resource was remastered and new owner is self, then exit. + */ spin_lock(&res->spinlock); - - /* will exit this call with spinlock held */ __dlm_wait_on_lockres(res); + if (res->owner == dlm->node_num) { + spin_unlock(&res->spinlock); + return DLM_RECOVERING; + } res->state |= DLM_LOCK_RES_IN_PROGRESS; /* add lock to local (secondary) queue */ @@ -319,27 +321,23 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm, tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create, sizeof(create), res->owner, &status); if (tmpret >= 0) { - // successfully sent and received - ret = status; // this is already a dlm_status + ret = status; if (ret == DLM_REJECTED) { - mlog(ML_ERROR, "%s:%.*s: BUG. this is a stale lockres " - "no longer owned by %u. that node is coming back " - "up currently.\n", dlm->name, create.namelen, + mlog(ML_ERROR, "%s: res %.*s, Stale lockres no longer " + "owned by node %u. That node is coming back up " + "currently.\n", dlm->name, create.namelen, create.name, res->owner); dlm_print_one_lock_resource(res); BUG(); } } else { - mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " - "node %u\n", tmpret, DLM_CREATE_LOCK_MSG, dlm->key, - res->owner); - if (dlm_is_host_down(tmpret)) { + mlog(ML_ERROR, "%s: res %.*s, Error %d send CREATE LOCK to " + "node %u\n", dlm->name, create.namelen, create.name, + tmpret, res->owner); + if (dlm_is_host_down(tmpret)) ret = DLM_RECOVERING; - mlog(0, "node %u died so returning DLM_RECOVERING " - "from lock message!\n", res->owner); - } else { + else ret = dlm_err_to_dlm_status(tmpret); - } } return ret; @@ -440,7 +438,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie, /* zero memory only if kernel-allocated */ lksb = kzalloc(sizeof(*lksb), GFP_NOFS); if (!lksb) { - kfree(lock); + kmem_cache_free(dlm_lock_cache, lock); return NULL; } kernel_allocated = 1; @@ -718,18 +716,10 @@ retry_lock: if (status == DLM_RECOVERING || status == DLM_MIGRATING || status == DLM_FORWARD) { - mlog(0, "retrying lock with migration/" - "recovery/in progress\n"); msleep(100); - /* no waiting for dlm_reco_thread */ if (recovery) { if (status != DLM_RECOVERING) goto retry_lock; - - mlog(0, "%s: got RECOVERING " - "for $RECOVERY lock, master " - "was %u\n", dlm->name, - res->owner); /* wait to see the node go down, then * drop down and allow the lockres to * get cleaned up. need to remaster. 
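One small but real fix in dlm_new_lock(): the lock object comes from dlm_lock_cache, so the error path taken when the lksb allocation fails must return it with kmem_cache_free() rather than kfree(). The shape of that error path, with malloc/calloc standing in for kmem_cache_alloc()/kzalloc():

#include <stdlib.h>

struct lksb { int flags; };
struct lock { struct lksb *lksb; };

/*
 * On a partial-failure path, undo each allocation with the deallocator
 * that matches how the object was obtained (kmem_cache_free() for a
 * kmem_cache_alloc()'d object in the kernel case).
 */
static struct lock *new_lock(void)
{
        struct lock *lock = malloc(sizeof(*lock));      /* kmem_cache_alloc() */

        if (!lock)
                return NULL;

        lock->lksb = calloc(1, sizeof(*lock->lksb));    /* kzalloc() */
        if (!lock->lksb) {
                free(lock);     /* must pair with the allocator used above */
                return NULL;
        }
        return lock;
}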
*/ @@ -741,6 +731,14 @@ retry_lock: } } + /* Inflight taken in dlm_get_lock_resource() is dropped here */ + spin_lock(&res->spinlock); + dlm_lockres_drop_inflight_ref(dlm, res); + spin_unlock(&res->spinlock); + + dlm_lockres_calc_usage(dlm, res); + dlm_kick_thread(dlm, res); + if (status != DLM_NORMAL) { lock->lksb->flags &= ~DLM_LKSB_GET_LVB; if (status != DLM_NOTQUEUED) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 11eefb8c12e9..005261c333b0 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -631,39 +631,54 @@ error: return NULL; } -void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res, - int new_lockres, - const char *file, - int line) +void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, int bit) { - if (!new_lockres) - assert_spin_locked(&res->spinlock); + assert_spin_locked(&res->spinlock); + + mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len, + res->lockname.name, bit, __builtin_return_address(0)); + + set_bit(bit, res->refmap); +} + +void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, int bit) +{ + assert_spin_locked(&res->spinlock); + + mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len, + res->lockname.name, bit, __builtin_return_address(0)); + + clear_bit(bit, res->refmap); +} + + +void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + assert_spin_locked(&res->spinlock); - if (!test_bit(dlm->node_num, res->refmap)) { - BUG_ON(res->inflight_locks != 0); - dlm_lockres_set_refmap_bit(dlm->node_num, res); - } res->inflight_locks++; - mlog(0, "%s:%.*s: inflight++: now %u\n", - dlm->name, res->lockname.len, res->lockname.name, - res->inflight_locks); + + mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name, + res->lockname.len, res->lockname.name, res->inflight_locks, + __builtin_return_address(0)); } -void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res, - const char *file, - int line) +void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) { assert_spin_locked(&res->spinlock); BUG_ON(res->inflight_locks == 0); + res->inflight_locks--; - mlog(0, "%s:%.*s: inflight--: now %u\n", - dlm->name, res->lockname.len, res->lockname.name, - res->inflight_locks); - if (res->inflight_locks == 0) - dlm_lockres_clear_refmap_bit(dlm->node_num, res); + + mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name, + res->lockname.len, res->lockname.name, res->inflight_locks, + __builtin_return_address(0)); + wake_up(&res->wq); } @@ -697,7 +712,6 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, unsigned int hash; int tries = 0; int bit, wait_on_recovery = 0; - int drop_inflight_if_nonlocal = 0; BUG_ON(!lockid); @@ -709,36 +723,33 @@ lookup: spin_lock(&dlm->spinlock); tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash); if (tmpres) { - int dropping_ref = 0; - spin_unlock(&dlm->spinlock); - spin_lock(&tmpres->spinlock); - /* We wait for the other thread that is mastering the resource */ + /* Wait on the thread that is mastering the resource */ if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { __dlm_wait_on_lockres(tmpres); BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN); + spin_unlock(&tmpres->spinlock); + dlm_lockres_put(tmpres); + tmpres = NULL; + goto lookup; } - if (tmpres->owner == dlm->node_num) { - BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF); - 
dlm_lockres_grab_inflight_ref(dlm, tmpres); - } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) - dropping_ref = 1; - spin_unlock(&tmpres->spinlock); - - /* wait until done messaging the master, drop our ref to allow - * the lockres to be purged, start over. */ - if (dropping_ref) { - spin_lock(&tmpres->spinlock); - __dlm_wait_on_lockres_flags(tmpres, DLM_LOCK_RES_DROPPING_REF); + /* Wait on the resource purge to complete before continuing */ + if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) { + BUG_ON(tmpres->owner == dlm->node_num); + __dlm_wait_on_lockres_flags(tmpres, + DLM_LOCK_RES_DROPPING_REF); spin_unlock(&tmpres->spinlock); dlm_lockres_put(tmpres); tmpres = NULL; goto lookup; } - mlog(0, "found in hash!\n"); + /* Grab inflight ref to pin the resource */ + dlm_lockres_grab_inflight_ref(dlm, tmpres); + + spin_unlock(&tmpres->spinlock); if (res) dlm_lockres_put(res); res = tmpres; @@ -829,8 +840,8 @@ lookup: * but they might own this lockres. wait on them. */ bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); if (bit < O2NM_MAX_NODES) { - mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to " - "recover before lock mastery can begin\n", + mlog(0, "%s: res %.*s, At least one node (%d) " + "to recover before lock mastery can begin\n", dlm->name, namelen, (char *)lockid, bit); wait_on_recovery = 1; } @@ -843,12 +854,11 @@ lookup: /* finally add the lockres to its hash bucket */ __dlm_insert_lockres(dlm, res); - /* since this lockres is new it doesn't not require the spinlock */ - dlm_lockres_grab_inflight_ref_new(dlm, res); - /* if this node does not become the master make sure to drop - * this inflight reference below */ - drop_inflight_if_nonlocal = 1; + /* Grab inflight ref to pin the resource */ + spin_lock(&res->spinlock); + dlm_lockres_grab_inflight_ref(dlm, res); + spin_unlock(&res->spinlock); /* get an extra ref on the mle in case this is a BLOCK * if so, the creator of the BLOCK may try to put the last @@ -864,8 +874,8 @@ redo_request: * dlm spinlock would be detectable be a change on the mle, * so we only need to clear out the recovery map once. */ if (dlm_is_recovery_lock(lockid, namelen)) { - mlog(ML_NOTICE, "%s: recovery map is not empty, but " - "must master $RECOVERY lock now\n", dlm->name); + mlog(0, "%s: Recovery map is not empty, but must " + "master $RECOVERY lock now\n", dlm->name); if (!dlm_pre_master_reco_lockres(dlm, res)) wait_on_recovery = 0; else { @@ -883,8 +893,8 @@ redo_request: spin_lock(&dlm->spinlock); bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); if (bit < O2NM_MAX_NODES) { - mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to " - "recover before lock mastery can begin\n", + mlog(0, "%s: res %.*s, At least one node (%d) " + "to recover before lock mastery can begin\n", dlm->name, namelen, (char *)lockid, bit); wait_on_recovery = 1; } else @@ -913,8 +923,8 @@ redo_request: * yet, keep going until it does. this is how the * master will know that asserts are needed back to * the lower nodes. 
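dlm_get_lock_resource() now treats both "still being mastered" and "dropping its ref" as transient states: wait them out, drop the lookup reference, and redo the lookup, so the inflight pin is only ever taken on a settled resource. The control flow, reduced to a self-contained sketch with callbacks standing in for the hash lookup and the wait helpers:

#include <stddef.h>

enum res_state { RES_STABLE, RES_BEING_MASTERED, RES_DROPPING_REF };

struct resource { enum res_state state; int refs; int inflight; };

/*
 * Look up a resource, wait out any transient state and retry, and pin
 * the result with an inflight reference only once it is stable.
 */
static struct resource *get_resource(struct resource *(*lookup)(void),
                                     void (*wait_stable)(struct resource *))
{
        struct resource *res;

        for (;;) {
                res = lookup();                 /* returns with a reference held */
                if (!res)
                        return NULL;
                if (res->state == RES_STABLE)
                        break;
                wait_stable(res);               /* owner known / purge finished */
                res->refs--;                    /* drop the ref, start over */
        }
        res->inflight++;                        /* pin against purging */
        return res;
}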
*/ - mlog(0, "%s:%.*s: requests only up to %u but master " - "is %u, keep going\n", dlm->name, namelen, + mlog(0, "%s: res %.*s, Requests only up to %u but " + "master is %u, keep going\n", dlm->name, namelen, lockid, nodenum, mle->master); } } @@ -924,13 +934,12 @@ wait: ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); if (ret < 0) { wait_on_recovery = 1; - mlog(0, "%s:%.*s: node map changed, redo the " - "master request now, blocked=%d\n", - dlm->name, res->lockname.len, + mlog(0, "%s: res %.*s, Node map changed, redo the master " + "request now, blocked=%d\n", dlm->name, res->lockname.len, res->lockname.name, blocked); if (++tries > 20) { - mlog(ML_ERROR, "%s:%.*s: spinning on " - "dlm_wait_for_lock_mastery, blocked=%d\n", + mlog(ML_ERROR, "%s: res %.*s, Spinning on " + "dlm_wait_for_lock_mastery, blocked = %d\n", dlm->name, res->lockname.len, res->lockname.name, blocked); dlm_print_one_lock_resource(res); @@ -940,7 +949,8 @@ wait: goto redo_request; } - mlog(0, "lockres mastered by %u\n", res->owner); + mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len, + res->lockname.name, res->owner); /* make sure we never continue without this */ BUG_ON(res->owner == O2NM_MAX_NODES); @@ -952,8 +962,6 @@ wait: wake_waiters: spin_lock(&res->spinlock); - if (res->owner != dlm->node_num && drop_inflight_if_nonlocal) - dlm_lockres_drop_inflight_ref(dlm, res); res->state &= ~DLM_LOCK_RES_IN_PROGRESS; spin_unlock(&res->spinlock); wake_up(&res->wq); @@ -1426,9 +1434,7 @@ way_up_top: } if (res->owner == dlm->node_num) { - mlog(0, "%s:%.*s: setting bit %u in refmap\n", - dlm->name, namelen, name, request->node_idx); - dlm_lockres_set_refmap_bit(request->node_idx, res); + dlm_lockres_set_refmap_bit(dlm, res, request->node_idx); spin_unlock(&res->spinlock); response = DLM_MASTER_RESP_YES; if (mle) @@ -1493,10 +1499,8 @@ way_up_top: * go back and clean the mles on any * other nodes */ dispatch_assert = 1; - dlm_lockres_set_refmap_bit(request->node_idx, res); - mlog(0, "%s:%.*s: setting bit %u in refmap\n", - dlm->name, namelen, name, - request->node_idx); + dlm_lockres_set_refmap_bit(dlm, res, + request->node_idx); } else response = DLM_MASTER_RESP_NO; } else { @@ -1702,7 +1706,7 @@ again: "lockres, set the bit in the refmap\n", namelen, lockname, to); spin_lock(&res->spinlock); - dlm_lockres_set_refmap_bit(to, res); + dlm_lockres_set_refmap_bit(dlm, res, to); spin_unlock(&res->spinlock); } } @@ -2187,8 +2191,6 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) namelen = res->lockname.len; BUG_ON(namelen > O2NM_MAX_NAME_LEN); - mlog(0, "%s:%.*s: sending deref to %d\n", - dlm->name, namelen, lockname, res->owner); memset(&deref, 0, sizeof(deref)); deref.node_idx = dlm->node_num; deref.namelen = namelen; @@ -2197,14 +2199,12 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, &deref, sizeof(deref), res->owner, &r); if (ret < 0) - mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " - "node %u\n", ret, DLM_DEREF_LOCKRES_MSG, dlm->key, - res->owner); + mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n", + dlm->name, namelen, lockname, ret, res->owner); else if (r < 0) { /* BAD. other node says I did not have a ref. 
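The node-map-changed case still loops back through redo_request, and the spin counter that fires after twenty attempts is kept, now with a tidier message. As a bare pattern (what to do after the diagnostic, give up or keep retrying, is a policy choice; this sketch gives up):

#include <stdio.h>

/*
 * Retry a transient failure a bounded number of times; once the bound is
 * hit, emit a diagnostic instead of spinning quietly. try_once() stands
 * in for one request/wait-for-mastery pass.
 */
static int master_with_retries(int (*try_once)(void), int max_tries)
{
        int tries = 0, ret;

        while ((ret = try_once()) < 0) {
                if (++tries > max_tries) {
                        fprintf(stderr, "still blocked after %d tries\n", tries);
                        break;
                }
        }
        return ret;
}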
*/ - mlog(ML_ERROR,"while dropping ref on %s:%.*s " - "(master=%u) got %d.\n", dlm->name, namelen, - lockname, res->owner, r); + mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n", + dlm->name, namelen, lockname, res->owner, r); dlm_print_one_lock_resource(res); BUG(); } @@ -2260,7 +2260,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, else { BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); if (test_bit(node, res->refmap)) { - dlm_lockres_clear_refmap_bit(node, res); + dlm_lockres_clear_refmap_bit(dlm, res, node); cleared = 1; } } @@ -2320,7 +2320,7 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); if (test_bit(node, res->refmap)) { __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); - dlm_lockres_clear_refmap_bit(node, res); + dlm_lockres_clear_refmap_bit(dlm, res, node); cleared = 1; } spin_unlock(&res->spinlock); @@ -2802,7 +2802,8 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, BUG_ON(!list_empty(&lock->bast_list)); BUG_ON(lock->ast_pending); BUG_ON(lock->bast_pending); - dlm_lockres_clear_refmap_bit(lock->ml.node, res); + dlm_lockres_clear_refmap_bit(dlm, res, + lock->ml.node); list_del_init(&lock->list); dlm_lock_put(lock); /* In a normal unlock, we would have added a @@ -2823,7 +2824,7 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, mlog(0, "%s:%.*s: node %u had a ref to this " "migrating lockres, clearing\n", dlm->name, res->lockname.len, res->lockname.name, bit); - dlm_lockres_clear_refmap_bit(bit, res); + dlm_lockres_clear_refmap_bit(dlm, res, bit); } bit++; } @@ -2916,9 +2917,9 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm, &migrate, sizeof(migrate), nodenum, &status); if (ret < 0) { - mlog(ML_ERROR, "Error %d when sending message %u (key " - "0x%x) to node %u\n", ret, DLM_MIGRATE_REQUEST_MSG, - dlm->key, nodenum); + mlog(ML_ERROR, "%s: res %.*s, Error %d send " + "MIGRATE_REQUEST to node %u\n", dlm->name, + migrate.namelen, migrate.name, ret, nodenum); if (!dlm_is_host_down(ret)) { mlog(ML_ERROR, "unhandled error=%d!\n", ret); BUG(); @@ -2937,7 +2938,7 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm, dlm->name, res->lockname.len, res->lockname.name, nodenum); spin_lock(&res->spinlock); - dlm_lockres_set_refmap_bit(nodenum, res); + dlm_lockres_set_refmap_bit(dlm, res, nodenum); spin_unlock(&res->spinlock); } } @@ -3271,7 +3272,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, * mastery reference here since old_master will briefly have * a reference after the migration completes */ spin_lock(&res->spinlock); - dlm_lockres_set_refmap_bit(old_master, res); + dlm_lockres_set_refmap_bit(dlm, res, old_master); spin_unlock(&res->spinlock); mlog(0, "now time to do a migrate request to other nodes\n"); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 7efab6d28a21..01ebfd0bdad7 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -362,40 +362,38 @@ static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node) } -int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) +void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) { - if (timeout) { - mlog(ML_NOTICE, "%s: waiting %dms for notification of " - "death of node %u\n", dlm->name, timeout, node); + if (dlm_is_node_dead(dlm, node)) + return; + + printk(KERN_NOTICE "o2dlm: Waiting on the death of node %u in " + "domain %s\n", node, dlm->name); + + if 
(timeout) wait_event_timeout(dlm->dlm_reco_thread_wq, - dlm_is_node_dead(dlm, node), - msecs_to_jiffies(timeout)); - } else { - mlog(ML_NOTICE, "%s: waiting indefinitely for notification " - "of death of node %u\n", dlm->name, node); + dlm_is_node_dead(dlm, node), + msecs_to_jiffies(timeout)); + else wait_event(dlm->dlm_reco_thread_wq, dlm_is_node_dead(dlm, node)); - } - /* for now, return 0 */ - return 0; } -int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout) +void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout) { - if (timeout) { - mlog(0, "%s: waiting %dms for notification of " - "recovery of node %u\n", dlm->name, timeout, node); + if (dlm_is_node_recovered(dlm, node)) + return; + + printk(KERN_NOTICE "o2dlm: Waiting on the recovery of node %u in " + "domain %s\n", node, dlm->name); + + if (timeout) wait_event_timeout(dlm->dlm_reco_thread_wq, - dlm_is_node_recovered(dlm, node), - msecs_to_jiffies(timeout)); - } else { - mlog(0, "%s: waiting indefinitely for notification " - "of recovery of node %u\n", dlm->name, node); + dlm_is_node_recovered(dlm, node), + msecs_to_jiffies(timeout)); + else wait_event(dlm->dlm_reco_thread_wq, dlm_is_node_recovered(dlm, node)); - } - /* for now, return 0 */ - return 0; } /* callers of the top-level api calls (dlmlock/dlmunlock) should @@ -430,6 +428,8 @@ static void dlm_begin_recovery(struct dlm_ctxt *dlm) { spin_lock(&dlm->spinlock); BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE); + printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n", + dlm->name, dlm->reco.dead_node); dlm->reco.state |= DLM_RECO_STATE_ACTIVE; spin_unlock(&dlm->spinlock); } @@ -440,9 +440,18 @@ static void dlm_end_recovery(struct dlm_ctxt *dlm) BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE)); dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE; spin_unlock(&dlm->spinlock); + printk(KERN_NOTICE "o2dlm: End recovery on domain %s\n", dlm->name); wake_up(&dlm->reco.event); } +static void dlm_print_recovery_master(struct dlm_ctxt *dlm) +{ + printk(KERN_NOTICE "o2dlm: Node %u (%s) is the Recovery Master for the " + "dead node %u in domain %s\n", dlm->reco.new_master, + (dlm->node_num == dlm->reco.new_master ? 
"me" : "he"), + dlm->reco.dead_node, dlm->name); +} + static int dlm_do_recovery(struct dlm_ctxt *dlm) { int status = 0; @@ -505,9 +514,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) } mlog(0, "another node will master this recovery session.\n"); } - mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n", - dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.new_master, - dlm->node_num, dlm->reco.dead_node); + + dlm_print_recovery_master(dlm); /* it is safe to start everything back up here * because all of the dead node's lock resources @@ -518,15 +526,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) return 0; master_here: - mlog(ML_NOTICE, "(%d) Node %u is the Recovery Master for the Dead Node " - "%u for Domain %s\n", task_pid_nr(dlm->dlm_reco_thread_task), - dlm->node_num, dlm->reco.dead_node, dlm->name); + dlm_print_recovery_master(dlm); status = dlm_remaster_locks(dlm, dlm->reco.dead_node); if (status < 0) { /* we should never hit this anymore */ - mlog(ML_ERROR, "error %d remastering locks for node %u, " - "retrying.\n", status, dlm->reco.dead_node); + mlog(ML_ERROR, "%s: Error %d remastering locks for node %u, " + "retrying.\n", dlm->name, status, dlm->reco.dead_node); /* yield a bit to allow any final network messages * to get handled on remaining nodes */ msleep(100); @@ -567,7 +573,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT); ndata->state = DLM_RECO_NODE_DATA_REQUESTING; - mlog(0, "requesting lock info from node %u\n", + mlog(0, "%s: Requesting lock info from node %u\n", dlm->name, ndata->node_num); if (ndata->node_num == dlm->node_num) { @@ -640,7 +646,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) spin_unlock(&dlm_reco_state_lock); } - mlog(0, "done requesting all lock info\n"); + mlog(0, "%s: Done requesting all lock info\n", dlm->name); /* nodes should be sending reco data now * just need to wait */ @@ -802,10 +808,9 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, /* negative status is handled by caller */ if (ret < 0) - mlog(ML_ERROR, "Error %d when sending message %u (key " - "0x%x) to node %u\n", ret, DLM_LOCK_REQUEST_MSG, - dlm->key, request_from); - + mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u " + "to recover dead node %u\n", dlm->name, ret, + request_from, dead_node); // return from here, then // sleep until all received or error return ret; @@ -956,9 +961,9 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to) ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, sizeof(done_msg), send_to, &tmpret); if (ret < 0) { - mlog(ML_ERROR, "Error %d when sending message %u (key " - "0x%x) to node %u\n", ret, DLM_RECO_DATA_DONE_MSG, - dlm->key, send_to); + mlog(ML_ERROR, "%s: Error %d send RECO_DATA_DONE to node %u " + "to recover dead node %u\n", dlm->name, ret, send_to, + dead_node); if (!dlm_is_host_down(ret)) { BUG(); } @@ -1127,9 +1132,11 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, if (ret < 0) { /* XXX: negative status is not handled. * this will end up killing this node. */ - mlog(ML_ERROR, "Error %d when sending message %u (key " - "0x%x) to node %u\n", ret, DLM_MIG_LOCKRES_MSG, - dlm->key, send_to); + mlog(ML_ERROR, "%s: res %.*s, Error %d send MIG_LOCKRES to " + "node %u (%s)\n", dlm->name, mres->lockname_len, + mres->lockname, ret, send_to, + (orig_flags & DLM_MRES_MIGRATION ? 
+ "migration" : "recovery")); } else { /* might get an -ENOMEM back here */ ret = status; @@ -1767,7 +1774,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, dlm->name, mres->lockname_len, mres->lockname, from); spin_lock(&res->spinlock); - dlm_lockres_set_refmap_bit(from, res); + dlm_lockres_set_refmap_bit(dlm, res, from); spin_unlock(&res->spinlock); added++; break; @@ -1965,7 +1972,7 @@ skip_lvb: mlog(0, "%s:%.*s: added lock for node %u, " "setting refmap bit\n", dlm->name, res->lockname.len, res->lockname.name, ml->node); - dlm_lockres_set_refmap_bit(ml->node, res); + dlm_lockres_set_refmap_bit(dlm, res, ml->node); added++; } spin_unlock(&res->spinlock); @@ -2084,6 +2091,9 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) { if (res->owner == dead_node) { + mlog(0, "%s: res %.*s, Changing owner from %u to %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->owner, new_master); list_del_init(&res->recovering); spin_lock(&res->spinlock); /* new_master has our reference from @@ -2105,40 +2115,30 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, for (i = 0; i < DLM_HASH_BUCKETS; i++) { bucket = dlm_lockres_hash(dlm, i); hlist_for_each_entry(res, hash_iter, bucket, hash_node) { - if (res->state & DLM_LOCK_RES_RECOVERING) { - if (res->owner == dead_node) { - mlog(0, "(this=%u) res %.*s owner=%u " - "was not on recovering list, but " - "clearing state anyway\n", - dlm->node_num, res->lockname.len, - res->lockname.name, new_master); - } else if (res->owner == dlm->node_num) { - mlog(0, "(this=%u) res %.*s owner=%u " - "was not on recovering list, " - "owner is THIS node, clearing\n", - dlm->node_num, res->lockname.len, - res->lockname.name, new_master); - } else - continue; + if (!(res->state & DLM_LOCK_RES_RECOVERING)) + continue; - if (!list_empty(&res->recovering)) { - mlog(0, "%s:%.*s: lockres was " - "marked RECOVERING, owner=%u\n", - dlm->name, res->lockname.len, - res->lockname.name, res->owner); - list_del_init(&res->recovering); - dlm_lockres_put(res); - } - spin_lock(&res->spinlock); - /* new_master has our reference from - * the lock state sent during recovery */ - dlm_change_lockres_owner(dlm, res, new_master); - res->state &= ~DLM_LOCK_RES_RECOVERING; - if (__dlm_lockres_has_locks(res)) - __dlm_dirty_lockres(dlm, res); - spin_unlock(&res->spinlock); - wake_up(&res->wq); + if (res->owner != dead_node && + res->owner != dlm->node_num) + continue; + + if (!list_empty(&res->recovering)) { + list_del_init(&res->recovering); + dlm_lockres_put(res); } + + /* new_master has our reference from + * the lock state sent during recovery */ + mlog(0, "%s: res %.*s, Changing owner from %u to %u\n", + dlm->name, res->lockname.len, res->lockname.name, + res->owner, new_master); + spin_lock(&res->spinlock); + dlm_change_lockres_owner(dlm, res, new_master); + res->state &= ~DLM_LOCK_RES_RECOVERING; + if (__dlm_lockres_has_locks(res)) + __dlm_dirty_lockres(dlm, res); + spin_unlock(&res->spinlock); + wake_up(&res->wq); } } } @@ -2252,12 +2252,12 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, res->lockname.len, res->lockname.name, freed, dead_node); __dlm_print_one_lock_resource(res); } - dlm_lockres_clear_refmap_bit(dead_node, res); + dlm_lockres_clear_refmap_bit(dlm, res, dead_node); } else if (test_bit(dead_node, res->refmap)) { mlog(0, "%s:%.*s: dead node %u had a ref, but had " "no locks and had not purged before dying\n", dlm->name, res->lockname.len, 
res->lockname.name, dead_node); - dlm_lockres_clear_refmap_bit(dead_node, res); + dlm_lockres_clear_refmap_bit(dlm, res, dead_node); } /* do not kick thread yet */ @@ -2324,9 +2324,9 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) dlm_revalidate_lvb(dlm, res, dead_node); if (res->owner == dead_node) { if (res->state & DLM_LOCK_RES_DROPPING_REF) { - mlog(ML_NOTICE, "Ignore %.*s for " + mlog(ML_NOTICE, "%s: res %.*s, Skip " "recovery as it is being freed\n", - res->lockname.len, + dlm->name, res->lockname.len, res->lockname.name); } else dlm_move_lockres_to_recovery_list(dlm, diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 1d6d1d22c471..e73c833fc2a1 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -94,24 +94,26 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res) { int bit; + assert_spin_locked(&res->spinlock); + if (__dlm_lockres_has_locks(res)) return 0; + /* Locks are in the process of being created */ + if (res->inflight_locks) + return 0; + if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY) return 0; if (res->state & DLM_LOCK_RES_RECOVERING) return 0; + /* Another node has this resource with this node as the master */ bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); if (bit < O2NM_MAX_NODES) return 0; - /* - * since the bit for dlm->node_num is not set, inflight_locks better - * be zero - */ - BUG_ON(res->inflight_locks != 0); return 1; } @@ -185,8 +187,6 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, /* clear our bit from the master's refmap, ignore errors */ ret = dlm_drop_lockres_ref(dlm, res); if (ret < 0) { - mlog(ML_ERROR, "%s: deref %.*s failed %d\n", dlm->name, - res->lockname.len, res->lockname.name, ret); if (!dlm_is_host_down(ret)) BUG(); } @@ -209,7 +209,7 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, BUG(); } - __dlm_unhash_lockres(res); + __dlm_unhash_lockres(dlm, res); /* lockres is not in the hash now. drop the flag and wake up * any processes waiting in dlm_get_lock_resource. */ diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index e1ed5e502ff2..81a4cd22f80b 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1692,7 +1692,7 @@ int ocfs2_open_lock(struct inode *inode) mlog(0, "inode %llu take PRMODE open lock\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); - if (ocfs2_mount_local(osb)) + if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb)) goto out; lockres = &OCFS2_I(inode)->ip_open_lockres; @@ -1718,6 +1718,12 @@ int ocfs2_try_open_lock(struct inode *inode, int write) (unsigned long long)OCFS2_I(inode)->ip_blkno, write ? 
"EXMODE" : "PRMODE"); + if (ocfs2_is_hard_readonly(osb)) { + if (write) + status = -EROFS; + goto out; + } + if (ocfs2_mount_local(osb)) goto out; @@ -2298,7 +2304,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, if (ocfs2_is_hard_readonly(osb)) { if (ex) status = -EROFS; - goto bail; + goto getbh; } if (ocfs2_mount_local(osb)) @@ -2356,7 +2362,7 @@ local: mlog_errno(status); goto bail; } - +getbh: if (ret_bh) { status = ocfs2_assign_bh(inode, ret_bh, local_bh); if (status < 0) { @@ -2628,8 +2634,11 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex) BUG_ON(!dl); - if (ocfs2_is_hard_readonly(osb)) - return -EROFS; + if (ocfs2_is_hard_readonly(osb)) { + if (ex) + return -EROFS; + return 0; + } if (ocfs2_mount_local(osb)) return 0; @@ -2647,7 +2656,7 @@ void ocfs2_dentry_unlock(struct dentry *dentry, int ex) struct ocfs2_dentry_lock *dl = dentry->d_fsdata; struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - if (!ocfs2_mount_local(osb)) + if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) ocfs2_cluster_unlock(osb, &dl->dl_lockres, level); } diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 23457b491e8c..2f5b92ef0e53 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -832,6 +832,102 @@ out: return ret; } +int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + int ret; + unsigned int is_last = 0, is_data = 0; + u16 cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits; + u32 cpos, cend, clen, hole_size; + u64 extoff, extlen; + struct buffer_head *di_bh = NULL; + struct ocfs2_extent_rec rec; + + BUG_ON(origin != SEEK_DATA && origin != SEEK_HOLE); + + ret = ocfs2_inode_lock(inode, &di_bh, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + + down_read(&OCFS2_I(inode)->ip_alloc_sem); + + if (*offset >= inode->i_size) { + ret = -ENXIO; + goto out_unlock; + } + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + if (origin == SEEK_HOLE) + *offset = inode->i_size; + goto out_unlock; + } + + clen = 0; + cpos = *offset >> cs_bits; + cend = ocfs2_clusters_for_bytes(inode->i_sb, inode->i_size); + + while (cpos < cend && !is_last) { + ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size, + &rec, &is_last); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + + extoff = cpos; + extoff <<= cs_bits; + + if (rec.e_blkno == 0ULL) { + clen = hole_size; + is_data = 0; + } else { + clen = le16_to_cpu(rec.e_leaf_clusters) - + (cpos - le32_to_cpu(rec.e_cpos)); + is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 
0 : 1; + } + + if ((!is_data && origin == SEEK_HOLE) || + (is_data && origin == SEEK_DATA)) { + if (extoff > *offset) + *offset = extoff; + goto out_unlock; + } + + if (!is_last) + cpos += clen; + } + + if (origin == SEEK_HOLE) { + extoff = cpos; + extoff <<= cs_bits; + extlen = clen; + extlen <<= cs_bits; + + if ((extoff + extlen) > inode->i_size) + extlen = inode->i_size - extoff; + extoff += extlen; + if (extoff > *offset) + *offset = extoff; + goto out_unlock; + } + + ret = -ENXIO; + +out_unlock: + + brelse(di_bh); + + up_read(&OCFS2_I(inode)->ip_alloc_sem); + + ocfs2_inode_unlock(inode, 0); +out: + if (ret && ret != -ENXIO) + ret = -ENXIO; + return ret; +} + int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, struct buffer_head *bhs[], int flags, int (*validate)(struct super_block *sb, diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h index e79d41c2c909..67ea57d2fd59 100644 --- a/fs/ocfs2/extent_map.h +++ b/fs/ocfs2/extent_map.h @@ -53,6 +53,8 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 map_start, u64 map_len); +int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin); + int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster, u32 *num_clusters, struct ocfs2_extent_list *el, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index de4ea1af041b..6e396683c3d4 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1950,6 +1950,9 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, if (ret < 0) mlog_errno(ret); + if (file->f_flags & O_SYNC) + handle->h_sync = 1; + ocfs2_commit_trans(osb, handle); out_inode_unlock: @@ -2052,6 +2055,23 @@ out: return ret; } +static void ocfs2_aiodio_wait(struct inode *inode) +{ + wait_queue_head_t *wq = ocfs2_ioend_wq(inode); + + wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0)); +} + +static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos) +{ + int blockmask = inode->i_sb->s_blocksize - 1; + loff_t final_size = pos + count; + + if ((pos & blockmask) || (final_size & blockmask)) + return 1; + return 0; +} + static int ocfs2_prepare_inode_for_refcount(struct inode *inode, struct file *file, loff_t pos, size_t count, @@ -2230,6 +2250,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); int full_coherency = !(osb->s_mount_opt & OCFS2_MOUNT_COHERENCY_BUFFERED); + int unaligned_dio = 0; trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -2297,6 +2318,10 @@ relock: goto out; } + if (direct_io && !is_sync_kiocb(iocb)) + unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left, + *ppos); + /* * We can't complete the direct I/O as requested, fall back to * buffered I/O. @@ -2311,6 +2336,18 @@ relock: goto relock; } + if (unaligned_dio) { + /* + * Wait on previous unaligned aio to complete before + * proceeding. + */ + ocfs2_aiodio_wait(inode); + + /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */ + atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio); + ocfs2_iocb_set_unaligned_aio(iocb); + } + /* * To later detect whether a journal commit for sync writes is * necessary, we sample i_size, and cluster count here. 
@@ -2382,8 +2419,12 @@ out_dio: if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { rw_level = -1; have_alloc_sem = 0; + unaligned_dio = 0; } + if (unaligned_dio) + atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio); + out: if (rw_level != -1) ocfs2_rw_unlock(inode, rw_level); @@ -2591,6 +2632,57 @@ bail: return ret; } +/* Refer generic_file_llseek_unlocked() */ +static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin) +{ + struct inode *inode = file->f_mapping->host; + int ret = 0; + + mutex_lock(&inode->i_mutex); + + switch (origin) { + case SEEK_SET: + break; + case SEEK_END: + offset += inode->i_size; + break; + case SEEK_CUR: + if (offset == 0) { + offset = file->f_pos; + goto out; + } + offset += file->f_pos; + break; + case SEEK_DATA: + case SEEK_HOLE: + ret = ocfs2_seek_data_hole_offset(file, &offset, origin); + if (ret) + goto out; + break; + default: + ret = -EINVAL; + goto out; + } + + if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) + ret = -EINVAL; + if (!ret && offset > inode->i_sb->s_maxbytes) + ret = -EINVAL; + if (ret) + goto out; + + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + +out: + mutex_unlock(&inode->i_mutex); + if (ret) + return ret; + return offset; +} + const struct inode_operations ocfs2_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, @@ -2615,7 +2707,7 @@ const struct inode_operations ocfs2_special_file_iops = { * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! */ const struct file_operations ocfs2_fops = { - .llseek = generic_file_llseek, + .llseek = ocfs2_file_llseek, .read = do_sync_read, .write = do_sync_write, .mmap = ocfs2_mmap, @@ -2663,7 +2755,7 @@ const struct file_operations ocfs2_dops = { * the cluster. */ const struct file_operations ocfs2_fops_no_plocks = { - .llseek = generic_file_llseek, + .llseek = ocfs2_file_llseek, .read = do_sync_read, .write = do_sync_write, .mmap = ocfs2_mmap, diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index a22d2c098890..17454a904d7b 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -951,7 +951,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode, trace_ocfs2_cleanup_delete_inode( (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data); if (sync_data) - write_inode_now(inode, 1); + filemap_write_and_wait(inode->i_mapping); truncate_inode_pages(&inode->i_data, 0); } diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 1c508b149b3a..88924a3133fa 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -43,6 +43,9 @@ struct ocfs2_inode_info /* protects extended attribute changes on this inode */ struct rw_semaphore ip_xattr_sem; + /* Number of outstanding AIO's which are not page aligned */ + atomic_t ip_unaligned_aio; + /* These fields are protected by ip_lock */ spinlock_t ip_lock; u32 ip_open_count; diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index bc91072b7219..726ff265b296 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -122,7 +122,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) & (OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) { if (!capable(CAP_LINUX_IMMUTABLE)) - goto bail_unlock; + goto bail_commit; } ocfs2_inode->ip_attr = flags; @@ -132,6 +132,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, if (status < 0) mlog_errno(status); +bail_commit: ocfs2_commit_trans(osb, handle); bail_unlock: ocfs2_inode_unlock(inode, 1); @@ -381,7 +382,7 @@ int ocfs2_info_handle_freeinode(struct inode 
*inode, if (!oifi) { status = -ENOMEM; mlog_errno(status); - goto bail; + goto out_err; } if (o2info_from_user(*oifi, req)) @@ -431,7 +432,7 @@ bail: o2info_set_request_error(&oifi->ifi_req, req); kfree(oifi); - +out_err: return status; } @@ -666,7 +667,7 @@ int ocfs2_info_handle_freefrag(struct inode *inode, if (!oiff) { status = -ENOMEM; mlog_errno(status); - goto bail; + goto out_err; } if (o2info_from_user(*oiff, req)) @@ -716,7 +717,7 @@ bail: o2info_set_request_error(&oiff->iff_req, req); kfree(oiff); - +out_err: return status; } diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 295d56454e8b..0a42ae96dca7 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1544,9 +1544,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, /* we need to run complete recovery for offline orphan slots */ ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); - mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n", - node_num, slot_num, - MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); + printk(KERN_NOTICE "ocfs2: Begin replay journal (node %d, slot %d) on "\ + "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev), + MINOR(osb->sb->s_dev)); OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); @@ -1601,6 +1601,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, jbd2_journal_destroy(journal); + printk(KERN_NOTICE "ocfs2: End replay journal (node %d, slot %d) on "\ + "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev), + MINOR(osb->sb->s_dev)); done: /* drop the lock on this nodes journal */ if (got_lock) @@ -1808,6 +1811,20 @@ static inline unsigned long ocfs2_orphan_scan_timeout(void) * every slot, queuing a recovery of the slot on the ocfs2_wq thread. This * is done to catch any orphans that are left over in orphan directories. * + * It scans all slots, even ones that are in use. It does so to handle the + * case described below: + * + * Node 1 has an inode it was using. The dentry went away due to memory + * pressure. Node 1 closes the inode, but it's on the free list. The node + * has the open lock. + * Node 2 unlinks the inode. It grabs the dentry lock to notify others, + * but node 1 has no dentry and doesn't get the message. It trylocks the + * open lock, sees that another node has a PR, and does nothing. + * Later node 2 runs its orphan dir. It igets the inode, trylocks the + * open lock, sees the PR still, and does nothing. + * Basically, we have to trigger an orphan iput on node 1. The only way + * for this to happen is if node 1 runs node 2's orphan dir. + * * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT * seconds. It gets an EX lock on os_lockres and checks sequence number * stored in LVB. 
If the sequence number has changed, it means some other diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 68cf2f6d3c6a..a3385b63ff5e 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -441,10 +441,11 @@ static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir, #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) /* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota - * update on dir + index leaf + dx root update for free list */ + * update on dir + index leaf + dx root update for free list + + * previous dirblock update in the free list */ static inline int ocfs2_link_credits(struct super_block *sb) { - return 2*OCFS2_INODE_UPDATE_CREDITS + 3 + + return 2*OCFS2_INODE_UPDATE_CREDITS + 4 + ocfs2_quota_trans_credits(sb); } diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 3e9393ca39eb..9cd41083e991 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -61,7 +61,7 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf) static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, struct page *page) { - int ret; + int ret = VM_FAULT_NOPAGE; struct inode *inode = file->f_path.dentry->d_inode; struct address_space *mapping = inode->i_mapping; loff_t pos = page_offset(page); @@ -71,32 +71,25 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, void *fsdata; loff_t size = i_size_read(inode); - /* - * Another node might have truncated while we were waiting on - * cluster locks. - * We don't check size == 0 before the shift. This is borrowed - * from do_generic_file_read. - */ last_index = (size - 1) >> PAGE_CACHE_SHIFT; - if (unlikely(!size || page->index > last_index)) { - ret = -EINVAL; - goto out; - } /* - * The i_size check above doesn't catch the case where nodes - * truncated and then re-extended the file. We'll re-check the - * page mapping after taking the page lock inside of - * ocfs2_write_begin_nolock(). + * There are cases that lead to the page no longer belonging to the + * mapping. + * 1) pagecache truncates locally due to memory pressure. + * 2) pagecache truncates when another node is taking an EX lock against + * the inode lock. See ocfs2_data_convert_worker. + * + * The i_size check doesn't catch the case where nodes truncated and + * then re-extended the file. We'll re-check the page mapping after + * taking the page lock inside of ocfs2_write_begin_nolock(). + * + * Let the VM retry in these cases. */ - if (!PageUptodate(page) || page->mapping != inode->i_mapping) { - /* - * the page has been umapped in ocfs2_data_downconvert_worker. - * So return 0 here and let VFS retry.
- */ - ret = 0; + if ((page->mapping != inode->i_mapping) || + (!PageUptodate(page)) || + (page_offset(page) >= size)) goto out; - } /* * Call ocfs2_write_begin() and ocfs2_write_end() to take @@ -116,17 +109,21 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, if (ret) { if (ret != -ENOSPC) mlog_errno(ret); + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else + ret = VM_FAULT_SIGBUS; goto out; } - ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, - fsdata); - if (ret < 0) { - mlog_errno(ret); + if (!locked_page) { + ret = VM_FAULT_NOPAGE; goto out; } + ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, + fsdata); BUG_ON(ret != len); - ret = 0; + ret = VM_FAULT_LOCKED; out: return ret; } @@ -168,8 +165,6 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) out: ocfs2_unblock_signals(&oldset); - if (ret) - ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index d53cb706f14c..184c76b8c293 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -745,7 +745,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, */ ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop, new_phys_cpos); - if (!new_phys_cpos) { + if (!*new_phys_cpos) { ret = -ENOSPC; goto out_commit; } diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 409285854f64..d355e6e36b36 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -836,18 +836,65 @@ static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb, static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap) { - __test_and_set_bit_le(bit, bitmap); + __set_bit_le(bit, bitmap); } #define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr)) static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap) { - __test_and_clear_bit_le(bit, bitmap); + __clear_bit_le(bit, bitmap); } #define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr)) #define ocfs2_test_bit test_bit_le #define ocfs2_find_next_zero_bit find_next_zero_bit_le #define ocfs2_find_next_bit find_next_bit_le + +static inline void *correct_addr_and_bit_unaligned(int *bit, void *addr) +{ +#if BITS_PER_LONG == 64 + *bit += ((unsigned long) addr & 7UL) << 3; + addr = (void *) ((unsigned long) addr & ~7UL); +#elif BITS_PER_LONG == 32 + *bit += ((unsigned long) addr & 3UL) << 3; + addr = (void *) ((unsigned long) addr & ~3UL); +#else +#error "how many bits you are?!" 
+#endif + return addr; +} + +static inline void ocfs2_set_bit_unaligned(int bit, void *bitmap) +{ + bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); + ocfs2_set_bit(bit, bitmap); +} + +static inline void ocfs2_clear_bit_unaligned(int bit, void *bitmap) +{ + bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); + ocfs2_clear_bit(bit, bitmap); +} + +static inline int ocfs2_test_bit_unaligned(int bit, void *bitmap) +{ + bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); + return ocfs2_test_bit(bit, bitmap); +} + +static inline int ocfs2_find_next_zero_bit_unaligned(void *bitmap, int max, + int start) +{ + int fix = 0, ret, tmpmax; + bitmap = correct_addr_and_bit_unaligned(&fix, bitmap); + tmpmax = max + fix; + start += fix; + + ret = ocfs2_find_next_zero_bit(bitmap, tmpmax, start) - fix; + if (ret > max) + return max; + return ret; +} + #endif /* OCFS2_H */ diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index dc8007fc9247..f100bf70a906 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -404,7 +404,9 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery( int status = 0; struct ocfs2_quota_recovery *rec; - mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num); + printk(KERN_NOTICE "ocfs2: Beginning quota recovery on device (%s) for " + "slot %u\n", osb->dev_str, slot_num); + rec = ocfs2_alloc_quota_recovery(); if (!rec) return ERR_PTR(-ENOMEM); @@ -549,8 +551,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, goto out_commit; } lock_buffer(qbh); - WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap)); - ocfs2_clear_bit(bit, dchunk->dqc_bitmap); + WARN_ON(!ocfs2_test_bit_unaligned(bit, dchunk->dqc_bitmap)); + ocfs2_clear_bit_unaligned(bit, dchunk->dqc_bitmap); le32_add_cpu(&dchunk->dqc_free, 1); unlock_buffer(qbh); ocfs2_journal_dirty(handle, qbh); @@ -596,7 +598,9 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, struct inode *lqinode; unsigned int flags; - mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num); + printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for " + "slot %u\n", osb->dev_str, slot_num); + mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); for (type = 0; type < MAXQUOTAS; type++) { if (list_empty(&(rec->r_list[type]))) @@ -612,8 +616,9 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, /* Someone else is holding the lock? Then he must be * doing the recovery. Just skip the file... */ if (status == -EAGAIN) { - mlog(ML_NOTICE, "skipping quota recovery for slot %d " - "because quota file is locked.\n", slot_num); + printk(KERN_NOTICE "ocfs2: Skipping quota recovery on " + "device (%s) for slot %d because quota file is " + "locked.\n", osb->dev_str, slot_num); status = 0; goto out_put; } else if (status < 0) { @@ -944,7 +949,7 @@ static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb, * ol_quota_entries_per_block(sb); } - found = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, len, 0); + found = ocfs2_find_next_zero_bit_unaligned(dchunk->dqc_bitmap, len, 0); /* We failed? 
*/ if (found == len) { mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u" @@ -1208,7 +1213,7 @@ static void olq_alloc_dquot(struct buffer_head *bh, void *private) struct ocfs2_local_disk_chunk *dchunk; dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; - ocfs2_set_bit(*offset, dchunk->dqc_bitmap); + ocfs2_set_bit_unaligned(*offset, dchunk->dqc_bitmap); le32_add_cpu(&dchunk->dqc_free, -1); } @@ -1289,7 +1294,7 @@ int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot) (od->dq_chunk->qc_headerbh->b_data); /* Mark structure as freed */ lock_buffer(od->dq_chunk->qc_headerbh); - ocfs2_clear_bit(offset, dchunk->dqc_bitmap); + ocfs2_clear_bit_unaligned(offset, dchunk->dqc_bitmap); le32_add_cpu(&dchunk->dqc_free, 1); unlock_buffer(od->dq_chunk->qc_headerbh); ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 26fc0014d509..1424c151cccc 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -493,8 +493,8 @@ int ocfs2_find_slot(struct ocfs2_super *osb) goto bail; } } else - mlog(ML_NOTICE, "slot %d is already allocated to this node!\n", - slot); + printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already " + "allocated to this node!\n", slot, osb->dev_str); ocfs2_set_slot(si, slot, osb->node_num); osb->slot_num = slot; diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 19965b00c43c..94368017edb3 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -28,6 +28,7 @@ #include "cluster/masklog.h" #include "cluster/nodemanager.h" #include "cluster/heartbeat.h" +#include "cluster/tcp.h" #include "stackglue.h" @@ -256,6 +257,61 @@ static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb) } /* + * Check if this node is heartbeating and is connected to all other + * heartbeating nodes. + */ +static int o2cb_cluster_check(void) +{ + u8 node_num; + int i; + unsigned long hbmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long netmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + + node_num = o2nm_this_node(); + if (node_num == O2NM_MAX_NODES) { + printk(KERN_ERR "o2cb: This node has not been configured.\n"); + return -EINVAL; + } + + /* + * o2dlm expects o2net sockets to be created. If not, then + * dlm_join_domain() fails with a stack of errors which are both cryptic + * and incomplete. The idea here is to detect upfront whether we have + * managed to connect to all nodes or not. If not, then list the nodes + * to allow the user to check the configuration (incorrect IP, firewall, + * etc.) Yes, this is racy. But its not the end of the world. + */ +#define O2CB_MAP_STABILIZE_COUNT 60 + for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) { + o2hb_fill_node_map(hbmap, sizeof(hbmap)); + if (!test_bit(node_num, hbmap)) { + printk(KERN_ERR "o2cb: %s heartbeat has not been " + "started.\n", (o2hb_global_heartbeat_active() ? + "Global" : "Local")); + return -EINVAL; + } + o2net_fill_node_map(netmap, sizeof(netmap)); + /* Force set the current node to allow easy compare */ + set_bit(node_num, netmap); + if (!memcmp(hbmap, netmap, sizeof(hbmap))) + return 0; + if (i < O2CB_MAP_STABILIZE_COUNT) + msleep(1000); + } + + printk(KERN_ERR "o2cb: This node could not connect to nodes:"); + i = -1; + while ((i = find_next_bit(hbmap, O2NM_MAX_NODES, + i + 1)) < O2NM_MAX_NODES) { + if (!test_bit(i, netmap)) + printk(" %u", i); + } + printk(".\n"); + + return -ENOTCONN; +} + +/* * Called from the dlm when it's about to evict a node. This is how the * classic stack signals node death. 
*/ @@ -263,8 +319,8 @@ static void o2dlm_eviction_cb(int node_num, void *data) { struct ocfs2_cluster_connection *conn = data; - mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n", - node_num, conn->cc_namelen, conn->cc_name); + printk(KERN_NOTICE "o2cb: o2dlm has evicted node %d from domain %.*s\n", + node_num, conn->cc_namelen, conn->cc_name); conn->cc_recovery_handler(node_num, conn->cc_recovery_data); } @@ -280,12 +336,11 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn) BUG_ON(conn == NULL); BUG_ON(conn->cc_proto == NULL); - /* for now we only have one cluster/node, make sure we see it - * in the heartbeat universe */ - if (!o2hb_check_local_node_heartbeating()) { - if (o2hb_global_heartbeat_active()) - mlog(ML_ERROR, "Global heartbeat not started\n"); - rc = -EINVAL; + /* Ensure cluster stack is up and all nodes are connected */ + rc = o2cb_cluster_check(); + if (rc) { + printk(KERN_ERR "o2cb: Cluster check failed. Fix errors " + "before retrying.\n"); goto out; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 56f61027236b..4994f8b0e604 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -54,6 +54,7 @@ #include "ocfs1_fs_compat.h" #include "alloc.h" +#include "aops.h" #include "blockcheck.h" #include "dlmglue.h" #include "export.h" @@ -1107,9 +1108,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) ocfs2_set_ro_flag(osb, 1); - printk(KERN_NOTICE "Readonly device detected. No cluster " - "services will be utilized for this mount. Recovery " - "will be skipped.\n"); + printk(KERN_NOTICE "ocfs2: Readonly device (%s) detected. " + "Cluster services will not be used for this mount. " + "Recovery will be skipped.\n", osb->dev_str); } if (!ocfs2_is_hard_readonly(osb)) { @@ -1616,12 +1617,17 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) return 0; } +wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ]; + static int __init ocfs2_init(void) { - int status; + int status, i; ocfs2_print_version(); + for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++) + init_waitqueue_head(&ocfs2__ioend_wq[i]); + status = init_ocfs2_uptodate_cache(); if (status < 0) { mlog_errno(status); @@ -1760,7 +1766,7 @@ static void ocfs2_inode_init_once(void *data) ocfs2_extent_map_init(&oi->vfs_inode); INIT_LIST_HEAD(&oi->ip_io_markers); oi->ip_dir_start_lookup = 0; - + atomic_set(&oi->ip_unaligned_aio, 0); init_rwsem(&oi->ip_alloc_sem); init_rwsem(&oi->ip_xattr_sem); mutex_init(&oi->ip_io_mutex); @@ -1974,7 +1980,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) * If we failed before we got a uuid_str yet, we can't stop * heartbeat. Otherwise, do it. 
*/ - if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str) + if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str && + !ocfs2_is_hard_readonly(osb)) hangup_needed = 1; if (osb->cconn) @@ -2353,7 +2360,7 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog_errno(status); goto bail; } - cleancache_init_shared_fs((char *)&uuid_net_key, sb); + cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb); bail: return status; @@ -2462,8 +2469,8 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) goto finally; } } else { - mlog(ML_NOTICE, "File system was not unmounted cleanly, " - "recovering volume.\n"); + printk(KERN_NOTICE "ocfs2: File system on device (%s) was not " + "unmounted cleanly, recovering it.\n", osb->dev_str); } local = ocfs2_mount_local(osb); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 194fb22ef79d..aa9e8777b09a 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2376,16 +2376,18 @@ static int ocfs2_remove_value_outside(struct inode*inode, } ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt); - if (ret < 0) { - mlog_errno(ret); - break; - } ocfs2_commit_trans(osb, ctxt.handle); if (ctxt.meta_ac) { ocfs2_free_alloc_context(ctxt.meta_ac); ctxt.meta_ac = NULL; } + + if (ret < 0) { + mlog_errno(ret); + break; + } + } if (ctxt.meta_ac) diff --git a/fs/proc/base.c b/fs/proc/base.c index 2db1bd3173b2..851ba3dcdc29 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1652,46 +1652,12 @@ out: return error; } -static int proc_pid_fd_link_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) -{ - struct inode *inode = dentry->d_inode; - struct task_struct *task = get_proc_task(inode); - int rc; - - if (task == NULL) - return -ESRCH; - - rc = -EACCES; - if (lock_trace(task)) - goto out_task; - - generic_fillattr(inode, stat); - unlock_trace(task); - rc = 0; -out_task: - put_task_struct(task); - return rc; -} - static const struct inode_operations proc_pid_link_inode_operations = { .readlink = proc_pid_readlink, .follow_link = proc_pid_follow_link, .setattr = proc_setattr, }; -static const struct inode_operations proc_fdinfo_link_inode_operations = { - .setattr = proc_setattr, - .getattr = proc_pid_fd_link_getattr, -}; - -static const struct inode_operations proc_fd_link_inode_operations = { - .readlink = proc_pid_readlink, - .follow_link = proc_pid_follow_link, - .setattr = proc_setattr, - .getattr = proc_pid_fd_link_getattr, -}; - /* building an inode */ @@ -1923,61 +1889,49 @@ out: static int proc_fd_info(struct inode *inode, struct path *path, char *info) { - struct task_struct *task; - struct files_struct *files; + struct task_struct *task = get_proc_task(inode); + struct files_struct *files = NULL; struct file *file; int fd = proc_fd(inode); - int rc; - - task = get_proc_task(inode); - if (!task) - return -ENOENT; - - rc = -EACCES; - if (lock_trace(task)) - goto out_task; - - rc = -ENOENT; - files = get_files_struct(task); - if (files == NULL) - goto out_unlock; - /* - * We are not taking a ref to the file structure, so we must - * hold ->file_lock. 
- */ - spin_lock(&files->file_lock); - file = fcheck_files(files, fd); - if (file) { - unsigned int f_flags; - struct fdtable *fdt; - - fdt = files_fdtable(files); - f_flags = file->f_flags & ~O_CLOEXEC; - if (FD_ISSET(fd, fdt->close_on_exec)) - f_flags |= O_CLOEXEC; - - if (path) { - *path = file->f_path; - path_get(&file->f_path); + if (task) { + files = get_files_struct(task); + put_task_struct(task); + } + if (files) { + /* + * We are not taking a ref to the file structure, so we must + * hold ->file_lock. + */ + spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + if (file) { + unsigned int f_flags; + struct fdtable *fdt; + + fdt = files_fdtable(files); + f_flags = file->f_flags & ~O_CLOEXEC; + if (FD_ISSET(fd, fdt->close_on_exec)) + f_flags |= O_CLOEXEC; + + if (path) { + *path = file->f_path; + path_get(&file->f_path); + } + if (info) + snprintf(info, PROC_FDINFO_MAX, + "pos:\t%lli\n" + "flags:\t0%o\n", + (long long) file->f_pos, + f_flags); + spin_unlock(&files->file_lock); + put_files_struct(files); + return 0; } - if (info) - snprintf(info, PROC_FDINFO_MAX, - "pos:\t%lli\n" - "flags:\t0%o\n", - (long long) file->f_pos, - f_flags); - rc = 0; - } else - rc = -ENOENT; - spin_unlock(&files->file_lock); - put_files_struct(files); - -out_unlock: - unlock_trace(task); -out_task: - put_task_struct(task); - return rc; + spin_unlock(&files->file_lock); + put_files_struct(files); + } + return -ENOENT; } static int proc_fd_link(struct inode *inode, struct path *path) @@ -2072,7 +2026,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir, spin_unlock(&files->file_lock); put_files_struct(files); - inode->i_op = &proc_fd_link_inode_operations; + inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; ei->op.proc_get_link = proc_fd_link; d_set_d_op(dentry, &tid_fd_dentry_operations); @@ -2104,12 +2058,7 @@ static struct dentry *proc_lookupfd_common(struct inode *dir, if (fd == ~0U) goto out; - result = ERR_PTR(-EACCES); - if (lock_trace(task)) - goto out; - result = instantiate(dir, dentry, task, &fd); - unlock_trace(task); out: put_task_struct(task); out_no_task: @@ -2129,28 +2078,23 @@ static int proc_readfd_common(struct file * filp, void * dirent, retval = -ENOENT; if (!p) goto out_no_task; - - retval = -EACCES; - if (lock_trace(p)) - goto out; - retval = 0; fd = filp->f_pos; switch (fd) { case 0: if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) - goto out_unlock; + goto out; filp->f_pos++; case 1: ino = parent_ino(dentry); if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) - goto out_unlock; + goto out; filp->f_pos++; default: files = get_files_struct(p); if (!files) - goto out_unlock; + goto out; rcu_read_lock(); for (fd = filp->f_pos-2; fd < files_fdtable(files)->max_fds; @@ -2174,9 +2118,6 @@ static int proc_readfd_common(struct file * filp, void * dirent, rcu_read_unlock(); put_files_struct(files); } - -out_unlock: - unlock_trace(p); out: put_task_struct(p); out_no_task: @@ -2254,7 +2195,6 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir, ei->fd = fd; inode->i_mode = S_IFREG | S_IRUSR; inode->i_fop = &proc_fdinfo_file_operations; - inode->i_op = &proc_fdinfo_link_inode_operations; d_set_d_op(dentry, &tid_fd_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 586174168e2a..80e4645f7990 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -131,12 +131,13 @@ static int meminfo_proc_show(struct seq_file *m, 
void *v) K(i.freeswap), K(global_page_state(NR_FILE_DIRTY)), K(global_page_state(NR_WRITEBACK)), - K(global_page_state(NR_ANON_PAGES) #ifdef CONFIG_TRANSPARENT_HUGEPAGE + K(global_page_state(NR_ANON_PAGES) + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * - HPAGE_PMD_NR + HPAGE_PMD_NR), +#else + K(global_page_state(NR_ANON_PAGES)), #endif - ), K(global_page_state(NR_FILE_MAPPED)), K(global_page_state(NR_SHMEM)), K(global_page_state(NR_SLAB_RECLAIMABLE) + diff --git a/fs/proc/root.c b/fs/proc/root.c index 9a8a2b77b874..03102d978180 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -91,20 +91,18 @@ static struct file_system_type proc_fs_type = { void __init proc_root_init(void) { - struct vfsmount *mnt; int err; proc_init_inodecache(); err = register_filesystem(&proc_fs_type); if (err) return; - mnt = kern_mount_data(&proc_fs_type, &init_pid_ns); - if (IS_ERR(mnt)) { + err = pid_ns_prepare_proc(&init_pid_ns); + if (err) { unregister_filesystem(&proc_fs_type); return; } - init_pid_ns.proc_mnt = mnt; proc_symlink("mounts", NULL, "self/mounts"); proc_net_init(); @@ -209,5 +207,5 @@ int pid_ns_prepare_proc(struct pid_namespace *ns) void pid_ns_release_proc(struct pid_namespace *ns) { - mntput(ns->proc_mnt); + kern_unmount(ns->proc_mnt); } diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 42b274da92c3..0855e6f20391 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -32,7 +32,7 @@ static cputime64_t get_idle_time(int cpu) idle = kstat_cpu(cpu).cpustat.idle; idle = cputime64_add(idle, arch_idle_time(cpu)); } else - idle = usecs_to_cputime(idle_time); + idle = usecs_to_cputime64(idle_time); return idle; } @@ -46,7 +46,7 @@ static cputime64_t get_iowait_time(int cpu) /* !NO_HZ so we can rely on cpustat.iowait */ iowait = kstat_cpu(cpu).cpustat.iowait; else - iowait = usecs_to_cputime(iowait_time); + iowait = usecs_to_cputime64(iowait_time); return iowait; } diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index cd99bf557650..b0f450a2bb7c 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -12,6 +12,7 @@ #include <linux/user.h> #include <linux/elf.h> #include <linux/elfcore.h> +#include <linux/export.h> #include <linux/slab.h> #include <linux/highmem.h> #include <linux/bootmem.h> diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 2bd620f0d796..57bbf9078ac8 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -167,6 +167,7 @@ int pstore_register(struct pstore_info *psi) } psinfo = psi; + mutex_init(&psinfo->read_mutex); spin_unlock(&pstore_lock); if (owner && !try_module_get(owner)) { @@ -195,30 +196,32 @@ EXPORT_SYMBOL_GPL(pstore_register); void pstore_get_records(int quiet) { struct pstore_info *psi = psinfo; + char *buf = NULL; ssize_t size; u64 id; enum pstore_type_id type; struct timespec time; int failed = 0, rc; - unsigned long flags; if (!psi) return; - spin_lock_irqsave(&psinfo->buf_lock, flags); + mutex_lock(&psi->read_mutex); rc = psi->open(psi); if (rc) goto out; - while ((size = psi->read(&id, &type, &time, psi)) > 0) { - rc = pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size, + while ((size = psi->read(&id, &type, &time, &buf, psi)) > 0) { + rc = pstore_mkfile(type, psi->name, id, buf, (size_t)size, time, psi); + kfree(buf); + buf = NULL; if (rc && (rc != -EEXIST || !quiet)) failed++; } psi->close(psi); out: - spin_unlock_irqrestore(&psinfo->buf_lock, flags); + mutex_unlock(&psi->read_mutex); if (failed) printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n", diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 
aae0edb95c6c..35f4b0ecdeb3 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -286,7 +286,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, /* caller already holds s_umount */ if (sb->s_flags & MS_RDONLY) return -EROFS; - writeback_inodes_sb(sb); + writeback_inodes_sb(sb, WB_REASON_SYNC); return 0; default: return -EINVAL; diff --git a/fs/seq_file.c b/fs/seq_file.c index 05d6b0e78c95..dba43c3ea3af 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -449,8 +449,6 @@ EXPORT_SYMBOL(seq_path); /* * Same as seq_path, but relative to supplied root. - * - * root may be changed, see __d_path(). */ int seq_path_root(struct seq_file *m, struct path *path, struct path *root, char *esc) @@ -463,6 +461,8 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root, char *p; p = __d_path(path, root, buf, size); + if (!p) + return SEQ_SKIP; res = PTR_ERR(p); if (!IS_ERR(p)) { char *end = mangle_path(buf, p, esc); @@ -474,7 +474,7 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root, } seq_commit(m, res); - return res < 0 ? res : 0; + return res < 0 && res != -ENAMETOOLONG ? res : 0; } /* diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index 048b59d5b2f0..c70111ebefd4 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -78,6 +78,28 @@ config SQUASHFS_XZ If unsure, say N. +config SQUASHFS_4K_DEVBLK_SIZE + bool "Use 4K device block size?" + depends on SQUASHFS + help + By default Squashfs sets the dev block size (sb_min_blocksize) + to 1K or the smallest block size supported by the block device + (if larger). This, because blocks are packed together and + unaligned in Squashfs, should reduce latency. + + This, however, gives poor performance on MTD NAND devices where + the optimal I/O size is 4K (even though the devices can support + smaller block sizes). + + Using a 4K device block size may also improve overall I/O + performance for some file access patterns (e.g. sequential + accesses of files in filesystem order) on all media. + + Setting this option will force Squashfs to use a 4K device block + size by default. + + If unsure, say N. 
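For context on what the new option changes: the SQUASHFS_DEVBLK_SIZE default selected here is what squashfs_fill_super() passes to sb_min_blocksize(), and the block-size shift is derived from the result (see the super.c hunk below). A small standalone sketch of that derivation (illustrative only; userspace ffs() stands in for the kernel's ffz(~x)):

#include <stdio.h>
#include <strings.h>	/* ffs() */

int main(void)
{
	/* the two possible SQUASHFS_DEVBLK_SIZE defaults */
	int sizes[] = { 1024, 4096 };
	int i;

	for (i = 0; i < 2; i++) {
		/* kernel equivalent: devblksize_log2 = ffz(~devblksize) */
		int log2 = ffs(sizes[i]) - 1;
		printf("devblksize %d -> devblksize_log2 %d\n", sizes[i], log2);
	}
	return 0;	/* prints 10 for 1024 and 12 for 4096 */
}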
+ config SQUASHFS_EMBEDDED bool "Additional option for memory-constrained systems" depends on SQUASHFS diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index b4a4e539a08c..e8e14645de9a 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -36,6 +36,13 @@ #define SQUASHFS_FILE_SIZE 131072 #define SQUASHFS_FILE_LOG 17 +/* default size of block device I/O */ +#ifdef CONFIG_SQUASHFS_4K_DEVBLK_SIZE +#define SQUASHFS_DEVBLK_SIZE 4096 +#else +#define SQUASHFS_DEVBLK_SIZE 1024 +#endif + #define SQUASHFS_FILE_MAX_SIZE 1048576 #define SQUASHFS_FILE_MAX_LOG 20 diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 7438850c62d0..2da1715452ac 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -95,7 +95,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) } msblk = sb->s_fs_info; - msblk->devblksize = sb_min_blocksize(sb, BLOCK_SIZE); + msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE); msblk->devblksize_log2 = ffz(~msblk->devblksize); mutex_init(&msblk->read_data_mutex); diff --git a/fs/statfs.c b/fs/statfs.c index 8244924dec55..9cf04a118965 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -76,7 +76,7 @@ EXPORT_SYMBOL(vfs_statfs); int user_statfs(const char __user *pathname, struct kstatfs *st) { struct path path; - int error = user_path(pathname, &path); + int error = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); if (!error) { error = vfs_statfs(&path, st); path_put(&path); diff --git a/fs/sync.c b/fs/sync.c index c98a7477edfd..101b8ef901d7 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -43,7 +43,7 @@ static int __sync_filesystem(struct super_block *sb, int wait) if (wait) sync_inodes_sb(sb); else - writeback_inodes_sb(sb); + writeback_inodes_sb(sb, WB_REASON_SYNC); if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, wait); @@ -98,7 +98,7 @@ static void sync_filesystems(int wait) */ SYSCALL_DEFINE0(sync) { - wakeup_flusher_threads(0); + wakeup_flusher_threads(0, WB_REASON_SYNC); sync_filesystems(0); sync_filesystems(1); if (unlikely(laptop_mode)) diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 315de66e52b2..bc4f94b28706 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -63,7 +63,7 @@ static void shrink_liability(struct ubifs_info *c, int nr_to_write) { down_read(&c->vfs_sb->s_umount); - writeback_inodes_sb(c->vfs_sb); + writeback_inodes_sb(c->vfs_sb, WB_REASON_FS_FREE_SPACE); up_read(&c->vfs_sb->s_umount); } diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index eef109a1a927..b09ba2dd8b62 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -870,6 +870,22 @@ void dbg_dump_lpt_info(struct ubifs_info *c) spin_unlock(&dbg_lock); } +void dbg_dump_sleb(const struct ubifs_info *c, + const struct ubifs_scan_leb *sleb, int offs) +{ + struct ubifs_scan_node *snod; + + printk(KERN_DEBUG "(pid %d) start dumping scanned data from LEB %d:%d\n", + current->pid, sleb->lnum, offs); + + list_for_each_entry(snod, &sleb->nodes, list) { + cond_resched(); + printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", sleb->lnum, + snod->offs, snod->len); + dbg_dump_node(c, snod->node); + } +} + void dbg_dump_leb(const struct ubifs_info *c, int lnum) { struct ubifs_scan_leb *sleb; diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index feb361e252ac..8d9c46810189 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -269,6 +269,8 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp); void dbg_dump_lprops(struct ubifs_info *c); void dbg_dump_lpt_info(struct ubifs_info *c); 
void dbg_dump_leb(const struct ubifs_info *c, int lnum); +void dbg_dump_sleb(const struct ubifs_info *c, + const struct ubifs_scan_leb *sleb, int offs); void dbg_dump_znode(const struct ubifs_info *c, const struct ubifs_znode *znode); void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat); @@ -387,6 +389,9 @@ static inline void dbg_dump_lpt_info(struct ubifs_info *c) { return; } static inline void dbg_dump_leb(const struct ubifs_info *c, int lnum) { return; } static inline void +dbg_dump_sleb(const struct ubifs_info *c, + const struct ubifs_scan_leb *sleb, int offs) { return; } +static inline void dbg_dump_znode(const struct ubifs_info *c, const struct ubifs_znode *znode) { return; } static inline void dbg_dump_heap(struct ubifs_info *c, diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index af02790d9328..ee4f43f4bb99 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -983,7 +983,7 @@ int ubifs_recover_inl_heads(struct ubifs_info *c, void *sbuf) } /** - * clean_an_unclean_leb - read and write a LEB to remove corruption. + * clean_an_unclean_leb - read and write a LEB to remove corruption. * @c: UBIFS file-system description object * @ucleb: unclean LEB information * @sbuf: LEB-sized buffer to use diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 93d938ad3d2a..6094c5a5d7a8 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -247,7 +247,7 @@ static int create_default_filesystem(struct ubifs_info *c) mst->total_dirty = cpu_to_le64(tmp64); /* The indexing LEB does not contribute to dark space */ - tmp64 = (c->main_lebs - 1) * c->dark_wm; + tmp64 = ((long long)(c->main_lebs - 1) * c->dark_wm); mst->total_dark = cpu_to_le64(tmp64); mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 20403dc5d437..ae0e76bb6ebf 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2264,19 +2264,12 @@ static int __init ubifs_init(void) return -EINVAL; } - err = register_filesystem(&ubifs_fs_type); - if (err) { - ubifs_err("cannot register file system, error %d", err); - return err; - } - - err = -ENOMEM; ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab", sizeof(struct ubifs_inode), 0, SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT, &inode_slab_ctor); if (!ubifs_inode_slab) - goto out_reg; + return -ENOMEM; register_shrinker(&ubifs_shrinker_info); @@ -2288,15 +2281,20 @@ static int __init ubifs_init(void) if (err) goto out_compr; + err = register_filesystem(&ubifs_fs_type); + if (err) { + ubifs_err("cannot register file system, error %d", err); + goto out_dbg; + } return 0; +out_dbg: + dbg_debugfs_exit(); out_compr: ubifs_compressors_exit(); out_shrinker: unregister_shrinker(&ubifs_shrinker_info); kmem_cache_destroy(ubifs_inode_slab); -out_reg: - unregister_filesystem(&ubifs_fs_type); return err; } /* late_initcall to let compressors initialize first */ diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index b6c4b3795c4a..76e4266d2e7e 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -42,6 +42,8 @@ xfs_acl_from_disk(struct xfs_acl *aclp) int count, i; count = be32_to_cpu(aclp->acl_cnt); + if (count > XFS_ACL_MAX_ENTRIES) + return ERR_PTR(-EFSCORRUPTED); acl = posix_acl_alloc(count, GFP_KERNEL); if (!acl) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 33b13310ee0c..574d4ee9b625 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -189,7 +189,7 @@ xfs_end_io( int error = 0; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { - error = -EIO; + ioend->io_error = -EIO; goto done; } if (ioend->io_error) diff --git 
a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index d4906e7c9787..c1b55e596551 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -110,6 +110,7 @@ xfs_attr_namesp_match(int arg_flags, int ondisk_flags) /* * Query whether the requested number of additional bytes of extended * attribute space will be able to fit inline. + * * Returns zero if not, else the di_forkoff fork offset to be used in the * literal area for attribute data once the new bytes have been added. * @@ -122,7 +123,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) int offset; int minforkoff; /* lower limit on valid forkoff locations */ int maxforkoff; /* upper limit on valid forkoff locations */ - int dsize; + int dsize; xfs_mount_t *mp = dp->i_mount; offset = (XFS_LITINO(mp) - bytes) >> 3; /* rounded down */ @@ -136,47 +137,60 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) return (offset >= minforkoff) ? minforkoff : 0; } - if (!(mp->m_flags & XFS_MOUNT_ATTR2)) { - if (bytes <= XFS_IFORK_ASIZE(dp)) - return dp->i_d.di_forkoff; + /* + * If the requested numbers of bytes is smaller or equal to the + * current attribute fork size we can always proceed. + * + * Note that if_bytes in the data fork might actually be larger than + * the current data fork size is due to delalloc extents. In that + * case either the extent count will go down when they are converted + * to real extents, or the delalloc conversion will take care of the + * literal area rebalancing. + */ + if (bytes <= XFS_IFORK_ASIZE(dp)) + return dp->i_d.di_forkoff; + + /* + * For attr2 we can try to move the forkoff if there is space in the + * literal area, but for the old format we are done if there is no + * space in the fixed attribute fork. + */ + if (!(mp->m_flags & XFS_MOUNT_ATTR2)) return 0; - } dsize = dp->i_df.if_bytes; - + switch (dp->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: - /* + /* * If there is no attr fork and the data fork is extents, - * determine if creating the default attr fork will result - * in the extents form migrating to btree. If so, the - * minimum offset only needs to be the space required for + * determine if creating the default attr fork will result + * in the extents form migrating to btree. If so, the + * minimum offset only needs to be the space required for * the btree root. - */ + */ if (!dp->i_d.di_forkoff && dp->i_df.if_bytes > xfs_default_attroffset(dp)) dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS); break; - case XFS_DINODE_FMT_BTREE: /* - * If have data btree then keep forkoff if we have one, - * otherwise we are adding a new attr, so then we set - * minforkoff to where the btree root can finish so we have + * If we have a data btree then keep forkoff if we have one, + * otherwise we are adding a new attr, so then we set + * minforkoff to where the btree root can finish so we have * plenty of room for attrs */ if (dp->i_d.di_forkoff) { - if (offset < dp->i_d.di_forkoff) + if (offset < dp->i_d.di_forkoff) return 0; - else - return dp->i_d.di_forkoff; - } else - dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot); + return dp->i_d.di_forkoff; + } + dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot); break; } - - /* - * A data fork btree root must have space for at least + + /* + * A data fork btree root must have space for at least * MINDBTPTRS key/ptr pairs if the data fork is small or empty. 
*/ minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS)); @@ -186,10 +200,10 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); maxforkoff = maxforkoff >> 3; /* rounded down */ - if (offset >= minforkoff && offset < maxforkoff) - return offset; if (offset >= maxforkoff) return maxforkoff; + if (offset >= minforkoff) + return offset; return 0; } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index c68baeb0974a..d0ab78837057 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2383,6 +2383,8 @@ xfs_bmap_btalloc( int tryagain; int error; + ASSERT(ap->length); + mp = ap->ip->i_mount; align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; if (unlikely(align)) { @@ -4629,6 +4631,8 @@ xfs_bmapi_allocate( int error; int rt; + ASSERT(bma->length > 0); + rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip); /* @@ -4849,6 +4853,7 @@ xfs_bmapi_write( ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); ASSERT(!(flags & XFS_BMAPI_IGSTATE)); ASSERT(tp != NULL); + ASSERT(len > 0); whichfork = (flags & XFS_BMAPI_ATTRFORK) ? XFS_ATTR_FORK : XFS_DATA_FORK; @@ -4918,9 +4923,22 @@ xfs_bmapi_write( bma.eof = eof; bma.conv = !!(flags & XFS_BMAPI_CONVERT); bma.wasdel = wasdelay; - bma.length = len; bma.offset = bno; + /* + * There's a 32/64 bit type mismatch between the + * allocation length request (which can be 64 bits in + * length) and the bma length request, which is + * xfs_extlen_t and therefore 32 bits. Hence we have to + * check for 32-bit overflows and handle them here. + */ + if (len > (xfs_filblks_t)MAXEXTLEN) + bma.length = MAXEXTLEN; + else + bma.length = len; + + ASSERT(len > 0); + ASSERT(bma.length > 0); error = xfs_bmapi_allocate(&bma, flags); if (error) goto error0; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 1a3513881bce..eac97ef81e2a 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -656,7 +656,7 @@ xfs_buf_item_committing( /* * This is the ops vector shared by all buf log items. */ -static struct xfs_item_ops xfs_buf_item_ops = { +static const struct xfs_item_ops xfs_buf_item_ops = { .iop_size = xfs_buf_item_size, .iop_format = xfs_buf_item_format, .iop_pin = xfs_buf_item_pin, diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index bb3f71d236d2..0dee0b71029d 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -295,7 +295,7 @@ xfs_qm_dquot_logitem_committing( /* * This is the ops vector for dquots */ -static struct xfs_item_ops xfs_dquot_item_ops = { +static const struct xfs_item_ops xfs_dquot_item_ops = { .iop_size = xfs_qm_dquot_logitem_size, .iop_format = xfs_qm_dquot_logitem_format, .iop_pin = xfs_qm_dquot_logitem_pin, @@ -483,7 +483,7 @@ xfs_qm_qoff_logitem_committing( { } -static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { +static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { .iop_size = xfs_qm_qoff_logitem_size, .iop_format = xfs_qm_qoff_logitem_format, .iop_pin = xfs_qm_qoff_logitem_pin, @@ -498,7 +498,7 @@ static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { /* * This is the ops vector shared by all quotaoff-start log items. 
*/ -static struct xfs_item_ops xfs_qm_qoff_logitem_ops = { +static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = { .iop_size = xfs_qm_qoff_logitem_size, .iop_format = xfs_qm_qoff_logitem_format, .iop_pin = xfs_qm_qoff_logitem_pin, diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index da108977b21f..558910f5e3c0 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -98,22 +98,22 @@ xfs_fs_encode_fh( switch (fileid_type) { case FILEID_INO32_GEN_PARENT: spin_lock(&dentry->d_lock); - fid->i32.parent_ino = dentry->d_parent->d_inode->i_ino; + fid->i32.parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino; fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation; spin_unlock(&dentry->d_lock); /*FALLTHRU*/ case FILEID_INO32_GEN: - fid->i32.ino = inode->i_ino; + fid->i32.ino = XFS_I(inode)->i_ino; fid->i32.gen = inode->i_generation; break; case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: spin_lock(&dentry->d_lock); - fid64->parent_ino = dentry->d_parent->d_inode->i_ino; + fid64->parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino; fid64->parent_gen = dentry->d_parent->d_inode->i_generation; spin_unlock(&dentry->d_lock); /*FALLTHRU*/ case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: - fid64->ino = inode->i_ino; + fid64->ino = XFS_I(inode)->i_ino; fid64->gen = inode->i_generation; break; } diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index d22e62623437..35c2aff38b20 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -217,7 +217,7 @@ xfs_efi_item_committing( /* * This is the ops vector shared by all efi log items. */ -static struct xfs_item_ops xfs_efi_item_ops = { +static const struct xfs_item_ops xfs_efi_item_ops = { .iop_size = xfs_efi_item_size, .iop_format = xfs_efi_item_format, .iop_pin = xfs_efi_item_pin, @@ -477,7 +477,7 @@ xfs_efd_item_committing( /* * This is the ops vector shared by all efd log items. */ -static struct xfs_item_ops xfs_efd_item_ops = { +static const struct xfs_item_ops xfs_efd_item_ops = { .iop_size = xfs_efd_item_size, .iop_format = xfs_efd_item_format, .iop_pin = xfs_efd_item_pin, diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c0237c602f11..755ee8164880 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2835,6 +2835,27 @@ corrupt_out: return XFS_ERROR(EFSCORRUPTED); } +void +xfs_promote_inode( + struct xfs_inode *ip) +{ + struct xfs_buf *bp; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + + bp = xfs_incore(ip->i_mount->m_ddev_targp, ip->i_imap.im_blkno, + ip->i_imap.im_len, XBF_TRYLOCK); + if (!bp) + return; + + if (XFS_BUF_ISDELAYWRITE(bp)) { + xfs_buf_delwri_promote(bp); + wake_up_process(ip->i_mount->m_ddev_targp->bt_task); + } + + xfs_buf_relse(bp); +} + /* * Return a pointer to the extent record at file index idx. 
*/ diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 760140d1dd66..b4cd4739f98e 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -498,6 +498,7 @@ int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); void xfs_iext_realloc(xfs_inode_t *, int, int); void xfs_iunpin_wait(xfs_inode_t *); int xfs_iflush(xfs_inode_t *, uint); +void xfs_promote_inode(struct xfs_inode *); void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index b7cf21ba240f..abaafdbb3e65 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -795,7 +795,7 @@ xfs_inode_item_committing( /* * This is the ops vector shared by all buf log items. */ -static struct xfs_item_ops xfs_inode_item_ops = { +static const struct xfs_item_ops xfs_inode_item_ops = { .iop_size = xfs_inode_item_size, .iop_format = xfs_inode_item_format, .iop_pin = xfs_inode_item_pin, diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 2758a6277c52..34817adf4b9e 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -150,6 +150,117 @@ xlog_grant_add_space( } while (head_val != old); } +STATIC bool +xlog_reserveq_wake( + struct log *log, + int *free_bytes) +{ + struct xlog_ticket *tic; + int need_bytes; + + list_for_each_entry(tic, &log->l_reserveq, t_queue) { + if (tic->t_flags & XLOG_TIC_PERM_RESERV) + need_bytes = tic->t_unit_res * tic->t_cnt; + else + need_bytes = tic->t_unit_res; + + if (*free_bytes < need_bytes) + return false; + *free_bytes -= need_bytes; + + trace_xfs_log_grant_wake_up(log, tic); + wake_up(&tic->t_wait); + } + + return true; +} + +STATIC bool +xlog_writeq_wake( + struct log *log, + int *free_bytes) +{ + struct xlog_ticket *tic; + int need_bytes; + + list_for_each_entry(tic, &log->l_writeq, t_queue) { + ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); + + need_bytes = tic->t_unit_res; + + if (*free_bytes < need_bytes) + return false; + *free_bytes -= need_bytes; + + trace_xfs_log_regrant_write_wake_up(log, tic); + wake_up(&tic->t_wait); + } + + return true; +} + +STATIC int +xlog_reserveq_wait( + struct log *log, + struct xlog_ticket *tic, + int need_bytes) +{ + list_add_tail(&tic->t_queue, &log->l_reserveq); + + do { + if (XLOG_FORCED_SHUTDOWN(log)) + goto shutdown; + xlog_grant_push_ail(log, need_bytes); + + XFS_STATS_INC(xs_sleep_logspace); + trace_xfs_log_grant_sleep(log, tic); + + xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); + trace_xfs_log_grant_wake(log, tic); + + spin_lock(&log->l_grant_reserve_lock); + if (XLOG_FORCED_SHUTDOWN(log)) + goto shutdown; + } while (xlog_space_left(log, &log->l_grant_reserve_head) < need_bytes); + + list_del_init(&tic->t_queue); + return 0; +shutdown: + list_del_init(&tic->t_queue); + return XFS_ERROR(EIO); +} + +STATIC int +xlog_writeq_wait( + struct log *log, + struct xlog_ticket *tic, + int need_bytes) +{ + list_add_tail(&tic->t_queue, &log->l_writeq); + + do { + if (XLOG_FORCED_SHUTDOWN(log)) + goto shutdown; + xlog_grant_push_ail(log, need_bytes); + + XFS_STATS_INC(xs_sleep_logspace); + trace_xfs_log_regrant_write_sleep(log, tic); + + xlog_wait(&tic->t_wait, &log->l_grant_write_lock); + trace_xfs_log_regrant_write_wake(log, tic); + + spin_lock(&log->l_grant_write_lock); + if (XLOG_FORCED_SHUTDOWN(log)) + goto shutdown; + } while (xlog_space_left(log, &log->l_grant_write_head) < need_bytes); + + list_del_init(&tic->t_queue); + return 0; +shutdown: + list_del_init(&tic->t_queue); + return XFS_ERROR(EIO); +} + static void 
xlog_tic_reset_res(xlog_ticket_t *tic) { @@ -350,8 +461,19 @@ xfs_log_reserve( retval = xlog_grant_log_space(log, internal_ticket); } + if (unlikely(retval)) { + /* + * If we are failing, make sure the ticket doesn't have any + * current reservations. We don't want to add this back + * when the ticket/ transaction gets cancelled. + */ + internal_ticket->t_curr_res = 0; + /* ungrant will give back unit_res * t_cnt. */ + internal_ticket->t_cnt = 0; + } + return retval; -} /* xfs_log_reserve */ +} /* @@ -626,7 +748,7 @@ xfs_log_item_init( struct xfs_mount *mp, struct xfs_log_item *item, int type, - struct xfs_item_ops *ops) + const struct xfs_item_ops *ops) { item->li_mountp = mp; item->li_ailp = mp->m_ail; @@ -2481,8 +2603,8 @@ restart: /* * Atomically get the log space required for a log ticket. * - * Once a ticket gets put onto the reserveq, it will only return after - * the needed reservation is satisfied. + * Once a ticket gets put onto the reserveq, it will only return after the + * needed reservation is satisfied. * * This function is structured so that it has a lock free fast path. This is * necessary because every new transaction reservation will come through this @@ -2490,113 +2612,53 @@ restart: * every pass. * * As tickets are only ever moved on and off the reserveq under the - * l_grant_reserve_lock, we only need to take that lock if we are going - * to add the ticket to the queue and sleep. We can avoid taking the lock if the - * ticket was never added to the reserveq because the t_queue list head will be - * empty and we hold the only reference to it so it can safely be checked - * unlocked. + * l_grant_reserve_lock, we only need to take that lock if we are going to add + * the ticket to the queue and sleep. We can avoid taking the lock if the ticket + * was never added to the reserveq because the t_queue list head will be empty + * and we hold the only reference to it so it can safely be checked unlocked. */ STATIC int -xlog_grant_log_space(xlog_t *log, - xlog_ticket_t *tic) +xlog_grant_log_space( + struct log *log, + struct xlog_ticket *tic) { - int free_bytes; - int need_bytes; + int free_bytes, need_bytes; + int error = 0; -#ifdef DEBUG - if (log->l_flags & XLOG_ACTIVE_RECOVERY) - panic("grant Recovery problem"); -#endif + ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); trace_xfs_log_grant_enter(log, tic); + /* + * If there are other waiters on the queue then give them a chance at + * logspace before us. Wake up the first waiters, if we do not wake + * up all the waiters then go to sleep waiting for more free space, + * otherwise try to get some space for this transaction. + */ need_bytes = tic->t_unit_res; if (tic->t_flags & XFS_LOG_PERM_RESERV) need_bytes *= tic->t_ocnt; - - /* something is already sleeping; insert new transaction at end */ - if (!list_empty_careful(&log->l_reserveq)) { - spin_lock(&log->l_grant_reserve_lock); - /* recheck the queue now we are locked */ - if (list_empty(&log->l_reserveq)) { - spin_unlock(&log->l_grant_reserve_lock); - goto redo; - } - list_add_tail(&tic->t_queue, &log->l_reserveq); - - trace_xfs_log_grant_sleep1(log, tic); - - /* - * Gotta check this before going to sleep, while we're - * holding the grant lock. - */ - if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return; - - XFS_STATS_INC(xs_sleep_logspace); - xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); - - /* - * If we got an error, and the filesystem is shutting down, - * we'll catch it down below. So just continue... 
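Editor's aside: the new xlog_reserveq_wake()/xlog_writeq_wake() helpers introduced above both perform the same accounting walk: go through the FIFO of waiting tickets, hand each one its reservation out of the remaining free space, and stop at the first ticket that no longer fits, so waiters are served strictly in arrival order. The user-space sketch below is an illustrative model only, not kernel code, and every name in it is invented.

```c
#include <stdbool.h>
#include <stdio.h>

struct ticket {
	int unit_res;
	int cnt;
	bool permanent;
};

/*
 * Walk the queue in FIFO order, "waking" tickets as long as the remaining
 * free space covers them.  Returns true only if every waiter fit; the
 * caller then knows it may try to take space for itself.
 */
static bool queue_wake(struct ticket *q, int n, int *free_bytes)
{
	for (int i = 0; i < n; i++) {
		int need = q[i].permanent ? q[i].unit_res * q[i].cnt
					  : q[i].unit_res;

		if (*free_bytes < need)
			return false;	/* first misfit stops the walk */
		*free_bytes -= need;
		printf("wake ticket %d (%d bytes)\n", i, need);
	}
	return true;
}

int main(void)
{
	struct ticket q[] = {
		{ .unit_res = 100, .cnt = 1, .permanent = false },
		{ .unit_res =  50, .cnt = 4, .permanent = true  },
		{ .unit_res = 300, .cnt = 1, .permanent = false },
	};
	int free_bytes = 320;

	if (!queue_wake(q, 3, &free_bytes))
		printf("not all waiters fit, a new arrival must sleep\n");
	return 0;
}
```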
- */ - trace_xfs_log_grant_wake1(log, tic); - } - -redo: - if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return_unlocked; - free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); - if (free_bytes < need_bytes) { + if (!list_empty_careful(&log->l_reserveq)) { spin_lock(&log->l_grant_reserve_lock); - if (list_empty(&tic->t_queue)) - list_add_tail(&tic->t_queue, &log->l_reserveq); - - trace_xfs_log_grant_sleep2(log, tic); - - if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return; - - xlog_grant_push_ail(log, need_bytes); - - XFS_STATS_INC(xs_sleep_logspace); - xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); - - trace_xfs_log_grant_wake2(log, tic); - goto redo; - } - - if (!list_empty(&tic->t_queue)) { + if (!xlog_reserveq_wake(log, &free_bytes) || + free_bytes < need_bytes) + error = xlog_reserveq_wait(log, tic, need_bytes); + spin_unlock(&log->l_grant_reserve_lock); + } else if (free_bytes < need_bytes) { spin_lock(&log->l_grant_reserve_lock); - list_del_init(&tic->t_queue); + error = xlog_reserveq_wait(log, tic, need_bytes); spin_unlock(&log->l_grant_reserve_lock); } + if (error) + return error; - /* we've got enough space */ xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes); xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); trace_xfs_log_grant_exit(log, tic); xlog_verify_grant_tail(log); return 0; - -error_return_unlocked: - spin_lock(&log->l_grant_reserve_lock); -error_return: - list_del_init(&tic->t_queue); - spin_unlock(&log->l_grant_reserve_lock); - trace_xfs_log_grant_error(log, tic); - - /* - * If we are failing, make sure the ticket doesn't have any - * current reservations. We don't want to add this back when - * the ticket/transaction gets cancelled. - */ - tic->t_curr_res = 0; - tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ - return XFS_ERROR(EIO); -} /* xlog_grant_log_space */ - +} /* * Replenish the byte reservation required by moving the grant write head. @@ -2605,10 +2667,12 @@ error_return: * free fast path. */ STATIC int -xlog_regrant_write_log_space(xlog_t *log, - xlog_ticket_t *tic) +xlog_regrant_write_log_space( + struct log *log, + struct xlog_ticket *tic) { - int free_bytes, need_bytes; + int free_bytes, need_bytes; + int error = 0; tic->t_curr_res = tic->t_unit_res; xlog_tic_reset_res(tic); @@ -2616,104 +2680,38 @@ xlog_regrant_write_log_space(xlog_t *log, if (tic->t_cnt > 0) return 0; -#ifdef DEBUG - if (log->l_flags & XLOG_ACTIVE_RECOVERY) - panic("regrant Recovery problem"); -#endif + ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); trace_xfs_log_regrant_write_enter(log, tic); - if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return_unlocked; - /* If there are other waiters on the queue then give them a - * chance at logspace before us. Wake up the first waiters, - * if we do not wake up all the waiters then go to sleep waiting - * for more free space, otherwise try to get some space for - * this transaction. + /* + * If there are other waiters on the queue then give them a chance at + * logspace before us. Wake up the first waiters, if we do not wake + * up all the waiters then go to sleep waiting for more free space, + * otherwise try to get some space for this transaction. 
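Editor's aside: once space is available, the rewritten grant path simply moves both grant heads forward by need_bytes. As a rough model only (a simplification: the real grant heads also encode a cycle count alongside the byte offset, which this sketch ignores), free log space is the log size minus the distance from the tail to the grant head on a circular buffer:

```c
#include <stdio.h>

/*
 * Simplified model of xlog_space_left(): free space in a circular log of
 * `size` bytes when the grant head sits `head` bytes and the tail sits
 * `tail` bytes from the start of the log.  Purely illustrative.
 */
static long log_space_left(long size, long head, long tail)
{
	long used = head >= tail ? head - tail : size - (tail - head);

	return size - used;
}

int main(void)
{
	long size = 64L << 20;		/* 64 MiB log */
	long head = 40L << 20;
	long tail = 10L << 20;
	long need = 256 * 1024;		/* reservation for one transaction */

	long free_bytes = log_space_left(size, head, tail);

	printf("free %ld bytes, need %ld: %s\n", free_bytes, need,
	       free_bytes >= need ? "grant and move head" : "wait");

	if (free_bytes >= need)
		head = (head + need) % size;	/* grant-head advance analogue */

	printf("head now at byte %ld\n", head);
	return 0;
}
```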
*/ need_bytes = tic->t_unit_res; - if (!list_empty_careful(&log->l_writeq)) { - struct xlog_ticket *ntic; - - spin_lock(&log->l_grant_write_lock); - free_bytes = xlog_space_left(log, &log->l_grant_write_head); - list_for_each_entry(ntic, &log->l_writeq, t_queue) { - ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV); - - if (free_bytes < ntic->t_unit_res) - break; - free_bytes -= ntic->t_unit_res; - wake_up(&ntic->t_wait); - } - - if (ntic != list_first_entry(&log->l_writeq, - struct xlog_ticket, t_queue)) { - if (list_empty(&tic->t_queue)) - list_add_tail(&tic->t_queue, &log->l_writeq); - trace_xfs_log_regrant_write_sleep1(log, tic); - - xlog_grant_push_ail(log, need_bytes); - - XFS_STATS_INC(xs_sleep_logspace); - xlog_wait(&tic->t_wait, &log->l_grant_write_lock); - trace_xfs_log_regrant_write_wake1(log, tic); - } else - spin_unlock(&log->l_grant_write_lock); - } - -redo: - if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return_unlocked; - free_bytes = xlog_space_left(log, &log->l_grant_write_head); - if (free_bytes < need_bytes) { + if (!list_empty_careful(&log->l_writeq)) { spin_lock(&log->l_grant_write_lock); - if (list_empty(&tic->t_queue)) - list_add_tail(&tic->t_queue, &log->l_writeq); - - if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return; - - xlog_grant_push_ail(log, need_bytes); - - XFS_STATS_INC(xs_sleep_logspace); - trace_xfs_log_regrant_write_sleep2(log, tic); - xlog_wait(&tic->t_wait, &log->l_grant_write_lock); - - trace_xfs_log_regrant_write_wake2(log, tic); - goto redo; - } - - if (!list_empty(&tic->t_queue)) { + if (!xlog_writeq_wake(log, &free_bytes) || + free_bytes < need_bytes) + error = xlog_writeq_wait(log, tic, need_bytes); + spin_unlock(&log->l_grant_write_lock); + } else if (free_bytes < need_bytes) { spin_lock(&log->l_grant_write_lock); - list_del_init(&tic->t_queue); + error = xlog_writeq_wait(log, tic, need_bytes); spin_unlock(&log->l_grant_write_lock); } - /* we've got enough space */ + if (error) + return error; + xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); trace_xfs_log_regrant_write_exit(log, tic); xlog_verify_grant_tail(log); return 0; - - - error_return_unlocked: - spin_lock(&log->l_grant_write_lock); - error_return: - list_del_init(&tic->t_queue); - spin_unlock(&log->l_grant_write_lock); - trace_xfs_log_regrant_write_error(log, tic); - - /* - * If we are failing, make sure the ticket doesn't have any - * current reservations. We don't want to add this back when - * the ticket/transaction gets cancelled. - */ - tic->t_curr_res = 0; - tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ - return XFS_ERROR(EIO); -} /* xlog_regrant_write_log_space */ - +} /* The first cnt-1 times through here we don't need to * move the grant write head because the permanent diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 78c9039994af..3f7bf451c034 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -137,7 +137,7 @@ struct xfs_trans; void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, int type, - struct xfs_item_ops *ops); + const struct xfs_item_ops *ops); xfs_lsn_t xfs_log_done(struct xfs_mount *mp, struct xlog_ticket *ticket, diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 5cff443f6cdb..0bbb1a41998b 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -674,7 +674,8 @@ xfs_qm_dqattach_one( * disk and we didn't ask it to allocate; * ESRCH if quotas got turned off suddenly. 
*/ - error = xfs_qm_dqget(ip->i_mount, ip, id, type, XFS_QMOPT_DOWARN, &dqp); + error = xfs_qm_dqget(ip->i_mount, ip, id, type, + doalloc | XFS_QMOPT_DOWARN, &dqp); if (error) return error; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 3eca58f51ae9..8a899496fd5f 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -869,27 +869,6 @@ xfs_fs_dirty_inode( } STATIC int -xfs_log_inode( - struct xfs_inode *ip) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_trans *tp; - int error; - - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - return error; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - return xfs_trans_commit(tp, 0); -} - -STATIC int xfs_fs_write_inode( struct inode *inode, struct writeback_control *wbc) @@ -902,10 +881,8 @@ xfs_fs_write_inode( if (XFS_FORCED_SHUTDOWN(mp)) return -XFS_ERROR(EIO); - if (!ip->i_update_core) - return 0; - if (wbc->sync_mode == WB_SYNC_ALL) { + if (wbc->sync_mode == WB_SYNC_ALL || wbc->for_kupdate) { /* * Make sure the inode has made it it into the log. Instead * of forcing it all the way to stable storage using a @@ -913,11 +890,14 @@ xfs_fs_write_inode( * ->sync_fs call do that for thus, which reduces the number * of synchronous log forces dramatically. */ - error = xfs_log_inode(ip); + error = xfs_log_dirty_inode(ip, NULL, 0); if (error) goto out; return 0; } else { + if (!ip->i_update_core) + return 0; + /* * We make this non-blocking if the inode is contended, return * EAGAIN to indicate to the caller that they did not succeed. diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index aa3dc1a4d53d..f0994aedcd15 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -336,6 +336,32 @@ xfs_sync_fsdata( return error; } +int +xfs_log_dirty_inode( + struct xfs_inode *ip, + struct xfs_perag *pag, + int flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + if (!ip->i_update_core) + return 0; + + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return xfs_trans_commit(tp, 0); +} + /* * When remounting a filesystem read-only or freezing the filesystem, we have * two phases to execute. This first phase is syncing the data before we @@ -359,6 +385,16 @@ xfs_quiesce_data( { int error, error2 = 0; + /* + * Log all pending size and timestamp updates. The vfs writeback + * code is supposed to do this, but due to its overagressive + * livelock detection it will skip inodes where appending writes + * were written out in the first non-blocking sync phase if their + * completion took long enough that it happened after taking the + * timestamp for the cut-off in the blocking phase. + */ + xfs_inode_ag_iterator(mp, xfs_log_dirty_inode, 0); + xfs_qm_sync(mp, SYNC_TRYLOCK); xfs_qm_sync(mp, SYNC_WAIT); @@ -770,6 +806,17 @@ restart: if (!xfs_iflock_nowait(ip)) { if (!(sync_mode & SYNC_WAIT)) goto out; + + /* + * If we only have a single dirty inode in a cluster there is + * a fair chance that the AIL push may have pushed it into + * the buffer, but xfsbufd won't touch it until 30 seconds + * from now, and thus we will lock up here. 
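Editor's aside: the comment added to xfs_quiesce_data() above describes a timing hole: writeback's livelock protection samples a cut-off time and defers inodes dirtied after it, so an inode whose size/timestamp update only lands once an appending write completes can be missed by the blocking sync phase. The toy model below uses invented names and plain integers instead of jiffies; it is only meant to illustrate that window, which the patch closes by having XFS log all dirty inodes itself via xfs_inode_ag_iterator(mp, xfs_log_dirty_inode, 0).

```c
#include <stdbool.h>
#include <stdio.h>

struct toy_inode {
	const char *name;
	long dirtied_when;	/* when the inode last became dirty */
};

/* Model of the livelock cut-off: anything dirtied after the sync started
 * is deferred to a later writeback pass instead of being waited on now. */
static bool skipped_by_cutoff(const struct toy_inode *ip, long sync_start)
{
	return ip->dirtied_when > sync_start;
}

int main(void)
{
	long sync_start = 100;

	struct toy_inode early = { "plain-dirty",       95 };
	/* Timestamp update from an appending write that completed late. */
	struct toy_inode late  = { "late-size-update", 103 };

	printf("%s: %s\n", early.name,
	       skipped_by_cutoff(&early, sync_start) ? "skipped" : "synced");
	printf("%s: %s\n", late.name,
	       skipped_by_cutoff(&late, sync_start) ? "skipped" : "synced");
	return 0;
}
```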
+ * + * Promote the inode buffer to the front of the delwri list + * and wake up xfsbufd now. + */ + xfs_promote_inode(ip); xfs_iflock(ip); } diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h index 941202e7ac6e..fa965479d788 100644 --- a/fs/xfs/xfs_sync.h +++ b/fs/xfs/xfs_sync.h @@ -34,6 +34,8 @@ void xfs_quiesce_attr(struct xfs_mount *mp); void xfs_flush_inodes(struct xfs_inode *ip); +int xfs_log_dirty_inode(struct xfs_inode *ip, struct xfs_perag *pag, int flags); + int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); int xfs_reclaim_inodes_count(struct xfs_mount *mp); void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index f1d2802b2f07..494035798873 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -834,18 +834,14 @@ DEFINE_LOGGRANT_EVENT(xfs_log_umount_write); DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter); DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit); DEFINE_LOGGRANT_EVENT(xfs_log_grant_error); -DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1); -DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1); -DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2); -DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake); DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 603f3eb52041..3ae713c0abd9 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -326,7 +326,7 @@ typedef struct xfs_log_item { struct xfs_log_item *); /* buffer item iodone */ /* callback func */ - struct xfs_item_ops *li_ops; /* function list */ + const struct xfs_item_ops *li_ops; /* function list */ /* delayed logging */ struct list_head li_cil; /* CIL pointers */ @@ -341,7 +341,7 @@ typedef struct xfs_log_item { { XFS_LI_IN_AIL, "IN_AIL" }, \ { XFS_LI_ABORTED, "ABORTED" } -typedef struct xfs_item_ops { +struct xfs_item_ops { uint (*iop_size)(xfs_log_item_t *); void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); void (*iop_pin)(xfs_log_item_t *); @@ -352,7 +352,7 @@ typedef struct xfs_item_ops { void (*iop_push)(xfs_log_item_t *); bool (*iop_pushbuf)(xfs_log_item_t *); void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); -} xfs_item_ops_t; +}; #define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip) #define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 4ecf2a549060..ce9268a2f56b 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -112,7 +112,7 @@ xfs_readlink( char *link) { xfs_mount_t *mp = ip->i_mount; - int pathlen; + xfs_fsize_t pathlen; int error = 0; trace_xfs_readlink(ip); @@ -122,13 +122,19 @@ xfs_readlink( xfs_ilock(ip, XFS_ILOCK_SHARED); - ASSERT(S_ISLNK(ip->i_d.di_mode)); - ASSERT(ip->i_d.di_size <= MAXPATHLEN); - pathlen = ip->i_d.di_size; if (!pathlen) goto out; + if (pathlen < 0 || pathlen > MAXPATHLEN) { + 
xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)", + __func__, (unsigned long long) ip->i_ino, + (long long) pathlen); + ASSERT(0); + return XFS_ERROR(EFSCORRUPTED); + } + + if (ip->i_df.if_flags & XFS_IFINLINE) { memcpy(link, ip->i_df.if_u1.if_data, pathlen); link[pathlen] = '\0';
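Editor's aside on the xfs_readlink() hunk that closes this diff: di_size comes straight from on-disk metadata and the caller's link buffer is only MAXPATHLEN bytes, so copying before validating the length can overrun the buffer on a corrupted filesystem. A stand-alone sketch of the check-then-copy pattern (hypothetical names, not the kernel code) is:

```c
#include <stdio.h>
#include <string.h>

#define TOY_MAXPATHLEN 1024

/* Copy an on-disk symlink target into a fixed-size buffer, but only after
 * validating the untrusted length field, mirroring the EFSCORRUPTED check. */
static int read_link_target(char *dst, const char *disk_data, long long disk_len)
{
	if (disk_len < 0 || disk_len > TOY_MAXPATHLEN) {
		fprintf(stderr, "bad symlink length %lld, treating as corrupt\n",
			disk_len);
		return -1;
	}
	memcpy(dst, disk_data, (size_t)disk_len);
	dst[disk_len] = '\0';
	return 0;
}

int main(void)
{
	char buf[TOY_MAXPATHLEN + 1];
	const char target[] = "/some/target";

	if (read_link_target(buf, target, sizeof(target) - 1) == 0)
		printf("link -> %s\n", buf);

	/* A corrupted length is rejected instead of overrunning buf. */
	read_link_target(buf, target, 1LL << 40);
	return 0;
}
```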