diff options
Diffstat (limited to 'fs/btrfs')
35 files changed, 3334 insertions, 928 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 7bb3c020e570..ecb9fd3be143 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -4,6 +4,8 @@ config BTRFS_FS select LIBCRC32C select ZLIB_INFLATE select ZLIB_DEFLATE + select LZO_COMPRESS + select LZO_DECOMPRESS help Btrfs is a new filesystem with extents, writable snapshotting, support for multiple devices and many more features. diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index a35eb36b32fd..31610ea73aec 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ transaction.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ - export.o tree-log.o acl.o free-space-cache.o zlib.o \ + export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 6ae2c8cac9d5..9c949348510b 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -37,6 +37,9 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) char *value = NULL; struct posix_acl *acl; + if (!IS_POSIXACL(inode)) + return NULL; + acl = get_cached_acl(inode, type); if (acl != ACL_NOT_CACHED) return acl; @@ -60,8 +63,10 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) size = __btrfs_getxattr(inode, name, value, size); if (size > 0) { acl = posix_acl_from_xattr(value, size); - if (IS_ERR(acl)) + if (IS_ERR(acl)) { + kfree(value); return acl; + } set_cached_acl(inode, type, acl); } kfree(value); @@ -82,6 +87,9 @@ static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name, struct posix_acl *acl; int ret = 0; + if (!IS_POSIXACL(dentry->d_inode)) + return -EOPNOTSUPP; + acl = btrfs_get_acl(dentry->d_inode, type); if (IS_ERR(acl)) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 6ad63f17eca0..ccc991c542df 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -157,7 +157,7 @@ struct btrfs_inode { /* * always compress this one file */ - unsigned force_compress:1; + unsigned force_compress:4; struct inode vfs_inode; }; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index b50bc4bd5c56..4d2110eafe29 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -62,6 +62,9 @@ struct compressed_bio { /* number of bytes on disk */ unsigned long compressed_len; + /* the compression algorithm for this bio */ + int compress_type; + /* number of compressed pages in the array */ unsigned long nr_pages; @@ -173,11 +176,12 @@ static void end_compressed_bio_read(struct bio *bio, int err) /* ok, we're the last bio for this extent, lets start * the decompression. */ - ret = btrfs_zlib_decompress_biovec(cb->compressed_pages, - cb->start, - cb->orig_bio->bi_io_vec, - cb->orig_bio->bi_vcnt, - cb->compressed_len); + ret = btrfs_decompress_biovec(cb->compress_type, + cb->compressed_pages, + cb->start, + cb->orig_bio->bi_io_vec, + cb->orig_bio->bi_vcnt, + cb->compressed_len); csum_failed: if (ret) cb->errors = 1; @@ -558,7 +562,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, u64 em_len; u64 em_start; struct extent_map *em; - int ret; + int ret = -ENOMEM; u32 *sums; tree = &BTRFS_I(inode)->io_tree; @@ -573,6 +577,9 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, compressed_len = em->block_len; cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); + if (!cb) + goto out; + atomic_set(&cb->pending_bios, 0); cb->errors = 0; cb->inode = inode; @@ -588,17 +595,23 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, cb->len = uncompressed_len; cb->compressed_len = compressed_len; + cb->compress_type = extent_compress_type(bio_flags); cb->orig_bio = bio; nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE; - cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages, + cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); + if (!cb->compressed_pages) + goto fail1; + bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; for (page_index = 0; page_index < nr_pages; page_index++) { cb->compressed_pages[page_index] = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!cb->compressed_pages[page_index]) + goto fail2; } cb->nr_pages = nr_pages; @@ -609,6 +622,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, cb->len = uncompressed_len; comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); + if (!comp_bio) + goto fail2; comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; atomic_inc(&cb->pending_bios); @@ -676,4 +691,329 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, bio_put(comp_bio); return 0; + +fail2: + for (page_index = 0; page_index < nr_pages; page_index++) + free_page((unsigned long)cb->compressed_pages[page_index]); + + kfree(cb->compressed_pages); +fail1: + kfree(cb); +out: + free_extent_map(em); + return ret; +} + +static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES]; +static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES]; +static int comp_num_workspace[BTRFS_COMPRESS_TYPES]; +static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES]; +static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES]; + +struct btrfs_compress_op *btrfs_compress_op[] = { + &btrfs_zlib_compress, + &btrfs_lzo_compress, +}; + +int __init btrfs_init_compress(void) +{ + int i; + + for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { + INIT_LIST_HEAD(&comp_idle_workspace[i]); + spin_lock_init(&comp_workspace_lock[i]); + atomic_set(&comp_alloc_workspace[i], 0); + init_waitqueue_head(&comp_workspace_wait[i]); + } + return 0; +} + +/* + * this finds an available workspace or allocates a new one + * ERR_PTR is returned if things go bad. + */ +static struct list_head *find_workspace(int type) +{ + struct list_head *workspace; + int cpus = num_online_cpus(); + int idx = type - 1; + + struct list_head *idle_workspace = &comp_idle_workspace[idx]; + spinlock_t *workspace_lock = &comp_workspace_lock[idx]; + atomic_t *alloc_workspace = &comp_alloc_workspace[idx]; + wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx]; + int *num_workspace = &comp_num_workspace[idx]; +again: + spin_lock(workspace_lock); + if (!list_empty(idle_workspace)) { + workspace = idle_workspace->next; + list_del(workspace); + (*num_workspace)--; + spin_unlock(workspace_lock); + return workspace; + + } + if (atomic_read(alloc_workspace) > cpus) { + DEFINE_WAIT(wait); + + spin_unlock(workspace_lock); + prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE); + if (atomic_read(alloc_workspace) > cpus && !*num_workspace) + schedule(); + finish_wait(workspace_wait, &wait); + goto again; + } + atomic_inc(alloc_workspace); + spin_unlock(workspace_lock); + + workspace = btrfs_compress_op[idx]->alloc_workspace(); + if (IS_ERR(workspace)) { + atomic_dec(alloc_workspace); + wake_up(workspace_wait); + } + return workspace; +} + +/* + * put a workspace struct back on the list or free it if we have enough + * idle ones sitting around + */ +static void free_workspace(int type, struct list_head *workspace) +{ + int idx = type - 1; + struct list_head *idle_workspace = &comp_idle_workspace[idx]; + spinlock_t *workspace_lock = &comp_workspace_lock[idx]; + atomic_t *alloc_workspace = &comp_alloc_workspace[idx]; + wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx]; + int *num_workspace = &comp_num_workspace[idx]; + + spin_lock(workspace_lock); + if (*num_workspace < num_online_cpus()) { + list_add_tail(workspace, idle_workspace); + (*num_workspace)++; + spin_unlock(workspace_lock); + goto wake; + } + spin_unlock(workspace_lock); + + btrfs_compress_op[idx]->free_workspace(workspace); + atomic_dec(alloc_workspace); +wake: + if (waitqueue_active(workspace_wait)) + wake_up(workspace_wait); +} + +/* + * cleanup function for module exit + */ +static void free_workspaces(void) +{ + struct list_head *workspace; + int i; + + for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { + while (!list_empty(&comp_idle_workspace[i])) { + workspace = comp_idle_workspace[i].next; + list_del(workspace); + btrfs_compress_op[i]->free_workspace(workspace); + atomic_dec(&comp_alloc_workspace[i]); + } + } +} + +/* + * given an address space and start/len, compress the bytes. + * + * pages are allocated to hold the compressed result and stored + * in 'pages' + * + * out_pages is used to return the number of pages allocated. There + * may be pages allocated even if we return an error + * + * total_in is used to return the number of bytes actually read. It + * may be smaller then len if we had to exit early because we + * ran out of room in the pages array or because we cross the + * max_out threshold. + * + * total_out is used to return the total number of compressed bytes + * + * max_out tells us the max number of bytes that we're allowed to + * stuff into pages + */ +int btrfs_compress_pages(int type, struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) +{ + struct list_head *workspace; + int ret; + + workspace = find_workspace(type); + if (IS_ERR(workspace)) + return -1; + + ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping, + start, len, pages, + nr_dest_pages, out_pages, + total_in, total_out, + max_out); + free_workspace(type, workspace); + return ret; +} + +/* + * pages_in is an array of pages with compressed data. + * + * disk_start is the starting logical offset of this array in the file + * + * bvec is a bio_vec of pages from the file that we want to decompress into + * + * vcnt is the count of pages in the biovec + * + * srclen is the number of bytes in pages_in + * + * The basic idea is that we have a bio that was created by readpages. + * The pages in the bio are for the uncompressed data, and they may not + * be contiguous. They all correspond to the range of bytes covered by + * the compressed extent. + */ +int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start, + struct bio_vec *bvec, int vcnt, size_t srclen) +{ + struct list_head *workspace; + int ret; + + workspace = find_workspace(type); + if (IS_ERR(workspace)) + return -ENOMEM; + + ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in, + disk_start, + bvec, vcnt, srclen); + free_workspace(type, workspace); + return ret; +} + +/* + * a less complex decompression routine. Our compressed data fits in a + * single page, and we want to read a single page out of it. + * start_byte tells us the offset into the compressed data we're interested in + */ +int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, + unsigned long start_byte, size_t srclen, size_t destlen) +{ + struct list_head *workspace; + int ret; + + workspace = find_workspace(type); + if (IS_ERR(workspace)) + return -ENOMEM; + + ret = btrfs_compress_op[type-1]->decompress(workspace, data_in, + dest_page, start_byte, + srclen, destlen); + + free_workspace(type, workspace); + return ret; +} + +void btrfs_exit_compress(void) +{ + free_workspaces(); +} + +/* + * Copy uncompressed data from working buffer to pages. + * + * buf_start is the byte offset we're of the start of our workspace buffer. + * + * total_out is the last byte of the buffer + */ +int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, + unsigned long total_out, u64 disk_start, + struct bio_vec *bvec, int vcnt, + unsigned long *page_index, + unsigned long *pg_offset) +{ + unsigned long buf_offset; + unsigned long current_buf_start; + unsigned long start_byte; + unsigned long working_bytes = total_out - buf_start; + unsigned long bytes; + char *kaddr; + struct page *page_out = bvec[*page_index].bv_page; + + /* + * start byte is the first byte of the page we're currently + * copying into relative to the start of the compressed data. + */ + start_byte = page_offset(page_out) - disk_start; + + /* we haven't yet hit data corresponding to this page */ + if (total_out <= start_byte) + return 1; + + /* + * the start of the data we care about is offset into + * the middle of our working buffer + */ + if (total_out > start_byte && buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes -= buf_offset; + } else { + buf_offset = 0; + } + current_buf_start = buf_start; + + /* copy bytes from the working buffer into the pages */ + while (working_bytes > 0) { + bytes = min(PAGE_CACHE_SIZE - *pg_offset, + PAGE_CACHE_SIZE - buf_offset); + bytes = min(bytes, working_bytes); + kaddr = kmap_atomic(page_out, KM_USER0); + memcpy(kaddr + *pg_offset, buf + buf_offset, bytes); + kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(page_out); + + *pg_offset += bytes; + buf_offset += bytes; + working_bytes -= bytes; + current_buf_start += bytes; + + /* check if we need to pick another page */ + if (*pg_offset == PAGE_CACHE_SIZE) { + (*page_index)++; + if (*page_index >= vcnt) + return 0; + + page_out = bvec[*page_index].bv_page; + *pg_offset = 0; + start_byte = page_offset(page_out) - disk_start; + + /* + * make sure our new page is covered by this + * working buffer + */ + if (total_out <= start_byte) + return 1; + + /* + * the next page in the biovec might not be adjacent + * to the last page, but it might still be found + * inside this working buffer. bump our offset pointer + */ + if (total_out > start_byte && + current_buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes = total_out - start_byte; + current_buf_start = buf_start + buf_offset; + } + } + } + + return 1; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 421f5b4aa715..51000174b9d7 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -19,24 +19,27 @@ #ifndef __BTRFS_COMPRESSION_ #define __BTRFS_COMPRESSION_ -int btrfs_zlib_decompress(unsigned char *data_in, - struct page *dest_page, - unsigned long start_byte, - size_t srclen, size_t destlen); -int btrfs_zlib_compress_pages(struct address_space *mapping, - u64 start, unsigned long len, - struct page **pages, - unsigned long nr_dest_pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out, - unsigned long max_out); -int btrfs_zlib_decompress_biovec(struct page **pages_in, - u64 disk_start, - struct bio_vec *bvec, - int vcnt, - size_t srclen); -void btrfs_zlib_exit(void); +int btrfs_init_compress(void); +void btrfs_exit_compress(void); + +int btrfs_compress_pages(int type, struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out); +int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start, + struct bio_vec *bvec, int vcnt, size_t srclen); +int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, + unsigned long start_byte, size_t srclen, size_t destlen); +int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, + unsigned long total_out, u64 disk_start, + struct bio_vec *bvec, int vcnt, + unsigned long *page_index, + unsigned long *pg_offset); + int btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, @@ -44,4 +47,37 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long nr_pages); int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); + +struct btrfs_compress_op { + struct list_head *(*alloc_workspace)(void); + + void (*free_workspace)(struct list_head *workspace); + + int (*compress_pages)(struct list_head *workspace, + struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out); + + int (*decompress_biovec)(struct list_head *workspace, + struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen); + + int (*decompress)(struct list_head *workspace, + unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen); +}; + +extern struct btrfs_compress_op btrfs_zlib_compress; +extern struct btrfs_compress_op btrfs_lzo_compress; + #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 9ac171599258..b5baff0dccfe 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -105,6 +105,8 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p, /* this also releases the path */ void btrfs_free_path(struct btrfs_path *p) { + if (!p) + return; btrfs_release_path(NULL, p); kmem_cache_free(btrfs_path_cachep, p); } @@ -2514,6 +2516,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); right = read_node_slot(root, upper, slot + 1); + if (right == NULL) + return 1; + btrfs_tree_lock(right); btrfs_set_lock_blocking(right); @@ -2764,6 +2769,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); left = read_node_slot(root, path->nodes[1], slot - 1); + if (left == NULL) + return 1; + btrfs_tree_lock(left); btrfs_set_lock_blocking(left); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a142d204b526..6f820fa23df4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -27,6 +27,7 @@ #include <linux/backing-dev.h> #include <linux/wait.h> #include <linux/slab.h> +#include <linux/kobject.h> #include <asm/kmap_types.h> #include "extent_io.h" #include "extent_map.h" @@ -294,6 +295,14 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) #define BTRFS_FSID_SIZE 16 #define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0) #define BTRFS_HEADER_FLAG_RELOC (1ULL << 1) + +/* + * File system states + */ + +/* Errors detected */ +#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2) + #define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) #define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33) @@ -398,13 +407,15 @@ struct btrfs_super_block { #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) +#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3) #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ - BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO) /* * A leaf is full of items. offset and size tell us where to find @@ -551,9 +562,11 @@ struct btrfs_timespec { } __attribute__ ((__packed__)); enum btrfs_compression_type { - BTRFS_COMPRESS_NONE = 0, - BTRFS_COMPRESS_ZLIB = 1, - BTRFS_COMPRESS_LAST = 2, + BTRFS_COMPRESS_NONE = 0, + BTRFS_COMPRESS_ZLIB = 1, + BTRFS_COMPRESS_LZO = 2, + BTRFS_COMPRESS_TYPES = 2, + BTRFS_COMPRESS_LAST = 3, }; struct btrfs_inode_item { @@ -597,6 +610,8 @@ struct btrfs_dir_item { u8 type; } __attribute__ ((__packed__)); +#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0) + struct btrfs_root_item { struct btrfs_inode_item inode; __le64 generation; @@ -895,7 +910,8 @@ struct btrfs_fs_info { */ u64 last_trans_log_full_commit; u64 open_ioctl_trans; - unsigned long mount_opt; + unsigned long mount_opt:20; + unsigned long compress_type:4; u64 max_inline; u64 alloc_start; struct btrfs_transaction *running_transaction; @@ -1050,6 +1066,9 @@ struct btrfs_fs_info { unsigned metadata_ratio; void *bdev_holder; + + /* filesystem state */ + u64 fs_state; }; /* @@ -1235,6 +1254,7 @@ struct btrfs_root { #define BTRFS_MOUNT_SPACE_CACHE (1 << 12) #define BTRFS_MOUNT_CLEAR_CACHE (1 << 13) #define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) +#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -1893,6 +1913,11 @@ BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, last_snapshot, 64); +static inline bool btrfs_root_readonly(struct btrfs_root *root) +{ + return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY; +} + /* struct btrfs_super_block */ BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); @@ -2145,6 +2170,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start); u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); +u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); int btrfs_check_data_free_space(struct inode *inode, u64 bytes); @@ -2188,6 +2214,14 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, int btrfs_set_block_group_rw(struct btrfs_root *root, struct btrfs_block_group_cache *cache); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); +int btrfs_error_unpin_extent_range(struct btrfs_root *root, + u64 start, u64 end); +int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes); +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 type); + /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -2541,6 +2575,14 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); /* super.c */ int btrfs_parse_options(struct btrfs_root *root, char *options); int btrfs_sync_fs(struct super_block *sb, int wait); +void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno); + +#define btrfs_std_error(fs_info, errno) \ +do { \ + if ((errno)) \ + __btrfs_std_error((fs_info), __func__, __LINE__, (errno));\ +} while (0) /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 51d2e4de34eb..e1aa8d607bc7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -44,6 +44,20 @@ static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); +static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info, + int read_only); +static int btrfs_destroy_ordered_operations(struct btrfs_root *root); +static int btrfs_destroy_ordered_extents(struct btrfs_root *root); +static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_root *root); +static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t); +static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root); +static int btrfs_destroy_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, + int mark); +static int btrfs_destroy_pinned_extent(struct btrfs_root *root, + struct extent_io_tree *pinned_extents); +static int btrfs_cleanup_transaction(struct btrfs_root *root); /* * end_io_wq structs are used to do processing in task context when an IO is @@ -345,14 +359,22 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) tree = &BTRFS_I(page->mapping->host)->io_tree; - if (page->private == EXTENT_PAGE_PRIVATE) + if (page->private == EXTENT_PAGE_PRIVATE) { + WARN_ON(1); goto out; - if (!page->private) + } + if (!page->private) { + WARN_ON(1); goto out; + } len = page->private >> 2; WARN_ON(len == 0); eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); + if (eb == NULL) { + WARN_ON(1); + goto out; + } ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, btrfs_header_generation(eb)); BUG_ON(ret); @@ -427,6 +449,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, WARN_ON(len == 0); eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); + if (eb == NULL) { + ret = -EIO; + goto out; + } found_start = btrfs_header_bytenr(eb); if (found_start != start) { @@ -1145,6 +1171,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, } btrfs_free_path(path); if (ret) { + kfree(root); if (ret > 0) ret = -ENOENT; return ERR_PTR(ret); @@ -1527,6 +1554,7 @@ static int transaction_kthread(void *arg) spin_unlock(&root->fs_info->new_trans_lock); trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); if (transid == trans->transid) { ret = btrfs_commit_transaction(trans, root); BUG_ON(ret); @@ -1713,8 +1741,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info, BTRFS_ROOT_TREE_OBJECTID); bh = btrfs_read_dev_super(fs_devices->latest_bdev); - if (!bh) + if (!bh) { + err = -EINVAL; goto fail_iput; + } memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); memcpy(&fs_info->super_for_commit, &fs_info->super_copy, @@ -1727,6 +1757,11 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!btrfs_super_root(disk_super)) goto fail_iput; + /* check FS state, whether FS is broken. */ + fs_info->fs_state |= btrfs_super_flags(disk_super); + + btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); + ret = btrfs_parse_options(tree_root, options); if (ret) { err = ret; @@ -1744,10 +1779,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, } features = btrfs_super_incompat_flags(disk_super); - if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) { - features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; - btrfs_set_super_incompat_flags(disk_super, features); - } + features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; + if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO) + features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + btrfs_set_super_incompat_flags(disk_super, features); features = btrfs_super_compat_ro_flags(disk_super) & ~BTRFS_FEATURE_COMPAT_RO_SUPP; @@ -1957,7 +1992,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_set_opt(fs_info->mount_opt, SSD); } - if (btrfs_super_log_root(disk_super) != 0) { + /* do not make disk changes in broken FS */ + if (btrfs_super_log_root(disk_super) != 0 && + !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) { u64 bytenr = btrfs_super_log_root(disk_super); if (fs_devices->rw_devices == 0) { @@ -2421,10 +2458,14 @@ int btrfs_commit_super(struct btrfs_root *root) up_write(&root->fs_info->cleanup_work_sem); trans = btrfs_join_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); ret = btrfs_commit_transaction(trans, root); BUG_ON(ret); /* run commit again to drop the original snapshot */ trans = btrfs_join_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); btrfs_commit_transaction(trans, root); ret = btrfs_write_and_wait_transaction(NULL, root); BUG_ON(ret); @@ -2442,8 +2483,28 @@ int close_ctree(struct btrfs_root *root) smp_mb(); btrfs_put_block_group_cache(fs_info); + + /* + * Here come 2 situations when btrfs is broken to flip readonly: + * + * 1. when btrfs flips readonly somewhere else before + * btrfs_commit_super, sb->s_flags has MS_RDONLY flag, + * and btrfs will skip to write sb directly to keep + * ERROR state on disk. + * + * 2. when btrfs flips readonly just in btrfs_commit_super, + * and in such case, btrfs cannnot write sb via btrfs_commit_super, + * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag, + * btrfs will cleanup all FS resources first and write sb then. + */ if (!(fs_info->sb->s_flags & MS_RDONLY)) { - ret = btrfs_commit_super(root); + ret = btrfs_commit_super(root); + if (ret) + printk(KERN_ERR "btrfs: commit super ret %d\n", ret); + } + + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + ret = btrfs_error_commit_super(root); if (ret) printk(KERN_ERR "btrfs: commit super ret %d\n", ret); } @@ -2502,6 +2563,8 @@ int close_ctree(struct btrfs_root *root) kfree(fs_info->chunk_root); kfree(fs_info->dev_root); kfree(fs_info->csum_root); + kfree(fs_info); + return 0; } @@ -2619,6 +2682,352 @@ out: return 0; } +static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info, + int read_only) +{ + if (read_only) + return; + + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + printk(KERN_WARNING "warning: mount fs with errors, " + "running btrfsck is recommended\n"); +} + +int btrfs_error_commit_super(struct btrfs_root *root) +{ + int ret; + + mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_run_delayed_iputs(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + + down_write(&root->fs_info->cleanup_work_sem); + up_write(&root->fs_info->cleanup_work_sem); + + /* cleanup FS via transaction */ + btrfs_cleanup_transaction(root); + + ret = write_ctree_super(NULL, root, 0); + + return ret; +} + +static int btrfs_destroy_ordered_operations(struct btrfs_root *root) +{ + struct btrfs_inode *btrfs_inode; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + mutex_lock(&root->fs_info->ordered_operations_mutex); + spin_lock(&root->fs_info->ordered_extent_lock); + + list_splice_init(&root->fs_info->ordered_operations, &splice); + while (!list_empty(&splice)) { + btrfs_inode = list_entry(splice.next, struct btrfs_inode, + ordered_operations); + + list_del_init(&btrfs_inode->ordered_operations); + + btrfs_invalidate_inodes(btrfs_inode->root); + } + + spin_unlock(&root->fs_info->ordered_extent_lock); + mutex_unlock(&root->fs_info->ordered_operations_mutex); + + return 0; +} + +static int btrfs_destroy_ordered_extents(struct btrfs_root *root) +{ + struct list_head splice; + struct btrfs_ordered_extent *ordered; + struct inode *inode; + + INIT_LIST_HEAD(&splice); + + spin_lock(&root->fs_info->ordered_extent_lock); + + list_splice_init(&root->fs_info->ordered_extents, &splice); + while (!list_empty(&splice)) { + ordered = list_entry(splice.next, struct btrfs_ordered_extent, + root_extent_list); + + list_del_init(&ordered->root_extent_list); + atomic_inc(&ordered->refs); + + /* the inode may be getting freed (in sys_unlink path). */ + inode = igrab(ordered->inode); + + spin_unlock(&root->fs_info->ordered_extent_lock); + if (inode) + iput(inode); + + atomic_set(&ordered->refs, 1); + btrfs_put_ordered_extent(ordered); + + spin_lock(&root->fs_info->ordered_extent_lock); + } + + spin_unlock(&root->fs_info->ordered_extent_lock); + + return 0; +} + +static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_root *root) +{ + struct rb_node *node; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + int ret = 0; + + delayed_refs = &trans->delayed_refs; + + spin_lock(&delayed_refs->lock); + if (delayed_refs->num_entries == 0) { + printk(KERN_INFO "delayed_refs has NO entry\n"); + return ret; + } + + node = rb_first(&delayed_refs->root); + while (node) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + node = rb_next(node); + + ref->in_tree = 0; + rb_erase(&ref->rb_node, &delayed_refs->root); + delayed_refs->num_entries--; + + atomic_set(&ref->refs, 1); + if (btrfs_delayed_ref_is_head(ref)) { + struct btrfs_delayed_ref_head *head; + + head = btrfs_delayed_node_to_head(ref); + mutex_lock(&head->mutex); + kfree(head->extent_op); + delayed_refs->num_heads--; + if (list_empty(&head->cluster)) + delayed_refs->num_heads_ready--; + list_del_init(&head->cluster); + mutex_unlock(&head->mutex); + } + + spin_unlock(&delayed_refs->lock); + btrfs_put_delayed_ref(ref); + + cond_resched(); + spin_lock(&delayed_refs->lock); + } + + spin_unlock(&delayed_refs->lock); + + return ret; +} + +static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) +{ + struct btrfs_pending_snapshot *snapshot; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + list_splice_init(&t->pending_snapshots, &splice); + + while (!list_empty(&splice)) { + snapshot = list_entry(splice.next, + struct btrfs_pending_snapshot, + list); + + list_del_init(&snapshot->list); + + kfree(snapshot); + } + + return 0; +} + +static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root) +{ + struct btrfs_inode *btrfs_inode; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + list_splice_init(&root->fs_info->delalloc_inodes, &splice); + + spin_lock(&root->fs_info->delalloc_lock); + + while (!list_empty(&splice)) { + btrfs_inode = list_entry(splice.next, struct btrfs_inode, + delalloc_inodes); + + list_del_init(&btrfs_inode->delalloc_inodes); + + btrfs_invalidate_inodes(btrfs_inode->root); + } + + spin_unlock(&root->fs_info->delalloc_lock); + + return 0; +} + +static int btrfs_destroy_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, + int mark) +{ + int ret; + struct page *page; + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_buffer *eb; + u64 start = 0; + u64 end; + u64 offset; + unsigned long index; + + while (1) { + ret = find_first_extent_bit(dirty_pages, start, &start, &end, + mark); + if (ret) + break; + + clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); + while (start <= end) { + index = start >> PAGE_CACHE_SHIFT; + start = (u64)(index + 1) << PAGE_CACHE_SHIFT; + page = find_get_page(btree_inode->i_mapping, index); + if (!page) + continue; + offset = page_offset(page); + + spin_lock(&dirty_pages->buffer_lock); + eb = radix_tree_lookup( + &(&BTRFS_I(page->mapping->host)->io_tree)->buffer, + offset >> PAGE_CACHE_SHIFT); + spin_unlock(&dirty_pages->buffer_lock); + if (eb) { + ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY, + &eb->bflags); + atomic_set(&eb->refs, 1); + } + if (PageWriteback(page)) + end_page_writeback(page); + + lock_page(page); + if (PageDirty(page)) { + clear_page_dirty_for_io(page); + spin_lock_irq(&page->mapping->tree_lock); + radix_tree_tag_clear(&page->mapping->page_tree, + page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irq(&page->mapping->tree_lock); + } + + page->mapping->a_ops->invalidatepage(page, 0); + unlock_page(page); + } + } + + return ret; +} + +static int btrfs_destroy_pinned_extent(struct btrfs_root *root, + struct extent_io_tree *pinned_extents) +{ + struct extent_io_tree *unpin; + u64 start; + u64 end; + int ret; + + unpin = pinned_extents; + while (1) { + ret = find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY); + if (ret) + break; + + /* opt_discard */ + ret = btrfs_error_discard_extent(root, start, end + 1 - start); + + clear_extent_dirty(unpin, start, end, GFP_NOFS); + btrfs_error_unpin_extent_range(root, start, end); + cond_resched(); + } + + return 0; +} + +static int btrfs_cleanup_transaction(struct btrfs_root *root) +{ + struct btrfs_transaction *t; + LIST_HEAD(list); + + WARN_ON(1); + + mutex_lock(&root->fs_info->trans_mutex); + mutex_lock(&root->fs_info->transaction_kthread_mutex); + + list_splice_init(&root->fs_info->trans_list, &list); + while (!list_empty(&list)) { + t = list_entry(list.next, struct btrfs_transaction, list); + if (!t) + break; + + btrfs_destroy_ordered_operations(root); + + btrfs_destroy_ordered_extents(root); + + btrfs_destroy_delayed_refs(t, root); + + btrfs_block_rsv_release(root, + &root->fs_info->trans_block_rsv, + t->dirty_pages.dirty_bytes); + + /* FIXME: cleanup wait for commit */ + t->in_commit = 1; + t->blocked = 1; + if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) + wake_up(&root->fs_info->transaction_blocked_wait); + + t->blocked = 0; + if (waitqueue_active(&root->fs_info->transaction_wait)) + wake_up(&root->fs_info->transaction_wait); + mutex_unlock(&root->fs_info->trans_mutex); + + mutex_lock(&root->fs_info->trans_mutex); + t->commit_done = 1; + if (waitqueue_active(&t->commit_wait)) + wake_up(&t->commit_wait); + mutex_unlock(&root->fs_info->trans_mutex); + + mutex_lock(&root->fs_info->trans_mutex); + + btrfs_destroy_pending_snapshots(t); + + btrfs_destroy_delalloc_inodes(root); + + spin_lock(&root->fs_info->new_trans_lock); + root->fs_info->running_transaction = NULL; + spin_unlock(&root->fs_info->new_trans_lock); + + btrfs_destroy_marked_extents(root, &t->dirty_pages, + EXTENT_DIRTY); + + btrfs_destroy_pinned_extent(root, + root->fs_info->pinned_extents); + + t->use_count = 0; + list_del_init(&t->list); + memset(t, 0, sizeof(*t)); + kmem_cache_free(btrfs_transaction_cachep, t); + } + + mutex_unlock(&root->fs_info->transaction_kthread_mutex); + mutex_unlock(&root->fs_info->trans_mutex); + + return 0; +} + static struct extent_io_ops btree_extent_io_ops = { .write_cache_pages_lock_hook = btree_lock_page_hook, .readpage_end_io_hook = btree_readpage_end_io_hook, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 88e825a0bf21..07b20dc2fd95 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -52,6 +52,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors); struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); int btrfs_commit_super(struct btrfs_root *root); +int btrfs_error_commit_super(struct btrfs_root *root); struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 0ccf9a8afcdf..ff27d7a477b2 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -65,7 +65,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, { struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; struct btrfs_root *root; - struct dentry *dentry; struct inode *inode; struct btrfs_key key; int index; @@ -108,10 +107,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, return ERR_PTR(-ESTALE); } - dentry = d_obtain_alias(inode); - if (!IS_ERR(dentry)) - d_set_d_op(dentry, &btrfs_dentry_operations); - return dentry; + return d_obtain_alias(inode); fail: srcu_read_unlock(&fs_info->subvol_srcu, index); return ERR_PTR(err); @@ -166,7 +162,6 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, static struct dentry *btrfs_get_parent(struct dentry *child) { struct inode *dir = child->d_inode; - struct dentry *dentry; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_path *path; struct extent_buffer *leaf; @@ -176,6 +171,8 @@ static struct dentry *btrfs_get_parent(struct dentry *child) int ret; path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) { key.objectid = root->root_key.objectid; @@ -223,10 +220,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); - if (!IS_ERR(dentry)) - d_set_d_op(dentry, &btrfs_dentry_operations); - return dentry; + return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); fail: btrfs_free_path(path); return ERR_PTR(ret); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 227e5815d838..588ff9849873 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -320,11 +320,6 @@ static int caching_kthread(void *data) if (!path) return -ENOMEM; - exclude_super_stripes(extent_root, block_group); - spin_lock(&block_group->space_info->lock); - block_group->space_info->bytes_readonly += block_group->bytes_super; - spin_unlock(&block_group->space_info->lock); - last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); /* @@ -467,8 +462,10 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, cache->cached = BTRFS_CACHE_NO; } spin_unlock(&cache->lock); - if (ret == 1) + if (ret == 1) { + free_excluded_extents(fs_info->extent_root, cache); return 0; + } } if (load_cache_only) @@ -3089,7 +3086,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) return btrfs_reduce_alloc_profile(root, flags); } -static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) +u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) { u64 flags; @@ -3161,8 +3158,12 @@ alloc: bytes + 2 * 1024 * 1024, alloc_target, 0); btrfs_end_transaction(trans, root); - if (ret < 0) - return ret; + if (ret < 0) { + if (ret != -ENOSPC) + return ret; + else + goto commit_trans; + } if (!data_sinfo) { btrfs_set_inode_space_info(root, inode); @@ -3173,6 +3174,7 @@ alloc: spin_unlock(&data_sinfo->lock); /* commit the current transaction and try again */ +commit_trans: if (!committed && !root->fs_info->open_ioctl_trans) { committed = 1; trans = btrfs_join_transaction(root, 1); @@ -3339,8 +3341,10 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, u64 reserved; u64 max_reclaim; u64 reclaimed = 0; + long time_left; int pause = 1; int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; + int loops = 0; block_rsv = &root->fs_info->delalloc_block_rsv; space_info = block_rsv->space_info; @@ -3353,7 +3357,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, max_reclaim = min(reserved, to_reclaim); - while (1) { + while (loops < 1024) { /* have the flusher threads jump in and do some IO */ smp_mb(); nr_pages = min_t(unsigned long, nr_pages, @@ -3361,8 +3365,12 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); spin_lock(&space_info->lock); - if (reserved > space_info->bytes_reserved) + if (reserved > space_info->bytes_reserved) { + loops = 0; reclaimed += reserved - space_info->bytes_reserved; + } else { + loops++; + } reserved = space_info->bytes_reserved; spin_unlock(&space_info->lock); @@ -3373,7 +3381,12 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, return -EAGAIN; __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(pause); + time_left = schedule_timeout(pause); + + /* We were interrupted, exit */ + if (time_left) + break; + pause <<= 1; if (pause > HZ / 10) pause = HZ / 10; @@ -3583,8 +3596,20 @@ void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, if (num_bytes > 0) { if (dest) { - block_rsv_add_bytes(dest, num_bytes, 0); - } else { + spin_lock(&dest->lock); + if (!dest->full) { + u64 bytes_to_add; + + bytes_to_add = dest->size - dest->reserved; + bytes_to_add = min(num_bytes, bytes_to_add); + dest->reserved += bytes_to_add; + if (dest->reserved >= dest->size) + dest->full = 1; + num_bytes -= bytes_to_add; + } + spin_unlock(&dest->lock); + } + if (num_bytes) { spin_lock(&space_info->lock); space_info->bytes_reserved -= num_bytes; spin_unlock(&space_info->lock); @@ -3721,11 +3746,6 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, return 0; } - WARN_ON(1); - printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n", - block_rsv->size, block_rsv->reserved, - block_rsv->freed[0], block_rsv->freed[1]); - return -ENOSPC; } @@ -4012,6 +4032,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) num_bytes = ALIGN(num_bytes, root->sectorsize); atomic_dec(&BTRFS_I(inode)->outstanding_extents); + WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0); spin_lock(&BTRFS_I(inode)->accounting_lock); nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents); @@ -5355,7 +5376,7 @@ again: num_bytes, data, 1); goto again; } - if (ret == -ENOSPC) { + if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) { struct btrfs_space_info *sinfo; sinfo = __find_space_info(root->fs_info, data); @@ -5633,6 +5654,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize) { struct btrfs_block_rsv *block_rsv; + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; int ret; block_rsv = get_block_rsv(trans, root); @@ -5640,14 +5662,39 @@ use_block_rsv(struct btrfs_trans_handle *trans, if (block_rsv->size == 0) { ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, 0); - if (ret) + /* + * If we couldn't reserve metadata bytes try and use some from + * the global reserve. + */ + if (ret && block_rsv != global_rsv) { + ret = block_rsv_use_bytes(global_rsv, blocksize); + if (!ret) + return global_rsv; + return ERR_PTR(ret); + } else if (ret) { return ERR_PTR(ret); + } return block_rsv; } ret = block_rsv_use_bytes(block_rsv, blocksize); if (!ret) return block_rsv; + if (ret) { + WARN_ON(1); + ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, + 0); + if (!ret) { + spin_lock(&block_rsv->lock); + block_rsv->size += blocksize; + spin_unlock(&block_rsv->lock); + return block_rsv; + } else if (ret && block_rsv != global_rsv) { + ret = block_rsv_use_bytes(global_rsv, blocksize); + if (!ret) + return global_rsv; + } + } return ERR_PTR(-ENOSPC); } @@ -6221,6 +6268,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, BUG_ON(!wc); trans = btrfs_start_transaction(tree_root, 0); + BUG_ON(IS_ERR(trans)); + if (block_rsv) trans->block_rsv = block_rsv; @@ -6318,6 +6367,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, btrfs_end_transaction_throttle(trans, tree_root); trans = btrfs_start_transaction(tree_root, 0); + BUG_ON(IS_ERR(trans)); if (block_rsv) trans->block_rsv = block_rsv; } @@ -6446,6 +6496,8 @@ static noinline int relocate_inode_pages(struct inode *inode, u64 start, int ret = 0; ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; mutex_lock(&inode->i_mutex); first_index = start >> PAGE_CACHE_SHIFT; @@ -6531,7 +6583,7 @@ static noinline int relocate_data_extent(struct inode *reloc_inode, u64 end = start + extent_key->offset - 1; em = alloc_extent_map(GFP_NOFS); - BUG_ON(!em || IS_ERR(em)); + BUG_ON(!em); em->start = start; em->len = extent_key->offset; @@ -7477,7 +7529,7 @@ int btrfs_drop_dead_reloc_roots(struct btrfs_root *root) BUG_ON(reloc_root->commit_root != NULL); while (1) { trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); mutex_lock(&root->fs_info->drop_mutex); ret = btrfs_drop_snapshot(trans, reloc_root); @@ -7535,7 +7587,7 @@ int btrfs_cleanup_reloc_trees(struct btrfs_root *root) if (found) { trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); ret = btrfs_commit_transaction(trans, root); BUG_ON(ret); } @@ -7779,7 +7831,7 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root, trans = btrfs_start_transaction(extent_root, 1); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); if (extent_key->objectid == 0) { ret = del_extent_zero(trans, extent_root, path, extent_key); @@ -7970,13 +8022,14 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache) if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + sinfo->bytes_may_use + sinfo->bytes_readonly + - cache->reserved_pinned + num_bytes < sinfo->total_bytes) { + cache->reserved_pinned + num_bytes <= sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; sinfo->bytes_reserved += cache->reserved_pinned; cache->reserved_pinned = 0; cache->ro = 1; ret = 0; } + spin_unlock(&cache->lock); spin_unlock(&sinfo->lock); return ret; @@ -8012,6 +8065,69 @@ out: return ret; } +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 type) +{ + u64 alloc_flags = get_alloc_profile(root, type); + return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); +} + +/* + * helper to account the unused space of all the readonly block group in the + * list. takes mirrors into account. + */ +static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) +{ + struct btrfs_block_group_cache *block_group; + u64 free_bytes = 0; + int factor; + + list_for_each_entry(block_group, groups_list, list) { + spin_lock(&block_group->lock); + + if (!block_group->ro) { + spin_unlock(&block_group->lock); + continue; + } + + if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP)) + factor = 2; + else + factor = 1; + + free_bytes += (block_group->key.offset - + btrfs_block_group_used(&block_group->item)) * + factor; + + spin_unlock(&block_group->lock); + } + + return free_bytes; +} + +/* + * helper to account the unused space of all the readonly block group in the + * space_info. takes mirrors into account. + */ +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) +{ + int i; + u64 free_bytes = 0; + + spin_lock(&sinfo->lock); + + for(i = 0; i < BTRFS_NR_RAID_TYPES; i++) + if (!list_empty(&sinfo->block_groups[i])) + free_bytes += __btrfs_get_ro_block_group_free_space( + &sinfo->block_groups[i]); + + spin_unlock(&sinfo->lock); + + return free_bytes; +} + int btrfs_set_block_group_rw(struct btrfs_root *root, struct btrfs_block_group_cache *cache) { @@ -8092,7 +8208,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) mutex_lock(&root->fs_info->chunk_mutex); list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { u64 min_free = btrfs_block_group_used(&block_group->item); - u64 dev_offset, max_avail; + u64 dev_offset; /* * check to make sure we can actually find a chunk with enough @@ -8100,7 +8216,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) */ if (device->total_bytes > device->bytes_used + min_free) { ret = find_free_dev_extent(NULL, device, min_free, - &dev_offset, &max_avail); + &dev_offset, NULL); if (!ret) break; ret = -1; @@ -8213,6 +8329,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) if (block_group->cached == BTRFS_CACHE_STARTED) wait_block_group_cache_done(block_group); + /* + * We haven't cached this block group, which means we could + * possibly have excluded extents on this block group. + */ + if (block_group->cached == BTRFS_CACHE_NO) + free_excluded_extents(info->extent_root, block_group); + btrfs_remove_free_space_cache(block_group); btrfs_put_block_group(block_group); @@ -8328,6 +8451,13 @@ int btrfs_read_block_groups(struct btrfs_root *root) cache->sectorsize = root->sectorsize; /* + * We need to exclude the super stripes now so that the space + * info has super bytes accounted for, otherwise we'll think + * we have more space than we actually do. + */ + exclude_super_stripes(root, cache); + + /* * check for two cases, either we are full, and therefore * don't need to bother with the caching work since we won't * find any space, or we are empty, and we can just add all @@ -8335,12 +8465,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) * time, particularly in the full case. */ if (found_key.offset == btrfs_block_group_used(&cache->item)) { - exclude_super_stripes(root, cache); cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; free_excluded_extents(root, cache); } else if (btrfs_block_group_used(&cache->item) == 0) { - exclude_super_stripes(root, cache); cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; add_new_free_space(cache, root->fs_info, @@ -8584,3 +8712,14 @@ out: btrfs_free_path(path); return ret; } + +int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) +{ + return unpin_extent_range(root, start, end); +} + +int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes) +{ + return btrfs_discard_extent(root, bytenr, num_bytes); +} diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3e86b9f36507..fd3f172e94e6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1433,12 +1433,13 @@ int extent_clear_unlock_delalloc(struct inode *inode, */ u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, - unsigned long bits) + unsigned long bits, int contig) { struct rb_node *node; struct extent_state *state; u64 cur_start = *start; u64 total_bytes = 0; + u64 last = 0; int found = 0; if (search_end <= cur_start) { @@ -1463,7 +1464,9 @@ u64 count_range_bits(struct extent_io_tree *tree, state = rb_entry(node, struct extent_state, rb_node); if (state->start > search_end) break; - if (state->end >= cur_start && (state->state & bits)) { + if (contig && found && state->start > last + 1) + break; + if (state->end >= cur_start && (state->state & bits) == bits) { total_bytes += min(search_end, state->end) + 1 - max(cur_start, state->start); if (total_bytes >= max_bytes) @@ -1472,6 +1475,9 @@ u64 count_range_bits(struct extent_io_tree *tree, *start = state->start; found = 1; } + last = state->end; + } else if (contig && found) { + break; } node = rb_next(node); if (!node) @@ -1865,7 +1871,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, bio_get(bio); if (tree->ops && tree->ops->submit_bio_hook) - tree->ops->submit_bio_hook(page->mapping->host, rw, bio, + ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, mirror_num, bio_flags, start); else submit_bio(rw, bio); @@ -1920,6 +1926,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, nr = bio_get_nr_vecs(bdev); bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); + if (!bio) + return -ENOMEM; bio_add_page(bio, page, page_size, offset); bio->bi_end_io = end_io_func; @@ -1944,6 +1952,7 @@ void set_page_extent_mapped(struct page *page) static void set_page_extent_head(struct page *page, unsigned long len) { + WARN_ON(!PagePrivate(page)); set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); } @@ -2028,8 +2037,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree, BUG_ON(extent_map_end(em) <= cur); BUG_ON(end < cur); - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { this_bio_flag = EXTENT_BIO_COMPRESSED; + extent_set_compress_type(&this_bio_flag, + em->compress_type); + } iosize = min(extent_map_end(em) - cur, end - cur + 1); cur_end = min(extent_map_end(em) - 1, end); @@ -2123,7 +2135,7 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, &bio_flags); if (bio) - submit_one_bio(READ, bio, 0, bio_flags); + ret = submit_one_bio(READ, bio, 0, bio_flags); return ret; } @@ -2816,9 +2828,17 @@ int try_release_extent_state(struct extent_map_tree *map, * at this point we can safely clear everything except the * locked bit and the nodatasum bit */ - clear_extent_bit(tree, start, end, + ret = clear_extent_bit(tree, start, end, ~(EXTENT_LOCKED | EXTENT_NODATASUM), 0, 0, NULL, mask); + + /* if clear_extent_bit failed for enomem reasons, + * we can't allow the release to continue. + */ + if (ret < 0) + ret = 0; + else + ret = 1; } return ret; } @@ -2898,6 +2918,46 @@ out: return sector; } +/* + * helper function for fiemap, which doesn't want to see any holes. + * This maps until we find something past 'last' + */ +static struct extent_map *get_extent_skip_holes(struct inode *inode, + u64 offset, + u64 last, + get_extent_t *get_extent) +{ + u64 sectorsize = BTRFS_I(inode)->root->sectorsize; + struct extent_map *em; + u64 len; + + if (offset >= last) + return NULL; + + while(1) { + len = last - offset; + if (len == 0) + break; + len = (len + sectorsize - 1) & ~(sectorsize - 1); + em = get_extent(inode, NULL, 0, offset, len, 0); + if (!em || IS_ERR(em)) + return em; + + /* if this isn't a hole return it */ + if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && + em->block_start != EXTENT_MAP_HOLE) { + return em; + } + + /* this is a hole, advance to the next extent */ + offset = extent_map_end(em); + free_extent_map(em); + if (offset >= last) + break; + } + return NULL; +} + int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent) { @@ -2907,16 +2967,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u32 flags = 0; u32 found_type; u64 last; + u64 last_for_get_extent = 0; u64 disko = 0; + u64 isize = i_size_read(inode); struct btrfs_key found_key; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; struct btrfs_path *path; struct btrfs_file_extent_item *item; int end = 0; - u64 em_start = 0, em_len = 0; + u64 em_start = 0; + u64 em_len = 0; + u64 em_end = 0; unsigned long emflags; - int hole = 0; if (len == 0) return -EINVAL; @@ -2926,6 +2989,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return -ENOMEM; path->leave_spinning = 1; + /* + * lookup the last file extent. We're not using i_size here + * because there might be preallocation past i_size + */ ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, path, inode->i_ino, -1, 0); if (ret < 0) { @@ -2939,18 +3006,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); found_type = btrfs_key_type(&found_key); - /* No extents, just return */ + /* No extents, but there might be delalloc bits */ if (found_key.objectid != inode->i_ino || found_type != BTRFS_EXTENT_DATA_KEY) { - btrfs_free_path(path); - return 0; + /* have to trust i_size as the end */ + last = (u64)-1; + last_for_get_extent = isize; + } else { + /* + * remember the start of the last extent. There are a + * bunch of different factors that go into the length of the + * extent, so its much less complex to remember where it started + */ + last = found_key.offset; + last_for_get_extent = last + 1; } - last = found_key.offset; btrfs_free_path(path); + /* + * we might have some extents allocated but more delalloc past those + * extents. so, we trust isize unless the start of the last extent is + * beyond isize + */ + if (last < isize) { + last = (u64)-1; + last_for_get_extent = isize; + } + lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, &cached_state, GFP_NOFS); - em = get_extent(inode, NULL, 0, off, max - off, 0); + + em = get_extent_skip_holes(inode, off, last_for_get_extent, + get_extent); if (!em) goto out; if (IS_ERR(em)) { @@ -2959,19 +3046,14 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } while (!end) { - hole = 0; - off = em->start + em->len; + off = extent_map_end(em); if (off >= max) end = 1; - if (em->block_start == EXTENT_MAP_HOLE) { - hole = 1; - goto next; - } - em_start = em->start; em_len = em->len; - + em_end = extent_map_end(em); + emflags = em->flags; disko = 0; flags = 0; @@ -2990,37 +3072,29 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; -next: - emflags = em->flags; free_extent_map(em); em = NULL; - if (!end) { - em = get_extent(inode, NULL, 0, off, max - off, 0); - if (!em) - goto out; - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out; - } - emflags = em->flags; - } - - if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) { + if ((em_start >= last) || em_len == (u64)-1 || + (last == (u64)-1 && isize <= em_end)) { flags |= FIEMAP_EXTENT_LAST; end = 1; } - if (em_start == last) { + /* now scan forward to see if this is really the last extent. */ + em = get_extent_skip_holes(inode, off, last_for_get_extent, + get_extent); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + if (!em) { flags |= FIEMAP_EXTENT_LAST; end = 1; } - - if (!hole) { - ret = fiemap_fill_next_extent(fieinfo, em_start, disko, - em_len, flags); - if (ret) - goto out_free; - } + ret = fiemap_fill_next_extent(fieinfo, em_start, disko, + em_len, flags); + if (ret) + goto out_free; } out_free: free_extent_map(em); @@ -3072,6 +3146,8 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, #endif eb = kmem_cache_zalloc(extent_buffer_cache, mask); + if (eb == NULL) + return NULL; eb->start = start; eb->len = len; spin_lock_init(&eb->lock); @@ -3187,7 +3263,13 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, } if (!PageUptodate(p)) uptodate = 0; - unlock_page(p); + + /* + * see below about how we avoid a nasty race with release page + * and why we unlock later + */ + if (i != 0) + unlock_page(p); } if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); @@ -3211,9 +3293,26 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, atomic_inc(&eb->refs); spin_unlock(&tree->buffer_lock); radix_tree_preload_end(); + + /* + * there is a race where release page may have + * tried to find this extent buffer in the radix + * but failed. It will tell the VM it is safe to + * reclaim the, and it will clear the page private bit. + * We must make sure to set the page private bit properly + * after the extent buffer is in the radix tree so + * it doesn't get lost + */ + set_page_extent_mapped(eb->first_page); + set_page_extent_head(eb->first_page, eb->len); + if (!page0) + unlock_page(eb->first_page); return eb; free_eb: + if (eb->first_page && !page0) + unlock_page(eb->first_page); + if (!atomic_dec_and_test(&eb->refs)) return exists; btrfs_release_extent_buffer(eb); @@ -3264,10 +3363,11 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, continue; lock_page(page); + WARN_ON(!PagePrivate(page)); + + set_page_extent_mapped(page); if (i == 0) set_page_extent_head(page, eb->len); - else - set_page_private(page, EXTENT_PAGE_PRIVATE); clear_page_dirty_for_io(page); spin_lock_irq(&page->mapping->tree_lock); @@ -3457,6 +3557,13 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); + + WARN_ON(!PagePrivate(page)); + + set_page_extent_mapped(page); + if (i == 0) + set_page_extent_head(page, eb->len); + if (inc_all_pages) page_cache_get(page); if (!PageUptodate(page)) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4183c8178f01..9318dfefd59c 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -20,8 +20,12 @@ #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) -/* flags for bio submission */ +/* + * flags for bio submission. The high bits indicate the compression + * type for this bio + */ #define EXTENT_BIO_COMPRESSED 1 +#define EXTENT_BIO_FLAG_SHIFT 16 /* these are bit numbers for test/set bit */ #define EXTENT_BUFFER_UPTODATE 0 @@ -135,6 +139,17 @@ struct extent_buffer { wait_queue_head_t lock_wq; }; +static inline void extent_set_compress_type(unsigned long *bio_flags, + int compress_type) +{ + *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT; +} + +static inline int extent_compress_type(unsigned long bio_flags) +{ + return bio_flags >> EXTENT_BIO_FLAG_SHIFT; +} + struct extent_map_tree; static inline struct extent_state *extent_state_next(struct extent_state *state) @@ -176,7 +191,7 @@ void extent_io_exit(void); u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, - u64 max_bytes, unsigned long bits); + u64 max_bytes, unsigned long bits, int contig); void free_extent_state(struct extent_state *state); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 23cb8da3ff66..2b6c12e983b3 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -3,6 +3,7 @@ #include <linux/module.h> #include <linux/spinlock.h> #include <linux/hardirq.h> +#include "ctree.h" #include "extent_map.h" @@ -50,10 +51,11 @@ struct extent_map *alloc_extent_map(gfp_t mask) { struct extent_map *em; em = kmem_cache_alloc(extent_map_cache, mask); - if (!em || IS_ERR(em)) - return em; + if (!em) + return NULL; em->in_tree = 0; em->flags = 0; + em->compress_type = BTRFS_COMPRESS_NONE; atomic_set(&em->refs, 1); return em; } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index ab6d74b6e647..28b44dbd1e35 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -26,7 +26,8 @@ struct extent_map { unsigned long flags; struct block_device *bdev; atomic_t refs; - int in_tree; + unsigned int in_tree:1; + unsigned int compress_type:4; }; struct extent_map_tree { diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index a562a250ae77..4f19a3e1bf32 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -536,6 +536,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, root = root->fs_info->csum_root; path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; while (1) { key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; @@ -548,7 +550,10 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, if (path->slots[0] == 0) goto out; path->slots[0]--; + } else if (ret < 0) { + goto out; } + leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 66836d85763b..7084140d5940 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -24,6 +24,7 @@ #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/mpage.h> +#include <linux/falloc.h> #include <linux/swap.h> #include <linux/writeback.h> #include <linux/statfs.h> @@ -185,6 +186,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split = alloc_extent_map(GFP_NOFS); if (!split2) split2 = alloc_extent_map(GFP_NOFS); + BUG_ON(!split || !split2); write_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); @@ -224,6 +226,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->bdev = em->bdev; split->flags = flags; + split->compress_type = em->compress_type; ret = add_extent_mapping(em_tree, split); BUG_ON(ret); free_extent_map(split); @@ -238,6 +241,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->len = em->start + em->len - (start + len); split->bdev = em->bdev; split->flags = flags; + split->compress_type = em->compress_type; if (compressed) { split->block_len = em->block_len; @@ -790,8 +794,12 @@ again: for (i = 0; i < num_pages; i++) { pages[i] = grab_cache_page(inode->i_mapping, index + i); if (!pages[i]) { - err = -ENOMEM; - BUG_ON(1); + int c; + for (c = i - 1; c >= 0; c--) { + unlock_page(pages[c]); + page_cache_release(pages[c]); + } + return -ENOMEM; } wait_on_page_writeback(pages[i]); } @@ -890,6 +898,17 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, if (err) goto out; + /* + * If BTRFS flips readonly due to some impossible error + * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR), + * although we have opened a file as writable, we have + * to stop this write operation to ensure FS consistency. + */ + if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + err = -EROFS; + goto out; + } + file_update_time(file); BTRFS_I(inode)->sequence++; @@ -932,6 +951,10 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / (sizeof(struct page *))); pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); + if (!pages) { + ret = -ENOMEM; + goto out; + } /* generic_write_checks can change our pos */ start_pos = pos; @@ -970,8 +993,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, size_t write_bytes = min(iov_iter_count(&i), nrptrs * (size_t)PAGE_CACHE_SIZE - offset); - size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; + size_t num_pages = (write_bytes + offset + + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; WARN_ON(num_pages > nrptrs); memset(pages, 0, sizeof(struct page *) * nrptrs); @@ -1001,8 +1024,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, copied = btrfs_copy_from_user(pos, num_pages, write_bytes, pages, &i); - dirty_pages = (copied + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; + dirty_pages = (copied + offset + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; if (num_pages > dirty_pages) { if (copied > 0) @@ -1237,6 +1260,117 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) return 0; } +static long btrfs_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct extent_state *cached_state = NULL; + u64 cur_offset; + u64 last_byte; + u64 alloc_start; + u64 alloc_end; + u64 alloc_hint = 0; + u64 locked_end; + u64 mask = BTRFS_I(inode)->root->sectorsize - 1; + struct extent_map *em; + int ret; + + alloc_start = offset & ~mask; + alloc_end = (offset + len + mask) & ~mask; + + /* We only support the FALLOC_FL_KEEP_SIZE mode */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + /* + * wait for ordered IO before we have any locks. We'll loop again + * below with the locks held. + */ + btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); + + mutex_lock(&inode->i_mutex); + ret = inode_newsize_ok(inode, alloc_end); + if (ret) + goto out; + + if (alloc_start > inode->i_size) { + ret = btrfs_cont_expand(inode, alloc_start); + if (ret) + goto out; + } + + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); + if (ret) + goto out; + + locked_end = alloc_end - 1; + while (1) { + struct btrfs_ordered_extent *ordered; + + /* the extent lock is ordered inside the running + * transaction + */ + lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, + locked_end, 0, &cached_state, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, + alloc_end - 1); + if (ordered && + ordered->file_offset + ordered->len > alloc_start && + ordered->file_offset < alloc_end) { + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + alloc_start, locked_end, + &cached_state, GFP_NOFS); + /* + * we can't wait on the range with the transaction + * running or with the extent lock held + */ + btrfs_wait_ordered_range(inode, alloc_start, + alloc_end - alloc_start); + } else { + if (ordered) + btrfs_put_ordered_extent(ordered); + break; + } + } + + cur_offset = alloc_start; + while (1) { + em = btrfs_get_extent(inode, NULL, 0, cur_offset, + alloc_end - cur_offset, 0); + BUG_ON(IS_ERR(em) || !em); + last_byte = min(extent_map_end(em), alloc_end); + last_byte = (last_byte + mask) & ~mask; + if (em->block_start == EXTENT_MAP_HOLE || + (cur_offset >= inode->i_size && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + ret = btrfs_prealloc_file_range(inode, mode, cur_offset, + last_byte - cur_offset, + 1 << inode->i_blkbits, + offset + len, + &alloc_hint); + if (ret < 0) { + free_extent_map(em); + break; + } + } + free_extent_map(em); + + cur_offset = last_byte; + if (cur_offset >= alloc_end) { + ret = 0; + break; + } + } + unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state, GFP_NOFS); + + btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + const struct file_operations btrfs_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -1248,6 +1382,7 @@ const struct file_operations btrfs_file_operations = { .open = generic_file_open, .release = btrfs_release_file, .fsync = btrfs_sync_file, + .fallocate = btrfs_fallocate, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_ioctl, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 60d684266959..a0390657451b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -987,11 +987,18 @@ tree_search_offset(struct btrfs_block_group_cache *block_group, return entry; } -static void unlink_free_space(struct btrfs_block_group_cache *block_group, - struct btrfs_free_space *info) +static inline void +__unlink_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info) { rb_erase(&info->offset_index, &block_group->free_space_offset); block_group->free_extents--; +} + +static void unlink_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info) +{ + __unlink_free_space(block_group, info); block_group->free_space -= info->bytes; } @@ -1016,14 +1023,18 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group) u64 max_bytes; u64 bitmap_bytes; u64 extent_bytes; + u64 size = block_group->key.offset; /* * The goal is to keep the total amount of memory used per 1gb of space * at or below 32k, so we need to adjust how much memory we allow to be * used by extent based free space tracking */ - max_bytes = MAX_CACHE_BYTES_PER_GIG * - (div64_u64(block_group->key.offset, 1024 * 1024 * 1024)); + if (size < 1024 * 1024 * 1024) + max_bytes = MAX_CACHE_BYTES_PER_GIG; + else + max_bytes = MAX_CACHE_BYTES_PER_GIG * + div64_u64(size, 1024 * 1024 * 1024); /* * we want to account for 1 more bitmap than what we have so we can make @@ -1171,6 +1182,16 @@ static void add_new_bitmap(struct btrfs_block_group_cache *block_group, recalculate_thresholds(block_group); } +static void free_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *bitmap_info) +{ + unlink_free_space(block_group, bitmap_info); + kfree(bitmap_info->bitmap); + kfree(bitmap_info); + block_group->total_bitmaps--; + recalculate_thresholds(block_group); +} + static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *bitmap_info, u64 *offset, u64 *bytes) @@ -1195,6 +1216,7 @@ again: */ search_start = *offset; search_bytes = *bytes; + search_bytes = min(search_bytes, end - search_start + 1); ret = search_bitmap(block_group, bitmap_info, &search_start, &search_bytes); BUG_ON(ret < 0 || search_start != *offset); @@ -1211,13 +1233,8 @@ again: if (*bytes) { struct rb_node *next = rb_next(&bitmap_info->offset_index); - if (!bitmap_info->bytes) { - unlink_free_space(block_group, bitmap_info); - kfree(bitmap_info->bitmap); - kfree(bitmap_info); - block_group->total_bitmaps--; - recalculate_thresholds(block_group); - } + if (!bitmap_info->bytes) + free_bitmap(block_group, bitmap_info); /* * no entry after this bitmap, but we still have bytes to @@ -1250,13 +1267,8 @@ again: return -EAGAIN; goto again; - } else if (!bitmap_info->bytes) { - unlink_free_space(block_group, bitmap_info); - kfree(bitmap_info->bitmap); - kfree(bitmap_info); - block_group->total_bitmaps--; - recalculate_thresholds(block_group); - } + } else if (!bitmap_info->bytes) + free_bitmap(block_group, bitmap_info); return 0; } @@ -1359,22 +1371,14 @@ out: return ret; } -int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) +bool try_merge_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info, bool update_stat) { - struct btrfs_free_space *right_info = NULL; - struct btrfs_free_space *left_info = NULL; - struct btrfs_free_space *info = NULL; - int ret = 0; - - info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); - if (!info) - return -ENOMEM; - - info->offset = offset; - info->bytes = bytes; - - spin_lock(&block_group->tree_lock); + struct btrfs_free_space *left_info; + struct btrfs_free_space *right_info; + bool merged = false; + u64 offset = info->offset; + u64 bytes = info->bytes; /* * first we want to see if there is free space adjacent to the range we @@ -1388,37 +1392,62 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, else left_info = tree_search_offset(block_group, offset - 1, 0, 0); - /* - * If there was no extent directly to the left or right of this new - * extent then we know we're going to have to allocate a new extent, so - * before we do that see if we need to drop this into a bitmap - */ - if ((!left_info || left_info->bitmap) && - (!right_info || right_info->bitmap)) { - ret = insert_into_bitmap(block_group, info); - - if (ret < 0) { - goto out; - } else if (ret) { - ret = 0; - goto out; - } - } - if (right_info && !right_info->bitmap) { - unlink_free_space(block_group, right_info); + if (update_stat) + unlink_free_space(block_group, right_info); + else + __unlink_free_space(block_group, right_info); info->bytes += right_info->bytes; kfree(right_info); + merged = true; } if (left_info && !left_info->bitmap && left_info->offset + left_info->bytes == offset) { - unlink_free_space(block_group, left_info); + if (update_stat) + unlink_free_space(block_group, left_info); + else + __unlink_free_space(block_group, left_info); info->offset = left_info->offset; info->bytes += left_info->bytes; kfree(left_info); + merged = true; } + return merged; +} + +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + struct btrfs_free_space *info; + int ret = 0; + + info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); + if (!info) + return -ENOMEM; + + info->offset = offset; + info->bytes = bytes; + + spin_lock(&block_group->tree_lock); + + if (try_merge_free_space(block_group, info, true)) + goto link; + + /* + * There was no extent directly to the left or right of this new + * extent then we know we're going to have to allocate a new extent, so + * before we do that see if we need to drop this into a bitmap + */ + ret = insert_into_bitmap(block_group, info); + if (ret < 0) { + goto out; + } else if (ret) { + ret = 0; + goto out; + } +link: ret = link_free_space(block_group, info); if (ret) kfree(info); @@ -1621,6 +1650,7 @@ __btrfs_return_cluster_to_free_space( node = rb_next(&entry->offset_index); rb_erase(&entry->offset_index, &cluster->root); BUG_ON(entry->bitmap); + try_merge_free_space(block_group, entry, false); tree_insert_offset(&block_group->free_space_offset, entry->offset, &entry->offset_index, 0); } @@ -1685,13 +1715,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, ret = offset; if (entry->bitmap) { bitmap_clear_bits(block_group, entry, offset, bytes); - if (!entry->bytes) { - unlink_free_space(block_group, entry); - kfree(entry->bitmap); - kfree(entry); - block_group->total_bitmaps--; - recalculate_thresholds(block_group); - } + if (!entry->bytes) + free_bitmap(block_group, entry); } else { unlink_free_space(block_group, entry); entry->offset += bytes; @@ -1789,6 +1814,8 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, ret = search_start; bitmap_clear_bits(block_group, entry, ret, bytes); + if (entry->bytes == 0) + free_bitmap(block_group, entry); out: spin_unlock(&cluster->lock); spin_unlock(&block_group->tree_lock); @@ -1842,15 +1869,26 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, entry->offset += bytes; entry->bytes -= bytes; - if (entry->bytes == 0) { + if (entry->bytes == 0) rb_erase(&entry->offset_index, &cluster->root); - kfree(entry); - } break; } out: spin_unlock(&cluster->lock); + if (!ret) + return 0; + + spin_lock(&block_group->tree_lock); + + block_group->free_space -= bytes; + if (entry->bytes == 0) { + block_group->free_extents--; + kfree(entry); + } + + spin_unlock(&block_group->tree_lock); + return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a0ff46a47895..0efdb65953c5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -122,10 +122,10 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, size_t cur_size = size; size_t datasize; unsigned long offset; - int use_compress = 0; + int compress_type = BTRFS_COMPRESS_NONE; if (compressed_size && compressed_pages) { - use_compress = 1; + compress_type = root->fs_info->compress_type; cur_size = compressed_size; } @@ -159,7 +159,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_ram_bytes(leaf, ei, size); ptr = btrfs_file_extent_inline_start(ei); - if (use_compress) { + if (compress_type != BTRFS_COMPRESS_NONE) { struct page *cpage; int i = 0; while (compressed_size > 0) { @@ -176,7 +176,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, compressed_size -= cur_size; } btrfs_set_file_extent_compression(leaf, ei, - BTRFS_COMPRESS_ZLIB); + compress_type); } else { page = find_get_page(inode->i_mapping, start >> PAGE_CACHE_SHIFT); @@ -263,6 +263,7 @@ struct async_extent { u64 compressed_size; struct page **pages; unsigned long nr_pages; + int compress_type; struct list_head list; }; @@ -280,7 +281,8 @@ static noinline int add_async_extent(struct async_cow *cow, u64 start, u64 ram_size, u64 compressed_size, struct page **pages, - unsigned long nr_pages) + unsigned long nr_pages, + int compress_type) { struct async_extent *async_extent; @@ -290,6 +292,7 @@ static noinline int add_async_extent(struct async_cow *cow, async_extent->compressed_size = compressed_size; async_extent->pages = pages; async_extent->nr_pages = nr_pages; + async_extent->compress_type = compress_type; list_add_tail(&async_extent->list, &cow->extents); return 0; } @@ -332,6 +335,7 @@ static noinline int compress_file_range(struct inode *inode, unsigned long max_uncompressed = 128 * 1024; int i; int will_compress; + int compress_type = root->fs_info->compress_type; actual_end = min_t(u64, isize, end + 1); again: @@ -381,12 +385,16 @@ again: WARN_ON(pages); pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); - ret = btrfs_zlib_compress_pages(inode->i_mapping, start, - total_compressed, pages, - nr_pages, &nr_pages_ret, - &total_in, - &total_compressed, - max_compressed); + if (BTRFS_I(inode)->force_compress) + compress_type = BTRFS_I(inode)->force_compress; + + ret = btrfs_compress_pages(compress_type, + inode->i_mapping, start, + total_compressed, pages, + nr_pages, &nr_pages_ret, + &total_in, + &total_compressed, + max_compressed); if (!ret) { unsigned long offset = total_compressed & @@ -408,7 +416,7 @@ again: } if (start == 0) { trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -493,7 +501,8 @@ again: * and will submit them to the elevator. */ add_async_extent(async_cow, start, num_bytes, - total_compressed, pages, nr_pages_ret); + total_compressed, pages, nr_pages_ret, + compress_type); if (start + num_bytes < end) { start += num_bytes; @@ -515,7 +524,8 @@ cleanup_and_bail_uncompressed: __set_page_dirty_nobuffers(locked_page); /* unlocked later on in the async handlers */ } - add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0); + add_async_extent(async_cow, start, end - start + 1, + 0, NULL, 0, BTRFS_COMPRESS_NONE); *num_added += 1; } @@ -602,6 +612,7 @@ retry: GFP_NOFS); trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); ret = btrfs_reserve_extent(trans, root, async_extent->compressed_size, async_extent->compressed_size, @@ -633,6 +644,7 @@ retry: async_extent->ram_size - 1, 0); em = alloc_extent_map(GFP_NOFS); + BUG_ON(!em); em->start = async_extent->start; em->len = async_extent->ram_size; em->orig_start = em->start; @@ -640,6 +652,7 @@ retry: em->block_start = ins.objectid; em->block_len = ins.offset; em->bdev = root->fs_info->fs_devices->latest_bdev; + em->compress_type = async_extent->compress_type; set_bit(EXTENT_FLAG_PINNED, &em->flags); set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); @@ -656,11 +669,13 @@ retry: async_extent->ram_size - 1, 0); } - ret = btrfs_add_ordered_extent(inode, async_extent->start, - ins.objectid, - async_extent->ram_size, - ins.offset, - BTRFS_ORDERED_COMPRESSED); + ret = btrfs_add_ordered_extent_compress(inode, + async_extent->start, + ins.objectid, + async_extent->ram_size, + ins.offset, + BTRFS_ORDERED_COMPRESSED, + async_extent->compress_type); BUG_ON(ret); /* @@ -758,7 +773,7 @@ static noinline int cow_file_range(struct inode *inode, BUG_ON(root == root->fs_info->tree_root); trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -806,6 +821,7 @@ static noinline int cow_file_range(struct inode *inode, BUG_ON(ret); em = alloc_extent_map(GFP_NOFS); + BUG_ON(!em); em->start = start; em->orig_start = em->start; ram_size = ins.offset; @@ -1036,7 +1052,7 @@ static noinline int run_delalloc_nocow(struct inode *inode, } else { trans = btrfs_join_transaction(root, 1); } - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); cow_start = (u64)-1; cur_offset = start; @@ -1155,6 +1171,7 @@ out_check: struct extent_map_tree *em_tree; em_tree = &BTRFS_I(inode)->extent_tree; em = alloc_extent_map(GFP_NOFS); + BUG_ON(!em); em->start = cur_offset; em->orig_start = em->start; em->len = num_bytes; @@ -1544,6 +1561,7 @@ out: out_page: unlock_page(page); page_cache_release(page); + kfree(fixup); } /* @@ -1670,7 +1688,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) struct btrfs_ordered_extent *ordered_extent = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; - int compressed = 0; + int compress_type = 0; int ret; bool nolock = false; @@ -1690,7 +1708,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) trans = btrfs_join_transaction_nolock(root, 1); else trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; ret = btrfs_update_inode(trans, root, inode); @@ -1707,13 +1725,14 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) trans = btrfs_join_transaction_nolock(root, 1); else trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) - compressed = 1; + compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { - BUG_ON(compressed); + BUG_ON(compress_type); ret = btrfs_mark_extent_written(trans, inode, ordered_extent->file_offset, ordered_extent->file_offset + @@ -1727,7 +1746,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->disk_len, ordered_extent->len, ordered_extent->len, - compressed, 0, 0, + compress_type, 0, 0, BTRFS_FILE_EXTENT_REG); unpin_extent_cache(&BTRFS_I(inode)->extent_tree, ordered_extent->file_offset, @@ -1829,6 +1848,8 @@ static int btrfs_io_failed_hook(struct bio *failed_bio, if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { logical = em->block_start; failrec->bio_flags = EXTENT_BIO_COMPRESSED; + extent_set_compress_type(&failrec->bio_flags, + em->compress_type); } failrec->logical = logical; free_extent_map(em); @@ -1892,7 +1913,7 @@ static int btrfs_clean_io_failures(struct inode *inode, u64 start) private = 0; if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, - (u64)-1, 1, EXTENT_DIRTY)) { + (u64)-1, 1, EXTENT_DIRTY, 0)) { ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start, &private_failure); if (ret == 0) { @@ -2339,6 +2360,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) */ if (is_bad_inode(inode)) { trans = btrfs_start_transaction(root, 0); + BUG_ON(IS_ERR(trans)); btrfs_orphan_del(trans, inode); btrfs_end_transaction(trans, root); iput(inode); @@ -2366,6 +2388,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) if (root->orphan_block_rsv || root->orphan_item_inserted) { trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); btrfs_end_transaction(trans, root); } @@ -2626,7 +2649,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; - goto err; + goto out; } path->leave_spinning = 1; @@ -2699,9 +2722,10 @@ static int check_path_shared(struct btrfs_root *root, struct extent_buffer *eb; int level; u64 refs = 1; - int uninitialized_var(ret); for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + int ret; + if (!path->nodes[level]) break; eb = path->nodes[level]; @@ -2712,7 +2736,7 @@ static int check_path_shared(struct btrfs_root *root, if (refs > 1) return 1; } - return ret; /* XXX callers? */ + return 0; } /* @@ -3671,8 +3695,12 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; int err; + if (btrfs_root_readonly(root)) + return -EROFS; + err = inode_change_ok(inode, attr); if (err) return err; @@ -4084,8 +4112,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) int index; int ret; - d_set_d_op(dentry, &btrfs_dentry_operations); - if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); @@ -4117,7 +4143,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) } srcu_read_unlock(&root->fs_info->subvol_srcu, index); - if (root != sub_root) { + if (!IS_ERR(inode) && root != sub_root) { down_read(&root->fs_info->cleanup_work_sem); if (!(inode->i_sb->s_flags & MS_RDONLY)) btrfs_orphan_cleanup(sub_root); @@ -4330,6 +4356,8 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) trans = btrfs_join_transaction_nolock(root, 1); else trans = btrfs_join_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); btrfs_set_trans_block_group(trans, inode); if (nolock) ret = btrfs_end_transaction_nolock(trans, root); @@ -4355,6 +4383,7 @@ void btrfs_dirty_inode(struct inode *inode) return; trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); ret = btrfs_update_inode(trans, root, inode); @@ -4930,8 +4959,10 @@ static noinline int uncompress_inline(struct btrfs_path *path, size_t max_size; unsigned long inline_size; unsigned long ptr; + int compress_type; WARN_ON(pg_offset != 0); + compress_type = btrfs_file_extent_compression(leaf, item); max_size = btrfs_file_extent_ram_bytes(leaf, item); inline_size = btrfs_file_extent_inline_item_len(leaf, btrfs_item_nr(leaf, path->slots[0])); @@ -4941,8 +4972,8 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); - ret = btrfs_zlib_decompress(tmp, page, extent_offset, - inline_size, max_size); + ret = btrfs_decompress(compress_type, tmp, page, + extent_offset, inline_size, max_size); if (ret) { char *kaddr = kmap_atomic(page, KM_USER0); unsigned long copy_size = min_t(u64, @@ -4984,7 +5015,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_trans_handle *trans = NULL; - int compressed; + int compress_type; again: read_lock(&em_tree->lock); @@ -5043,7 +5074,7 @@ again: found_type = btrfs_file_extent_type(leaf, item); extent_start = found_key.offset; - compressed = btrfs_file_extent_compression(leaf, item); + compress_type = btrfs_file_extent_compression(leaf, item); if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { extent_end = extent_start + @@ -5089,8 +5120,9 @@ again: em->block_start = EXTENT_MAP_HOLE; goto insert; } - if (compressed) { + if (compress_type != BTRFS_COMPRESS_NONE) { set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->compress_type = compress_type; em->block_start = bytenr; em->block_len = btrfs_file_extent_disk_num_bytes(leaf, item); @@ -5124,12 +5156,14 @@ again: em->len = (copy_size + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); em->orig_start = EXTENT_MAP_INLINE; - if (compressed) + if (compress_type) { set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->compress_type = compress_type; + } ptr = btrfs_file_extent_inline_start(item) + extent_offset; if (create == 0 && !PageUptodate(page)) { - if (btrfs_file_extent_compression(leaf, item) == - BTRFS_COMPRESS_ZLIB) { + if (btrfs_file_extent_compression(leaf, item) != + BTRFS_COMPRESS_NONE) { ret = uncompress_inline(path, inode, page, pg_offset, extent_offset, item); @@ -5154,6 +5188,8 @@ again: em = NULL; btrfs_release_path(root, path); trans = btrfs_join_transaction(root, 1); + if (IS_ERR(trans)) + return ERR_CAST(trans); goto again; } map = kmap(page); @@ -5244,6 +5280,128 @@ out: return em; } +struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, + size_t pg_offset, u64 start, u64 len, + int create) +{ + struct extent_map *em; + struct extent_map *hole_em = NULL; + u64 range_start = start; + u64 end; + u64 found; + u64 found_end; + int err = 0; + + em = btrfs_get_extent(inode, page, pg_offset, start, len, create); + if (IS_ERR(em)) + return em; + if (em) { + /* + * if our em maps to a hole, there might + * actually be delalloc bytes behind it + */ + if (em->block_start != EXTENT_MAP_HOLE) + return em; + else + hole_em = em; + } + + /* check to see if we've wrapped (len == -1 or similar) */ + end = start + len; + if (end < start) + end = (u64)-1; + else + end -= 1; + + em = NULL; + + /* ok, we didn't find anything, lets look for delalloc */ + found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start, + end, len, EXTENT_DELALLOC, 1); + found_end = range_start + found; + if (found_end < range_start) + found_end = (u64)-1; + + /* + * we didn't find anything useful, return + * the original results from get_extent() + */ + if (range_start > end || found_end <= start) { + em = hole_em; + hole_em = NULL; + goto out; + } + + /* adjust the range_start to make sure it doesn't + * go backwards from the start they passed in + */ + range_start = max(start,range_start); + found = found_end - range_start; + + if (found > 0) { + u64 hole_start = start; + u64 hole_len = len; + + em = alloc_extent_map(GFP_NOFS); + if (!em) { + err = -ENOMEM; + goto out; + } + /* + * when btrfs_get_extent can't find anything it + * returns one huge hole + * + * make sure what it found really fits our range, and + * adjust to make sure it is based on the start from + * the caller + */ + if (hole_em) { + u64 calc_end = extent_map_end(hole_em); + + if (calc_end <= start || (hole_em->start > end)) { + free_extent_map(hole_em); + hole_em = NULL; + } else { + hole_start = max(hole_em->start, start); + hole_len = calc_end - hole_start; + } + } + em->bdev = NULL; + if (hole_em && range_start > hole_start) { + /* our hole starts before our delalloc, so we + * have to return just the parts of the hole + * that go until the delalloc starts + */ + em->len = min(hole_len, + range_start - hole_start); + em->start = hole_start; + em->orig_start = hole_start; + /* + * don't adjust block start at all, + * it is fixed at EXTENT_MAP_HOLE + */ + em->block_start = hole_em->block_start; + em->block_len = hole_len; + } else { + em->start = range_start; + em->len = found; + em->orig_start = range_start; + em->block_start = EXTENT_MAP_DELALLOC; + em->block_len = found; + } + } else if (hole_em) { + return hole_em; + } +out: + + free_extent_map(hole_em); + if (err) { + free_extent_map(em); + return ERR_PTR(err); + } + return em; +} + static struct extent_map *btrfs_new_extent_direct(struct inode *inode, u64 start, u64 len) { @@ -5258,8 +5416,8 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, btrfs_drop_extent_cache(inode, start, start + len - 1, 0); trans = btrfs_join_transaction(root, 0); - if (!trans) - return ERR_PTR(-ENOMEM); + if (IS_ERR(trans)) + return ERR_CAST(trans); trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -5483,7 +5641,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, * while we look for nocow cross refs */ trans = btrfs_join_transaction(root, 0); - if (!trans) + if (IS_ERR(trans)) goto must_cow; if (can_nocow_odirect(trans, inode, start, len) == 1) { @@ -5618,7 +5776,7 @@ again: BUG_ON(!ordered); trans = btrfs_join_transaction(root, 1); - if (!trans) { + if (IS_ERR(trans)) { err = -ENOMEM; goto out; } @@ -6066,7 +6224,7 @@ out: static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { - return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent); + return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); } int btrfs_readpage(struct file *file, struct page *page) @@ -6479,7 +6637,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->ordered_data_close = 0; ei->orphan_meta_reserved = 0; ei->dummy_inode = 0; - ei->force_compress = 0; + ei->force_compress = BTRFS_COMPRESS_NONE; inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree, GFP_NOFS); @@ -7100,112 +7258,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, min_size, actual_len, alloc_hint, trans); } -static long btrfs_fallocate(struct inode *inode, int mode, - loff_t offset, loff_t len) -{ - struct extent_state *cached_state = NULL; - u64 cur_offset; - u64 last_byte; - u64 alloc_start; - u64 alloc_end; - u64 alloc_hint = 0; - u64 locked_end; - u64 mask = BTRFS_I(inode)->root->sectorsize - 1; - struct extent_map *em; - int ret; - - alloc_start = offset & ~mask; - alloc_end = (offset + len + mask) & ~mask; - - /* - * wait for ordered IO before we have any locks. We'll loop again - * below with the locks held. - */ - btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); - - mutex_lock(&inode->i_mutex); - ret = inode_newsize_ok(inode, alloc_end); - if (ret) - goto out; - - if (alloc_start > inode->i_size) { - ret = btrfs_cont_expand(inode, alloc_start); - if (ret) - goto out; - } - - ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); - if (ret) - goto out; - - locked_end = alloc_end - 1; - while (1) { - struct btrfs_ordered_extent *ordered; - - /* the extent lock is ordered inside the running - * transaction - */ - lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, - locked_end, 0, &cached_state, GFP_NOFS); - ordered = btrfs_lookup_first_ordered_extent(inode, - alloc_end - 1); - if (ordered && - ordered->file_offset + ordered->len > alloc_start && - ordered->file_offset < alloc_end) { - btrfs_put_ordered_extent(ordered); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - alloc_start, locked_end, - &cached_state, GFP_NOFS); - /* - * we can't wait on the range with the transaction - * running or with the extent lock held - */ - btrfs_wait_ordered_range(inode, alloc_start, - alloc_end - alloc_start); - } else { - if (ordered) - btrfs_put_ordered_extent(ordered); - break; - } - } - - cur_offset = alloc_start; - while (1) { - em = btrfs_get_extent(inode, NULL, 0, cur_offset, - alloc_end - cur_offset, 0); - BUG_ON(IS_ERR(em) || !em); - last_byte = min(extent_map_end(em), alloc_end); - last_byte = (last_byte + mask) & ~mask; - if (em->block_start == EXTENT_MAP_HOLE || - (cur_offset >= inode->i_size && - !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - ret = btrfs_prealloc_file_range(inode, mode, cur_offset, - last_byte - cur_offset, - 1 << inode->i_blkbits, - offset + len, - &alloc_hint); - if (ret < 0) { - free_extent_map(em); - break; - } - } - free_extent_map(em); - - cur_offset = last_byte; - if (cur_offset >= alloc_end) { - ret = 0; - break; - } - } - unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - &cached_state, GFP_NOFS); - - btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); -out: - mutex_unlock(&inode->i_mutex); - return ret; -} - static int btrfs_set_page_dirty(struct page *page) { return __set_page_dirty_nobuffers(page); @@ -7213,6 +7265,10 @@ static int btrfs_set_page_dirty(struct page *page) static int btrfs_permission(struct inode *inode, int mask, unsigned int flags) { + struct btrfs_root *root = BTRFS_I(inode)->root; + + if (btrfs_root_readonly(root) && (mask & MAY_WRITE)) + return -EROFS; if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) return -EACCES; return generic_permission(inode, mask, flags, btrfs_check_acl); @@ -7308,7 +7364,6 @@ static const struct inode_operations btrfs_file_inode_operations = { .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .permission = btrfs_permission, - .fallocate = btrfs_fallocate, .fiemap = btrfs_fiemap, }; static const struct inode_operations btrfs_special_inode_operations = { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f87552a1d7ea..5fdb2abc4fa7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -147,6 +147,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) unsigned int flags, oldflags; int ret; + if (btrfs_root_readonly(root)) + return -EROFS; + if (copy_from_user(&flags, arg, sizeof(flags))) return -EFAULT; @@ -200,7 +203,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); @@ -360,7 +363,8 @@ fail: } static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, - char *name, int namelen, u64 *async_transid) + char *name, int namelen, u64 *async_transid, + bool readonly) { struct inode *inode; struct dentry *parent; @@ -378,6 +382,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, btrfs_init_block_rsv(&pending_snapshot->block_rsv); pending_snapshot->dentry = dentry; pending_snapshot->root = root; + pending_snapshot->readonly = readonly; trans = btrfs_start_transaction(root->fs_info->extent_root, 5); if (IS_ERR(trans)) { @@ -509,7 +514,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) static noinline int btrfs_mksubvol(struct path *parent, char *name, int namelen, struct btrfs_root *snap_src, - u64 *async_transid) + u64 *async_transid, bool readonly) { struct inode *dir = parent->dentry->d_inode; struct dentry *dentry; @@ -541,7 +546,7 @@ static noinline int btrfs_mksubvol(struct path *parent, if (snap_src) { error = create_snapshot(snap_src, dentry, - name, namelen, async_transid); + name, namelen, async_transid, readonly); } else { error = create_subvol(BTRFS_I(dir)->root, dentry, name, namelen, async_transid); @@ -638,9 +643,11 @@ static int btrfs_defrag_file(struct file *file, struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; struct page *page; + struct btrfs_super_block *disk_super; unsigned long last_index; unsigned long ra_pages = root->fs_info->bdi.ra_pages; unsigned long total_read = 0; + u64 features; u64 page_start; u64 page_end; u64 last_len = 0; @@ -648,6 +655,14 @@ static int btrfs_defrag_file(struct file *file, u64 defrag_end = 0; unsigned long i; int ret; + int compress_type = BTRFS_COMPRESS_ZLIB; + + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) { + if (range->compress_type > BTRFS_COMPRESS_TYPES) + return -EINVAL; + if (range->compress_type) + compress_type = range->compress_type; + } if (inode->i_size == 0) return 0; @@ -683,7 +698,7 @@ static int btrfs_defrag_file(struct file *file, total_read++; mutex_lock(&inode->i_mutex); if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) - BTRFS_I(inode)->force_compress = 1; + BTRFS_I(inode)->force_compress = compress_type; ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); if (ret) @@ -781,10 +796,17 @@ loop_unlock: atomic_dec(&root->fs_info->async_submit_draining); mutex_lock(&inode->i_mutex); - BTRFS_I(inode)->force_compress = 0; + BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE; mutex_unlock(&inode->i_mutex); } + disk_super = &root->fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (range->compress_type == BTRFS_COMPRESS_LZO) { + features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + btrfs_set_super_incompat_flags(disk_super, features); + } + return 0; err_reservations: @@ -885,6 +907,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (new_size > old_size) { trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_unlock; + } ret = btrfs_grow_device(trans, device, new_size); btrfs_commit_transaction(trans, root); } else { @@ -901,7 +927,8 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, char *name, unsigned long fd, int subvol, - u64 *transid) + u64 *transid, + bool readonly) { struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct file *src_file; @@ -919,7 +946,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, if (subvol) { ret = btrfs_mksubvol(&file->f_path, name, namelen, - NULL, transid); + NULL, transid, readonly); } else { struct inode *src_inode; src_file = fget(fd); @@ -938,7 +965,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, } ret = btrfs_mksubvol(&file->f_path, name, namelen, BTRFS_I(src_inode)->root, - transid); + transid, readonly); fput(src_file); } out: @@ -946,58 +973,142 @@ out: } static noinline int btrfs_ioctl_snap_create(struct file *file, - void __user *arg, int subvol, - int v2) + void __user *arg, int subvol) { - struct btrfs_ioctl_vol_args *vol_args = NULL; - struct btrfs_ioctl_vol_args_v2 *vol_args_v2 = NULL; - char *name; - u64 fd; + struct btrfs_ioctl_vol_args *vol_args; int ret; - if (v2) { - u64 transid = 0; - u64 *ptr = NULL; + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - vol_args_v2 = memdup_user(arg, sizeof(*vol_args_v2)); - if (IS_ERR(vol_args_v2)) - return PTR_ERR(vol_args_v2); + ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, + vol_args->fd, subvol, + NULL, false); - if (vol_args_v2->flags & ~BTRFS_SUBVOL_CREATE_ASYNC) { - ret = -EINVAL; - goto out; - } - - name = vol_args_v2->name; - fd = vol_args_v2->fd; - vol_args_v2->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + kfree(vol_args); + return ret; +} - if (vol_args_v2->flags & BTRFS_SUBVOL_CREATE_ASYNC) - ptr = &transid; +static noinline int btrfs_ioctl_snap_create_v2(struct file *file, + void __user *arg, int subvol) +{ + struct btrfs_ioctl_vol_args_v2 *vol_args; + int ret; + u64 transid = 0; + u64 *ptr = NULL; + bool readonly = false; - ret = btrfs_ioctl_snap_create_transid(file, name, fd, - subvol, ptr); + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; - if (ret == 0 && ptr && - copy_to_user(arg + - offsetof(struct btrfs_ioctl_vol_args_v2, - transid), ptr, sizeof(*ptr))) - ret = -EFAULT; - } else { - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); - name = vol_args->name; - fd = vol_args->fd; - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - - ret = btrfs_ioctl_snap_create_transid(file, name, fd, - subvol, NULL); + if (vol_args->flags & + ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) { + ret = -EOPNOTSUPP; + goto out; } + + if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) + ptr = &transid; + if (vol_args->flags & BTRFS_SUBVOL_RDONLY) + readonly = true; + + ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, + vol_args->fd, subvol, + ptr, readonly); + + if (ret == 0 && ptr && + copy_to_user(arg + + offsetof(struct btrfs_ioctl_vol_args_v2, + transid), ptr, sizeof(*ptr))) + ret = -EFAULT; out: kfree(vol_args); - kfree(vol_args_v2); + return ret; +} + +static noinline int btrfs_ioctl_subvol_getflags(struct file *file, + void __user *arg) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + u64 flags = 0; + + if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) + return -EINVAL; + + down_read(&root->fs_info->subvol_sem); + if (btrfs_root_readonly(root)) + flags |= BTRFS_SUBVOL_RDONLY; + up_read(&root->fs_info->subvol_sem); + + if (copy_to_user(arg, &flags, sizeof(flags))) + ret = -EFAULT; + + return ret; +} + +static noinline int btrfs_ioctl_subvol_setflags(struct file *file, + void __user *arg) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + u64 root_flags; + u64 flags; + int ret = 0; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) + return -EINVAL; + + if (copy_from_user(&flags, arg, sizeof(flags))) + return -EFAULT; + + if (flags & BTRFS_SUBVOL_CREATE_ASYNC) + return -EINVAL; + + if (flags & ~BTRFS_SUBVOL_RDONLY) + return -EOPNOTSUPP; + + if (!is_owner_or_cap(inode)) + return -EACCES; + + down_write(&root->fs_info->subvol_sem); + + /* nothing to do */ + if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root)) + goto out; + root_flags = btrfs_root_flags(&root->root_item); + if (flags & BTRFS_SUBVOL_RDONLY) + btrfs_set_root_flags(&root->root_item, + root_flags | BTRFS_ROOT_SUBVOL_RDONLY); + else + btrfs_set_root_flags(&root->root_item, + root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY); + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_reset; + } + + ret = btrfs_update_root(trans, root->fs_info->tree_root, + &root->root_key, &root->root_item); + + btrfs_commit_transaction(trans, root); +out_reset: + if (ret) + btrfs_set_root_flags(&root->root_item, root_flags); +out: + up_write(&root->fs_info->subvol_sem); return ret; } @@ -1509,6 +1620,9 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) struct btrfs_ioctl_defrag_range_args *range; int ret; + if (btrfs_root_readonly(root)) + return -EROFS; + ret = mnt_want_write(file->f_path.mnt); if (ret) return ret; @@ -1637,6 +1751,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) return -EINVAL; + if (btrfs_root_readonly(root)) + return -EROFS; + ret = mnt_want_write(file->f_path.mnt); if (ret) return ret; @@ -1788,7 +1905,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, memcpy(&new_key, &key, sizeof(new_key)); new_key.objectid = inode->i_ino; - new_key.offset = key.offset + destoff - off; + if (off <= key.offset) + new_key.offset = key.offset + destoff - off; + else + new_key.offset = destoff; trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { @@ -1958,6 +2078,10 @@ static long btrfs_ioctl_trans_start(struct file *file) if (file->private_data) goto out; + ret = -EROFS; + if (btrfs_root_readonly(root)) + goto out; + ret = mnt_want_write(file->f_path.mnt); if (ret) goto out; @@ -1968,7 +2092,7 @@ static long btrfs_ioctl_trans_start(struct file *file) ret = -ENOMEM; trans = btrfs_start_ioctl_transaction(root, 0); - if (!trans) + if (IS_ERR(trans)) goto out_drop; file->private_data = trans; @@ -2024,9 +2148,9 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) path->leave_spinning = 1; trans = btrfs_start_transaction(root, 1); - if (!trans) { + if (IS_ERR(trans)) { btrfs_free_path(path); - return -ENOMEM; + return PTR_ERR(trans); } dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); @@ -2087,7 +2211,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) int num_types = 4; int alloc_size; int ret = 0; - int slot_count = 0; + u64 slot_count = 0; int i, c; if (copy_from_user(&space_args, @@ -2126,7 +2250,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) goto out; } - slot_count = min_t(int, space_args.space_slots, slot_count); + slot_count = min_t(u64, space_args.space_slots, slot_count); alloc_size = sizeof(*dest) * slot_count; @@ -2146,6 +2270,9 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) for (i = 0; i < num_types; i++) { struct btrfs_space_info *tmp; + if (!slot_count) + break; + info = NULL; rcu_read_lock(); list_for_each_entry_rcu(tmp, &root->fs_info->space_info, @@ -2167,7 +2294,10 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) memcpy(dest, &space, sizeof(space)); dest++; space_args.total_spaces++; + slot_count--; } + if (!slot_count) + break; } up_read(&info->groups_sem); } @@ -2220,6 +2350,8 @@ static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp u64 transid; trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); transid = trans->transid; btrfs_commit_transaction_async(trans, root, 0); @@ -2257,13 +2389,17 @@ long btrfs_ioctl(struct file *file, unsigned int case FS_IOC_GETVERSION: return btrfs_ioctl_getversion(file, argp); case BTRFS_IOC_SNAP_CREATE: - return btrfs_ioctl_snap_create(file, argp, 0, 0); + return btrfs_ioctl_snap_create(file, argp, 0); case BTRFS_IOC_SNAP_CREATE_V2: - return btrfs_ioctl_snap_create(file, argp, 0, 1); + return btrfs_ioctl_snap_create_v2(file, argp, 0); case BTRFS_IOC_SUBVOL_CREATE: - return btrfs_ioctl_snap_create(file, argp, 1, 0); + return btrfs_ioctl_snap_create(file, argp, 1); case BTRFS_IOC_SNAP_DESTROY: return btrfs_ioctl_snap_destroy(file, argp); + case BTRFS_IOC_SUBVOL_GETFLAGS: + return btrfs_ioctl_subvol_getflags(file, argp); + case BTRFS_IOC_SUBVOL_SETFLAGS: + return btrfs_ioctl_subvol_setflags(file, argp); case BTRFS_IOC_DEFAULT_SUBVOL: return btrfs_ioctl_default_subvol(file, argp); case BTRFS_IOC_DEFRAG: diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index c344d12c646b..8fb382167b13 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -31,6 +31,7 @@ struct btrfs_ioctl_vol_args { }; #define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) +#define BTRFS_SUBVOL_RDONLY (1ULL << 1) #define BTRFS_SUBVOL_NAME_MAX 4039 struct btrfs_ioctl_vol_args_v2 { @@ -133,8 +134,15 @@ struct btrfs_ioctl_defrag_range_args { */ __u32 extent_thresh; + /* + * which compression method to use if turning on compression + * for this defrag operation. If unspecified, zlib will + * be used + */ + __u32 compress_type; + /* spare for later */ - __u32 unused[5]; + __u32 unused[4]; }; struct btrfs_ioctl_space_info { @@ -193,4 +201,6 @@ struct btrfs_ioctl_space_args { #define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) #define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ struct btrfs_ioctl_vol_args_v2) +#define BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, __u64) +#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64) #endif diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c new file mode 100644 index 000000000000..a178f5ebea78 --- /dev/null +++ b/fs/btrfs/lzo.c @@ -0,0 +1,427 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/init.h> +#include <linux/err.h> +#include <linux/sched.h> +#include <linux/pagemap.h> +#include <linux/bio.h> +#include <linux/lzo.h> +#include "compression.h" + +#define LZO_LEN 4 + +struct workspace { + void *mem; + void *buf; /* where compressed data goes */ + void *cbuf; /* where decompressed data goes */ + struct list_head list; +}; + +static void lzo_free_workspace(struct list_head *ws) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + + vfree(workspace->buf); + vfree(workspace->cbuf); + vfree(workspace->mem); + kfree(workspace); +} + +static struct list_head *lzo_alloc_workspace(void) +{ + struct workspace *workspace; + + workspace = kzalloc(sizeof(*workspace), GFP_NOFS); + if (!workspace) + return ERR_PTR(-ENOMEM); + + workspace->mem = vmalloc(LZO1X_MEM_COMPRESS); + workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE)); + workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE)); + if (!workspace->mem || !workspace->buf || !workspace->cbuf) + goto fail; + + INIT_LIST_HEAD(&workspace->list); + + return &workspace->list; +fail: + lzo_free_workspace(&workspace->list); + return ERR_PTR(-ENOMEM); +} + +static inline void write_compress_length(char *buf, size_t len) +{ + __le32 dlen; + + dlen = cpu_to_le32(len); + memcpy(buf, &dlen, LZO_LEN); +} + +static inline size_t read_compress_length(char *buf) +{ + __le32 dlen; + + memcpy(&dlen, buf, LZO_LEN); + return le32_to_cpu(dlen); +} + +static int lzo_compress_pages(struct list_head *ws, + struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0; + char *data_in; + char *cpage_out; + int nr_pages = 0; + struct page *in_page = NULL; + struct page *out_page = NULL; + unsigned long bytes_left; + + size_t in_len; + size_t out_len; + char *buf; + unsigned long tot_in = 0; + unsigned long tot_out = 0; + unsigned long pg_bytes_left; + unsigned long out_offset; + unsigned long bytes; + + *out_pages = 0; + *total_out = 0; + *total_in = 0; + + in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + + /* + * store the size of all chunks of compressed data in + * the first 4 bytes + */ + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + cpage_out = kmap(out_page); + out_offset = LZO_LEN; + tot_out = LZO_LEN; + pages[0] = out_page; + nr_pages = 1; + pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN; + + /* compress at most one page of data each time */ + in_len = min(len, PAGE_CACHE_SIZE); + while (tot_in < len) { + ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, + &out_len, workspace->mem); + if (ret != LZO_E_OK) { + printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", + ret); + ret = -1; + goto out; + } + + /* store the size of this chunk of compressed data */ + write_compress_length(cpage_out + out_offset, out_len); + tot_out += LZO_LEN; + out_offset += LZO_LEN; + pg_bytes_left -= LZO_LEN; + + tot_in += in_len; + tot_out += out_len; + + /* copy bytes from the working buffer into the pages */ + buf = workspace->cbuf; + while (out_len) { + bytes = min_t(unsigned long, pg_bytes_left, out_len); + + memcpy(cpage_out + out_offset, buf, bytes); + + out_len -= bytes; + pg_bytes_left -= bytes; + buf += bytes; + out_offset += bytes; + + /* + * we need another page for writing out. + * + * Note if there's less than 4 bytes left, we just + * skip to a new page. + */ + if ((out_len == 0 && pg_bytes_left < LZO_LEN) || + pg_bytes_left == 0) { + if (pg_bytes_left) { + memset(cpage_out + out_offset, 0, + pg_bytes_left); + tot_out += pg_bytes_left; + } + + /* we're done, don't allocate new page */ + if (out_len == 0 && tot_in >= len) + break; + + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -1; + goto out; + } + + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + cpage_out = kmap(out_page); + pages[nr_pages++] = out_page; + + pg_bytes_left = PAGE_CACHE_SIZE; + out_offset = 0; + } + } + + /* we're making it bigger, give up */ + if (tot_in > 8192 && tot_in < tot_out) + goto out; + + /* we're all done */ + if (tot_in >= len) + break; + + if (tot_out > max_out) + break; + + bytes_left = len - tot_in; + kunmap(in_page); + page_cache_release(in_page); + + start += PAGE_CACHE_SIZE; + in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + in_len = min(bytes_left, PAGE_CACHE_SIZE); + } + + if (tot_out > tot_in) + goto out; + + /* store the size of all chunks of compressed data */ + cpage_out = kmap(pages[0]); + write_compress_length(cpage_out, tot_out); + + kunmap(pages[0]); + + ret = 0; + *total_out = tot_out; + *total_in = tot_in; +out: + *out_pages = nr_pages; + if (out_page) + kunmap(out_page); + + if (in_page) { + kunmap(in_page); + page_cache_release(in_page); + } + + return ret; +} + +static int lzo_decompress_biovec(struct list_head *ws, + struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0, ret2; + char *data_in; + unsigned long page_in_index = 0; + unsigned long page_out_index = 0; + unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE; + unsigned long buf_start; + unsigned long buf_offset = 0; + unsigned long bytes; + unsigned long working_bytes; + unsigned long pg_offset; + + size_t in_len; + size_t out_len; + unsigned long in_offset; + unsigned long in_page_bytes_left; + unsigned long tot_in; + unsigned long tot_out; + unsigned long tot_len; + char *buf; + bool may_late_unmap, need_unmap; + + data_in = kmap(pages_in[0]); + tot_len = read_compress_length(data_in); + + tot_in = LZO_LEN; + in_offset = LZO_LEN; + tot_len = min_t(size_t, srclen, tot_len); + in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN; + + tot_out = 0; + pg_offset = 0; + + while (tot_in < tot_len) { + in_len = read_compress_length(data_in + in_offset); + in_page_bytes_left -= LZO_LEN; + in_offset += LZO_LEN; + tot_in += LZO_LEN; + + tot_in += in_len; + working_bytes = in_len; + may_late_unmap = need_unmap = false; + + /* fast path: avoid using the working buffer */ + if (in_page_bytes_left >= in_len) { + buf = data_in + in_offset; + bytes = in_len; + may_late_unmap = true; + goto cont; + } + + /* copy bytes from the pages into the working buffer */ + buf = workspace->cbuf; + buf_offset = 0; + while (working_bytes) { + bytes = min(working_bytes, in_page_bytes_left); + + memcpy(buf + buf_offset, data_in + in_offset, bytes); + buf_offset += bytes; +cont: + working_bytes -= bytes; + in_page_bytes_left -= bytes; + in_offset += bytes; + + /* check if we need to pick another page */ + if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN) + || in_page_bytes_left == 0) { + tot_in += in_page_bytes_left; + + if (working_bytes == 0 && tot_in >= tot_len) + break; + + if (page_in_index + 1 >= total_pages_in) { + ret = -1; + goto done; + } + + if (may_late_unmap) + need_unmap = true; + else + kunmap(pages_in[page_in_index]); + + data_in = kmap(pages_in[++page_in_index]); + + in_page_bytes_left = PAGE_CACHE_SIZE; + in_offset = 0; + } + } + + out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE); + ret = lzo1x_decompress_safe(buf, in_len, workspace->buf, + &out_len); + if (need_unmap) + kunmap(pages_in[page_in_index - 1]); + if (ret != LZO_E_OK) { + printk(KERN_WARNING "btrfs decompress failed\n"); + ret = -1; + break; + } + + buf_start = tot_out; + tot_out += out_len; + + ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start, + tot_out, disk_start, + bvec, vcnt, + &page_out_index, &pg_offset); + if (ret2 == 0) + break; + } +done: + kunmap(pages_in[page_in_index]); + return ret; +} + +static int lzo_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + size_t in_len; + size_t out_len; + size_t tot_len; + int ret = 0; + char *kaddr; + unsigned long bytes; + + BUG_ON(srclen < LZO_LEN); + + tot_len = read_compress_length(data_in); + data_in += LZO_LEN; + + in_len = read_compress_length(data_in); + data_in += LZO_LEN; + + out_len = PAGE_CACHE_SIZE; + ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); + if (ret != LZO_E_OK) { + printk(KERN_WARNING "btrfs decompress failed!\n"); + ret = -1; + goto out; + } + + if (out_len < start_byte) { + ret = -1; + goto out; + } + + bytes = min_t(unsigned long, destlen, out_len - start_byte); + + kaddr = kmap_atomic(dest_page, KM_USER0); + memcpy(kaddr, workspace->buf + start_byte, bytes); + kunmap_atomic(kaddr, KM_USER0); +out: + return ret; +} + +struct btrfs_compress_op btrfs_lzo_compress = { + .alloc_workspace = lzo_alloc_workspace, + .free_workspace = lzo_free_workspace, + .compress_pages = lzo_compress_pages, + .decompress_biovec = lzo_decompress_biovec, + .decompress = lzo_decompress, +}; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index ae7737e352c9..083a55477375 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -141,7 +141,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, u64 file_offset) { struct rb_root *root = &tree->tree; - struct rb_node *prev; + struct rb_node *prev = NULL; struct rb_node *ret; struct btrfs_ordered_extent *entry; @@ -172,7 +172,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, */ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, - int type, int dio) + int type, int dio, int compress_type) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; @@ -189,6 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->disk_len = disk_len; entry->bytes_left = len; entry->inode = inode; + entry->compress_type = compress_type; if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) set_bit(type, &entry->flags); @@ -220,14 +221,25 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type) { return __btrfs_add_ordered_extent(inode, file_offset, start, len, - disk_len, type, 0); + disk_len, type, 0, + BTRFS_COMPRESS_NONE); } int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type) { return __btrfs_add_ordered_extent(inode, file_offset, start, len, - disk_len, type, 1); + disk_len, type, 1, + BTRFS_COMPRESS_NONE); +} + +int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, + int type, int compress_type) +{ + return __btrfs_add_ordered_extent(inode, file_offset, start, len, + disk_len, type, 0, + compress_type); } /* diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 61dca83119dd..ff1f69aa1883 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -68,7 +68,7 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ -#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */ +#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */ #define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ @@ -93,6 +93,9 @@ struct btrfs_ordered_extent { /* flags (described above) */ unsigned long flags; + /* compression algorithm */ + int compress_type; + /* reference count */ atomic_t refs; @@ -148,6 +151,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type); int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type); +int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, + int type, int compress_type); int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 0d126be22b63..fb2605d998e9 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -260,6 +260,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) #else BUG(); #endif + break; case BTRFS_BLOCK_GROUP_ITEM_KEY: bi = btrfs_item_ptr(l, i, struct btrfs_block_group_item); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 045c9c2b2d7e..31ade5802ae8 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1157,6 +1157,7 @@ static int clone_backref_node(struct btrfs_trans_handle *trans, new_node->bytenr = dest->node->start; new_node->level = node->level; new_node->lowest = node->lowest; + new_node->checked = 1; new_node->root = dest; if (!node->lowest) { @@ -2028,6 +2029,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, while (1) { trans = btrfs_start_transaction(root, 0); + BUG_ON(IS_ERR(trans)); trans->block_rsv = rc->block_rsv; ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, @@ -2147,6 +2149,12 @@ again: } trans = btrfs_join_transaction(rc->extent_root, 1); + if (IS_ERR(trans)) { + if (!err) + btrfs_block_rsv_release(rc->extent_root, + rc->block_rsv, num_bytes); + return PTR_ERR(trans); + } if (!err) { if (num_bytes != rc->merging_rsv_size) { @@ -3222,6 +3230,7 @@ truncate: trans = btrfs_join_transaction(root, 0); if (IS_ERR(trans)) { btrfs_free_path(path); + ret = PTR_ERR(trans); goto out; } @@ -3628,6 +3637,7 @@ int prepare_to_relocate(struct reloc_control *rc) set_reloc_control(rc); trans = btrfs_join_transaction(rc->extent_root, 1); + BUG_ON(IS_ERR(trans)); btrfs_commit_transaction(trans, rc->extent_root); return 0; } @@ -3644,6 +3654,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) u32 item_size; int ret; int err = 0; + int progress = 0; path = btrfs_alloc_path(); if (!path) @@ -3656,8 +3667,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) } while (1) { + progress++; trans = btrfs_start_transaction(rc->extent_root, 0); - + BUG_ON(IS_ERR(trans)); +restart: if (update_backref_cache(trans, &rc->backref_cache)) { btrfs_end_transaction(trans, rc->extent_root); continue; @@ -3770,6 +3783,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) } } } + if (trans && progress && err == -ENOSPC) { + ret = btrfs_force_chunk_alloc(trans, rc->extent_root, + rc->block_group->flags); + if (ret == 0) { + err = 0; + progress = 0; + goto restart; + } + } btrfs_release_path(rc->extent_root, path); clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, @@ -3804,7 +3826,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) /* get rid of pinned extents */ trans = btrfs_join_transaction(rc->extent_root, 1); - btrfs_commit_transaction(trans, rc->extent_root); + if (IS_ERR(trans)) + err = PTR_ERR(trans); + else + btrfs_commit_transaction(trans, rc->extent_root); out_free: btrfs_free_block_rsv(rc->extent_root, rc->block_rsv); btrfs_free_path(path); @@ -4022,6 +4047,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) int ret; trans = btrfs_start_transaction(root->fs_info->tree_root, 0); + BUG_ON(IS_ERR(trans)); memset(&root->root_item.drop_progress, 0, sizeof(root->root_item.drop_progress)); @@ -4125,6 +4151,11 @@ int btrfs_recover_relocation(struct btrfs_root *root) set_reloc_control(rc); trans = btrfs_join_transaction(rc->extent_root, 1); + if (IS_ERR(trans)) { + unset_reloc_control(rc); + err = PTR_ERR(trans); + goto out_free; + } rc->merge_reloc_tree = 1; @@ -4154,9 +4185,13 @@ int btrfs_recover_relocation(struct btrfs_root *root) unset_reloc_control(rc); trans = btrfs_join_transaction(rc->extent_root, 1); - btrfs_commit_transaction(trans, rc->extent_root); -out: + if (IS_ERR(trans)) + err = PTR_ERR(trans); + else + btrfs_commit_transaction(trans, rc->extent_root); +out_free: kfree(rc); +out: while (!list_empty(&reloc_roots)) { reloc_root = list_entry(reloc_roots.next, struct btrfs_root, root_list); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 883c6fa1367e..d39a9895d932 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -54,6 +54,90 @@ static const struct super_operations btrfs_super_ops; +static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, + char nbuf[16]) +{ + char *errstr = NULL; + + switch (errno) { + case -EIO: + errstr = "IO failure"; + break; + case -ENOMEM: + errstr = "Out of memory"; + break; + case -EROFS: + errstr = "Readonly filesystem"; + break; + default: + if (nbuf) { + if (snprintf(nbuf, 16, "error %d", -errno) >= 0) + errstr = nbuf; + } + break; + } + + return errstr; +} + +static void __save_error_info(struct btrfs_fs_info *fs_info) +{ + /* + * today we only save the error info into ram. Long term we'll + * also send it down to the disk + */ + fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR; +} + +/* NOTE: + * We move write_super stuff at umount in order to avoid deadlock + * for umount hold all lock. + */ +static void save_error_info(struct btrfs_fs_info *fs_info) +{ + __save_error_info(fs_info); +} + +/* btrfs handle error by forcing the filesystem readonly */ +static void btrfs_handle_error(struct btrfs_fs_info *fs_info) +{ + struct super_block *sb = fs_info->sb; + + if (sb->s_flags & MS_RDONLY) + return; + + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + sb->s_flags |= MS_RDONLY; + printk(KERN_INFO "btrfs is forced readonly\n"); + } +} + +/* + * __btrfs_std_error decodes expected errors from the caller and + * invokes the approciate error response. + */ +void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno) +{ + struct super_block *sb = fs_info->sb; + char nbuf[16]; + const char *errstr; + + /* + * Special case: if the error is EROFS, and we're already + * under MS_RDONLY, then it is safe here. + */ + if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) + return; + + errstr = btrfs_decode_error(fs_info, errno, nbuf); + printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n", + sb->s_id, function, line, errstr); + save_error_info(fs_info); + + btrfs_handle_error(fs_info); +} + static void btrfs_put_super(struct super_block *sb) { struct btrfs_root *root = btrfs_sb(sb); @@ -69,9 +153,10 @@ enum { Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, - Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit, - Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err, - Opt_user_subvol_rm_allowed, + Opt_compress_type, Opt_compress_force, Opt_compress_force_type, + Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, + Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, + Opt_enospc_debug, Opt_err, }; static match_table_t tokens = { @@ -86,7 +171,9 @@ static match_table_t tokens = { {Opt_alloc_start, "alloc_start=%s"}, {Opt_thread_pool, "thread_pool=%d"}, {Opt_compress, "compress"}, + {Opt_compress_type, "compress=%s"}, {Opt_compress_force, "compress-force"}, + {Opt_compress_force_type, "compress-force=%s"}, {Opt_ssd, "ssd"}, {Opt_ssd_spread, "ssd_spread"}, {Opt_nossd, "nossd"}, @@ -98,6 +185,7 @@ static match_table_t tokens = { {Opt_space_cache, "space_cache"}, {Opt_clear_cache, "clear_cache"}, {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, + {Opt_enospc_debug, "enospc_debug"}, {Opt_err, NULL}, }; @@ -112,6 +200,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) char *p, *num, *orig; int intarg; int ret = 0; + char *compress_type; + bool compress_force = false; if (!options) return 0; @@ -154,14 +244,32 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, NODATACOW); btrfs_set_opt(info->mount_opt, NODATASUM); break; - case Opt_compress: - printk(KERN_INFO "btrfs: use compression\n"); - btrfs_set_opt(info->mount_opt, COMPRESS); - break; case Opt_compress_force: - printk(KERN_INFO "btrfs: forcing compression\n"); - btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); + case Opt_compress_force_type: + compress_force = true; + case Opt_compress: + case Opt_compress_type: + if (token == Opt_compress || + token == Opt_compress_force || + strcmp(args[0].from, "zlib") == 0) { + compress_type = "zlib"; + info->compress_type = BTRFS_COMPRESS_ZLIB; + } else if (strcmp(args[0].from, "lzo") == 0) { + compress_type = "lzo"; + info->compress_type = BTRFS_COMPRESS_LZO; + } else { + ret = -EINVAL; + goto out; + } + btrfs_set_opt(info->mount_opt, COMPRESS); + if (compress_force) { + btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); + pr_info("btrfs: force %s compression\n", + compress_type); + } else + pr_info("btrfs: use %s compression\n", + compress_type); break; case Opt_ssd: printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); @@ -252,6 +360,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_user_subvol_rm_allowed: btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); break; + case Opt_enospc_debug: + btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); + break; case Opt_err: printk(KERN_INFO "btrfs: unrecognized mount option " "'%s'\n", p); @@ -277,7 +388,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, struct btrfs_fs_devices **fs_devices) { substring_t args[MAX_OPT_ARGS]; - char *opts, *p; + char *opts, *orig, *p; int error = 0; int intarg; @@ -291,6 +402,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, opts = kstrdup(options, GFP_KERNEL); if (!opts) return -ENOMEM; + orig = opts; while ((p = strsep(&opts, ",")) != NULL) { int token; @@ -326,7 +438,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, } out_free_opts: - kfree(opts); + kfree(orig); out: /* * If no subvolume name is specified we use the default one. Allocate @@ -460,6 +572,7 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_magic = BTRFS_SUPER_MAGIC; sb->s_op = &btrfs_super_ops; + sb->s_d_op = &btrfs_dentry_operations; sb->s_export_op = &btrfs_export_ops; sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; @@ -516,6 +629,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait) btrfs_wait_ordered_extents(root, 0, 0); trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); ret = btrfs_commit_transaction(trans, root); return ret; } @@ -654,6 +769,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } btrfs_close_devices(fs_devices); + kfree(fs_info); + kfree(tree_root); } else { char b[BDEVNAME_SIZE]; @@ -752,6 +869,127 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) return 0; } +/* + * The helper to calc the free space on the devices that can be used to store + * file data. + */ +static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_device_info *devices_info; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + u64 skip_space; + u64 type; + u64 avail_space; + u64 used_space; + u64 min_stripe_size; + int min_stripes = 1; + int i = 0, nr_devices; + int ret; + + nr_devices = fs_info->fs_devices->rw_devices; + BUG_ON(!nr_devices); + + devices_info = kmalloc(sizeof(*devices_info) * nr_devices, + GFP_NOFS); + if (!devices_info) + return -ENOMEM; + + /* calc min stripe number for data space alloction */ + type = btrfs_get_alloc_profile(root, 1); + if (type & BTRFS_BLOCK_GROUP_RAID0) + min_stripes = 2; + else if (type & BTRFS_BLOCK_GROUP_RAID1) + min_stripes = 2; + else if (type & BTRFS_BLOCK_GROUP_RAID10) + min_stripes = 4; + + if (type & BTRFS_BLOCK_GROUP_DUP) + min_stripe_size = 2 * BTRFS_STRIPE_LEN; + else + min_stripe_size = BTRFS_STRIPE_LEN; + + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { + if (!device->in_fs_metadata) + continue; + + avail_space = device->total_bytes - device->bytes_used; + + /* align with stripe_len */ + do_div(avail_space, BTRFS_STRIPE_LEN); + avail_space *= BTRFS_STRIPE_LEN; + + /* + * In order to avoid overwritting the superblock on the drive, + * btrfs starts at an offset of at least 1MB when doing chunk + * allocation. + */ + skip_space = 1024 * 1024; + + /* user can set the offset in fs_info->alloc_start. */ + if (fs_info->alloc_start + BTRFS_STRIPE_LEN <= + device->total_bytes) + skip_space = max(fs_info->alloc_start, skip_space); + + /* + * btrfs can not use the free space in [0, skip_space - 1], + * we must subtract it from the total. In order to implement + * it, we account the used space in this range first. + */ + ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1, + &used_space); + if (ret) { + kfree(devices_info); + return ret; + } + + /* calc the free space in [0, skip_space - 1] */ + skip_space -= used_space; + + /* + * we can use the free space in [0, skip_space - 1], subtract + * it from the total. + */ + if (avail_space && avail_space >= skip_space) + avail_space -= skip_space; + else + avail_space = 0; + + if (avail_space < min_stripe_size) + continue; + + devices_info[i].dev = device; + devices_info[i].max_avail = avail_space; + + i++; + } + + nr_devices = i; + + btrfs_descending_sort_devices(devices_info, nr_devices); + + i = nr_devices - 1; + avail_space = 0; + while (nr_devices >= min_stripes) { + if (devices_info[i].max_avail >= min_stripe_size) { + int j; + u64 alloc_size; + + avail_space += devices_info[i].max_avail * min_stripes; + alloc_size = devices_info[i].max_avail; + for (j = i + 1 - min_stripes; j <= i; j++) + devices_info[j].max_avail -= alloc_size; + } + i--; + nr_devices--; + } + + kfree(devices_info); + *free_bytes = avail_space; + return 0; +} + static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct btrfs_root *root = btrfs_sb(dentry->d_sb); @@ -759,17 +997,21 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct list_head *head = &root->fs_info->space_info; struct btrfs_space_info *found; u64 total_used = 0; - u64 total_used_data = 0; + u64 total_free_data = 0; int bits = dentry->d_sb->s_blocksize_bits; __be32 *fsid = (__be32 *)root->fs_info->fsid; + int ret; + /* holding chunk_muext to avoid allocating new chunks */ + mutex_lock(&root->fs_info->chunk_mutex); rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { - if (found->flags & (BTRFS_BLOCK_GROUP_METADATA | - BTRFS_BLOCK_GROUP_SYSTEM)) - total_used_data += found->disk_total; - else - total_used_data += found->disk_used; + if (found->flags & BTRFS_BLOCK_GROUP_DATA) { + total_free_data += found->disk_total - found->disk_used; + total_free_data -= + btrfs_account_ro_block_groups_free_space(found); + } + total_used += found->disk_used; } rcu_read_unlock(); @@ -777,9 +1019,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = BTRFS_NAME_LEN; buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; buf->f_bfree = buf->f_blocks - (total_used >> bits); - buf->f_bavail = buf->f_blocks - (total_used_data >> bits); buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_type = BTRFS_SUPER_MAGIC; + buf->f_bavail = total_free_data; + ret = btrfs_calc_avail_data_space(root, &total_free_data); + if (ret) { + mutex_unlock(&root->fs_info->chunk_mutex); + return ret; + } + buf->f_bavail += total_free_data; + buf->f_bavail = buf->f_bavail >> bits; + mutex_unlock(&root->fs_info->chunk_mutex); /* We treat it as constant endianness (it doesn't matter _which_) because we want the fsid to come out the same whether mounted @@ -896,10 +1146,14 @@ static int __init init_btrfs_fs(void) if (err) return err; - err = btrfs_init_cachep(); + err = btrfs_init_compress(); if (err) goto free_sysfs; + err = btrfs_init_cachep(); + if (err) + goto free_compress; + err = extent_io_init(); if (err) goto free_cachep; @@ -927,6 +1181,8 @@ free_extent_io: extent_io_exit(); free_cachep: btrfs_destroy_cachep(); +free_compress: + btrfs_exit_compress(); free_sysfs: btrfs_exit_sysfs(); return err; @@ -941,7 +1197,7 @@ static void __exit exit_btrfs_fs(void) unregister_filesystem(&btrfs_fs_type); btrfs_exit_sysfs(); btrfs_cleanup_fs_uuids(); - btrfs_zlib_exit(); + btrfs_exit_compress(); } module_init(init_btrfs_fs) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f50e931fc217..3d73c8d93bbb 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -181,6 +181,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; int ret; + + if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + return ERR_PTR(-EROFS); again: h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); if (!h) @@ -910,6 +913,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, u64 to_reserve = 0; u64 index = 0; u64 objectid; + u64 root_flags; new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); if (!new_root_item) { @@ -967,6 +971,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); + root_flags = btrfs_root_flags(new_root_item); + if (pending->readonly) + root_flags |= BTRFS_ROOT_SUBVOL_RDONLY; + else + root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY; + btrfs_set_root_flags(new_root_item, root_flags); + old = btrfs_lock_root_node(root); btrfs_cow_block(trans, root, old, NULL, 0, &old); btrfs_set_lock_blocking(old); @@ -1150,6 +1161,11 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, INIT_DELAYED_WORK(&ac->work, do_async_commit); ac->root = root; ac->newtrans = btrfs_join_transaction(root, 0); + if (IS_ERR(ac->newtrans)) { + int err = PTR_ERR(ac->newtrans); + kfree(ac); + return err; + } /* take transaction reference */ mutex_lock(&root->fs_info->trans_mutex); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index f104b57ad4ef..229a594cacd5 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -62,6 +62,7 @@ struct btrfs_pending_snapshot { struct btrfs_block_rsv block_rsv; /* extra metadata reseration for relocation */ int error; + bool readonly; struct list_head list; }; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 054744ac5719..a4bbb854dfd2 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -338,6 +338,12 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, } dst_copy = kmalloc(item_size, GFP_NOFS); src_copy = kmalloc(item_size, GFP_NOFS); + if (!dst_copy || !src_copy) { + btrfs_release_path(root, path); + kfree(dst_copy); + kfree(src_copy); + return -ENOMEM; + } read_extent_buffer(eb, src_copy, src_ptr, item_size); @@ -665,6 +671,9 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, btrfs_dir_item_key_to_cpu(leaf, di, &location); name_len = btrfs_dir_name_len(leaf, di); name = kmalloc(name_len, GFP_NOFS); + if (!name) + return -ENOMEM; + read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); btrfs_release_path(root, path); @@ -744,6 +753,9 @@ static noinline int backref_in_log(struct btrfs_root *log, int match = 0; path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(NULL, log, key, path, 0, 0); if (ret != 0) goto out; @@ -967,6 +979,8 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, key.offset = (u64)-1; path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -1178,6 +1192,9 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, name_len = btrfs_dir_name_len(eb, di); name = kmalloc(name_len, GFP_NOFS); + if (!name) + return -ENOMEM; + log_type = btrfs_dir_type(eb, di); read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len); @@ -1692,6 +1709,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, root_owner = btrfs_header_owner(parent); next = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!next) + return -ENOMEM; if (*level == 1) { wc->process_func(root, next, wc, ptr_gen); @@ -2032,6 +2051,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, wait_log_commit(trans, log_root_tree, log_root_tree->log_transid); mutex_unlock(&log_root_tree->log_mutex); + ret = 0; goto out; } atomic_set(&log_root_tree->log_commit[index2], 1); @@ -2096,7 +2116,7 @@ out: smp_mb(); if (waitqueue_active(&root->log_commit_wait[index1])) wake_up(&root->log_commit_wait[index1]); - return 0; + return ret; } static void free_log_tree(struct btrfs_trans_handle *trans, @@ -2194,6 +2214,9 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, log = root->log_root; path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, name, name_len, -1); if (IS_ERR(di)) { @@ -2594,6 +2617,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, ins_data = kmalloc(nr * sizeof(struct btrfs_key) + nr * sizeof(u32), GFP_NOFS); + if (!ins_data) + return -ENOMEM; + ins_sizes = (u32 *)ins_data; ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); @@ -2725,7 +2751,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, log = root->log_root; path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; dst_path = btrfs_alloc_path(); + if (!dst_path) { + btrfs_free_path(path); + return -ENOMEM; + } min_key.objectid = inode->i_ino; min_key.type = BTRFS_INODE_ITEM_KEY; @@ -3080,6 +3112,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) BUG_ON(!path); trans = btrfs_start_transaction(fs_info->tree_root, 0); + BUG_ON(IS_ERR(trans)); wc.trans = trans; wc.pin = 1; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6b9884507837..dd13eb81ee40 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -22,6 +22,7 @@ #include <linux/blkdev.h> #include <linux/random.h> #include <linux/iocontext.h> +#include <linux/capability.h> #include <asm/div64.h> #include "compat.h" #include "ctree.h" @@ -493,7 +494,7 @@ again: continue; if (device->bdev) { - close_bdev_exclusive(device->bdev, device->mode); + blkdev_put(device->bdev, device->mode); device->bdev = NULL; fs_devices->open_devices--; } @@ -527,7 +528,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->bdev) { - close_bdev_exclusive(device->bdev, device->mode); + blkdev_put(device->bdev, device->mode); fs_devices->open_devices--; } if (device->writeable) { @@ -584,13 +585,15 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int seeding = 1; int ret = 0; + flags |= FMODE_EXCL; + list_for_each_entry(device, head, dev_list) { if (device->bdev) continue; if (!device->name) continue; - bdev = open_bdev_exclusive(device->name, flags, holder); + bdev = blkdev_get_by_path(device->name, flags, holder); if (IS_ERR(bdev)) { printk(KERN_INFO "open %s failed\n", device->name); goto error; @@ -598,8 +601,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, set_blocksize(bdev, 4096); bh = btrfs_read_dev_super(bdev); - if (!bh) + if (!bh) { + ret = -EINVAL; goto error_close; + } disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); @@ -642,7 +647,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, error_brelse: brelse(bh); error_close: - close_bdev_exclusive(bdev, FMODE_READ); + blkdev_put(bdev, flags); error: continue; } @@ -688,7 +693,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, mutex_lock(&uuid_mutex); - bdev = open_bdev_exclusive(path, flags, holder); + flags |= FMODE_EXCL; + bdev = blkdev_get_by_path(path, flags, holder); if (IS_ERR(bdev)) { ret = PTR_ERR(bdev); @@ -700,7 +706,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, goto error_close; bh = btrfs_read_dev_super(bdev); if (!bh) { - ret = -EIO; + ret = -EINVAL; goto error_close; } disk_super = (struct btrfs_super_block *)bh->b_data; @@ -720,65 +726,173 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, brelse(bh); error_close: - close_bdev_exclusive(bdev, flags); + blkdev_put(bdev, flags); error: mutex_unlock(&uuid_mutex); return ret; } +/* helper to account the used device space in the range */ +int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, + u64 end, u64 *length) +{ + struct btrfs_key key; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *dev_extent; + struct btrfs_path *path; + u64 extent_end; + int ret; + int slot; + struct extent_buffer *l; + + *length = 0; + + if (start >= device->total_bytes) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 2; + + key.objectid = device->devid; + key.offset = start; + key.type = BTRFS_DEV_EXTENT_KEY; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = btrfs_previous_item(root, path, key.objectid, key.type); + if (ret < 0) + goto out; + } + + while (1) { + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + + break; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid < device->devid) + goto next; + + if (key.objectid > device->devid) + break; + + if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + goto next; + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + extent_end = key.offset + btrfs_dev_extent_length(l, + dev_extent); + if (key.offset <= start && extent_end > end) { + *length = end - start + 1; + break; + } else if (key.offset <= start && extent_end > start) + *length += extent_end - start; + else if (key.offset > start && extent_end <= end) + *length += extent_end - key.offset; + else if (key.offset > start && key.offset <= end) { + *length += end - key.offset + 1; + break; + } else if (key.offset > end) + break; + +next: + path->slots[0]++; + } + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + /* + * find_free_dev_extent - find free space in the specified device + * @trans: transaction handler + * @device: the device which we search the free space in + * @num_bytes: the size of the free space that we need + * @start: store the start of the free space. + * @len: the size of the free space. that we find, or the size of the max + * free space if we don't find suitable free space + * * this uses a pretty simple search, the expectation is that it is * called very infrequently and that a given device has a small number * of extents + * + * @start is used to store the start of the free space if we find. But if we + * don't find suitable free space, it will be used to store the start position + * of the max free space. + * + * @len is used to store the size of the free space that we find. + * But if we don't find suitable free space, it is used to store the size of + * the max free space. */ int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 num_bytes, - u64 *start, u64 *max_avail) + u64 *start, u64 *len) { struct btrfs_key key; struct btrfs_root *root = device->dev_root; - struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_dev_extent *dev_extent; struct btrfs_path *path; - u64 hole_size = 0; - u64 last_byte = 0; - u64 search_start = 0; + u64 hole_size; + u64 max_hole_start; + u64 max_hole_size; + u64 extent_end; + u64 search_start; u64 search_end = device->total_bytes; int ret; - int slot = 0; - int start_found; + int slot; struct extent_buffer *l; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->reada = 2; - start_found = 0; - /* FIXME use last free of some kind */ /* we don't want to overwrite the superblock on the drive, * so we make sure to start at an offset of at least 1MB */ - search_start = max((u64)1024 * 1024, search_start); + search_start = 1024 * 1024; - if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) + if (root->fs_info->alloc_start + num_bytes <= search_end) search_start = max(root->fs_info->alloc_start, search_start); + max_hole_start = search_start; + max_hole_size = 0; + + if (search_start >= search_end) { + ret = -ENOSPC; + goto error; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto error; + } + path->reada = 2; + key.objectid = device->devid; key.offset = search_start; key.type = BTRFS_DEV_EXTENT_KEY; + ret = btrfs_search_slot(trans, root, &key, path, 0, 0); if (ret < 0) - goto error; + goto out; if (ret > 0) { ret = btrfs_previous_item(root, path, key.objectid, key.type); if (ret < 0) - goto error; - if (ret > 0) - start_found = 1; + goto out; } - l = path->nodes[0]; - btrfs_item_key_to_cpu(l, &key, path->slots[0]); + while (1) { l = path->nodes[0]; slot = path->slots[0]; @@ -787,24 +901,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, if (ret == 0) continue; if (ret < 0) - goto error; -no_more_items: - if (!start_found) { - if (search_start >= search_end) { - ret = -ENOSPC; - goto error; - } - *start = search_start; - start_found = 1; - goto check_pending; - } - *start = last_byte > search_start ? - last_byte : search_start; - if (search_end <= *start) { - ret = -ENOSPC; - goto error; - } - goto check_pending; + goto out; + + break; } btrfs_item_key_to_cpu(l, &key, slot); @@ -812,48 +911,62 @@ no_more_items: goto next; if (key.objectid > device->devid) - goto no_more_items; + break; - if (key.offset >= search_start && key.offset > last_byte && - start_found) { - if (last_byte < search_start) - last_byte = search_start; - hole_size = key.offset - last_byte; + if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + goto next; - if (hole_size > *max_avail) - *max_avail = hole_size; + if (key.offset > search_start) { + hole_size = key.offset - search_start; - if (key.offset > last_byte && - hole_size >= num_bytes) { - *start = last_byte; - goto check_pending; + if (hole_size > max_hole_size) { + max_hole_start = search_start; + max_hole_size = hole_size; + } + + /* + * If this free space is greater than which we need, + * it must be the max free space that we have found + * until now, so max_hole_start must point to the start + * of this free space and the length of this free space + * is stored in max_hole_size. Thus, we return + * max_hole_start and max_hole_size and go back to the + * caller. + */ + if (hole_size >= num_bytes) { + ret = 0; + goto out; } } - if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) - goto next; - start_found = 1; dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); - last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); + extent_end = key.offset + btrfs_dev_extent_length(l, + dev_extent); + if (extent_end > search_start) + search_start = extent_end; next: path->slots[0]++; cond_resched(); } -check_pending: - /* we have to make sure we didn't find an extent that has already - * been allocated by the map tree or the original allocation - */ - BUG_ON(*start < search_start); - if (*start + num_bytes > search_end) { - ret = -ENOSPC; - goto error; + hole_size = search_end- search_start; + if (hole_size > max_hole_size) { + max_hole_start = search_start; + max_hole_size = hole_size; } - /* check for pending inserts here */ - ret = 0; -error: + /* See above. */ + if (hole_size < num_bytes) + ret = -ENOSPC; + else + ret = 0; + +out: btrfs_free_path(path); +error: + *start = max_hole_start; + if (len) + *len = max_hole_size; return ret; } @@ -1100,6 +1213,10 @@ static int btrfs_rm_dev_item(struct btrfs_root *root, return -ENOMEM; trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; @@ -1183,8 +1300,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) goto out; } } else { - bdev = open_bdev_exclusive(device_path, FMODE_READ, - root->fs_info->bdev_holder); + bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, + root->fs_info->bdev_holder); if (IS_ERR(bdev)) { ret = PTR_ERR(bdev); goto out; @@ -1193,7 +1310,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) set_blocksize(bdev, 4096); bh = btrfs_read_dev_super(bdev); if (!bh) { - ret = -EIO; + ret = -EINVAL; goto error_close; } disk_super = (struct btrfs_super_block *)bh->b_data; @@ -1221,11 +1338,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) ret = btrfs_shrink_device(device, 0); if (ret) - goto error_brelse; + goto error_undo; ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); if (ret) - goto error_brelse; + goto error_undo; device->in_fs_metadata = 0; @@ -1251,7 +1368,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) root->fs_info->fs_devices->latest_bdev = next_device->bdev; if (device->bdev) { - close_bdev_exclusive(device->bdev, device->mode); + blkdev_put(device->bdev, device->mode); device->bdev = NULL; device->fs_devices->open_devices--; } @@ -1294,11 +1411,18 @@ error_brelse: brelse(bh); error_close: if (bdev) - close_bdev_exclusive(bdev, FMODE_READ); + blkdev_put(bdev, FMODE_READ | FMODE_EXCL); out: mutex_unlock(&root->fs_info->volume_mutex); mutex_unlock(&uuid_mutex); return ret; +error_undo: + if (device->writeable) { + list_add(&device->dev_alloc_list, + &root->fs_info->fs_devices->alloc_list); + root->fs_info->fs_devices->rw_devices++; + } + goto error_brelse; } /* @@ -1446,7 +1570,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) return -EINVAL; - bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); + bdev = blkdev_get_by_path(device_path, FMODE_EXCL, + root->fs_info->bdev_holder); if (IS_ERR(bdev)) return PTR_ERR(bdev); @@ -1487,11 +1612,19 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = find_next_devid(root, &device->devid); if (ret) { + kfree(device->name); kfree(device); goto error; } trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + kfree(device->name); + kfree(device); + ret = PTR_ERR(trans); + goto error; + } + lock_chunks(root); device->writeable = 1; @@ -1507,7 +1640,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) device->dev_root = root->fs_info->dev_root; device->bdev = bdev; device->in_fs_metadata = 1; - device->mode = 0; + device->mode = FMODE_EXCL; set_blocksize(device->bdev, 4096); if (seeding_dev) { @@ -1572,7 +1705,7 @@ out: mutex_unlock(&root->fs_info->volume_mutex); return ret; error: - close_bdev_exclusive(bdev, 0); + blkdev_put(bdev, FMODE_EXCL); if (seeding_dev) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); @@ -1759,7 +1892,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, return ret; trans = btrfs_start_transaction(root, 0); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); lock_chunks(root); @@ -1912,6 +2045,9 @@ int btrfs_balance(struct btrfs_root *dev_root) if (dev_root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + mutex_lock(&dev_root->fs_info->volume_mutex); dev_root = dev_root->fs_info->dev_root; @@ -1930,7 +2066,7 @@ int btrfs_balance(struct btrfs_root *dev_root) BUG_ON(ret); trans = btrfs_start_transaction(dev_root, 0); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); ret = btrfs_grow_device(trans, device, old_size); BUG_ON(ret); @@ -2096,6 +2232,11 @@ again: /* Shrinking succeeded, else we would be at "done". */ trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto done; + } + lock_chunks(root); device->disk_total_bytes = new_size; @@ -2150,66 +2291,67 @@ static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size, return calc_size * num_stripes; } -static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, - struct map_lookup **map_ret, - u64 *num_bytes, u64 *stripe_size, - u64 start, u64 type) +/* Used to sort the devices by max_avail(descending sort) */ +int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2) { - struct btrfs_fs_info *info = extent_root->fs_info; - struct btrfs_device *device = NULL; - struct btrfs_fs_devices *fs_devices = info->fs_devices; - struct list_head *cur; - struct map_lookup *map = NULL; - struct extent_map_tree *em_tree; - struct extent_map *em; - struct list_head private_devs; - int min_stripe_size = 1 * 1024 * 1024; - u64 calc_size = 1024 * 1024 * 1024; - u64 max_chunk_size = calc_size; - u64 min_free; - u64 avail; - u64 max_avail = 0; - u64 dev_offset; - int num_stripes = 1; - int min_stripes = 1; - int sub_stripes = 0; - int looped = 0; - int ret; - int index; - int stripe_len = 64 * 1024; + if (((struct btrfs_device_info *)dev_info1)->max_avail > + ((struct btrfs_device_info *)dev_info2)->max_avail) + return -1; + else if (((struct btrfs_device_info *)dev_info1)->max_avail < + ((struct btrfs_device_info *)dev_info2)->max_avail) + return 1; + else + return 0; +} - if ((type & BTRFS_BLOCK_GROUP_RAID1) && - (type & BTRFS_BLOCK_GROUP_DUP)) { - WARN_ON(1); - type &= ~BTRFS_BLOCK_GROUP_DUP; - } - if (list_empty(&fs_devices->alloc_list)) - return -ENOSPC; +static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type, + int *num_stripes, int *min_stripes, + int *sub_stripes) +{ + *num_stripes = 1; + *min_stripes = 1; + *sub_stripes = 0; if (type & (BTRFS_BLOCK_GROUP_RAID0)) { - num_stripes = fs_devices->rw_devices; - min_stripes = 2; + *num_stripes = fs_devices->rw_devices; + *min_stripes = 2; } if (type & (BTRFS_BLOCK_GROUP_DUP)) { - num_stripes = 2; - min_stripes = 2; + *num_stripes = 2; + *min_stripes = 2; } if (type & (BTRFS_BLOCK_GROUP_RAID1)) { if (fs_devices->rw_devices < 2) return -ENOSPC; - num_stripes = 2; - min_stripes = 2; + *num_stripes = 2; + *min_stripes = 2; } if (type & (BTRFS_BLOCK_GROUP_RAID10)) { - num_stripes = fs_devices->rw_devices; - if (num_stripes < 4) + *num_stripes = fs_devices->rw_devices; + if (*num_stripes < 4) return -ENOSPC; - num_stripes &= ~(u32)1; - sub_stripes = 2; - min_stripes = 4; + *num_stripes &= ~(u32)1; + *sub_stripes = 2; + *min_stripes = 4; } + return 0; +} + +static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices, + u64 proposed_size, u64 type, + int num_stripes, int small_stripe) +{ + int min_stripe_size = 1 * 1024 * 1024; + u64 calc_size = proposed_size; + u64 max_chunk_size = calc_size; + int ncopies = 1; + + if (type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID10)) + ncopies = 2; + if (type & BTRFS_BLOCK_GROUP_DATA) { max_chunk_size = 10 * calc_size; min_stripe_size = 64 * 1024 * 1024; @@ -2226,51 +2368,209 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), max_chunk_size); -again: - max_avail = 0; - if (!map || map->num_stripes != num_stripes) { - kfree(map); - map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); - if (!map) - return -ENOMEM; - map->num_stripes = num_stripes; - } - - if (calc_size * num_stripes > max_chunk_size) { - calc_size = max_chunk_size; + if (calc_size * num_stripes > max_chunk_size * ncopies) { + calc_size = max_chunk_size * ncopies; do_div(calc_size, num_stripes); - do_div(calc_size, stripe_len); - calc_size *= stripe_len; + do_div(calc_size, BTRFS_STRIPE_LEN); + calc_size *= BTRFS_STRIPE_LEN; } /* we don't want tiny stripes */ - if (!looped) + if (!small_stripe) calc_size = max_t(u64, min_stripe_size, calc_size); /* - * we're about to do_div by the stripe_len so lets make sure + * we're about to do_div by the BTRFS_STRIPE_LEN so lets make sure * we end up with something bigger than a stripe */ - calc_size = max_t(u64, calc_size, stripe_len * 4); + calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN); + + do_div(calc_size, BTRFS_STRIPE_LEN); + calc_size *= BTRFS_STRIPE_LEN; + + return calc_size; +} + +static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map, + int num_stripes) +{ + struct map_lookup *new; + size_t len = map_lookup_size(num_stripes); + + BUG_ON(map->num_stripes < num_stripes); + + if (map->num_stripes == num_stripes) + return map; + + new = kmalloc(len, GFP_NOFS); + if (!new) { + /* just change map->num_stripes */ + map->num_stripes = num_stripes; + return map; + } + + memcpy(new, map, len); + new->num_stripes = num_stripes; + kfree(map); + return new; +} + +/* + * helper to allocate device space from btrfs_device_info, in which we stored + * max free space information of every device. It is used when we can not + * allocate chunks by default size. + * + * By this helper, we can allocate a new chunk as larger as possible. + */ +static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_devices *fs_devices, + struct btrfs_device_info *devices, + int nr_device, u64 type, + struct map_lookup **map_lookup, + int min_stripes, u64 *stripe_size) +{ + int i, index, sort_again = 0; + int min_devices = min_stripes; + u64 max_avail, min_free; + struct map_lookup *map = *map_lookup; + int ret; + + if (nr_device < min_stripes) + return -ENOSPC; + + btrfs_descending_sort_devices(devices, nr_device); + + max_avail = devices[0].max_avail; + if (!max_avail) + return -ENOSPC; + + for (i = 0; i < nr_device; i++) { + /* + * if dev_offset = 0, it means the free space of this device + * is less than what we need, and we didn't search max avail + * extent on this device, so do it now. + */ + if (!devices[i].dev_offset) { + ret = find_free_dev_extent(trans, devices[i].dev, + max_avail, + &devices[i].dev_offset, + &devices[i].max_avail); + if (ret != 0 && ret != -ENOSPC) + return ret; + sort_again = 1; + } + } + + /* we update the max avail free extent of each devices, sort again */ + if (sort_again) + btrfs_descending_sort_devices(devices, nr_device); + + if (type & BTRFS_BLOCK_GROUP_DUP) + min_devices = 1; + + if (!devices[min_devices - 1].max_avail) + return -ENOSPC; + + max_avail = devices[min_devices - 1].max_avail; + if (type & BTRFS_BLOCK_GROUP_DUP) + do_div(max_avail, 2); + + max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type, + min_stripes, 1); + if (type & BTRFS_BLOCK_GROUP_DUP) + min_free = max_avail * 2; + else + min_free = max_avail; + + if (min_free > devices[min_devices - 1].max_avail) + return -ENOSPC; + + map = __shrink_map_lookup_stripes(map, min_stripes); + *stripe_size = max_avail; + + index = 0; + for (i = 0; i < min_stripes; i++) { + map->stripes[i].dev = devices[index].dev; + map->stripes[i].physical = devices[index].dev_offset; + if (type & BTRFS_BLOCK_GROUP_DUP) { + i++; + map->stripes[i].dev = devices[index].dev; + map->stripes[i].physical = devices[index].dev_offset + + max_avail; + } + index++; + } + *map_lookup = map; + + return 0; +} - do_div(calc_size, stripe_len); - calc_size *= stripe_len; +static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct map_lookup **map_ret, + u64 *num_bytes, u64 *stripe_size, + u64 start, u64 type) +{ + struct btrfs_fs_info *info = extent_root->fs_info; + struct btrfs_device *device = NULL; + struct btrfs_fs_devices *fs_devices = info->fs_devices; + struct list_head *cur; + struct map_lookup *map; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct btrfs_device_info *devices_info; + struct list_head private_devs; + u64 calc_size = 1024 * 1024 * 1024; + u64 min_free; + u64 avail; + u64 dev_offset; + int num_stripes; + int min_stripes; + int sub_stripes; + int min_devices; /* the min number of devices we need */ + int i; + int ret; + int index; + + if ((type & BTRFS_BLOCK_GROUP_RAID1) && + (type & BTRFS_BLOCK_GROUP_DUP)) { + WARN_ON(1); + type &= ~BTRFS_BLOCK_GROUP_DUP; + } + if (list_empty(&fs_devices->alloc_list)) + return -ENOSPC; + + ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes, + &min_stripes, &sub_stripes); + if (ret) + return ret; + + devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices, + GFP_NOFS); + if (!devices_info) + return -ENOMEM; + + map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); + if (!map) { + ret = -ENOMEM; + goto error; + } + map->num_stripes = num_stripes; cur = fs_devices->alloc_list.next; index = 0; + i = 0; - if (type & BTRFS_BLOCK_GROUP_DUP) + calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type, + num_stripes, 0); + + if (type & BTRFS_BLOCK_GROUP_DUP) { min_free = calc_size * 2; - else + min_devices = 1; + } else { min_free = calc_size; - - /* - * we add 1MB because we never use the first 1MB of the device, unless - * we've looped, then we are likely allocating the maximum amount of - * space left already - */ - if (!looped) - min_free += 1024 * 1024; + min_devices = min_stripes; + } INIT_LIST_HEAD(&private_devs); while (index < num_stripes) { @@ -2283,27 +2583,39 @@ again: cur = cur->next; if (device->in_fs_metadata && avail >= min_free) { - ret = find_free_dev_extent(trans, device, - min_free, &dev_offset, - &max_avail); + ret = find_free_dev_extent(trans, device, min_free, + &devices_info[i].dev_offset, + &devices_info[i].max_avail); if (ret == 0) { list_move_tail(&device->dev_alloc_list, &private_devs); map->stripes[index].dev = device; - map->stripes[index].physical = dev_offset; + map->stripes[index].physical = + devices_info[i].dev_offset; index++; if (type & BTRFS_BLOCK_GROUP_DUP) { map->stripes[index].dev = device; map->stripes[index].physical = - dev_offset + calc_size; + devices_info[i].dev_offset + + calc_size; index++; } - } - } else if (device->in_fs_metadata && avail > max_avail) - max_avail = avail; + } else if (ret != -ENOSPC) + goto error; + + devices_info[i].dev = device; + i++; + } else if (device->in_fs_metadata && + avail >= BTRFS_STRIPE_LEN) { + devices_info[i].dev = device; + devices_info[i].max_avail = avail; + i++; + } + if (cur == &fs_devices->alloc_list) break; } + list_splice(&private_devs, &fs_devices->alloc_list); if (index < num_stripes) { if (index >= min_stripes) { @@ -2312,34 +2624,36 @@ again: num_stripes /= sub_stripes; num_stripes *= sub_stripes; } - looped = 1; - goto again; - } - if (!looped && max_avail > 0) { - looped = 1; - calc_size = max_avail; - goto again; + + map = __shrink_map_lookup_stripes(map, num_stripes); + } else if (i >= min_devices) { + ret = __btrfs_alloc_tiny_space(trans, fs_devices, + devices_info, i, type, + &map, min_stripes, + &calc_size); + if (ret) + goto error; + } else { + ret = -ENOSPC; + goto error; } - kfree(map); - return -ENOSPC; } map->sector_size = extent_root->sectorsize; - map->stripe_len = stripe_len; - map->io_align = stripe_len; - map->io_width = stripe_len; + map->stripe_len = BTRFS_STRIPE_LEN; + map->io_align = BTRFS_STRIPE_LEN; + map->io_width = BTRFS_STRIPE_LEN; map->type = type; - map->num_stripes = num_stripes; map->sub_stripes = sub_stripes; *map_ret = map; *stripe_size = calc_size; *num_bytes = chunk_bytes_by_type(type, calc_size, - num_stripes, sub_stripes); + map->num_stripes, sub_stripes); em = alloc_extent_map(GFP_NOFS); if (!em) { - kfree(map); - return -ENOMEM; + ret = -ENOMEM; + goto error; } em->bdev = (struct block_device *)map; em->start = start; @@ -2372,7 +2686,13 @@ again: index++; } + kfree(devices_info); return 0; + +error: + kfree(map); + kfree(devices_info); + return ret; } static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2740db49eb04..7fb59d45fe8c 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -20,8 +20,11 @@ #define __BTRFS_VOLUMES_ #include <linux/bio.h> +#include <linux/sort.h> #include "async-thread.h" +#define BTRFS_STRIPE_LEN (64 * 1024) + struct buffer_head; struct btrfs_pending_bios { struct bio *head; @@ -50,7 +53,7 @@ struct btrfs_device { struct block_device *bdev; - /* the mode sent to open_bdev_exclusive */ + /* the mode sent to blkdev_get */ fmode_t mode; char *name; @@ -136,6 +139,30 @@ struct btrfs_multi_bio { struct btrfs_bio_stripe stripes[]; }; +struct btrfs_device_info { + struct btrfs_device *dev; + u64 dev_offset; + u64 max_avail; +}; + +/* Used to sort the devices by max_avail(descending sort) */ +int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2); + +/* + * sort the devices by max_avail, in which max free extent size of each device + * is stored.(Descending Sort) + */ +static inline void btrfs_descending_sort_devices( + struct btrfs_device_info *devices, + size_t nr_devices) +{ + sort(devices, nr_devices, sizeof(struct btrfs_device_info), + btrfs_cmp_device_free_bytes, NULL); +} + +int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, + u64 end, u64 *length); + #define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ (sizeof(struct btrfs_bio_stripe) * (n))) diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 698fdd2c739c..a5776531dc2b 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -316,6 +316,15 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { + struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; + + /* + * The permission on security.* and system.* is not checked + * in permission(). + */ + if (btrfs_root_readonly(root)) + return -EROFS; + /* * If this is a request for a synthetic attribute in the system.* * namespace use the generic infrastructure to resolve a handler @@ -336,6 +345,15 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, int btrfs_removexattr(struct dentry *dentry, const char *name) { + struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; + + /* + * The permission on security.* and system.* is not checked + * in permission(). + */ + if (btrfs_root_readonly(root)) + return -EROFS; + /* * If this is a request for a synthetic attribute in the system.* * namespace use the generic infrastructure to resolve a handler diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index b9cd5445f71c..f5ec2d44150d 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -32,15 +32,6 @@ #include <linux/bio.h> #include "compression.h" -/* Plan: call deflate() with avail_in == *sourcelen, - avail_out = *dstlen - 12 and flush == Z_FINISH. - If it doesn't manage to finish, call it again with - avail_in == 0 and avail_out set to the remaining 12 - bytes for it to clean up. - Q: Is 12 bytes sufficient? -*/ -#define STREAM_END_SPACE 12 - struct workspace { z_stream inf_strm; z_stream def_strm; @@ -48,152 +39,51 @@ struct workspace { struct list_head list; }; -static LIST_HEAD(idle_workspace); -static DEFINE_SPINLOCK(workspace_lock); -static unsigned long num_workspace; -static atomic_t alloc_workspace = ATOMIC_INIT(0); -static DECLARE_WAIT_QUEUE_HEAD(workspace_wait); +static void zlib_free_workspace(struct list_head *ws) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); -/* - * this finds an available zlib workspace or allocates a new one - * NULL or an ERR_PTR is returned if things go bad. - */ -static struct workspace *find_zlib_workspace(void) + vfree(workspace->def_strm.workspace); + vfree(workspace->inf_strm.workspace); + kfree(workspace->buf); + kfree(workspace); +} + +static struct list_head *zlib_alloc_workspace(void) { struct workspace *workspace; - int ret; - int cpus = num_online_cpus(); - -again: - spin_lock(&workspace_lock); - if (!list_empty(&idle_workspace)) { - workspace = list_entry(idle_workspace.next, struct workspace, - list); - list_del(&workspace->list); - num_workspace--; - spin_unlock(&workspace_lock); - return workspace; - } - spin_unlock(&workspace_lock); - if (atomic_read(&alloc_workspace) > cpus) { - DEFINE_WAIT(wait); - prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE); - if (atomic_read(&alloc_workspace) > cpus) - schedule(); - finish_wait(&workspace_wait, &wait); - goto again; - } - atomic_inc(&alloc_workspace); workspace = kzalloc(sizeof(*workspace), GFP_NOFS); - if (!workspace) { - ret = -ENOMEM; - goto fail; - } + if (!workspace) + return ERR_PTR(-ENOMEM); workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); - if (!workspace->def_strm.workspace) { - ret = -ENOMEM; - goto fail; - } workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); - if (!workspace->inf_strm.workspace) { - ret = -ENOMEM; - goto fail_inflate; - } workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); - if (!workspace->buf) { - ret = -ENOMEM; - goto fail_kmalloc; - } - return workspace; - -fail_kmalloc: - vfree(workspace->inf_strm.workspace); -fail_inflate: - vfree(workspace->def_strm.workspace); -fail: - kfree(workspace); - atomic_dec(&alloc_workspace); - wake_up(&workspace_wait); - return ERR_PTR(ret); -} - -/* - * put a workspace struct back on the list or free it if we have enough - * idle ones sitting around - */ -static int free_workspace(struct workspace *workspace) -{ - spin_lock(&workspace_lock); - if (num_workspace < num_online_cpus()) { - list_add_tail(&workspace->list, &idle_workspace); - num_workspace++; - spin_unlock(&workspace_lock); - if (waitqueue_active(&workspace_wait)) - wake_up(&workspace_wait); - return 0; - } - spin_unlock(&workspace_lock); - vfree(workspace->def_strm.workspace); - vfree(workspace->inf_strm.workspace); - kfree(workspace->buf); - kfree(workspace); + if (!workspace->def_strm.workspace || + !workspace->inf_strm.workspace || !workspace->buf) + goto fail; - atomic_dec(&alloc_workspace); - if (waitqueue_active(&workspace_wait)) - wake_up(&workspace_wait); - return 0; -} + INIT_LIST_HEAD(&workspace->list); -/* - * cleanup function for module exit - */ -static void free_workspaces(void) -{ - struct workspace *workspace; - while (!list_empty(&idle_workspace)) { - workspace = list_entry(idle_workspace.next, struct workspace, - list); - list_del(&workspace->list); - vfree(workspace->def_strm.workspace); - vfree(workspace->inf_strm.workspace); - kfree(workspace->buf); - kfree(workspace); - atomic_dec(&alloc_workspace); - } + return &workspace->list; +fail: + zlib_free_workspace(&workspace->list); + return ERR_PTR(-ENOMEM); } -/* - * given an address space and start/len, compress the bytes. - * - * pages are allocated to hold the compressed result and stored - * in 'pages' - * - * out_pages is used to return the number of pages allocated. There - * may be pages allocated even if we return an error - * - * total_in is used to return the number of bytes actually read. It - * may be smaller then len if we had to exit early because we - * ran out of room in the pages array or because we cross the - * max_out threshold. - * - * total_out is used to return the total number of compressed bytes - * - * max_out tells us the max number of bytes that we're allowed to - * stuff into pages - */ -int btrfs_zlib_compress_pages(struct address_space *mapping, - u64 start, unsigned long len, - struct page **pages, - unsigned long nr_dest_pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out, - unsigned long max_out) +static int zlib_compress_pages(struct list_head *ws, + struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) { + struct workspace *workspace = list_entry(ws, struct workspace, list); int ret; - struct workspace *workspace; char *data_in; char *cpage_out; int nr_pages = 0; @@ -205,10 +95,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping, *total_out = 0; *total_in = 0; - workspace = find_zlib_workspace(); - if (IS_ERR(workspace)) - return -1; - if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { printk(KERN_WARNING "deflateInit failed\n"); ret = -1; @@ -222,6 +108,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping, data_in = kmap(in_page); out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -1; + goto out; + } cpage_out = kmap(out_page); pages[0] = out_page; nr_pages = 1; @@ -260,6 +150,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping, goto out; } out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -1; + goto out; + } cpage_out = kmap(out_page); pages[nr_pages] = out_page; nr_pages++; @@ -314,55 +208,26 @@ out: kunmap(in_page); page_cache_release(in_page); } - free_workspace(workspace); return ret; } -/* - * pages_in is an array of pages with compressed data. - * - * disk_start is the starting logical offset of this array in the file - * - * bvec is a bio_vec of pages from the file that we want to decompress into - * - * vcnt is the count of pages in the biovec - * - * srclen is the number of bytes in pages_in - * - * The basic idea is that we have a bio that was created by readpages. - * The pages in the bio are for the uncompressed data, and they may not - * be contiguous. They all correspond to the range of bytes covered by - * the compressed extent. - */ -int btrfs_zlib_decompress_biovec(struct page **pages_in, - u64 disk_start, - struct bio_vec *bvec, - int vcnt, - size_t srclen) +static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen) { - int ret = 0; + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0, ret2; int wbits = MAX_WBITS; - struct workspace *workspace; char *data_in; size_t total_out = 0; - unsigned long page_bytes_left; unsigned long page_in_index = 0; unsigned long page_out_index = 0; - struct page *page_out; unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE; unsigned long buf_start; - unsigned long buf_offset; - unsigned long bytes; - unsigned long working_bytes; unsigned long pg_offset; - unsigned long start_byte; - unsigned long current_buf_start; - char *kaddr; - - workspace = find_zlib_workspace(); - if (IS_ERR(workspace)) - return -ENOMEM; data_in = kmap(pages_in[page_in_index]); workspace->inf_strm.next_in = data_in; @@ -372,8 +237,6 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in, workspace->inf_strm.total_out = 0; workspace->inf_strm.next_out = workspace->buf; workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; - page_out = bvec[page_out_index].bv_page; - page_bytes_left = PAGE_CACHE_SIZE; pg_offset = 0; /* If it's deflate, and it's got no preset dictionary, then @@ -389,107 +252,29 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in, if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { printk(KERN_WARNING "inflateInit failed\n"); - ret = -1; - goto out; + return -1; } while (workspace->inf_strm.total_in < srclen) { ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); if (ret != Z_OK && ret != Z_STREAM_END) break; - /* - * buf start is the byte offset we're of the start of - * our workspace buffer - */ - buf_start = total_out; - /* total_out is the last byte of the workspace buffer */ + buf_start = total_out; total_out = workspace->inf_strm.total_out; - working_bytes = total_out - buf_start; - - /* - * start byte is the first byte of the page we're currently - * copying into relative to the start of the compressed data. - */ - start_byte = page_offset(page_out) - disk_start; - - if (working_bytes == 0) { - /* we didn't make progress in this inflate - * call, we're done - */ - if (ret != Z_STREAM_END) - ret = -1; + /* we didn't make progress in this inflate call, we're done */ + if (buf_start == total_out) break; - } - /* we haven't yet hit data corresponding to this page */ - if (total_out <= start_byte) - goto next; - - /* - * the start of the data we care about is offset into - * the middle of our working buffer - */ - if (total_out > start_byte && buf_start < start_byte) { - buf_offset = start_byte - buf_start; - working_bytes -= buf_offset; - } else { - buf_offset = 0; - } - current_buf_start = buf_start; - - /* copy bytes from the working buffer into the pages */ - while (working_bytes > 0) { - bytes = min(PAGE_CACHE_SIZE - pg_offset, - PAGE_CACHE_SIZE - buf_offset); - bytes = min(bytes, working_bytes); - kaddr = kmap_atomic(page_out, KM_USER0); - memcpy(kaddr + pg_offset, workspace->buf + buf_offset, - bytes); - kunmap_atomic(kaddr, KM_USER0); - flush_dcache_page(page_out); - - pg_offset += bytes; - page_bytes_left -= bytes; - buf_offset += bytes; - working_bytes -= bytes; - current_buf_start += bytes; - - /* check if we need to pick another page */ - if (page_bytes_left == 0) { - page_out_index++; - if (page_out_index >= vcnt) { - ret = 0; - goto done; - } - - page_out = bvec[page_out_index].bv_page; - pg_offset = 0; - page_bytes_left = PAGE_CACHE_SIZE; - start_byte = page_offset(page_out) - disk_start; - - /* - * make sure our new page is covered by this - * working buffer - */ - if (total_out <= start_byte) - goto next; - - /* the next page in the biovec might not - * be adjacent to the last page, but it - * might still be found inside this working - * buffer. bump our offset pointer - */ - if (total_out > start_byte && - current_buf_start < start_byte) { - buf_offset = start_byte - buf_start; - working_bytes = total_out - start_byte; - current_buf_start = buf_start + - buf_offset; - } - } + ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start, + total_out, disk_start, + bvec, vcnt, + &page_out_index, &pg_offset); + if (ret2 == 0) { + ret = 0; + goto done; } -next: + workspace->inf_strm.next_out = workspace->buf; workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; @@ -516,35 +301,21 @@ done: zlib_inflateEnd(&workspace->inf_strm); if (data_in) kunmap(pages_in[page_in_index]); -out: - free_workspace(workspace); return ret; } -/* - * a less complex decompression routine. Our compressed data fits in a - * single page, and we want to read a single page out of it. - * start_byte tells us the offset into the compressed data we're interested in - */ -int btrfs_zlib_decompress(unsigned char *data_in, - struct page *dest_page, - unsigned long start_byte, - size_t srclen, size_t destlen) +static int zlib_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen) { + struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0; int wbits = MAX_WBITS; - struct workspace *workspace; unsigned long bytes_left = destlen; unsigned long total_out = 0; char *kaddr; - if (destlen > PAGE_CACHE_SIZE) - return -ENOMEM; - - workspace = find_zlib_workspace(); - if (IS_ERR(workspace)) - return -ENOMEM; - workspace->inf_strm.next_in = data_in; workspace->inf_strm.avail_in = srclen; workspace->inf_strm.total_in = 0; @@ -565,8 +336,7 @@ int btrfs_zlib_decompress(unsigned char *data_in, if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { printk(KERN_WARNING "inflateInit failed\n"); - ret = -1; - goto out; + return -1; } while (bytes_left > 0) { @@ -616,12 +386,13 @@ next: ret = 0; zlib_inflateEnd(&workspace->inf_strm); -out: - free_workspace(workspace); return ret; } -void btrfs_zlib_exit(void) -{ - free_workspaces(); -} +struct btrfs_compress_op btrfs_zlib_compress = { + .alloc_workspace = zlib_alloc_workspace, + .free_workspace = zlib_free_workspace, + .compress_pages = zlib_compress_pages, + .decompress_biovec = zlib_decompress_biovec, + .decompress = zlib_decompress, +}; |