diff options
Diffstat (limited to 'fs')
99 files changed, 1839 insertions, 988 deletions
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index ff911e779651..be1e34adc3c6 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -52,10 +52,9 @@ */ struct p9_rdir { - struct mutex mutex; int head; int tail; - uint8_t *buf; + uint8_t buf[]; }; /** @@ -93,33 +92,12 @@ static void p9stat_init(struct p9_wstat *stbuf) * */ -static int v9fs_alloc_rdir_buf(struct file *filp, int buflen) +static struct p9_rdir *v9fs_alloc_rdir_buf(struct file *filp, int buflen) { - struct p9_rdir *rdir; - struct p9_fid *fid; - int err = 0; - - fid = filp->private_data; - if (!fid->rdir) { - rdir = kmalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL); - - if (rdir == NULL) { - err = -ENOMEM; - goto exit; - } - spin_lock(&filp->f_dentry->d_lock); - if (!fid->rdir) { - rdir->buf = (uint8_t *)rdir + sizeof(struct p9_rdir); - mutex_init(&rdir->mutex); - rdir->head = rdir->tail = 0; - fid->rdir = (void *) rdir; - rdir = NULL; - } - spin_unlock(&filp->f_dentry->d_lock); - kfree(rdir); - } -exit: - return err; + struct p9_fid *fid = filp->private_data; + if (!fid->rdir) + fid->rdir = kzalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL); + return fid->rdir; } /** @@ -145,20 +123,16 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) buflen = fid->clnt->msize - P9_IOHDRSZ; - err = v9fs_alloc_rdir_buf(filp, buflen); - if (err) - goto exit; - rdir = (struct p9_rdir *) fid->rdir; + rdir = v9fs_alloc_rdir_buf(filp, buflen); + if (!rdir) + return -ENOMEM; - err = mutex_lock_interruptible(&rdir->mutex); - if (err) - return err; - while (err == 0) { + while (1) { if (rdir->tail == rdir->head) { err = v9fs_file_readn(filp, rdir->buf, NULL, buflen, filp->f_pos); if (err <= 0) - goto unlock_and_exit; + return err; rdir->head = 0; rdir->tail = err; @@ -169,9 +143,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) rdir->tail - rdir->head, &st); if (err) { p9_debug(P9_DEBUG_VFS, "returned %d\n", err); - err = -EIO; p9stat_free(&st); - goto unlock_and_exit; + return -EIO; } reclen = st.size+2; @@ -180,19 +153,13 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) p9stat_free(&st); - if (over) { - err = 0; - goto unlock_and_exit; - } + if (over) + return 0; + rdir->head += reclen; filp->f_pos += reclen; } } - -unlock_and_exit: - mutex_unlock(&rdir->mutex); -exit: - return err; } /** @@ -218,21 +185,16 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, buflen = fid->clnt->msize - P9_READDIRHDRSZ; - err = v9fs_alloc_rdir_buf(filp, buflen); - if (err) - goto exit; - rdir = (struct p9_rdir *) fid->rdir; + rdir = v9fs_alloc_rdir_buf(filp, buflen); + if (!rdir) + return -ENOMEM; - err = mutex_lock_interruptible(&rdir->mutex); - if (err) - return err; - - while (err == 0) { + while (1) { if (rdir->tail == rdir->head) { err = p9_client_readdir(fid, rdir->buf, buflen, filp->f_pos); if (err <= 0) - goto unlock_and_exit; + return err; rdir->head = 0; rdir->tail = err; @@ -245,8 +207,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, &curdirent); if (err < 0) { p9_debug(P9_DEBUG_VFS, "returned %d\n", err); - err = -EIO; - goto unlock_and_exit; + return -EIO; } /* d_off in dirent structure tracks the offset into @@ -261,20 +222,13 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent, curdirent.d_type); oldoffset = curdirent.d_off; - if (over) { - err = 0; - goto unlock_and_exit; - } + if (over) + return 0; filp->f_pos = curdirent.d_off; rdir->head += err; } } - -unlock_and_exit: - mutex_unlock(&rdir->mutex); -exit: - return err; } diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index c2483e97beee..7a3399767570 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -80,10 +80,6 @@ int v9fs_file_open(struct inode *inode, struct file *file) p9_client_clunk(fid); return err; } - if (file->f_flags & O_TRUNC) { - i_size_write(inode, 0); - inode->i_blocks = 0; - } if ((file->f_flags & O_APPEND) && (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses))) generic_file_llseek(file, 0, SEEK_END); diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 890bed538f9b..57d017ac68e4 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -192,9 +192,6 @@ int v9fs_uflags2omode(int uflags, int extended) break; } - if (uflags & O_TRUNC) - ret |= P9_OTRUNC; - if (extended) { if (uflags & O_EXCL) ret |= P9_OEXCL; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 40895546e103..8d24ad66dfb8 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -186,7 +186,6 @@ static int v9fs_mapped_dotl_flags(int flags) { O_CREAT, P9_DOTL_CREATE }, { O_EXCL, P9_DOTL_EXCL }, { O_NOCTTY, P9_DOTL_NOCTTY }, - { O_TRUNC, P9_DOTL_TRUNC }, { O_APPEND, P9_DOTL_APPEND }, { O_NONBLOCK, P9_DOTL_NONBLOCK }, { O_DSYNC, P9_DOTL_DSYNC }, @@ -268,8 +267,14 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, } /* Only creates */ - if (!(flags & O_CREAT) || dentry->d_inode) - return finish_no_open(file, res); + if (!(flags & O_CREAT)) + return finish_no_open(file, res); + else if (dentry->d_inode) { + if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) + return -EEXIST; + else + return finish_no_open(file, res); + } v9ses = v9fs_inode2v9ses(dir); diff --git a/fs/Kconfig b/fs/Kconfig index cfe512fd1caf..780725a463b1 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -68,16 +68,6 @@ source "fs/quota/Kconfig" source "fs/autofs4/Kconfig" source "fs/fuse/Kconfig" -config CUSE - tristate "Character device in Userspace support" - depends on FUSE_FS - help - This FUSE extension allows character devices to be - implemented in userspace. - - If you want to develop or use userspace character device - based on CUSE, answer Y or M. - config GENERIC_ACL bool select FS_POSIX_ACL diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 0c42cdbabecf..49d0b43458b7 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -33,6 +33,7 @@ #include <linux/elf.h> #include <linux/utsname.h> #include <linux/coredump.h> +#include <linux/sched.h> #include <asm/uaccess.h> #include <asm/param.h> #include <asm/page.h> @@ -1320,8 +1321,11 @@ static void fill_prstatus(struct elf_prstatus *prstatus, cputime_to_timeval(cputime.utime, &prstatus->pr_utime); cputime_to_timeval(cputime.stime, &prstatus->pr_stime); } else { - cputime_to_timeval(p->utime, &prstatus->pr_utime); - cputime_to_timeval(p->stime, &prstatus->pr_stime); + cputime_t utime, stime; + + task_cputime(p, &utime, &stime); + cputime_to_timeval(utime, &prstatus->pr_utime); + cputime_to_timeval(stime, &prstatus->pr_stime); } cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime); cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index dc84732e554f..cb240dd3b402 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1375,8 +1375,11 @@ static void fill_prstatus(struct elf_prstatus *prstatus, cputime_to_timeval(cputime.utime, &prstatus->pr_utime); cputime_to_timeval(cputime.stime, &prstatus->pr_stime); } else { - cputime_to_timeval(p->utime, &prstatus->pr_utime); - cputime_to_timeval(p->stime, &prstatus->pr_stime); + cputime_t utime, stime; + + task_cputime(p, &utime, &stime); + cputime_to_timeval(utime, &prstatus->pr_utime); + cputime_to_timeval(stime, &prstatus->pr_stime); } cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime); cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 521e9d4424f6..5a3327b8f90d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3997,7 +3997,7 @@ again: * We make the other tasks wait for the flush only when we can flush * all things. */ - if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) { + if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { flushing = true; space_info->flush = 1; } @@ -4534,7 +4534,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) unsigned nr_extents = 0; int extra_reserve = 0; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; - int ret; + int ret = 0; bool delalloc_lock = true; /* If we are a free space inode we need to not flush since we will be in @@ -4579,20 +4579,18 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) csum_bytes = BTRFS_I(inode)->csum_bytes; spin_unlock(&BTRFS_I(inode)->lock); - if (root->fs_info->quota_enabled) { + if (root->fs_info->quota_enabled) ret = btrfs_qgroup_reserve(root, num_bytes + nr_extents * root->leafsize); - if (ret) { - spin_lock(&BTRFS_I(inode)->lock); - calc_csum_metadata_size(inode, num_bytes, 0); - spin_unlock(&BTRFS_I(inode)->lock); - if (delalloc_lock) - mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); - return ret; - } - } - ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); + /* + * ret != 0 here means the qgroup reservation failed, we go straight to + * the shared error handling then. + */ + if (ret == 0) + ret = reserve_metadata_bytes(root, block_rsv, + to_reserve, flush); + if (ret) { u64 to_free = 0; unsigned dropped; @@ -5560,7 +5558,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, int empty_cluster = 2 * 1024 * 1024; struct btrfs_space_info *space_info; int loop = 0; - int index = 0; + int index = __get_raid_index(data); int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; bool found_uncached_bg = false; @@ -6788,11 +6786,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, &wc->flags[level]); if (ret < 0) { btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; return ret; } BUG_ON(wc->refs[level] == 0); if (wc->refs[level] == 1) { btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; return 1; } } diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index f169d6b11d7f..fdb7a8db3b57 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -171,6 +171,10 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags)) return 0; + if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) || + test_bit(EXTENT_FLAG_LOGGING, &next->flags)) + return 0; + if (extent_map_end(prev) == next->start && prev->flags == next->flags && prev->bdev == next->bdev && @@ -255,7 +259,8 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, if (!em) goto out; - list_move(&em->list, &tree->modified_extents); + if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) + list_move(&em->list, &tree->modified_extents); em->generation = gen; clear_bit(EXTENT_FLAG_PINNED, &em->flags); em->mod_start = em->start; @@ -280,6 +285,13 @@ out: } +void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) +{ + clear_bit(EXTENT_FLAG_LOGGING, &em->flags); + if (em->in_tree) + try_merge_map(tree, em); +} + /** * add_extent_mapping - add new extent map to the extent tree * @tree: tree to insert new map in diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 922943ce29e8..c6598c89cff8 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -69,6 +69,7 @@ void free_extent_map(struct extent_map *em); int __init extent_map_init(void); void extent_map_exit(void); int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen); +void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em); struct extent_map *search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); #endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index bd38cef42358..94aa53b38721 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -460,8 +460,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, if (!contig) offset = page_offset(bvec->bv_page) + bvec->bv_offset; - if (!contig && (offset >= ordered->file_offset + ordered->len || - offset < ordered->file_offset)) { + if (offset >= ordered->file_offset + ordered->len || + offset < ordered->file_offset) { unsigned long bytes_left; sums->len = this_sum_bytes; this_sum_bytes = 0; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 77061bf43edb..aeb84469d2c4 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -293,15 +293,24 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, struct btrfs_key key; struct btrfs_ioctl_defrag_range_args range; int num_defrag; + int index; + int ret; /* get the inode */ key.objectid = defrag->root; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); key.offset = (u64)-1; + + index = srcu_read_lock(&fs_info->subvol_srcu); + inode_root = btrfs_read_fs_root_no_name(fs_info, &key); if (IS_ERR(inode_root)) { - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - return PTR_ERR(inode_root); + ret = PTR_ERR(inode_root); + goto cleanup; + } + if (btrfs_root_refs(&inode_root->root_item) == 0) { + ret = -ENOENT; + goto cleanup; } key.objectid = defrag->ino; @@ -309,9 +318,10 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); if (IS_ERR(inode)) { - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - return PTR_ERR(inode); + ret = PTR_ERR(inode); + goto cleanup; } + srcu_read_unlock(&fs_info->subvol_srcu, index); /* do a chunk of defrag */ clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); @@ -346,6 +356,10 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, iput(inode); return 0; +cleanup: + srcu_read_unlock(&fs_info->subvol_srcu, index); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + return ret; } /* @@ -1594,9 +1608,10 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, if (err < 0 && num_written > 0) num_written = err; } -out: + if (sync) atomic_dec(&BTRFS_I(inode)->sync_writers); +out: sb_end_write(inode->i_sb); current->backing_dev_info = NULL; return num_written ? num_written : err; @@ -2241,6 +2256,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) if (lockend <= lockstart) lockend = lockstart + root->sectorsize; + lockend--; len = lockend - lockstart + 1; len = max_t(u64, len, root->sectorsize); @@ -2307,9 +2323,12 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) } } - *offset = start; - free_extent_map(em); - break; + if (!test_bit(EXTENT_FLAG_PREALLOC, + &em->flags)) { + *offset = start; + free_extent_map(em); + break; + } } } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 59ea2e4349c9..0be7a8742a43 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1862,11 +1862,13 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *info; - int ret = 0; + int ret; + bool re_search = false; spin_lock(&ctl->tree_lock); again: + ret = 0; if (!bytes) goto out_lock; @@ -1879,17 +1881,17 @@ again: info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!info) { - /* the tree logging code might be calling us before we - * have fully loaded the free space rbtree for this - * block group. So it is possible the entry won't - * be in the rbtree yet at all. The caching code - * will make sure not to put it in the rbtree if - * the logging code has pinned it. + /* + * If we found a partial bit of our free space in a + * bitmap but then couldn't find the other part this may + * be a problem, so WARN about it. */ + WARN_ON(re_search); goto out_lock; } } + re_search = false; if (!info->bitmap) { unlink_free_space(ctl, info); if (offset == info->offset) { @@ -1935,8 +1937,10 @@ again: } ret = remove_from_bitmap(ctl, info, &offset, &bytes); - if (ret == -EAGAIN) + if (ret == -EAGAIN) { + re_search = true; goto again; + } BUG_ON(ret); /* logic error */ out_lock: spin_unlock(&ctl->tree_lock); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 16d9e8e191e6..cc93b23ca352 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -88,7 +88,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, }; -static int btrfs_setsize(struct inode *inode, loff_t newsize); +static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct inode *inode); static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); static noinline int cow_file_range(struct inode *inode, @@ -2478,6 +2478,18 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) continue; } nr_truncate++; + + /* 1 for the orphan item deletion. */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + ret = btrfs_orphan_add(trans, inode); + btrfs_end_transaction(trans, root); + if (ret) + goto out; + ret = btrfs_truncate(inode); } else { nr_unlink++; @@ -3665,6 +3677,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) block_end - cur_offset, 0); if (IS_ERR(em)) { err = PTR_ERR(em); + em = NULL; break; } last_byte = min(extent_map_end(em), block_end); @@ -3748,16 +3761,27 @@ next: return err; } -static int btrfs_setsize(struct inode *inode, loff_t newsize) +static int btrfs_setsize(struct inode *inode, struct iattr *attr) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; loff_t oldsize = i_size_read(inode); + loff_t newsize = attr->ia_size; + int mask = attr->ia_valid; int ret; if (newsize == oldsize) return 0; + /* + * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a + * special case where we need to update the times despite not having + * these flags set. For all other operations the VFS set these flags + * explicitly if it wants a timestamp update. + */ + if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) + inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); + if (newsize > oldsize) { truncate_pagecache(inode, oldsize, newsize); ret = btrfs_cont_expand(inode, oldsize, newsize); @@ -3783,9 +3807,34 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize) set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, &BTRFS_I(inode)->runtime_flags); + /* + * 1 for the orphan item we're going to add + * 1 for the orphan item deletion. + */ + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + /* + * We need to do this in case we fail at _any_ point during the + * actual truncate. Once we do the truncate_setsize we could + * invalidate pages which forces any outstanding ordered io to + * be instantly completed which will give us extents that need + * to be truncated. If we fail to get an orphan inode down we + * could have left over extents that were never meant to live, + * so we need to garuntee from this point on that everything + * will be consistent. + */ + ret = btrfs_orphan_add(trans, inode); + btrfs_end_transaction(trans, root); + if (ret) + return ret; + /* we don't support swapfiles, so vmtruncate shouldn't fail */ truncate_setsize(inode, newsize); ret = btrfs_truncate(inode); + if (ret && inode->i_nlink) + btrfs_orphan_del(NULL, inode); } return ret; @@ -3805,7 +3854,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) return err; if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { - err = btrfs_setsize(inode, attr->ia_size); + err = btrfs_setsize(inode, attr); if (err) return err; } @@ -5572,10 +5621,13 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag return em; if (em) { /* - * if our em maps to a hole, there might - * actually be delalloc bytes behind it + * if our em maps to + * - a hole or + * - a pre-alloc extent, + * there might actually be delalloc bytes behind it. */ - if (em->block_start != EXTENT_MAP_HOLE) + if (em->block_start != EXTENT_MAP_HOLE && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) return em; else hole_em = em; @@ -5657,6 +5709,8 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag */ em->block_start = hole_em->block_start; em->block_len = hole_len; + if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags)) + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); } else { em->start = range_start; em->len = found; @@ -6915,11 +6969,9 @@ static int btrfs_truncate(struct inode *inode) /* * 1 for the truncate slack space - * 1 for the orphan item we're going to add - * 1 for the orphan item deletion * 1 for updating the inode. */ - trans = btrfs_start_transaction(root, 4); + trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out; @@ -6930,12 +6982,6 @@ static int btrfs_truncate(struct inode *inode) min_size); BUG_ON(ret); - ret = btrfs_orphan_add(trans, inode); - if (ret) { - btrfs_end_transaction(trans, root); - goto out; - } - /* * setattr is responsible for setting the ordered_data_close flag, * but that is only tested during the last file release. That @@ -7004,12 +7050,6 @@ static int btrfs_truncate(struct inode *inode) ret = btrfs_orphan_del(trans, inode); if (ret) err = ret; - } else if (ret && inode->i_nlink > 0) { - /* - * Failed to do the truncate, remove us from the in memory - * orphan list. - */ - ret = btrfs_orphan_del(NULL, inode); } if (trans) { @@ -7531,41 +7571,61 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) */ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) { - struct list_head *head = &root->fs_info->delalloc_inodes; struct btrfs_inode *binode; struct inode *inode; struct btrfs_delalloc_work *work, *next; struct list_head works; + struct list_head splice; int ret = 0; if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; INIT_LIST_HEAD(&works); - + INIT_LIST_HEAD(&splice); +again: spin_lock(&root->fs_info->delalloc_lock); - while (!list_empty(head)) { - binode = list_entry(head->next, struct btrfs_inode, + list_splice_init(&root->fs_info->delalloc_inodes, &splice); + while (!list_empty(&splice)) { + binode = list_entry(splice.next, struct btrfs_inode, delalloc_inodes); + + list_del_init(&binode->delalloc_inodes); + inode = igrab(&binode->vfs_inode); if (!inode) - list_del_init(&binode->delalloc_inodes); + continue; + + list_add_tail(&binode->delalloc_inodes, + &root->fs_info->delalloc_inodes); spin_unlock(&root->fs_info->delalloc_lock); - if (inode) { - work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); - if (!work) { - ret = -ENOMEM; - goto out; - } - list_add_tail(&work->list, &works); - btrfs_queue_worker(&root->fs_info->flush_workers, - &work->work); + + work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); + if (unlikely(!work)) { + ret = -ENOMEM; + goto out; } + list_add_tail(&work->list, &works); + btrfs_queue_worker(&root->fs_info->flush_workers, + &work->work); + cond_resched(); spin_lock(&root->fs_info->delalloc_lock); } spin_unlock(&root->fs_info->delalloc_lock); + list_for_each_entry_safe(work, next, &works, list) { + list_del_init(&work->list); + btrfs_wait_and_free_delalloc_work(work); + } + + spin_lock(&root->fs_info->delalloc_lock); + if (!list_empty(&root->fs_info->delalloc_inodes)) { + spin_unlock(&root->fs_info->delalloc_lock); + goto again; + } + spin_unlock(&root->fs_info->delalloc_lock); + /* the filemap_flush will queue IO into the worker threads, but * we have to make sure the IO is actually started and that * ordered extents get created before we return @@ -7578,11 +7638,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) atomic_read(&root->fs_info->async_delalloc_pages) == 0)); } atomic_dec(&root->fs_info->async_submit_draining); + return 0; out: list_for_each_entry_safe(work, next, &works, list) { list_del_init(&work->list); btrfs_wait_and_free_delalloc_work(work); } + + if (!list_empty_careful(&splice)) { + spin_lock(&root->fs_info->delalloc_lock); + list_splice_tail(&splice, &root->fs_info->delalloc_inodes); + spin_unlock(&root->fs_info->delalloc_lock); + } return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4b4516770f05..338f2597bf7f 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -515,7 +515,6 @@ static noinline int create_subvol(struct btrfs_root *root, BUG_ON(ret); - d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); fail: if (async_transid) { *async_transid = trans->transid; @@ -525,6 +524,10 @@ fail: } if (err && !ret) ret = err; + + if (!ret) + d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); + return ret; } @@ -1339,7 +1342,8 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); - return -EINPROGRESS; + mnt_drop_write_file(file); + return -EINVAL; } mutex_lock(&root->fs_info->volume_mutex); @@ -1362,6 +1366,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, printk(KERN_INFO "btrfs: resizing devid %llu\n", (unsigned long long)devid); } + device = btrfs_find_device(root->fs_info, devid, NULL, NULL); if (!device) { printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", @@ -1369,9 +1374,10 @@ static noinline int btrfs_ioctl_resize(struct file *file, ret = -EINVAL; goto out_free; } - if (device->fs_devices && device->fs_devices->seeding) { + + if (!device->writeable) { printk(KERN_INFO "btrfs: resizer unable to apply on " - "seeding device %llu\n", + "readonly device %llu\n", (unsigned long long)devid); ret = -EINVAL; goto out_free; @@ -1443,8 +1449,8 @@ out_free: kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); - mnt_drop_write_file(file); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + mnt_drop_write_file(file); return ret; } @@ -2095,13 +2101,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, err = inode_permission(inode, MAY_WRITE | MAY_EXEC); if (err) goto out_dput; - - /* check if subvolume may be deleted by a non-root user */ - err = btrfs_may_delete(dir, dentry, 1); - if (err) - goto out_dput; } + /* check if subvolume may be deleted by a user */ + err = btrfs_may_delete(dir, dentry, 1); + if (err) + goto out_dput; + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) { err = -EINVAL; goto out_dput; @@ -2183,19 +2189,20 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) struct btrfs_ioctl_defrag_range_args *range; int ret; - if (btrfs_root_readonly(root)) - return -EROFS; + ret = mnt_want_write_file(file); + if (ret) + return ret; if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); - return -EINPROGRESS; + mnt_drop_write_file(file); + return -EINVAL; } - ret = mnt_want_write_file(file); - if (ret) { - atomic_set(&root->fs_info->mutually_exclusive_operation_running, - 0); - return ret; + + if (btrfs_root_readonly(root)) { + ret = -EROFS; + goto out; } switch (inode->i_mode & S_IFMT) { @@ -2247,8 +2254,8 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = -EINVAL; } out: - mnt_drop_write_file(file); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + mnt_drop_write_file(file); return ret; } @@ -2263,7 +2270,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); - return -EINPROGRESS; + return -EINVAL; } mutex_lock(&root->fs_info->volume_mutex); @@ -2300,7 +2307,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); mnt_drop_write_file(file); - return -EINPROGRESS; + return -EINVAL; } mutex_lock(&root->fs_info->volume_mutex); @@ -2316,8 +2323,8 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); - mnt_drop_write_file(file); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + mnt_drop_write_file(file); return ret; } @@ -3437,8 +3444,8 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ioctl_balance_args *bargs; struct btrfs_balance_control *bctl; + bool need_unlock; /* for mut. excl. ops lock */ int ret; - int need_to_clear_lock = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3447,14 +3454,61 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) if (ret) return ret; - mutex_lock(&fs_info->volume_mutex); +again: + if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + need_unlock = true; + goto locked; + } + + /* + * mut. excl. ops lock is locked. Three possibilites: + * (1) some other op is running + * (2) balance is running + * (3) balance is paused -- special case (think resume) + */ mutex_lock(&fs_info->balance_mutex); + if (fs_info->balance_ctl) { + /* this is either (2) or (3) */ + if (!atomic_read(&fs_info->balance_running)) { + mutex_unlock(&fs_info->balance_mutex); + if (!mutex_trylock(&fs_info->volume_mutex)) + goto again; + mutex_lock(&fs_info->balance_mutex); + + if (fs_info->balance_ctl && + !atomic_read(&fs_info->balance_running)) { + /* this is (3) */ + need_unlock = false; + goto locked; + } + + mutex_unlock(&fs_info->balance_mutex); + mutex_unlock(&fs_info->volume_mutex); + goto again; + } else { + /* this is (2) */ + mutex_unlock(&fs_info->balance_mutex); + ret = -EINPROGRESS; + goto out; + } + } else { + /* this is (1) */ + mutex_unlock(&fs_info->balance_mutex); + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + ret = -EINVAL; + goto out; + } + +locked: + BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running)); if (arg) { bargs = memdup_user(arg, sizeof(*bargs)); if (IS_ERR(bargs)) { ret = PTR_ERR(bargs); - goto out; + goto out_unlock; } if (bargs->flags & BTRFS_BALANCE_RESUME) { @@ -3474,13 +3528,10 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) bargs = NULL; } - if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, - 1)) { - pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + if (fs_info->balance_ctl) { ret = -EINPROGRESS; goto out_bargs; } - need_to_clear_lock = 1; bctl = kzalloc(sizeof(*bctl), GFP_NOFS); if (!bctl) { @@ -3501,11 +3552,17 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) } do_balance: - ret = btrfs_balance(bctl, bargs); /* - * bctl is freed in __cancel_balance or in free_fs_info if - * restriper was paused all the way until unmount + * Ownership of bctl and mutually_exclusive_operation_running + * goes to to btrfs_balance. bctl is freed in __cancel_balance, + * or, if restriper was paused all the way until unmount, in + * free_fs_info. mutually_exclusive_operation_running is + * cleared in __cancel_balance. */ + need_unlock = false; + + ret = btrfs_balance(bctl, bargs); + if (arg) { if (copy_to_user(arg, bargs, sizeof(*bargs))) ret = -EFAULT; @@ -3513,12 +3570,12 @@ do_balance: out_bargs: kfree(bargs); -out: - if (need_to_clear_lock) - atomic_set(&root->fs_info->mutually_exclusive_operation_running, - 0); +out_unlock: mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); + if (need_unlock) + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); +out: mnt_drop_write_file(file); return ret; } @@ -3698,6 +3755,11 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) goto drop_write; } + if (!sa->qgroupid) { + ret = -EINVAL; + goto out; + } + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index f10731297040..e5ed56729607 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -836,9 +836,16 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, * if the disk i_size is already at the inode->i_size, or * this ordered extent is inside the disk i_size, we're done */ - if (disk_i_size == i_size || offset <= disk_i_size) { + if (disk_i_size == i_size) + goto out; + + /* + * We still need to update disk_i_size if outstanding_isize is greater + * than disk_i_size. + */ + if (offset <= disk_i_size && + (!ordered || ordered->outstanding_isize <= disk_i_size)) goto out; - } /* * walk backward from this ordered extent to disk_i_size. @@ -870,7 +877,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, break; if (test->file_offset >= i_size) break; - if (test->file_offset >= disk_i_size) { + if (entry_end(test) > disk_i_size) { /* * we don't update disk_i_size now, so record this * undealt i_size. Or we will not know the real diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index fe9d02c45f8e..a5c856234323 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -379,6 +379,13 @@ next1: ret = add_relation_rb(fs_info, found_key.objectid, found_key.offset); + if (ret == -ENOENT) { + printk(KERN_WARNING + "btrfs: orphan qgroup relation 0x%llx->0x%llx\n", + (unsigned long long)found_key.objectid, + (unsigned long long)found_key.offset); + ret = 0; /* ignore the error */ + } if (ret) goto out; next2: @@ -956,17 +963,28 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 qgroupid) { struct btrfs_root *quota_root; + struct btrfs_qgroup *qgroup; int ret = 0; quota_root = fs_info->quota_root; if (!quota_root) return -EINVAL; + /* check if there are no relations to this qgroup */ + spin_lock(&fs_info->qgroup_lock); + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (qgroup) { + if (!list_empty(&qgroup->groups) || !list_empty(&qgroup->members)) { + spin_unlock(&fs_info->qgroup_lock); + return -EBUSY; + } + } + spin_unlock(&fs_info->qgroup_lock); + ret = del_qgroup_item(trans, quota_root, qgroupid); spin_lock(&fs_info->qgroup_lock); del_qgroup_rb(quota_root->fs_info, qgroupid); - spin_unlock(&fs_info->qgroup_lock); return ret; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index bdbb94f245c9..67783e03d121 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -580,20 +580,29 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) int corrected = 0; struct btrfs_key key; struct inode *inode = NULL; + struct btrfs_fs_info *fs_info; u64 end = offset + PAGE_SIZE - 1; struct btrfs_root *local_root; + int srcu_index; key.objectid = root; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key); - if (IS_ERR(local_root)) + + fs_info = fixup->root->fs_info; + srcu_index = srcu_read_lock(&fs_info->subvol_srcu); + + local_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(local_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); return PTR_ERR(local_root); + } key.type = BTRFS_INODE_ITEM_KEY; key.objectid = inum; key.offset = 0; - inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL); + inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -606,7 +615,6 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) } if (PageUptodate(page)) { - struct btrfs_fs_info *fs_info; if (PageDirty(page)) { /* * we need to write the data to the defect sector. the @@ -3180,18 +3188,25 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) u64 physical_for_dev_replace; u64 len; struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; + int srcu_index; key.objectid = root; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; + + srcu_index = srcu_read_lock(&fs_info->subvol_srcu); + local_root = btrfs_read_fs_root_no_name(fs_info, &key); - if (IS_ERR(local_root)) + if (IS_ERR(local_root)) { + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); return PTR_ERR(local_root); + } key.type = BTRFS_INODE_ITEM_KEY; key.objectid = inum; key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); + srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); if (IS_ERR(inode)) return PTR_ERR(inode); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 54454542ad40..321b7fb4e441 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1814,8 +1814,10 @@ static int name_cache_insert(struct send_ctx *sctx, (unsigned long)nce->ino); if (!nce_head) { nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS); - if (!nce_head) + if (!nce_head) { + kfree(nce); return -ENOMEM; + } INIT_LIST_HEAD(nce_head); ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 99545df1b86c..d8982e9601d3 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -267,7 +267,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, function, line, errstr); return; } - trans->transaction->aborted = errno; + ACCESS_ONCE(trans->transaction->aborted) = errno; __btrfs_std_error(root->fs_info, function, line, errno, NULL); } /* diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 87fac9a21ea5..fc03aa60b684 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -333,12 +333,14 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type, &root->fs_info->trans_block_rsv, num_bytes, flush); if (ret) - return ERR_PTR(ret); + goto reserve_fail; } again: h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); - if (!h) - return ERR_PTR(-ENOMEM); + if (!h) { + ret = -ENOMEM; + goto alloc_fail; + } /* * If we are JOIN_NOLOCK we're already committing a transaction and @@ -365,11 +367,7 @@ again: if (ret < 0) { /* We must get the transaction if we are JOIN_NOLOCK. */ BUG_ON(type == TRANS_JOIN_NOLOCK); - - if (type < TRANS_JOIN_NOLOCK) - sb_end_intwrite(root->fs_info->sb); - kmem_cache_free(btrfs_trans_handle_cachep, h); - return ERR_PTR(ret); + goto join_fail; } cur_trans = root->fs_info->running_transaction; @@ -410,6 +408,19 @@ got_it: if (!current->journal_info && type != TRANS_USERSPACE) current->journal_info = h; return h; + +join_fail: + if (type < TRANS_JOIN_NOLOCK) + sb_end_intwrite(root->fs_info->sb); + kmem_cache_free(btrfs_trans_handle_cachep, h); +alloc_fail: + if (num_bytes) + btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, + num_bytes); +reserve_fail: + if (qgroup_reserved) + btrfs_qgroup_free(root, qgroup_reserved); + return ERR_PTR(ret); } struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, @@ -1468,7 +1479,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, goto cleanup_transaction; } - if (cur_trans->aborted) { + /* Stop the commit early if ->aborted is set */ + if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; goto cleanup_transaction; } @@ -1574,6 +1586,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); + /* ->aborted might be set after the previous check, so check it */ + if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { + ret = cur_trans->aborted; + goto cleanup_transaction; + } /* * the reloc mutex makes sure that we stop * the balancing code from coming in and moving @@ -1657,6 +1674,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, goto cleanup_transaction; } + /* + * The tasks which save the space cache and inode cache may also + * update ->aborted, check it. + */ + if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { + ret = cur_trans->aborted; + mutex_unlock(&root->fs_info->tree_log_mutex); + mutex_unlock(&root->fs_info->reloc_mutex); + goto cleanup_transaction; + } + btrfs_prepare_extent_commit(trans, root); cur_trans = root->fs_info->running_transaction; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 83186c7e45d4..9027bb1e7466 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3357,6 +3357,11 @@ static int log_one_extent(struct btrfs_trans_handle *trans, if (skip_csum) return 0; + if (em->compress_type) { + csum_offset = 0; + csum_len = block_len; + } + /* block start is already adjusted for the file extent offset. */ ret = btrfs_lookup_csums_range(log->fs_info->csum_root, em->block_start + csum_offset, @@ -3410,13 +3415,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, em = list_entry(extents.next, struct extent_map, list); list_del_init(&em->list); - clear_bit(EXTENT_FLAG_LOGGING, &em->flags); /* * If we had an error we just need to delete everybody from our * private list. */ if (ret) { + clear_em_logging(tree, em); free_extent_map(em); continue; } @@ -3424,8 +3429,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, write_unlock(&tree->lock); ret = log_one_extent(trans, inode, root, em, path); - free_extent_map(em); write_lock(&tree->lock); + clear_em_logging(tree, em); + free_extent_map(em); } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5cce6aa74012..5cbb7f4b1672 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1431,7 +1431,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) } } else { ret = btrfs_get_bdev_and_sb(device_path, - FMODE_READ | FMODE_EXCL, + FMODE_WRITE | FMODE_EXCL, root->fs_info->bdev_holder, 0, &bdev, &bh); if (ret) @@ -1556,7 +1556,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) ret = 0; /* Notify udev that device has changed */ - btrfs_kobject_uevent(bdev, KOBJ_CHANGE); + if (bdev) + btrfs_kobject_uevent(bdev, KOBJ_CHANGE); error_brelse: brelse(bh); @@ -2614,7 +2615,14 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, cache = btrfs_lookup_block_group(fs_info, chunk_offset); chunk_used = btrfs_block_group_used(&cache->item); - user_thresh = div_factor_fine(cache->key.offset, bargs->usage); + if (bargs->usage == 0) + user_thresh = 0; + else if (bargs->usage > 100) + user_thresh = cache->key.offset; + else + user_thresh = div_factor_fine(cache->key.offset, + bargs->usage); + if (chunk_used < user_thresh) ret = 0; @@ -2959,6 +2967,8 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) unset_balance_control(fs_info); ret = del_balance_item(fs_info->tree_root); BUG_ON(ret); + + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); } void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, @@ -3138,8 +3148,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl, out: if (bctl->flags & BTRFS_BALANCE_RESUME) __cancel_balance(fs_info); - else + else { kfree(bctl); + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + } return ret; } @@ -3156,7 +3168,6 @@ static int balance_kthread(void *data) ret = btrfs_balance(fs_info->balance_ctl, NULL); } - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); @@ -3179,7 +3190,6 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) return 0; } - WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); if (IS_ERR(tsk)) return PTR_ERR(tsk); @@ -3233,6 +3243,8 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) btrfs_balance_sys(leaf, item, &disk_bargs); btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); + WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); + mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); @@ -3496,7 +3508,7 @@ struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { { 1, 1, 2, 2, 2, 2 /* raid1 */ }, { 1, 2, 1, 1, 1, 2 /* dup */ }, { 1, 1, 0, 2, 1, 1 /* raid0 */ }, - { 1, 1, 0, 1, 1, 1 /* single */ }, + { 1, 1, 1, 1, 1, 1 /* single */ }, }; static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, diff --git a/fs/buffer.c b/fs/buffer.c index c017a2dfb909..7a75c3e0fd58 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2935,6 +2935,7 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) void *kaddr = kmap_atomic(bh->b_page); memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes); kunmap_atomic(kaddr); + flush_dcache_page(bh->b_page); } } diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index ce5cbd717bfc..210fce2df308 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -226,6 +226,8 @@ compose_mount_options_out: compose_mount_options_err: kfree(mountdata); mountdata = ERR_PTR(rc); + kfree(*devname); + *devname = NULL; goto compose_mount_options_out; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index f653835d067b..de7f9168a118 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -228,7 +228,6 @@ cifs_alloc_inode(struct super_block *sb) cifs_set_oplock_level(cifs_inode, 0); cifs_inode->delete_pending = false; cifs_inode->invalid_mapping = false; - cifs_inode->leave_pages_clean = false; cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ cifs_inode->server_eof = 0; cifs_inode->uniqueid = 0; diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index aea1eec64911..e6899cea1c35 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -386,6 +386,7 @@ struct smb_version_values { unsigned int cap_unix; unsigned int cap_nt_find; unsigned int cap_large_files; + unsigned int oplock_read; }; #define HEADER_SIZE(server) (server->vals->header_size) @@ -1030,7 +1031,6 @@ struct cifsInodeInfo { bool clientCanCacheAll; /* read and writebehind oplock */ bool delete_pending; /* DELETE_ON_CLOSE is set */ bool invalid_mapping; /* pagecache is invalid */ - bool leave_pages_clean; /* protected by i_mutex, not set pages dirty */ unsigned long time; /* jiffies of last update of inode */ u64 server_eof; /* current file size on server -- protected by i_lock */ u64 uniqueid; /* server inode number */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 17c3643e5950..12b3da39733b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1917,7 +1917,7 @@ srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs) } case AF_INET6: { struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr; - struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)&rhs; + struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)rhs; return ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr); } default: diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 0a6677ba212b..8ea6ca50a665 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -238,6 +238,23 @@ out: return rc; } +static bool +cifs_has_mand_locks(struct cifsInodeInfo *cinode) +{ + struct cifs_fid_locks *cur; + bool has_locks = false; + + down_read(&cinode->lock_sem); + list_for_each_entry(cur, &cinode->llist, llist) { + if (!list_empty(&cur->locks)) { + has_locks = true; + break; + } + } + up_read(&cinode->lock_sem); + return has_locks; +} + struct cifsFileInfo * cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, struct tcon_link *tlink, __u32 oplock) @@ -248,6 +265,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, struct cifsFileInfo *cfile; struct cifs_fid_locks *fdlocks; struct cifs_tcon *tcon = tlink_tcon(tlink); + struct TCP_Server_Info *server = tcon->ses->server; cfile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); if (cfile == NULL) @@ -276,12 +294,22 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, INIT_WORK(&cfile->oplock_break, cifs_oplock_break); mutex_init(&cfile->fh_mutex); + /* + * If the server returned a read oplock and we have mandatory brlocks, + * set oplock level to None. + */ + if (oplock == server->vals->oplock_read && + cifs_has_mand_locks(cinode)) { + cFYI(1, "Reset oplock val from read to None due to mand locks"); + oplock = 0; + } + spin_lock(&cifs_file_list_lock); - if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE) + if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE && oplock) oplock = fid->pending_open->oplock; list_del(&fid->pending_open->olist); - tlink_tcon(tlink)->ses->server->ops->set_fid(cfile, fid, oplock); + server->ops->set_fid(cfile, fid, oplock); list_add(&cfile->tlist, &tcon->openFileList); /* if readable file instance put first in list*/ @@ -1422,6 +1450,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; + struct inode *inode = cfile->dentry->d_inode; if (posix_lck) { int posix_lock_type; @@ -1459,6 +1488,21 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, if (!rc) goto out; + /* + * Windows 7 server can delay breaking lease from read to None + * if we set a byte-range lock on a file - break it explicitly + * before sending the lock to the server to be sure the next + * read won't conflict with non-overlapted locks due to + * pagereading. + */ + if (!CIFS_I(inode)->clientCanCacheAll && + CIFS_I(inode)->clientCanCacheRead) { + cifs_invalidate_mapping(inode); + cFYI(1, "Set no oplock for inode=%p due to mand locks", + inode); + CIFS_I(inode)->clientCanCacheRead = false; + } + rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, type, 1, 0, wait_flag); if (rc) { @@ -2103,15 +2147,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping, } else { rc = copied; pos += copied; - /* - * When we use strict cache mode and cifs_strict_writev was run - * with level II oplock (indicated by leave_pages_clean field of - * CIFS_I(inode)), we can leave pages clean - cifs_strict_writev - * sent the data to the server itself. - */ - if (!CIFS_I(inode)->leave_pages_clean || - !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)) - set_page_dirty(page); + set_page_dirty(page); } if (rc > 0) { @@ -2462,8 +2498,8 @@ ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov, } static ssize_t -cifs_pagecache_writev(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, bool cache_ex) +cifs_writev(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; @@ -2485,12 +2521,8 @@ cifs_pagecache_writev(struct kiocb *iocb, const struct iovec *iov, server->vals->exclusive_lock_type, NULL, CIFS_WRITE_OP)) { mutex_lock(&inode->i_mutex); - if (!cache_ex) - cinode->leave_pages_clean = true; rc = __generic_file_aio_write(iocb, iov, nr_segs, - &iocb->ki_pos); - if (!cache_ex) - cinode->leave_pages_clean = false; + &iocb->ki_pos); mutex_unlock(&inode->i_mutex); } @@ -2517,60 +2549,32 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, struct cifsFileInfo *cfile = (struct cifsFileInfo *) iocb->ki_filp->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); - ssize_t written, written2; - /* - * We need to store clientCanCacheAll here to prevent race - * conditions - this value can be changed during an execution - * of generic_file_aio_write. For CIFS it can be changed from - * true to false only, but for SMB2 it can be changed both from - * true to false and vice versa. So, we can end up with a data - * stored in the cache, not marked dirty and not sent to the - * server if this value changes its state from false to true - * after cifs_write_end. - */ - bool cache_ex = cinode->clientCanCacheAll; - bool cache_read = cinode->clientCanCacheRead; - int rc; - loff_t saved_pos; + ssize_t written; - if (cache_ex) { + if (cinode->clientCanCacheAll) { if (cap_unix(tcon->ses) && - ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) && - (CIFS_UNIX_FCNTL_CAP & le64_to_cpu( - tcon->fsUnixInfo.Capability))) + (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) + && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) return generic_file_aio_write(iocb, iov, nr_segs, pos); - return cifs_pagecache_writev(iocb, iov, nr_segs, pos, cache_ex); + return cifs_writev(iocb, iov, nr_segs, pos); } - /* - * For files without exclusive oplock in strict cache mode we need to - * write the data to the server exactly from the pos to pos+len-1 rather - * than flush all affected pages because it may cause a error with - * mandatory locks on these pages but not on the region from pos to - * ppos+len-1. + * For non-oplocked files in strict cache mode we need to write the data + * to the server exactly from the pos to pos+len-1 rather than flush all + * affected pages because it may cause a error with mandatory locks on + * these pages but not on the region from pos to ppos+len-1. */ written = cifs_user_writev(iocb, iov, nr_segs, pos); - if (!cache_read || written <= 0) - return written; - - saved_pos = iocb->ki_pos; - iocb->ki_pos = pos; - /* we have a read oplock - need to store a data in the page cache */ - if (cap_unix(tcon->ses) && - ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) && - (CIFS_UNIX_FCNTL_CAP & le64_to_cpu( - tcon->fsUnixInfo.Capability))) - written2 = generic_file_aio_write(iocb, iov, nr_segs, pos); - else - written2 = cifs_pagecache_writev(iocb, iov, nr_segs, pos, - cache_ex); - /* errors occured during writing - invalidate the page cache */ - if (written2 < 0) { - rc = cifs_invalidate_mapping(inode); - if (rc) - written = (ssize_t)rc; - else - iocb->ki_pos = saved_pos; + if (written > 0 && cinode->clientCanCacheRead) { + /* + * Windows 7 server can delay breaking level2 oplock if a write + * request comes - break it on the client to prevent reading + * an old data. + */ + cifs_invalidate_mapping(inode); + cFYI(1, "Set no oplock for inode=%p after a write operation", + inode); + cinode->clientCanCacheRead = false; } return written; } @@ -3577,6 +3581,13 @@ void cifs_oplock_break(struct work_struct *work) struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); int rc = 0; + if (!cinode->clientCanCacheAll && cinode->clientCanCacheRead && + cifs_has_mand_locks(cinode)) { + cFYI(1, "Reset oplock to None for inode=%p due to mand locks", + inode); + cinode->clientCanCacheRead = false; + } + if (inode && S_ISREG(inode->i_mode)) { if (cinode->clientCanCacheRead) break_lease(inode, O_RDONLY); diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index a5d234c8d5d9..47bc5a87f94e 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -53,6 +53,13 @@ send_nt_cancel(struct TCP_Server_Info *server, void *buf, mutex_unlock(&server->srv_mutex); return rc; } + + /* + * The response to this call was already factored into the sequence + * number when the call went out, so we must adjust it back downward + * after signing here. + */ + --server->sequence_number; rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); mutex_unlock(&server->srv_mutex); @@ -952,4 +959,5 @@ struct smb_version_values smb1_values = { .cap_unix = CAP_UNIX, .cap_nt_find = CAP_NT_SMBS | CAP_NT_FIND, .cap_large_files = CAP_LARGE_FILES, + .oplock_read = OPLOCK_READ, }; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index d79de7bc4435..c9c7aa7ed966 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -708,6 +708,7 @@ struct smb_version_values smb20_values = { .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, .cap_large_files = SMB2_LARGE_FILES, + .oplock_read = SMB2_OPLOCK_LEVEL_II, }; struct smb_version_values smb21_values = { @@ -725,6 +726,7 @@ struct smb_version_values smb21_values = { .cap_unix = 0, .cap_nt_find = SMB2_NT_FIND, .cap_large_files = SMB2_LARGE_FILES, + .oplock_read = SMB2_OPLOCK_LEVEL_II, }; struct smb_version_values smb30_values = { diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 76d974c952fe..1a528680ec5a 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -144,9 +144,6 @@ smb_send_kvec(struct TCP_Server_Info *server, struct kvec *iov, size_t n_vec, *sent = 0; - if (ssocket == NULL) - return -ENOTSOCK; /* BB eventually add reconnect code here */ - smb_msg.msg_name = (struct sockaddr *) &server->dstaddr; smb_msg.msg_namelen = sizeof(struct sockaddr); smb_msg.msg_control = NULL; @@ -291,6 +288,9 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst) struct socket *ssocket = server->ssocket; int val = 1; + if (ssocket == NULL) + return -ENOTSOCK; + cFYI(1, "Sending smb: smb_len=%u", smb_buf_length); dump_smb(iov[0].iov_base, iov[0].iov_len); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 153bb1e42e63..a5f12b7e228d 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -176,7 +176,7 @@ static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts) opts->uid = uid; break; case Opt_gid: - if (match_octal(&args[0], &option)) + if (match_int(&args[0], &option)) return -EINVAL; gid = make_kgid(current_user_ns(), option); if (!gid_valid(gid)) diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 7ff49852b0cb..911649a47dd5 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -503,11 +503,11 @@ static ssize_t device_write(struct file *file, const char __user *buf, #endif return -EINVAL; -#ifdef CONFIG_COMPAT - if (count > sizeof(struct dlm_write_request32) + DLM_RESNAME_MAXLEN) -#else + /* + * can't compare against COMPAT/dlm_write_request32 because + * we don't yet know if is64bit is zero + */ if (count > sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN) -#endif return -EINVAL; kbuf = kzalloc(count + 1, GFP_NOFS); diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index ea9931281557..a7b0c2dfb3db 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1935,7 +1935,7 @@ static const unsigned char filename_rev_map[256] = { * @src: Source location for the filename to encode * @src_size: Size of the source in bytes */ -void ecryptfs_encode_for_filename(unsigned char *dst, size_t *dst_size, +static void ecryptfs_encode_for_filename(unsigned char *dst, size_t *dst_size, unsigned char *src, size_t src_size) { size_t num_blocks; diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c index 809e67d05ca3..f1ea610362c6 100644 --- a/fs/ecryptfs/kthread.c +++ b/fs/ecryptfs/kthread.c @@ -102,12 +102,12 @@ int __init ecryptfs_init_kthread(void) void ecryptfs_destroy_kthread(void) { - struct ecryptfs_open_req *req; + struct ecryptfs_open_req *req, *tmp; mutex_lock(&ecryptfs_kthread_ctl.mux); ecryptfs_kthread_ctl.flags |= ECRYPTFS_KTHREAD_ZOMBIE; - list_for_each_entry(req, &ecryptfs_kthread_ctl.req_list, - kthread_ctl_list) { + list_for_each_entry_safe(req, tmp, &ecryptfs_kthread_ctl.req_list, + kthread_ctl_list) { list_del(&req->kthread_ctl_list); *req->lower_file = ERR_PTR(-EIO); complete(&req->done); diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index bd1d57f98f74..564a1fa34b99 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -338,7 +338,8 @@ static int ecryptfs_write_begin(struct file *file, if (prev_page_end_size >= i_size_read(page->mapping->host)) { zero_user(page, 0, PAGE_CACHE_SIZE); - } else { + SetPageUptodate(page); + } else if (len < PAGE_CACHE_SIZE) { rc = ecryptfs_decrypt_page(page); if (rc) { printk(KERN_ERR "%s: Error decrypting " @@ -348,8 +349,8 @@ static int ecryptfs_write_begin(struct file *file, ClearPageUptodate(page); goto out; } + SetPageUptodate(page); } - SetPageUptodate(page); } } /* If creating a page or more of holes, zero them out via truncate. @@ -499,6 +500,13 @@ static int ecryptfs_write_end(struct file *file, } goto out; } + if (!PageUptodate(page)) { + if (copied < PAGE_CACHE_SIZE) { + rc = 0; + goto out; + } + SetPageUptodate(page); + } /* Fills in zeros if 'to' goes beyond inode size */ rc = fill_zeros_to_end_of_page(page, to); if (rc) { diff --git a/fs/eventpoll.c b/fs/eventpoll.c index be56b21435f8..9fec1836057a 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1313,7 +1313,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even * otherwise we might miss an event that happens between the * f_op->poll() call and the new event set registering. */ - epi->event.events = event->events; + epi->event.events = event->events; /* need barrier below */ pt._key = event->events; epi->event.data = event->data; /* protected by mtx */ if (epi->event.events & EPOLLWAKEUP) { @@ -1324,6 +1324,26 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even } /* + * The following barrier has two effects: + * + * 1) Flush epi changes above to other CPUs. This ensures + * we do not miss events from ep_poll_callback if an + * event occurs immediately after we call f_op->poll(). + * We need this because we did not take ep->lock while + * changing epi above (but ep_poll_callback does take + * ep->lock). + * + * 2) We also need to ensure we do not miss _past_ events + * when calling f_op->poll(). This barrier also + * pairs with the barrier in wq_has_sleeper (see + * comments for wq_has_sleeper). + * + * This barrier will now guarantee ep_poll_callback or f_op->poll + * (or both) will notice the readiness of an item. + */ + smp_mb(); + + /* * Get current event bits. We can safely use the file* here because * its usage count has been increased by the caller of this function. */ diff --git a/fs/exec.c b/fs/exec.c index 18c45cac368f..20df02c1cc70 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -434,8 +434,9 @@ static int count(struct user_arg_ptr argv, int max) if (IS_ERR(p)) return -EFAULT; - if (i++ >= max) + if (i >= max) return -E2BIG; + ++i; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 0a475c881852..987358740cb9 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -41,6 +41,7 @@ config EXT4_USE_FOR_EXT23 config EXT4_FS_POSIX_ACL bool "Ext4 POSIX Access Control Lists" + depends on EXT4_FS select FS_POSIX_ACL help POSIX Access Control Lists (ACLs) support permissions for users and @@ -53,6 +54,7 @@ config EXT4_FS_POSIX_ACL config EXT4_FS_SECURITY bool "Ext4 Security Labels" + depends on EXT4_FS help Security labels support alternative access control models implemented by security modules like SELinux. This option diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 26af22832a84..5ae1674ec12f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2226,13 +2226,14 @@ errout: * removes index from the index block. */ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path) + struct ext4_ext_path *path, int depth) { int err; ext4_fsblk_t leaf; /* free index block */ - path--; + depth--; + path = path + depth; leaf = ext4_idx_pblock(path->p_idx); if (unlikely(path->p_hdr->eh_entries == 0)) { EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); @@ -2257,6 +2258,19 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, ext4_free_blocks(handle, inode, NULL, leaf, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); + + while (--depth >= 0) { + if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr)) + break; + path--; + err = ext4_ext_get_access(handle, inode, path); + if (err) + break; + path->p_idx->ei_block = (path+1)->p_idx->ei_block; + err = ext4_ext_dirty(handle, inode, path); + if (err) + break; + } return err; } @@ -2599,7 +2613,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, /* if this leaf is free, then we should * remove it from index block above */ if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL) - err = ext4_ext_rm_idx(handle, inode, path + depth); + err = ext4_ext_rm_idx(handle, inode, path, depth); out: return err; @@ -2802,7 +2816,7 @@ again: /* index is empty, remove it; * handle must be already prepared by the * truncatei_leaf() */ - err = ext4_ext_rm_idx(handle, inode, path + i); + err = ext4_ext_rm_idx(handle, inode, path, i); } /* root level has p_bh == NULL, brelse() eats this */ brelse(path[i].p_bh); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index d07c27ca594a..405565a62277 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -108,14 +108,6 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, /* Unaligned direct AIO must be serialized; see comment above */ if (unaligned_aio) { - static unsigned long unaligned_warn_time; - - /* Warn about this once per day */ - if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ)) - ext4_msg(inode->i_sb, KERN_WARNING, - "Unaligned AIO/DIO on inode %ld by %s; " - "performance will be poor.", - inode->i_ino, current->comm); mutex_lock(ext4_aio_mutex(inode)); ext4_unwritten_wait(inode); } diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index dfbc1fe96674..3278e64e57b6 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -109,8 +109,6 @@ static int __sync_inode(struct inode *inode, int datasync) * * What we do is just kick off a commit and wait on it. This will snapshot the * inode to disk. - * - * i_mutex lock is held when entering and exiting this function */ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cb1c1ab2720b..cbfe13bf5b2a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2880,8 +2880,6 @@ static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offs static void ext4_invalidatepage(struct page *page, unsigned long offset) { - journal_t *journal = EXT4_JOURNAL(page->mapping->host); - trace_ext4_invalidatepage(page, offset); /* @@ -2889,16 +2887,34 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset) */ if (ext4_should_dioread_nolock(page->mapping->host)) ext4_invalidatepage_free_endio(page, offset); + + /* No journalling happens on data buffers when this function is used */ + WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); + + block_invalidatepage(page, offset); +} + +static int __ext4_journalled_invalidatepage(struct page *page, + unsigned long offset) +{ + journal_t *journal = EXT4_JOURNAL(page->mapping->host); + + trace_ext4_journalled_invalidatepage(page, offset); + /* * If it's a full truncate we just forget about the pending dirtying */ if (offset == 0) ClearPageChecked(page); - if (journal) - jbd2_journal_invalidatepage(journal, page, offset); - else - block_invalidatepage(page, offset); + return jbd2_journal_invalidatepage(journal, page, offset); +} + +/* Wrapper for aops... */ +static void ext4_journalled_invalidatepage(struct page *page, + unsigned long offset) +{ + WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0); } static int ext4_releasepage(struct page *page, gfp_t wait) @@ -3264,7 +3280,7 @@ static const struct address_space_operations ext4_journalled_aops = { .write_end = ext4_journalled_write_end, .set_page_dirty = ext4_journalled_set_page_dirty, .bmap = ext4_bmap, - .invalidatepage = ext4_invalidatepage, + .invalidatepage = ext4_journalled_invalidatepage, .releasepage = ext4_releasepage, .direct_IO = ext4_direct_IO, .is_partially_uptodate = block_is_partially_uptodate, @@ -4305,6 +4321,47 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) } /* + * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate + * buffers that are attached to a page stradding i_size and are undergoing + * commit. In that case we have to wait for commit to finish and try again. + */ +static void ext4_wait_for_tail_page_commit(struct inode *inode) +{ + struct page *page; + unsigned offset; + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + tid_t commit_tid = 0; + int ret; + + offset = inode->i_size & (PAGE_CACHE_SIZE - 1); + /* + * All buffers in the last page remain valid? Then there's nothing to + * do. We do the check mainly to optimize the common PAGE_CACHE_SIZE == + * blocksize case + */ + if (offset > PAGE_CACHE_SIZE - (1 << inode->i_blkbits)) + return; + while (1) { + page = find_lock_page(inode->i_mapping, + inode->i_size >> PAGE_CACHE_SHIFT); + if (!page) + return; + ret = __ext4_journalled_invalidatepage(page, offset); + unlock_page(page); + page_cache_release(page); + if (ret != -EBUSY) + return; + commit_tid = 0; + read_lock(&journal->j_state_lock); + if (journal->j_committing_transaction) + commit_tid = journal->j_committing_transaction->t_tid; + read_unlock(&journal->j_state_lock); + if (commit_tid) + jbd2_log_wait_commit(journal, commit_tid); + } +} + +/* * ext4_setattr() * * Called from notify_change. @@ -4417,16 +4474,28 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) } if (attr->ia_valid & ATTR_SIZE) { - if (attr->ia_size != i_size_read(inode)) { - truncate_setsize(inode, attr->ia_size); - /* Inode size will be reduced, wait for dio in flight. - * Temporarily disable dioread_nolock to prevent - * livelock. */ + if (attr->ia_size != inode->i_size) { + loff_t oldsize = inode->i_size; + + i_size_write(inode, attr->ia_size); + /* + * Blocks are going to be removed from the inode. Wait + * for dio in flight. Temporarily disable + * dioread_nolock to prevent livelock. + */ if (orphan) { - ext4_inode_block_unlocked_dio(inode); - inode_dio_wait(inode); - ext4_inode_resume_unlocked_dio(inode); + if (!ext4_should_journal_data(inode)) { + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + ext4_inode_resume_unlocked_dio(inode); + } else + ext4_wait_for_tail_page_commit(inode); } + /* + * Truncate pagecache after we've waited for commit + * in data=journal mode to make pages freeable. + */ + truncate_pagecache(inode, oldsize, inode->i_size); } ext4_truncate(inode); } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index cac448282331..f9ed946a448e 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -722,7 +722,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir, ext4_warning(dir->i_sb, "Node failed checksum"); brelse(bh); *err = ERR_BAD_DX_DIR; - goto fail; + goto fail2; } set_buffer_verified(bh); @@ -2368,7 +2368,6 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir, } inode->i_size = EXT4_I(inode)->i_disksize = blocksize; - dir_block = ext4_bread(handle, inode, 0, 1, &err); if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) { if (!err) { err = -EIO; @@ -2648,7 +2647,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) struct ext4_iloc iloc; int err = 0; - if (!EXT4_SB(inode->i_sb)->s_journal) + if ((!EXT4_SB(inode->i_sb)->s_journal) && + !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) return 0; mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3cdb0a2fc648..3d4fb81bacd5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1645,9 +1645,7 @@ static int parse_options(char *options, struct super_block *sb, unsigned int *journal_ioprio, int is_remount) { -#ifdef CONFIG_QUOTA struct ext4_sb_info *sbi = EXT4_SB(sb); -#endif char *p; substring_t args[MAX_OPT_ARGS]; int token; @@ -1696,6 +1694,16 @@ static int parse_options(char *options, struct super_block *sb, } } #endif + if (test_opt(sb, DIOREAD_NOLOCK)) { + int blocksize = + BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); + + if (blocksize < PAGE_CACHE_SIZE) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "dioread_nolock if block size != PAGE_SIZE"); + return 0; + } + } return 1; } @@ -2212,7 +2220,9 @@ static void ext4_orphan_cleanup(struct super_block *sb, __func__, inode->i_ino, inode->i_size); jbd_debug(2, "truncating inode %lu to %lld bytes\n", inode->i_ino, inode->i_size); + mutex_lock(&inode->i_mutex); ext4_truncate(inode); + mutex_unlock(&inode->i_mutex); nr_truncates++; } else { ext4_msg(sb, KERN_DEBUG, @@ -3223,6 +3233,10 @@ int ext4_calculate_overhead(struct super_block *sb) memset(buf, 0, PAGE_SIZE); cond_resched(); } + /* Add the journal blocks as well */ + if (sbi->s_journal) + overhead += EXT4_B2C(sbi, sbi->s_journal->j_maxlen); + sbi->s_overhead = overhead; smp_wmb(); free_page((unsigned long) buf); @@ -3436,15 +3450,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) clear_opt(sb, DELALLOC); } - blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); - if (test_opt(sb, DIOREAD_NOLOCK)) { - if (blocksize < PAGE_SIZE) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "dioread_nolock if block size != PAGE_SIZE"); - goto failed_mount; - } - } - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); @@ -3486,6 +3491,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) goto failed_mount; + blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); if (blocksize < EXT4_MIN_BLOCK_SIZE || blocksize > EXT4_MAX_BLOCK_SIZE) { ext4_msg(sb, KERN_ERR, @@ -4725,7 +4731,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } ext4_setup_system_zone(sb); - if (sbi->s_journal == NULL) + if (sbi->s_journal == NULL && !(old_sb_flags & MS_RDONLY)) ext4_commit_super(sb, 1); #ifdef CONFIG_QUOTA diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index fed74d193ffb..137af4255da6 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -82,7 +82,6 @@ static struct posix_acl *f2fs_acl_from_disk(const char *value, size_t size) case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: - acl->a_entries[i].e_id = ACL_UNDEFINED_ID; entry = (struct f2fs_acl_entry *)((char *)entry + sizeof(struct f2fs_acl_entry_short)); break; @@ -192,15 +191,14 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) retval = f2fs_getxattr(inode, name_index, "", value, retval); } - if (retval < 0) { - if (retval == -ENODATA) - acl = NULL; - else - acl = ERR_PTR(retval); - } else { + if (retval > 0) acl = f2fs_acl_from_disk(value, retval); - } + else if (retval == -ENODATA) + acl = NULL; + else + acl = ERR_PTR(retval); kfree(value); + if (!IS_ERR(acl)) set_cached_acl(inode, type, acl); diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 6ef36c37e2be..ff3c8439af87 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -214,7 +214,6 @@ retry: goto retry; } new->ino = ino; - INIT_LIST_HEAD(&new->list); /* add new_oentry into list which is sorted by inode number */ if (orphan) { @@ -772,7 +771,7 @@ void init_orphan_info(struct f2fs_sb_info *sbi) sbi->n_orphans = 0; } -int create_checkpoint_caches(void) +int __init create_checkpoint_caches(void) { orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry", sizeof(struct orphan_inode_entry), NULL); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 655aeabc1dd4..7bd22a201125 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -16,6 +16,7 @@ #include <linux/backing-dev.h> #include <linux/blkdev.h> #include <linux/bio.h> +#include <linux/prefetch.h> #include "f2fs.h" #include "node.h" @@ -546,6 +547,15 @@ redirty_out: #define MAX_DESIRED_PAGES_WP 4096 +static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct address_space *mapping = data; + int ret = mapping->a_ops->writepage(page, wbc); + mapping_set_error(mapping, ret); + return ret; +} + static int f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -562,7 +572,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (!S_ISDIR(inode->i_mode)) mutex_lock(&sbi->writepages); - ret = generic_writepages(mapping, wbc); + ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); if (!S_ISDIR(inode->i_mode)) mutex_unlock(&sbi->writepages); f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL)); @@ -688,6 +698,11 @@ static int f2fs_set_data_page_dirty(struct page *page) return 0; } +static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, get_data_block_ro); +} + const struct address_space_operations f2fs_dblock_aops = { .readpage = f2fs_read_data_page, .readpages = f2fs_read_data_pages, @@ -699,4 +714,5 @@ const struct address_space_operations f2fs_dblock_aops = { .invalidatepage = f2fs_invalidate_data_page, .releasepage = f2fs_release_data_page, .direct_IO = f2fs_direct_IO, + .bmap = f2fs_bmap, }; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 0e0380a588ad..c8c37307b326 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -26,6 +26,7 @@ static LIST_HEAD(f2fs_stat_list); static struct dentry *debugfs_root; +static DEFINE_MUTEX(f2fs_stat_mutex); static void update_general_status(struct f2fs_sb_info *sbi) { @@ -180,18 +181,14 @@ static int stat_show(struct seq_file *s, void *v) int i = 0; int j; + mutex_lock(&f2fs_stat_mutex); list_for_each_entry_safe(si, next, &f2fs_stat_list, stat_list) { - mutex_lock(&si->stat_lock); - if (!si->sbi) { - mutex_unlock(&si->stat_lock); - continue; - } update_general_status(si->sbi); seq_printf(s, "\n=====[ partition info. #%d ]=====\n", i++); - seq_printf(s, "[SB: 1] [CP: 2] [NAT: %d] [SIT: %d] ", - si->nat_area_segs, si->sit_area_segs); + seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", + si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", si->ssa_area_segs, si->main_area_segs); seq_printf(s, "(OverProv:%d Resv:%d)]\n\n", @@ -286,8 +283,8 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, "\nMemory: %u KB = static: %u + cached: %u\n", (si->base_mem + si->cache_mem) >> 10, si->base_mem >> 10, si->cache_mem >> 10); - mutex_unlock(&si->stat_lock); } + mutex_unlock(&f2fs_stat_mutex); return 0; } @@ -303,7 +300,7 @@ static const struct file_operations stat_fops = { .release = single_release, }; -static int init_stats(struct f2fs_sb_info *sbi) +int f2fs_build_stats(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_stat_info *si; @@ -313,9 +310,6 @@ static int init_stats(struct f2fs_sb_info *sbi) return -ENOMEM; si = sbi->stat_info; - mutex_init(&si->stat_lock); - list_add_tail(&si->stat_list, &f2fs_stat_list); - si->all_area_segs = le32_to_cpu(raw_super->segment_count); si->sit_area_segs = le32_to_cpu(raw_super->segment_count_sit); si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat); @@ -325,21 +319,11 @@ static int init_stats(struct f2fs_sb_info *sbi) si->main_area_zones = si->main_area_sections / le32_to_cpu(raw_super->secs_per_zone); si->sbi = sbi; - return 0; -} -int f2fs_build_stats(struct f2fs_sb_info *sbi) -{ - int retval; - - retval = init_stats(sbi); - if (retval) - return retval; - - if (!debugfs_root) - debugfs_root = debugfs_create_dir("f2fs", NULL); + mutex_lock(&f2fs_stat_mutex); + list_add_tail(&si->stat_list, &f2fs_stat_list); + mutex_unlock(&f2fs_stat_mutex); - debugfs_create_file("status", S_IRUGO, debugfs_root, NULL, &stat_fops); return 0; } @@ -347,14 +331,22 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = sbi->stat_info; + mutex_lock(&f2fs_stat_mutex); list_del(&si->stat_list); - mutex_lock(&si->stat_lock); - si->sbi = NULL; - mutex_unlock(&si->stat_lock); + mutex_unlock(&f2fs_stat_mutex); + kfree(sbi->stat_info); } -void destroy_root_stats(void) +void __init f2fs_create_root_stats(void) +{ + debugfs_root = debugfs_create_dir("f2fs", NULL); + if (debugfs_root) + debugfs_create_file("status", S_IRUGO, debugfs_root, + NULL, &stat_fops); +} + +void f2fs_destroy_root_stats(void) { debugfs_remove_recursive(debugfs_root); debugfs_root = NULL; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index b4e24f32b54e..989980e16d0b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -11,6 +11,7 @@ #include <linux/fs.h> #include <linux/f2fs_fs.h> #include "f2fs.h" +#include "node.h" #include "acl.h" static unsigned long dir_blocks(struct inode *inode) @@ -74,7 +75,7 @@ static unsigned long dir_block_index(unsigned int level, unsigned int idx) return bidx; } -static bool early_match_name(const char *name, int namelen, +static bool early_match_name(const char *name, size_t namelen, f2fs_hash_t namehash, struct f2fs_dir_entry *de) { if (le16_to_cpu(de->name_len) != namelen) @@ -87,7 +88,7 @@ static bool early_match_name(const char *name, int namelen, } static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, - const char *name, int namelen, int *max_slots, + const char *name, size_t namelen, int *max_slots, f2fs_hash_t namehash, struct page **res_page) { struct f2fs_dir_entry *de; @@ -126,7 +127,7 @@ found: } static struct f2fs_dir_entry *find_in_level(struct inode *dir, - unsigned int level, const char *name, int namelen, + unsigned int level, const char *name, size_t namelen, f2fs_hash_t namehash, struct page **res_page) { int s = GET_DENTRY_SLOTS(namelen); @@ -181,7 +182,7 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, struct qstr *child, struct page **res_page) { const char *name = child->name; - int namelen = child->len; + size_t namelen = child->len; unsigned long npages = dir_blocks(dir); struct f2fs_dir_entry *de = NULL; f2fs_hash_t name_hash; @@ -308,6 +309,7 @@ static int init_inode_metadata(struct inode *inode, struct dentry *dentry) ipage = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); + set_cold_node(inode, ipage); init_dent_inode(dentry, ipage); f2fs_put_page(ipage, 1); } @@ -381,7 +383,7 @@ int f2fs_add_link(struct dentry *dentry, struct inode *inode) struct inode *dir = dentry->d_parent->d_inode; struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; + size_t namelen = dentry->d_name.len; struct page *dentry_page = NULL; struct f2fs_dentry_block *dentry_blk = NULL; int slots = GET_DENTRY_SLOTS(namelen); @@ -501,7 +503,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, } if (inode) { - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + inode->i_ctime = CURRENT_TIME; drop_nlink(inode); if (S_ISDIR(inode->i_mode)) { drop_nlink(inode); @@ -540,13 +542,13 @@ int f2fs_make_empty(struct inode *inode, struct inode *parent) de = &dentry_blk->dentry[0]; de->name_len = cpu_to_le16(1); - de->hash_code = 0; + de->hash_code = f2fs_dentry_hash(".", 1); de->ino = cpu_to_le32(inode->i_ino); memcpy(dentry_blk->filename[0], ".", 1); set_de_type(de, inode); de = &dentry_blk->dentry[1]; - de->hash_code = 0; + de->hash_code = f2fs_dentry_hash("..", 2); de->name_len = cpu_to_le16(2); de->ino = cpu_to_le32(parent->i_ino); memcpy(dentry_blk->filename[1], "..", 2); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a18d63db2fb6..c8e2d751ef9c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -211,11 +211,11 @@ struct dnode_of_data { static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode, struct page *ipage, struct page *npage, nid_t nid) { + memset(dn, 0, sizeof(*dn)); dn->inode = inode; dn->inode_page = ipage; dn->node_page = npage; dn->nid = nid; - dn->inode_page_locked = 0; } /* @@ -877,11 +877,13 @@ bool f2fs_empty_dir(struct inode *); * super.c */ int f2fs_sync_fs(struct super_block *, int); +extern __printf(3, 4) +void f2fs_msg(struct super_block *, const char *, const char *, ...); /* * hash.c */ -f2fs_hash_t f2fs_dentry_hash(const char *, int); +f2fs_hash_t f2fs_dentry_hash(const char *, size_t); /* * node.c @@ -912,7 +914,7 @@ int restore_node_summary(struct f2fs_sb_info *, unsigned int, void flush_nat_entries(struct f2fs_sb_info *); int build_node_manager(struct f2fs_sb_info *); void destroy_node_manager(struct f2fs_sb_info *); -int create_node_manager_caches(void); +int __init create_node_manager_caches(void); void destroy_node_manager_caches(void); /* @@ -964,7 +966,7 @@ void sync_dirty_dir_inodes(struct f2fs_sb_info *); void block_operations(struct f2fs_sb_info *); void write_checkpoint(struct f2fs_sb_info *, bool, bool); void init_orphan_info(struct f2fs_sb_info *); -int create_checkpoint_caches(void); +int __init create_checkpoint_caches(void); void destroy_checkpoint_caches(void); /* @@ -984,9 +986,9 @@ int do_write_data_page(struct page *); int start_gc_thread(struct f2fs_sb_info *); void stop_gc_thread(struct f2fs_sb_info *); block_t start_bidx_of_node(unsigned int); -int f2fs_gc(struct f2fs_sb_info *, int); +int f2fs_gc(struct f2fs_sb_info *); void build_gc_manager(struct f2fs_sb_info *); -int create_gc_caches(void); +int __init create_gc_caches(void); void destroy_gc_caches(void); /* @@ -1058,7 +1060,8 @@ struct f2fs_stat_info { int f2fs_build_stats(struct f2fs_sb_info *); void f2fs_destroy_stats(struct f2fs_sb_info *); -void destroy_root_stats(void); +void __init f2fs_create_root_stats(void); +void f2fs_destroy_root_stats(void); #else #define stat_inc_call_count(si) #define stat_inc_seg_count(si, type) @@ -1068,7 +1071,8 @@ void destroy_root_stats(void); static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } -static inline void destroy_root_stats(void) { } +static inline void __init f2fs_create_root_stats(void) { } +static inline void f2fs_destroy_root_stats(void) { } #endif extern const struct file_operations f2fs_dir_operations; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f9e085dfb1f0..3191b52aafb0 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -96,8 +96,9 @@ out: } static const struct vm_operations_struct f2fs_file_vm_ops = { - .fault = filemap_fault, - .page_mkwrite = f2fs_vm_page_mkwrite, + .fault = filemap_fault, + .page_mkwrite = f2fs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int need_to_sync_dir(struct f2fs_sb_info *sbi, struct inode *inode) @@ -137,6 +138,9 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) return ret; + /* guarantee free sections for fsync */ + f2fs_balance_fs(sbi); + mutex_lock(&inode->i_mutex); if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) @@ -160,15 +164,17 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (need_to_sync_dir(sbi, inode)) need_cp = true; - f2fs_write_inode(inode, NULL); - if (need_cp) { /* all the dirty node pages should be flushed for POR */ ret = f2fs_sync_fs(inode->i_sb, 1); clear_inode_flag(F2FS_I(inode), FI_NEED_CP); } else { - while (sync_node_pages(sbi, inode->i_ino, &wbc) == 0) - f2fs_write_inode(inode, NULL); + /* if there is no written node page, write its inode page */ + while (!sync_node_pages(sbi, inode->i_ino, &wbc)) { + ret = f2fs_write_inode(inode, NULL); + if (ret) + goto out; + } filemap_fdatawait_range(sbi->node_inode->i_mapping, 0, LONG_MAX); } @@ -405,6 +411,8 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) struct dnode_of_data dn; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + f2fs_balance_fs(sbi); + mutex_lock_op(sbi, DATA_TRUNC); set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, index, RDONLY_NODE); @@ -532,7 +540,6 @@ static long f2fs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file->f_path.dentry->d_inode; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); long ret; if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) @@ -543,7 +550,10 @@ static long f2fs_fallocate(struct file *file, int mode, else ret = expand_inode_data(inode, offset, len, mode); - f2fs_balance_fs(sbi); + if (!ret) { + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); + } return ret; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 644aa3808273..c386910dacc5 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -78,7 +78,7 @@ static int gc_thread_func(void *data) sbi->bg_gc++; - if (f2fs_gc(sbi, 1) == GC_NONE) + if (f2fs_gc(sbi) == GC_NONE) wait_ms = GC_THREAD_NOGC_SLEEP_TIME; else if (wait_ms == GC_THREAD_NOGC_SLEEP_TIME) wait_ms = GC_THREAD_MAX_SLEEP_TIME; @@ -390,9 +390,7 @@ next_step: } err = check_valid_map(sbi, segno, off); - if (err == GC_ERROR) - return err; - else if (err == GC_NEXT) + if (err == GC_NEXT) continue; if (initial) { @@ -426,32 +424,30 @@ next_step: } /* - * Calculate start block index that this node page contains + * Calculate start block index indicating the given node offset. + * Be careful, caller should give this node offset only indicating direct node + * blocks. If any node offsets, which point the other types of node blocks such + * as indirect or double indirect node blocks, are given, it must be a caller's + * bug. */ block_t start_bidx_of_node(unsigned int node_ofs) { - block_t start_bidx; - unsigned int bidx, indirect_blks; - int dec; + unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4; + unsigned int bidx; - indirect_blks = 2 * NIDS_PER_BLOCK + 4; + if (node_ofs == 0) + return 0; - start_bidx = 1; - if (node_ofs == 0) { - start_bidx = 0; - } else if (node_ofs <= 2) { + if (node_ofs <= 2) { bidx = node_ofs - 1; } else if (node_ofs <= indirect_blks) { - dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1); + int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1); bidx = node_ofs - 2 - dec; } else { - dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); + int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); bidx = node_ofs - 5 - dec; } - - if (start_bidx) - start_bidx = bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE; - return start_bidx; + return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE; } static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, @@ -556,9 +552,7 @@ next_step: } err = check_valid_map(sbi, segno, off); - if (err == GC_ERROR) - goto stop; - else if (err == GC_NEXT) + if (err == GC_NEXT) continue; if (phase == 0) { @@ -568,9 +562,7 @@ next_step: /* Get an inode by ino with checking validity */ err = check_dnode(sbi, entry, &dni, start_addr + off, &nofs); - if (err == GC_ERROR) - goto stop; - else if (err == GC_NEXT) + if (err == GC_NEXT) continue; if (phase == 1) { @@ -663,62 +655,44 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, return ret; } -int f2fs_gc(struct f2fs_sb_info *sbi, int nGC) +int f2fs_gc(struct f2fs_sb_info *sbi) { - unsigned int segno; - int old_free_secs, cur_free_secs; - int gc_status, nfree; struct list_head ilist; + unsigned int segno, i; int gc_type = BG_GC; + int gc_status = GC_NONE; INIT_LIST_HEAD(&ilist); gc_more: - nfree = 0; - gc_status = GC_NONE; + if (!(sbi->sb->s_flags & MS_ACTIVE)) + goto stop; if (has_not_enough_free_secs(sbi)) - old_free_secs = reserved_sections(sbi); - else - old_free_secs = free_sections(sbi); - - while (sbi->sb->s_flags & MS_ACTIVE) { - int i; - if (has_not_enough_free_secs(sbi)) - gc_type = FG_GC; + gc_type = FG_GC; - cur_free_secs = free_sections(sbi) + nfree; + if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) + goto stop; - /* We got free space successfully. */ - if (nGC < cur_free_secs - old_free_secs) - break; - - if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) + for (i = 0; i < sbi->segs_per_sec; i++) { + /* + * do_garbage_collect will give us three gc_status: + * GC_ERROR, GC_DONE, and GC_BLOCKED. + * If GC is finished uncleanly, we have to return + * the victim to dirty segment list. + */ + gc_status = do_garbage_collect(sbi, segno + i, &ilist, gc_type); + if (gc_status != GC_DONE) break; - - for (i = 0; i < sbi->segs_per_sec; i++) { - /* - * do_garbage_collect will give us three gc_status: - * GC_ERROR, GC_DONE, and GC_BLOCKED. - * If GC is finished uncleanly, we have to return - * the victim to dirty segment list. - */ - gc_status = do_garbage_collect(sbi, segno + i, - &ilist, gc_type); - if (gc_status != GC_DONE) - goto stop; - nfree++; - } } -stop: - if (has_not_enough_free_secs(sbi) || gc_status == GC_BLOCKED) { + if (has_not_enough_free_secs(sbi)) { write_checkpoint(sbi, (gc_status == GC_BLOCKED), false); - if (nfree) + if (has_not_enough_free_secs(sbi)) goto gc_more; } +stop: mutex_unlock(&sbi->gc_mutex); put_gc_inode(&ilist); - BUG_ON(!list_empty(&ilist)); return gc_status; } @@ -727,7 +701,7 @@ void build_gc_manager(struct f2fs_sb_info *sbi) DIRTY_I(sbi)->v_ops = &default_v_ops; } -int create_gc_caches(void) +int __init create_gc_caches(void) { winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes", sizeof(struct inode_entry), NULL); diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index a60f04200f8b..6eb8d269b53b 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -42,7 +42,7 @@ static void TEA_transform(unsigned int buf[4], unsigned int const in[]) buf[1] += b1; } -static void str2hashbuf(const char *msg, int len, unsigned int *buf, int num) +static void str2hashbuf(const char *msg, size_t len, unsigned int *buf, int num) { unsigned pad, val; int i; @@ -69,13 +69,17 @@ static void str2hashbuf(const char *msg, int len, unsigned int *buf, int num) *buf++ = pad; } -f2fs_hash_t f2fs_dentry_hash(const char *name, int len) +f2fs_hash_t f2fs_dentry_hash(const char *name, size_t len) { - __u32 hash, minor_hash; + __u32 hash; f2fs_hash_t f2fs_hash; const char *p; __u32 in[8], buf[4]; + if ((len <= 2) && (name[0] == '.') && + (name[1] == '.' || name[1] == '\0')) + return 0; + /* Initialize the default seed for the hash checksum functions */ buf[0] = 0x67452301; buf[1] = 0xefcdab89; @@ -83,15 +87,15 @@ f2fs_hash_t f2fs_dentry_hash(const char *name, int len) buf[3] = 0x10325476; p = name; - while (len > 0) { + while (1) { str2hashbuf(p, len, in, 4); TEA_transform(buf, in); - len -= 16; p += 16; + if (len <= 16) + break; + len -= 16; } hash = buf[0]; - minor_hash = buf[1]; - f2fs_hash = cpu_to_le32(hash & ~F2FS_HASH_COL_BIT); return f2fs_hash; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index df5fb381ebf1..794241777322 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -203,6 +203,7 @@ void update_inode(struct inode *inode, struct page *node_page) ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags); ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino); ri->i_generation = cpu_to_le32(inode->i_generation); + set_cold_node(inode, node_page); set_page_dirty(node_page); } @@ -216,6 +217,9 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) inode->i_ino == F2FS_META_INO(sbi)) return 0; + if (wbc) + f2fs_balance_fs(sbi); + node_page = get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) return PTR_ERR(node_page); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 89b7675dc377..1a49b881bac0 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -77,8 +77,8 @@ fail: static int is_multimedia_file(const unsigned char *s, const char *sub) { - int slen = strlen(s); - int sublen = strlen(sub); + size_t slen = strlen(s); + size_t sublen = strlen(sub); int ret; if (sublen > slen) @@ -123,6 +123,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, nid_t ino = 0; int err; + f2fs_balance_fs(sbi); + inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -144,8 +146,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (!sbi->por_doing) d_instantiate(dentry, inode); unlock_new_inode(inode); - - f2fs_balance_fs(sbi); return 0; out: clear_nlink(inode); @@ -163,6 +163,8 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, struct f2fs_sb_info *sbi = F2FS_SB(sb); int err; + f2fs_balance_fs(sbi); + inode->i_ctime = CURRENT_TIME; atomic_inc(&inode->i_count); @@ -172,8 +174,6 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, goto out; d_instantiate(dentry, inode); - - f2fs_balance_fs(sbi); return 0; out: clear_inode_flag(F2FS_I(inode), FI_INC_LINK); @@ -223,6 +223,8 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) struct page *page; int err = -ENOENT; + f2fs_balance_fs(sbi); + de = f2fs_find_entry(dir, &dentry->d_name, &page); if (!de) goto fail; @@ -238,7 +240,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) /* In order to evict this inode, we set it dirty */ mark_inode_dirty(inode); - f2fs_balance_fs(sbi); fail: return err; } @@ -249,9 +250,11 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, struct super_block *sb = dir->i_sb; struct f2fs_sb_info *sbi = F2FS_SB(sb); struct inode *inode; - unsigned symlen = strlen(symname) + 1; + size_t symlen = strlen(symname) + 1; int err; + f2fs_balance_fs(sbi); + inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -268,9 +271,6 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, d_instantiate(dentry, inode); unlock_new_inode(inode); - - f2fs_balance_fs(sbi); - return err; out: clear_nlink(inode); @@ -286,6 +286,8 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; int err; + f2fs_balance_fs(sbi); + inode = f2fs_new_inode(dir, S_IFDIR | mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -305,7 +307,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) d_instantiate(dentry, inode); unlock_new_inode(inode); - f2fs_balance_fs(sbi); return 0; out_fail: @@ -336,6 +337,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; + f2fs_balance_fs(sbi); + inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -350,9 +353,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, alloc_nid_done(sbi, inode->i_ino); d_instantiate(dentry, inode); unlock_new_inode(inode); - - f2fs_balance_fs(sbi); - return 0; out: clear_nlink(inode); @@ -376,6 +376,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, struct f2fs_dir_entry *new_entry; int err = -ENOENT; + f2fs_balance_fs(sbi); + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) goto out; @@ -441,8 +443,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } mutex_unlock_op(sbi, RENAME); - - f2fs_balance_fs(sbi); return 0; out_dir: diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 19870361497e..9bda63c9c166 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -484,12 +484,14 @@ static void truncate_node(struct dnode_of_data *dn) struct node_info ni; get_node_info(sbi, dn->nid, &ni); + if (dn->inode->i_blocks == 0) { + BUG_ON(ni.blk_addr != NULL_ADDR); + goto invalidate; + } BUG_ON(ni.blk_addr == NULL_ADDR); - if (ni.blk_addr != NULL_ADDR) - invalidate_blocks(sbi, ni.blk_addr); - /* Deallocate node address */ + invalidate_blocks(sbi, ni.blk_addr); dec_valid_node_count(sbi, dn->inode, 1); set_node_addr(sbi, &ni, NULL_ADDR); @@ -499,7 +501,7 @@ static void truncate_node(struct dnode_of_data *dn) } else { sync_inode_page(dn); } - +invalidate: clear_node_page_dirty(dn->node_page); F2FS_SET_SB_DIRT(sbi); @@ -768,20 +770,12 @@ int remove_inode_page(struct inode *inode) dn.inode_page_locked = 1; truncate_node(&dn); } - if (inode->i_blocks == 1) { - /* inernally call f2fs_put_page() */ - set_new_dnode(&dn, inode, page, page, ino); - truncate_node(&dn); - } else if (inode->i_blocks == 0) { - struct node_info ni; - get_node_info(sbi, inode->i_ino, &ni); - /* called after f2fs_new_inode() is failed */ - BUG_ON(ni.blk_addr != NULL_ADDR); - f2fs_put_page(page, 1); - } else { - BUG(); - } + /* 0 is possible, after f2fs_new_inode() is failed */ + BUG_ON(inode->i_blocks != 0 && inode->i_blocks != 1); + set_new_dnode(&dn, inode, page, page, ino); + truncate_node(&dn); + mutex_unlock_op(sbi, NODE_TRUNC); return 0; } @@ -834,17 +828,18 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) goto fail; } set_node_addr(sbi, &new_ni, NEW_ADDR); + set_cold_node(dn->inode, page); dn->node_page = page; sync_inode_page(dn); set_page_dirty(page); - set_cold_node(dn->inode, page); if (ofs == 0) inc_valid_inode_count(sbi); return page; fail: + clear_node_page_dirty(page); f2fs_put_page(page, 1); return ERR_PTR(err); } @@ -1093,7 +1088,6 @@ static int f2fs_write_node_page(struct page *page, { struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); nid_t nid; - unsigned int nofs; block_t new_addr; struct node_info ni; @@ -1110,7 +1104,6 @@ static int f2fs_write_node_page(struct page *page, /* get old block addr of this node page */ nid = nid_of_node(page); - nofs = ofs_of_node(page); BUG_ON(page->index != nid); get_node_info(sbi, nid, &ni); @@ -1131,6 +1124,12 @@ static int f2fs_write_node_page(struct page *page, return 0; } +/* + * It is very important to gather dirty pages and write at once, so that we can + * submit a big bio without interfering other data writes. + * Be default, 512 pages (2MB), a segment size, is quite reasonable. + */ +#define COLLECT_DIRTY_NODES 512 static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -1138,17 +1137,16 @@ static int f2fs_write_node_pages(struct address_space *mapping, struct block_device *bdev = sbi->sb->s_bdev; long nr_to_write = wbc->nr_to_write; - if (wbc->for_kupdate) - return 0; - - if (get_pages(sbi, F2FS_DIRTY_NODES) == 0) - return 0; - + /* First check balancing cached NAT entries */ if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK)) { write_checkpoint(sbi, false, false); return 0; } + /* collect a number of dirty node pages and write together */ + if (get_pages(sbi, F2FS_DIRTY_NODES) < COLLECT_DIRTY_NODES) + return 0; + /* if mounting is failed, skip writing node pages */ wbc->nr_to_write = bio_get_nr_vecs(bdev); sync_node_pages(sbi, 0, wbc); @@ -1571,7 +1569,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) nid_t nid; struct f2fs_nat_entry raw_ne; int offset = -1; - block_t old_blkaddr, new_blkaddr; + block_t new_blkaddr; ne = list_entry(cur, struct nat_entry, list); nid = nat_get_nid(ne); @@ -1585,7 +1583,6 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) offset = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 1); if (offset >= 0) { raw_ne = nat_in_journal(sum, offset); - old_blkaddr = le32_to_cpu(raw_ne.block_addr); goto flush_now; } to_nat_page: @@ -1607,7 +1604,6 @@ to_nat_page: BUG_ON(!nat_blk); raw_ne = nat_blk->entries[nid - start_nid]; - old_blkaddr = le32_to_cpu(raw_ne.block_addr); flush_now: new_blkaddr = nat_get_blkaddr(ne); @@ -1741,7 +1737,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) kfree(nm_i); } -int create_node_manager_caches(void) +int __init create_node_manager_caches(void) { nat_entry_slab = f2fs_kmem_cache_create("nat_entry", sizeof(struct nat_entry), NULL); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index b07e9b6ef376..f42e4060b399 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -67,7 +67,7 @@ static int recover_dentry(struct page *ipage, struct inode *inode) kunmap(page); f2fs_put_page(page, 0); } else { - f2fs_add_link(&dent, inode); + err = f2fs_add_link(&dent, inode); } iput(dir); out: @@ -144,14 +144,14 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) goto out; } - INIT_LIST_HEAD(&entry->list); - list_add_tail(&entry->list, head); - entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); if (IS_ERR(entry->inode)) { err = PTR_ERR(entry->inode); + kmem_cache_free(fsync_entry_slab, entry); goto out; } + + list_add_tail(&entry->list, head); entry->blkaddr = blkaddr; } if (IS_INODE(page)) { @@ -173,10 +173,9 @@ out: static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { - struct list_head *this; - struct fsync_inode_entry *entry; - list_for_each(this, head) { - entry = list_entry(this, struct fsync_inode_entry, list); + struct fsync_inode_entry *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, head, list) { iput(entry->inode); list_del(&entry->list); kmem_cache_free(fsync_entry_slab, entry); @@ -228,6 +227,9 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi, /* Deallocate previous index in the node page */ inode = f2fs_iget_nowait(sbi->sb, ino); + if (IS_ERR(inode)) + return; + truncate_hole(inode, bidx, bidx + 1); iput(inode); } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1b26e4ea1016..4b0099066582 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -12,57 +12,26 @@ #include <linux/f2fs_fs.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/prefetch.h> #include <linux/vmalloc.h> #include "f2fs.h" #include "segment.h" #include "node.h" -static int need_to_flush(struct f2fs_sb_info *sbi) -{ - unsigned int pages_per_sec = (1 << sbi->log_blocks_per_seg) * - sbi->segs_per_sec; - int node_secs = ((get_pages(sbi, F2FS_DIRTY_NODES) + pages_per_sec - 1) - >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; - int dent_secs = ((get_pages(sbi, F2FS_DIRTY_DENTS) + pages_per_sec - 1) - >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; - - if (sbi->por_doing) - return 0; - - if (free_sections(sbi) <= (node_secs + 2 * dent_secs + - reserved_sections(sbi))) - return 1; - return 0; -} - /* * This function balances dirty node and dentry pages. * In addition, it controls garbage collection. */ void f2fs_balance_fs(struct f2fs_sb_info *sbi) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .for_reclaim = 0, - }; - - if (sbi->por_doing) - return; - /* - * We should do checkpoint when there are so many dirty node pages - * with enough free segments. After then, we should do GC. + * We should do GC or end up with checkpoint, if there are so many dirty + * dir/node pages without enough free segments. */ - if (need_to_flush(sbi)) { - sync_dirty_dir_inodes(sbi); - sync_node_pages(sbi, 0, &wbc); - } - if (has_not_enough_free_secs(sbi)) { mutex_lock(&sbi->gc_mutex); - f2fs_gc(sbi, 1); + f2fs_gc(sbi); } } @@ -631,7 +600,6 @@ static void f2fs_end_io_write(struct bio *bio, int err) if (page->mapping) set_bit(AS_EIO, &page->mapping->flags); set_ckpt_flags(p->sbi->ckpt, CP_ERROR_FLAG); - set_page_dirty(page); } end_page_writeback(page); dec_page_count(p->sbi, F2FS_WRITEBACK); @@ -791,11 +759,10 @@ static int __get_segment_type(struct page *page, enum page_type p_type) return __get_segment_type_2(page, p_type); case 4: return __get_segment_type_4(page, p_type); - case 6: - return __get_segment_type_6(page, p_type); - default: - BUG(); } + /* NR_CURSEG_TYPE(6) logs by default */ + BUG_ON(sbi->active_logs != NR_CURSEG_TYPE); + return __get_segment_type_6(page, p_type); } static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, @@ -1608,7 +1575,6 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) for (i = 0; i < NR_DIRTY_TYPE; i++) { dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL); - dirty_i->nr_dirty[i] = 0; if (!dirty_i->dirty_segmap[i]) return -ENOMEM; } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 0948405af6f5..66a288a52fd3 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -459,7 +459,20 @@ static inline int get_ssr_segment(struct f2fs_sb_info *sbi, int type) static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi) { - return free_sections(sbi) <= reserved_sections(sbi); + unsigned int pages_per_sec = (1 << sbi->log_blocks_per_seg) * + sbi->segs_per_sec; + int node_secs = ((get_pages(sbi, F2FS_DIRTY_NODES) + pages_per_sec - 1) + >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; + int dent_secs = ((get_pages(sbi, F2FS_DIRTY_DENTS) + pages_per_sec - 1) + >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; + + if (sbi->por_doing) + return false; + + if (free_sections(sbi) <= (node_secs + 2 * dent_secs + + reserved_sections(sbi))) + return true; + return false; } static inline int utilization(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 13867322cf5a..37fad04c8669 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -53,6 +53,18 @@ static match_table_t f2fs_tokens = { {Opt_err, NULL}, }; +void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); + va_end(args); +} + static void init_once(void *foo) { struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo; @@ -119,15 +131,16 @@ static void f2fs_put_super(struct super_block *sb) int f2fs_sync_fs(struct super_block *sb, int sync) { struct f2fs_sb_info *sbi = F2FS_SB(sb); - int ret = 0; if (!sbi->s_dirty && !get_pages(sbi, F2FS_DIRTY_NODES)) return 0; if (sync) write_checkpoint(sbi, false, false); + else + f2fs_balance_fs(sbi); - return ret; + return 0; } static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -148,8 +161,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count; buf->f_bavail = user_block_count - valid_user_blocks(sbi); - buf->f_files = valid_inode_count(sbi); - buf->f_ffree = sbi->total_node_count - valid_node_count(sbi); + buf->f_files = sbi->total_node_count; + buf->f_ffree = sbi->total_node_count - valid_inode_count(sbi); buf->f_namelen = F2FS_MAX_NAME_LEN; buf->f_fsid.val[0] = (u32)id; @@ -248,7 +261,8 @@ static const struct export_operations f2fs_export_ops = { .get_parent = f2fs_get_parent, }; -static int parse_options(struct f2fs_sb_info *sbi, char *options) +static int parse_options(struct super_block *sb, struct f2fs_sb_info *sbi, + char *options) { substring_t args[MAX_OPT_ARGS]; char *p; @@ -287,7 +301,8 @@ static int parse_options(struct f2fs_sb_info *sbi, char *options) break; #else case Opt_nouser_xattr: - pr_info("nouser_xattr options not supported\n"); + f2fs_msg(sb, KERN_INFO, + "nouser_xattr options not supported"); break; #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL @@ -296,13 +311,13 @@ static int parse_options(struct f2fs_sb_info *sbi, char *options) break; #else case Opt_noacl: - pr_info("noacl options not supported\n"); + f2fs_msg(sb, KERN_INFO, "noacl options not supported"); break; #endif case Opt_active_logs: if (args->from && match_int(args, &arg)) return -EINVAL; - if (arg != 2 && arg != 4 && arg != 6) + if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE) return -EINVAL; sbi->active_logs = arg; break; @@ -310,8 +325,9 @@ static int parse_options(struct f2fs_sb_info *sbi, char *options) set_opt(sbi, DISABLE_EXT_IDENTIFY); break; default: - pr_err("Unrecognized mount option \"%s\" or missing value\n", - p); + f2fs_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" or missing value", + p); return -EINVAL; } } @@ -338,23 +354,36 @@ static loff_t max_file_size(unsigned bits) return result; } -static int sanity_check_raw_super(struct f2fs_super_block *raw_super) +static int sanity_check_raw_super(struct super_block *sb, + struct f2fs_super_block *raw_super) { unsigned int blocksize; - if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) + if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) { + f2fs_msg(sb, KERN_INFO, + "Magic Mismatch, valid(0x%x) - read(0x%x)", + F2FS_SUPER_MAGIC, le32_to_cpu(raw_super->magic)); return 1; + } /* Currently, support only 4KB block size */ blocksize = 1 << le32_to_cpu(raw_super->log_blocksize); - if (blocksize != PAGE_CACHE_SIZE) + if (blocksize != PAGE_CACHE_SIZE) { + f2fs_msg(sb, KERN_INFO, + "Invalid blocksize (%u), supports only 4KB\n", + blocksize); return 1; + } if (le32_to_cpu(raw_super->log_sectorsize) != - F2FS_LOG_SECTOR_SIZE) + F2FS_LOG_SECTOR_SIZE) { + f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize"); return 1; + } if (le32_to_cpu(raw_super->log_sectors_per_block) != - F2FS_LOG_SECTORS_PER_BLOCK) + F2FS_LOG_SECTORS_PER_BLOCK) { + f2fs_msg(sb, KERN_INFO, "Invalid log sectors per block"); return 1; + } return 0; } @@ -414,14 +443,17 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (!sbi) return -ENOMEM; - /* set a temporary block size */ - if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) + /* set a block size */ + if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) { + f2fs_msg(sb, KERN_ERR, "unable to set blocksize"); goto free_sbi; + } /* read f2fs raw super block */ raw_super_buf = sb_bread(sb, 0); if (!raw_super_buf) { err = -EIO; + f2fs_msg(sb, KERN_ERR, "unable to read superblock"); goto free_sbi; } raw_super = (struct f2fs_super_block *) @@ -439,12 +471,14 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) set_opt(sbi, POSIX_ACL); #endif /* parse mount options */ - if (parse_options(sbi, (char *)data)) + if (parse_options(sb, sbi, (char *)data)) goto free_sb_buf; /* sanity checking of raw super */ - if (sanity_check_raw_super(raw_super)) + if (sanity_check_raw_super(sb, raw_super)) { + f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem"); goto free_sb_buf; + } sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize)); sb->s_max_links = F2FS_LINK_MAX; @@ -478,18 +512,23 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) /* get an inode for meta space */ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); if (IS_ERR(sbi->meta_inode)) { + f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode"); err = PTR_ERR(sbi->meta_inode); goto free_sb_buf; } err = get_valid_checkpoint(sbi); - if (err) + if (err) { + f2fs_msg(sb, KERN_ERR, "Failed to get valid F2FS checkpoint"); goto free_meta_inode; + } /* sanity checking of checkpoint */ err = -EINVAL; - if (sanity_check_ckpt(raw_super, sbi->ckpt)) + if (sanity_check_ckpt(raw_super, sbi->ckpt)) { + f2fs_msg(sb, KERN_ERR, "Invalid F2FS checkpoint"); goto free_cp; + } sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); @@ -503,38 +542,41 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&sbi->dir_inode_list); spin_lock_init(&sbi->dir_inode_lock); - /* init super block */ - if (!sb_set_blocksize(sb, sbi->blocksize)) - goto free_cp; - init_orphan_info(sbi); /* setup f2fs internal modules */ err = build_segment_manager(sbi); - if (err) + if (err) { + f2fs_msg(sb, KERN_ERR, + "Failed to initialize F2FS segment manager"); goto free_sm; + } err = build_node_manager(sbi); - if (err) + if (err) { + f2fs_msg(sb, KERN_ERR, + "Failed to initialize F2FS node manager"); goto free_nm; + } build_gc_manager(sbi); /* get an inode for node space */ sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi)); if (IS_ERR(sbi->node_inode)) { + f2fs_msg(sb, KERN_ERR, "Failed to read node inode"); err = PTR_ERR(sbi->node_inode); goto free_nm; } /* if there are nt orphan nodes free them */ err = -EINVAL; - if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) && - recover_orphan_inodes(sbi)) + if (recover_orphan_inodes(sbi)) goto free_node_inode; /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); if (IS_ERR(root)) { + f2fs_msg(sb, KERN_ERR, "Failed to read root inode"); err = PTR_ERR(root); goto free_node_inode; } @@ -548,8 +590,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) } /* recover fsynced data */ - if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) && - !test_opt(sbi, DISABLE_ROLL_FORWARD)) + if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) recover_fsync_data(sbi); /* After POR, we can run background GC thread */ @@ -599,7 +640,7 @@ static struct file_system_type f2fs_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; -static int init_inodecache(void) +static int __init init_inodecache(void) { f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", sizeof(struct f2fs_inode_info), NULL); @@ -634,14 +675,17 @@ static int __init init_f2fs_fs(void) err = create_checkpoint_caches(); if (err) goto fail; - return register_filesystem(&f2fs_fs_type); + err = register_filesystem(&f2fs_fs_type); + if (err) + goto fail; + f2fs_create_root_stats(); fail: return err; } static void __exit exit_f2fs_fs(void) { - destroy_root_stats(); + f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); destroy_checkpoint_caches(); destroy_gc_caches(); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 7d52e8dc0c59..8038c0496504 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -208,7 +208,7 @@ int f2fs_getxattr(struct inode *inode, int name_index, const char *name, struct page *page; void *base_addr; int error = 0, found = 0; - int value_len, name_len; + size_t value_len, name_len; if (name == NULL) return -EINVAL; @@ -304,7 +304,8 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, struct f2fs_xattr_entry *here, *last; struct page *page; void *base_addr; - int error, found, free, name_len, newsize; + int error, found, free, newsize; + size_t name_len; char *pval; if (name == NULL) @@ -317,6 +318,8 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, if (name_len > 255 || value_len > MAX_VALUE_LEN) return -ERANGE; + f2fs_balance_fs(sbi); + mutex_lock_op(sbi, NODE_NEW); if (!fi->i_xattr_nid) { /* Allocate new attribute block */ diff --git a/fs/file.c b/fs/file.c index 15cb8618e95d..2b3570b7caeb 100644 --- a/fs/file.c +++ b/fs/file.c @@ -490,7 +490,7 @@ void exit_files(struct task_struct *tsk) } } -static void __devinit fdtable_defer_list_init(int cpu) +static void fdtable_defer_list_init(int cpu) { struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu); spin_lock_init(&fddef->lock); diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 0cf160a94eda..1b2f6c2c3aaf 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -4,12 +4,24 @@ config FUSE_FS With FUSE it is possible to implement a fully functional filesystem in a userspace program. - There's also companion library: libfuse. This library along with - utilities is available from the FUSE homepage: + There's also a companion library: libfuse2. This library is available + from the FUSE homepage: <http://fuse.sourceforge.net/> + although chances are your distribution already has that library + installed if you've installed the "fuse" package itself. See <file:Documentation/filesystems/fuse.txt> for more information. See <file:Documentation/Changes> for needed library/utility version. If you want to develop a userspace FS, or if you want to use a filesystem based on FUSE, answer Y or M. + +config CUSE + tristate "Character device in Userspace support" + depends on FUSE_FS + help + This FUSE extension allows character devices to be + implemented in userspace. + + If you want to develop or use a userspace character device + based on CUSE, answer Y or M. diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index ee8d55042298..6f96a8def147 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -45,7 +45,6 @@ #include <linux/miscdevice.h> #include <linux/mutex.h> #include <linux/slab.h> -#include <linux/spinlock.h> #include <linux/stat.h> #include <linux/module.h> @@ -63,7 +62,7 @@ struct cuse_conn { bool unrestricted_ioctl; }; -static DEFINE_SPINLOCK(cuse_lock); /* protects cuse_conntbl */ +static DEFINE_MUTEX(cuse_lock); /* protects registration */ static struct list_head cuse_conntbl[CUSE_CONNTBL_LEN]; static struct class *cuse_class; @@ -92,19 +91,22 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { loff_t pos = 0; + struct iovec iov = { .iov_base = buf, .iov_len = count }; - return fuse_direct_io(file, buf, count, &pos, 0); + return fuse_direct_io(file, &iov, 1, count, &pos, 0); } static ssize_t cuse_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { loff_t pos = 0; + struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; + /* * No locking or generic_write_checks(), the server is * responsible for locking and sanity checks. */ - return fuse_direct_io(file, buf, count, &pos, 1); + return fuse_direct_io(file, &iov, 1, count, &pos, 1); } static int cuse_open(struct inode *inode, struct file *file) @@ -114,14 +116,14 @@ static int cuse_open(struct inode *inode, struct file *file) int rc; /* look up and get the connection */ - spin_lock(&cuse_lock); + mutex_lock(&cuse_lock); list_for_each_entry(pos, cuse_conntbl_head(devt), list) if (pos->dev->devt == devt) { fuse_conn_get(&pos->fc); cc = pos; break; } - spin_unlock(&cuse_lock); + mutex_unlock(&cuse_lock); /* dead? */ if (!cc) @@ -267,7 +269,7 @@ static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp) static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo) { char *end = p + len; - char *key, *val; + char *uninitialized_var(key), *uninitialized_var(val); int rc; while (true) { @@ -305,14 +307,14 @@ static void cuse_gendev_release(struct device *dev) */ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) { - struct cuse_conn *cc = fc_to_cc(fc); + struct cuse_conn *cc = fc_to_cc(fc), *pos; struct cuse_init_out *arg = req->out.args[0].value; struct page *page = req->pages[0]; struct cuse_devinfo devinfo = { }; struct device *dev; struct cdev *cdev; dev_t devt; - int rc; + int rc, i; if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) { @@ -356,15 +358,24 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) dev_set_drvdata(dev, cc); dev_set_name(dev, "%s", devinfo.name); + mutex_lock(&cuse_lock); + + /* make sure the device-name is unique */ + for (i = 0; i < CUSE_CONNTBL_LEN; ++i) { + list_for_each_entry(pos, &cuse_conntbl[i], list) + if (!strcmp(dev_name(pos->dev), dev_name(dev))) + goto err_unlock; + } + rc = device_add(dev); if (rc) - goto err_device; + goto err_unlock; /* register cdev */ rc = -ENOMEM; cdev = cdev_alloc(); if (!cdev) - goto err_device; + goto err_unlock; cdev->owner = THIS_MODULE; cdev->ops = &cuse_frontend_fops; @@ -377,9 +388,8 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) cc->cdev = cdev; /* make the device available */ - spin_lock(&cuse_lock); list_add(&cc->list, cuse_conntbl_head(devt)); - spin_unlock(&cuse_lock); + mutex_unlock(&cuse_lock); /* announce device availability */ dev_set_uevent_suppress(dev, 0); @@ -391,7 +401,8 @@ out: err_cdev: cdev_del(cdev); -err_device: +err_unlock: + mutex_unlock(&cuse_lock); put_device(dev); err_region: unregister_chrdev_region(devt, 1); @@ -411,7 +422,7 @@ static int cuse_send_init(struct cuse_conn *cc) BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); - req = fuse_get_req(fc); + req = fuse_get_req(fc, 1); if (IS_ERR(req)) { rc = PTR_ERR(req); goto err; @@ -441,6 +452,7 @@ static int cuse_send_init(struct cuse_conn *cc) req->out.argvar = 1; req->out.argpages = 1; req->pages[0] = page; + req->page_descs[0].length = req->out.args[1].size; req->num_pages = 1; req->end = cuse_process_init_reply; fuse_request_send_background(fc, req); @@ -520,9 +532,9 @@ static int cuse_channel_release(struct inode *inode, struct file *file) int rc; /* remove from the conntbl, no more access from this point on */ - spin_lock(&cuse_lock); + mutex_lock(&cuse_lock); list_del_init(&cc->list); - spin_unlock(&cuse_lock); + mutex_unlock(&cuse_lock); /* remove device */ if (cc->dev) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c16335315e5d..e9bdec0b16d9 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -34,34 +34,67 @@ static struct fuse_conn *fuse_get_conn(struct file *file) return file->private_data; } -static void fuse_request_init(struct fuse_req *req) +static void fuse_request_init(struct fuse_req *req, struct page **pages, + struct fuse_page_desc *page_descs, + unsigned npages) { memset(req, 0, sizeof(*req)); + memset(pages, 0, sizeof(*pages) * npages); + memset(page_descs, 0, sizeof(*page_descs) * npages); INIT_LIST_HEAD(&req->list); INIT_LIST_HEAD(&req->intr_entry); init_waitqueue_head(&req->waitq); atomic_set(&req->count, 1); + req->pages = pages; + req->page_descs = page_descs; + req->max_pages = npages; } -struct fuse_req *fuse_request_alloc(void) +static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) { - struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_KERNEL); - if (req) - fuse_request_init(req); + struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, flags); + if (req) { + struct page **pages; + struct fuse_page_desc *page_descs; + + if (npages <= FUSE_REQ_INLINE_PAGES) { + pages = req->inline_pages; + page_descs = req->inline_page_descs; + } else { + pages = kmalloc(sizeof(struct page *) * npages, flags); + page_descs = kmalloc(sizeof(struct fuse_page_desc) * + npages, flags); + } + + if (!pages || !page_descs) { + kfree(pages); + kfree(page_descs); + kmem_cache_free(fuse_req_cachep, req); + return NULL; + } + + fuse_request_init(req, pages, page_descs, npages); + } return req; } + +struct fuse_req *fuse_request_alloc(unsigned npages) +{ + return __fuse_request_alloc(npages, GFP_KERNEL); +} EXPORT_SYMBOL_GPL(fuse_request_alloc); -struct fuse_req *fuse_request_alloc_nofs(void) +struct fuse_req *fuse_request_alloc_nofs(unsigned npages) { - struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_NOFS); - if (req) - fuse_request_init(req); - return req; + return __fuse_request_alloc(npages, GFP_NOFS); } void fuse_request_free(struct fuse_req *req) { + if (req->pages != req->inline_pages) { + kfree(req->pages); + kfree(req->page_descs); + } kmem_cache_free(fuse_req_cachep, req); } @@ -97,7 +130,7 @@ static void fuse_req_init_context(struct fuse_req *req) req->in.h.pid = current->pid; } -struct fuse_req *fuse_get_req(struct fuse_conn *fc) +struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages) { struct fuse_req *req; sigset_t oldset; @@ -116,7 +149,7 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc) if (!fc->connected) goto out; - req = fuse_request_alloc(); + req = fuse_request_alloc(npages); err = -ENOMEM; if (!req) goto out; @@ -165,7 +198,7 @@ static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req) struct fuse_file *ff = file->private_data; spin_lock(&fc->lock); - fuse_request_init(req); + fuse_request_init(req, req->pages, req->page_descs, req->max_pages); BUG_ON(ff->reserved_req); ff->reserved_req = req; wake_up_all(&fc->reserved_req_waitq); @@ -186,13 +219,14 @@ static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req) * filesystem should not have it's own file open. If deadlock is * intentional, it can still be broken by "aborting" the filesystem. */ -struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file) +struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, + struct file *file) { struct fuse_req *req; atomic_inc(&fc->num_waiting); wait_event(fc->blocked_waitq, !fc->blocked); - req = fuse_request_alloc(); + req = fuse_request_alloc(0); if (!req) req = get_reserved_req(fc, file); @@ -406,9 +440,8 @@ __acquires(fc->lock) } } -void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) +static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) { - req->isreply = 1; spin_lock(&fc->lock); if (!fc->connected) req->out.h.error = -ENOTCONN; @@ -425,6 +458,12 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) } spin_unlock(&fc->lock); } + +void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) +{ + req->isreply = 1; + __fuse_request_send(fc, req); +} EXPORT_SYMBOL_GPL(fuse_request_send); static void fuse_request_send_nowait_locked(struct fuse_conn *fc, @@ -491,6 +530,27 @@ void fuse_request_send_background_locked(struct fuse_conn *fc, fuse_request_send_nowait_locked(fc, req); } +void fuse_force_forget(struct file *file, u64 nodeid) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_req *req; + struct fuse_forget_in inarg; + + memset(&inarg, 0, sizeof(inarg)); + inarg.nlookup = 1; + req = fuse_get_req_nofail_nopages(fc, file); + req->in.h.opcode = FUSE_FORGET; + req->in.h.nodeid = nodeid; + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->isreply = 0; + __fuse_request_send(fc, req); + /* ignore errors */ + fuse_put_request(fc, req); +} + /* * Lock the request. Up to the next unlock_request() there mustn't be * anything that could cause a page-fault. If the request was already @@ -692,8 +752,6 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) struct page *oldpage = *pagep; struct page *newpage; struct pipe_buffer *buf = cs->pipebufs; - struct address_space *mapping; - pgoff_t index; unlock_request(cs->fc, cs->req); fuse_copy_finish(cs); @@ -724,9 +782,6 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (fuse_check_page(newpage) != 0) goto out_fallback_unlock; - mapping = oldpage->mapping; - index = oldpage->index; - /* * This is a new and locked page, it shouldn't be mapped or * have any special flags on it @@ -855,11 +910,11 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, { unsigned i; struct fuse_req *req = cs->req; - unsigned offset = req->page_offset; - unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset); for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) { int err; + unsigned offset = req->page_descs[i].offset; + unsigned count = min(nbytes, req->page_descs[i].length); err = fuse_copy_page(cs, &req->pages[i], offset, count, zeroing); @@ -867,8 +922,6 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, return err; nbytes -= count; - count = min(nbytes, (unsigned) PAGE_SIZE); - offset = 0; } return 0; } @@ -1541,29 +1594,34 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, unsigned int num; unsigned int offset; size_t total_len = 0; + int num_pages; + + offset = outarg->offset & ~PAGE_CACHE_MASK; + file_size = i_size_read(inode); + + num = outarg->size; + if (outarg->offset > file_size) + num = 0; + else if (outarg->offset + num > file_size) + num = file_size - outarg->offset; - req = fuse_get_req(fc); + num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; + num_pages = min(num_pages, FUSE_MAX_PAGES_PER_REQ); + + req = fuse_get_req(fc, num_pages); if (IS_ERR(req)) return PTR_ERR(req); - offset = outarg->offset & ~PAGE_CACHE_MASK; - req->in.h.opcode = FUSE_NOTIFY_REPLY; req->in.h.nodeid = outarg->nodeid; req->in.numargs = 2; req->in.argpages = 1; - req->page_offset = offset; + req->page_descs[0].offset = offset; req->end = fuse_retrieve_end; index = outarg->offset >> PAGE_CACHE_SHIFT; - file_size = i_size_read(inode); - num = outarg->size; - if (outarg->offset > file_size) - num = 0; - else if (outarg->offset + num > file_size) - num = file_size - outarg->offset; - while (num && req->num_pages < FUSE_MAX_PAGES_PER_REQ) { + while (num && req->num_pages < num_pages) { struct page *page; unsigned int this_num; @@ -1573,6 +1631,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); req->pages[req->num_pages] = page; + req->page_descs[req->num_pages].length = this_num; req->num_pages++; offset = 0; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index b7c09f9eb40c..85065221a58a 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -14,6 +14,29 @@ #include <linux/namei.h> #include <linux/slab.h> +static bool fuse_use_readdirplus(struct inode *dir, struct file *filp) +{ + struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_inode *fi = get_fuse_inode(dir); + + if (!fc->do_readdirplus) + return false; + if (!fc->readdirplus_auto) + return true; + if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state)) + return true; + if (filp->f_pos == 0) + return true; + return false; +} + +static void fuse_advise_use_readdirplus(struct inode *dir) +{ + struct fuse_inode *fi = get_fuse_inode(dir); + + set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state); +} + #if BITS_PER_LONG >= 64 static inline void fuse_dentry_settime(struct dentry *entry, u64 time) { @@ -178,7 +201,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) return -ECHILD; fc = get_fuse_conn(inode); - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return 0; @@ -219,6 +242,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) attr_version); fuse_change_entry_timeout(entry, &outarg); } + fuse_advise_use_readdirplus(inode); return 1; } @@ -271,7 +295,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, if (name->len > FUSE_NAME_MAX) goto out; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); err = PTR_ERR(req); if (IS_ERR(req)) goto out; @@ -355,6 +379,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, else fuse_invalidate_entry_cache(entry); + fuse_advise_use_readdirplus(dir); return newent; out_iput: @@ -391,7 +416,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, if (!forget) goto out_err; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); err = PTR_ERR(req); if (IS_ERR(req)) goto out_put_forget_req; @@ -592,7 +617,7 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, { struct fuse_mknod_in inarg; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -623,7 +648,7 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) { struct fuse_mkdir_in inarg; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -647,7 +672,7 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, { struct fuse_conn *fc = get_fuse_conn(dir); unsigned len = strlen(link) + 1; - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -664,7 +689,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) { int err; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -682,7 +707,14 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) spin_lock(&fc->lock); fi->attr_version = ++fc->attr_version; - drop_nlink(inode); + /* + * If i_nlink == 0 then unlink doesn't make sense, yet this can + * happen if userspace filesystem is careless. It would be + * difficult to enforce correct nlink usage so just ignore this + * condition here + */ + if (inode->i_nlink > 0) + drop_nlink(inode); spin_unlock(&fc->lock); fuse_invalidate_attr(inode); fuse_invalidate_attr(dir); @@ -696,7 +728,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) { int err; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -723,7 +755,7 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, int err; struct fuse_rename_in inarg; struct fuse_conn *fc = get_fuse_conn(olddir); - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -776,7 +808,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, struct fuse_link_in inarg; struct inode *inode = entry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -848,7 +880,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, struct fuse_req *req; u64 attr_version; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -985,7 +1017,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, /* * Calling into a user-controlled filesystem gives the filesystem - * daemon ptrace-like capabilities over the requester process. This + * daemon ptrace-like capabilities over the current process. This * means, that the filesystem daemon is able to record the exact * filesystem operations performed, and can also control the behavior * of the requester process in otherwise impossible ways. For example @@ -996,27 +1028,23 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, * for which the owner of the mount has ptrace privilege. This * excludes processes started by other users, suid or sgid processes. */ -int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task) +int fuse_allow_current_process(struct fuse_conn *fc) { const struct cred *cred; - int ret; if (fc->flags & FUSE_ALLOW_OTHER) return 1; - rcu_read_lock(); - ret = 0; - cred = __task_cred(task); + cred = current_cred(); if (uid_eq(cred->euid, fc->user_id) && uid_eq(cred->suid, fc->user_id) && uid_eq(cred->uid, fc->user_id) && gid_eq(cred->egid, fc->group_id) && gid_eq(cred->sgid, fc->group_id) && gid_eq(cred->gid, fc->group_id)) - ret = 1; - rcu_read_unlock(); + return 1; - return ret; + return 0; } static int fuse_access(struct inode *inode, int mask) @@ -1029,7 +1057,7 @@ static int fuse_access(struct inode *inode, int mask) if (fc->no_access) return 0; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -1077,7 +1105,7 @@ static int fuse_permission(struct inode *inode, int mask) bool refreshed = false; int err = 0; - if (!fuse_allow_task(fc, current)) + if (!fuse_allow_current_process(fc)) return -EACCES; /* @@ -1155,19 +1183,157 @@ static int parse_dirfile(char *buf, size_t nbytes, struct file *file, return 0; } -static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) +static int fuse_direntplus_link(struct file *file, + struct fuse_direntplus *direntplus, + u64 attr_version) { int err; + struct fuse_entry_out *o = &direntplus->entry_out; + struct fuse_dirent *dirent = &direntplus->dirent; + struct dentry *parent = file->f_path.dentry; + struct qstr name = QSTR_INIT(dirent->name, dirent->namelen); + struct dentry *dentry; + struct dentry *alias; + struct inode *dir = parent->d_inode; + struct fuse_conn *fc; + struct inode *inode; + + if (!o->nodeid) { + /* + * Unlike in the case of fuse_lookup, zero nodeid does not mean + * ENOENT. Instead, it only means the userspace filesystem did + * not want to return attributes/handle for this entry. + * + * So do nothing. + */ + return 0; + } + + if (name.name[0] == '.') { + /* + * We could potentially refresh the attributes of the directory + * and its parent? + */ + if (name.len == 1) + return 0; + if (name.name[1] == '.' && name.len == 2) + return 0; + } + fc = get_fuse_conn(dir); + + name.hash = full_name_hash(name.name, name.len); + dentry = d_lookup(parent, &name); + if (dentry && dentry->d_inode) { + inode = dentry->d_inode; + if (get_node_id(inode) == o->nodeid) { + struct fuse_inode *fi; + fi = get_fuse_inode(inode); + spin_lock(&fc->lock); + fi->nlookup++; + spin_unlock(&fc->lock); + + /* + * The other branch to 'found' comes via fuse_iget() + * which bumps nlookup inside + */ + goto found; + } + err = d_invalidate(dentry); + if (err) + goto out; + dput(dentry); + dentry = NULL; + } + + dentry = d_alloc(parent, &name); + err = -ENOMEM; + if (!dentry) + goto out; + + inode = fuse_iget(dir->i_sb, o->nodeid, o->generation, + &o->attr, entry_attr_timeout(o), attr_version); + if (!inode) + goto out; + + alias = d_materialise_unique(dentry, inode); + err = PTR_ERR(alias); + if (IS_ERR(alias)) + goto out; + if (alias) { + dput(dentry); + dentry = alias; + } + +found: + fuse_change_attributes(inode, &o->attr, entry_attr_timeout(o), + attr_version); + + fuse_change_entry_timeout(dentry, o); + + err = 0; +out: + if (dentry) + dput(dentry); + return err; +} + +static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, + void *dstbuf, filldir_t filldir, u64 attr_version) +{ + struct fuse_direntplus *direntplus; + struct fuse_dirent *dirent; + size_t reclen; + int over = 0; + int ret; + + while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) { + direntplus = (struct fuse_direntplus *) buf; + dirent = &direntplus->dirent; + reclen = FUSE_DIRENTPLUS_SIZE(direntplus); + + if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) + return -EIO; + if (reclen > nbytes) + break; + + if (!over) { + /* We fill entries into dstbuf only as much as + it can hold. But we still continue iterating + over remaining entries to link them. If not, + we need to send a FORGET for each of those + which we did not link. + */ + over = filldir(dstbuf, dirent->name, dirent->namelen, + file->f_pos, dirent->ino, + dirent->type); + file->f_pos = dirent->off; + } + + buf += reclen; + nbytes -= reclen; + + ret = fuse_direntplus_link(file, direntplus, attr_version); + if (ret) + fuse_force_forget(file, direntplus->entry_out.nodeid); + } + + return 0; +} + +static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) +{ + int plus, err; size_t nbytes; struct page *page; struct inode *inode = file->f_path.dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_req *req; + u64 attr_version = 0; if (is_bad_inode(inode)) return -EIO; - req = fuse_get_req(fc); + req = fuse_get_req(fc, 1); if (IS_ERR(req)) return PTR_ERR(req); @@ -1176,17 +1342,34 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) fuse_put_request(fc, req); return -ENOMEM; } + + plus = fuse_use_readdirplus(inode, file); req->out.argpages = 1; req->num_pages = 1; req->pages[0] = page; - fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, FUSE_READDIR); + req->page_descs[0].length = PAGE_SIZE; + if (plus) { + attr_version = fuse_get_attr_version(fc); + fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, + FUSE_READDIRPLUS); + } else { + fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, + FUSE_READDIR); + } fuse_request_send(fc, req); nbytes = req->out.args[0].size; err = req->out.h.error; fuse_put_request(fc, req); - if (!err) - err = parse_dirfile(page_address(page), nbytes, file, dstbuf, - filldir); + if (!err) { + if (plus) { + err = parse_dirplusfile(page_address(page), nbytes, + file, dstbuf, filldir, + attr_version); + } else { + err = parse_dirfile(page_address(page), nbytes, file, + dstbuf, filldir); + } + } __free_page(page); fuse_invalidate_attr(inode); /* atime changed */ @@ -1197,7 +1380,7 @@ static char *read_link(struct dentry *dentry) { struct inode *inode = dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_req(fc); + struct fuse_req *req = fuse_get_req_nopages(fc); char *link; if (IS_ERR(req)) @@ -1391,7 +1574,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, loff_t oldsize; int err; - if (!fuse_allow_task(fc, current)) + if (!fuse_allow_current_process(fc)) return -EACCES; if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) @@ -1410,7 +1593,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, if (attr->ia_valid & ATTR_SIZE) is_truncate = true; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -1500,7 +1683,7 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, struct inode *inode = entry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - if (!fuse_allow_task(fc, current)) + if (!fuse_allow_current_process(fc)) return -EACCES; return fuse_update_attributes(inode, stat, NULL, NULL); @@ -1518,7 +1701,7 @@ static int fuse_setxattr(struct dentry *entry, const char *name, if (fc->no_setxattr) return -EOPNOTSUPP; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -1557,7 +1740,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name, if (fc->no_getxattr) return -EOPNOTSUPP; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -1603,13 +1786,13 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) struct fuse_getxattr_out outarg; ssize_t ret; - if (!fuse_allow_task(fc, current)) + if (!fuse_allow_current_process(fc)) return -EACCES; if (fc->no_listxattr) return -EOPNOTSUPP; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -1654,7 +1837,7 @@ static int fuse_removexattr(struct dentry *entry, const char *name) if (fc->no_removexattr) return -EOPNOTSUPP; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e21d4d8f87e3..c8071768b950 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -25,7 +25,7 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, struct fuse_req *req; int err; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -57,7 +57,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) return NULL; ff->fc = fc; - ff->reserved_req = fuse_request_alloc(); + ff->reserved_req = fuse_request_alloc(0); if (unlikely(!ff->reserved_req)) { kfree(ff); return NULL; @@ -368,7 +368,7 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (fc->no_flush) return 0; - req = fuse_get_req_nofail(fc, file); + req = fuse_get_req_nofail_nopages(fc, file); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; inarg.lock_owner = fuse_lock_owner_id(fc, id); @@ -436,7 +436,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, fuse_sync_writes(inode); - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; @@ -544,7 +544,7 @@ static int fuse_readpage(struct file *file, struct page *page) */ fuse_wait_on_page_writeback(inode, page->index); - req = fuse_get_req(fc); + req = fuse_get_req(fc, 1); err = PTR_ERR(req); if (IS_ERR(req)) goto out; @@ -555,6 +555,7 @@ static int fuse_readpage(struct file *file, struct page *page) req->out.argpages = 1; req->num_pages = 1; req->pages[0] = page; + req->page_descs[0].length = count; num_read = fuse_send_read(req, file, pos, count, NULL); err = req->out.h.error; fuse_put_request(fc, req); @@ -641,6 +642,7 @@ struct fuse_fill_data { struct fuse_req *req; struct file *file; struct inode *inode; + unsigned nr_pages; }; static int fuse_readpages_fill(void *_data, struct page *page) @@ -656,16 +658,26 @@ static int fuse_readpages_fill(void *_data, struct page *page) (req->num_pages == FUSE_MAX_PAGES_PER_REQ || (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read || req->pages[req->num_pages - 1]->index + 1 != page->index)) { + int nr_alloc = min_t(unsigned, data->nr_pages, + FUSE_MAX_PAGES_PER_REQ); fuse_send_readpages(req, data->file); - data->req = req = fuse_get_req(fc); + data->req = req = fuse_get_req(fc, nr_alloc); if (IS_ERR(req)) { unlock_page(page); return PTR_ERR(req); } } + + if (WARN_ON(req->num_pages >= req->max_pages)) { + fuse_put_request(fc, req); + return -EIO; + } + page_cache_get(page); req->pages[req->num_pages] = page; + req->page_descs[req->num_pages].length = PAGE_SIZE; req->num_pages++; + data->nr_pages--; return 0; } @@ -676,6 +688,7 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_fill_data data; int err; + int nr_alloc = min_t(unsigned, nr_pages, FUSE_MAX_PAGES_PER_REQ); err = -EIO; if (is_bad_inode(inode)) @@ -683,7 +696,8 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, data.file = file; data.inode = inode; - data.req = fuse_get_req(fc); + data.req = fuse_get_req(fc, nr_alloc); + data.nr_pages = nr_pages; err = PTR_ERR(data.req); if (IS_ERR(data.req)) goto out; @@ -786,7 +800,7 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, res = fuse_send_write(req, file, pos, count, NULL); - offset = req->page_offset; + offset = req->page_descs[0].offset; count = res; for (i = 0; i < req->num_pages; i++) { struct page *page = req->pages[i]; @@ -817,7 +831,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, int err; req->in.argpages = 1; - req->page_offset = offset; + req->page_descs[0].offset = offset; do { size_t tmp; @@ -857,6 +871,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, err = 0; req->pages[req->num_pages] = page; + req->page_descs[req->num_pages].length = tmp; req->num_pages++; iov_iter_advance(ii, tmp); @@ -869,11 +884,19 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, if (!fc->big_writes) break; } while (iov_iter_count(ii) && count < fc->max_write && - req->num_pages < FUSE_MAX_PAGES_PER_REQ && offset == 0); + req->num_pages < req->max_pages && offset == 0); return count > 0 ? count : err; } +static inline unsigned fuse_wr_pages(loff_t pos, size_t len) +{ + return min_t(unsigned, + ((pos + len - 1) >> PAGE_CACHE_SHIFT) - + (pos >> PAGE_CACHE_SHIFT) + 1, + FUSE_MAX_PAGES_PER_REQ); +} + static ssize_t fuse_perform_write(struct file *file, struct address_space *mapping, struct iov_iter *ii, loff_t pos) @@ -889,8 +912,9 @@ static ssize_t fuse_perform_write(struct file *file, do { struct fuse_req *req; ssize_t count; + unsigned nr_pages = fuse_wr_pages(pos, iov_iter_count(ii)); - req = fuse_get_req(fc); + req = fuse_get_req(fc, nr_pages); if (IS_ERR(req)) { err = PTR_ERR(req); break; @@ -1023,47 +1047,110 @@ static void fuse_release_user_pages(struct fuse_req *req, int write) } } -static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, +static inline void fuse_page_descs_length_init(struct fuse_req *req, + unsigned index, unsigned nr_pages) +{ + int i; + + for (i = index; i < index + nr_pages; i++) + req->page_descs[i].length = PAGE_SIZE - + req->page_descs[i].offset; +} + +static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii) +{ + return (unsigned long)ii->iov->iov_base + ii->iov_offset; +} + +static inline size_t fuse_get_frag_size(const struct iov_iter *ii, + size_t max_size) +{ + return min(iov_iter_single_seg_count(ii), max_size); +} + +static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, size_t *nbytesp, int write) { - size_t nbytes = *nbytesp; - unsigned long user_addr = (unsigned long) buf; - unsigned offset = user_addr & ~PAGE_MASK; - int npages; + size_t nbytes = 0; /* # bytes already packed in req */ /* Special case for kernel I/O: can copy directly into the buffer */ if (segment_eq(get_fs(), KERNEL_DS)) { + unsigned long user_addr = fuse_get_user_addr(ii); + size_t frag_size = fuse_get_frag_size(ii, *nbytesp); + if (write) req->in.args[1].value = (void *) user_addr; else req->out.args[0].value = (void *) user_addr; + iov_iter_advance(ii, frag_size); + *nbytesp = frag_size; return 0; } - nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); - npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; - npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ); - npages = get_user_pages_fast(user_addr, npages, !write, req->pages); - if (npages < 0) - return npages; + while (nbytes < *nbytesp && req->num_pages < req->max_pages) { + unsigned npages; + unsigned long user_addr = fuse_get_user_addr(ii); + unsigned offset = user_addr & ~PAGE_MASK; + size_t frag_size = fuse_get_frag_size(ii, *nbytesp - nbytes); + int ret; + + unsigned n = req->max_pages - req->num_pages; + frag_size = min_t(size_t, frag_size, n << PAGE_SHIFT); + + npages = (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; + npages = clamp(npages, 1U, n); + + ret = get_user_pages_fast(user_addr, npages, !write, + &req->pages[req->num_pages]); + if (ret < 0) + return ret; - req->num_pages = npages; - req->page_offset = offset; + npages = ret; + frag_size = min_t(size_t, frag_size, + (npages << PAGE_SHIFT) - offset); + iov_iter_advance(ii, frag_size); + + req->page_descs[req->num_pages].offset = offset; + fuse_page_descs_length_init(req, req->num_pages, npages); + + req->num_pages += npages; + req->page_descs[req->num_pages - 1].length -= + (npages << PAGE_SHIFT) - offset - frag_size; + + nbytes += frag_size; + } if (write) req->in.argpages = 1; else req->out.argpages = 1; - nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset; - *nbytesp = min(*nbytesp, nbytes); + *nbytesp = nbytes; return 0; } -ssize_t fuse_direct_io(struct file *file, const char __user *buf, - size_t count, loff_t *ppos, int write) +static inline int fuse_iter_npages(const struct iov_iter *ii_p) +{ + struct iov_iter ii = *ii_p; + int npages = 0; + + while (iov_iter_count(&ii) && npages < FUSE_MAX_PAGES_PER_REQ) { + unsigned long user_addr = fuse_get_user_addr(&ii); + unsigned offset = user_addr & ~PAGE_MASK; + size_t frag_size = iov_iter_single_seg_count(&ii); + + npages += (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; + iov_iter_advance(&ii, frag_size); + } + + return min(npages, FUSE_MAX_PAGES_PER_REQ); +} + +ssize_t fuse_direct_io(struct file *file, const struct iovec *iov, + unsigned long nr_segs, size_t count, loff_t *ppos, + int write) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; @@ -1071,8 +1158,11 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf, loff_t pos = *ppos; ssize_t res = 0; struct fuse_req *req; + struct iov_iter ii; + + iov_iter_init(&ii, iov, nr_segs, count, 0); - req = fuse_get_req(fc); + req = fuse_get_req(fc, fuse_iter_npages(&ii)); if (IS_ERR(req)) return PTR_ERR(req); @@ -1080,7 +1170,7 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf, size_t nres; fl_owner_t owner = current->files; size_t nbytes = min(count, nmax); - int err = fuse_get_user_pages(req, buf, &nbytes, write); + int err = fuse_get_user_pages(req, &ii, &nbytes, write); if (err) { res = err; break; @@ -1103,12 +1193,11 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf, count -= nres; res += nres; pos += nres; - buf += nres; if (nres != nbytes) break; if (count) { fuse_put_request(fc, req); - req = fuse_get_req(fc); + req = fuse_get_req(fc, fuse_iter_npages(&ii)); if (IS_ERR(req)) break; } @@ -1122,8 +1211,8 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf, } EXPORT_SYMBOL_GPL(fuse_direct_io); -static ssize_t fuse_direct_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) +static ssize_t __fuse_direct_read(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) { ssize_t res; struct inode *inode = file->f_path.dentry->d_inode; @@ -1131,22 +1220,31 @@ static ssize_t fuse_direct_read(struct file *file, char __user *buf, if (is_bad_inode(inode)) return -EIO; - res = fuse_direct_io(file, buf, count, ppos, 0); + res = fuse_direct_io(file, iov, nr_segs, iov_length(iov, nr_segs), + ppos, 0); fuse_invalidate_attr(inode); return res; } -static ssize_t __fuse_direct_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t fuse_direct_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct iovec iov = { .iov_base = buf, .iov_len = count }; + return __fuse_direct_read(file, &iov, 1, ppos); +} + +static ssize_t __fuse_direct_write(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) { struct inode *inode = file->f_path.dentry->d_inode; + size_t count = iov_length(iov, nr_segs); ssize_t res; res = generic_write_checks(file, ppos, &count, 0); if (!res) { - res = fuse_direct_io(file, buf, count, ppos, 1); + res = fuse_direct_io(file, iov, nr_segs, count, ppos, 1); if (res > 0) fuse_write_update_size(inode, *ppos); } @@ -1159,6 +1257,7 @@ static ssize_t __fuse_direct_write(struct file *file, const char __user *buf, static ssize_t fuse_direct_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { + struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; struct inode *inode = file->f_path.dentry->d_inode; ssize_t res; @@ -1167,7 +1266,7 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf, /* Don't allow parallel writes to the same file */ mutex_lock(&inode->i_mutex); - res = __fuse_direct_write(file, buf, count, ppos); + res = __fuse_direct_write(file, &iov, 1, ppos); mutex_unlock(&inode->i_mutex); return res; @@ -1272,7 +1371,7 @@ static int fuse_writepage_locked(struct page *page) set_page_writeback(page); - req = fuse_request_alloc_nofs(); + req = fuse_request_alloc_nofs(1); if (!req) goto err; @@ -1293,7 +1392,8 @@ static int fuse_writepage_locked(struct page *page) req->in.argpages = 1; req->num_pages = 1; req->pages[0] = tmp_page; - req->page_offset = 0; + req->page_descs[0].offset = 0; + req->page_descs[0].length = PAGE_SIZE; req->end = fuse_writepage_end; req->inode = inode; @@ -1471,7 +1571,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl) struct fuse_lk_out outarg; int err; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -1506,7 +1606,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) if (fl->fl_flags & FL_CLOSE) return 0; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -1575,7 +1675,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) if (!inode->i_sb->s_bdev || fc->no_bmap) return 0; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return 0; @@ -1873,7 +1973,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, num_pages++; } - req = fuse_get_req(fc); + req = fuse_get_req(fc, num_pages); if (IS_ERR(req)) { err = PTR_ERR(req); req = NULL; @@ -1881,6 +1981,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, } memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages); req->num_pages = num_pages; + fuse_page_descs_length_init(req, 0, req->num_pages); /* okay, let's send it to the client */ req->in.h.opcode = FUSE_IOCTL; @@ -1981,7 +2082,7 @@ long fuse_ioctl_common(struct file *file, unsigned int cmd, struct inode *inode = file->f_dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - if (!fuse_allow_task(fc, current)) + if (!fuse_allow_current_process(fc)) return -EACCES; if (is_bad_inode(inode)) @@ -2066,6 +2167,7 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait) return DEFAULT_POLLMASK; poll_wait(file, &ff->poll_wait, wait); + inarg.events = (__u32)poll_requested_events(wait); /* * Ask for notification iff there's someone waiting for it. @@ -2076,7 +2178,7 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait) fuse_register_polled_file(fc, ff); } - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return POLLERR; @@ -2126,41 +2228,6 @@ int fuse_notify_poll_wakeup(struct fuse_conn *fc, return 0; } -static ssize_t fuse_loop_dio(struct file *filp, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos, int rw) -{ - const struct iovec *vector = iov; - ssize_t ret = 0; - - while (nr_segs > 0) { - void __user *base; - size_t len; - ssize_t nr; - - base = vector->iov_base; - len = vector->iov_len; - vector++; - nr_segs--; - - if (rw == WRITE) - nr = __fuse_direct_write(filp, base, len, ppos); - else - nr = fuse_direct_read(filp, base, len, ppos); - - if (nr < 0) { - if (!ret) - ret = nr; - break; - } - ret += nr; - if (nr != len) - break; - } - - return ret; -} - - static ssize_t fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) @@ -2172,13 +2239,16 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, file = iocb->ki_filp; pos = offset; - ret = fuse_loop_dio(file, iov, nr_segs, &pos, rw); + if (rw == WRITE) + ret = __fuse_direct_write(file, iov, nr_segs, &pos); + else + ret = __fuse_direct_read(file, iov, nr_segs, &pos); return ret; } -long fuse_file_fallocate(struct file *file, int mode, loff_t offset, - loff_t length) +static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, + loff_t length) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; @@ -2194,7 +2264,7 @@ long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (fc->no_fallocate) return -EOPNOTSUPP; - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -2213,7 +2283,6 @@ long fuse_file_fallocate(struct file *file, int mode, loff_t offset, return err; } -EXPORT_SYMBOL_GPL(fuse_file_fallocate); static const struct file_operations fuse_file_operations = { .llseek = fuse_file_llseek, diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index e105a53fc72d..6aeba864f070 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -44,6 +44,9 @@ doing the mount will be allowed to access the filesystem */ #define FUSE_ALLOW_OTHER (1 << 1) +/** Number of page pointers embedded in fuse_req */ +#define FUSE_REQ_INLINE_PAGES 1 + /** List of active connections */ extern struct list_head fuse_conn_list; @@ -103,6 +106,15 @@ struct fuse_inode { /** List of writepage requestst (pending or sent) */ struct list_head writepages; + + /** Miscellaneous bits describing inode state */ + unsigned long state; +}; + +/** FUSE inode state bits */ +enum { + /** Advise readdirplus */ + FUSE_I_ADVISE_RDPLUS, }; struct fuse_conn; @@ -200,6 +212,12 @@ struct fuse_out { struct fuse_arg args[3]; }; +/** FUSE page descriptor */ +struct fuse_page_desc { + unsigned int length; + unsigned int offset; +}; + /** The request state */ enum fuse_req_state { FUSE_REQ_INIT = 0, @@ -291,14 +309,23 @@ struct fuse_req { } misc; /** page vector */ - struct page *pages[FUSE_MAX_PAGES_PER_REQ]; + struct page **pages; + + /** page-descriptor vector */ + struct fuse_page_desc *page_descs; + + /** size of the 'pages' array */ + unsigned max_pages; + + /** inline page vector */ + struct page *inline_pages[FUSE_REQ_INLINE_PAGES]; + + /** inline page-descriptor vector */ + struct fuse_page_desc inline_page_descs[FUSE_REQ_INLINE_PAGES]; /** number of pages in vector */ unsigned num_pages; - /** offset of data on first page */ - unsigned page_offset; - /** File used in the request (or NULL) */ struct fuse_file *ff; @@ -487,6 +514,12 @@ struct fuse_conn { /** Use enhanced/automatic page cache invalidation. */ unsigned auto_inval_data:1; + /** Does the filesystem support readdirplus? */ + unsigned do_readdirplus:1; + + /** Does the filesystem want adaptive readdirplus? */ + unsigned readdirplus_auto:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -578,6 +611,9 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, struct fuse_forget_link *fuse_alloc_forget(void); +/* Used by READDIRPLUS */ +void fuse_force_forget(struct file *file, u64 nodeid); + /** * Initialize READ or READDIR request */ @@ -658,9 +694,9 @@ void fuse_ctl_cleanup(void); /** * Allocate a request */ -struct fuse_req *fuse_request_alloc(void); +struct fuse_req *fuse_request_alloc(unsigned npages); -struct fuse_req *fuse_request_alloc_nofs(void); +struct fuse_req *fuse_request_alloc_nofs(unsigned npages); /** * Free a request @@ -668,14 +704,25 @@ struct fuse_req *fuse_request_alloc_nofs(void); void fuse_request_free(struct fuse_req *req); /** - * Get a request, may fail with -ENOMEM + * Get a request, may fail with -ENOMEM, + * caller should specify # elements in req->pages[] explicitly */ -struct fuse_req *fuse_get_req(struct fuse_conn *fc); +struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages); + +/** + * Get a request, may fail with -ENOMEM, + * useful for callers who doesn't use req->pages[] + */ +static inline struct fuse_req *fuse_get_req_nopages(struct fuse_conn *fc) +{ + return fuse_get_req(fc, 0); +} /** * Gets a requests for a file operation, always succeeds */ -struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file); +struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, + struct file *file); /** * Decrement reference count of a request. If count goes to zero free @@ -739,9 +786,9 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc); int fuse_valid_type(int m); /** - * Is task allowed to perform filesystem operation? + * Is current process allowed to perform filesystem operation? */ -int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task); +int fuse_allow_current_process(struct fuse_conn *fc); u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); @@ -776,8 +823,9 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, bool isdir); -ssize_t fuse_direct_io(struct file *file, const char __user *buf, - size_t count, loff_t *ppos, int write); +ssize_t fuse_direct_io(struct file *file, const struct iovec *iov, + unsigned long nr_segs, size_t count, loff_t *ppos, + int write); long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, unsigned int flags); long fuse_ioctl_common(struct file *file, unsigned int cmd, diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 73ca6b72beaf..01353ed75750 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -92,6 +92,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) fi->attr_version = 0; fi->writectr = 0; fi->orig_ino = 0; + fi->state = 0; INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); INIT_LIST_HEAD(&fi->writepages); @@ -408,12 +409,12 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) struct fuse_statfs_out outarg; int err; - if (!fuse_allow_task(fc, current)) { + if (!fuse_allow_current_process(fc)) { buf->f_type = FUSE_SUPER_MAGIC; return 0; } - req = fuse_get_req(fc); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -863,6 +864,10 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->dont_mask = 1; if (arg->flags & FUSE_AUTO_INVAL_DATA) fc->auto_inval_data = 1; + if (arg->flags & FUSE_DO_READDIRPLUS) + fc->do_readdirplus = 1; + if (arg->flags & FUSE_READDIRPLUS_AUTO) + fc->readdirplus_auto = 1; } else { ra_pages = fc->max_read / PAGE_CACHE_SIZE; fc->no_lock = 1; @@ -889,7 +894,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | - FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA; + FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | + FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -1034,12 +1040,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) /* only now - we want root dentry with NULL ->d_op */ sb->s_d_op = &fuse_dentry_operations; - init_req = fuse_request_alloc(); + init_req = fuse_request_alloc(0); if (!init_req) goto err_put_root; if (is_bdev) { - fc->destroy_req = fuse_request_alloc(); + fc->destroy_req = fuse_request_alloc(0); if (!fc->destroy_req) goto err_free_init_req; } diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 8dad6b093716..9802de0f85e6 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -241,6 +241,7 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, static void gfs2_reverse_hex(char *c, u64 value) { + *c = '0'; while (value) { *c-- = hex_asc[value & 0x0f]; value >>= 4; @@ -280,6 +281,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_sbd; struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int lvb_needs_unlock = 0; int error; if (gl->gl_lksb.sb_lkid == 0) { @@ -293,8 +295,12 @@ static void gdlm_put_lock(struct gfs2_glock *gl) gfs2_update_request_times(gl); /* don't want to skip dlm_unlock writing the lvb when lock is ex */ + + if (gl->gl_lksb.sb_lvbptr && (gl->gl_state == LM_ST_EXCLUSIVE)) + lvb_needs_unlock = 1; + if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) && - gl->gl_lksb.sb_lvbptr && (gl->gl_state != LM_ST_EXCLUSIVE)) { + !lvb_needs_unlock) { gfs2_glock_free(gl); return; } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 37ee061d899e..b7eff078fe90 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -350,10 +350,14 @@ static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len) BUG_ON(len < chunk_size); len -= chunk_size; block = gfs2_rbm_to_block(&rbm); - gfs2_rbm_from_block(&rbm, block + chunk_size); - n_unaligned = 3; - if (ptr) + if (gfs2_rbm_from_block(&rbm, block + chunk_size)) { + n_unaligned = 0; break; + } + if (ptr) { + n_unaligned = 3; + break; + } n_unaligned = len & 3; } @@ -557,22 +561,20 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd) */ int gfs2_rs_alloc(struct gfs2_inode *ip) { - struct gfs2_blkreserv *res; + int error = 0; + down_write(&ip->i_rw_mutex); if (ip->i_res) - return 0; - - res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS); - if (!res) - return -ENOMEM; + goto out; - RB_CLEAR_NODE(&res->rs_node); + ip->i_res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS); + if (!ip->i_res) { + error = -ENOMEM; + goto out; + } - down_write(&ip->i_rw_mutex); - if (ip->i_res) - kmem_cache_free(gfs2_rsrv_cachep, res); - else - ip->i_res = res; + RB_CLEAR_NODE(&ip->i_res->rs_node); +out: up_write(&ip->i_rw_mutex); return 0; } @@ -1424,6 +1426,9 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, rs->rs_free = extlen; rs->rs_inum = ip->i_no_addr; rs_insert(ip); + } else { + if (goal == rgd->rd_last_alloc + rgd->rd_data0) + rgd->rd_last_alloc = 0; } } diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index a2862339323b..81cc7eaff863 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -446,7 +446,8 @@ int __log_start_commit(journal_t *journal, tid_t target) * currently running transaction (if it exists). Otherwise, * the target tid must be an old one. */ - if (journal->j_running_transaction && + if (journal->j_commit_request != target && + journal->j_running_transaction && journal->j_running_transaction->t_tid == target) { /* * We want a new commit: OK, mark the request and wakeup the diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 42f6615af0ac..df9f29760efa 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -209,7 +209,8 @@ repeat: if (!new_transaction) goto alloc_transaction; write_lock(&journal->j_state_lock); - if (!journal->j_running_transaction) { + if (!journal->j_running_transaction && + !journal->j_barrier_count) { jbd2_get_transaction(journal, new_transaction); new_transaction = NULL; } @@ -1839,7 +1840,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, BUFFER_TRACE(bh, "entry"); -retry: /* * It is safe to proceed here without the j_list_lock because the * buffers cannot be stolen by try_to_free_buffers as long as we are @@ -1934,14 +1934,11 @@ retry: * for commit and try again. */ if (partial_page) { - tid_t tid = journal->j_committing_transaction->t_tid; - jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); write_unlock(&journal->j_state_lock); - jbd2_log_wait_commit(journal, tid); - goto retry; + return -EBUSY; } /* * OK, buffer won't be reachable after truncate. We just set @@ -2002,21 +1999,23 @@ zap_buffer_unlocked: * @page: page to flush * @offset: length of page to invalidate. * - * Reap page buffers containing data after offset in page. - * + * Reap page buffers containing data after offset in page. Can return -EBUSY + * if buffers are part of the committing transaction and the page is straddling + * i_size. Caller then has to wait for current commit and try again. */ -void jbd2_journal_invalidatepage(journal_t *journal, - struct page *page, - unsigned long offset) +int jbd2_journal_invalidatepage(journal_t *journal, + struct page *page, + unsigned long offset) { struct buffer_head *head, *bh, *next; unsigned int curr_off = 0; int may_free = 1; + int ret = 0; if (!PageLocked(page)) BUG(); if (!page_has_buffers(page)) - return; + return 0; /* We will potentially be playing with lists other than just the * data lists (especially for journaled data mode), so be @@ -2030,9 +2029,11 @@ void jbd2_journal_invalidatepage(journal_t *journal, if (offset <= curr_off) { /* This block is wholly outside the truncation point */ lock_buffer(bh); - may_free &= journal_unmap_buffer(journal, bh, - offset > 0); + ret = journal_unmap_buffer(journal, bh, offset > 0); unlock_buffer(bh); + if (ret < 0) + return ret; + may_free &= ret; } curr_off = next_off; bh = next; @@ -2043,6 +2044,7 @@ void jbd2_journal_invalidatepage(journal_t *journal, if (may_free && try_to_free_buffers(page)) J_ASSERT(!page_has_buffers(page)); } + return 0; } /* diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index c89b26bc9759..264d1aa935f2 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -206,7 +206,7 @@ static u32 initiate_bulk_draining(struct nfs_client *clp, list_for_each_entry(lo, &server->layouts, plh_layouts) { ino = igrab(lo->plh_inode); - if (ino) + if (!ino) continue; spin_lock(&ino->i_lock); /* Is this layout in the process of being freed? */ diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 32e6c53520e2..1b2d7eb93796 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2153,12 +2153,16 @@ static int nfs_open_permission_mask(int openflags) { int mask = 0; - if ((openflags & O_ACCMODE) != O_WRONLY) - mask |= MAY_READ; - if ((openflags & O_ACCMODE) != O_RDONLY) - mask |= MAY_WRITE; - if (openflags & __FMODE_EXEC) - mask |= MAY_EXEC; + if (openflags & __FMODE_EXEC) { + /* ONLY check exec rights */ + mask = MAY_EXEC; + } else { + if ((openflags & O_ACCMODE) != O_WRONLY) + mask |= MAY_READ; + if ((openflags & O_ACCMODE) != O_RDONLY) + mask |= MAY_WRITE; + } + return mask; } diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index dd057bc6b65b..fc8dc20fdeb9 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -177,11 +177,31 @@ out_nofree: return mnt; } +static int +nfs_namespace_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + if (NFS_FH(dentry->d_inode)->size != 0) + return nfs_getattr(mnt, dentry, stat); + generic_fillattr(dentry->d_inode, stat); + return 0; +} + +static int +nfs_namespace_setattr(struct dentry *dentry, struct iattr *attr) +{ + if (NFS_FH(dentry->d_inode)->size != 0) + return nfs_setattr(dentry, attr); + return -EACCES; +} + const struct inode_operations nfs_mountpoint_inode_operations = { .getattr = nfs_getattr, + .setattr = nfs_setattr, }; const struct inode_operations nfs_referral_inode_operations = { + .getattr = nfs_namespace_getattr, + .setattr = nfs_namespace_setattr, }; static void nfs_expire_automounts(struct work_struct *work) diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index acc347268124..2e9779b58b7a 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -236,11 +236,10 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, error = nfs4_discover_server_trunking(clp, &old); if (error < 0) goto error; + nfs_put_client(clp); if (clp != old) { clp->cl_preserve_clid = true; - nfs_put_client(clp); clp = old; - atomic_inc(&clp->cl_count); } return clp; @@ -306,7 +305,7 @@ int nfs40_walk_client_list(struct nfs_client *new, .clientid = new->cl_clientid, .confirm = new->cl_confirm, }; - int status; + int status = -NFS4ERR_STALE_CLIENTID; spin_lock(&nn->nfs_client_lock); list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) { @@ -332,40 +331,33 @@ int nfs40_walk_client_list(struct nfs_client *new, if (prev) nfs_put_client(prev); + prev = pos; status = nfs4_proc_setclientid_confirm(pos, &clid, cred); - if (status == 0) { + switch (status) { + case -NFS4ERR_STALE_CLIENTID: + break; + case 0: nfs4_swap_callback_idents(pos, new); - nfs_put_client(pos); + prev = NULL; *result = pos; dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", __func__, pos, atomic_read(&pos->cl_count)); - return 0; - } - if (status != -NFS4ERR_STALE_CLIENTID) { - nfs_put_client(pos); - dprintk("NFS: <-- %s status = %d, no result\n", - __func__, status); - return status; + default: + goto out; } spin_lock(&nn->nfs_client_lock); - prev = pos; } + spin_unlock(&nn->nfs_client_lock); - /* - * No matching nfs_client found. This should be impossible, - * because the new nfs_client has already been added to - * nfs_client_list by nfs_get_client(). - * - * Don't BUG(), since the caller is holding a mutex. - */ + /* No match found. The server lost our clientid */ +out: if (prev) nfs_put_client(prev); - spin_unlock(&nn->nfs_client_lock); - pr_err("NFS: %s Error: no matching nfs_client found\n", __func__); - return -NFS4ERR_STALE_CLIENTID; + dprintk("NFS: <-- %s status = %d\n", __func__, status); + return status; } #ifdef CONFIG_NFS_V4_1 @@ -432,7 +424,7 @@ int nfs41_walk_client_list(struct nfs_client *new, { struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id); struct nfs_client *pos, *n, *prev = NULL; - int error; + int status = -NFS4ERR_STALE_CLIENTID; spin_lock(&nn->nfs_client_lock); list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) { @@ -448,14 +440,17 @@ int nfs41_walk_client_list(struct nfs_client *new, nfs_put_client(prev); prev = pos; - error = nfs_wait_client_init_complete(pos); - if (error < 0) { + nfs4_schedule_lease_recovery(pos); + status = nfs_wait_client_init_complete(pos); + if (status < 0) { nfs_put_client(pos); spin_lock(&nn->nfs_client_lock); continue; } - + status = pos->cl_cons_state; spin_lock(&nn->nfs_client_lock); + if (status < 0) + continue; } if (pos->rpc_ops != new->rpc_ops) @@ -473,6 +468,7 @@ int nfs41_walk_client_list(struct nfs_client *new, if (!nfs4_match_serverowners(pos, new)) continue; + atomic_inc(&pos->cl_count); spin_unlock(&nn->nfs_client_lock); dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", __func__, pos, atomic_read(&pos->cl_count)); @@ -481,16 +477,10 @@ int nfs41_walk_client_list(struct nfs_client *new, return 0; } - /* - * No matching nfs_client found. This should be impossible, - * because the new nfs_client has already been added to - * nfs_client_list by nfs_get_client(). - * - * Don't BUG(), since the caller is holding a mutex. - */ + /* No matching nfs_client found. */ spin_unlock(&nn->nfs_client_lock); - pr_err("NFS: %s Error: no matching nfs_client found\n", __func__); - return -NFS4ERR_STALE_CLIENTID; + dprintk("NFS: <-- %s status = %d\n", __func__, status); + return status; } #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5d864fb36578..cf747ef86650 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1626,7 +1626,8 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data) static int nfs4_opendata_access(struct rpc_cred *cred, struct nfs4_opendata *opendata, - struct nfs4_state *state, fmode_t fmode) + struct nfs4_state *state, fmode_t fmode, + int openflags) { struct nfs_access_entry cache; u32 mask; @@ -1638,11 +1639,14 @@ static int nfs4_opendata_access(struct rpc_cred *cred, mask = 0; /* don't check MAY_WRITE - a newly created file may not have - * write mode bits, but POSIX allows the creating process to write */ - if (fmode & FMODE_READ) - mask |= MAY_READ; - if (fmode & FMODE_EXEC) - mask |= MAY_EXEC; + * write mode bits, but POSIX allows the creating process to write. + * use openflags to check for exec, because fmode won't + * always have FMODE_EXEC set when file open for exec. */ + if (openflags & __FMODE_EXEC) { + /* ONLY check for exec rights */ + mask = MAY_EXEC; + } else if (fmode & FMODE_READ) + mask = MAY_READ; cache.cred = cred; cache.jiffies = jiffies; @@ -1896,7 +1900,7 @@ static int _nfs4_do_open(struct inode *dir, if (server->caps & NFS_CAP_POSIX_LOCK) set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); - status = nfs4_opendata_access(cred, opendata, state, fmode); + status = nfs4_opendata_access(cred, opendata, state, fmode, flags); if (status != 0) goto err_opendata_put; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 9448c579d41a..e61f68d5ef21 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -136,16 +136,11 @@ int nfs40_discover_server_trunking(struct nfs_client *clp, clp->cl_confirm = clid.confirm; status = nfs40_walk_client_list(clp, result, cred); - switch (status) { - case -NFS4ERR_STALE_CLIENTID: - set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); - case 0: + if (status == 0) { /* Sustain the lease, even if it's empty. If the clientid4 * goes stale it's of no use for trunking discovery. */ nfs4_schedule_state_renewal(*result); - break; } - out: return status; } @@ -1863,6 +1858,7 @@ again: case -ETIMEDOUT: case -EAGAIN: ssleep(1); + case -NFS4ERR_STALE_CLIENTID: dprintk("NFS: %s after status %d, retrying\n", __func__, status); goto again; @@ -2022,8 +2018,18 @@ static int nfs4_reset_session(struct nfs_client *clp) nfs4_begin_drain_session(clp); cred = nfs4_get_exchange_id_cred(clp); status = nfs4_proc_destroy_session(clp->cl_session, cred); - if (status && status != -NFS4ERR_BADSESSION && - status != -NFS4ERR_DEADSESSION) { + switch (status) { + case 0: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_DEADSESSION: + break; + case -NFS4ERR_BACK_CHAN_BUSY: + case -NFS4ERR_DELAY: + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + status = 0; + ssleep(1); + goto out; + default: status = nfs4_recovery_handle_error(clp, status); goto out; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index e7165d915362..d00260b08103 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -254,7 +254,7 @@ static void pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) { lo->plh_retry_timestamp = jiffies; - if (test_and_set_bit(fail_bit, &lo->plh_flags)) + if (!test_and_set_bit(fail_bit, &lo->plh_flags)) atomic_inc(&lo->plh_refcount); } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index b6bdb18e892c..a5e5d9899d56 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -91,12 +91,16 @@ void nfs_readdata_release(struct nfs_read_data *rdata) put_nfs_open_context(rdata->args.context); if (rdata->pages.pagevec != rdata->pages.page_array) kfree(rdata->pages.pagevec); - if (rdata != &read_header->rpc_data) - kfree(rdata); - else + if (rdata == &read_header->rpc_data) { rdata->header = NULL; + rdata = NULL; + } if (atomic_dec_and_test(&hdr->refcnt)) hdr->completion_ops->completion(hdr); + /* Note: we only free the rpc_task after callbacks are done. + * See the comment in rpc_free_task() for why + */ + kfree(rdata); } EXPORT_SYMBOL_GPL(nfs_readdata_release); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index c25cadf8f8c4..b056b1628722 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1152,7 +1152,7 @@ static int nfs_get_option_str(substring_t args[], char **option) { kfree(*option); *option = match_strdup(args); - return !option; + return !*option; } static int nfs_get_option_ul(substring_t args[], unsigned long *option) @@ -2589,27 +2589,23 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags, struct nfs_server *server; struct dentry *mntroot = ERR_PTR(-ENOMEM); struct nfs_subversion *nfs_mod = NFS_SB(data->sb)->nfs_client->cl_nfs_mod; - int error; - dprintk("--> nfs_xdev_mount_common()\n"); + dprintk("--> nfs_xdev_mount()\n"); mount_info.mntfh = mount_info.cloned->fh; /* create a new volume representation */ server = nfs_mod->rpc_ops->clone_server(NFS_SB(data->sb), data->fh, data->fattr, data->authflavor); - if (IS_ERR(server)) { - error = PTR_ERR(server); - goto out_err; - } - mntroot = nfs_fs_mount_common(server, flags, dev_name, &mount_info, nfs_mod); - dprintk("<-- nfs_xdev_mount_common() = 0\n"); -out: - return mntroot; + if (IS_ERR(server)) + mntroot = ERR_CAST(server); + else + mntroot = nfs_fs_mount_common(server, flags, + dev_name, &mount_info, nfs_mod); -out_err: - dprintk("<-- nfs_xdev_mount_common() = %d [error]\n", error); - goto out; + dprintk("<-- nfs_xdev_mount() = %ld\n", + IS_ERR(mntroot) ? PTR_ERR(mntroot) : 0L); + return mntroot; } #if IS_ENABLED(CONFIG_NFS_V4) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b673be31590e..c483cc50b82e 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -126,12 +126,16 @@ void nfs_writedata_release(struct nfs_write_data *wdata) put_nfs_open_context(wdata->args.context); if (wdata->pages.pagevec != wdata->pages.page_array) kfree(wdata->pages.pagevec); - if (wdata != &write_header->rpc_data) - kfree(wdata); - else + if (wdata == &write_header->rpc_data) { wdata->header = NULL; + wdata = NULL; + } if (atomic_dec_and_test(&hdr->refcnt)) hdr->completion_ops->completion(hdr); + /* Note: we only free the rpc_task after callbacks are done. + * See the comment in rpc_free_task() for why + */ + kfree(wdata); } EXPORT_SYMBOL_GPL(nfs_writedata_release); diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index fdb180769485..f3859354e41a 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -664,8 +664,11 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, if (ret < 0) printk(KERN_ERR "NILFS: GC failed during preparation: " "cannot read source blocks: err=%d\n", ret); - else + else { + if (nilfs_sb_need_update(nilfs)) + set_nilfs_discontinued(nilfs); ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); + } nilfs_remove_all_gcinodes(nilfs); clear_nilfs_gc_running(nilfs); diff --git a/fs/proc/array.c b/fs/proc/array.c index 6a91e6ffbcbd..f7ed9ee46eb9 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -449,7 +449,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, do { min_flt += t->min_flt; maj_flt += t->maj_flt; - gtime += t->gtime; + gtime += task_gtime(t); t = next_thread(t); } while (t != task); @@ -472,7 +472,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, min_flt = task->min_flt; maj_flt = task->maj_flt; task_cputime_adjusted(task, &utime, &stime); - gtime = task->gtime; + gtime = task_gtime(task); } /* scale priority and nice values from timeslices to -20..20 */ diff --git a/fs/proc/generic.c b/fs/proc/generic.c index e064f562b1f7..76ddae83daa5 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -352,18 +352,18 @@ retry: if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL)) return -ENOMEM; - spin_lock_bh(&proc_inum_lock); + spin_lock_irq(&proc_inum_lock); error = ida_get_new(&proc_inum_ida, &i); - spin_unlock_bh(&proc_inum_lock); + spin_unlock_irq(&proc_inum_lock); if (error == -EAGAIN) goto retry; else if (error) return error; if (i > UINT_MAX - PROC_DYNAMIC_FIRST) { - spin_lock_bh(&proc_inum_lock); + spin_lock_irq(&proc_inum_lock); ida_remove(&proc_inum_ida, i); - spin_unlock_bh(&proc_inum_lock); + spin_unlock_irq(&proc_inum_lock); return -ENOSPC; } *inum = PROC_DYNAMIC_FIRST + i; @@ -372,9 +372,10 @@ retry: void proc_free_inum(unsigned int inum) { - spin_lock_bh(&proc_inum_lock); + unsigned long flags; + spin_lock_irqsave(&proc_inum_lock, flags); ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); - spin_unlock_bh(&proc_inum_lock); + spin_unlock_irqrestore(&proc_inum_lock, flags); } static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd) diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index fe72cd073dea..3131a03d7d37 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -177,20 +177,6 @@ const struct file_operations proc_net_operations = { .readdir = proc_tgid_net_readdir, }; - -struct proc_dir_entry *proc_net_fops_create(struct net *net, - const char *name, umode_t mode, const struct file_operations *fops) -{ - return proc_create(name, mode, net->proc_net, fops); -} -EXPORT_SYMBOL_GPL(proc_net_fops_create); - -void proc_net_remove(struct net *net, const char *name) -{ - remove_proc_entry(name, net->proc_net); -} -EXPORT_SYMBOL_GPL(proc_net_remove); - static __net_init int proc_net_ns_init(struct net *net) { struct proc_dir_entry *netd, *net_statd; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 448455b7fd91..ca5ce7f9f800 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1278,7 +1278,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) walk.mm = mm; pol = get_vma_policy(task, vma, vma->vm_start); - mpol_to_str(buffer, sizeof(buffer), pol, 0); + mpol_to_str(buffer, sizeof(buffer), pol); mpol_cond_put(pol); seq_printf(m, "%08lx %s", vma->vm_start, buffer); diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index f883e7e74305..288f068740f6 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -167,12 +167,16 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz) { char *hdr; - struct timeval timestamp; + struct timespec timestamp; size_t len; - do_gettimeofday(×tamp); + /* Report zeroed timestamp if called before timekeeping has resumed. */ + if (__getnstimeofday(×tamp)) { + timestamp.tv_sec = 0; + timestamp.tv_nsec = 0; + } hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lu.%lu\n", - (long)timestamp.tv_sec, (long)timestamp.tv_usec); + (long)timestamp.tv_sec, (long)(timestamp.tv_nsec / 1000)); WARN_ON_ONCE(!hdr); len = hdr ? strlen(hdr) : 0; persistent_ram_write(prz, hdr, len); @@ -291,9 +295,8 @@ static void ramoops_free_przs(struct ramoops_context *cxt) kfree(cxt->przs); } -static int __devinit ramoops_init_przs(struct device *dev, - struct ramoops_context *cxt, - phys_addr_t *paddr, size_t dump_mem_sz) +static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, + phys_addr_t *paddr, size_t dump_mem_sz) { int err = -ENOMEM; int i; @@ -336,10 +339,9 @@ fail_prz: return err; } -static int __devinit ramoops_init_prz(struct device *dev, - struct ramoops_context *cxt, - struct persistent_ram_zone **prz, - phys_addr_t *paddr, size_t sz, u32 sig) +static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt, + struct persistent_ram_zone **prz, + phys_addr_t *paddr, size_t sz, u32 sig) { if (!sz) return 0; @@ -367,7 +369,7 @@ static int __devinit ramoops_init_prz(struct device *dev, return 0; } -static int __devinit ramoops_probe(struct platform_device *pdev) +static int ramoops_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct ramoops_platform_data *pdata = pdev->dev.platform_data; diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index eecd2a8a84dd..0306303be372 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -390,8 +390,8 @@ static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size, return 0; } -static int __devinit persistent_ram_post_init(struct persistent_ram_zone *prz, - u32 sig, int ecc_size) +static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, + int ecc_size) { int ret; @@ -443,9 +443,8 @@ void persistent_ram_free(struct persistent_ram_zone *prz) kfree(prz); } -struct persistent_ram_zone * __devinit persistent_ram_new(phys_addr_t start, - size_t size, u32 sig, - int ecc_size) +struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, + u32 sig, int ecc_size) { struct persistent_ram_zone *prz; int ret = -ENOMEM; diff --git a/fs/select.c b/fs/select.c index 2ef72d965036..8c1c96c27062 100644 --- a/fs/select.c +++ b/fs/select.c @@ -26,6 +26,7 @@ #include <linux/fs.h> #include <linux/rcupdate.h> #include <linux/hrtimer.h> +#include <linux/sched/rt.h> #include <asm/uaccess.h> diff --git a/fs/seq_file.c b/fs/seq_file.c index 9d863fb501f9..f2bc3dfd0b88 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -296,7 +296,7 @@ EXPORT_SYMBOL(seq_read); * seq_lseek - ->llseek() method for sequential files. * @file: the file in question * @offset: new position - * @origin: 0 for absolute, 1 for relative position + * @whence: 0 for absolute, 1 for relative position * * Ready-made ->f_op->llseek() */ diff --git a/fs/splice.c b/fs/splice.c index 8890604e3fcd..6909d89d0da5 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -696,8 +696,10 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, return -EINVAL; more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0; - if (sd->len < sd->total_len) + + if (sd->len < sd->total_len && pipe->nrbufs > 1) more |= MSG_SENDPAGE_NOTLAST; + return file->f_op->sendpage(file, buf->page, buf->offset, sd->len, &pos, more); } diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 2df555c66d57..aec3d5c98c94 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -205,6 +205,48 @@ void sysfs_unmerge_group(struct kobject *kobj, } EXPORT_SYMBOL_GPL(sysfs_unmerge_group); +/** + * sysfs_add_link_to_group - add a symlink to an attribute group. + * @kobj: The kobject containing the group. + * @group_name: The name of the group. + * @target: The target kobject of the symlink to create. + * @link_name: The name of the symlink to create. + */ +int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, + struct kobject *target, const char *link_name) +{ + struct sysfs_dirent *dir_sd; + int error = 0; + + dir_sd = sysfs_get_dirent(kobj->sd, NULL, group_name); + if (!dir_sd) + return -ENOENT; + + error = sysfs_create_link_sd(dir_sd, target, link_name); + sysfs_put(dir_sd); + + return error; +} +EXPORT_SYMBOL_GPL(sysfs_add_link_to_group); + +/** + * sysfs_remove_link_from_group - remove a symlink from an attribute group. + * @kobj: The kobject containing the group. + * @group_name: The name of the group. + * @link_name: The name of the symlink to remove. + */ +void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, + const char *link_name) +{ + struct sysfs_dirent *dir_sd; + + dir_sd = sysfs_get_dirent(kobj->sd, NULL, group_name); + if (dir_sd) { + sysfs_hash_and_remove(dir_sd, NULL, link_name); + sysfs_put(dir_sd); + } +} +EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group); EXPORT_SYMBOL_GPL(sysfs_create_group); EXPORT_SYMBOL_GPL(sysfs_update_group); diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 3c9eb5624f5e..8c940df97a52 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -21,26 +21,17 @@ #include "sysfs.h" -static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, - const char *name, int warn) +static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd, + struct kobject *target, + const char *name, int warn) { - struct sysfs_dirent *parent_sd = NULL; struct sysfs_dirent *target_sd = NULL; struct sysfs_dirent *sd = NULL; struct sysfs_addrm_cxt acxt; enum kobj_ns_type ns_type; int error; - BUG_ON(!name); - - if (!kobj) - parent_sd = &sysfs_root; - else - parent_sd = kobj->sd; - - error = -EFAULT; - if (!parent_sd) - goto out_put; + BUG_ON(!name || !parent_sd); /* target->sd can go away beneath us but is protected with * sysfs_assoc_lock. Fetch target_sd from it. @@ -96,6 +87,34 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, } /** + * sysfs_create_link_sd - create symlink to a given object. + * @sd: directory we're creating the link in. + * @target: object we're pointing to. + * @name: name of the symlink. + */ +int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target, + const char *name) +{ + return sysfs_do_create_link_sd(sd, target, name, 1); +} + +static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, + const char *name, int warn) +{ + struct sysfs_dirent *parent_sd = NULL; + + if (!kobj) + parent_sd = &sysfs_root; + else + parent_sd = kobj->sd; + + if (!parent_sd) + return -EFAULT; + + return sysfs_do_create_link_sd(parent_sd, target, name, warn); +} + +/** * sysfs_create_link - create symlink between two objects. * @kobj: object whose directory we're creating the link in. * @target: object we're pointing to. diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index d73c0932bbd6..d1e4043eb0c3 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -240,3 +240,5 @@ void unmap_bin_file(struct sysfs_dirent *attr_sd); * symlink.c */ extern const struct inode_operations sysfs_symlink_inode_operations; +int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target, + const char *name); diff --git a/fs/udf/super.c b/fs/udf/super.c index d44fb568abe1..e9be396a558d 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -307,7 +307,8 @@ static void udf_sb_free_partitions(struct super_block *sb) { struct udf_sb_info *sbi = UDF_SB(sb); int i; - + if (sbi->s_partmaps == NULL) + return; for (i = 0; i < sbi->s_partitions; i++) udf_free_partition(&sbi->s_partmaps[i]); kfree(sbi->s_partmaps); |