diff options
Diffstat (limited to 'fs')
288 files changed, 7025 insertions, 5054 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 0576eaeb60b9..5b6a1743ea17 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -266,7 +266,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, if (IS_ERR(acl)) return PTR_ERR(acl); else if (acl) { - retval = posix_acl_valid(acl); + retval = posix_acl_valid(inode->i_sb->s_user_ns, acl); if (retval) goto err_out; } diff --git a/fs/9p/fid.h b/fs/9p/fid.h index 2b6787fcb626..12700df0bb51 100644 --- a/fs/9p/fid.h +++ b/fs/9p/fid.h @@ -24,6 +24,10 @@ #include <linux/list.h> struct p9_fid *v9fs_fid_lookup(struct dentry *dentry); +static inline struct p9_fid *v9fs_parent_fid(struct dentry *dentry) +{ + return v9fs_fid_lookup(dentry->d_parent); +} struct p9_fid *v9fs_fid_clone(struct dentry *dentry); void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid); struct p9_fid *v9fs_writeback_fid(struct dentry *dentry); diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index c37fb9c08970..6181ad79e1a5 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -231,7 +231,6 @@ static int v9fs_launder_page(struct page *page) /** * v9fs_direct_IO - 9P address space operation for direct I/O * @iocb: target I/O control block - * @pos: offset in file to begin the operation * * The presence of v9fs_direct_IO() in the address space ops vector * allowes open() O_DIRECT flags which would have failed otherwise. diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index e2e7c749925a..7da9a8354fad 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -595,7 +595,7 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags) v9ses = v9fs_inode2v9ses(dir); inode = d_inode(dentry); - dfid = v9fs_fid_lookup(dentry->d_parent); + dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) { retval = PTR_ERR(dfid); p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", retval); @@ -653,7 +653,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, ofid = NULL; fid = NULL; name = (char *) dentry->d_name.name; - dfid = v9fs_fid_lookup(dentry->d_parent); + dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); @@ -798,7 +798,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, v9ses = v9fs_inode2v9ses(dir); /* We can walk d_parent because we hold the dir->i_mutex */ - dfid = v9fs_fid_lookup(dentry->d_parent); + dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) return ERR_CAST(dfid); @@ -975,13 +975,13 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (IS_ERR(oldfid)) return PTR_ERR(oldfid); - olddirfid = v9fs_fid_clone(old_dentry->d_parent); + olddirfid = v9fs_parent_fid(old_dentry); if (IS_ERR(olddirfid)) { retval = PTR_ERR(olddirfid); goto done; } - newdirfid = v9fs_fid_clone(new_dentry->d_parent); + newdirfid = v9fs_parent_fid(new_dentry); if (IS_ERR(newdirfid)) { retval = PTR_ERR(newdirfid); goto clunk_olddir; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 1b51eaa5e2dd..2ed04c2fe7af 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -273,7 +273,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%hx\n", name, flags, omode); - dfid = v9fs_fid_lookup(dentry->d_parent); + dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); @@ -389,7 +389,6 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, umode_t mode; struct inode *inode; struct p9_qid qid; - struct dentry *dir_dentry; struct posix_acl *dacl = NULL, *pacl = NULL; p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry); @@ -400,8 +399,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, if (dir->i_mode & S_ISGID) omode |= S_ISGID; - dir_dentry = dentry->d_parent; - dfid = v9fs_fid_lookup(dir_dentry); + dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); @@ -691,7 +689,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, p9_debug(P9_DEBUG_VFS, "%lu,%s,%s\n", dir->i_ino, name, symname); v9ses = v9fs_inode2v9ses(dir); - dfid = v9fs_fid_lookup(dentry->d_parent); + dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); @@ -762,7 +760,6 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { int err; - struct dentry *dir_dentry; struct p9_fid *dfid, *oldfid; struct v9fs_session_info *v9ses; @@ -770,8 +767,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, dir->i_ino, old_dentry, dentry); v9ses = v9fs_inode2v9ses(dir); - dir_dentry = dentry->d_parent; - dfid = v9fs_fid_lookup(dir_dentry); + dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) return PTR_ERR(dfid); @@ -822,7 +818,6 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, struct p9_fid *fid = NULL, *dfid = NULL; struct inode *inode; struct p9_qid qid; - struct dentry *dir_dentry; struct posix_acl *dacl = NULL, *pacl = NULL; p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %hx MAJOR: %u MINOR: %u\n", @@ -830,8 +825,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, MAJOR(rdev), MINOR(rdev)); v9ses = v9fs_inode2v9ses(dir); - dir_dentry = dentry->d_parent; - dfid = v9fs_fid_lookup(dir_dentry); + dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err); diff --git a/fs/Kconfig b/fs/Kconfig index 4524916fa200..2bc7ad775842 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -70,6 +70,12 @@ config FS_POSIX_ACL config EXPORTFS tristate +config EXPORTFS_BLOCK_OPS + bool "Enable filesystem export operations for block IO" + help + This option enables the export operations for a filesystem to support + external block IO. + config FILE_LOCKING bool "Enable POSIX file locking API" if EXPERT default y diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 72c03354c14b..c7efddf6e038 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -89,7 +89,8 @@ config BINFMT_SCRIPT config BINFMT_FLAT bool "Kernel support for flat binaries" - depends on !MMU && (!FRV || BROKEN) + depends on !MMU || M68K + depends on !FRV || BROKEN help Support uClinux FLAT format binaries. diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index fd4cf2c48e48..bec25f7017c0 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -207,7 +207,7 @@ adfs_hash(const struct dentry *parent, struct qstr *qstr) */ qstr->len = i = name_len; name = qstr->name; - hash = init_name_hash(); + hash = init_name_hash(parent); while (i--) { char c; diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 00d3002a6780..eb32029bc776 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -61,7 +61,7 @@ affs_get_toupper(struct super_block *sb) * Note: the dentry argument is the parent dentry. */ static inline int -__affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate) +__affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr, toupper_t toupper, bool notruncate) { const u8 *name = qstr->name; unsigned long hash; @@ -72,7 +72,7 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate) if (retval) return retval; - hash = init_name_hash(); + hash = init_name_hash(dentry); len = min(qstr->len, AFFSNAMEMAX); for (; len > 0; name++, len--) hash = partial_name_hash(toupper(*name), hash); @@ -84,7 +84,7 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate) static int affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr) { - return __affs_hash_dentry(qstr, affs_toupper, + return __affs_hash_dentry(dentry, qstr, affs_toupper, affs_nofilenametruncate(dentry)); } @@ -92,7 +92,7 @@ affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr) static int affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr) { - return __affs_hash_dentry(qstr, affs_intl_toupper, + return __affs_hash_dentry(dentry, qstr, affs_intl_toupper, affs_nofilenametruncate(dentry)); } diff --git a/fs/attr.c b/fs/attr.c index 25b24d0f6c88..42bb42bb3c72 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -255,6 +255,25 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de if (!(attr->ia_valid & ~(ATTR_KILL_SUID | ATTR_KILL_SGID))) return 0; + /* + * Verify that uid/gid changes are valid in the target + * namespace of the superblock. + */ + if (ia_valid & ATTR_UID && + !kuid_has_mapping(inode->i_sb->s_user_ns, attr->ia_uid)) + return -EOVERFLOW; + if (ia_valid & ATTR_GID && + !kgid_has_mapping(inode->i_sb->s_user_ns, attr->ia_gid)) + return -EOVERFLOW; + + /* Don't allow modifications of files with invalid uids or + * gids unless those uids & gids are being made valid. + */ + if (!(ia_valid & ATTR_UID) && !uid_valid(inode->i_uid)) + return -EOVERFLOW; + if (!(ia_valid & ATTR_GID) && !gid_valid(inode->i_gid)) + return -EOVERFLOW; + error = security_inode_setattr(dentry, attr); if (error) return error; diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 631f1554c87b..708214457d16 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -398,7 +398,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, } } qstr.name = name; - qstr.hash = full_name_hash(name, qstr.len); + qstr.hash = full_name_hash(dentry, name, qstr.len); if (mutex_lock_interruptible(&sbi->wq_mutex)) { kfree(qstr.name); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index a7a28110dc80..7f6aff3f72eb 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -605,28 +605,30 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, * Do the same thing for the memory mapping - between * elf_bss and last_bss is the bss section. */ - k = load_addr + eppnt->p_memsz + eppnt->p_vaddr; + k = load_addr + eppnt->p_vaddr + eppnt->p_memsz; if (k > last_bss) last_bss = k; } } + /* + * Now fill out the bss section: first pad the last page from + * the file up to the page boundary, and zero it from elf_bss + * up to the end of the page. + */ + if (padzero(elf_bss)) { + error = -EFAULT; + goto out; + } + /* + * Next, align both the file and mem bss up to the page size, + * since this is where elf_bss was just zeroed up to, and where + * last_bss will end after the vm_brk() below. + */ + elf_bss = ELF_PAGEALIGN(elf_bss); + last_bss = ELF_PAGEALIGN(last_bss); + /* Finally, if there is still more bss to allocate, do it. */ if (last_bss > elf_bss) { - /* - * Now fill out the bss section. First pad the last page up - * to the page boundary, and then perform a mmap to make sure - * that there are zero-mapped pages up to and including the - * last bss page. - */ - if (padzero(elf_bss)) { - error = -EFAULT; - goto out; - } - - /* What we have mapped so far */ - elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1); - - /* Map the last of the bss segment */ error = vm_brk(elf_bss, last_bss - elf_bss); if (error) goto out; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 203589311bf8..464a972e88c1 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -67,8 +67,6 @@ static int create_elf_fdpic_tables(struct linux_binprm *, struct mm_struct *, struct elf_fdpic_params *); #ifndef CONFIG_MMU -static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *, - unsigned long *); static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *, struct file *, struct mm_struct *); @@ -515,8 +513,9 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, sp = mm->start_stack; /* stack the program arguments and environment */ - if (elf_fdpic_transfer_args_to_stack(bprm, &sp) < 0) + if (transfer_args_to_stack(bprm, &sp) < 0) return -EFAULT; + sp &= ~15; #endif /* @@ -711,39 +710,6 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, /*****************************************************************************/ /* - * transfer the program arguments and environment from the holding pages onto - * the stack - */ -#ifndef CONFIG_MMU -static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *bprm, - unsigned long *_sp) -{ - unsigned long index, stop, sp; - char *src; - int ret = 0; - - stop = bprm->p >> PAGE_SHIFT; - sp = *_sp; - - for (index = MAX_ARG_PAGES - 1; index >= stop; index--) { - src = kmap(bprm->page[index]); - sp -= PAGE_SIZE; - if (copy_to_user((void *) sp, src, PAGE_SIZE) != 0) - ret = -EFAULT; - kunmap(bprm->page[index]); - if (ret < 0) - goto out; - } - - *_sp = (*_sp - (MAX_ARG_PAGES * PAGE_SIZE - bprm->p)) & ~15; - -out: - return ret; -} -#endif - -/*****************************************************************************/ -/* * load the appropriate binary image (executable or interpreter) into memory * - we assume no MMU is available * - if no other PIC bits are set in params->hdr->e_flags diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c index 490538536cb4..dd2d3f0cd55d 100644 --- a/fs/binfmt_em86.c +++ b/fs/binfmt_em86.c @@ -24,7 +24,8 @@ static int load_em86(struct linux_binprm *bprm) { - char *interp, *i_name, *i_arg; + const char *i_name, *i_arg; + char *interp; struct file * file; int retval; struct elfhdr elf_ex; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index caf9e39bb82b..9b2917a30294 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -15,7 +15,8 @@ * JAN/99 -- coded full program relocation (gerg@snapgear.com) */ -#include <linux/export.h> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/sched.h> #include <linux/mm.h> @@ -25,8 +26,6 @@ #include <linux/string.h> #include <linux/fs.h> #include <linux/file.h> -#include <linux/stat.h> -#include <linux/fcntl.h> #include <linux/ptrace.h> #include <linux/user.h> #include <linux/slab.h> @@ -34,26 +33,16 @@ #include <linux/personality.h> #include <linux/init.h> #include <linux/flat.h> -#include <linux/syscalls.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> #include <asm/byteorder.h> -#include <asm/uaccess.h> #include <asm/unaligned.h> #include <asm/cacheflush.h> #include <asm/page.h> /****************************************************************************/ -#if 0 -#define DEBUG 1 -#endif - -#ifdef DEBUG -#define DBG_FLT(a...) printk(a) -#else -#define DBG_FLT(a...) -#endif - /* * User data (data section and bss) needs to be aligned. * We pick 0x20 here because it is the max value elf2flt has always @@ -80,7 +69,7 @@ struct lib_info { unsigned long text_len; /* Length of text segment */ unsigned long entry; /* Start address for this module */ unsigned long build_date; /* When this one was compiled */ - short loaded; /* Has this library been loaded? */ + bool loaded; /* Has this library been loaded? */ } lib_list[MAX_SHARED_LIBS]; }; @@ -106,59 +95,67 @@ static struct linux_binfmt flat_format = { static int flat_core_dump(struct coredump_params *cprm) { - printk("Process %s:%d received signr %d and should have core dumped\n", - current->comm, current->pid, (int) cprm->siginfo->si_signo); - return(1); + pr_warn("Process %s:%d received signr %d and should have core dumped\n", + current->comm, current->pid, cprm->siginfo->si_signo); + return 1; } /****************************************************************************/ /* * create_flat_tables() parses the env- and arg-strings in new user * memory and creates the pointer tables from them, and puts their - * addresses on the "stack", returning the new stack pointer value. + * addresses on the "stack", recording the new stack pointer value. */ -static unsigned long create_flat_tables( - unsigned long pp, - struct linux_binprm * bprm) +static int create_flat_tables(struct linux_binprm *bprm, unsigned long arg_start) { - unsigned long *argv,*envp; - unsigned long * sp; - char * p = (char*)pp; - int argc = bprm->argc; - int envc = bprm->envc; - char uninitialized_var(dummy); - - sp = (unsigned long *)p; - sp -= (envc + argc + 2) + 1 + (flat_argvp_envp_on_stack() ? 2 : 0); - sp = (unsigned long *) ((unsigned long)sp & -FLAT_STACK_ALIGN); - argv = sp + 1 + (flat_argvp_envp_on_stack() ? 2 : 0); - envp = argv + (argc + 1); + char __user *p; + unsigned long __user *sp; + long i, len; + + p = (char __user *)arg_start; + sp = (unsigned long __user *)current->mm->start_stack; + + sp -= bprm->envc + 1; + sp -= bprm->argc + 1; + sp -= flat_argvp_envp_on_stack() ? 2 : 0; + sp -= 1; /* &argc */ + current->mm->start_stack = (unsigned long)sp & -FLAT_STACK_ALIGN; + sp = (unsigned long __user *)current->mm->start_stack; + + __put_user(bprm->argc, sp++); if (flat_argvp_envp_on_stack()) { - put_user((unsigned long) envp, sp + 2); - put_user((unsigned long) argv, sp + 1); - } - - put_user(argc, sp); - current->mm->arg_start = (unsigned long) p; - while (argc-->0) { - put_user((unsigned long) p, argv++); - do { - get_user(dummy, p); p++; - } while (dummy); - } - put_user((unsigned long) NULL, argv); - current->mm->arg_end = current->mm->env_start = (unsigned long) p; - while (envc-->0) { - put_user((unsigned long)p, envp); envp++; - do { - get_user(dummy, p); p++; - } while (dummy); - } - put_user((unsigned long) NULL, envp); - current->mm->env_end = (unsigned long) p; - return (unsigned long)sp; + unsigned long argv, envp; + argv = (unsigned long)(sp + 2); + envp = (unsigned long)(sp + 2 + bprm->argc + 1); + __put_user(argv, sp++); + __put_user(envp, sp++); + } + + current->mm->arg_start = (unsigned long)p; + for (i = bprm->argc; i > 0; i--) { + __put_user((unsigned long)p, sp++); + len = strnlen_user(p, MAX_ARG_STRLEN); + if (!len || len > MAX_ARG_STRLEN) + return -EINVAL; + p += len; + } + __put_user(0, sp++); + current->mm->arg_end = (unsigned long)p; + + current->mm->env_start = (unsigned long) p; + for (i = bprm->envc; i > 0; i--) { + __put_user((unsigned long)p, sp++); + len = strnlen_user(p, MAX_ARG_STRLEN); + if (!len || len > MAX_ARG_STRLEN) + return -EINVAL; + p += len; + } + __put_user(0, sp++); + current->mm->env_end = (unsigned long)p; + + return 0; } /****************************************************************************/ @@ -190,17 +187,17 @@ static int decompress_exec( loff_t fpos; int ret, retval; - DBG_FLT("decompress_exec(offset=%x,buf=%x,len=%x)\n",(int)offset, (int)dst, (int)len); + pr_debug("decompress_exec(offset=%lx,buf=%p,len=%lx)\n", offset, dst, len); memset(&strm, 0, sizeof(strm)); strm.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL); if (strm.workspace == NULL) { - DBG_FLT("binfmt_flat: no memory for decompress workspace\n"); + pr_debug("no memory for decompress workspace\n"); return -ENOMEM; } buf = kmalloc(LBUFSIZE, GFP_KERNEL); if (buf == NULL) { - DBG_FLT("binfmt_flat: no memory for read buffer\n"); + pr_debug("no memory for read buffer\n"); retval = -ENOMEM; goto out_free; } @@ -218,49 +215,49 @@ static int decompress_exec( /* Check minimum size -- gzip header */ if (ret < 10) { - DBG_FLT("binfmt_flat: file too small?\n"); + pr_debug("file too small?\n"); goto out_free_buf; } /* Check gzip magic number */ if ((buf[0] != 037) || ((buf[1] != 0213) && (buf[1] != 0236))) { - DBG_FLT("binfmt_flat: unknown compression magic?\n"); + pr_debug("unknown compression magic?\n"); goto out_free_buf; } /* Check gzip method */ if (buf[2] != 8) { - DBG_FLT("binfmt_flat: unknown compression method?\n"); + pr_debug("unknown compression method?\n"); goto out_free_buf; } /* Check gzip flags */ if ((buf[3] & ENCRYPTED) || (buf[3] & CONTINUATION) || (buf[3] & RESERVED)) { - DBG_FLT("binfmt_flat: unknown flags?\n"); + pr_debug("unknown flags?\n"); goto out_free_buf; } ret = 10; if (buf[3] & EXTRA_FIELD) { ret += 2 + buf[10] + (buf[11] << 8); - if (unlikely(LBUFSIZE <= ret)) { - DBG_FLT("binfmt_flat: buffer overflow (EXTRA)?\n"); + if (unlikely(ret >= LBUFSIZE)) { + pr_debug("buffer overflow (EXTRA)?\n"); goto out_free_buf; } } if (buf[3] & ORIG_NAME) { while (ret < LBUFSIZE && buf[ret++] != 0) ; - if (unlikely(LBUFSIZE == ret)) { - DBG_FLT("binfmt_flat: buffer overflow (ORIG_NAME)?\n"); + if (unlikely(ret == LBUFSIZE)) { + pr_debug("buffer overflow (ORIG_NAME)?\n"); goto out_free_buf; } } if (buf[3] & COMMENT) { while (ret < LBUFSIZE && buf[ret++] != 0) ; - if (unlikely(LBUFSIZE == ret)) { - DBG_FLT("binfmt_flat: buffer overflow (COMMENT)?\n"); + if (unlikely(ret == LBUFSIZE)) { + pr_debug("buffer overflow (COMMENT)?\n"); goto out_free_buf; } } @@ -273,7 +270,7 @@ static int decompress_exec( strm.total_out = 0; if (zlib_inflateInit2(&strm, -MAX_WBITS) != Z_OK) { - DBG_FLT("binfmt_flat: zlib init failed?\n"); + pr_debug("zlib init failed?\n"); goto out_free_buf; } @@ -290,7 +287,7 @@ static int decompress_exec( } if (ret < 0) { - DBG_FLT("binfmt_flat: decompression failed (%d), %s\n", + pr_debug("decompression failed (%d), %s\n", ret, strm.msg); goto out_zlib; } @@ -327,24 +324,23 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp) r &= 0x00ffffff; /* Trim ID off here */ } if (id >= MAX_SHARED_LIBS) { - printk("BINFMT_FLAT: reference 0x%x to shared library %d", - (unsigned) r, id); + pr_err("reference 0x%lx to shared library %d", r, id); goto failed; } if (curid != id) { if (internalp) { - printk("BINFMT_FLAT: reloc address 0x%x not in same module " - "(%d != %d)", (unsigned) r, curid, id); + pr_err("reloc address 0x%lx not in same module " + "(%d != %d)", r, curid, id); goto failed; - } else if ( ! p->lib_list[id].loaded && - load_flat_shared_library(id, p) < 0) { - printk("BINFMT_FLAT: failed to load library %d", id); + } else if (!p->lib_list[id].loaded && + load_flat_shared_library(id, p) < 0) { + pr_err("failed to load library %d", id); goto failed; } /* Check versioning information (i.e. time stamps) */ if (p->lib_list[id].build_date && p->lib_list[curid].build_date && p->lib_list[curid].build_date < p->lib_list[id].build_date) { - printk("BINFMT_FLAT: library %d is younger than %d", id, curid); + pr_err("library %d is younger than %d", id, curid); goto failed; } } @@ -358,8 +354,8 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp) text_len = p->lib_list[id].text_len; if (!flat_reloc_valid(r, start_brk - start_data + text_len)) { - printk("BINFMT_FLAT: reloc outside program 0x%x (0 - 0x%x/0x%x)", - (int) r,(int)(start_brk-start_data+text_len),(int)text_len); + pr_err("reloc outside program 0x%lx (0 - 0x%lx/0x%lx)", + r, start_brk-start_data+text_len, text_len); goto failed; } @@ -369,10 +365,10 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp) addr = r - text_len + start_data; /* Range checked already above so doing the range tests is redundant...*/ - return(addr); + return addr; failed: - printk(", killing %s!\n", current->comm); + pr_cont(", killing %s!\n", current->comm); send_sig(SIGSEGV, current, 0); return RELOC_FAILED; @@ -382,62 +378,57 @@ failed: static void old_reloc(unsigned long rl) { -#ifdef DEBUG - char *segment[] = { "TEXT", "DATA", "BSS", "*UNKNOWN*" }; -#endif + static const char *segment[] = { "TEXT", "DATA", "BSS", "*UNKNOWN*" }; flat_v2_reloc_t r; - unsigned long *ptr; - + unsigned long __user *ptr; + unsigned long val; + r.value = rl; #if defined(CONFIG_COLDFIRE) - ptr = (unsigned long *) (current->mm->start_code + r.reloc.offset); + ptr = (unsigned long __user *)(current->mm->start_code + r.reloc.offset); #else - ptr = (unsigned long *) (current->mm->start_data + r.reloc.offset); + ptr = (unsigned long __user *)(current->mm->start_data + r.reloc.offset); #endif + get_user(val, ptr); + + pr_debug("Relocation of variable at DATASEG+%x " + "(address %p, currently %lx) into segment %s\n", + r.reloc.offset, ptr, val, segment[r.reloc.type]); -#ifdef DEBUG - printk("Relocation of variable at DATASEG+%x " - "(address %p, currently %x) into segment %s\n", - r.reloc.offset, ptr, (int)*ptr, segment[r.reloc.type]); -#endif - switch (r.reloc.type) { case OLD_FLAT_RELOC_TYPE_TEXT: - *ptr += current->mm->start_code; + val += current->mm->start_code; break; case OLD_FLAT_RELOC_TYPE_DATA: - *ptr += current->mm->start_data; + val += current->mm->start_data; break; case OLD_FLAT_RELOC_TYPE_BSS: - *ptr += current->mm->end_data; + val += current->mm->end_data; break; default: - printk("BINFMT_FLAT: Unknown relocation type=%x\n", r.reloc.type); + pr_err("Unknown relocation type=%x\n", r.reloc.type); break; } + put_user(val, ptr); -#ifdef DEBUG - printk("Relocation became %x\n", (int)*ptr); -#endif -} + pr_debug("Relocation became %lx\n", val); +} /****************************************************************************/ -static int load_flat_file(struct linux_binprm * bprm, +static int load_flat_file(struct linux_binprm *bprm, struct lib_info *libinfo, int id, unsigned long *extra_stack) { - struct flat_hdr * hdr; - unsigned long textpos = 0, datapos = 0, result; - unsigned long realdatastart = 0; - unsigned long text_len, data_len, bss_len, stack_len, flags; - unsigned long full_data; - unsigned long len, memp = 0; - unsigned long memp_size, extra, rlim; - unsigned long *reloc = 0, *rp; + struct flat_hdr *hdr; + unsigned long textpos, datapos, realdatastart; + unsigned long text_len, data_len, bss_len, stack_len, full_data, flags; + unsigned long len, memp, memp_size, extra, rlim; + unsigned long __user *reloc, *rp; struct inode *inode; - int i, rev, relocs = 0; + int i, rev, relocs; loff_t fpos; unsigned long start_code, end_code; + ssize_t result; int ret; hdr = ((struct flat_hdr *) bprm->buf); /* exec-header */ @@ -469,20 +460,30 @@ static int load_flat_file(struct linux_binprm * bprm, } if (flags & FLAT_FLAG_KTRACE) - printk("BINFMT_FLAT: Loading file: %s\n", bprm->filename); + pr_info("Loading file: %s\n", bprm->filename); if (rev != FLAT_VERSION && rev != OLD_FLAT_VERSION) { - printk("BINFMT_FLAT: bad flat file version 0x%x (supported " - "0x%lx and 0x%lx)\n", - rev, FLAT_VERSION, OLD_FLAT_VERSION); + pr_err("bad flat file version 0x%x (supported 0x%lx and 0x%lx)\n", + rev, FLAT_VERSION, OLD_FLAT_VERSION); ret = -ENOEXEC; goto err; } - + /* Don't allow old format executables to use shared libraries */ if (rev == OLD_FLAT_VERSION && id != 0) { - printk("BINFMT_FLAT: shared libraries are not available before rev 0x%x\n", - (int) FLAT_VERSION); + pr_err("shared libraries are not available before rev 0x%lx\n", + FLAT_VERSION); + ret = -ENOEXEC; + goto err; + } + + /* + * Make sure the header params are sane. + * 28 bits (256 MB) is way more than reasonable in this case. + * If some top bits are set we have probable binary corruption. + */ + if ((text_len | data_len | bss_len | stack_len | full_data) >> 28) { + pr_err("bad header\n"); ret = -ENOEXEC; goto err; } @@ -496,7 +497,7 @@ static int load_flat_file(struct linux_binprm * bprm, #ifndef CONFIG_BINFMT_ZFLAT if (flags & (FLAT_FLAG_GZIP|FLAT_FLAG_GZDATA)) { - printk("Support for ZFLAT executables is not enabled.\n"); + pr_err("Support for ZFLAT executables is not enabled.\n"); ret = -ENOEXEC; goto err; } @@ -517,11 +518,9 @@ static int load_flat_file(struct linux_binprm * bprm, /* Flush all traces of the currently running executable */ if (id == 0) { - result = flush_old_exec(bprm); - if (result) { - ret = result; + ret = flush_old_exec(bprm); + if (ret) goto err; - } /* OK, This is the point of no return */ set_personality(PER_LINUX_32BIT); @@ -539,48 +538,48 @@ static int load_flat_file(struct linux_binprm * bprm, * case, and then the fully copied to RAM case which lumps * it all together. */ - if ((flags & (FLAT_FLAG_RAM|FLAT_FLAG_GZIP)) == 0) { + if (!IS_ENABLED(CONFIG_MMU) && !(flags & (FLAT_FLAG_RAM|FLAT_FLAG_GZIP))) { /* * this should give us a ROM ptr, but if it doesn't we don't * really care */ - DBG_FLT("BINFMT_FLAT: ROM mapping of file (we hope)\n"); + pr_debug("ROM mapping of file (we hope)\n"); textpos = vm_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_EXECUTABLE, 0); if (!textpos || IS_ERR_VALUE(textpos)) { - if (!textpos) - textpos = (unsigned long) -ENOMEM; - printk("Unable to mmap process text, errno %d\n", (int)-textpos); ret = textpos; + if (!textpos) + ret = -ENOMEM; + pr_err("Unable to mmap process text, errno %d\n", ret); goto err; } len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); len = PAGE_ALIGN(len); - realdatastart = vm_mmap(0, 0, len, + realdatastart = vm_mmap(NULL, 0, len, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); if (realdatastart == 0 || IS_ERR_VALUE(realdatastart)) { + ret = realdatastart; if (!realdatastart) - realdatastart = (unsigned long) -ENOMEM; - printk("Unable to allocate RAM for process data, errno %d\n", - (int)-realdatastart); + ret = -ENOMEM; + pr_err("Unable to allocate RAM for process data, " + "errno %d\n", ret); vm_munmap(textpos, text_len); - ret = realdatastart; goto err; } datapos = ALIGN(realdatastart + MAX_SHARED_LIBS * sizeof(unsigned long), FLAT_DATA_ALIGN); - DBG_FLT("BINFMT_FLAT: Allocated data+bss+stack (%d bytes): %x\n", - (int)(data_len + bss_len + stack_len), (int)datapos); + pr_debug("Allocated data+bss+stack (%ld bytes): %lx\n", + data_len + bss_len + stack_len, datapos); fpos = ntohl(hdr->data_start); #ifdef CONFIG_BINFMT_ZFLAT if (flags & FLAT_FLAG_GZDATA) { - result = decompress_exec(bprm, fpos, (char *) datapos, + result = decompress_exec(bprm, fpos, (char *)datapos, full_data, 0); } else #endif @@ -589,29 +588,30 @@ static int load_flat_file(struct linux_binprm * bprm, full_data); } if (IS_ERR_VALUE(result)) { - printk("Unable to read data+bss, errno %d\n", (int)-result); + ret = result; + pr_err("Unable to read data+bss, errno %d\n", ret); vm_munmap(textpos, text_len); vm_munmap(realdatastart, len); - ret = result; goto err; } - reloc = (unsigned long *) (datapos+(ntohl(hdr->reloc_start)-text_len)); + reloc = (unsigned long __user *) + (datapos + (ntohl(hdr->reloc_start) - text_len)); memp = realdatastart; memp_size = len; } else { len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); len = PAGE_ALIGN(len); - textpos = vm_mmap(0, 0, len, + textpos = vm_mmap(NULL, 0, len, PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); if (!textpos || IS_ERR_VALUE(textpos)) { - if (!textpos) - textpos = (unsigned long) -ENOMEM; - printk("Unable to allocate RAM for process text/data, errno %d\n", - (int)-textpos); ret = textpos; + if (!textpos) + ret = -ENOMEM; + pr_err("Unable to allocate RAM for process text/data, " + "errno %d\n", ret); goto err; } @@ -620,7 +620,7 @@ static int load_flat_file(struct linux_binprm * bprm, MAX_SHARED_LIBS * sizeof(unsigned long), FLAT_DATA_ALIGN); - reloc = (unsigned long *) + reloc = (unsigned long __user *) (datapos + (ntohl(hdr->reloc_start) - text_len)); memp = textpos; memp_size = len; @@ -629,21 +629,59 @@ static int load_flat_file(struct linux_binprm * bprm, * load it all in and treat it like a RAM load from now on */ if (flags & FLAT_FLAG_GZIP) { - result = decompress_exec(bprm, sizeof (struct flat_hdr), - (((char *) textpos) + sizeof (struct flat_hdr)), +#ifndef CONFIG_MMU + result = decompress_exec(bprm, sizeof(struct flat_hdr), + (((char *)textpos) + sizeof(struct flat_hdr)), (text_len + full_data - - sizeof (struct flat_hdr)), + - sizeof(struct flat_hdr)), 0); memmove((void *) datapos, (void *) realdatastart, full_data); +#else + /* + * This is used on MMU systems mainly for testing. + * Let's use a kernel buffer to simplify things. + */ + long unz_text_len = text_len - sizeof(struct flat_hdr); + long unz_len = unz_text_len + full_data; + char *unz_data = vmalloc(unz_len); + if (!unz_data) { + result = -ENOMEM; + } else { + result = decompress_exec(bprm, sizeof(struct flat_hdr), + unz_data, unz_len, 0); + if (result == 0 && + (copy_to_user((void __user *)textpos + sizeof(struct flat_hdr), + unz_data, unz_text_len) || + copy_to_user((void __user *)datapos, + unz_data + unz_text_len, full_data))) + result = -EFAULT; + vfree(unz_data); + } +#endif } else if (flags & FLAT_FLAG_GZDATA) { result = read_code(bprm->file, textpos, 0, text_len); - if (!IS_ERR_VALUE(result)) + if (!IS_ERR_VALUE(result)) { +#ifndef CONFIG_MMU result = decompress_exec(bprm, text_len, (char *) datapos, full_data, 0); - } - else +#else + char *unz_data = vmalloc(full_data); + if (!unz_data) { + result = -ENOMEM; + } else { + result = decompress_exec(bprm, text_len, + unz_data, full_data, 0); + if (result == 0 && + copy_to_user((void __user *)datapos, + unz_data, full_data)) + result = -EFAULT; + vfree(unz_data); + } #endif + } + } else +#endif /* CONFIG_BINFMT_ZFLAT */ { result = read_code(bprm->file, textpos, 0, text_len); if (!IS_ERR_VALUE(result)) @@ -652,21 +690,19 @@ static int load_flat_file(struct linux_binprm * bprm, full_data); } if (IS_ERR_VALUE(result)) { - printk("Unable to read code+data+bss, errno %d\n",(int)-result); + ret = result; + pr_err("Unable to read code+data+bss, errno %d\n", ret); vm_munmap(textpos, text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long)); - ret = result; goto err; } } - if (flags & FLAT_FLAG_KTRACE) - printk("Mapping is %x, Entry point is %x, data_start is %x\n", - (int)textpos, 0x00ffffff&ntohl(hdr->entry), ntohl(hdr->data_start)); + start_code = textpos + sizeof(struct flat_hdr); + end_code = textpos + text_len; + text_len -= sizeof(struct flat_hdr); /* the real code len */ /* The main program needs a little extra setup in the task structure */ - start_code = textpos + sizeof (struct flat_hdr); - end_code = textpos + text_len; if (id == 0) { current->mm->start_code = start_code; current->mm->end_code = end_code; @@ -681,19 +717,19 @@ static int load_flat_file(struct linux_binprm * bprm, */ current->mm->start_brk = datapos + data_len + bss_len; current->mm->brk = (current->mm->start_brk + 3) & ~3; +#ifndef CONFIG_MMU current->mm->context.end_brk = memp + memp_size - stack_len; +#endif } - if (flags & FLAT_FLAG_KTRACE) - printk("%s %s: TEXT=%x-%x DATA=%x-%x BSS=%x-%x\n", + if (flags & FLAT_FLAG_KTRACE) { + pr_info("Mapping is %lx, Entry point is %x, data_start is %x\n", + textpos, 0x00ffffff&ntohl(hdr->entry), ntohl(hdr->data_start)); + pr_info("%s %s: TEXT=%lx-%lx DATA=%lx-%lx BSS=%lx-%lx\n", id ? "Lib" : "Load", bprm->filename, - (int) start_code, (int) end_code, - (int) datapos, - (int) (datapos + data_len), - (int) (datapos + data_len), - (int) (((datapos + data_len + bss_len) + 3) & ~3)); - - text_len -= sizeof(struct flat_hdr); /* the real code len */ + start_code, end_code, datapos, datapos + data_len, + datapos + data_len, (datapos + data_len + bss_len + 3) & ~3); + } /* Store the current module values into the global library structure */ libinfo->lib_list[id].start_code = start_code; @@ -703,7 +739,7 @@ static int load_flat_file(struct linux_binprm * bprm, libinfo->lib_list[id].loaded = 1; libinfo->lib_list[id].entry = (0x00ffffff & ntohl(hdr->entry)) + textpos; libinfo->lib_list[id].build_date = ntohl(hdr->build_date); - + /* * We just load the allocations into some temporary memory to * help simplify all this mumbo jumbo @@ -717,15 +753,20 @@ static int load_flat_file(struct linux_binprm * bprm, * image. */ if (flags & FLAT_FLAG_GOTPIC) { - for (rp = (unsigned long *)datapos; *rp != 0xffffffff; rp++) { - unsigned long addr; - if (*rp) { - addr = calc_reloc(*rp, libinfo, id, 0); + for (rp = (unsigned long __user *)datapos; ; rp++) { + unsigned long addr, rp_val; + if (get_user(rp_val, rp)) + return -EFAULT; + if (rp_val == 0xffffffff) + break; + if (rp_val) { + addr = calc_reloc(rp_val, libinfo, id, 0); if (addr == RELOC_FAILED) { ret = -ENOEXEC; goto err; } - *rp = addr; + if (put_user(addr, rp)) + return -EFAULT; } } } @@ -742,19 +783,23 @@ static int load_flat_file(struct linux_binprm * bprm, * __start to address 4 so that is okay). */ if (rev > OLD_FLAT_VERSION) { - unsigned long persistent = 0; - for (i=0; i < relocs; i++) { + unsigned long __maybe_unused persistent = 0; + for (i = 0; i < relocs; i++) { unsigned long addr, relval; - /* Get the address of the pointer to be - relocated (of course, the address has to be - relocated first). */ - relval = ntohl(reloc[i]); - if (flat_set_persistent (relval, &persistent)) + /* + * Get the address of the pointer to be + * relocated (of course, the address has to be + * relocated first). + */ + if (get_user(relval, reloc + i)) + return -EFAULT; + relval = ntohl(relval); + if (flat_set_persistent(relval, &persistent)) continue; addr = flat_get_relocate_addr(relval); - rp = (unsigned long *) calc_reloc(addr, libinfo, id, 1); - if (rp == (unsigned long *)RELOC_FAILED) { + rp = (unsigned long __user *)calc_reloc(addr, libinfo, id, 1); + if (rp == (unsigned long __user *)RELOC_FAILED) { ret = -ENOEXEC; goto err; } @@ -780,17 +825,23 @@ static int load_flat_file(struct linux_binprm * bprm, } } } else { - for (i=0; i < relocs; i++) - old_reloc(ntohl(reloc[i])); + for (i = 0; i < relocs; i++) { + unsigned long relval; + if (get_user(relval, reloc + i)) + return -EFAULT; + relval = ntohl(relval); + old_reloc(relval); + } } - + flush_icache_range(start_code, end_code); /* zero the BSS, BRK and stack areas */ - memset((void*)(datapos + data_len), 0, bss_len + - (memp + memp_size - stack_len - /* end brk */ - libinfo->lib_list[id].start_brk) + /* start brk */ - stack_len); + if (clear_user((void __user *)(datapos + data_len), bss_len + + (memp + memp_size - stack_len - /* end brk */ + libinfo->lib_list[id].start_brk) + /* start brk */ + stack_len)) + return -EFAULT; return 0; err: @@ -846,7 +897,7 @@ out: allow_write_access(bprm.file); fput(bprm.file); - return(res); + return res; } #endif /* CONFIG_BINFMT_SHARED_FLAT */ @@ -857,18 +908,17 @@ out: * libraries. There is no binary dependent code anywhere else. */ -static int load_flat_binary(struct linux_binprm * bprm) +static int load_flat_binary(struct linux_binprm *bprm) { struct lib_info libinfo; struct pt_regs *regs = current_pt_regs(); - unsigned long p = bprm->p; - unsigned long stack_len; + unsigned long stack_len = 0; unsigned long start_addr; - unsigned long *sp; int res; int i, j; memset(&libinfo, 0, sizeof(libinfo)); + /* * We have to add the size of our arguments to our stack size * otherwise it's too easy for users to create stack overflows @@ -876,38 +926,54 @@ static int load_flat_binary(struct linux_binprm * bprm) * pedantic and include space for the argv/envp array as it may have * a lot of entries. */ -#define TOP_OF_ARGS (PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *)) - stack_len = TOP_OF_ARGS - bprm->p; /* the strings */ - stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */ - stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */ - stack_len += FLAT_STACK_ALIGN - 1; /* reserve for upcoming alignment */ - +#ifndef CONFIG_MMU + stack_len += PAGE_SIZE * MAX_ARG_PAGES - bprm->p; /* the strings */ +#endif + stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */ + stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */ + stack_len = ALIGN(stack_len, FLAT_STACK_ALIGN); + res = load_flat_file(bprm, &libinfo, 0, &stack_len); if (res < 0) return res; - + /* Update data segment pointers for all libraries */ - for (i=0; i<MAX_SHARED_LIBS; i++) - if (libinfo.lib_list[i].loaded) - for (j=0; j<MAX_SHARED_LIBS; j++) - (-(j+1))[(unsigned long *)(libinfo.lib_list[i].start_data)] = - (libinfo.lib_list[j].loaded)? - libinfo.lib_list[j].start_data:UNLOADED_LIB; + for (i = 0; i < MAX_SHARED_LIBS; i++) { + if (!libinfo.lib_list[i].loaded) + continue; + for (j = 0; j < MAX_SHARED_LIBS; j++) { + unsigned long val = libinfo.lib_list[j].loaded ? + libinfo.lib_list[j].start_data : UNLOADED_LIB; + unsigned long __user *p = (unsigned long __user *) + libinfo.lib_list[i].start_data; + p -= j + 1; + if (put_user(val, p)) + return -EFAULT; + } + } install_exec_creds(bprm); set_binfmt(&flat_format); - p = ((current->mm->context.end_brk + stack_len + 3) & ~3) - 4; - DBG_FLT("p=%x\n", (int)p); +#ifdef CONFIG_MMU + res = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); + if (!res) + res = create_flat_tables(bprm, bprm->p); +#else + /* Stash our initial stack pointer into the mm structure */ + current->mm->start_stack = + ((current->mm->context.end_brk + stack_len + 3) & ~3) - 4; + pr_debug("sp=%lx\n", current->mm->start_stack); - /* copy the arg pages onto the stack, this could be more efficient :-) */ - for (i = TOP_OF_ARGS - 1; i >= bprm->p; i--) - * (char *) --p = - ((char *) page_address(bprm->page[i/PAGE_SIZE]))[i % PAGE_SIZE]; + /* copy the arg pages onto the stack */ + res = transfer_args_to_stack(bprm, ¤t->mm->start_stack); + if (!res) + res = create_flat_tables(bprm, current->mm->start_stack); +#endif + if (res) + return res; - sp = (unsigned long *) create_flat_tables(p, bprm); - /* Fake some return addresses to ensure the call chain will * initialise library in order for us. We are required to call * lib 1 first, then 2, ... and finally the main program (id 0). @@ -915,24 +981,24 @@ static int load_flat_binary(struct linux_binprm * bprm) start_addr = libinfo.lib_list[0].entry; #ifdef CONFIG_BINFMT_SHARED_FLAT - for (i = MAX_SHARED_LIBS-1; i>0; i--) { + for (i = MAX_SHARED_LIBS-1; i > 0; i--) { if (libinfo.lib_list[i].loaded) { /* Push previos first to call address */ - --sp; put_user(start_addr, sp); + unsigned long __user *sp; + current->mm->start_stack -= sizeof(unsigned long); + sp = (unsigned long __user *)current->mm->start_stack; + __put_user(start_addr, sp); start_addr = libinfo.lib_list[i].entry; } } #endif - - /* Stash our initial stack pointer into the mm structure */ - current->mm->start_stack = (unsigned long )sp; #ifdef FLAT_PLAT_INIT FLAT_PLAT_INIT(regs); #endif - DBG_FLT("start_thread(regs=0x%x, entry=0x%x, start_stack=0x%x)\n", - (int)regs, (int)start_addr, (int)current->mm->start_stack); - + + pr_debug("start_thread(regs=0x%p, entry=0x%lx, start_stack=0x%lx)\n", + regs, start_addr, current->mm->start_stack); start_thread(regs, start_addr, current->mm->start_stack); return 0; @@ -945,9 +1011,6 @@ static int __init init_flat_binfmt(void) register_binfmt(&flat_format); return 0; } - -/****************************************************************************/ - core_initcall(init_flat_binfmt); /****************************************************************************/ diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 3a3ced779fc7..5417516f6e59 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -637,13 +637,12 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, break; case 3: /* Delete this handler. */ - root = dget(file->f_path.dentry->d_sb->s_root); + root = file_inode(file)->i_sb->s_root; inode_lock(d_inode(root)); kill_node(e); inode_unlock(d_inode(root)); - dput(root); break; default: return res; @@ -665,8 +664,8 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, { Node *e; struct inode *inode; - struct dentry *root, *dentry; - struct super_block *sb = file->f_path.dentry->d_sb; + struct super_block *sb = file_inode(file)->i_sb; + struct dentry *root = sb->s_root, *dentry; int err = 0; e = create_entry(buffer, count); @@ -674,7 +673,6 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, if (IS_ERR(e)) return PTR_ERR(e); - root = dget(sb->s_root); inode_lock(d_inode(root)); dentry = lookup_one_len(e->name, root, strlen(e->name)); err = PTR_ERR(dentry); @@ -712,7 +710,6 @@ out2: dput(dentry); out: inode_unlock(d_inode(root)); - dput(root); if (err) { kfree(e); @@ -753,14 +750,13 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer, break; case 3: /* Delete all handlers. */ - root = dget(file->f_path.dentry->d_sb->s_root); + root = file_inode(file)->i_sb->s_root; inode_lock(d_inode(root)); while (!list_empty(&entries)) kill_node(list_entry(entries.next, Node, list)); inode_unlock(d_inode(root)); - dput(root); break; default: return res; diff --git a/fs/block_dev.c b/fs/block_dev.c index d012be4ab977..d402899ba135 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -416,7 +416,8 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, result = blk_queue_enter(bdev->bd_queue, false); if (result) return result; - result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ); + result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, + REQ_OP_READ); blk_queue_exit(bdev->bd_queue); return result; } @@ -445,7 +446,6 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, struct page *page, struct writeback_control *wbc) { int result; - int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE; const struct block_device_operations *ops = bdev->bd_disk->fops; if (!ops->rw_page || bdev_get_integrity(bdev)) @@ -455,7 +455,8 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, return result; set_page_writeback(page); - result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw); + result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, + REQ_OP_WRITE); if (result) end_page_writeback(page); else @@ -614,7 +615,6 @@ static void init_once(void *foo) memset(bdev, 0, sizeof(*bdev)); mutex_init(&bdev->bd_mutex); - INIT_LIST_HEAD(&bdev->bd_inodes); INIT_LIST_HEAD(&bdev->bd_list); #ifdef CONFIG_SYSFS INIT_LIST_HEAD(&bdev->bd_holder_disks); @@ -624,24 +624,13 @@ static void init_once(void *foo) mutex_init(&bdev->bd_fsfreeze_mutex); } -static inline void __bd_forget(struct inode *inode) -{ - list_del_init(&inode->i_devices); - inode->i_bdev = NULL; - inode->i_mapping = &inode->i_data; -} - static void bdev_evict_inode(struct inode *inode) { struct block_device *bdev = &BDEV_I(inode)->bdev; - struct list_head *p; truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); /* is it needed here? */ clear_inode(inode); spin_lock(&bdev_lock); - while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) { - __bd_forget(list_entry(p, struct inode, i_devices)); - } list_del_init(&bdev->bd_list); spin_unlock(&bdev_lock); } @@ -805,7 +794,6 @@ static struct block_device *bd_acquire(struct inode *inode) bdgrab(bdev); inode->i_bdev = bdev; inode->i_mapping = bdev->bd_inode->i_mapping; - list_add(&inode->i_devices, &bdev->bd_inodes); } spin_unlock(&bdev_lock); } @@ -821,7 +809,8 @@ void bd_forget(struct inode *inode) spin_lock(&bdev_lock); if (!sb_is_blkdev_sb(inode->i_sb)) bdev = inode->i_bdev; - __bd_forget(inode); + inode->i_bdev = NULL; + inode->i_mapping = &inode->i_data; spin_unlock(&bdev_lock); if (bdev) @@ -1287,11 +1276,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = disk; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; - if (IS_ENABLED(CONFIG_BLK_DEV_DAX) && - blk_queue_dax(disk->queue)) - bdev->bd_inode->i_flags = S_DAX; - else - bdev->bd_inode->i_flags = 0; + bdev->bd_inode->i_flags = 0; if (!partno) { ret = -ENXIO; @@ -1858,7 +1843,7 @@ struct block_device *lookup_bdev(const char *pathname) if (!S_ISBLK(inode->i_mode)) goto fail; error = -EACCES; - if (path.mnt->mnt_flags & MNT_NODEV) + if (!may_open_dev(&path)) goto fail; error = -ENOMEM; bdev = bd_acquire(inode); diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 67a607709d4f..53bb7af4e5f0 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -55,8 +55,7 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) } if (size > 0) { acl = posix_acl_from_xattr(&init_user_ns, value, size); - } else if (size == -ENOENT || size == -ENODATA || size == 0) { - /* FIXME, who returns -ENOENT? I think nobody */ + } else if (size == -ERANGE || size == -ENODATA || size == 0) { acl = NULL; } else { acl = ERR_PTR(-EIO); diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 5fb60ea7eee2..e0f071f6b5a7 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -34,6 +34,10 @@ struct __btrfs_workqueue { struct workqueue_struct *normal_wq; + + /* File system this workqueue services */ + struct btrfs_fs_info *fs_info; + /* List head pointing to ordered work list */ struct list_head ordered_list; @@ -70,6 +74,18 @@ void btrfs_##name(struct work_struct *arg) \ normal_work_helper(work); \ } +struct btrfs_fs_info * +btrfs_workqueue_owner(struct __btrfs_workqueue *wq) +{ + return wq->fs_info; +} + +struct btrfs_fs_info * +btrfs_work_owner(struct btrfs_work *work) +{ + return work->wq->fs_info; +} + BTRFS_WORK_HELPER(worker_helper); BTRFS_WORK_HELPER(delalloc_helper); BTRFS_WORK_HELPER(flush_delalloc_helper); @@ -94,14 +110,15 @@ BTRFS_WORK_HELPER(scrubnc_helper); BTRFS_WORK_HELPER(scrubparity_helper); static struct __btrfs_workqueue * -__btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active, - int thresh) +__btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name, + unsigned int flags, int limit_active, int thresh) { struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); if (!ret) return NULL; + ret->fs_info = fs_info; ret->limit_active = limit_active; atomic_set(&ret->pending, 0); if (thresh == 0) @@ -143,7 +160,8 @@ __btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active, static inline void __btrfs_destroy_workqueue(struct __btrfs_workqueue *wq); -struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, +struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, + const char *name, unsigned int flags, int limit_active, int thresh) @@ -153,7 +171,8 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, if (!ret) return NULL; - ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI, + ret->normal = __btrfs_alloc_workqueue(fs_info, name, + flags & ~WQ_HIGHPRI, limit_active, thresh); if (!ret->normal) { kfree(ret); @@ -161,8 +180,8 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, } if (flags & WQ_HIGHPRI) { - ret->high = __btrfs_alloc_workqueue(name, flags, limit_active, - thresh); + ret->high = __btrfs_alloc_workqueue(fs_info, name, flags, + limit_active, thresh); if (!ret->high) { __btrfs_destroy_workqueue(ret->normal); kfree(ret); diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index ad4d0647d1a6..8e52484cd461 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -21,6 +21,7 @@ #define __BTRFS_ASYNC_THREAD_ #include <linux/workqueue.h> +struct btrfs_fs_info; struct btrfs_workqueue; /* Internal use only */ struct __btrfs_workqueue; @@ -67,7 +68,8 @@ BTRFS_WORK_HELPER_PROTO(scrubnc_helper); BTRFS_WORK_HELPER_PROTO(scrubparity_helper); -struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, +struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, + const char *name, unsigned int flags, int limit_active, int thresh); @@ -80,4 +82,6 @@ void btrfs_queue_work(struct btrfs_workqueue *wq, void btrfs_destroy_workqueue(struct btrfs_workqueue *wq); void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max); void btrfs_set_work_high_priority(struct btrfs_work *work); +struct btrfs_fs_info *btrfs_work_owner(struct btrfs_work *work); +struct btrfs_fs_info *btrfs_workqueue_owner(struct __btrfs_workqueue *wq); #endif diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 8bb3509099e8..2b88439c2ee8 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -139,7 +139,7 @@ int __init btrfs_prelim_ref_init(void) btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref", sizeof(struct __prelim_ref), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD, NULL); if (!btrfs_prelim_ref_cache) return -ENOMEM; @@ -361,7 +361,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; } - if (btrfs_test_is_dummy_root(root)) { + if (btrfs_is_testing(fs_info)) { srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -ENOENT; goto out; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index cefedabf0a92..029db6e1105c 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -403,7 +403,10 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, } ret = btrfs_map_bio(root, bio, 0, 1); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + bio->bi_error = ret; + bio_endio(bio); + } bio_put(bio); @@ -434,7 +437,10 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, } ret = btrfs_map_bio(root, bio, 0, 1); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + bio->bi_error = ret; + bio_endio(bio); + } bio_put(bio); return 0; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a85cf7d23309..d1c56c94dd5a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1153,14 +1153,14 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { ret = btrfs_reloc_cow_block(trans, root, buf, cow); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } } @@ -1198,7 +1198,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (last_ref) { ret = tree_mod_log_free_eb(root->fs_info, buf); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } } @@ -1505,7 +1505,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return 0; /* ensure we can see the force_cow */ @@ -1771,6 +1771,14 @@ static noinline int generic_bin_search(struct extent_buffer *eb, unsigned long map_len = 0; int err; + if (low > high) { + btrfs_err(eb->fs_info, + "%s: low (%d) > high (%d) eb %llu owner %llu level %d", + __func__, low, high, eb->start, + btrfs_header_owner(eb), btrfs_header_level(eb)); + return -EINVAL; + } + while (low < high) { mid = (low + high) / 2; offset = p + mid * item_size; @@ -1858,7 +1866,6 @@ static void root_sub_used(struct btrfs_root *root, u32 size) /* given a node and slot number, this reads the blocks it points to. The * extent buffer is returned with a reference taken (but unlocked). - * NULL is returned on error. */ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, struct extent_buffer *parent, int slot) @@ -1866,19 +1873,16 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, int level = btrfs_header_level(parent); struct extent_buffer *eb; - if (slot < 0) - return NULL; - if (slot >= btrfs_header_nritems(parent)) - return NULL; + if (slot < 0 || slot >= btrfs_header_nritems(parent)) + return ERR_PTR(-ENOENT); BUG_ON(level == 0); eb = read_tree_block(root, btrfs_node_blockptr(parent, slot), btrfs_node_ptr_generation(parent, slot)); - if (IS_ERR(eb) || !extent_buffer_uptodate(eb)) { - if (!IS_ERR(eb)) - free_extent_buffer(eb); - eb = NULL; + if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + eb = ERR_PTR(-EIO); } return eb; @@ -1931,8 +1935,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* promote the child to a root */ child = read_node_slot(root, mid, 0); - if (!child) { - ret = -EROFS; + if (IS_ERR(child)) { + ret = PTR_ERR(child); btrfs_handle_fs_error(root->fs_info, ret, NULL); goto enospc; } @@ -1970,6 +1974,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, return 0; left = read_node_slot(root, parent, pslot - 1); + if (IS_ERR(left)) + left = NULL; + if (left) { btrfs_tree_lock(left); btrfs_set_lock_blocking(left); @@ -1980,7 +1987,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto enospc; } } + right = read_node_slot(root, parent, pslot + 1); + if (IS_ERR(right)) + right = NULL; + if (right) { btrfs_tree_lock(right); btrfs_set_lock_blocking(right); @@ -2135,6 +2146,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, return 1; left = read_node_slot(root, parent, pslot - 1); + if (IS_ERR(left)) + left = NULL; /* first, try to make some room in the middle buffer */ if (left) { @@ -2185,6 +2198,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, free_extent_buffer(left); } right = read_node_slot(root, parent, pslot + 1); + if (IS_ERR(right)) + right = NULL; /* * then try to empty the right most buffer into the middle @@ -3240,7 +3255,7 @@ static int push_node_left(struct btrfs_trans_handle *trans, ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, push_items); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } copy_extent_buffer(dst, src, @@ -3315,7 +3330,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0, src_nritems - push_items, push_items); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } copy_extent_buffer(dst, src, @@ -3519,7 +3534,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } copy_extent_buffer(split, c, @@ -3773,7 +3788,11 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); right = read_node_slot(root, upper, slot + 1); - if (right == NULL) + /* + * slot + 1 is not valid or we fail to read the right node, + * no big deal, just return. + */ + if (IS_ERR(right)) return 1; btrfs_tree_lock(right); @@ -4003,7 +4022,11 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); left = read_node_slot(root, path->nodes[1], slot - 1); - if (left == NULL) + /* + * slot - 1 is not valid or we fail to read the left node, + * no big deal, just return. + */ + if (IS_ERR(left)) return 1; btrfs_tree_lock(left); @@ -5210,7 +5233,10 @@ find_next_key: } btrfs_set_path_blocking(path); cur = read_node_slot(root, cur, slot); - BUG_ON(!cur); /* -ENOMEM */ + if (IS_ERR(cur)) { + ret = PTR_ERR(cur); + goto out; + } btrfs_tree_read_lock(cur); @@ -5229,15 +5255,21 @@ out: return ret; } -static void tree_move_down(struct btrfs_root *root, +static int tree_move_down(struct btrfs_root *root, struct btrfs_path *path, int *level, int root_level) { + struct extent_buffer *eb; + BUG_ON(*level == 0); - path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level], - path->slots[*level]); + eb = read_node_slot(root, path->nodes[*level], path->slots[*level]); + if (IS_ERR(eb)) + return PTR_ERR(eb); + + path->nodes[*level - 1] = eb; path->slots[*level - 1] = 0; (*level)--; + return 0; } static int tree_move_next_or_upnext(struct btrfs_root *root, @@ -5282,8 +5314,7 @@ static int tree_advance(struct btrfs_root *root, if (*level == 0 || !allow_down) { ret = tree_move_next_or_upnext(root, path, level, root_level); } else { - tree_move_down(root, path, level, root_level); - ret = 0; + ret = tree_move_down(root, path, level, root_level); } if (ret >= 0) { if (*level == 0) @@ -5457,8 +5488,10 @@ int btrfs_compare_trees(struct btrfs_root *left_root, left_root_level, advance_left != ADVANCE_ONLY_NEXT, &left_key); - if (ret < 0) + if (ret == -1) left_end_reached = ADVANCE; + else if (ret < 0) + goto out; advance_left = 0; } if (advance_right && !right_end_reached) { @@ -5466,8 +5499,10 @@ int btrfs_compare_trees(struct btrfs_root *left_root, right_root_level, advance_right != ADVANCE_ONLY_NEXT, &right_key); - if (ret < 0) + if (ret == -1) right_end_reached = ADVANCE; + else if (ret < 0) + goto out; advance_right = 0; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b2620d1f883f..2fe8f89091a3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -117,6 +117,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) #define BTRFS_FS_STATE_REMOUNTING 1 #define BTRFS_FS_STATE_TRANS_ABORTED 2 #define BTRFS_FS_STATE_DEV_REPLACING 3 +#define BTRFS_FS_STATE_DUMMY_FS_INFO 4 #define BTRFS_BACKREF_REV_MAX 256 #define BTRFS_BACKREF_REV_SHIFT 56 @@ -144,21 +145,6 @@ struct btrfs_header { u8 level; } __attribute__ ((__packed__)); -#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \ - sizeof(struct btrfs_header)) / \ - sizeof(struct btrfs_key_ptr)) -#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header)) -#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->nodesize)) -#define BTRFS_FILE_EXTENT_INLINE_DATA_START \ - (offsetof(struct btrfs_file_extent_item, disk_bytenr)) -#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ - sizeof(struct btrfs_item) - \ - BTRFS_FILE_EXTENT_INLINE_DATA_START) -#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ - sizeof(struct btrfs_item) -\ - sizeof(struct btrfs_dir_item)) - - /* * this is a very generous portion of the super block, giving us * room to translate 14 chunks with 3 stripes each. @@ -439,6 +425,8 @@ struct btrfs_space_info { struct list_head list; /* Protected by the spinlock 'lock'. */ struct list_head ro_bgs; + struct list_head priority_tickets; + struct list_head tickets; struct rw_semaphore groups_sem; /* for block groups in our same type */ @@ -1112,12 +1100,11 @@ struct btrfs_subvolume_writers { #define BTRFS_ROOT_REF_COWS 1 #define BTRFS_ROOT_TRACK_DIRTY 2 #define BTRFS_ROOT_IN_RADIX 3 -#define BTRFS_ROOT_DUMMY_ROOT 4 -#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 5 -#define BTRFS_ROOT_DEFRAG_RUNNING 6 -#define BTRFS_ROOT_FORCE_COW 7 -#define BTRFS_ROOT_MULTI_LOG_TASKS 8 -#define BTRFS_ROOT_DIRTY 9 +#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 4 +#define BTRFS_ROOT_DEFRAG_RUNNING 5 +#define BTRFS_ROOT_FORCE_COW 6 +#define BTRFS_ROOT_MULTI_LOG_TASKS 7 +#define BTRFS_ROOT_DIRTY 8 /* * in ram representation of the tree. extent_root is used for all allocations @@ -1179,8 +1166,10 @@ struct btrfs_root { u64 highest_objectid; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* only used with CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */ u64 alloc_bytenr; +#endif u64 defrag_trans_start; struct btrfs_key defrag_progress; @@ -1257,6 +1246,39 @@ struct btrfs_root { atomic_t qgroup_meta_rsv; }; +static inline u32 __BTRFS_LEAF_DATA_SIZE(u32 blocksize) +{ + return blocksize - sizeof(struct btrfs_header); +} + +static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_root *root) +{ + return __BTRFS_LEAF_DATA_SIZE(root->nodesize); +} + +static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_root *root) +{ + return BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); +} + +static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_root *root) +{ + return BTRFS_LEAF_DATA_SIZE(root) / sizeof(struct btrfs_key_ptr); +} + +#define BTRFS_FILE_EXTENT_INLINE_DATA_START \ + (offsetof(struct btrfs_file_extent_item, disk_bytenr)) +static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_root *root) +{ + return BTRFS_MAX_ITEM_SIZE(root) - + BTRFS_FILE_EXTENT_INLINE_DATA_START; +} + +static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_root *root) +{ + return BTRFS_MAX_ITEM_SIZE(root) - sizeof(struct btrfs_dir_item); +} + /* * Flags for mount options. * @@ -1297,21 +1319,21 @@ struct btrfs_root { #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) #define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) -#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ +#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \ BTRFS_MOUNT_##opt) -#define btrfs_set_and_info(root, opt, fmt, args...) \ +#define btrfs_set_and_info(fs_info, opt, fmt, args...) \ { \ - if (!btrfs_test_opt(root, opt)) \ - btrfs_info(root->fs_info, fmt, ##args); \ - btrfs_set_opt(root->fs_info->mount_opt, opt); \ + if (!btrfs_test_opt(fs_info, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ + btrfs_set_opt(fs_info->mount_opt, opt); \ } -#define btrfs_clear_and_info(root, opt, fmt, args...) \ +#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \ { \ - if (btrfs_test_opt(root, opt)) \ - btrfs_info(root->fs_info, fmt, ##args); \ - btrfs_clear_opt(root->fs_info->mount_opt, opt); \ + if (btrfs_test_opt(fs_info, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ + btrfs_clear_opt(fs_info->mount_opt, opt); \ } #ifdef CONFIG_BTRFS_DEBUG @@ -1319,9 +1341,9 @@ static inline int btrfs_should_fragment_free_space(struct btrfs_root *root, struct btrfs_block_group_cache *block_group) { - return (btrfs_test_opt(root, FRAGMENT_METADATA) && + return (btrfs_test_opt(root->fs_info, FRAGMENT_METADATA) && block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || - (btrfs_test_opt(root, FRAGMENT_DATA) && + (btrfs_test_opt(root->fs_info, FRAGMENT_DATA) && block_group->flags & BTRFS_BLOCK_GROUP_DATA); } #endif @@ -2624,6 +2646,15 @@ enum btrfs_reserve_flush_enum { BTRFS_RESERVE_FLUSH_ALL, }; +enum btrfs_flush_state { + FLUSH_DELAYED_ITEMS_NR = 1, + FLUSH_DELAYED_ITEMS = 2, + FLUSH_DELALLOC = 3, + FLUSH_DELALLOC_WAIT = 4, + ALLOC_CHUNK = 5, + COMMIT_TRANS = 6, +}; + int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len); int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes); void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len); @@ -2661,8 +2692,8 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 min_reserved, enum btrfs_reserve_flush_enum flush); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, - struct btrfs_block_rsv *dst_rsv, - u64 num_bytes); + struct btrfs_block_rsv *dst_rsv, u64 num_bytes, + int update_size); int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *dest, u64 num_bytes, int min_factor); @@ -2875,9 +2906,6 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); /* root-item.c */ -int btrfs_find_root_ref(struct btrfs_root *tree_root, - struct btrfs_path *path, - u64 root_id, u64 ref_id); int btrfs_add_root_ref(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root, u64 root_id, u64 ref_id, u64 dirid, u64 sequence, @@ -3351,23 +3379,23 @@ const char *btrfs_decode_error(int errno); __cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const char *function, + const char *function, unsigned int line, int errno); /* * Call btrfs_abort_transaction as early as possible when an error condition is * detected, that way the exact line number is reported. */ -#define btrfs_abort_transaction(trans, root, errno) \ +#define btrfs_abort_transaction(trans, errno) \ do { \ /* Report first abort since mount */ \ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ - &((root)->fs_info->fs_state))) { \ + &((trans)->fs_info->fs_state))) { \ WARN(1, KERN_DEBUG \ "BTRFS: Transaction aborted (error %d)\n", \ (errno)); \ } \ - __btrfs_abort_transaction((trans), (root), __func__, \ + __btrfs_abort_transaction((trans), __func__, \ __LINE__, (errno)); \ } while (0) @@ -3599,13 +3627,13 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) void btrfs_test_destroy_inode(struct inode *inode); #endif -static inline int btrfs_test_is_dummy_root(struct btrfs_root *root) +static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) { #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + if (unlikely(test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, + &fs_info->fs_state))) return 1; #endif return 0; } - #endif diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h new file mode 100644 index 000000000000..83ebfe28da9e --- /dev/null +++ b/fs/btrfs/dedupe.h @@ -0,0 +1,24 @@ +/* + * Copyright (C) 2016 Fujitsu. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_DEDUPE__ +#define __BTRFS_DEDUPE__ + +/* later in-band dedupe will expand this struct */ +struct btrfs_dedupe_hash; +#endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index d3aaabbfada0..3eeb9cd8cfa5 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -34,7 +34,7 @@ int __init btrfs_delayed_inode_init(void) delayed_node_cache = kmem_cache_create("btrfs_delayed_node", sizeof(struct btrfs_delayed_node), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD, NULL); if (!delayed_node_cache) return -ENOMEM; @@ -553,7 +553,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, dst_rsv = &root->fs_info->delayed_block_rsv; num_bytes = btrfs_calc_trans_metadata_size(root, 1); - ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); if (!ret) { trace_btrfs_space_reservation(root->fs_info, "delayed_item", item->key.objectid, @@ -598,6 +598,29 @@ static int btrfs_delayed_inode_reserve_metadata( num_bytes = btrfs_calc_trans_metadata_size(root, 1); /* + * If our block_rsv is the delalloc block reserve then check and see if + * we have our extra reservation for updating the inode. If not fall + * through and try to reserve space quickly. + * + * We used to try and steal from the delalloc block rsv or the global + * reserve, but we'd steal a full reservation, which isn't kind. We are + * here through delalloc which means we've likely just cowed down close + * to the leaf that contains the inode, so we would steal less just + * doing the fallback inode update, so if we do end up having to steal + * from the global block rsv we hopefully only steal one or two blocks + * worth which is less likely to hurt us. + */ + if (src_rsv && src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) { + spin_lock(&BTRFS_I(inode)->lock); + if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) + release = true; + else + src_rsv = NULL; + spin_unlock(&BTRFS_I(inode)->lock); + } + + /* * btrfs_dirty_inode will update the inode under btrfs_join_transaction * which doesn't reserve space for speed. This is a problem since we * still need to reserve space for this update, so try to reserve the @@ -626,51 +649,10 @@ static int btrfs_delayed_inode_reserve_metadata( num_bytes, 1); } return ret; - } else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) { - spin_lock(&BTRFS_I(inode)->lock); - if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, - &BTRFS_I(inode)->runtime_flags)) { - spin_unlock(&BTRFS_I(inode)->lock); - release = true; - goto migrate; - } - spin_unlock(&BTRFS_I(inode)->lock); - - /* Ok we didn't have space pre-reserved. This shouldn't happen - * too often but it can happen if we do delalloc to an existing - * inode which gets dirtied because of the time update, and then - * isn't touched again until after the transaction commits and - * then we try to write out the data. First try to be nice and - * reserve something strictly for us. If not be a pain and try - * to steal from the delalloc block rsv. - */ - ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, - BTRFS_RESERVE_NO_FLUSH); - if (!ret) - goto out; - - ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); - if (!ret) - goto out; - - if (btrfs_test_opt(root, ENOSPC_DEBUG)) { - btrfs_debug(root->fs_info, - "block rsv migrate returned %d", ret); - WARN_ON(1); - } - /* - * Ok this is a problem, let's just steal from the global rsv - * since this really shouldn't happen that often. - */ - ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv, - dst_rsv, num_bytes); - goto out; } -migrate: - ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); -out: /* * Migrate only takes a reservation, it doesn't touch the size of the * block_rsv. This is to simplify people who don't normally have things @@ -1188,7 +1170,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, if (ret) { btrfs_release_delayed_node(curr_node); curr_node = NULL; - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); break; } diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 430b3689b112..b6d210e7a993 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -606,7 +606,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, qrecord->num_bytes = num_bytes; qrecord->old_roots = NULL; - qexisting = btrfs_qgroup_insert_dirty_extent(delayed_refs, + qexisting = btrfs_qgroup_insert_dirty_extent(fs_info, + delayed_refs, qrecord); if (qexisting) kfree(qrecord); @@ -615,7 +616,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); - trace_add_delayed_ref_head(ref, head_ref, action); + trace_add_delayed_ref_head(fs_info, ref, head_ref, action); existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); @@ -682,7 +683,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->type = BTRFS_TREE_BLOCK_REF_KEY; full_ref->level = level; - trace_add_delayed_tree_ref(ref, full_ref, action); + trace_add_delayed_tree_ref(fs_info, ref, full_ref, action); ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); @@ -739,7 +740,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, full_ref->objectid = owner; full_ref->offset = offset; - trace_add_delayed_data_ref(ref, full_ref, action); + trace_add_delayed_data_ref(fs_info, ref, full_ref, action); ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); @@ -940,28 +941,28 @@ int btrfs_delayed_ref_init(void) btrfs_delayed_ref_head_cachep = kmem_cache_create( "btrfs_delayed_ref_head", sizeof(struct btrfs_delayed_ref_head), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_delayed_ref_head_cachep) goto fail; btrfs_delayed_tree_ref_cachep = kmem_cache_create( "btrfs_delayed_tree_ref", sizeof(struct btrfs_delayed_tree_ref), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_delayed_tree_ref_cachep) goto fail; btrfs_delayed_data_ref_cachep = kmem_cache_create( "btrfs_delayed_data_ref", sizeof(struct btrfs_delayed_data_ref), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_delayed_data_ref_cachep) goto fail; btrfs_delayed_extent_op_cachep = kmem_cache_create( "btrfs_delayed_extent_op", sizeof(struct btrfs_delayed_extent_op), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_delayed_extent_op_cachep) goto fail; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 63ef9cdf0144..e9bbff3c0029 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -142,7 +142,7 @@ no_valid_dev_replace_entry_found: * missing */ if (!dev_replace->srcdev && - !btrfs_test_opt(dev_root, DEGRADED)) { + !btrfs_test_opt(dev_root->fs_info, DEGRADED)) { ret = -EIO; btrfs_warn(fs_info, "cannot mount because device replace operation is ongoing and"); @@ -151,7 +151,7 @@ no_valid_dev_replace_entry_found: src_devid); } if (!dev_replace->tgtdev && - !btrfs_test_opt(dev_root, DEGRADED)) { + !btrfs_test_opt(dev_root->fs_info, DEGRADED)) { ret = -EIO; btrfs_warn(fs_info, "cannot mount because device replace operation is ongoing and"); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9a726ded2c6d..87dad552e39a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -101,7 +101,7 @@ int __init btrfs_end_io_wq_init(void) btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq", sizeof(struct btrfs_end_io_wq), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD, NULL); if (!btrfs_end_io_wq_cache) return -ENOMEM; @@ -1140,7 +1140,7 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr) { - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return alloc_test_extent_buffer(root->fs_info, bytenr, root->nodesize); return alloc_extent_buffer(root->fs_info, bytenr); @@ -1227,6 +1227,7 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize, struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { + bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); root->node = NULL; root->commit_root = NULL; root->sectorsize = sectorsize; @@ -1281,14 +1282,14 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize, root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; - if (fs_info) + if (!dummy) extent_io_tree_init(&root->dirty_log_pages, fs_info->btree_inode->i_mapping); memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); - if (fs_info) + if (!dummy) root->defrag_trans_start = fs_info->generation; else root->defrag_trans_start = 0; @@ -1309,17 +1310,20 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* Should only be used by the testing infrastructure */ -struct btrfs_root *btrfs_alloc_dummy_root(u32 sectorsize, u32 nodesize) +struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info, + u32 sectorsize, u32 nodesize) { struct btrfs_root *root; - root = btrfs_alloc_root(NULL, GFP_KERNEL); + if (!fs_info) + return ERR_PTR(-EINVAL); + + root = btrfs_alloc_root(fs_info, GFP_KERNEL); if (!root) return ERR_PTR(-ENOMEM); /* We don't use the stripesize in selftest, set it as sectorsize */ - __setup_root(nodesize, sectorsize, sectorsize, root, NULL, + __setup_root(nodesize, sectorsize, sectorsize, root, fs_info, BTRFS_ROOT_TREE_OBJECTID); - set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state); root->alloc_bytenr = 0; return root; @@ -1594,14 +1598,14 @@ int btrfs_init_fs_root(struct btrfs_root *root) ret = get_anon_bdev(&root->anon_dev); if (ret) - goto free_writers; + goto fail; mutex_lock(&root->objectid_mutex); ret = btrfs_find_highest_objectid(root, &root->highest_objectid); if (ret) { mutex_unlock(&root->objectid_mutex); - goto free_root_dev; + goto fail; } ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); @@ -1609,14 +1613,8 @@ int btrfs_init_fs_root(struct btrfs_root *root) mutex_unlock(&root->objectid_mutex); return 0; - -free_root_dev: - free_anon_bdev(root->anon_dev); -free_writers: - btrfs_free_subvolume_writers(root->subv_writers); fail: - kfree(root->free_ino_ctl); - kfree(root->free_ino_pinned); + /* the caller is responsible to call free_fs_root */ return ret; } @@ -2310,17 +2308,19 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; fs_info->workers = - btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI, - max_active, 16); + btrfs_alloc_workqueue(fs_info, "worker", + flags | WQ_HIGHPRI, max_active, 16); fs_info->delalloc_workers = - btrfs_alloc_workqueue("delalloc", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "delalloc", + flags, max_active, 2); fs_info->flush_workers = - btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0); + btrfs_alloc_workqueue(fs_info, "flush_delalloc", + flags, max_active, 0); fs_info->caching_workers = - btrfs_alloc_workqueue("cache", flags, max_active, 0); + btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0); /* * a higher idle thresh on the submit workers makes it much more @@ -2328,41 +2328,48 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, * devices */ fs_info->submit_workers = - btrfs_alloc_workqueue("submit", flags, + btrfs_alloc_workqueue(fs_info, "submit", flags, min_t(u64, fs_devices->num_devices, max_active), 64); fs_info->fixup_workers = - btrfs_alloc_workqueue("fixup", flags, 1, 0); + btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0); /* * endios are largely parallel and should have a very * low idle thresh */ fs_info->endio_workers = - btrfs_alloc_workqueue("endio", flags, max_active, 4); + btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4); fs_info->endio_meta_workers = - btrfs_alloc_workqueue("endio-meta", flags, max_active, 4); + btrfs_alloc_workqueue(fs_info, "endio-meta", flags, + max_active, 4); fs_info->endio_meta_write_workers = - btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags, + max_active, 2); fs_info->endio_raid56_workers = - btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4); + btrfs_alloc_workqueue(fs_info, "endio-raid56", flags, + max_active, 4); fs_info->endio_repair_workers = - btrfs_alloc_workqueue("endio-repair", flags, 1, 0); + btrfs_alloc_workqueue(fs_info, "endio-repair", flags, 1, 0); fs_info->rmw_workers = - btrfs_alloc_workqueue("rmw", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2); fs_info->endio_write_workers = - btrfs_alloc_workqueue("endio-write", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "endio-write", flags, + max_active, 2); fs_info->endio_freespace_worker = - btrfs_alloc_workqueue("freespace-write", flags, max_active, 0); + btrfs_alloc_workqueue(fs_info, "freespace-write", flags, + max_active, 0); fs_info->delayed_workers = - btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0); + btrfs_alloc_workqueue(fs_info, "delayed-meta", flags, + max_active, 0); fs_info->readahead_workers = - btrfs_alloc_workqueue("readahead", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "readahead", flags, + max_active, 2); fs_info->qgroup_rescan_workers = - btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0); + btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0); fs_info->extent_workers = - btrfs_alloc_workqueue("extent-refs", flags, + btrfs_alloc_workqueue(fs_info, "extent-refs", flags, min_t(u64, fs_devices->num_devices, max_active), 8); @@ -3010,8 +3017,8 @@ retry_root_backup: if (IS_ERR(fs_info->transaction_kthread)) goto fail_cleaner; - if (!btrfs_test_opt(tree_root, SSD) && - !btrfs_test_opt(tree_root, NOSSD) && + if (!btrfs_test_opt(tree_root->fs_info, SSD) && + !btrfs_test_opt(tree_root->fs_info, NOSSD) && !fs_info->fs_devices->rotating) { btrfs_info(fs_info, "detected SSD devices, enabling SSD mode"); btrfs_set_opt(fs_info->mount_opt, SSD); @@ -3024,9 +3031,9 @@ retry_root_backup: btrfs_apply_pending_changes(fs_info); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) { + if (btrfs_test_opt(tree_root->fs_info, CHECK_INTEGRITY)) { ret = btrfsic_mount(tree_root, fs_devices, - btrfs_test_opt(tree_root, + btrfs_test_opt(tree_root->fs_info, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ? 1 : 0, fs_info->check_integrity_print_mask); @@ -3042,7 +3049,7 @@ retry_root_backup: /* do not make disk changes in broken FS or nologreplay is given */ if (btrfs_super_log_root(disk_super) != 0 && - !btrfs_test_opt(tree_root, NOLOGREPLAY)) { + !btrfs_test_opt(tree_root->fs_info, NOLOGREPLAY)) { ret = btrfs_replay_log(fs_info, fs_devices); if (ret) { err = ret; @@ -3083,7 +3090,7 @@ retry_root_backup: if (sb->s_flags & MS_RDONLY) return 0; - if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) && + if (btrfs_test_opt(tree_root->fs_info, FREE_SPACE_TREE) && !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { btrfs_info(fs_info, "creating free space tree"); ret = btrfs_create_free_space_tree(fs_info); @@ -3120,7 +3127,7 @@ retry_root_backup: btrfs_qgroup_rescan_resume(fs_info); - if (btrfs_test_opt(tree_root, CLEAR_CACHE) && + if (btrfs_test_opt(tree_root->fs_info, CLEAR_CACHE) && btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { btrfs_info(fs_info, "clearing free space tree"); ret = btrfs_clear_free_space_tree(fs_info); @@ -3141,7 +3148,7 @@ retry_root_backup: close_ctree(tree_root); return ret; } - } else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) || + } else if (btrfs_test_opt(tree_root->fs_info, RESCAN_UUID_TREE) || fs_info->generation != btrfs_super_uuid_tree_generation(disk_super)) { btrfs_info(fs_info, "checking UUID tree"); @@ -3218,7 +3225,7 @@ fail: return err; recovery_tree_root: - if (!btrfs_test_opt(tree_root, USEBACKUPROOT)) + if (!btrfs_test_opt(tree_root->fs_info, USEBACKUPROOT)) goto fail_tree_roots; free_root_pointers(fs_info, 0); @@ -3634,7 +3641,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) int total_errors = 0; u64 flags; - do_barriers = !btrfs_test_opt(root, NOBARRIER); + do_barriers = !btrfs_test_opt(root->fs_info, NOBARRIER); backup_super_roots(root->fs_info); sb = root->fs_info->super_for_commit; @@ -3918,7 +3925,7 @@ void close_ctree(struct btrfs_root *root) iput(fs_info->btree_inode); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(root, CHECK_INTEGRITY)) + if (btrfs_test_opt(root->fs_info, CHECK_INTEGRITY)) btrfsic_unmount(root, fs_info->fs_devices); #endif diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index dbf3e1aab69e..b3207a0e09f7 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -90,7 +90,8 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, void btrfs_free_fs_root(struct btrfs_root *root); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -struct btrfs_root *btrfs_alloc_dummy_root(u32 sectorsize, u32 nodesize); +struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info, + u32 sectorsize, u32 nodesize); #endif /* diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b480fd555774..61b494e8e604 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -111,6 +111,16 @@ static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); int btrfs_pin_extent(struct btrfs_root *root, u64 bytenr, u64 num_bytes, int reserved); +static int __reserve_metadata_bytes(struct btrfs_root *root, + struct btrfs_space_info *space_info, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush); +static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes); +static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -2170,7 +2180,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, path, bytenr, parent, root_objectid, owner, offset, refs_to_add); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); out: btrfs_free_path(path); return ret; @@ -2194,7 +2204,7 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, ins.type = BTRFS_EXTENT_ITEM_KEY; ref = btrfs_delayed_node_to_data_ref(node); - trace_run_delayed_data_ref(node, ref, node->action); + trace_run_delayed_data_ref(root->fs_info, node, ref, node->action); if (node->type == BTRFS_SHARED_DATA_REF_KEY) parent = ref->parent; @@ -2349,7 +2359,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, SKINNY_METADATA); ref = btrfs_delayed_node_to_tree_ref(node); - trace_run_delayed_tree_ref(node, ref, node->action); + trace_run_delayed_tree_ref(root->fs_info, node, ref, node->action); if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) parent = ref->parent; @@ -2413,7 +2423,8 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, */ BUG_ON(extent_op); head = btrfs_delayed_node_to_head(node); - trace_run_delayed_ref_head(node, head, node->action); + trace_run_delayed_ref_head(root->fs_info, node, head, + node->action); if (insert_reserved) { btrfs_pin_extent(root, node->bytenr, @@ -2768,7 +2779,7 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes) u64 num_csums_per_leaf; u64 num_csums; - csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); + csum_size = BTRFS_MAX_ITEM_SIZE(root); num_csums_per_leaf = div64_u64(csum_size, (u64)btrfs_super_csum_size(root->fs_info->super_copy)); num_csums = div64_u64(csum_bytes, root->sectorsize); @@ -2960,7 +2971,7 @@ again: trans->can_flush_pending_bgs = false; ret = __btrfs_run_delayed_refs(trans, root, count); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -3224,7 +3235,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, u64, u64, u64, u64, u64, u64); - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return 0; ref_root = btrfs_header_owner(buf); @@ -3419,7 +3430,7 @@ again: * transaction, this only happens in really bad situations * anyway. */ - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_put; } WARN_ON(ret); @@ -3437,7 +3448,7 @@ again: spin_lock(&block_group->lock); if (block_group->cached != BTRFS_CACHE_FINISHED || - !btrfs_test_opt(root, SPACE_CACHE)) { + !btrfs_test_opt(root->fs_info, SPACE_CACHE)) { /* * don't bother trying to write stuff out _if_ * a) we're not cached, @@ -3514,7 +3525,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, struct btrfs_path *path; if (list_empty(&cur_trans->dirty_bgs) || - !btrfs_test_opt(root, SPACE_CACHE)) + !btrfs_test_opt(root->fs_info, SPACE_CACHE)) return 0; path = btrfs_alloc_path(); @@ -3659,7 +3670,7 @@ again: } spin_unlock(&cur_trans->dirty_bgs_lock); } else if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); } } @@ -3805,7 +3816,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, cache); } if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); } /* if its not on the io list, we need to put the block group */ @@ -3913,6 +3924,7 @@ static const char *alloc_name(u64 flags) static int update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, + u64 bytes_readonly, struct btrfs_space_info **space_info) { struct btrfs_space_info *found; @@ -3933,8 +3945,11 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->disk_total += total_bytes * factor; found->bytes_used += bytes_used; found->disk_used += bytes_used * factor; + found->bytes_readonly += bytes_readonly; if (total_bytes > 0) found->full = 0; + space_info_add_new_bytes(info, found, total_bytes - + bytes_used - bytes_readonly); spin_unlock(&found->lock); *space_info = found; return 0; @@ -3960,7 +3975,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->disk_used = bytes_used * factor; found->bytes_pinned = 0; found->bytes_reserved = 0; - found->bytes_readonly = 0; + found->bytes_readonly = bytes_readonly; found->bytes_may_use = 0; found->full = 0; found->max_extent_size = 0; @@ -3969,6 +3984,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->flush = 0; init_waitqueue_head(&found->wait); INIT_LIST_HEAD(&found->ro_bgs); + INIT_LIST_HEAD(&found->tickets); + INIT_LIST_HEAD(&found->priority_tickets); ret = kobject_init_and_add(&found->kobj, &space_info_ktype, info->space_info_kobj, "%s", @@ -4427,7 +4444,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, thresh = btrfs_calc_trunc_metadata_size(root, num_devs) + btrfs_calc_trans_metadata_size(root, 1); - if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { + if (left < thresh && btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", left, thresh, type); dump_space_info(info, 0, 0); @@ -4470,7 +4487,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, space_info = __find_space_info(extent_root->fs_info, flags); if (!space_info) { ret = update_space_info(extent_root->fs_info, flags, - 0, 0, &space_info); + 0, 0, 0, &space_info); BUG_ON(ret); /* -ENOMEM */ } BUG_ON(!space_info); /* Logic error */ @@ -4572,7 +4589,7 @@ out: */ if (trans->can_flush_pending_bgs && trans->chunk_bytes_reserved >= (u64)SZ_2M) { - btrfs_create_pending_block_groups(trans, trans->root); + btrfs_create_pending_block_groups(trans, extent_root); btrfs_trans_release_chunk_metadata(trans); } return ret; @@ -4582,12 +4599,19 @@ static int can_overcommit(struct btrfs_root *root, struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { - struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; - u64 profile = btrfs_get_alloc_profile(root, 0); + struct btrfs_block_rsv *global_rsv; + u64 profile; u64 space_size; u64 avail; u64 used; + /* Don't overcommit when in mixed mode. */ + if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) + return 0; + + BUG_ON(root->fs_info == NULL); + global_rsv = &root->fs_info->global_block_rsv; + profile = btrfs_get_alloc_profile(root, 0); used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly; @@ -4739,6 +4763,11 @@ skip_async: spin_unlock(&space_info->lock); break; } + if (list_empty(&space_info->tickets) && + list_empty(&space_info->priority_tickets)) { + spin_unlock(&space_info->lock); + break; + } spin_unlock(&space_info->lock); loops++; @@ -4807,13 +4836,11 @@ commit: return btrfs_commit_transaction(trans, root); } -enum flush_state { - FLUSH_DELAYED_ITEMS_NR = 1, - FLUSH_DELAYED_ITEMS = 2, - FLUSH_DELALLOC = 3, - FLUSH_DELALLOC_WAIT = 4, - ALLOC_CHUNK = 5, - COMMIT_TRANS = 6, +struct reserve_ticket { + u64 bytes; + int error; + struct list_head list; + wait_queue_head_t wait; }; static int flush_space(struct btrfs_root *root, @@ -4866,6 +4893,8 @@ static int flush_space(struct btrfs_root *root, break; } + trace_btrfs_flush_space(root->fs_info, space_info->flags, num_bytes, + orig_bytes, state, ret); return ret; } @@ -4873,17 +4902,22 @@ static inline u64 btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, struct btrfs_space_info *space_info) { + struct reserve_ticket *ticket; u64 used; u64 expected; - u64 to_reclaim; + u64 to_reclaim = 0; to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - spin_lock(&space_info->lock); if (can_overcommit(root, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL)) { - to_reclaim = 0; - goto out; - } + BTRFS_RESERVE_FLUSH_ALL)) + return 0; + + list_for_each_entry(ticket, &space_info->tickets, list) + to_reclaim += ticket->bytes; + list_for_each_entry(ticket, &space_info->priority_tickets, list) + to_reclaim += ticket->bytes; + if (to_reclaim) + return to_reclaim; used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly + @@ -4899,14 +4933,11 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, to_reclaim = 0; to_reclaim = min(to_reclaim, space_info->bytes_may_use + space_info->bytes_reserved); -out: - spin_unlock(&space_info->lock); - return to_reclaim; } static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, - struct btrfs_fs_info *fs_info, u64 used) + struct btrfs_root *root, u64 used) { u64 thresh = div_factor_fine(space_info->total_bytes, 98); @@ -4914,73 +4945,177 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) return 0; - return (used >= thresh && !btrfs_fs_closing(fs_info) && - !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); + if (!btrfs_calc_reclaim_metadata_size(root, space_info)) + return 0; + + return (used >= thresh && !btrfs_fs_closing(root->fs_info) && + !test_bit(BTRFS_FS_STATE_REMOUNTING, + &root->fs_info->fs_state)); } -static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info, - struct btrfs_fs_info *fs_info, - int flush_state) +static void wake_all_tickets(struct list_head *head) { - u64 used; - - spin_lock(&space_info->lock); - /* - * We run out of space and have not got any free space via flush_space, - * so don't bother doing async reclaim. - */ - if (flush_state > COMMIT_TRANS && space_info->full) { - spin_unlock(&space_info->lock); - return 0; - } + struct reserve_ticket *ticket; - used = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; - if (need_do_async_reclaim(space_info, fs_info, used)) { - spin_unlock(&space_info->lock); - return 1; + while (!list_empty(head)) { + ticket = list_first_entry(head, struct reserve_ticket, list); + list_del_init(&ticket->list); + ticket->error = -ENOSPC; + wake_up(&ticket->wait); } - spin_unlock(&space_info->lock); - - return 0; } +/* + * This is for normal flushers, we can wait all goddamned day if we want to. We + * will loop and continuously try to flush as long as we are making progress. + * We count progress as clearing off tickets each time we have to loop. + */ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) { + struct reserve_ticket *last_ticket = NULL; struct btrfs_fs_info *fs_info; struct btrfs_space_info *space_info; u64 to_reclaim; int flush_state; + int commit_cycles = 0; fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + spin_lock(&space_info->lock); to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, space_info); - if (!to_reclaim) + if (!to_reclaim) { + space_info->flush = 0; + spin_unlock(&space_info->lock); return; + } + last_ticket = list_first_entry(&space_info->tickets, + struct reserve_ticket, list); + spin_unlock(&space_info->lock); flush_state = FLUSH_DELAYED_ITEMS_NR; do { + struct reserve_ticket *ticket; + int ret; + + ret = flush_space(fs_info->fs_root, space_info, to_reclaim, + to_reclaim, flush_state); + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, + space_info); + ticket = list_first_entry(&space_info->tickets, + struct reserve_ticket, list); + if (last_ticket == ticket) { + flush_state++; + } else { + last_ticket = ticket; + flush_state = FLUSH_DELAYED_ITEMS_NR; + if (commit_cycles) + commit_cycles--; + } + + if (flush_state > COMMIT_TRANS) { + commit_cycles++; + if (commit_cycles > 2) { + wake_all_tickets(&space_info->tickets); + space_info->flush = 0; + } else { + flush_state = FLUSH_DELAYED_ITEMS_NR; + } + } + spin_unlock(&space_info->lock); + } while (flush_state <= COMMIT_TRANS); +} + +void btrfs_init_async_reclaim_work(struct work_struct *work) +{ + INIT_WORK(work, btrfs_async_reclaim_metadata_space); +} + +static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket) +{ + u64 to_reclaim; + int flush_state = FLUSH_DELAYED_ITEMS_NR; + + spin_lock(&space_info->lock); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, + space_info); + if (!to_reclaim) { + spin_unlock(&space_info->lock); + return; + } + spin_unlock(&space_info->lock); + + do { flush_space(fs_info->fs_root, space_info, to_reclaim, to_reclaim, flush_state); flush_state++; - if (!btrfs_need_do_async_reclaim(space_info, fs_info, - flush_state)) + spin_lock(&space_info->lock); + if (ticket->bytes == 0) { + spin_unlock(&space_info->lock); return; + } + spin_unlock(&space_info->lock); + + /* + * Priority flushers can't wait on delalloc without + * deadlocking. + */ + if (flush_state == FLUSH_DELALLOC || + flush_state == FLUSH_DELALLOC_WAIT) + flush_state = ALLOC_CHUNK; } while (flush_state < COMMIT_TRANS); } -void btrfs_init_async_reclaim_work(struct work_struct *work) +static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket, u64 orig_bytes) + { - INIT_WORK(work, btrfs_async_reclaim_metadata_space); + DEFINE_WAIT(wait); + int ret = 0; + + spin_lock(&space_info->lock); + while (ticket->bytes > 0 && ticket->error == 0) { + ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); + if (ret) { + ret = -EINTR; + break; + } + spin_unlock(&space_info->lock); + + schedule(); + + finish_wait(&ticket->wait, &wait); + spin_lock(&space_info->lock); + } + if (!ret) + ret = ticket->error; + if (!list_empty(&ticket->list)) + list_del_init(&ticket->list); + if (ticket->bytes && ticket->bytes < orig_bytes) { + u64 num_bytes = orig_bytes - ticket->bytes; + space_info->bytes_may_use -= num_bytes; + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, num_bytes, 0); + } + spin_unlock(&space_info->lock); + + return ret; } /** * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space * @root - the root we're allocating for - * @block_rsv - the block_rsv we're allocating for + * @space_info - the space info we want to allocate from * @orig_bytes - the number of bytes we want * @flush - whether or not we can flush to make our reservation * @@ -4991,81 +5126,36 @@ void btrfs_init_async_reclaim_work(struct work_struct *work) * regain reservations will be made and this will fail if there is not enough * space already. */ -static int reserve_metadata_bytes(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 orig_bytes, - enum btrfs_reserve_flush_enum flush) +static int __reserve_metadata_bytes(struct btrfs_root *root, + struct btrfs_space_info *space_info, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) { - struct btrfs_space_info *space_info = block_rsv->space_info; + struct reserve_ticket ticket; u64 used; - u64 num_bytes = orig_bytes; - int flush_state = FLUSH_DELAYED_ITEMS_NR; int ret = 0; - bool flushing = false; - -again: - ret = 0; - spin_lock(&space_info->lock); - /* - * We only want to wait if somebody other than us is flushing and we - * are actually allowed to flush all things. - */ - while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing && - space_info->flush) { - spin_unlock(&space_info->lock); - /* - * If we have a trans handle we can't wait because the flusher - * may have to commit the transaction, which would mean we would - * deadlock since we are waiting for the flusher to finish, but - * hold the current transaction open. - */ - if (current->journal_info) - return -EAGAIN; - ret = wait_event_killable(space_info->wait, !space_info->flush); - /* Must have been killed, return */ - if (ret) - return -EINTR; - spin_lock(&space_info->lock); - } + ASSERT(orig_bytes); + ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); + spin_lock(&space_info->lock); ret = -ENOSPC; used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly + space_info->bytes_may_use; /* - * The idea here is that we've not already over-reserved the block group - * then we can go ahead and save our reservation first and then start - * flushing if we need to. Otherwise if we've already overcommitted - * lets start flushing stuff first and then come back and try to make - * our reservation. + * If we have enough space then hooray, make our reservation and carry + * on. If not see if we can overcommit, and if we can, hooray carry on. + * If not things get more complicated. */ - if (used <= space_info->total_bytes) { - if (used + orig_bytes <= space_info->total_bytes) { - space_info->bytes_may_use += orig_bytes; - trace_btrfs_space_reservation(root->fs_info, - "space_info", space_info->flags, orig_bytes, 1); - ret = 0; - } else { - /* - * Ok set num_bytes to orig_bytes since we aren't - * overocmmitted, this way we only try and reclaim what - * we need. - */ - num_bytes = orig_bytes; - } - } else { - /* - * Ok we're over committed, set num_bytes to the overcommitted - * amount plus the amount of bytes that we need for this - * reservation. - */ - num_bytes = used - space_info->total_bytes + - (orig_bytes * 2); - } - - if (ret && can_overcommit(root, space_info, orig_bytes, flush)) { + if (used + orig_bytes <= space_info->total_bytes) { + space_info->bytes_may_use += orig_bytes; + trace_btrfs_space_reservation(root->fs_info, "space_info", + space_info->flags, orig_bytes, + 1); + ret = 0; + } else if (can_overcommit(root, space_info, orig_bytes, flush)) { space_info->bytes_may_use += orig_bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", space_info->flags, orig_bytes, @@ -5074,16 +5164,31 @@ again: } /* - * Couldn't make our reservation, save our place so while we're trying - * to reclaim space we can actually use it instead of somebody else - * stealing it from us. + * If we couldn't make a reservation then setup our reservation ticket + * and kick the async worker if it's not already running. * - * We make the other tasks wait for the flush only when we can flush - * all things. + * If we are a priority flusher then we just need to add our ticket to + * the list and we will do our own flushing further down. */ if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { - flushing = true; - space_info->flush = 1; + ticket.bytes = orig_bytes; + ticket.error = 0; + init_waitqueue_head(&ticket.wait); + if (flush == BTRFS_RESERVE_FLUSH_ALL) { + list_add_tail(&ticket.list, &space_info->tickets); + if (!space_info->flush) { + space_info->flush = 1; + trace_btrfs_trigger_flush(root->fs_info, + space_info->flags, + orig_bytes, flush, + "enospc"); + queue_work(system_unbound_wq, + &root->fs_info->async_reclaim_work); + } + } else { + list_add_tail(&ticket.list, + &space_info->priority_tickets); + } } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { used += orig_bytes; /* @@ -5092,39 +5197,67 @@ again: * the async reclaim as we will panic. */ if (!root->fs_info->log_root_recovering && - need_do_async_reclaim(space_info, root->fs_info, used) && - !work_busy(&root->fs_info->async_reclaim_work)) + need_do_async_reclaim(space_info, root, used) && + !work_busy(&root->fs_info->async_reclaim_work)) { + trace_btrfs_trigger_flush(root->fs_info, + space_info->flags, + orig_bytes, flush, + "preempt"); queue_work(system_unbound_wq, &root->fs_info->async_reclaim_work); + } } spin_unlock(&space_info->lock); - if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) - goto out; + return ret; - ret = flush_space(root, space_info, num_bytes, orig_bytes, - flush_state); - flush_state++; + if (flush == BTRFS_RESERVE_FLUSH_ALL) + return wait_reserve_ticket(root->fs_info, space_info, &ticket, + orig_bytes); - /* - * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock - * would happen. So skip delalloc flush. - */ - if (flush == BTRFS_RESERVE_FLUSH_LIMIT && - (flush_state == FLUSH_DELALLOC || - flush_state == FLUSH_DELALLOC_WAIT)) - flush_state = ALLOC_CHUNK; + ret = 0; + priority_reclaim_metadata_space(root->fs_info, space_info, &ticket); + spin_lock(&space_info->lock); + if (ticket.bytes) { + if (ticket.bytes < orig_bytes) { + u64 num_bytes = orig_bytes - ticket.bytes; + space_info->bytes_may_use -= num_bytes; + trace_btrfs_space_reservation(root->fs_info, + "space_info", space_info->flags, + num_bytes, 0); - if (!ret) - goto again; - else if (flush == BTRFS_RESERVE_FLUSH_LIMIT && - flush_state < COMMIT_TRANS) - goto again; - else if (flush == BTRFS_RESERVE_FLUSH_ALL && - flush_state <= COMMIT_TRANS) - goto again; + } + list_del_init(&ticket.list); + ret = -ENOSPC; + } + spin_unlock(&space_info->lock); + ASSERT(list_empty(&ticket.list)); + return ret; +} -out: +/** + * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space + * @root - the root we're allocating for + * @block_rsv - the block_rsv we're allocating for + * @orig_bytes - the number of bytes we want + * @flush - whether or not we can flush to make our reservation + * + * This will reserve orgi_bytes number of bytes from the space info associated + * with the block_rsv. If there is not enough space it will make an attempt to + * flush out space to make room. It will do this by flushing delalloc if + * possible or committing the transaction. If flush is 0 then no attempts to + * regain reservations will be made and this will fail if there is not enough + * space already. + */ +static int reserve_metadata_bytes(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) +{ + int ret; + + ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes, + flush); if (ret == -ENOSPC && unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { struct btrfs_block_rsv *global_rsv = @@ -5137,13 +5270,8 @@ out: if (ret == -ENOSPC) trace_btrfs_space_reservation(root->fs_info, "space_info:enospc", - space_info->flags, orig_bytes, 1); - if (flushing) { - spin_lock(&space_info->lock); - space_info->flush = 0; - wake_up_all(&space_info->wait); - spin_unlock(&space_info->lock); - } + block_rsv->space_info->flags, + orig_bytes, 1); return ret; } @@ -5219,6 +5347,108 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, return 0; } +/* + * This is for space we already have accounted in space_info->bytes_may_use, so + * basically when we're returning space from block_rsv's. + */ +static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes) +{ + struct reserve_ticket *ticket; + struct list_head *head; + u64 used; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; + bool check_overcommit = false; + + spin_lock(&space_info->lock); + head = &space_info->priority_tickets; + + /* + * If we are over our limit then we need to check and see if we can + * overcommit, and if we can't then we just need to free up our space + * and not satisfy any requests. + */ + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; + if (used - num_bytes >= space_info->total_bytes) + check_overcommit = true; +again: + while (!list_empty(head) && num_bytes) { + ticket = list_first_entry(head, struct reserve_ticket, + list); + /* + * We use 0 bytes because this space is already reserved, so + * adding the ticket space would be a double count. + */ + if (check_overcommit && + !can_overcommit(fs_info->extent_root, space_info, 0, + flush)) + break; + if (num_bytes >= ticket->bytes) { + list_del_init(&ticket->list); + num_bytes -= ticket->bytes; + ticket->bytes = 0; + wake_up(&ticket->wait); + } else { + ticket->bytes -= num_bytes; + num_bytes = 0; + } + } + + if (num_bytes && head == &space_info->priority_tickets) { + head = &space_info->tickets; + flush = BTRFS_RESERVE_FLUSH_ALL; + goto again; + } + space_info->bytes_may_use -= num_bytes; + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, num_bytes, 0); + spin_unlock(&space_info->lock); +} + +/* + * This is for newly allocated space that isn't accounted in + * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent + * we use this helper. + */ +static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes) +{ + struct reserve_ticket *ticket; + struct list_head *head = &space_info->priority_tickets; + +again: + while (!list_empty(head) && num_bytes) { + ticket = list_first_entry(head, struct reserve_ticket, + list); + if (num_bytes >= ticket->bytes) { + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, + ticket->bytes, 1); + list_del_init(&ticket->list); + num_bytes -= ticket->bytes; + space_info->bytes_may_use += ticket->bytes; + ticket->bytes = 0; + wake_up(&ticket->wait); + } else { + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, + num_bytes, 1); + space_info->bytes_may_use += num_bytes; + ticket->bytes -= num_bytes; + num_bytes = 0; + } + } + + if (num_bytes && head == &space_info->priority_tickets) { + head = &space_info->tickets; + goto again; + } +} + static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, struct btrfs_block_rsv *dest, u64 num_bytes) @@ -5253,18 +5483,15 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, } spin_unlock(&dest->lock); } - if (num_bytes) { - spin_lock(&space_info->lock); - space_info->bytes_may_use -= num_bytes; - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, num_bytes, 0); - spin_unlock(&space_info->lock); - } + if (num_bytes) + space_info_add_old_bytes(fs_info, space_info, + num_bytes); } } -static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, - struct btrfs_block_rsv *dst, u64 num_bytes) +int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, + struct btrfs_block_rsv *dst, u64 num_bytes, + int update_size) { int ret; @@ -5272,7 +5499,7 @@ static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, if (ret) return ret; - block_rsv_add_bytes(dst, num_bytes, 1); + block_rsv_add_bytes(dst, num_bytes, update_size); return 0; } @@ -5379,13 +5606,6 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, return ret; } -int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, - struct btrfs_block_rsv *dst_rsv, - u64 num_bytes) -{ - return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); -} - void btrfs_block_rsv_release(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes) @@ -5398,48 +5618,21 @@ void btrfs_block_rsv_release(struct btrfs_root *root, num_bytes); } -/* - * helper to calculate size of global block reservation. - * the desired value is sum of space used by extent tree, - * checksum tree and root tree - */ -static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) -{ - struct btrfs_space_info *sinfo; - u64 num_bytes; - u64 meta_used; - u64 data_used; - int csum_size = btrfs_super_csum_size(fs_info->super_copy); - - sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); - spin_lock(&sinfo->lock); - data_used = sinfo->bytes_used; - spin_unlock(&sinfo->lock); - - sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); - spin_lock(&sinfo->lock); - if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) - data_used = 0; - meta_used = sinfo->bytes_used; - spin_unlock(&sinfo->lock); - - num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * - csum_size * 2; - num_bytes += div_u64(data_used + meta_used, 50); - - if (num_bytes * 3 > meta_used) - num_bytes = div_u64(meta_used, 3); - - return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10); -} - static void update_global_block_rsv(struct btrfs_fs_info *fs_info) { struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; struct btrfs_space_info *sinfo = block_rsv->space_info; u64 num_bytes; - num_bytes = calc_global_metadata_size(fs_info); + /* + * The global block rsv is based on the size of the extent tree, the + * checksum tree and the root tree. If the fs is empty we want to set + * it to a minimal amount for safety. + */ + num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) + + btrfs_root_used(&fs_info->csum_root->root_item) + + btrfs_root_used(&fs_info->tree_root->root_item); + num_bytes = max_t(u64, num_bytes, SZ_16M); spin_lock(&sinfo->lock); spin_lock(&block_rsv->lock); @@ -5537,7 +5730,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, */ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) { - struct btrfs_fs_info *fs_info = trans->root->fs_info; + struct btrfs_fs_info *fs_info = trans->fs_info; if (!trans->chunk_bytes_reserved) return; @@ -5554,7 +5747,13 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); + /* + * We always use trans->block_rsv here as we will have reserved space + * for our orphan when starting the transaction, using get_block_rsv() + * here will sometimes make us choose the wrong block rsv as we could be + * doing a reloc inode for a non refcounted root. + */ + struct btrfs_block_rsv *src_rsv = trans->block_rsv; struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; /* @@ -5565,7 +5764,7 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); trace_btrfs_space_reservation(root->fs_info, "orphan", btrfs_ino(inode), num_bytes, 1); - return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); + return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); } void btrfs_orphan_release_metadata(struct inode *inode) @@ -5620,7 +5819,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, BTRFS_RESERVE_FLUSH_ALL); if (ret == -ENOSPC && use_global_rsv) - ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes); + ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1); if (ret && *qgroup_reserved) btrfs_qgroup_free_meta(root, *qgroup_reserved); @@ -5730,21 +5929,26 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) u64 to_reserve = 0; u64 csum_bytes; unsigned nr_extents = 0; - int extra_reserve = 0; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret = 0; bool delalloc_lock = true; u64 to_free = 0; unsigned dropped; + bool release_extra = false; /* If we are a free space inode we need to not flush since we will be in * the middle of a transaction commit. We also don't need the delalloc * mutex since we won't race with anybody. We need this mostly to make * lockdep shut its filthy mouth. + * + * If we have a transaction open (can happen if we call truncate_block + * from truncate), then we need FLUSH_LIMIT so we don't deadlock. */ if (btrfs_is_free_space_inode(inode)) { flush = BTRFS_RESERVE_NO_FLUSH; delalloc_lock = false; + } else if (current->journal_info) { + flush = BTRFS_RESERVE_FLUSH_LIMIT; } if (flush != BTRFS_RESERVE_NO_FLUSH && @@ -5761,24 +5965,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); BTRFS_I(inode)->outstanding_extents += nr_extents; - nr_extents = 0; + nr_extents = 0; if (BTRFS_I(inode)->outstanding_extents > BTRFS_I(inode)->reserved_extents) - nr_extents = BTRFS_I(inode)->outstanding_extents - + nr_extents += BTRFS_I(inode)->outstanding_extents - BTRFS_I(inode)->reserved_extents; - /* - * Add an item to reserve for updating the inode when we complete the - * delalloc io. - */ - if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED, - &BTRFS_I(inode)->runtime_flags)) { - nr_extents++; - extra_reserve = 1; - } - - to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); + /* We always want to reserve a slot for updating the inode. */ + to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents + 1); to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); csum_bytes = BTRFS_I(inode)->csum_bytes; spin_unlock(&BTRFS_I(inode)->lock); @@ -5790,17 +5985,17 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) goto out_fail; } - ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); + ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush); if (unlikely(ret)) { btrfs_qgroup_free_meta(root, nr_extents * root->nodesize); goto out_fail; } spin_lock(&BTRFS_I(inode)->lock); - if (extra_reserve) { - set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, - &BTRFS_I(inode)->runtime_flags); - nr_extents--; + if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) { + to_reserve -= btrfs_calc_trans_metadata_size(root, 1); + release_extra = true; } BTRFS_I(inode)->reserved_extents += nr_extents; spin_unlock(&BTRFS_I(inode)->lock); @@ -5811,8 +6006,10 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) if (to_reserve) trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), to_reserve, 1); - block_rsv_add_bytes(block_rsv, to_reserve, 1); - + if (release_extra) + btrfs_block_rsv_release(root, block_rsv, + btrfs_calc_trans_metadata_size(root, + 1)); return 0; out_fail: @@ -5904,7 +6101,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return; trace_btrfs_space_reservation(root->fs_info, "delalloc", @@ -6019,7 +6216,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); - if (btrfs_test_opt(root, SPACE_CACHE) && + if (btrfs_test_opt(root->fs_info, SPACE_CACHE) && cache->disk_cache_state < BTRFS_DC_CLEAR) cache->disk_cache_state = BTRFS_DC_CLEAR; @@ -6044,6 +6241,9 @@ static int update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); + trace_btrfs_space_reservation(root->fs_info, "pinned", + cache->space_info->flags, + num_bytes, 1); set_extent_dirty(info->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); @@ -6118,10 +6318,10 @@ static int pin_down_extent(struct btrfs_root *root, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); + trace_btrfs_space_reservation(root->fs_info, "pinned", + cache->space_info->flags, num_bytes, 1); set_extent_dirty(root->fs_info->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); - if (reserved) - trace_btrfs_reserved_extent_free(root, bytenr, num_bytes); return 0; } @@ -6398,7 +6598,7 @@ fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info, u64 *empty_cluster) { struct btrfs_free_cluster *ret = NULL; - bool ssd = btrfs_test_opt(root, SSD); + bool ssd = btrfs_test_opt(root->fs_info, SSD); *empty_cluster = 0; if (btrfs_mixed_space_info(space_info)) @@ -6476,6 +6676,9 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, spin_lock(&cache->lock); cache->pinned -= len; space_info->bytes_pinned -= len; + + trace_btrfs_space_reservation(fs_info, "pinned", + space_info->flags, len, 0); space_info->max_extent_size = 0; percpu_counter_add(&space_info->total_bytes_pinned, -len); if (cache->ro) { @@ -6483,17 +6686,29 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, readonly = true; } spin_unlock(&cache->lock); - if (!readonly && global_rsv->space_info == space_info) { + if (!readonly && return_free_space && + global_rsv->space_info == space_info) { + u64 to_add = len; + WARN_ON(!return_free_space); spin_lock(&global_rsv->lock); if (!global_rsv->full) { - len = min(len, global_rsv->size - - global_rsv->reserved); - global_rsv->reserved += len; - space_info->bytes_may_use += len; + to_add = min(len, global_rsv->size - + global_rsv->reserved); + global_rsv->reserved += to_add; + space_info->bytes_may_use += to_add; if (global_rsv->reserved >= global_rsv->size) global_rsv->full = 1; + trace_btrfs_space_reservation(fs_info, + "space_info", + space_info->flags, + to_add, 1); + len -= to_add; } spin_unlock(&global_rsv->lock); + /* Add to any tickets we may have */ + if (len) + space_info_add_new_bytes(fs_info, space_info, + len); } spin_unlock(&space_info->lock); } @@ -6528,7 +6743,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, break; } - if (btrfs_test_opt(root, DISCARD)) + if (btrfs_test_opt(root->fs_info, DISCARD)) ret = btrfs_discard_extent(root, start, end + 1 - start, NULL); @@ -6666,7 +6881,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, NULL, refs_to_drop, is_data, &last_ref); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); @@ -6715,7 +6930,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, path->nodes[0]); } if (ret < 0) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } extent_slot = path->slots[0]; @@ -6726,10 +6941,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", bytenr, parent, root_objectid, owner_objectid, owner_offset); - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } else { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -6741,7 +6956,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = convert_extent_item_v0(trans, extent_root, path, owner_objectid, 0); if (ret < 0) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -6760,7 +6975,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_print_leaf(extent_root, path->nodes[0]); } if (ret < 0) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -6785,7 +7000,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_err(info, "trying to drop %d refs but we only have %Lu " "for bytenr %Lu", refs_to_drop, refs, bytenr); ret = -EINVAL; - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } refs -= refs_to_drop; @@ -6808,7 +7023,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, iref, refs_to_drop, is_data, &last_ref); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } @@ -6831,7 +7046,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); @@ -6839,7 +7054,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (is_data) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } @@ -6847,13 +7062,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = add_to_free_space_tree(trans, root->fs_info, bytenr, num_bytes); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } ret = update_block_group(trans, root, bytenr, num_bytes, 0); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } @@ -7002,7 +7217,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, int ret; struct btrfs_fs_info *fs_info = root->fs_info; - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(fs_info)) return 0; add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); @@ -7637,8 +7852,7 @@ loop: * can do more things. */ if (ret < 0 && ret != -ENOSPC) - btrfs_abort_transaction(trans, - root, ret); + btrfs_abort_transaction(trans, ret); else ret = 0; if (!exist) @@ -7692,8 +7906,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n", info->flags, info->total_bytes - info->bytes_used - info->bytes_pinned - - info->bytes_reserved - info->bytes_readonly, - (info->full) ? "" : "not "); + info->bytes_reserved - info->bytes_readonly - + info->bytes_may_use, (info->full) ? "" : "not "); printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, " "reserved=%llu, may_use=%llu, readonly=%llu\n", info->total_bytes, info->bytes_used, info->bytes_pinned, @@ -7747,7 +7961,7 @@ again: if (num_bytes == min_alloc_size) final_tried = true; goto again; - } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + } else if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { struct btrfs_space_info *sinfo; sinfo = __find_space_info(root->fs_info, flags); @@ -7778,16 +7992,14 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root, if (pin) pin_down_extent(root, cache, start, len, 1); else { - if (btrfs_test_opt(root, DISCARD)) + if (btrfs_test_opt(root->fs_info, DISCARD)) ret = btrfs_discard_extent(root, start, len, NULL); btrfs_add_free_space(cache, start, len); btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); + trace_btrfs_reserved_extent_free(root, start, len); } btrfs_put_block_group(cache); - - trace_btrfs_reserved_extent_free(root, start, len); - return ret; } @@ -8088,7 +8300,7 @@ again: goto again; } - if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL * 10, /*DEFAULT_RATELIMIT_BURST*/ 1); @@ -8142,13 +8354,15 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); - if (btrfs_test_is_dummy_root(root)) { +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (btrfs_is_testing(root->fs_info)) { buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, level); if (!IS_ERR(buf)) root->alloc_bytenr += blocksize; return buf; } +#endif block_rsv = use_block_rsv(trans, root, blocksize); if (IS_ERR(block_rsv)) @@ -8328,7 +8542,8 @@ static int record_one_subtree_extent(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord)) + if (btrfs_qgroup_insert_dirty_extent(trans->fs_info, + delayed_refs, qrecord)) kfree(qrecord); spin_unlock(&delayed_refs->lock); @@ -9113,7 +9328,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, &root->root_key, root_item); if (ret) { - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); err = ret; goto out_end_trans; } @@ -9140,7 +9355,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, ret = btrfs_del_root(trans, tree_root, &root->root_key); if (ret) { - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -9148,7 +9363,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, ret = btrfs_find_root(tree_root, &root->root_key, path, NULL, NULL); if (ret < 0) { - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); err = ret; goto out_end_trans; } else if (ret > 0) { @@ -9519,7 +9734,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) int full = 0; int ret = 0; - debug = btrfs_test_opt(root, ENOSPC_DEBUG); + debug = btrfs_test_opt(root->fs_info, ENOSPC_DEBUG); block_group = btrfs_lookup_block_group(root->fs_info, bytenr); @@ -9675,7 +9890,22 @@ static int find_first_block_group(struct btrfs_root *root, if (found_key.objectid >= key->objectid && found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { - ret = 0; + struct extent_map_tree *em_tree; + struct extent_map *em; + + em_tree = &root->fs_info->mapping_tree.map_tree; + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, found_key.objectid, + found_key.offset); + read_unlock(&em_tree->lock); + if (!em) { + btrfs_err(root->fs_info, + "logical %llu len %llu found bg but no related chunk", + found_key.objectid, found_key.offset); + ret = -ENOENT; + } else { + ret = 0; + } goto out; } path->slots[0]++; @@ -9791,13 +10021,15 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) space_info = list_entry(info->space_info.next, struct btrfs_space_info, list); - if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) { - if (WARN_ON(space_info->bytes_pinned > 0 || + + /* + * Do not hide this behind enospc_debug, this is actually + * important and indicates a real bug if this happens. + */ + if (WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_reserved > 0 || - space_info->bytes_may_use > 0)) { - dump_space_info(space_info, 0, 0); - } - } + space_info->bytes_may_use > 0)) + dump_space_info(space_info, 0, 0); list_del(&space_info->list); for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { struct kobject *kobj; @@ -9915,10 +10147,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) path->reada = READA_FORWARD; cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); - if (btrfs_test_opt(root, SPACE_CACHE) && + if (btrfs_test_opt(root->fs_info, SPACE_CACHE) && btrfs_super_generation(root->fs_info->super_copy) != cache_gen) need_clear = 1; - if (btrfs_test_opt(root, CLEAR_CACHE)) + if (btrfs_test_opt(root->fs_info, CLEAR_CACHE)) need_clear = 1; while (1) { @@ -9949,7 +10181,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) * b) Setting 'dirty flag' makes sure that we flush * the new space cache info onto disk. */ - if (btrfs_test_opt(root, SPACE_CACHE)) + if (btrfs_test_opt(root->fs_info, SPACE_CACHE)) cache->disk_cache_state = BTRFS_DC_CLEAR; } @@ -10005,9 +10237,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) goto error; } + trace_btrfs_add_block_group(root->fs_info, cache, 0); ret = update_space_info(info, cache->flags, found_key.offset, btrfs_block_group_used(&cache->item), - &space_info); + cache->bytes_super, &space_info); if (ret) { btrfs_remove_free_space_cache(cache); spin_lock(&info->block_group_cache_lock); @@ -10020,9 +10253,6 @@ int btrfs_read_block_groups(struct btrfs_root *root) } cache->space_info = space_info; - spin_lock(&cache->space_info->lock); - cache->space_info->bytes_readonly += cache->bytes_super; - spin_unlock(&cache->space_info->lock); __link_block_group(space_info, cache); @@ -10093,11 +10323,11 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, ret = btrfs_insert_item(trans, extent_root, &key, &item, sizeof(item)); if (ret) - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); ret = btrfs_finish_chunk_alloc(trans, extent_root, key.objectid, key.offset); if (ret) - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); add_block_group_free_space(trans, root->fs_info, block_group); /* already aborted the transaction if it failed. */ next: @@ -10114,7 +10344,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, int ret; struct btrfs_root *extent_root; struct btrfs_block_group_cache *cache; - extent_root = root->fs_info->extent_root; btrfs_set_log_full_commit(root->fs_info, trans); @@ -10160,7 +10389,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, * assigned to our block group, but don't update its counters just yet. * We want our bg to be added to the rbtree with its ->space_info set. */ - ret = update_space_info(root->fs_info, cache->flags, 0, 0, + ret = update_space_info(root->fs_info, cache->flags, 0, 0, 0, &cache->space_info); if (ret) { btrfs_remove_free_space_cache(cache); @@ -10179,8 +10408,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, * Now that our block group has its ->space_info set and is inserted in * the rbtree, update the space info's counters. */ + trace_btrfs_add_block_group(root->fs_info, cache, 1); ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, - &cache->space_info); + cache->bytes_super, &cache->space_info); if (ret) { btrfs_remove_free_space_cache(cache); spin_lock(&root->fs_info->block_group_cache_lock); @@ -10193,16 +10423,11 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, } update_global_block_rsv(root->fs_info); - spin_lock(&cache->space_info->lock); - cache->space_info->bytes_readonly += cache->bytes_super; - spin_unlock(&cache->space_info->lock); - __link_block_group(cache->space_info, cache); list_add_tail(&cache->bg_list, &trans->new_bgs); set_avail_alloc_bits(extent_root->fs_info, type); - return 0; } @@ -10415,7 +10640,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_lock(&block_group->space_info->lock); list_del_init(&block_group->ro_list); - if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { WARN_ON(block_group->space_info->total_bytes < block_group->key.offset); WARN_ON(block_group->space_info->bytes_readonly @@ -10683,7 +10908,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_unlock(&space_info->lock); /* DISCARD can flip during remount */ - trimming = btrfs_test_opt(root, DISCARD); + trimming = btrfs_test_opt(root->fs_info, DISCARD); /* Implicit trim during transaction commit. */ if (trimming) @@ -10747,21 +10972,21 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) mixed = 1; flags = BTRFS_BLOCK_GROUP_SYSTEM; - ret = update_space_info(fs_info, flags, 0, 0, &space_info); + ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); if (ret) goto out; if (mixed) { flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; - ret = update_space_info(fs_info, flags, 0, 0, &space_info); + ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); } else { flags = BTRFS_BLOCK_GROUP_METADATA; - ret = update_space_info(fs_info, flags, 0, 0, &space_info); + ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); if (ret) goto out; flags = BTRFS_BLOCK_GROUP_DATA; - ret = update_space_info(fs_info, flags, 0, 0, &space_info); + ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); } out: return ret; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cee4cb99b8ce..44fe66b53c8b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -163,13 +163,13 @@ int __init extent_io_init(void) { extent_state_cache = kmem_cache_create("btrfs_extent_state", sizeof(struct extent_state), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!extent_state_cache) return -ENOMEM; extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", sizeof(struct extent_buffer), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!extent_buffer_cache) goto free_state_cache; @@ -2049,7 +2049,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, return -EIO; } bio->bi_bdev = dev->bdev; - bio->bi_rw = WRITE_SYNC; + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC); bio_add_page(bio, page, length, pg_offset); if (btrfsic_submit_bio_wait(bio)) { @@ -2697,12 +2697,6 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) btrfs_bio->csum = NULL; btrfs_bio->csum_allocated = NULL; btrfs_bio->end_io = NULL; - -#ifdef CONFIG_BLK_CGROUP - /* FIXME, put this into bio_clone_bioset */ - if (bio->bi_css) - bio_associate_blkcg(new, bio->bi_css); -#endif } return new; } @@ -2756,7 +2750,6 @@ static int merge_bio(struct extent_io_tree *tree, struct page *page, if (tree->ops && tree->ops->merge_bio_hook) ret = tree->ops->merge_bio_hook(page, offset, size, bio, bio_flags); - BUG_ON(ret < 0); return ret; } @@ -2879,6 +2872,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * into the tree that are removed when the IO is done (by the end_io * handlers) * XXX JDM: This needs looking at to ensure proper page locking + * return 0 on success, otherwise return error */ static int __do_readpage(struct extent_io_tree *tree, struct page *page, @@ -2900,7 +2894,7 @@ static int __do_readpage(struct extent_io_tree *tree, sector_t sector; struct extent_map *em; struct block_device *bdev; - int ret; + int ret = 0; int nr = 0; size_t pg_offset = 0; size_t iosize; @@ -3081,6 +3075,7 @@ static int __do_readpage(struct extent_io_tree *tree, } else { SetPageError(page); unlock_extent(tree, cur, cur + iosize - 1); + goto out; } cur = cur + iosize; pg_offset += iosize; @@ -3091,7 +3086,7 @@ out: SetPageUptodate(page); unlock_page(page); } - return 0; + return ret; } static inline void __do_contiguous_readpages(struct extent_io_tree *tree, @@ -5230,14 +5225,31 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, atomic_set(&eb->io_pages, num_reads); for (i = start_i; i < num_pages; i++) { page = eb->pages[i]; + if (!PageUptodate(page)) { + if (ret) { + atomic_dec(&eb->io_pages); + unlock_page(page); + continue; + } + ClearPageError(page); err = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, &bio_flags, REQ_META); - if (err) + if (err) { ret = err; + /* + * We use &bio in above __extent_read_full_page, + * so we ensure that if it returns error, the + * current page fails to add itself to bio and + * it's been unlocked. + * + * We must dec io_pages by ourselves. + */ + atomic_dec(&eb->io_pages); + } } else { unlock_page(page); } diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index e0715fcfb11e..26f9ac719d20 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -13,7 +13,7 @@ int __init extent_map_init(void) { extent_map_cache = kmem_cache_create("btrfs_extent_map", sizeof(struct extent_map), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!extent_map_cache) return -ENOMEM; return 0; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 62a81ee13a5f..d0d571c47d33 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -250,7 +250,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, offset + root->sectorsize - 1, EXTENT_NODATASUM); } else { - btrfs_info(BTRFS_I(inode)->root->fs_info, + btrfs_info_rl(BTRFS_I(inode)->root->fs_info, "no csum found for inode %llu start %llu", btrfs_ino(inode), offset); } @@ -699,7 +699,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, */ ret = btrfs_split_item(trans, root, path, &key, offset); if (ret && ret != -EAGAIN) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 2234e88cf674..9404121fd5f7 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -132,7 +132,7 @@ static int __btrfs_add_inode_defrag(struct inode *inode, static inline int __need_auto_defrag(struct btrfs_root *root) { - if (!btrfs_test_opt(root, AUTO_DEFRAG)) + if (!btrfs_test_opt(root->fs_info, AUTO_DEFRAG)) return 0; if (btrfs_fs_closing(root->fs_info)) @@ -950,7 +950,7 @@ delete_extent_item: ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); break; } @@ -974,7 +974,7 @@ delete_extent_item: path->slots[0] = del_slot; ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); } leaf = path->nodes[0]; @@ -1190,7 +1190,7 @@ again: goto again; } if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -1278,7 +1278,7 @@ again: ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } @@ -1629,13 +1629,11 @@ again: * managed to copy. */ if (num_sectors > dirty_sectors) { - /* - * we round down because we don't want to count - * any partial blocks actually sent through the - * IO machines - */ - release_bytes = round_down(release_bytes - copied, - root->sectorsize); + + /* release everything except the sectors we dirtied */ + release_bytes -= dirty_sectors << + root->fs_info->sb->s_blocksize_bits; + if (copied > 0) { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; @@ -2479,7 +2477,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, - min_size); + min_size, 0); BUG_ON(ret); trans->block_rsv = rsv; @@ -2522,7 +2520,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, - rsv, min_size); + rsv, min_size, 0); BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; @@ -2977,7 +2975,7 @@ int btrfs_auto_defrag_init(void) { btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", sizeof(struct inode_defrag), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD, NULL); if (!btrfs_inode_defrag_cachep) return -ENOMEM; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 69d270f6602c..d571bd2b697b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -280,7 +280,7 @@ fail: if (locked) mutex_unlock(&trans->transaction->cache_write_mutex); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -3026,7 +3026,7 @@ int btrfs_find_space_cluster(struct btrfs_root *root, * For metadata, allow allocates with smaller extents. For * data, keep it dense. */ - if (btrfs_test_opt(root, SSD_SPREAD)) { + if (btrfs_test_opt(root->fs_info, SSD_SPREAD)) { cont1_bytes = min_bytes = bytes + empty_size; } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { cont1_bytes = bytes; @@ -3470,7 +3470,7 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root) int ret = 0; u64 root_gen = btrfs_root_generation(&root->root_item); - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return 0; /* @@ -3514,7 +3514,7 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, struct btrfs_io_ctl io_ctl; bool release_metadata = true; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return 0; memset(&io_ctl, 0, sizeof(io_ctl)); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 53dbeaf6ce94..87e7e3d3e676 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -305,7 +305,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, out: kvfree(bitmap); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -454,7 +454,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, out: kvfree(bitmap); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -851,7 +851,7 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); if (ret) - btrfs_abort_transaction(trans, fs_info->free_space_root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -1047,7 +1047,7 @@ int add_to_free_space_tree(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); if (ret) - btrfs_abort_transaction(trans, fs_info->free_space_root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -1193,7 +1193,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) abort: fs_info->creating_free_space_tree = 0; - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, tree_root); return ret; } @@ -1280,7 +1280,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) return 0; abort: - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, tree_root); return ret; } @@ -1333,7 +1333,7 @@ out: btrfs_free_path(path); mutex_unlock(&block_group->free_space_lock); if (ret) - btrfs_abort_transaction(trans, fs_info->free_space_root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -1410,7 +1410,7 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 70107f7c9307..aa6fabaee72e 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -38,7 +38,7 @@ static int caching_kthread(void *data) int slot; int ret; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return 0; path = btrfs_alloc_path(); @@ -141,7 +141,7 @@ static void start_caching(struct btrfs_root *root) int ret; u64 objectid; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return; spin_lock(&root->ino_cache_lock); @@ -185,7 +185,7 @@ static void start_caching(struct btrfs_root *root) int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) { - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return btrfs_find_free_objectid(root, objectid); again: @@ -211,7 +211,7 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid) { struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return; again: if (root->ino_cache_state == BTRFS_CACHE_FINISHED) { @@ -251,7 +251,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) struct rb_node *n; u64 count; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return; while (1) { @@ -412,7 +412,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root, if (btrfs_root_refs(&root->root_item) == 0) return 0; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return 0; path = btrfs_alloc_path(); @@ -458,7 +458,7 @@ again: BTRFS_I(inode)->generation = 0; ret = btrfs_update_inode(trans, root, inode); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_put; } @@ -466,7 +466,7 @@ again: ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode); if (ret) { if (ret != -ENOSPC) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_put; } } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index df731c0ebec7..b0f421f332ae 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -60,6 +60,7 @@ #include "hash.h" #include "props.h" #include "qgroup.h" +#include "dedupe.h" struct btrfs_iget_args { struct btrfs_key *location; @@ -105,8 +106,9 @@ static int btrfs_truncate(struct inode *inode); static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); static noinline int cow_file_range(struct inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, int unlock); + u64 start, u64 end, u64 delalloc_end, + int *page_started, unsigned long *nr_written, + int unlock, struct btrfs_dedupe_hash *hash); static struct extent_map *create_pinned_em(struct inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, @@ -294,7 +296,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, start, aligned_end, NULL, 1, 1, extent_item_size, &extent_inserted); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -305,7 +307,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, inline_len, compressed_size, compress_type, compressed_pages); if (ret && ret != -ENOSPC) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } else if (ret == -ENOSPC) { ret = 1; @@ -374,12 +376,12 @@ static inline int inode_need_compress(struct inode *inode) struct btrfs_root *root = BTRFS_I(inode)->root; /* force compress */ - if (btrfs_test_opt(root, FORCE_COMPRESS)) + if (btrfs_test_opt(root->fs_info, FORCE_COMPRESS)) return 1; /* bad compression ratios */ if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) return 0; - if (btrfs_test_opt(root, COMPRESS) || + if (btrfs_test_opt(root->fs_info, COMPRESS) || BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS || BTRFS_I(inode)->force_compress) return 1; @@ -585,9 +587,27 @@ cont: will_compress = 0; } else { num_bytes = total_in; + *num_added += 1; + + /* + * The async work queues will take care of doing actual + * allocation on disk for these compressed pages, and + * will submit them to the elevator. + */ + add_async_extent(async_cow, start, num_bytes, + total_compressed, pages, nr_pages_ret, + compress_type); + + if (start + num_bytes < end) { + start += num_bytes; + pages = NULL; + cond_resched(); + goto again; + } + return; } } - if (!will_compress && pages) { + if (pages) { /* * the compression code ran but failed to make things smaller, * free any pages it allocated and our page pointer array @@ -602,48 +622,28 @@ cont: nr_pages_ret = 0; /* flag the file so we don't compress in the future */ - if (!btrfs_test_opt(root, FORCE_COMPRESS) && + if (!btrfs_test_opt(root->fs_info, FORCE_COMPRESS) && !(BTRFS_I(inode)->force_compress)) { BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; } } - if (will_compress) { - *num_added += 1; - - /* the async work queues will take care of doing actual - * allocation on disk for these compressed pages, - * and will submit them to the elevator. - */ - add_async_extent(async_cow, start, num_bytes, - total_compressed, pages, nr_pages_ret, - compress_type); - - if (start + num_bytes < end) { - start += num_bytes; - pages = NULL; - cond_resched(); - goto again; - } - } else { cleanup_and_bail_uncompressed: - /* - * No compression, but we still need to write the pages in - * the file we've been given so far. redirty the locked - * page if it corresponds to our extent and set things up - * for the async work queue to run cow_file_range to do - * the normal delalloc dance - */ - if (page_offset(locked_page) >= start && - page_offset(locked_page) <= end) { - __set_page_dirty_nobuffers(locked_page); - /* unlocked later on in the async handlers */ - } - if (redirty) - extent_range_redirty_for_io(inode, start, end); - add_async_extent(async_cow, start, end - start + 1, - 0, NULL, 0, BTRFS_COMPRESS_NONE); - *num_added += 1; - } + /* + * No compression, but we still need to write the pages in the file + * we've been given so far. redirty the locked page if it corresponds + * to our extent and set things up for the async work queue to run + * cow_file_range to do the normal delalloc dance. + */ + if (page_offset(locked_page) >= start && + page_offset(locked_page) <= end) + __set_page_dirty_nobuffers(locked_page); + /* unlocked later on in the async handlers */ + + if (redirty) + extent_range_redirty_for_io(inode, start, end); + add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0, + BTRFS_COMPRESS_NONE); + *num_added += 1; return; @@ -712,7 +712,10 @@ retry: async_extent->start, async_extent->start + async_extent->ram_size - 1, - &page_started, &nr_written, 0); + async_extent->start + + async_extent->ram_size - 1, + &page_started, &nr_written, 0, + NULL); /* JDM XXX */ @@ -925,9 +928,9 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start, */ static noinline int cow_file_range(struct inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, - int unlock) + u64 start, u64 end, u64 delalloc_end, + int *page_started, unsigned long *nr_written, + int unlock, struct btrfs_dedupe_hash *hash) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 alloc_hint = 0; @@ -1156,7 +1159,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow->start = start; if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS && - !btrfs_test_opt(root, FORCE_COMPRESS)) + !btrfs_test_opt(root->fs_info, FORCE_COMPRESS)) cur_end = end; else cur_end = min(end, start + SZ_512K - 1); @@ -1418,7 +1421,8 @@ out_check: if (cow_start != (u64)-1) { ret = cow_file_range(inode, locked_page, cow_start, found_key.offset - 1, - page_started, nr_written, 1); + end, page_started, nr_written, 1, + NULL); if (ret) { if (!nolock && nocow) btrfs_end_write_no_snapshoting(root); @@ -1501,8 +1505,8 @@ out_check: } if (cow_start != (u64)-1) { - ret = cow_file_range(inode, locked_page, cow_start, end, - page_started, nr_written, 1); + ret = cow_file_range(inode, locked_page, cow_start, end, end, + page_started, nr_written, 1, NULL); if (ret) goto error; } @@ -1561,8 +1565,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); } else if (!inode_need_compress(inode)) { - ret = cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 1); + ret = cow_file_range(inode, locked_page, start, end, end, + page_started, nr_written, 1, NULL); } else { set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags); @@ -1740,7 +1744,7 @@ static void btrfs_set_bit_hook(struct inode *inode, } /* For sanity tests */ - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return; __percpu_counter_add(&root->fs_info->delalloc_bytes, len, @@ -1799,7 +1803,7 @@ static void btrfs_clear_bit_hook(struct inode *inode, btrfs_delalloc_release_metadata(inode, len); /* For sanity tests. */ - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return; if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID @@ -1822,6 +1826,10 @@ static void btrfs_clear_bit_hook(struct inode *inode, /* * extent_io.c merge_bio_hook, this must check the chunk tree to make sure * we don't create bios that span stripes or chunks + * + * return 1 if page cannot be merged to bio + * return 0 if page can be merged to bio + * return error otherwise */ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio, @@ -1840,8 +1848,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, map_length = length; ret = btrfs_map_block(root->fs_info, bio_op(bio), logical, &map_length, NULL, 0); - /* Will always return 0 with map_multi == NULL */ - BUG_ON(ret < 0); + if (ret < 0) + return ret; if (map_length < length + size) return 1; return 0; @@ -2594,7 +2602,7 @@ again: ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent)); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -2621,7 +2629,7 @@ again: backref->root_id, backref->inum, new->file_pos); /* start - extent_offset */ if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -2890,7 +2898,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) trans->block_rsv = &root->fs_info->delalloc_block_rsv; ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) /* -ENOMEM or corruption */ - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -2950,7 +2958,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ordered_extent->file_offset, ordered_extent->len, trans->transid); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_unlock; } @@ -2960,7 +2968,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) btrfs_ordered_update_i_size(inode, 0, ordered_extent); ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) { /* -ENOMEM or corruption */ - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_unlock; } ret = 0; @@ -3204,7 +3212,7 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, root->root_key.objectid); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); else clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); @@ -3295,7 +3303,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) if (ret != -EEXIST) { clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &BTRFS_I(inode)->runtime_flags); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } } @@ -3307,7 +3315,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, root->root_key.objectid); if (ret && ret != -EEXIST) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } } @@ -4006,20 +4014,20 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, btrfs_info(root->fs_info, "failed to delete reference to %.*s, inode %llu parent %llu", name_len, name, ino, dir_ino); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto err; } skip_backref: ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto err; } ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, dir_ino); if (ret != 0 && ret != -ENOENT) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto err; } @@ -4028,7 +4036,7 @@ skip_backref: if (ret == -ENOENT) ret = 0; else if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); err: btrfs_free_path(path); if (ret) @@ -4142,7 +4150,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); ret = btrfs_delete_one_dir_name(trans, root, path, di); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); @@ -4152,7 +4160,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, dir_ino, &index, name, name_len); if (ret < 0) { if (ret != -ENOENT) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } di = btrfs_search_dir_index_item(root, path, dir_ino, @@ -4162,7 +4170,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ret = -ENOENT; else ret = PTR_ERR(di); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -4175,7 +4183,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -4184,7 +4192,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb); ret = btrfs_update_inode_fallback(trans, root, dir); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); out: btrfs_free_path(path); return ret; @@ -4505,7 +4513,6 @@ search_again: pending_del_nr); if (err) { btrfs_abort_transaction(trans, - root, err); goto error; } @@ -4517,8 +4524,7 @@ search_again: item_end, new_size); if (err) { - btrfs_abort_transaction(trans, - root, err); + btrfs_abort_transaction(trans, err); goto error; } } else if (test_bit(BTRFS_ROOT_REF_COWS, @@ -4582,8 +4588,7 @@ delete: pending_del_slot, pending_del_nr); if (ret) { - btrfs_abort_transaction(trans, - root, ret); + btrfs_abort_transaction(trans, ret); goto error; } pending_del_nr = 0; @@ -4616,7 +4621,7 @@ out: ret = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); } error: if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) @@ -4785,7 +4790,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, root); return ret; } @@ -4793,7 +4798,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, 0, 0, len, 0, len, 0, 0, 0); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); else btrfs_update_inode(trans, root, inode); btrfs_end_transaction(trans, root); @@ -5020,7 +5025,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) i_size_write(inode, BTRFS_I(inode)->disk_i_size); err = btrfs_orphan_del(trans, inode); if (err) - btrfs_abort_transaction(trans, root, err); + btrfs_abort_transaction(trans, err); btrfs_end_transaction(trans, root); } } @@ -5158,11 +5163,18 @@ void btrfs_evict_inode(struct inode *inode) struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv, *global_rsv; int steal_from_global = 0; - u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); + u64 min_size; int ret; trace_btrfs_inode_evict(inode); + if (!root) { + kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); + return; + } + + min_size = btrfs_calc_trunc_metadata_size(root, 1); + evict_inode_truncate_pages(inode); if (inode->i_nlink && @@ -5263,7 +5275,7 @@ void btrfs_evict_inode(struct inode *inode) if (steal_from_global) { if (!btrfs_check_space_for_delayed_refs(trans, root)) ret = btrfs_block_rsv_migrate(global_rsv, rsv, - min_size); + min_size, 0); else ret = -ENOSPC; } @@ -6239,9 +6251,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_inherit_iflags(inode, dir); if (S_ISREG(mode)) { - if (btrfs_test_opt(root, NODATASUM)) + if (btrfs_test_opt(root->fs_info, NODATASUM)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; - if (btrfs_test_opt(root, NODATACOW)) + if (btrfs_test_opt(root->fs_info, NODATACOW)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM; } @@ -6319,7 +6331,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; else if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -6330,7 +6342,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, current_fs_time(parent_inode->i_sb); ret = btrfs_update_inode(trans, root, parent_inode); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; fail_dir_item: @@ -9116,7 +9128,7 @@ static int btrfs_truncate(struct inode *inode) /* Migrate the slack space for the truncate to our reserve */ ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, - min_size); + min_size, 0); BUG_ON(ret); /* @@ -9156,7 +9168,7 @@ static int btrfs_truncate(struct inode *inode) } ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, - rsv, min_size); + rsv, min_size, 0); BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; } @@ -9177,7 +9189,6 @@ static int btrfs_truncate(struct inode *inode) ret = btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); } - out: btrfs_free_block_rsv(root, rsv); @@ -9386,25 +9397,25 @@ int btrfs_init_cachep(void) btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", sizeof(struct btrfs_trans_handle), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); if (!btrfs_trans_handle_cachep) goto fail; btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction", sizeof(struct btrfs_transaction), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); if (!btrfs_transaction_cachep) goto fail; btrfs_path_cachep = kmem_cache_create("btrfs_path", sizeof(struct btrfs_path), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_path_cachep) goto fail; btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", sizeof(struct btrfs_free_space), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_free_space_cachep) goto fail; @@ -9554,7 +9565,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, ret = btrfs_update_inode(trans, root, old_inode); } if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -9574,7 +9585,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, ret = btrfs_update_inode(trans, dest, new_inode); } if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -9582,7 +9593,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, new_dentry->d_name.name, new_dentry->d_name.len, 0, old_idx); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -9590,7 +9601,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, old_dentry->d_name.name, old_dentry->d_name.len, 0, new_idx); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -9829,7 +9840,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, ret = btrfs_update_inode(trans, root, old_inode); } if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -9853,7 +9864,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!ret && new_inode->i_nlink == 0) ret = btrfs_orphan_add(trans, d_inode(new_dentry)); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } } @@ -9862,7 +9873,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_dentry->d_name.name, new_dentry->d_name.len, 0, index); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -9882,7 +9893,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, old_dentry); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } } @@ -10308,7 +10319,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, if (ret) { btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); if (own_trans) btrfs_end_transaction(trans, root); break; @@ -10368,7 +10379,7 @@ next: ret = btrfs_update_inode(trans, root, inode); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); if (own_trans) btrfs_end_transaction(trans, root); break; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 05173563e4a6..14ed1e9e6bc8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -561,7 +561,7 @@ static noinline int create_subvol(struct inode *dir, new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -570,7 +570,7 @@ static noinline int create_subvol(struct inode *dir, ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid); if (ret) { /* We potentially lose an unused inode item here */ - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -583,7 +583,7 @@ static noinline int create_subvol(struct inode *dir, */ ret = btrfs_set_inode_index(dir, &index); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -591,7 +591,7 @@ static noinline int create_subvol(struct inode *dir, name, namelen, dir, &key, BTRFS_FT_DIR, index); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -608,7 +608,7 @@ static noinline int create_subvol(struct inode *dir, root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); fail: kfree(root_item); @@ -1948,8 +1948,7 @@ static noinline int key_in_sk(struct btrfs_key *key, return 1; } -static noinline int copy_to_sk(struct btrfs_root *root, - struct btrfs_path *path, +static noinline int copy_to_sk(struct btrfs_path *path, struct btrfs_key *key, struct btrfs_ioctl_search_key *sk, size_t *buf_size, @@ -2120,7 +2119,7 @@ static noinline int search_ioctl(struct inode *inode, ret = 0; goto err; } - ret = copy_to_sk(root, path, &key, sk, buf_size, ubuf, + ret = copy_to_sk(path, &key, sk, buf_size, ubuf, &sk_offset, &num_found); btrfs_release_path(path); if (ret) @@ -2406,7 +2405,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * rmdir(2). */ err = -EPERM; - if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) + if (!btrfs_test_opt(root->fs_info, USER_SUBVOL_RM_ALLOWED)) goto out_dput; /* @@ -2489,7 +2488,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, dentry->d_name.len); if (ret) { err = ret; - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -2505,7 +2504,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, root->fs_info->tree_root, dest->root_key.objectid); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); err = ret; goto out_end_trans; } @@ -2515,7 +2514,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL, dest->root_key.objectid); if (ret && ret != -ENOENT) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); err = ret; goto out_end_trans; } @@ -2525,7 +2524,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, BTRFS_UUID_KEY_RECEIVED_SUBVOL, dest->root_key.objectid); if (ret && ret != -ENOENT) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); err = ret; goto out_end_trans; } @@ -3292,7 +3291,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, root, inode); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, root); goto out; } @@ -3694,7 +3693,7 @@ process_slot: if (ret) { if (ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, - root, ret); + ret); btrfs_end_transaction(trans, root); goto out; } @@ -3702,8 +3701,7 @@ process_slot: ret = btrfs_insert_empty_item(trans, root, path, &new_key, size); if (ret) { - btrfs_abort_transaction(trans, root, - ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, root); goto out; } @@ -3735,7 +3733,6 @@ process_slot: new_key.offset - datao); if (ret) { btrfs_abort_transaction(trans, - root, ret); btrfs_end_transaction(trans, root); @@ -3772,7 +3769,6 @@ process_slot: if (ret) { if (ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, - root, ret); btrfs_end_transaction(trans, root); goto out; @@ -3828,7 +3824,7 @@ process_slot: last_dest_end, destoff + len, 1); if (ret) { if (ret != -EOPNOTSUPP) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, root); goto out; } @@ -5164,13 +5160,13 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, BTRFS_UUID_KEY_RECEIVED_SUBVOL, root->root_key.objectid); if (ret < 0 && ret != -EEXIST) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } ret = btrfs_commit_transaction(trans, root); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index aca8264f4a49..3b78d38173b3 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -1122,7 +1122,7 @@ int __init ordered_data_init(void) { btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", sizeof(struct btrfs_ordered_extent), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD, NULL); if (!btrfs_ordered_extent_cache) return -ENOMEM; diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 36992128c746..cf0b444ac4f3 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -350,6 +350,7 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_root *parent_root) { + struct super_block *sb = root->fs_info->sb; struct btrfs_key key; struct inode *parent_inode, *child_inode; int ret; @@ -358,12 +359,11 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - parent_inode = btrfs_iget(parent_root->fs_info->sb, &key, - parent_root, NULL); + parent_inode = btrfs_iget(sb, &key, parent_root, NULL); if (IS_ERR(parent_inode)) return PTR_ERR(parent_inode); - child_inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); + child_inode = btrfs_iget(sb, &key, root, NULL); if (IS_ERR(child_inode)) { iput(parent_inode); return PTR_ERR(child_inode); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 9d4c05b14f6e..93ee1c18ef9d 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -571,7 +571,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_key key; - if (btrfs_test_is_dummy_root(quota_root)) + if (btrfs_is_testing(quota_root->fs_info)) return 0; path = btrfs_alloc_path(); @@ -728,7 +728,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, int ret; int slot; - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return 0; key.objectid = 0; @@ -1453,9 +1453,10 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, return ret; } -struct btrfs_qgroup_extent_record -*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record) +struct btrfs_qgroup_extent_record * +btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record) { struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; struct rb_node *parent_node = NULL; @@ -1463,7 +1464,7 @@ struct btrfs_qgroup_extent_record u64 bytenr = record->bytenr; assert_spin_locked(&delayed_refs->lock); - trace_btrfs_qgroup_insert_dirty_extent(record); + trace_btrfs_qgroup_insert_dirty_extent(fs_info, record); while (*p) { parent_node = *p; @@ -1595,8 +1596,8 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); - trace_qgroup_update_counters(qg->qgroupid, cur_old_count, - cur_new_count); + trace_qgroup_update_counters(fs_info, qg->qgroupid, + cur_old_count, cur_new_count); /* Rfer update part */ if (cur_old_count == 0 && cur_new_count > 0) { @@ -1687,8 +1688,8 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, goto out_free; BUG_ON(!fs_info->quota_root); - trace_btrfs_qgroup_account_extent(bytenr, num_bytes, nr_old_roots, - nr_new_roots); + trace_btrfs_qgroup_account_extent(fs_info, bytenr, num_bytes, + nr_old_roots, nr_new_roots); qgroups = ulist_alloc(GFP_NOFS); if (!qgroups) { @@ -1759,7 +1760,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, record = rb_entry(node, struct btrfs_qgroup_extent_record, node); - trace_btrfs_qgroup_account_extents(record); + trace_btrfs_qgroup_account_extents(fs_info, record); if (!ret) { /* @@ -2195,7 +2196,7 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) { if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) return; - btrfs_err(trans->root->fs_info, + btrfs_err(trans->fs_info, "qgroups not uptodate in trans handle %p: list is%s empty, " "seq is %#x.%x", trans, list_empty(&trans->qgroup_ref_list) ? "" : " not", diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index ecb2c143ef75..710887c06aaf 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -63,9 +63,10 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); struct btrfs_delayed_extent_op; int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); -struct btrfs_qgroup_extent_record -*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record); +struct btrfs_qgroup_extent_record * +btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record); int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, @@ -88,7 +89,7 @@ static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes) { btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes); - trace_btrfs_qgroup_free_delayed_ref(ref_root, num_bytes); + trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes); } void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 0477dca154ed..b26a5aea41b4 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -235,12 +235,12 @@ static void backref_cache_cleanup(struct backref_cache *cache) cache->last_trans = 0; for (i = 0; i < BTRFS_MAX_LEVEL; i++) - BUG_ON(!list_empty(&cache->pending[i])); - BUG_ON(!list_empty(&cache->changed)); - BUG_ON(!list_empty(&cache->detached)); - BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root)); - BUG_ON(cache->nr_nodes); - BUG_ON(cache->nr_edges); + ASSERT(list_empty(&cache->pending[i])); + ASSERT(list_empty(&cache->changed)); + ASSERT(list_empty(&cache->detached)); + ASSERT(RB_EMPTY_ROOT(&cache->rb_root)); + ASSERT(!cache->nr_nodes); + ASSERT(!cache->nr_edges); } static struct backref_node *alloc_backref_node(struct backref_cache *cache) @@ -1171,8 +1171,12 @@ out: lower = list_entry(useless.next, struct backref_node, list); list_del_init(&lower->list); + if (lower == node) + node = NULL; free_backref_node(cache, lower); } + + free_backref_node(cache, node); return ERR_PTR(err); } ASSERT(!node || !node->detached); @@ -1719,7 +1723,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, btrfs_header_owner(leaf), key.objectid, key.offset); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); break; } @@ -1727,7 +1731,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, parent, btrfs_header_owner(leaf), key.objectid, key.offset); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); break; } } @@ -2604,25 +2608,28 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, trans->block_rsv = rc->block_rsv; rc->reserved_bytes += num_bytes; + + /* + * We are under a transaction here so we can only do limited flushing. + * If we get an enospc just kick back -EAGAIN so we know to drop the + * transaction and try to refill when we can flush all the things. + */ ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes, - BTRFS_RESERVE_FLUSH_ALL); + BTRFS_RESERVE_FLUSH_LIMIT); if (ret) { - if (ret == -EAGAIN) { - tmp = rc->extent_root->nodesize * - RELOCATION_RESERVED_NODES; - while (tmp <= rc->reserved_bytes) - tmp <<= 1; - /* - * only one thread can access block_rsv at this point, - * so we don't need hold lock to protect block_rsv. - * we expand more reservation size here to allow enough - * space for relocation and we will return earlier in - * enospc case. - */ - rc->block_rsv->size = tmp + rc->extent_root->nodesize * - RELOCATION_RESERVED_NODES; - } - return ret; + tmp = rc->extent_root->nodesize * RELOCATION_RESERVED_NODES; + while (tmp <= rc->reserved_bytes) + tmp <<= 1; + /* + * only one thread can access block_rsv at this point, + * so we don't need hold lock to protect block_rsv. + * we expand more reservation size here to allow enough + * space for relocation and we will return eailer in + * enospc case. + */ + rc->block_rsv->size = tmp + rc->extent_root->nodesize * + RELOCATION_RESERVED_NODES; + return -EAGAIN; } return 0; @@ -3871,6 +3878,7 @@ static noinline_for_stack int prepare_to_relocate(struct reloc_control *rc) { struct btrfs_trans_handle *trans; + int ret; rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root, BTRFS_BLOCK_RSV_TEMP); @@ -3885,6 +3893,11 @@ int prepare_to_relocate(struct reloc_control *rc) rc->reserved_bytes = 0; rc->block_rsv->size = rc->extent_root->nodesize * RELOCATION_RESERVED_NODES; + ret = btrfs_block_rsv_refill(rc->extent_root, + rc->block_rsv, rc->block_rsv->size, + BTRFS_RESERVE_FLUSH_ALL); + if (ret) + return ret; rc->create_reloc_tree = 1; set_reloc_control(rc); @@ -4643,7 +4656,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, if (rc->merge_reloc_tree) { ret = btrfs_block_rsv_migrate(&pending->block_rsv, rc->block_rsv, - rc->nodes_relocated); + rc->nodes_relocated, 1); if (ret) return ret; } diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index f1c30861d062..7fd7e1830cfe 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -150,7 +150,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root ret = btrfs_search_slot(trans, root, key, path, 0, 1); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -176,20 +176,20 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root ret = btrfs_search_slot(trans, root, key, path, -1, 1); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_del_item(trans, root, path); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); ret = btrfs_insert_empty_item(trans, root, path, key, sizeof(*item)); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } l = path->nodes[0]; @@ -448,7 +448,7 @@ again: ret = btrfs_insert_empty_item(trans, tree_root, path, &key, sizeof(*ref) + name_len); if (ret) { - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index e08b6bc676e3..1d195d2b32c6 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3785,27 +3785,27 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, if (fs_info->scrub_workers_refcnt == 0) { if (is_dev_replace) fs_info->scrub_workers = - btrfs_alloc_workqueue("scrub", flags, + btrfs_alloc_workqueue(fs_info, "scrub", flags, 1, 4); else fs_info->scrub_workers = - btrfs_alloc_workqueue("scrub", flags, + btrfs_alloc_workqueue(fs_info, "scrub", flags, max_active, 4); if (!fs_info->scrub_workers) goto fail_scrub_workers; fs_info->scrub_wr_completion_workers = - btrfs_alloc_workqueue("scrubwrc", flags, + btrfs_alloc_workqueue(fs_info, "scrubwrc", flags, max_active, 2); if (!fs_info->scrub_wr_completion_workers) goto fail_scrub_wr_completion_workers; fs_info->scrub_nocow_workers = - btrfs_alloc_workqueue("scrubnc", flags, 1, 0); + btrfs_alloc_workqueue(fs_info, "scrubnc", flags, 1, 0); if (!fs_info->scrub_nocow_workers) goto fail_scrub_nocow_workers; fs_info->scrub_parity_workers = - btrfs_alloc_workqueue("scrubparity", flags, + btrfs_alloc_workqueue(fs_info, "scrubparity", flags, max_active, 2); if (!fs_info->scrub_parity_workers) goto fail_scrub_parity_workers; @@ -3860,7 +3860,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (fs_info->chunk_root->sectorsize != PAGE_SIZE) { /* not supported for data w/o checksums */ - btrfs_err(fs_info, + btrfs_err_rl(fs_info, "scrub: size assumption sectorsize != PAGE_SIZE " "(%d != %lu) fails", fs_info->chunk_root->sectorsize, PAGE_SIZE); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 60e7179ed4b7..864ce334f696 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -184,6 +184,22 @@ static const char * const logtypes[] = { "debug", }; + +/* + * Use one ratelimit state per log level so that a flood of less important + * messages doesn't cause more important ones to be dropped. + */ +static struct ratelimit_state printk_limits[] = { + RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100), +}; + void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { struct super_block *sb = fs_info->sb; @@ -192,6 +208,7 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) va_list args; const char *type = logtypes[4]; int kern_level; + struct ratelimit_state *ratelimit; va_start(args, fmt); @@ -202,13 +219,18 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) lvl[size] = '\0'; fmt += size; type = logtypes[kern_level - '0']; - } else + ratelimit = &printk_limits[kern_level - '0']; + } else { *lvl = '\0'; + /* Default to debug output */ + ratelimit = &printk_limits[7]; + } vaf.fmt = fmt; vaf.va = &args; - printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf); + if (__ratelimit(ratelimit)) + printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf); va_end(args); } @@ -229,9 +251,11 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) */ __cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const char *function, + const char *function, unsigned int line, int errno) { + struct btrfs_fs_info *fs_info = trans->fs_info; + trans->aborted = errno; /* Nothing used. The other threads that have joined this * transaction may be able to continue. */ @@ -239,16 +263,16 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, const char *errstr; errstr = btrfs_decode_error(errno); - btrfs_warn(root->fs_info, + btrfs_warn(fs_info, "%s:%d: Aborting unused transaction(%s).", function, line, errstr); return; } ACCESS_ONCE(trans->transaction->aborted) = errno; /* Wake up anybody who may be waiting on this transaction */ - wake_up(&root->fs_info->transaction_wait); - wake_up(&root->fs_info->transaction_blocked_wait); - __btrfs_handle_fs_error(root->fs_info, function, line, errno, NULL); + wake_up(&fs_info->transaction_wait); + wake_up(&fs_info->transaction_blocked_wait); + __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); } /* * __btrfs_panic decodes unexpected, fatal errors from the caller, @@ -432,12 +456,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, */ break; case Opt_nodatasum: - btrfs_set_and_info(root, NODATASUM, + btrfs_set_and_info(info, NODATASUM, "setting nodatasum"); break; case Opt_datasum: - if (btrfs_test_opt(root, NODATASUM)) { - if (btrfs_test_opt(root, NODATACOW)) + if (btrfs_test_opt(info, NODATASUM)) { + if (btrfs_test_opt(info, NODATACOW)) btrfs_info(root->fs_info, "setting datasum, datacow enabled"); else btrfs_info(root->fs_info, "setting datasum"); @@ -446,9 +470,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, btrfs_clear_opt(info->mount_opt, NODATASUM); break; case Opt_nodatacow: - if (!btrfs_test_opt(root, NODATACOW)) { - if (!btrfs_test_opt(root, COMPRESS) || - !btrfs_test_opt(root, FORCE_COMPRESS)) { + if (!btrfs_test_opt(info, NODATACOW)) { + if (!btrfs_test_opt(info, COMPRESS) || + !btrfs_test_opt(info, FORCE_COMPRESS)) { btrfs_info(root->fs_info, "setting nodatacow, compression disabled"); } else { @@ -461,7 +485,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, btrfs_set_opt(info->mount_opt, NODATASUM); break; case Opt_datacow: - btrfs_clear_and_info(root, NODATACOW, + btrfs_clear_and_info(info, NODATACOW, "setting datacow"); break; case Opt_compress_force: @@ -470,10 +494,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, /* Fallthrough */ case Opt_compress: case Opt_compress_type: - saved_compress_type = btrfs_test_opt(root, COMPRESS) ? + saved_compress_type = btrfs_test_opt(info, + COMPRESS) ? info->compress_type : BTRFS_COMPRESS_NONE; saved_compress_force = - btrfs_test_opt(root, FORCE_COMPRESS); + btrfs_test_opt(info, FORCE_COMPRESS); if (token == Opt_compress || token == Opt_compress_force || strcmp(args[0].from, "zlib") == 0) { @@ -513,10 +538,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, */ btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); } - if ((btrfs_test_opt(root, COMPRESS) && + if ((btrfs_test_opt(info, COMPRESS) && (info->compress_type != saved_compress_type || compress_force != saved_compress_force)) || - (!btrfs_test_opt(root, COMPRESS) && + (!btrfs_test_opt(info, COMPRESS) && no_compress == 1)) { btrfs_info(root->fs_info, "%s %s compression", @@ -526,25 +551,25 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, compress_force = false; break; case Opt_ssd: - btrfs_set_and_info(root, SSD, + btrfs_set_and_info(info, SSD, "use ssd allocation scheme"); break; case Opt_ssd_spread: - btrfs_set_and_info(root, SSD_SPREAD, + btrfs_set_and_info(info, SSD_SPREAD, "use spread ssd allocation scheme"); btrfs_set_opt(info->mount_opt, SSD); break; case Opt_nossd: - btrfs_set_and_info(root, NOSSD, + btrfs_set_and_info(info, NOSSD, "not using ssd allocation scheme"); btrfs_clear_opt(info->mount_opt, SSD); break; case Opt_barrier: - btrfs_clear_and_info(root, NOBARRIER, + btrfs_clear_and_info(info, NOBARRIER, "turning on barriers"); break; case Opt_nobarrier: - btrfs_set_and_info(root, NOBARRIER, + btrfs_set_and_info(info, NOBARRIER, "turning off barriers"); break; case Opt_thread_pool: @@ -604,24 +629,24 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, root->fs_info->sb->s_flags &= ~MS_POSIXACL; break; case Opt_notreelog: - btrfs_set_and_info(root, NOTREELOG, + btrfs_set_and_info(info, NOTREELOG, "disabling tree log"); break; case Opt_treelog: - btrfs_clear_and_info(root, NOTREELOG, + btrfs_clear_and_info(info, NOTREELOG, "enabling tree log"); break; case Opt_norecovery: case Opt_nologreplay: - btrfs_set_and_info(root, NOLOGREPLAY, + btrfs_set_and_info(info, NOLOGREPLAY, "disabling log replay at mount time"); break; case Opt_flushoncommit: - btrfs_set_and_info(root, FLUSHONCOMMIT, + btrfs_set_and_info(info, FLUSHONCOMMIT, "turning on flush-on-commit"); break; case Opt_noflushoncommit: - btrfs_clear_and_info(root, FLUSHONCOMMIT, + btrfs_clear_and_info(info, FLUSHONCOMMIT, "turning off flush-on-commit"); break; case Opt_ratio: @@ -638,11 +663,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, } break; case Opt_discard: - btrfs_set_and_info(root, DISCARD, + btrfs_set_and_info(info, DISCARD, "turning on discard"); break; case Opt_nodiscard: - btrfs_clear_and_info(root, DISCARD, + btrfs_clear_and_info(info, DISCARD, "turning off discard"); break; case Opt_space_cache: @@ -651,12 +676,13 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, strcmp(args[0].from, "v1") == 0) { btrfs_clear_opt(root->fs_info->mount_opt, FREE_SPACE_TREE); - btrfs_set_and_info(root, SPACE_CACHE, + btrfs_set_and_info(info, SPACE_CACHE, "enabling disk space caching"); } else if (strcmp(args[0].from, "v2") == 0) { btrfs_clear_opt(root->fs_info->mount_opt, SPACE_CACHE); - btrfs_set_and_info(root, FREE_SPACE_TREE, + btrfs_set_and_info(info, + FREE_SPACE_TREE, "enabling free space tree"); } else { ret = -EINVAL; @@ -667,12 +693,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); break; case Opt_no_space_cache: - if (btrfs_test_opt(root, SPACE_CACHE)) { - btrfs_clear_and_info(root, SPACE_CACHE, + if (btrfs_test_opt(info, SPACE_CACHE)) { + btrfs_clear_and_info(info, + SPACE_CACHE, "disabling disk space caching"); } - if (btrfs_test_opt(root, FREE_SPACE_TREE)) { - btrfs_clear_and_info(root, FREE_SPACE_TREE, + if (btrfs_test_opt(info, FREE_SPACE_TREE)) { + btrfs_clear_and_info(info, + FREE_SPACE_TREE, "disabling free space tree"); } break; @@ -685,7 +713,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, "disabling inode map caching"); break; case Opt_clear_cache: - btrfs_set_and_info(root, CLEAR_CACHE, + btrfs_set_and_info(info, CLEAR_CACHE, "force clearing of disk cache"); break; case Opt_user_subvol_rm_allowed: @@ -698,11 +726,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG); break; case Opt_defrag: - btrfs_set_and_info(root, AUTO_DEFRAG, + btrfs_set_and_info(info, AUTO_DEFRAG, "enabling auto defrag"); break; case Opt_nodefrag: - btrfs_clear_and_info(root, AUTO_DEFRAG, + btrfs_clear_and_info(info, AUTO_DEFRAG, "disabling auto defrag"); break; case Opt_recovery: @@ -810,22 +838,22 @@ check: /* * Extra check for current option against current flag */ - if (btrfs_test_opt(root, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) { + if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) { btrfs_err(root->fs_info, "nologreplay must be used with ro mount option"); ret = -EINVAL; } out: if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) && - !btrfs_test_opt(root, FREE_SPACE_TREE) && - !btrfs_test_opt(root, CLEAR_CACHE)) { + !btrfs_test_opt(info, FREE_SPACE_TREE) && + !btrfs_test_opt(info, CLEAR_CACHE)) { btrfs_err(root->fs_info, "cannot disable free space tree"); ret = -EINVAL; } - if (!ret && btrfs_test_opt(root, SPACE_CACHE)) + if (!ret && btrfs_test_opt(info, SPACE_CACHE)) btrfs_info(root->fs_info, "disk space caching is enabled"); - if (!ret && btrfs_test_opt(root, FREE_SPACE_TREE)) + if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE)) btrfs_info(root->fs_info, "using free space tree"); kfree(orig); return ret; @@ -1149,7 +1177,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root = fs_info->tree_root; - trace_btrfs_sync_fs(wait); + trace_btrfs_sync_fs(fs_info, wait); if (!wait) { filemap_flush(fs_info->btree_inode->i_mapping); @@ -1192,13 +1220,13 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) struct btrfs_root *root = info->tree_root; char *compress_type; - if (btrfs_test_opt(root, DEGRADED)) + if (btrfs_test_opt(info, DEGRADED)) seq_puts(seq, ",degraded"); - if (btrfs_test_opt(root, NODATASUM)) + if (btrfs_test_opt(info, NODATASUM)) seq_puts(seq, ",nodatasum"); - if (btrfs_test_opt(root, NODATACOW)) + if (btrfs_test_opt(info, NODATACOW)) seq_puts(seq, ",nodatacow"); - if (btrfs_test_opt(root, NOBARRIER)) + if (btrfs_test_opt(info, NOBARRIER)) seq_puts(seq, ",nobarrier"); if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE) seq_printf(seq, ",max_inline=%llu", info->max_inline); @@ -1207,56 +1235,56 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) if (info->thread_pool_size != min_t(unsigned long, num_online_cpus() + 2, 8)) seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); - if (btrfs_test_opt(root, COMPRESS)) { + if (btrfs_test_opt(info, COMPRESS)) { if (info->compress_type == BTRFS_COMPRESS_ZLIB) compress_type = "zlib"; else compress_type = "lzo"; - if (btrfs_test_opt(root, FORCE_COMPRESS)) + if (btrfs_test_opt(info, FORCE_COMPRESS)) seq_printf(seq, ",compress-force=%s", compress_type); else seq_printf(seq, ",compress=%s", compress_type); } - if (btrfs_test_opt(root, NOSSD)) + if (btrfs_test_opt(info, NOSSD)) seq_puts(seq, ",nossd"); - if (btrfs_test_opt(root, SSD_SPREAD)) + if (btrfs_test_opt(info, SSD_SPREAD)) seq_puts(seq, ",ssd_spread"); - else if (btrfs_test_opt(root, SSD)) + else if (btrfs_test_opt(info, SSD)) seq_puts(seq, ",ssd"); - if (btrfs_test_opt(root, NOTREELOG)) + if (btrfs_test_opt(info, NOTREELOG)) seq_puts(seq, ",notreelog"); - if (btrfs_test_opt(root, NOLOGREPLAY)) + if (btrfs_test_opt(info, NOLOGREPLAY)) seq_puts(seq, ",nologreplay"); - if (btrfs_test_opt(root, FLUSHONCOMMIT)) + if (btrfs_test_opt(info, FLUSHONCOMMIT)) seq_puts(seq, ",flushoncommit"); - if (btrfs_test_opt(root, DISCARD)) + if (btrfs_test_opt(info, DISCARD)) seq_puts(seq, ",discard"); if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) seq_puts(seq, ",noacl"); - if (btrfs_test_opt(root, SPACE_CACHE)) + if (btrfs_test_opt(info, SPACE_CACHE)) seq_puts(seq, ",space_cache"); - else if (btrfs_test_opt(root, FREE_SPACE_TREE)) + else if (btrfs_test_opt(info, FREE_SPACE_TREE)) seq_puts(seq, ",space_cache=v2"); else seq_puts(seq, ",nospace_cache"); - if (btrfs_test_opt(root, RESCAN_UUID_TREE)) + if (btrfs_test_opt(info, RESCAN_UUID_TREE)) seq_puts(seq, ",rescan_uuid_tree"); - if (btrfs_test_opt(root, CLEAR_CACHE)) + if (btrfs_test_opt(info, CLEAR_CACHE)) seq_puts(seq, ",clear_cache"); - if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) + if (btrfs_test_opt(info, USER_SUBVOL_RM_ALLOWED)) seq_puts(seq, ",user_subvol_rm_allowed"); - if (btrfs_test_opt(root, ENOSPC_DEBUG)) + if (btrfs_test_opt(info, ENOSPC_DEBUG)) seq_puts(seq, ",enospc_debug"); - if (btrfs_test_opt(root, AUTO_DEFRAG)) + if (btrfs_test_opt(info, AUTO_DEFRAG)) seq_puts(seq, ",autodefrag"); - if (btrfs_test_opt(root, INODE_MAP_CACHE)) + if (btrfs_test_opt(info, INODE_MAP_CACHE)) seq_puts(seq, ",inode_cache"); - if (btrfs_test_opt(root, SKIP_BALANCE)) + if (btrfs_test_opt(info, SKIP_BALANCE)) seq_puts(seq, ",skip_balance"); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(root, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA)) + if (btrfs_test_opt(info, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA)) seq_puts(seq, ",check_int_data"); - else if (btrfs_test_opt(root, CHECK_INTEGRITY)) + else if (btrfs_test_opt(info, CHECK_INTEGRITY)) seq_puts(seq, ",check_int"); if (info->check_integrity_print_mask) seq_printf(seq, ",check_int_print_mask=%d", @@ -1265,14 +1293,14 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) if (info->metadata_ratio) seq_printf(seq, ",metadata_ratio=%d", info->metadata_ratio); - if (btrfs_test_opt(root, PANIC_ON_FATAL_ERROR)) + if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR)) seq_puts(seq, ",fatal_errors=panic"); if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) seq_printf(seq, ",commit=%d", info->commit_interval); #ifdef CONFIG_BTRFS_DEBUG - if (btrfs_test_opt(root, FRAGMENT_DATA)) + if (btrfs_test_opt(info, FRAGMENT_DATA)) seq_puts(seq, ",fragment=data"); - if (btrfs_test_opt(root, FRAGMENT_METADATA)) + if (btrfs_test_opt(info, FRAGMENT_METADATA)) seq_puts(seq, ",fragment=metadata"); #endif seq_printf(seq, ",subvolid=%llu", @@ -2030,9 +2058,6 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) * chunk). * * If metadata is exhausted, f_bavail will be 0. - * - * FIXME: not accurate for mixed block groups, total and free/used are ok, - * available appears slightly larger. */ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { @@ -2319,49 +2344,6 @@ static void btrfs_print_mod_info(void) btrfs_crc32c_impl()); } -static int btrfs_run_sanity_tests(void) -{ - int ret, i; - u32 sectorsize, nodesize; - u32 test_sectorsize[] = { - PAGE_SIZE, - }; - ret = btrfs_init_test_fs(); - if (ret) - return ret; - for (i = 0; i < ARRAY_SIZE(test_sectorsize); i++) { - sectorsize = test_sectorsize[i]; - for (nodesize = sectorsize; - nodesize <= BTRFS_MAX_METADATA_BLOCKSIZE; - nodesize <<= 1) { - pr_info("BTRFS: selftest: sectorsize: %u nodesize: %u\n", - sectorsize, nodesize); - ret = btrfs_test_free_space_cache(sectorsize, nodesize); - if (ret) - goto out; - ret = btrfs_test_extent_buffer_operations(sectorsize, - nodesize); - if (ret) - goto out; - ret = btrfs_test_extent_io(sectorsize, nodesize); - if (ret) - goto out; - ret = btrfs_test_inodes(sectorsize, nodesize); - if (ret) - goto out; - ret = btrfs_test_qgroups(sectorsize, nodesize); - if (ret) - goto out; - ret = btrfs_test_free_space_tree(sectorsize, nodesize); - if (ret) - goto out; - } - } -out: - btrfs_destroy_test_fs(); - return ret; -} - static int __init init_btrfs_fs(void) { int err; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 4879656bda3c..c6569905d3d1 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -326,6 +326,7 @@ SPACE_INFO_ATTR(bytes_used); SPACE_INFO_ATTR(bytes_pinned); SPACE_INFO_ATTR(bytes_reserved); SPACE_INFO_ATTR(bytes_may_use); +SPACE_INFO_ATTR(bytes_readonly); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); BTRFS_ATTR(total_bytes_pinned, btrfs_space_info_show_total_bytes_pinned); @@ -337,6 +338,7 @@ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(bytes_pinned), BTRFS_ATTR_PTR(bytes_reserved), BTRFS_ATTR_PTR(bytes_may_use), + BTRFS_ATTR_PTR(bytes_readonly), BTRFS_ATTR_PTR(disk_used), BTRFS_ATTR_PTR(disk_total), BTRFS_ATTR_PTR(total_bytes_pinned), diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 02223f3f78f4..bf62ad919a95 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -54,7 +54,7 @@ struct inode *btrfs_new_test_inode(void) return new_inode(test_mnt->mnt_sb); } -int btrfs_init_test_fs(void) +static int btrfs_init_test_fs(void) { int ret; @@ -73,7 +73,7 @@ int btrfs_init_test_fs(void) return 0; } -void btrfs_destroy_test_fs(void) +static void btrfs_destroy_test_fs(void) { kern_unmount(test_mnt); unregister_filesystem(&test_type); @@ -128,14 +128,27 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void) extent_io_tree_init(&fs_info->freed_extents[0], NULL); extent_io_tree_init(&fs_info->freed_extents[1], NULL); fs_info->pinned_extents = &fs_info->freed_extents[0]; + set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); + + test_mnt->mnt_sb->s_fs_info = fs_info; + return fs_info; } -static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) +void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) { struct radix_tree_iter iter; void **slot; + if (!fs_info) + return; + + if (WARN_ON(!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, + &fs_info->fs_state))) + return; + + test_mnt->mnt_sb->s_fs_info = NULL; + spin_lock(&fs_info->buffer_lock); radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { struct extent_buffer *eb; @@ -167,10 +180,11 @@ void btrfs_free_dummy_root(struct btrfs_root *root) { if (!root) return; + /* Will be freed by btrfs_free_fs_roots */ + if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state))) + return; if (root->node) free_extent_buffer(root->node); - if (root->fs_info) - btrfs_free_dummy_fs_info(root->fs_info); kfree(root); } @@ -220,3 +234,46 @@ void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans) INIT_LIST_HEAD(&trans->qgroup_ref_list); trans->type = __TRANS_DUMMY; } + +int btrfs_run_sanity_tests(void) +{ + int ret, i; + u32 sectorsize, nodesize; + u32 test_sectorsize[] = { + PAGE_SIZE, + }; + ret = btrfs_init_test_fs(); + if (ret) + return ret; + for (i = 0; i < ARRAY_SIZE(test_sectorsize); i++) { + sectorsize = test_sectorsize[i]; + for (nodesize = sectorsize; + nodesize <= BTRFS_MAX_METADATA_BLOCKSIZE; + nodesize <<= 1) { + pr_info("BTRFS: selftest: sectorsize: %u nodesize: %u\n", + sectorsize, nodesize); + ret = btrfs_test_free_space_cache(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_extent_buffer_operations(sectorsize, + nodesize); + if (ret) + goto out; + ret = btrfs_test_extent_io(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_inodes(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_qgroups(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_free_space_tree(sectorsize, nodesize); + if (ret) + goto out; + } + } +out: + btrfs_destroy_test_fs(); + return ret; +} diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index 66fb6b701eb7..b17ffbe8f9f3 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -20,57 +20,29 @@ #define __BTRFS_TESTS #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_run_sanity_tests(void); #define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__) struct btrfs_root; struct btrfs_trans_handle; -int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize); int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize); +int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize); int btrfs_test_extent_io(u32 sectorsize, u32 nodesize); int btrfs_test_inodes(u32 sectorsize, u32 nodesize); int btrfs_test_qgroups(u32 sectorsize, u32 nodesize); int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize); -int btrfs_init_test_fs(void); -void btrfs_destroy_test_fs(void); struct inode *btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void); +void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info); void btrfs_free_dummy_root(struct btrfs_root *root); struct btrfs_block_group_cache * btrfs_alloc_dummy_block_group(unsigned long length, u32 sectorsize); void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache); void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans); #else -static inline int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) -{ - return 0; -} -static inline int btrfs_test_extent_buffer_operations(u32 sectorsize, - u32 nodesize) -{ - return 0; -} -static inline int btrfs_init_test_fs(void) -{ - return 0; -} -static inline void btrfs_destroy_test_fs(void) -{ -} -static inline int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) -{ - return 0; -} -static inline int btrfs_test_inodes(u32 sectorsize, u32 nodesize) -{ - return 0; -} -static inline int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) -{ - return 0; -} -static inline int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize) +static inline int btrfs_run_sanity_tests(void) { return 0; } diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index 4f8cbd1ec5ee..199569174637 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -24,8 +24,9 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) { - struct btrfs_path *path; - struct btrfs_root *root; + struct btrfs_fs_info *fs_info; + struct btrfs_path *path = NULL; + struct btrfs_root *root = NULL; struct extent_buffer *eb; struct btrfs_item *item; char *value = "mary had a little lamb"; @@ -40,17 +41,24 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) test_msg("Running btrfs_split_item tests\n"); - root = btrfs_alloc_dummy_root(sectorsize, nodesize); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Could not allocate fs_info\n"); + return -ENOMEM; + } + + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); if (IS_ERR(root)) { test_msg("Could not allocate root\n"); - return PTR_ERR(root); + ret = PTR_ERR(root); + goto out; } path = btrfs_alloc_path(); if (!path) { test_msg("Could not allocate path\n"); - kfree(root); - return -ENOMEM; + ret = -ENOMEM; + goto out; } path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, nodesize, @@ -219,7 +227,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) } out: btrfs_free_path(path); - kfree(root); + btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 3956bb2ff84c..3221c8dee272 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -837,6 +837,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info; struct btrfs_block_group_cache *cache; struct btrfs_root *root = NULL; int ret = -ENOMEM; @@ -855,15 +856,17 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) return 0; } - root = btrfs_alloc_dummy_root(sectorsize, nodesize); - if (IS_ERR(root)) { - ret = PTR_ERR(root); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + ret = -ENOMEM; goto out; } - root->fs_info = btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + ret = PTR_ERR(root); goto out; + } root->fs_info->extent_root = root; cache->fs_info = root->fs_info; @@ -882,6 +885,7 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) out: btrfs_free_dummy_block_group(cache); btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); test_msg("Free space cache tests finished\n"); return ret; } diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index aac507085ab0..7508d3b42780 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -443,23 +443,24 @@ typedef int (*test_func_t)(struct btrfs_trans_handle *, static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info; struct btrfs_root *root = NULL; struct btrfs_block_group_cache *cache = NULL; struct btrfs_trans_handle trans; struct btrfs_path *path = NULL; int ret; - root = btrfs_alloc_dummy_root(sectorsize, nodesize); - if (IS_ERR(root)) { - test_msg("Couldn't allocate dummy root\n"); - ret = PTR_ERR(root); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + ret = -ENOMEM; goto out; } - root->fs_info = btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); - ret = -ENOMEM; + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + test_msg("Couldn't allocate dummy root\n"); + ret = PTR_ERR(root); goto out; } @@ -534,6 +535,7 @@ out: btrfs_free_path(path); btrfs_free_dummy_block_group(cache); btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 29648c0a39f1..9f72aeda9220 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -230,6 +230,7 @@ static unsigned long vacancy_only = 0; static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info = NULL; struct inode *inode = NULL; struct btrfs_root *root = NULL; struct extent_map *em = NULL; @@ -248,19 +249,15 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; - root = btrfs_alloc_dummy_root(sectorsize, nodesize); - if (IS_ERR(root)) { - test_msg("Couldn't allocate root\n"); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); goto out; } - /* - * We do this since btrfs_get_extent wants to assign em->bdev to - * root->fs_info->fs_devices->latest_bdev. - */ - root->fs_info = btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); goto out; } @@ -835,11 +832,13 @@ out: free_extent_map(em); iput(inode); btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } static int test_hole_first(u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info = NULL; struct inode *inode = NULL; struct btrfs_root *root = NULL; struct extent_map *em = NULL; @@ -855,15 +854,15 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; - root = btrfs_alloc_dummy_root(sectorsize, nodesize); - if (IS_ERR(root)) { - test_msg("Couldn't allocate root\n"); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); goto out; } - root->fs_info = btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); goto out; } @@ -934,11 +933,13 @@ out: free_extent_map(em); iput(inode); btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } static int test_extent_accounting(u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info = NULL; struct inode *inode = NULL; struct btrfs_root *root = NULL; int ret = -ENOMEM; @@ -949,15 +950,15 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) return ret; } - root = btrfs_alloc_dummy_root(sectorsize, nodesize); - if (IS_ERR(root)) { - test_msg("Couldn't allocate root\n"); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); goto out; } - root->fs_info = btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); goto out; } @@ -1132,6 +1133,7 @@ out: NULL, GFP_KERNEL); iput(inode); btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 57a12c0d680b..4407fef7c16c 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -453,22 +453,24 @@ static int test_multiple_refs(struct btrfs_root *root, int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info = NULL; struct btrfs_root *root; struct btrfs_root *tmp_root; int ret = 0; - root = btrfs_alloc_dummy_root(sectorsize, nodesize); - if (IS_ERR(root)) { - test_msg("Couldn't allocate root\n"); - return PTR_ERR(root); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + return -ENOMEM; } - root->fs_info = btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); - ret = -ENOMEM; + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); + ret = PTR_ERR(root); goto out; } + /* We are using this root as our extent root */ root->fs_info->extent_root = root; @@ -495,7 +497,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) btrfs_set_header_nritems(root->node, 0); root->alloc_bytenr += 2 * nodesize; - tmp_root = btrfs_alloc_dummy_root(sectorsize, nodesize); + tmp_root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); if (IS_ERR(tmp_root)) { test_msg("Couldn't allocate a fs root\n"); ret = PTR_ERR(tmp_root); @@ -510,7 +512,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) goto out; } - tmp_root = btrfs_alloc_dummy_root(sectorsize, nodesize); + tmp_root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); if (IS_ERR(tmp_root)) { test_msg("Couldn't allocate a fs root\n"); ret = PTR_ERR(tmp_root); @@ -531,5 +533,6 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) ret = test_multiple_refs(root, sectorsize, nodesize); out: btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 948aa186b353..9cca0a721961 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -561,6 +561,7 @@ again: h->transaction = cur_trans; h->root = root; h->use_count = 1; + h->fs_info = root->fs_info; h->type = type; h->can_flush_pending_bgs = true; @@ -1491,7 +1492,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto dir_item_existed; } else if (IS_ERR(dir_item)) { ret = PTR_ERR(dir_item); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } btrfs_release_path(path); @@ -1504,7 +1505,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, */ ret = btrfs_run_delayed_items(trans, root); if (ret) { /* Transaction aborted */ - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1543,7 +1544,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret) { btrfs_tree_unlock(old); free_extent_buffer(old); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1554,7 +1555,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_tree_unlock(old); free_extent_buffer(old); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } /* see comments in should_cow_block() */ @@ -1568,7 +1569,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_tree_unlock(tmp); free_extent_buffer(tmp); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1580,7 +1581,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_ino(parent_inode), index, dentry->d_name.name, dentry->d_name.len); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1588,19 +1589,19 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); if (IS_ERR(pending->snap)) { ret = PTR_ERR(pending->snap); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_reloc_post_snapshot(trans, pending); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1622,7 +1623,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* We have check then name at the beginning, so it is impossible. */ BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1632,13 +1633,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, current_fs_time(parent_inode->i_sb); ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, new_uuid.b, BTRFS_UUID_KEY_SUBVOL, objectid); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) { @@ -1647,14 +1648,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, BTRFS_UUID_KEY_RECEIVED_SUBVOL, objectid); if (ret && ret != -EEXIST) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } } ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1709,7 +1710,7 @@ static void update_super_roots(struct btrfs_root *root) super->root = root_item->bytenr; super->generation = root_item->generation; super->root_level = root_item->level; - if (btrfs_test_opt(root, SPACE_CACHE)) + if (btrfs_test_opt(root->fs_info, SPACE_CACHE)) super->cache_generation = root_item->generation; if (root->fs_info->update_uuid_tree_gen) super->uuid_tree_generation = root_item->generation; @@ -1850,7 +1851,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, WARN_ON(trans->use_count > 1); - btrfs_abort_transaction(trans, root, err); + btrfs_abort_transaction(trans, err); spin_lock(&root->fs_info->trans_lock); @@ -1895,14 +1896,14 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) { - if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) + if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) return btrfs_start_delalloc_roots(fs_info, 1, -1); return 0; } static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) { - if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) + if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index c5abee4f01ad..efb122643380 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -128,6 +128,7 @@ struct btrfs_trans_handle { * Subvolume quota depends on this */ struct btrfs_root *root; + struct btrfs_fs_info *fs_info; struct seq_list delayed_ref_elem; struct list_head qgroup_ref_list; struct list_head new_bgs; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c05f69a8ec42..d31a0c4f56be 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2757,7 +2757,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, while (1) { int batch = atomic_read(&root->log_batch); /* when we're on an ssd, just kick the log commit out */ - if (!btrfs_test_opt(root, SSD) && + if (!btrfs_test_opt(root->fs_info, SSD) && test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { mutex_unlock(&root->log_mutex); schedule_timeout_uninterruptible(1); @@ -2788,7 +2788,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); if (ret) { blk_finish_plug(&plug); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_free_logged_extents(log, log_transid); btrfs_set_log_full_commit(root->fs_info, trans); mutex_unlock(&root->log_mutex); @@ -2838,7 +2838,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_set_log_full_commit(root->fs_info, trans); if (ret != -ENOSPC) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); mutex_unlock(&log_root_tree->log_mutex); goto out; } @@ -2898,7 +2898,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, blk_finish_plug(&plug); if (ret) { btrfs_set_log_full_commit(root->fs_info, trans); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; @@ -2934,7 +2934,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = write_ctree_super(trans, root->fs_info->tree_root, 1); if (ret) { btrfs_set_log_full_commit(root->fs_info, trans); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_wake_log_root; } @@ -2991,7 +2991,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans, ret = walk_log_tree(trans, log, &wc); /* I don't think this can happen but just in case */ if (ret) - btrfs_abort_transaction(trans, log, ret); + btrfs_abort_transaction(trans, ret); while (1) { ret = find_first_extent_bit(&log->dirty_log_pages, @@ -3160,7 +3160,7 @@ out_unlock: btrfs_set_log_full_commit(root->fs_info, trans); ret = 0; } else if (ret < 0) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_log_trans(root); @@ -3193,7 +3193,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, btrfs_set_log_full_commit(root->fs_info, trans); ret = 0; } else if (ret < 0 && ret != -ENOENT) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_log_trans(root); return ret; @@ -4703,6 +4703,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, ins_nr = 0; ret = btrfs_search_forward(root, &min_key, path, trans->transid); + if (ret < 0) { + err = ret; + goto out_unlock; + } if (ret != 0) break; again: @@ -5301,7 +5305,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, sb = inode->i_sb; - if (btrfs_test_opt(root, NOTREELOG)) { + if (btrfs_test_opt(root->fs_info, NOTREELOG)) { ret = 1; goto end_no_trans; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0fb4a959012e..bb0addce7558 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -140,7 +140,6 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root); static void __btrfs_reset_dev_stats(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); -static void btrfs_close_one_device(struct btrfs_device *device); DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); @@ -853,6 +852,46 @@ static void free_device(struct rcu_head *head) schedule_work(&device->rcu_work); } +static void btrfs_close_one_device(struct btrfs_device *device) +{ + struct btrfs_fs_devices *fs_devices = device->fs_devices; + struct btrfs_device *new_device; + struct rcu_string *name; + + if (device->bdev) + fs_devices->open_devices--; + + if (device->writeable && + device->devid != BTRFS_DEV_REPLACE_DEVID) { + list_del_init(&device->dev_alloc_list); + fs_devices->rw_devices--; + } + + if (device->missing) + fs_devices->missing_devices--; + + if (device->bdev && device->writeable) { + sync_blockdev(device->bdev); + invalidate_bdev(device->bdev); + } + + new_device = btrfs_alloc_device(NULL, &device->devid, + device->uuid); + BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ + + /* Safe because we are under uuid_mutex */ + if (device->name) { + name = rcu_string_strdup(device->name->str, GFP_NOFS); + BUG_ON(!name); /* -ENOMEM */ + rcu_assign_pointer(new_device->name, name); + } + + list_replace_rcu(&device->dev_list, &new_device->dev_list); + new_device->fs_devices = device->fs_devices; + + call_rcu(&device->rcu, free_device); +} + static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device, *tmp; @@ -2399,14 +2438,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = init_first_rw_device(trans, root, device); unlock_chunks(root); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto error_trans; } } ret = btrfs_add_device(trans, root, device); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto error_trans; } @@ -2415,7 +2454,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_finish_sprout(trans, root); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto error_trans; } @@ -2801,7 +2840,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, &dev_extent_len); if (ret) { mutex_unlock(&fs_devices->device_list_mutex); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -2820,7 +2859,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, ret = btrfs_update_device(trans, map->stripes[i].dev); if (ret) { mutex_unlock(&fs_devices->device_list_mutex); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } @@ -2829,7 +2868,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -2838,14 +2877,14 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -2902,7 +2941,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset) * chunk tree entries */ ret = btrfs_remove_chunk(trans, root, chunk_offset); - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans, extent_root); return ret; } @@ -3421,7 +3460,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) u64 size_to_free; u64 chunk_type; struct btrfs_chunk *chunk; - struct btrfs_path *path; + struct btrfs_path *path = NULL; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_trans_handle *trans; @@ -3455,13 +3494,33 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) ret = btrfs_shrink_device(device, old_size - size_to_free); if (ret == -ENOSPC) break; - BUG_ON(ret); + if (ret) { + /* btrfs_shrink_device never returns ret > 0 */ + WARN_ON(ret > 0); + goto error; + } trans = btrfs_start_transaction(dev_root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_info_in_rcu(fs_info, + "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu", + rcu_str_deref(device->name), ret, + old_size, old_size - size_to_free); + goto error; + } ret = btrfs_grow_device(trans, device, old_size); - BUG_ON(ret); + if (ret) { + btrfs_end_transaction(trans, dev_root); + /* btrfs_grow_device never returns ret > 0 */ + WARN_ON(ret > 0); + btrfs_info_in_rcu(fs_info, + "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu", + rcu_str_deref(device->name), ret, + old_size, old_size - size_to_free); + goto error; + } btrfs_end_transaction(trans, dev_root); } @@ -3885,7 +3944,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) } spin_unlock(&fs_info->balance_lock); - if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { + if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { btrfs_info(fs_info, "force skipping balance"); return 0; } @@ -4240,7 +4299,7 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) BTRFS_UUID_TREE_OBJECTID); if (IS_ERR(uuid_root)) { ret = PTR_ERR(uuid_root); - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, tree_root); return ret; } @@ -4514,8 +4573,7 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) btrfs_set_fs_incompat(info, RAID56); } -#define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r) \ - - sizeof(struct btrfs_item) \ +#define BTRFS_MAX_DEVS(r) ((BTRFS_MAX_ITEM_SIZE(r) \ - sizeof(struct btrfs_chunk)) \ / sizeof(struct btrfs_stripe) + 1) @@ -6401,7 +6459,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, BTRFS_UUID_SIZE); map->stripes[i].dev = btrfs_find_device(root->fs_info, devid, uuid, NULL); - if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { + if (!map->stripes[i].dev && + !btrfs_test_opt(root->fs_info, DEGRADED)) { free_extent_map(em); return -EIO; } @@ -6469,7 +6528,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root, fs_devices = find_fsid(fsid); if (!fs_devices) { - if (!btrfs_test_opt(root, DEGRADED)) + if (!btrfs_test_opt(root->fs_info, DEGRADED)) return ERR_PTR(-ENOENT); fs_devices = alloc_fs_devices(fsid); @@ -6531,7 +6590,7 @@ static int read_one_dev(struct btrfs_root *root, device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid); if (!device) { - if (!btrfs_test_opt(root, DEGRADED)) + if (!btrfs_test_opt(root->fs_info, DEGRADED)) return -EIO; device = add_missing_dev(root, fs_devices, devid, dev_uuid); @@ -6540,7 +6599,7 @@ static int read_one_dev(struct btrfs_root *root, btrfs_warn(root->fs_info, "devid %llu uuid %pU missing", devid, dev_uuid); } else { - if (!device->bdev && !btrfs_test_opt(root, DEGRADED)) + if (!device->bdev && !btrfs_test_opt(root->fs_info, DEGRADED)) return -EIO; if(!device->bdev && !device->missing) { @@ -7143,38 +7202,3 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) fs_devices = fs_devices->seed; } } - -static void btrfs_close_one_device(struct btrfs_device *device) -{ - struct btrfs_fs_devices *fs_devices = device->fs_devices; - struct btrfs_device *new_device; - struct rcu_string *name; - - if (device->bdev) - fs_devices->open_devices--; - - if (device->writeable && - device->devid != BTRFS_DEV_REPLACE_DEVID) { - list_del_init(&device->dev_alloc_list); - fs_devices->rw_devices--; - } - - if (device->missing) - fs_devices->missing_devices--; - - new_device = btrfs_alloc_device(NULL, &device->devid, - device->uuid); - BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ - - /* Safe because we are under uuid_mutex */ - if (device->name) { - name = rcu_string_strdup(device->name->str, GFP_NOFS); - BUG_ON(!name); /* -ENOMEM */ - rcu_assign_pointer(new_device->name, name); - } - - list_replace_rcu(&device->dev_list, &new_device->dev_list); - new_device->fs_devices = device->fs_devices; - - call_rcu(&device->rcu, free_device); -} diff --git a/fs/cachefiles/proc.c b/fs/cachefiles/proc.c index eccd33941199..125b90f6c796 100644 --- a/fs/cachefiles/proc.c +++ b/fs/cachefiles/proc.c @@ -93,7 +93,6 @@ static int cachefiles_histogram_open(struct inode *inode, struct file *file) } static const struct file_operations cachefiles_histogram_fops = { - .owner = THIS_MODULE, .open = cachefiles_histogram_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 26a9d10d75e9..d5b6f959a3c3 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1730,7 +1730,8 @@ enum { POOL_WRITE = 2, }; -static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) +static int __ceph_pool_perm_get(struct ceph_inode_info *ci, + s64 pool, struct ceph_string *pool_ns) { struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); struct ceph_mds_client *mdsc = fsc->mdsc; @@ -1738,6 +1739,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) struct rb_node **p, *parent; struct ceph_pool_perm *perm; struct page **pages; + size_t pool_ns_len; int err = 0, err2 = 0, have = 0; down_read(&mdsc->pool_perm_rwsem); @@ -1749,17 +1751,31 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) else if (pool > perm->pool) p = &(*p)->rb_right; else { - have = perm->perm; - break; + int ret = ceph_compare_string(pool_ns, + perm->pool_ns, + perm->pool_ns_len); + if (ret < 0) + p = &(*p)->rb_left; + else if (ret > 0) + p = &(*p)->rb_right; + else { + have = perm->perm; + break; + } } } up_read(&mdsc->pool_perm_rwsem); if (*p) goto out; - dout("__ceph_pool_perm_get pool %u no perm cached\n", pool); + if (pool_ns) + dout("__ceph_pool_perm_get pool %lld ns %.*s no perm cached\n", + pool, (int)pool_ns->len, pool_ns->str); + else + dout("__ceph_pool_perm_get pool %lld no perm cached\n", pool); down_write(&mdsc->pool_perm_rwsem); + p = &mdsc->pool_perm_tree.rb_node; parent = NULL; while (*p) { parent = *p; @@ -1769,8 +1785,17 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) else if (pool > perm->pool) p = &(*p)->rb_right; else { - have = perm->perm; - break; + int ret = ceph_compare_string(pool_ns, + perm->pool_ns, + perm->pool_ns_len); + if (ret < 0) + p = &(*p)->rb_left; + else if (ret > 0) + p = &(*p)->rb_right; + else { + have = perm->perm; + break; + } } } if (*p) { @@ -1788,6 +1813,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) rd_req->r_flags = CEPH_OSD_FLAG_READ; osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0); rd_req->r_base_oloc.pool = pool; + if (pool_ns) + rd_req->r_base_oloc.pool_ns = ceph_get_string(pool_ns); ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino); err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS); @@ -1841,7 +1868,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) goto out_unlock; } - perm = kmalloc(sizeof(*perm), GFP_NOFS); + pool_ns_len = pool_ns ? pool_ns->len : 0; + perm = kmalloc(sizeof(*perm) + pool_ns_len + 1, GFP_NOFS); if (!perm) { err = -ENOMEM; goto out_unlock; @@ -1849,6 +1877,11 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) perm->pool = pool; perm->perm = have; + perm->pool_ns_len = pool_ns_len; + if (pool_ns_len > 0) + memcpy(perm->pool_ns, pool_ns->str, pool_ns_len); + perm->pool_ns[pool_ns_len] = 0; + rb_link_node(&perm->node, parent, p); rb_insert_color(&perm->node, &mdsc->pool_perm_tree); err = 0; @@ -1860,43 +1893,46 @@ out_unlock: out: if (!err) err = have; - dout("__ceph_pool_perm_get pool %u result = %d\n", pool, err); + if (pool_ns) + dout("__ceph_pool_perm_get pool %lld ns %.*s result = %d\n", + pool, (int)pool_ns->len, pool_ns->str, err); + else + dout("__ceph_pool_perm_get pool %lld result = %d\n", pool, err); return err; } int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) { - u32 pool; + s64 pool; + struct ceph_string *pool_ns; int ret, flags; - /* does not support pool namespace yet */ - if (ci->i_pool_ns_len) - return -EIO; - if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode), NOPOOLPERM)) return 0; spin_lock(&ci->i_ceph_lock); flags = ci->i_ceph_flags; - pool = ceph_file_layout_pg_pool(ci->i_layout); + pool = ci->i_layout.pool_id; spin_unlock(&ci->i_ceph_lock); check: if (flags & CEPH_I_POOL_PERM) { if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) { - dout("ceph_pool_perm_check pool %u no read perm\n", + dout("ceph_pool_perm_check pool %lld no read perm\n", pool); return -EPERM; } if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) { - dout("ceph_pool_perm_check pool %u no write perm\n", + dout("ceph_pool_perm_check pool %lld no write perm\n", pool); return -EPERM; } return 0; } - ret = __ceph_pool_perm_get(ci, pool); + pool_ns = ceph_try_get_string(ci->i_layout.pool_ns); + ret = __ceph_pool_perm_get(ci, pool, pool_ns); + ceph_put_string(pool_ns); if (ret < 0) return ret; @@ -1907,10 +1943,11 @@ check: flags |= CEPH_I_POOL_WR; spin_lock(&ci->i_ceph_lock); - if (pool == ceph_file_layout_pg_pool(ci->i_layout)) { - ci->i_ceph_flags = flags; + if (pool == ci->i_layout.pool_id && + pool_ns == rcu_dereference_raw(ci->i_layout.pool_ns)) { + ci->i_ceph_flags |= flags; } else { - pool = ceph_file_layout_pg_pool(ci->i_layout); + pool = ci->i_layout.pool_id; flags = ci->i_ceph_flags; } spin_unlock(&ci->i_ceph_lock); diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 238c55b01723..5bc5d37b1217 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -71,7 +71,7 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc) &ceph_fscache_fsid_object_def, fsc, true); if (!fsc->fscache) - pr_err("Unable to resgister fsid: %p fscache cookie", fsc); + pr_err("Unable to register fsid: %p fscache cookie\n", fsc); return 0; } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 6f60d0a3d0f9..99115cae1652 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -40,6 +40,11 @@ * cluster to release server state. */ +static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc); +static void __kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_inode_info *ci, + u64 oldest_flush_tid); /* * Generate readable cap strings for debugging output. @@ -849,12 +854,14 @@ int __ceph_caps_used(struct ceph_inode_info *ci) */ int __ceph_caps_file_wanted(struct ceph_inode_info *ci) { - int want = 0; - int mode; - for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++) - if (ci->i_nr_by_mode[mode]) - want |= ceph_caps_for_mode(mode); - return want; + int i, bits = 0; + for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { + if (ci->i_nr_by_mode[i]) + bits |= 1 << i; + } + if (bits == 0) + return 0; + return ceph_caps_for_mode(bits >> 1); } /* @@ -991,7 +998,7 @@ static int send_cap_msg(struct ceph_mds_session *session, u32 seq, u64 flush_tid, u64 oldest_flush_tid, u32 issue_seq, u32 mseq, u64 size, u64 max_size, struct timespec *mtime, struct timespec *atime, - struct timespec *ctime, u64 time_warp_seq, + struct timespec *ctime, u32 time_warp_seq, kuid_t uid, kgid_t gid, umode_t mode, u64 xattr_version, struct ceph_buffer *xattrs_buf, @@ -1116,8 +1123,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, struct inode *inode = &ci->vfs_inode; u64 cap_id = cap->cap_id; int held, revoking, dropping, keep; - u64 seq, issue_seq, mseq, time_warp_seq, follows; - u64 size, max_size; + u64 follows, size, max_size; + u32 seq, issue_seq, mseq, time_warp_seq; struct timespec mtime, atime, ctime; int wake = 0; umode_t mode; @@ -1215,6 +1222,22 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, return delayed; } +static inline int __send_flush_snap(struct inode *inode, + struct ceph_mds_session *session, + struct ceph_cap_snap *capsnap, + u32 mseq, u64 oldest_flush_tid) +{ + return send_cap_msg(session, ceph_vino(inode).ino, 0, + CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0, + capsnap->dirty, 0, capsnap->cap_flush.tid, + oldest_flush_tid, 0, mseq, capsnap->size, 0, + &capsnap->mtime, &capsnap->atime, + &capsnap->ctime, capsnap->time_warp_seq, + capsnap->uid, capsnap->gid, capsnap->mode, + capsnap->xattr_version, capsnap->xattr_blob, + capsnap->follows, capsnap->inline_data); +} + /* * When a snapshot is taken, clients accumulate dirty metadata on * inodes with capabilities in ceph_cap_snaps to describe the file @@ -1222,37 +1245,22 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, * asynchronously back to the MDS once sync writes complete and dirty * data is written out. * - * Unless @kick is true, skip cap_snaps that were already sent to - * the MDS (i.e., during this session). - * * Called under i_ceph_lock. Takes s_mutex as needed. */ -void __ceph_flush_snaps(struct ceph_inode_info *ci, - struct ceph_mds_session **psession, - int kick) +static void __ceph_flush_snaps(struct ceph_inode_info *ci, + struct ceph_mds_session *session) __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { struct inode *inode = &ci->vfs_inode; - int mds; + struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_cap_snap *capsnap; - u32 mseq; - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; - struct ceph_mds_session *session = NULL; /* if session != NULL, we hold - session->s_mutex */ - u64 next_follows = 0; /* keep track of how far we've gotten through the - i_cap_snaps list, and skip these entries next time - around to avoid an infinite loop */ + u64 oldest_flush_tid = 0; + u64 first_tid = 1, last_tid = 0; - if (psession) - session = *psession; + dout("__flush_snaps %p session %p\n", inode, session); - dout("__flush_snaps %p\n", inode); -retry: list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { - /* avoid an infiniute loop after retry */ - if (capsnap->follows < next_follows) - continue; /* * we need to wait for sync writes to complete and for dirty * pages to be written out. @@ -1263,97 +1271,129 @@ retry: /* should be removed by ceph_try_drop_cap_snap() */ BUG_ON(!capsnap->need_flush); - /* pick mds, take s_mutex */ - if (ci->i_auth_cap == NULL) { - dout("no auth cap (migrating?), doing nothing\n"); - goto out; - } - /* only flush each capsnap once */ - if (!kick && !list_empty(&capsnap->flushing_item)) { - dout("already flushed %p, skipping\n", capsnap); + if (capsnap->cap_flush.tid > 0) { + dout(" already flushed %p, skipping\n", capsnap); continue; } - mds = ci->i_auth_cap->session->s_mds; - mseq = ci->i_auth_cap->mseq; + spin_lock(&mdsc->cap_dirty_lock); + capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid; + list_add_tail(&capsnap->cap_flush.g_list, + &mdsc->cap_flush_list); + if (oldest_flush_tid == 0) + oldest_flush_tid = __get_oldest_flush_tid(mdsc); + if (list_empty(&ci->i_flushing_item)) { + list_add_tail(&ci->i_flushing_item, + &session->s_cap_flushing); + } + spin_unlock(&mdsc->cap_dirty_lock); + + list_add_tail(&capsnap->cap_flush.i_list, + &ci->i_cap_flush_list); - if (session && session->s_mds != mds) { - dout("oops, wrong session %p mutex\n", session); - if (kick) - goto out; + if (first_tid == 1) + first_tid = capsnap->cap_flush.tid; + last_tid = capsnap->cap_flush.tid; + } - mutex_unlock(&session->s_mutex); - ceph_put_mds_session(session); - session = NULL; + ci->i_ceph_flags &= ~CEPH_I_FLUSH_SNAPS; + + while (first_tid <= last_tid) { + struct ceph_cap *cap = ci->i_auth_cap; + struct ceph_cap_flush *cf; + int ret; + + if (!(cap && cap->session == session)) { + dout("__flush_snaps %p auth cap %p not mds%d, " + "stop\n", inode, cap, session->s_mds); + break; } - if (!session) { - spin_unlock(&ci->i_ceph_lock); - mutex_lock(&mdsc->mutex); - session = __ceph_lookup_mds_session(mdsc, mds); - mutex_unlock(&mdsc->mutex); - if (session) { - dout("inverting session/ino locks on %p\n", - session); - mutex_lock(&session->s_mutex); + + ret = -ENOENT; + list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) { + if (cf->tid >= first_tid) { + ret = 0; + break; } - /* - * if session == NULL, we raced against a cap - * deletion or migration. retry, and we'll - * get a better @mds value next time. - */ - spin_lock(&ci->i_ceph_lock); - goto retry; } + if (ret < 0) + break; - spin_lock(&mdsc->cap_dirty_lock); - capsnap->flush_tid = ++mdsc->last_cap_flush_tid; - spin_unlock(&mdsc->cap_dirty_lock); + first_tid = cf->tid + 1; + capsnap = container_of(cf, struct ceph_cap_snap, cap_flush); atomic_inc(&capsnap->nref); - if (list_empty(&capsnap->flushing_item)) - list_add_tail(&capsnap->flushing_item, - &session->s_cap_snaps_flushing); spin_unlock(&ci->i_ceph_lock); - dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n", - inode, capsnap, capsnap->follows, capsnap->flush_tid); - send_cap_msg(session, ceph_vino(inode).ino, 0, - CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0, - capsnap->dirty, 0, capsnap->flush_tid, 0, - 0, mseq, capsnap->size, 0, - &capsnap->mtime, &capsnap->atime, - &capsnap->ctime, capsnap->time_warp_seq, - capsnap->uid, capsnap->gid, capsnap->mode, - capsnap->xattr_version, capsnap->xattr_blob, - capsnap->follows, capsnap->inline_data); - - next_follows = capsnap->follows + 1; - ceph_put_cap_snap(capsnap); + dout("__flush_snaps %p capsnap %p tid %llu %s\n", + inode, capsnap, cf->tid, ceph_cap_string(capsnap->dirty)); + + ret = __send_flush_snap(inode, session, capsnap, cap->mseq, + oldest_flush_tid); + if (ret < 0) { + pr_err("__flush_snaps: error sending cap flushsnap, " + "ino (%llx.%llx) tid %llu follows %llu\n", + ceph_vinop(inode), cf->tid, capsnap->follows); + } + ceph_put_cap_snap(capsnap); spin_lock(&ci->i_ceph_lock); - goto retry; } +} - /* we flushed them all; remove this inode from the queue */ - spin_lock(&mdsc->snap_flush_lock); - list_del_init(&ci->i_snap_flush_item); - spin_unlock(&mdsc->snap_flush_lock); +void ceph_flush_snaps(struct ceph_inode_info *ci, + struct ceph_mds_session **psession) +{ + struct inode *inode = &ci->vfs_inode; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_session *session = *psession; + int mds; + dout("ceph_flush_snaps %p\n", inode); +retry: + spin_lock(&ci->i_ceph_lock); + if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) { + dout(" no capsnap needs flush, doing nothing\n"); + goto out; + } + if (!ci->i_auth_cap) { + dout(" no auth cap (migrating?), doing nothing\n"); + goto out; + } -out: - if (psession) - *psession = session; - else if (session) { + mds = ci->i_auth_cap->session->s_mds; + if (session && session->s_mds != mds) { + dout(" oops, wrong session %p mutex\n", session); mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); + session = NULL; + } + if (!session) { + spin_unlock(&ci->i_ceph_lock); + mutex_lock(&mdsc->mutex); + session = __ceph_lookup_mds_session(mdsc, mds); + mutex_unlock(&mdsc->mutex); + if (session) { + dout(" inverting session/ino locks on %p\n", session); + mutex_lock(&session->s_mutex); + } + goto retry; } -} -static void ceph_flush_snaps(struct ceph_inode_info *ci) -{ - spin_lock(&ci->i_ceph_lock); - __ceph_flush_snaps(ci, NULL, 0); + __ceph_flush_snaps(ci, session); +out: spin_unlock(&ci->i_ceph_lock); + + if (psession) { + *psession = session; + } else { + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + } + /* we flushed them all; remove this inode from the queue */ + spin_lock(&mdsc->snap_flush_lock); + list_del_init(&ci->i_snap_flush_item); + spin_unlock(&mdsc->snap_flush_lock); } /* @@ -1411,52 +1451,6 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, return dirty; } -static void __add_cap_flushing_to_inode(struct ceph_inode_info *ci, - struct ceph_cap_flush *cf) -{ - struct rb_node **p = &ci->i_cap_flush_tree.rb_node; - struct rb_node *parent = NULL; - struct ceph_cap_flush *other = NULL; - - while (*p) { - parent = *p; - other = rb_entry(parent, struct ceph_cap_flush, i_node); - - if (cf->tid < other->tid) - p = &(*p)->rb_left; - else if (cf->tid > other->tid) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&cf->i_node, parent, p); - rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree); -} - -static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc, - struct ceph_cap_flush *cf) -{ - struct rb_node **p = &mdsc->cap_flush_tree.rb_node; - struct rb_node *parent = NULL; - struct ceph_cap_flush *other = NULL; - - while (*p) { - parent = *p; - other = rb_entry(parent, struct ceph_cap_flush, g_node); - - if (cf->tid < other->tid) - p = &(*p)->rb_left; - else if (cf->tid > other->tid) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&cf->g_node, parent, p); - rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree); -} - struct ceph_cap_flush *ceph_alloc_cap_flush(void) { return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL); @@ -1470,23 +1464,54 @@ void ceph_free_cap_flush(struct ceph_cap_flush *cf) static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc) { - struct rb_node *n = rb_first(&mdsc->cap_flush_tree); - if (n) { + if (!list_empty(&mdsc->cap_flush_list)) { struct ceph_cap_flush *cf = - rb_entry(n, struct ceph_cap_flush, g_node); + list_first_entry(&mdsc->cap_flush_list, + struct ceph_cap_flush, g_list); return cf->tid; } return 0; } /* + * Remove cap_flush from the mdsc's or inode's flushing cap list. + * Return true if caller needs to wake up flush waiters. + */ +static bool __finish_cap_flush(struct ceph_mds_client *mdsc, + struct ceph_inode_info *ci, + struct ceph_cap_flush *cf) +{ + struct ceph_cap_flush *prev; + bool wake = cf->wake; + if (mdsc) { + /* are there older pending cap flushes? */ + if (wake && cf->g_list.prev != &mdsc->cap_flush_list) { + prev = list_prev_entry(cf, g_list); + prev->wake = true; + wake = false; + } + list_del(&cf->g_list); + } else if (ci) { + if (wake && cf->i_list.prev != &ci->i_cap_flush_list) { + prev = list_prev_entry(cf, i_list); + prev->wake = true; + wake = false; + } + list_del(&cf->i_list); + } else { + BUG_ON(1); + } + return wake; +} + +/* * Add dirty inode to the flushing list. Assigned a seq number so we * can wait for caps to flush without starving. * * Called under i_ceph_lock. */ static int __mark_caps_flushing(struct inode *inode, - struct ceph_mds_session *session, + struct ceph_mds_session *session, bool wake, u64 *flush_tid, u64 *oldest_flush_tid) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; @@ -1509,26 +1534,22 @@ static int __mark_caps_flushing(struct inode *inode, swap(cf, ci->i_prealloc_cap_flush); cf->caps = flushing; + cf->wake = wake; spin_lock(&mdsc->cap_dirty_lock); list_del_init(&ci->i_dirty_item); cf->tid = ++mdsc->last_cap_flush_tid; - __add_cap_flushing_to_mdsc(mdsc, cf); + list_add_tail(&cf->g_list, &mdsc->cap_flush_list); *oldest_flush_tid = __get_oldest_flush_tid(mdsc); if (list_empty(&ci->i_flushing_item)) { list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); mdsc->num_cap_flushing++; - dout(" inode %p now flushing tid %llu\n", inode, cf->tid); - } else { - list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); - dout(" inode %p now flushing (more) tid %llu\n", - inode, cf->tid); } spin_unlock(&mdsc->cap_dirty_lock); - __add_cap_flushing_to_inode(ci, cf); + list_add_tail(&cf->i_list, &ci->i_cap_flush_list); *flush_tid = cf->tid; return flushing; @@ -1583,10 +1604,11 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, int mds = -1; /* keep track of how far we've gone through i_caps list to avoid an infinite loop on retry */ struct rb_node *p; - int tried_invalidate = 0; - int delayed = 0, sent = 0, force_requeue = 0, num; - int queue_invalidate = 0; - int is_delayed = flags & CHECK_CAPS_NODELAY; + int delayed = 0, sent = 0, num; + bool is_delayed = flags & CHECK_CAPS_NODELAY; + bool queue_invalidate = false; + bool force_requeue = false; + bool tried_invalidate = false; /* if we are unmounting, flush any unused caps immediately. */ if (mdsc->stopping) @@ -1597,9 +1619,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, if (ci->i_ceph_flags & CEPH_I_FLUSH) flags |= CHECK_CAPS_FLUSH; - /* flush snaps first time around only */ - if (!list_empty(&ci->i_cap_snaps)) - __ceph_flush_snaps(ci, &session, 0); goto retry_locked; retry: spin_lock(&ci->i_ceph_lock); @@ -1666,17 +1685,17 @@ retry_locked: if (revoking & (CEPH_CAP_FILE_CACHE| CEPH_CAP_FILE_LAZYIO)) { dout("check_caps queuing invalidate\n"); - queue_invalidate = 1; + queue_invalidate = true; ci->i_rdcache_revoking = ci->i_rdcache_gen; } else { dout("check_caps failed to invalidate pages\n"); /* we failed to invalidate pages. check these caps again later. */ - force_requeue = 1; + force_requeue = true; __cap_set_timeouts(mdsc, ci); } } - tried_invalidate = 1; + tried_invalidate = true; goto retry_locked; } @@ -1720,10 +1739,15 @@ retry_locked: } } /* flush anything dirty? */ - if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) && - ci->i_dirty_caps) { - dout("flushing dirty caps\n"); - goto ack; + if (cap == ci->i_auth_cap) { + if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) { + dout("flushing dirty caps\n"); + goto ack; + } + if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) { + dout("flushing snap caps\n"); + goto ack; + } } /* completed revocation? going down and there are no caps? */ @@ -1782,6 +1806,26 @@ ack: goto retry; } } + + /* kick flushing and flush snaps before sending normal + * cap message */ + if (cap == ci->i_auth_cap && + (ci->i_ceph_flags & + (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) { + if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { + spin_lock(&mdsc->cap_dirty_lock); + oldest_flush_tid = __get_oldest_flush_tid(mdsc); + spin_unlock(&mdsc->cap_dirty_lock); + __kick_flushing_caps(mdsc, session, ci, + oldest_flush_tid); + ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; + } + if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) + __ceph_flush_snaps(ci, session); + + goto retry_locked; + } + /* take snap_rwsem after session mutex */ if (!took_snap_rwsem) { if (down_read_trylock(&mdsc->snap_rwsem) == 0) { @@ -1796,7 +1840,7 @@ ack: } if (cap == ci->i_auth_cap && ci->i_dirty_caps) { - flushing = __mark_caps_flushing(inode, session, + flushing = __mark_caps_flushing(inode, session, false, &flush_tid, &oldest_flush_tid); } else { @@ -1822,7 +1866,7 @@ ack: * otherwise cancel. */ if (delayed && is_delayed) - force_requeue = 1; /* __send_cap delayed release; requeue */ + force_requeue = true; /* __send_cap delayed release; requeue */ if (!delayed && !is_delayed) __cap_delay_cancel(mdsc, ci); else if (!is_delayed || force_requeue) @@ -1873,8 +1917,8 @@ retry: if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) goto out; - flushing = __mark_caps_flushing(inode, session, &flush_tid, - &oldest_flush_tid); + flushing = __mark_caps_flushing(inode, session, true, + &flush_tid, &oldest_flush_tid); /* __send_cap drops i_ceph_lock */ delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, @@ -1887,10 +1931,11 @@ retry: spin_unlock(&ci->i_ceph_lock); } } else { - struct rb_node *n = rb_last(&ci->i_cap_flush_tree); - if (n) { + if (!list_empty(&ci->i_cap_flush_list)) { struct ceph_cap_flush *cf = - rb_entry(n, struct ceph_cap_flush, i_node); + list_last_entry(&ci->i_cap_flush_list, + struct ceph_cap_flush, i_list); + cf->wake = true; flush_tid = cf->tid; } flushing = ci->i_flushing_caps; @@ -1910,14 +1955,13 @@ out: static int caps_are_flushed(struct inode *inode, u64 flush_tid) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_cap_flush *cf; - struct rb_node *n; int ret = 1; spin_lock(&ci->i_ceph_lock); - n = rb_first(&ci->i_cap_flush_tree); - if (n) { - cf = rb_entry(n, struct ceph_cap_flush, i_node); + if (!list_empty(&ci->i_cap_flush_list)) { + struct ceph_cap_flush * cf = + list_first_entry(&ci->i_cap_flush_list, + struct ceph_cap_flush, i_list); if (cf->tid <= flush_tid) ret = 0; } @@ -1926,53 +1970,6 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid) } /* - * Wait on any unsafe replies for the given inode. First wait on the - * newest request, and make that the upper bound. Then, if there are - * more requests, keep waiting on the oldest as long as it is still older - * than the original request. - */ -static void sync_write_wait(struct inode *inode) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - struct list_head *head = &ci->i_unsafe_writes; - struct ceph_osd_request *req; - u64 last_tid; - - if (!S_ISREG(inode->i_mode)) - return; - - spin_lock(&ci->i_unsafe_lock); - if (list_empty(head)) - goto out; - - /* set upper bound as _last_ entry in chain */ - req = list_last_entry(head, struct ceph_osd_request, - r_unsafe_item); - last_tid = req->r_tid; - - do { - ceph_osdc_get_request(req); - spin_unlock(&ci->i_unsafe_lock); - dout("sync_write_wait on tid %llu (until %llu)\n", - req->r_tid, last_tid); - wait_for_completion(&req->r_safe_completion); - spin_lock(&ci->i_unsafe_lock); - ceph_osdc_put_request(req); - - /* - * from here on look at first entry in chain, since we - * only want to wait for anything older than last_tid - */ - if (list_empty(head)) - break; - req = list_first_entry(head, struct ceph_osd_request, - r_unsafe_item); - } while (req->r_tid < last_tid); -out: - spin_unlock(&ci->i_unsafe_lock); -} - -/* * wait for any unsafe requests to complete. */ static int unsafe_request_wait(struct inode *inode) @@ -2024,7 +2021,8 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) int dirty; dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); - sync_write_wait(inode); + + ceph_sync_write_wait(inode); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret < 0) @@ -2087,87 +2085,74 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) return err; } -/* - * After a recovering MDS goes active, we need to resend any caps - * we were flushing. - * - * Caller holds session->s_mutex. - */ -static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session) -{ - struct ceph_cap_snap *capsnap; - - dout("kick_flushing_capsnaps mds%d\n", session->s_mds); - list_for_each_entry(capsnap, &session->s_cap_snaps_flushing, - flushing_item) { - struct ceph_inode_info *ci = capsnap->ci; - struct inode *inode = &ci->vfs_inode; - struct ceph_cap *cap; - - spin_lock(&ci->i_ceph_lock); - cap = ci->i_auth_cap; - if (cap && cap->session == session) { - dout("kick_flushing_caps %p cap %p capsnap %p\n", inode, - cap, capsnap); - __ceph_flush_snaps(ci, &session, 1); - } else { - pr_err("%p auth cap %p not mds%d ???\n", inode, - cap, session->s_mds); - } - spin_unlock(&ci->i_ceph_lock); - } -} - -static int __kick_flushing_caps(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session, - struct ceph_inode_info *ci) +static void __kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_inode_info *ci, + u64 oldest_flush_tid) + __releases(ci->i_ceph_lock) + __acquires(ci->i_ceph_lock) { struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; struct ceph_cap_flush *cf; - struct rb_node *n; - int delayed = 0; + int ret; u64 first_tid = 0; - u64 oldest_flush_tid; - spin_lock(&mdsc->cap_dirty_lock); - oldest_flush_tid = __get_oldest_flush_tid(mdsc); - spin_unlock(&mdsc->cap_dirty_lock); + list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) { + if (cf->tid < first_tid) + continue; - while (true) { - spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; if (!(cap && cap->session == session)) { - pr_err("%p auth cap %p not mds%d ???\n", inode, - cap, session->s_mds); - spin_unlock(&ci->i_ceph_lock); + pr_err("%p auth cap %p not mds%d ???\n", + inode, cap, session->s_mds); break; } - for (n = rb_first(&ci->i_cap_flush_tree); n; n = rb_next(n)) { - cf = rb_entry(n, struct ceph_cap_flush, i_node); - if (cf->tid >= first_tid) - break; - } - if (!n) { + first_tid = cf->tid + 1; + + if (cf->caps) { + dout("kick_flushing_caps %p cap %p tid %llu %s\n", + inode, cap, cf->tid, ceph_cap_string(cf->caps)); + ci->i_ceph_flags |= CEPH_I_NODELAY; + ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, + __ceph_caps_used(ci), + __ceph_caps_wanted(ci), + cap->issued | cap->implemented, + cf->caps, cf->tid, oldest_flush_tid); + if (ret) { + pr_err("kick_flushing_caps: error sending " + "cap flush, ino (%llx.%llx) " + "tid %llu flushing %s\n", + ceph_vinop(inode), cf->tid, + ceph_cap_string(cf->caps)); + } + } else { + struct ceph_cap_snap *capsnap = + container_of(cf, struct ceph_cap_snap, + cap_flush); + dout("kick_flushing_caps %p capsnap %p tid %llu %s\n", + inode, capsnap, cf->tid, + ceph_cap_string(capsnap->dirty)); + + atomic_inc(&capsnap->nref); spin_unlock(&ci->i_ceph_lock); - break; - } - cf = rb_entry(n, struct ceph_cap_flush, i_node); + ret = __send_flush_snap(inode, session, capsnap, cap->mseq, + oldest_flush_tid); + if (ret < 0) { + pr_err("kick_flushing_caps: error sending " + "cap flushsnap, ino (%llx.%llx) " + "tid %llu follows %llu\n", + ceph_vinop(inode), cf->tid, + capsnap->follows); + } - first_tid = cf->tid + 1; + ceph_put_cap_snap(capsnap); + } - dout("kick_flushing_caps %p cap %p tid %llu %s\n", inode, - cap, cf->tid, ceph_cap_string(cf->caps)); - delayed |= __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, - __ceph_caps_used(ci), - __ceph_caps_wanted(ci), - cap->issued | cap->implemented, - cf->caps, cf->tid, oldest_flush_tid); + spin_lock(&ci->i_ceph_lock); } - return delayed; } void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, @@ -2175,8 +2160,14 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, { struct ceph_inode_info *ci; struct ceph_cap *cap; + u64 oldest_flush_tid; dout("early_kick_flushing_caps mds%d\n", session->s_mds); + + spin_lock(&mdsc->cap_dirty_lock); + oldest_flush_tid = __get_oldest_flush_tid(mdsc); + spin_unlock(&mdsc->cap_dirty_lock); + list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; @@ -2196,10 +2187,11 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, */ if ((cap->issued & ci->i_flushing_caps) != ci->i_flushing_caps) { - spin_unlock(&ci->i_ceph_lock); - if (!__kick_flushing_caps(mdsc, session, ci)) - continue; - spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; + __kick_flushing_caps(mdsc, session, ci, + oldest_flush_tid); + } else { + ci->i_ceph_flags |= CEPH_I_KICK_FLUSH; } spin_unlock(&ci->i_ceph_lock); @@ -2210,50 +2202,56 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_inode_info *ci; - - kick_flushing_capsnaps(mdsc, session); + struct ceph_cap *cap; + u64 oldest_flush_tid; dout("kick_flushing_caps mds%d\n", session->s_mds); + + spin_lock(&mdsc->cap_dirty_lock); + oldest_flush_tid = __get_oldest_flush_tid(mdsc); + spin_unlock(&mdsc->cap_dirty_lock); + list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { - int delayed = __kick_flushing_caps(mdsc, session, ci); - if (delayed) { - spin_lock(&ci->i_ceph_lock); - __cap_delay_requeue(mdsc, ci); + spin_lock(&ci->i_ceph_lock); + cap = ci->i_auth_cap; + if (!(cap && cap->session == session)) { + pr_err("%p auth cap %p not mds%d ???\n", + &ci->vfs_inode, cap, session->s_mds); spin_unlock(&ci->i_ceph_lock); + continue; + } + if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { + ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; + __kick_flushing_caps(mdsc, session, ci, + oldest_flush_tid); } + spin_unlock(&ci->i_ceph_lock); } } static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct inode *inode) + __releases(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_cap *cap; - spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; dout("kick_flushing_inode_caps %p flushing %s\n", inode, ceph_cap_string(ci->i_flushing_caps)); - __ceph_flush_snaps(ci, &session, 1); - - if (ci->i_flushing_caps) { - int delayed; - + if (!list_empty(&ci->i_cap_flush_list)) { + u64 oldest_flush_tid; spin_lock(&mdsc->cap_dirty_lock); list_move_tail(&ci->i_flushing_item, &cap->session->s_cap_flushing); + oldest_flush_tid = __get_oldest_flush_tid(mdsc); spin_unlock(&mdsc->cap_dirty_lock); + ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; + __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid); spin_unlock(&ci->i_ceph_lock); - - delayed = __kick_flushing_caps(mdsc, session, ci); - if (delayed) { - spin_lock(&ci->i_ceph_lock); - __cap_delay_requeue(mdsc, ci); - spin_unlock(&ci->i_ceph_lock); - } } else { spin_unlock(&ci->i_ceph_lock); } @@ -2580,16 +2578,19 @@ void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) * drop cap_snap that is not associated with any snapshot. * we don't need to send FLUSHSNAP message for it. */ -static int ceph_try_drop_cap_snap(struct ceph_cap_snap *capsnap) +static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci, + struct ceph_cap_snap *capsnap) { if (!capsnap->need_flush && !capsnap->writing && !capsnap->dirty_pages) { - dout("dropping cap_snap %p follows %llu\n", capsnap, capsnap->follows); + BUG_ON(capsnap->cap_flush.tid > 0); ceph_put_snap_context(capsnap->context); + if (!list_is_last(&capsnap->ci_item, &ci->i_cap_snaps)) + ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; + list_del(&capsnap->ci_item); - list_del(&capsnap->flushing_item); ceph_put_cap_snap(capsnap); return 1; } @@ -2636,7 +2637,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) struct ceph_cap_snap, ci_item); capsnap->writing = 0; - if (ceph_try_drop_cap_snap(capsnap)) + if (ceph_try_drop_cap_snap(ci, capsnap)) put++; else if (__ceph_finish_cap_snap(ci, capsnap)) flushsnaps = 1; @@ -2661,7 +2662,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) if (last && !flushsnaps) ceph_check_caps(ci, 0, NULL); else if (flushsnaps) - ceph_flush_snaps(ci); + ceph_flush_snaps(ci, NULL); if (wake) wake_up_all(&ci->i_cap_wq); while (put-- > 0) @@ -2679,15 +2680,19 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc) { struct inode *inode = &ci->vfs_inode; - int last = 0; - int complete_capsnap = 0; - int drop_capsnap = 0; - int found = 0; struct ceph_cap_snap *capsnap = NULL; + int put = 0; + bool last = false; + bool found = false; + bool flush_snaps = false; + bool complete_capsnap = false; spin_lock(&ci->i_ceph_lock); ci->i_wrbuffer_ref -= nr; - last = !ci->i_wrbuffer_ref; + if (ci->i_wrbuffer_ref == 0) { + last = true; + put++; + } if (ci->i_head_snapc == snapc) { ci->i_wrbuffer_ref_head -= nr; @@ -2707,15 +2712,22 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, } else { list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { if (capsnap->context == snapc) { - found = 1; + found = true; break; } } BUG_ON(!found); capsnap->dirty_pages -= nr; if (capsnap->dirty_pages == 0) { - complete_capsnap = 1; - drop_capsnap = ceph_try_drop_cap_snap(capsnap); + complete_capsnap = true; + if (!capsnap->writing) { + if (ceph_try_drop_cap_snap(ci, capsnap)) { + put++; + } else { + ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; + flush_snaps = true; + } + } } dout("put_wrbuffer_cap_refs on %p cap_snap %p " " snap %lld %d/%d -> %d/%d %s%s\n", @@ -2730,12 +2742,12 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, if (last) { ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); - iput(inode); - } else if (complete_capsnap) { - ceph_flush_snaps(ci); - wake_up_all(&ci->i_cap_wq); + } else if (flush_snaps) { + ceph_flush_snaps(ci, NULL); } - if (drop_capsnap) + if (complete_capsnap) + wake_up_all(&ci->i_cap_wq); + while (put-- > 0) iput(inode); } @@ -2779,12 +2791,11 @@ static void invalidate_aliases(struct inode *inode) */ static void handle_cap_grant(struct ceph_mds_client *mdsc, struct inode *inode, struct ceph_mds_caps *grant, - u64 inline_version, - void *inline_data, int inline_len, + struct ceph_string **pns, u64 inline_version, + void *inline_data, u32 inline_len, struct ceph_buffer *xattr_buf, struct ceph_mds_session *session, - struct ceph_cap *cap, int issued, - u32 pool_ns_len) + struct ceph_cap *cap, int issued) __releases(ci->i_ceph_lock) __releases(mdsc->snap_rwsem) { @@ -2895,8 +2906,18 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) { /* file layout may have changed */ - ci->i_layout = grant->layout; - ci->i_pool_ns_len = pool_ns_len; + s64 old_pool = ci->i_layout.pool_id; + struct ceph_string *old_ns; + + ceph_file_layout_from_legacy(&ci->i_layout, &grant->layout); + old_ns = rcu_dereference_protected(ci->i_layout.pool_ns, + lockdep_is_held(&ci->i_ceph_lock)); + rcu_assign_pointer(ci->i_layout.pool_ns, *pns); + + if (ci->i_layout.pool_id != old_pool || *pns != old_ns) + ci->i_ceph_flags &= ~CEPH_I_POOL_PERM; + + *pns = old_ns; /* size/truncate_seq? */ queue_trunc = ceph_fill_file_size(inode, issued, @@ -2979,13 +3000,13 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, fill_inline = true; } - spin_unlock(&ci->i_ceph_lock); - if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { - kick_flushing_inode_caps(mdsc, session, inode); - up_read(&mdsc->snap_rwsem); if (newcaps & ~issued) wake = true; + kick_flushing_inode_caps(mdsc, session, inode); + up_read(&mdsc->snap_rwsem); + } else { + spin_unlock(&ci->i_ceph_lock); } if (fill_inline) @@ -3029,23 +3050,24 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; - struct ceph_cap_flush *cf; - struct rb_node *n; + struct ceph_cap_flush *cf, *tmp_cf; LIST_HEAD(to_remove); unsigned seq = le32_to_cpu(m->seq); int dirty = le32_to_cpu(m->dirty); int cleaned = 0; - int drop = 0; + bool drop = false; + bool wake_ci = 0; + bool wake_mdsc = 0; - n = rb_first(&ci->i_cap_flush_tree); - while (n) { - cf = rb_entry(n, struct ceph_cap_flush, i_node); - n = rb_next(&cf->i_node); + list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) { if (cf->tid == flush_tid) cleaned = cf->caps; + if (cf->caps == 0) /* capsnap */ + continue; if (cf->tid <= flush_tid) { - rb_erase(&cf->i_node, &ci->i_cap_flush_tree); - list_add_tail(&cf->list, &to_remove); + if (__finish_cap_flush(NULL, ci, cf)) + wake_ci = true; + list_add_tail(&cf->i_list, &to_remove); } else { cleaned &= ~cf->caps; if (!cleaned) @@ -3066,31 +3088,29 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, spin_lock(&mdsc->cap_dirty_lock); - if (!list_empty(&to_remove)) { - list_for_each_entry(cf, &to_remove, list) - rb_erase(&cf->g_node, &mdsc->cap_flush_tree); - - n = rb_first(&mdsc->cap_flush_tree); - cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL; - if (!cf || cf->tid > flush_tid) - wake_up_all(&mdsc->cap_flushing_wq); + list_for_each_entry(cf, &to_remove, i_list) { + if (__finish_cap_flush(mdsc, NULL, cf)) + wake_mdsc = true; } if (ci->i_flushing_caps == 0) { - list_del_init(&ci->i_flushing_item); - if (!list_empty(&session->s_cap_flushing)) - dout(" mds%d still flushing cap on %p\n", - session->s_mds, - &list_entry(session->s_cap_flushing.next, - struct ceph_inode_info, - i_flushing_item)->vfs_inode); + if (list_empty(&ci->i_cap_flush_list)) { + list_del_init(&ci->i_flushing_item); + if (!list_empty(&session->s_cap_flushing)) { + dout(" mds%d still flushing cap on %p\n", + session->s_mds, + &list_first_entry(&session->s_cap_flushing, + struct ceph_inode_info, + i_flushing_item)->vfs_inode); + } + } mdsc->num_cap_flushing--; dout(" inode %p now !flushing\n", inode); if (ci->i_dirty_caps == 0) { dout(" inode %p now clean\n", inode); BUG_ON(!list_empty(&ci->i_dirty_item)); - drop = 1; + drop = true; if (ci->i_wr_ref == 0 && ci->i_wrbuffer_ref_head == 0) { BUG_ON(!ci->i_head_snapc); @@ -3102,17 +3122,21 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, } } spin_unlock(&mdsc->cap_dirty_lock); - wake_up_all(&ci->i_cap_wq); out: spin_unlock(&ci->i_ceph_lock); while (!list_empty(&to_remove)) { cf = list_first_entry(&to_remove, - struct ceph_cap_flush, list); - list_del(&cf->list); + struct ceph_cap_flush, i_list); + list_del(&cf->i_list); ceph_free_cap_flush(cf); } + + if (wake_ci) + wake_up_all(&ci->i_cap_wq); + if (wake_mdsc) + wake_up_all(&mdsc->cap_flushing_wq); if (drop) iput(inode); } @@ -3131,7 +3155,9 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; u64 follows = le64_to_cpu(m->snap_follows); struct ceph_cap_snap *capsnap; - int drop = 0; + bool flushed = false; + bool wake_ci = false; + bool wake_mdsc = false; dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n", inode, ci, session->s_mds, follows); @@ -3139,30 +3165,47 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, spin_lock(&ci->i_ceph_lock); list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { if (capsnap->follows == follows) { - if (capsnap->flush_tid != flush_tid) { + if (capsnap->cap_flush.tid != flush_tid) { dout(" cap_snap %p follows %lld tid %lld !=" " %lld\n", capsnap, follows, - flush_tid, capsnap->flush_tid); + flush_tid, capsnap->cap_flush.tid); break; } - WARN_ON(capsnap->dirty_pages || capsnap->writing); - dout(" removing %p cap_snap %p follows %lld\n", - inode, capsnap, follows); - ceph_put_snap_context(capsnap->context); - list_del(&capsnap->ci_item); - list_del(&capsnap->flushing_item); - ceph_put_cap_snap(capsnap); - wake_up_all(&mdsc->cap_flushing_wq); - drop = 1; + flushed = true; break; } else { dout(" skipping cap_snap %p follows %lld\n", capsnap, capsnap->follows); } } + if (flushed) { + WARN_ON(capsnap->dirty_pages || capsnap->writing); + dout(" removing %p cap_snap %p follows %lld\n", + inode, capsnap, follows); + list_del(&capsnap->ci_item); + if (__finish_cap_flush(NULL, ci, &capsnap->cap_flush)) + wake_ci = true; + + spin_lock(&mdsc->cap_dirty_lock); + + if (list_empty(&ci->i_cap_flush_list)) + list_del_init(&ci->i_flushing_item); + + if (__finish_cap_flush(mdsc, NULL, &capsnap->cap_flush)) + wake_mdsc = true; + + spin_unlock(&mdsc->cap_dirty_lock); + } spin_unlock(&ci->i_ceph_lock); - if (drop) + if (flushed) { + ceph_put_snap_context(capsnap->context); + ceph_put_cap_snap(capsnap); + if (wake_ci) + wake_up_all(&ci->i_cap_wq); + if (wake_mdsc) + wake_up_all(&mdsc->cap_flushing_wq); iput(inode); + } } /* @@ -3267,7 +3310,8 @@ retry: tcap->implemented |= issued; if (cap == ci->i_auth_cap) ci->i_auth_cap = tcap; - if (ci->i_flushing_caps && ci->i_auth_cap == tcap) { + if (!list_empty(&ci->i_cap_flush_list) && + ci->i_auth_cap == tcap) { spin_lock(&mdsc->cap_dirty_lock); list_move_tail(&ci->i_flushing_item, &tcap->session->s_cap_flushing); @@ -3420,20 +3464,18 @@ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_cap *cap; struct ceph_mds_caps *h; struct ceph_mds_cap_peer *peer = NULL; - struct ceph_snap_realm *realm; + struct ceph_snap_realm *realm = NULL; + struct ceph_string *pool_ns = NULL; int mds = session->s_mds; int op, issued; u32 seq, mseq; struct ceph_vino vino; - u64 cap_id; - u64 size, max_size; u64 tid; u64 inline_version = 0; void *inline_data = NULL; u32 inline_len = 0; void *snaptrace; size_t snaptrace_len; - u32 pool_ns_len = 0; void *p, *end; dout("handle_caps from mds%d\n", mds); @@ -3447,11 +3489,8 @@ void ceph_handle_caps(struct ceph_mds_session *session, op = le32_to_cpu(h->op); vino.ino = le64_to_cpu(h->ino); vino.snap = CEPH_NOSNAP; - cap_id = le64_to_cpu(h->cap_id); seq = le32_to_cpu(h->seq); mseq = le32_to_cpu(h->migrate_seq); - size = le64_to_cpu(h->size); - max_size = le64_to_cpu(h->max_size); snaptrace = h + 1; snaptrace_len = le32_to_cpu(h->snap_trace_len); @@ -3490,6 +3529,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, u64 flush_tid; u32 caller_uid, caller_gid; u32 osd_epoch_barrier; + u32 pool_ns_len; /* version >= 5 */ ceph_decode_32_safe(&p, end, osd_epoch_barrier, bad); /* version >= 6 */ @@ -3499,6 +3539,11 @@ void ceph_handle_caps(struct ceph_mds_session *session, ceph_decode_32_safe(&p, end, caller_gid, bad); /* version >= 8 */ ceph_decode_32_safe(&p, end, pool_ns_len, bad); + if (pool_ns_len > 0) { + ceph_decode_need(&p, end, pool_ns_len, bad); + pool_ns = ceph_find_or_create_string(p, pool_ns_len); + p += pool_ns_len; + } } /* lookup ino */ @@ -3519,7 +3564,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, cap = ceph_get_cap(mdsc, NULL); cap->cap_ino = vino.ino; cap->queue_release = 1; - cap->cap_id = cap_id; + cap->cap_id = le64_to_cpu(h->cap_id); cap->mseq = mseq; cap->seq = seq; spin_lock(&session->s_cap_lock); @@ -3554,10 +3599,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, } handle_cap_import(mdsc, inode, h, peer, session, &cap, &issued); - handle_cap_grant(mdsc, inode, h, + handle_cap_grant(mdsc, inode, h, &pool_ns, inline_version, inline_data, inline_len, - msg->middle, session, cap, issued, - pool_ns_len); + msg->middle, session, cap, issued); if (realm) ceph_put_snap_realm(mdsc, realm); goto done_unlocked; @@ -3579,10 +3623,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, case CEPH_CAP_OP_GRANT: __ceph_caps_issued(ci, &issued); issued |= __ceph_caps_dirty(ci); - handle_cap_grant(mdsc, inode, h, + handle_cap_grant(mdsc, inode, h, &pool_ns, inline_version, inline_data, inline_len, - msg->middle, session, cap, issued, - pool_ns_len); + msg->middle, session, cap, issued); goto done_unlocked; case CEPH_CAP_OP_FLUSH_ACK: @@ -3613,6 +3656,7 @@ done: mutex_unlock(&session->s_mutex); done_unlocked: iput(inode); + ceph_put_string(pool_ns); return; bad: @@ -3673,6 +3717,16 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) dout("flush_dirty_caps done\n"); } +void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode) +{ + int i; + int bits = (fmode << 1) | 1; + for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { + if (bits & (1 << i)) + ci->i_nr_by_mode[i]++; + } +} + /* * Drop open file reference. If we were the last open file, * we may need to release capabilities to the MDS (or schedule @@ -3680,15 +3734,20 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) */ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode) { - struct inode *inode = &ci->vfs_inode; - int last = 0; - + int i, last = 0; + int bits = (fmode << 1) | 1; spin_lock(&ci->i_ceph_lock); - dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode, - ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1); - BUG_ON(ci->i_nr_by_mode[fmode] == 0); - if (--ci->i_nr_by_mode[fmode] == 0) - last++; + for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { + if (bits & (1 << i)) { + BUG_ON(ci->i_nr_by_mode[i] == 0); + if (--ci->i_nr_by_mode[i] == 0) + last++; + } + } + dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n", + &ci->vfs_inode, fmode, + ci->i_nr_by_mode[0], ci->i_nr_by_mode[1], + ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]); spin_unlock(&ci->i_ceph_lock); if (last && ci->i_vino.snap == CEPH_NOSNAP) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 6e0fedf6713b..c64a0b794d49 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -59,7 +59,7 @@ int ceph_init_dentry(struct dentry *dentry) di->dentry = dentry; di->lease_session = NULL; - dentry->d_time = jiffies; + di->time = jiffies; /* avoid reordering d_fsdata setup so that the check above is safe */ smp_mb(); dentry->d_fsdata = di; @@ -1124,7 +1124,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, void ceph_invalidate_dentry_lease(struct dentry *dentry) { spin_lock(&dentry->d_lock); - dentry->d_time = jiffies; + ceph_dentry(dentry)->time = jiffies; ceph_dentry(dentry)->lease_shared_gen = 0; spin_unlock(&dentry->d_lock); } @@ -1133,7 +1133,8 @@ void ceph_invalidate_dentry_lease(struct dentry *dentry) * Check if dentry lease is valid. If not, delete the lease. Try to * renew if the least is more than half up. */ -static int dentry_lease_is_valid(struct dentry *dentry) +static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags, + struct inode *dir) { struct ceph_dentry_info *di; struct ceph_mds_session *s; @@ -1141,12 +1142,11 @@ static int dentry_lease_is_valid(struct dentry *dentry) u32 gen; unsigned long ttl; struct ceph_mds_session *session = NULL; - struct inode *dir = NULL; u32 seq = 0; spin_lock(&dentry->d_lock); di = ceph_dentry(dentry); - if (di->lease_session) { + if (di && di->lease_session) { s = di->lease_session; spin_lock(&s->s_gen_ttl_lock); gen = s->s_cap_gen; @@ -1154,17 +1154,24 @@ static int dentry_lease_is_valid(struct dentry *dentry) spin_unlock(&s->s_gen_ttl_lock); if (di->lease_gen == gen && - time_before(jiffies, dentry->d_time) && + time_before(jiffies, di->time) && time_before(jiffies, ttl)) { valid = 1; if (di->lease_renew_after && time_after(jiffies, di->lease_renew_after)) { - /* we should renew */ - dir = d_inode(dentry->d_parent); - session = ceph_get_mds_session(s); - seq = di->lease_seq; - di->lease_renew_after = 0; - di->lease_renew_from = jiffies; + /* + * We should renew. If we're in RCU walk mode + * though, we can't do that so just return + * -ECHILD. + */ + if (flags & LOOKUP_RCU) { + valid = -ECHILD; + } else { + session = ceph_get_mds_session(s); + seq = di->lease_seq; + di->lease_renew_after = 0; + di->lease_renew_from = jiffies; + } } } } @@ -1207,15 +1214,19 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) struct dentry *parent; struct inode *dir; - if (flags & LOOKUP_RCU) - return -ECHILD; + if (flags & LOOKUP_RCU) { + parent = ACCESS_ONCE(dentry->d_parent); + dir = d_inode_rcu(parent); + if (!dir) + return -ECHILD; + } else { + parent = dget_parent(dentry); + dir = d_inode(parent); + } dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset); - parent = dget_parent(dentry); - dir = d_inode(parent); - /* always trust cached snapped dentries, snapdir dentry */ if (ceph_snap(dir) != CEPH_NOSNAP) { dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry, @@ -1224,12 +1235,16 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) } else if (d_really_is_positive(dentry) && ceph_snap(d_inode(dentry)) == CEPH_SNAPDIR) { valid = 1; - } else if (dentry_lease_is_valid(dentry) || - dir_lease_is_valid(dir, dentry)) { - if (d_really_is_positive(dentry)) - valid = ceph_is_any_caps(d_inode(dentry)); - else - valid = 1; + } else { + valid = dentry_lease_is_valid(dentry, flags, dir); + if (valid == -ECHILD) + return valid; + if (valid || dir_lease_is_valid(dir, dentry)) { + if (d_really_is_positive(dentry)) + valid = ceph_is_any_caps(d_inode(dentry)); + else + valid = 1; + } } if (!valid) { @@ -1238,6 +1253,9 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) struct ceph_mds_request *req; int op, mask, err; + if (flags & LOOKUP_RCU) + return -ECHILD; + op = ceph_snap(dir) == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); @@ -1273,7 +1291,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) ceph_dir_clear_complete(dir); } - dput(parent); + if (!(flags & LOOKUP_RCU)) + dput(parent); return valid; } @@ -1286,10 +1305,14 @@ static void ceph_d_release(struct dentry *dentry) dout("d_release %p\n", dentry); ceph_dentry_lru_del(dentry); + + spin_lock(&dentry->d_lock); + dentry->d_fsdata = NULL; + spin_unlock(&dentry->d_lock); + if (di->lease_session) ceph_put_mds_session(di->lease_session); kmem_cache_free(ceph_dentry_cachep, di); - dentry->d_fsdata = NULL; } static int ceph_snapdir_d_revalidate(struct dentry *dentry, diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 0daaf7ceedc5..0f5375d8e030 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -708,7 +708,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) } } - ceph_put_page_vector(osd_data->pages, num_pages, false); + ceph_put_page_vector(osd_data->pages, num_pages, !aio_req->write); ceph_osdc_put_request(req); if (rc < 0) @@ -821,6 +821,54 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) } } +/* + * Wait on any unsafe replies for the given inode. First wait on the + * newest request, and make that the upper bound. Then, if there are + * more requests, keep waiting on the oldest as long as it is still older + * than the original request. + */ +void ceph_sync_write_wait(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct list_head *head = &ci->i_unsafe_writes; + struct ceph_osd_request *req; + u64 last_tid; + + if (!S_ISREG(inode->i_mode)) + return; + + spin_lock(&ci->i_unsafe_lock); + if (list_empty(head)) + goto out; + + /* set upper bound as _last_ entry in chain */ + + req = list_last_entry(head, struct ceph_osd_request, + r_unsafe_item); + last_tid = req->r_tid; + + do { + ceph_osdc_get_request(req); + spin_unlock(&ci->i_unsafe_lock); + + dout("sync_write_wait on tid %llu (until %llu)\n", + req->r_tid, last_tid); + wait_for_completion(&req->r_safe_completion); + ceph_osdc_put_request(req); + + spin_lock(&ci->i_unsafe_lock); + /* + * from here on look at first entry in chain, since we + * only want to wait for anything older than last_tid + */ + if (list_empty(head)) + break; + req = list_first_entry(head, struct ceph_osd_request, + r_unsafe_item); + } while (req->r_tid < last_tid); +out: + spin_unlock(&ci->i_unsafe_lock); +} static ssize_t ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, @@ -964,7 +1012,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, len = ret; } - ceph_put_page_vector(pages, num_pages, false); + ceph_put_page_vector(pages, num_pages, !write); ceph_osdc_put_request(req); if (ret < 0) @@ -985,6 +1033,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, } if (aio_req) { + LIST_HEAD(osd_reqs); + if (aio_req->num_reqs == 0) { kfree(aio_req); return ret; @@ -993,8 +1043,9 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR : CEPH_CAP_FILE_RD); - while (!list_empty(&aio_req->osd_reqs)) { - req = list_first_entry(&aio_req->osd_reqs, + list_splice(&aio_req->osd_reqs, &osd_reqs); + while (!list_empty(&osd_reqs)) { + req = list_first_entry(&osd_reqs, struct ceph_osd_request, r_unsafe_item); list_del_init(&req->r_unsafe_item); @@ -1448,16 +1499,14 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; loff_t i_size; - int ret; + loff_t ret; inode_lock(inode); if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); - if (ret < 0) { - offset = ret; + if (ret < 0) goto out; - } } i_size = i_size_read(inode); @@ -1473,7 +1522,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) * write() or lseek() might have altered it */ if (offset == 0) { - offset = file->f_pos; + ret = file->f_pos; goto out; } offset += file->f_pos; @@ -1493,11 +1542,11 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) break; } - offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); + ret = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: inode_unlock(inode); - return offset; + return ret; } static inline void ceph_zero_partial_page( @@ -1583,9 +1632,9 @@ static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length) { int ret = 0; struct ceph_inode_info *ci = ceph_inode(inode); - s32 stripe_unit = ceph_file_layout_su(ci->i_layout); - s32 stripe_count = ceph_file_layout_stripe_count(ci->i_layout); - s32 object_size = ceph_file_layout_object_size(ci->i_layout); + s32 stripe_unit = ci->i_layout.stripe_unit; + s32 stripe_count = ci->i_layout.stripe_count; + s32 object_size = ci->i_layout.object_size; u64 object_set_size = object_size * stripe_count; u64 nearly, t; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index f059b5997072..dd3a6dbf71eb 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -446,7 +446,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_symlink = NULL; memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); - ci->i_pool_ns_len = 0; + RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL); ci->i_fragtree = RB_ROOT; mutex_init(&ci->i_fragtree_mutex); @@ -468,7 +468,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&ci->i_dirty_item); INIT_LIST_HEAD(&ci->i_flushing_item); ci->i_prealloc_cap_flush = NULL; - ci->i_cap_flush_tree = RB_ROOT; + INIT_LIST_HEAD(&ci->i_cap_flush_list); init_waitqueue_head(&ci->i_cap_wq); ci->i_hold_caps_min = 0; ci->i_hold_caps_max = 0; @@ -477,7 +477,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_head_snapc = NULL; ci->i_snap_caps = 0; - for (i = 0; i < CEPH_FILE_MODE_NUM; i++) + for (i = 0; i < CEPH_FILE_MODE_BITS; i++) ci->i_nr_by_mode[i] = 0; mutex_init(&ci->i_truncate_mutex); @@ -570,6 +570,8 @@ void ceph_destroy_inode(struct inode *inode) if (ci->i_xattrs.prealloc_blob) ceph_buffer_put(ci->i_xattrs.prealloc_blob); + ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns)); + call_rcu(&inode->i_rcu, ceph_i_callback); } @@ -583,6 +585,14 @@ int ceph_drop_inode(struct inode *inode) return 1; } +void ceph_evict_inode(struct inode *inode) +{ + /* wait unsafe sync writes */ + ceph_sync_write_wait(inode); + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); +} + static inline blkcnt_t calc_inode_blocks(u64 size) { return (size + (1<<9) - 1) >> 9; @@ -733,6 +743,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, int issued = 0, implemented, new_issued; struct timespec mtime, atime, ctime; struct ceph_buffer *xattr_blob = NULL; + struct ceph_string *pool_ns = NULL; struct ceph_cap *new_cap = NULL; int err = 0; bool wake = false; @@ -760,6 +771,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page, iinfo->xattr_len); } + if (iinfo->pool_ns_len > 0) + pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data, + iinfo->pool_ns_len); + spin_lock(&ci->i_ceph_lock); /* @@ -814,10 +829,18 @@ static int fill_inode(struct inode *inode, struct page *locked_page, if (new_version || (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { - if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool) + s64 old_pool = ci->i_layout.pool_id; + struct ceph_string *old_ns; + + ceph_file_layout_from_legacy(&ci->i_layout, &info->layout); + old_ns = rcu_dereference_protected(ci->i_layout.pool_ns, + lockdep_is_held(&ci->i_ceph_lock)); + rcu_assign_pointer(ci->i_layout.pool_ns, pool_ns); + + if (ci->i_layout.pool_id != old_pool || pool_ns != old_ns) ci->i_ceph_flags &= ~CEPH_I_POOL_PERM; - ci->i_layout = info->layout; - ci->i_pool_ns_len = iinfo->pool_ns_len; + + pool_ns = old_ns; queue_trunc = ceph_fill_file_size(inode, issued, le32_to_cpu(info->truncate_seq), @@ -985,6 +1008,7 @@ out: ceph_put_cap(mdsc, new_cap); if (xattr_blob) ceph_buffer_put(xattr_blob); + ceph_put_string(pool_ns); return err; } @@ -1018,7 +1042,7 @@ static void update_dentry_lease(struct dentry *dentry, goto out_unlock; if (di->lease_gen == session->s_cap_gen && - time_before(ttl, dentry->d_time)) + time_before(ttl, di->time)) goto out_unlock; /* we already have a newer lease. */ if (di->lease_session && di->lease_session != session) @@ -1032,7 +1056,7 @@ static void update_dentry_lease(struct dentry *dentry, di->lease_seq = le32_to_cpu(lease->seq); di->lease_renew_after = half_ttl; di->lease_renew_from = 0; - dentry->d_time = ttl; + di->time = ttl; out_unlock: spin_unlock(&dentry->d_lock); return; @@ -1164,7 +1188,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, dname.name = rinfo->dname; dname.len = rinfo->dname_len; - dname.hash = full_name_hash(dname.name, dname.len); + dname.hash = full_name_hash(parent, dname.name, dname.len); vino.ino = le64_to_cpu(rinfo->targeti.in->ino); vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); retry_lookup: @@ -1508,7 +1532,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, dname.name = rde->name; dname.len = rde->name_len; - dname.hash = full_name_hash(dname.name, dname.len); + dname.hash = full_name_hash(parent, dname.name, dname.len); vino.ino = le64_to_cpu(rde->inode.in->ino); vino.snap = le64_to_cpu(rde->inode.in->snapid); diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index be6b1657b1af..7d752d53353a 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -21,10 +21,10 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg) err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false); if (!err) { - l.stripe_unit = ceph_file_layout_su(ci->i_layout); - l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); - l.object_size = ceph_file_layout_object_size(ci->i_layout); - l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool); + l.stripe_unit = ci->i_layout.stripe_unit; + l.stripe_count = ci->i_layout.stripe_count; + l.object_size = ci->i_layout.object_size; + l.data_pool = ci->i_layout.pool_id; l.preferred_osd = (s32)-1; if (copy_to_user(arg, &l, sizeof(l))) return -EFAULT; @@ -82,19 +82,19 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) if (l.stripe_count) nl.stripe_count = l.stripe_count; else - nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); + nl.stripe_count = ci->i_layout.stripe_count; if (l.stripe_unit) nl.stripe_unit = l.stripe_unit; else - nl.stripe_unit = ceph_file_layout_su(ci->i_layout); + nl.stripe_unit = ci->i_layout.stripe_unit; if (l.object_size) nl.object_size = l.object_size; else - nl.object_size = ceph_file_layout_object_size(ci->i_layout); + nl.object_size = ci->i_layout.object_size; if (l.data_pool) nl.data_pool = l.data_pool; else - nl.data_pool = ceph_file_layout_pg_pool(ci->i_layout); + nl.data_pool = ci->i_layout.pool_id; /* this is obsolete, and always -1 */ nl.preferred_osd = le64_to_cpu(-1); @@ -183,7 +183,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->client->osdc; struct ceph_object_locator oloc; - struct ceph_object_id oid; + CEPH_DEFINE_OID_ONSTACK(oid); u64 len = 1, olen; u64 tmp; struct ceph_pg pgid; @@ -202,8 +202,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) return -EIO; } dl.file_offset -= dl.object_offset; - dl.object_size = ceph_file_layout_object_size(ci->i_layout); - dl.block_size = ceph_file_layout_su(ci->i_layout); + dl.object_size = ci->i_layout.object_size; + dl.block_size = ci->i_layout.stripe_unit; /* block_offset = object_offset % block_size */ tmp = dl.object_offset; @@ -212,10 +212,13 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", ceph_ino(inode), dl.object_no); - oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); + oloc.pool = ci->i_layout.pool_id; + oloc.pool_ns = ceph_try_get_string(ci->i_layout.pool_ns); ceph_oid_printf(&oid, "%s", dl.object_name); r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid); + + ceph_oloc_destroy(&oloc); if (r < 0) { up_read(&osdc->lock); return r; @@ -247,9 +250,8 @@ static long ceph_ioctl_lazyio(struct file *file) if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) { spin_lock(&ci->i_ceph_lock); - ci->i_nr_by_mode[fi->fmode]--; fi->fmode |= CEPH_FILE_MODE_LAZY; - ci->i_nr_by_mode[fi->fmode]++; + ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++; spin_unlock(&ci->i_ceph_lock); dout("ioctl_layzio: file %p marked lazy\n", file); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 2103b823bec0..fa59a85226b2 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -48,7 +48,7 @@ struct ceph_reconnect_state { int nr_caps; struct ceph_pagelist *pagelist; - bool flock; + unsigned msg_version; }; static void __wake_requests(struct ceph_mds_client *mdsc, @@ -100,12 +100,15 @@ static int parse_reply_info_in(void **p, void *end, } else info->inline_version = CEPH_INLINE_NONE; + info->pool_ns_len = 0; + info->pool_ns_data = NULL; if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) { ceph_decode_32_safe(p, end, info->pool_ns_len, bad); - ceph_decode_need(p, end, info->pool_ns_len, bad); - *p += info->pool_ns_len; - } else { - info->pool_ns_len = 0; + if (info->pool_ns_len > 0) { + ceph_decode_need(p, end, info->pool_ns_len, bad); + info->pool_ns_data = *p; + *p += info->pool_ns_len; + } } return 0; @@ -469,7 +472,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, s->s_cap_iterator = NULL; INIT_LIST_HEAD(&s->s_cap_releases); INIT_LIST_HEAD(&s->s_cap_flushing); - INIT_LIST_HEAD(&s->s_cap_snaps_flushing); dout("register_session mds%d\n", mds); if (mds >= mdsc->max_sessions) { @@ -1145,19 +1147,17 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) invalidate = true; - while (true) { - struct rb_node *n = rb_first(&ci->i_cap_flush_tree); - if (!n) - break; - cf = rb_entry(n, struct ceph_cap_flush, i_node); - rb_erase(&cf->i_node, &ci->i_cap_flush_tree); - list_add(&cf->list, &to_remove); + while (!list_empty(&ci->i_cap_flush_list)) { + cf = list_first_entry(&ci->i_cap_flush_list, + struct ceph_cap_flush, i_list); + list_del(&cf->i_list); + list_add(&cf->i_list, &to_remove); } spin_lock(&mdsc->cap_dirty_lock); - list_for_each_entry(cf, &to_remove, list) - rb_erase(&cf->g_node, &mdsc->cap_flush_tree); + list_for_each_entry(cf, &to_remove, i_list) + list_del(&cf->g_list); if (!list_empty(&ci->i_dirty_item)) { pr_warn_ratelimited( @@ -1181,7 +1181,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, spin_unlock(&mdsc->cap_dirty_lock); if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) { - list_add(&ci->i_prealloc_cap_flush->list, &to_remove); + list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); ci->i_prealloc_cap_flush = NULL; } } @@ -1189,8 +1189,8 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, while (!list_empty(&to_remove)) { struct ceph_cap_flush *cf; cf = list_first_entry(&to_remove, - struct ceph_cap_flush, list); - list_del(&cf->list); + struct ceph_cap_flush, i_list); + list_del(&cf->i_list); ceph_free_cap_flush(cf); } @@ -1212,6 +1212,8 @@ static void remove_session_caps(struct ceph_mds_session *session) dout("remove_session_caps on %p\n", session); iterate_session_caps(session, remove_session_caps_cb, fsc); + wake_up_all(&fsc->mdsc->cap_flushing_wq); + spin_lock(&session->s_cap_lock); if (session->s_nr_caps > 0) { struct inode *inode; @@ -1478,35 +1480,21 @@ static int trim_caps(struct ceph_mds_client *mdsc, return 0; } -static int check_capsnap_flush(struct ceph_inode_info *ci, - u64 want_snap_seq) -{ - int ret = 1; - spin_lock(&ci->i_ceph_lock); - if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) { - struct ceph_cap_snap *capsnap = - list_first_entry(&ci->i_cap_snaps, - struct ceph_cap_snap, ci_item); - ret = capsnap->follows >= want_snap_seq; - } - spin_unlock(&ci->i_ceph_lock); - return ret; -} - static int check_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_tid) { - struct rb_node *n; - struct ceph_cap_flush *cf; int ret = 1; spin_lock(&mdsc->cap_dirty_lock); - n = rb_first(&mdsc->cap_flush_tree); - cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL; - if (cf && cf->tid <= want_flush_tid) { - dout("check_caps_flush still flushing tid %llu <= %llu\n", - cf->tid, want_flush_tid); - ret = 0; + if (!list_empty(&mdsc->cap_flush_list)) { + struct ceph_cap_flush *cf = + list_first_entry(&mdsc->cap_flush_list, + struct ceph_cap_flush, g_list); + if (cf->tid <= want_flush_tid) { + dout("check_caps_flush still flushing tid " + "%llu <= %llu\n", cf->tid, want_flush_tid); + ret = 0; + } } spin_unlock(&mdsc->cap_dirty_lock); return ret; @@ -1518,54 +1506,9 @@ static int check_caps_flush(struct ceph_mds_client *mdsc, * returns true if we've flushed through want_flush_tid */ static void wait_caps_flush(struct ceph_mds_client *mdsc, - u64 want_flush_tid, u64 want_snap_seq) + u64 want_flush_tid) { - int mds; - - dout("check_caps_flush want %llu snap want %llu\n", - want_flush_tid, want_snap_seq); - mutex_lock(&mdsc->mutex); - for (mds = 0; mds < mdsc->max_sessions; ) { - struct ceph_mds_session *session = mdsc->sessions[mds]; - struct inode *inode = NULL; - - if (!session) { - mds++; - continue; - } - get_session(session); - mutex_unlock(&mdsc->mutex); - - mutex_lock(&session->s_mutex); - if (!list_empty(&session->s_cap_snaps_flushing)) { - struct ceph_cap_snap *capsnap = - list_first_entry(&session->s_cap_snaps_flushing, - struct ceph_cap_snap, - flushing_item); - struct ceph_inode_info *ci = capsnap->ci; - if (!check_capsnap_flush(ci, want_snap_seq)) { - dout("check_cap_flush still flushing snap %p " - "follows %lld <= %lld to mds%d\n", - &ci->vfs_inode, capsnap->follows, - want_snap_seq, mds); - inode = igrab(&ci->vfs_inode); - } - } - mutex_unlock(&session->s_mutex); - ceph_put_mds_session(session); - - if (inode) { - wait_event(mdsc->cap_flushing_wq, - check_capsnap_flush(ceph_inode(inode), - want_snap_seq)); - iput(inode); - } else { - mds++; - } - - mutex_lock(&mdsc->mutex); - } - mutex_unlock(&mdsc->mutex); + dout("check_caps_flush want %llu\n", want_flush_tid); wait_event(mdsc->cap_flushing_wq, check_caps_flush(mdsc, want_flush_tid)); @@ -2163,6 +2106,11 @@ static int __do_request(struct ceph_mds_client *mdsc, mds = __choose_mds(mdsc, req); if (mds < 0 || ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { + if (mdsc->mdsmap_err) { + err = mdsc->mdsmap_err; + dout("do_request mdsmap err %d\n", err); + goto finish; + } dout("do_request no mds or not active, waiting for map\n"); list_add(&req->r_wait, &mdsc->waiting_for_map); goto out; @@ -2292,14 +2240,6 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); - /* deny access to directories with pool_ns layouts */ - if (req->r_inode && S_ISDIR(req->r_inode->i_mode) && - ceph_inode(req->r_inode)->i_pool_ns_len) - return -EIO; - if (req->r_locked_dir && - ceph_inode(req->r_locked_dir)->i_pool_ns_len) - return -EIO; - /* issue */ mutex_lock(&mdsc->mutex); __register_request(mdsc, req, dir); @@ -2791,13 +2731,13 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_mds_cap_reconnect v2; struct ceph_mds_cap_reconnect_v1 v1; } rec; - size_t reclen; struct ceph_inode_info *ci; struct ceph_reconnect_state *recon_state = arg; struct ceph_pagelist *pagelist = recon_state->pagelist; char *path; int pathlen, err; u64 pathbase; + u64 snap_follows; struct dentry *dentry; ci = cap->ci; @@ -2820,9 +2760,6 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, path = NULL; pathlen = 0; } - err = ceph_pagelist_encode_string(pagelist, path, pathlen); - if (err) - goto out_free; spin_lock(&ci->i_ceph_lock); cap->seq = 0; /* reset cap seq */ @@ -2830,14 +2767,13 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, cap->mseq = 0; /* and migrate_seq */ cap->cap_gen = cap->session->s_cap_gen; - if (recon_state->flock) { + if (recon_state->msg_version >= 2) { rec.v2.cap_id = cpu_to_le64(cap->cap_id); rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); rec.v2.issued = cpu_to_le32(cap->issued); rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); rec.v2.pathbase = cpu_to_le64(pathbase); rec.v2.flock_len = 0; - reclen = sizeof(rec.v2); } else { rec.v1.cap_id = cpu_to_le64(cap->cap_id); rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); @@ -2847,13 +2783,23 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, ceph_encode_timespec(&rec.v1.atime, &inode->i_atime); rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); rec.v1.pathbase = cpu_to_le64(pathbase); - reclen = sizeof(rec.v1); + } + + if (list_empty(&ci->i_cap_snaps)) { + snap_follows = 0; + } else { + struct ceph_cap_snap *capsnap = + list_first_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, ci_item); + snap_follows = capsnap->follows; } spin_unlock(&ci->i_ceph_lock); - if (recon_state->flock) { + if (recon_state->msg_version >= 2) { int num_fcntl_locks, num_flock_locks; struct ceph_filelock *flocks; + size_t struct_len, total_len = 0; + u8 struct_v = 0; encode_again: ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); @@ -2872,20 +2818,51 @@ encode_again: goto encode_again; goto out_free; } + + if (recon_state->msg_version >= 3) { + /* version, compat_version and struct_len */ + total_len = 2 * sizeof(u8) + sizeof(u32); + struct_v = 2; + } /* * number of encoded locks is stable, so copy to pagelist */ - rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) + - (num_fcntl_locks+num_flock_locks) * - sizeof(struct ceph_filelock)); - err = ceph_pagelist_append(pagelist, &rec, reclen); - if (!err) - err = ceph_locks_to_pagelist(flocks, pagelist, - num_fcntl_locks, - num_flock_locks); + struct_len = 2 * sizeof(u32) + + (num_fcntl_locks + num_flock_locks) * + sizeof(struct ceph_filelock); + rec.v2.flock_len = cpu_to_le32(struct_len); + + struct_len += sizeof(rec.v2); + struct_len += sizeof(u32) + pathlen; + + if (struct_v >= 2) + struct_len += sizeof(u64); /* snap_follows */ + + total_len += struct_len; + err = ceph_pagelist_reserve(pagelist, total_len); + + if (!err) { + if (recon_state->msg_version >= 3) { + ceph_pagelist_encode_8(pagelist, struct_v); + ceph_pagelist_encode_8(pagelist, 1); + ceph_pagelist_encode_32(pagelist, struct_len); + } + ceph_pagelist_encode_string(pagelist, path, pathlen); + ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2)); + ceph_locks_to_pagelist(flocks, pagelist, + num_fcntl_locks, + num_flock_locks); + if (struct_v >= 2) + ceph_pagelist_encode_64(pagelist, snap_follows); + } kfree(flocks); } else { - err = ceph_pagelist_append(pagelist, &rec, reclen); + size_t size = sizeof(u32) + pathlen + sizeof(rec.v1); + err = ceph_pagelist_reserve(pagelist, size); + if (!err) { + ceph_pagelist_encode_string(pagelist, path, pathlen); + ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1)); + } } recon_state->nr_caps++; @@ -2976,7 +2953,12 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, recon_state.nr_caps = 0; recon_state.pagelist = pagelist; - recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK; + if (session->s_con.peer_features & CEPH_FEATURE_MDSENC) + recon_state.msg_version = 3; + else if (session->s_con.peer_features & CEPH_FEATURE_FLOCK) + recon_state.msg_version = 2; + else + recon_state.msg_version = 1; err = iterate_session_caps(session, encode_caps_cb, &recon_state); if (err < 0) goto fail; @@ -3005,8 +2987,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, goto fail; } - if (recon_state.flock) - reply->hdr.version = cpu_to_le16(2); + reply->hdr.version = cpu_to_le16(recon_state.msg_version); /* raced with cap release? */ if (s_nr_caps != recon_state.nr_caps) { @@ -3204,7 +3185,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, WARN_ON(1); goto release; /* hrm... */ } - dname.hash = full_name_hash(dname.name, dname.len); + dname.hash = full_name_hash(parent, dname.name, dname.len); dentry = d_lookup(parent, &dname); dput(parent); if (!dentry) @@ -3231,7 +3212,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, msecs_to_jiffies(le32_to_cpu(h->duration_ms)); di->lease_seq = seq; - dentry->d_time = di->lease_renew_from + duration; + di->time = di->lease_renew_from + duration; di->lease_renew_after = di->lease_renew_from + (duration >> 1); di->lease_renew_from = 0; @@ -3297,47 +3278,6 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, } /* - * Preemptively release a lease we expect to invalidate anyway. - * Pass @inode always, @dentry is optional. - */ -void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, - struct dentry *dentry) -{ - struct ceph_dentry_info *di; - struct ceph_mds_session *session; - u32 seq; - - BUG_ON(inode == NULL); - BUG_ON(dentry == NULL); - - /* is dentry lease valid? */ - spin_lock(&dentry->d_lock); - di = ceph_dentry(dentry); - if (!di || !di->lease_session || - di->lease_session->s_mds < 0 || - di->lease_gen != di->lease_session->s_cap_gen || - !time_before(jiffies, dentry->d_time)) { - dout("lease_release inode %p dentry %p -- " - "no lease\n", - inode, dentry); - spin_unlock(&dentry->d_lock); - return; - } - - /* we do have a lease on this dentry; note mds and seq */ - session = ceph_get_mds_session(di->lease_session); - seq = di->lease_seq; - __ceph_mdsc_drop_dentry_lease(dentry); - spin_unlock(&dentry->d_lock); - - dout("lease_release inode %p dentry %p to mds%d\n", - inode, dentry, session->s_mds); - ceph_mdsc_lease_send_msg(session, inode, dentry, - CEPH_MDS_LEASE_RELEASE, seq); - ceph_put_mds_session(session); -} - -/* * drop all leases (and dentry refs) in preparation for umount */ static void drop_leases(struct ceph_mds_client *mdsc) @@ -3470,7 +3410,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) INIT_LIST_HEAD(&mdsc->snap_flush_list); spin_lock_init(&mdsc->snap_flush_lock); mdsc->last_cap_flush_tid = 1; - mdsc->cap_flush_tree = RB_ROOT; + INIT_LIST_HEAD(&mdsc->cap_flush_list); INIT_LIST_HEAD(&mdsc->cap_dirty); INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); mdsc->num_cap_flushing = 0; @@ -3585,7 +3525,7 @@ restart: void ceph_mdsc_sync(struct ceph_mds_client *mdsc) { - u64 want_tid, want_flush, want_snap; + u64 want_tid, want_flush; if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) return; @@ -3598,17 +3538,19 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) ceph_flush_dirty_caps(mdsc); spin_lock(&mdsc->cap_dirty_lock); want_flush = mdsc->last_cap_flush_tid; + if (!list_empty(&mdsc->cap_flush_list)) { + struct ceph_cap_flush *cf = + list_last_entry(&mdsc->cap_flush_list, + struct ceph_cap_flush, g_list); + cf->wake = true; + } spin_unlock(&mdsc->cap_dirty_lock); - down_read(&mdsc->snap_rwsem); - want_snap = mdsc->last_snap_seq; - up_read(&mdsc->snap_rwsem); - - dout("sync want tid %lld flush_seq %lld snap_seq %lld\n", - want_tid, want_flush, want_snap); + dout("sync want tid %lld flush_seq %lld\n", + want_tid, want_flush); wait_unsafe_requests(mdsc, want_tid); - wait_caps_flush(mdsc, want_flush, want_snap); + wait_caps_flush(mdsc, want_flush); } /* @@ -3729,11 +3671,86 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc) dout("mdsc_destroy %p done\n", mdsc); } +void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) +{ + struct ceph_fs_client *fsc = mdsc->fsc; + const char *mds_namespace = fsc->mount_options->mds_namespace; + void *p = msg->front.iov_base; + void *end = p + msg->front.iov_len; + u32 epoch; + u32 map_len; + u32 num_fs; + u32 mount_fscid = (u32)-1; + u8 struct_v, struct_cv; + int err = -EINVAL; + + ceph_decode_need(&p, end, sizeof(u32), bad); + epoch = ceph_decode_32(&p); + + dout("handle_fsmap epoch %u\n", epoch); + + ceph_decode_need(&p, end, 2 + sizeof(u32), bad); + struct_v = ceph_decode_8(&p); + struct_cv = ceph_decode_8(&p); + map_len = ceph_decode_32(&p); + + ceph_decode_need(&p, end, sizeof(u32) * 3, bad); + p += sizeof(u32) * 2; /* skip epoch and legacy_client_fscid */ + + num_fs = ceph_decode_32(&p); + while (num_fs-- > 0) { + void *info_p, *info_end; + u32 info_len; + u8 info_v, info_cv; + u32 fscid, namelen; + + ceph_decode_need(&p, end, 2 + sizeof(u32), bad); + info_v = ceph_decode_8(&p); + info_cv = ceph_decode_8(&p); + info_len = ceph_decode_32(&p); + ceph_decode_need(&p, end, info_len, bad); + info_p = p; + info_end = p + info_len; + p = info_end; + + ceph_decode_need(&info_p, info_end, sizeof(u32) * 2, bad); + fscid = ceph_decode_32(&info_p); + namelen = ceph_decode_32(&info_p); + ceph_decode_need(&info_p, info_end, namelen, bad); + + if (mds_namespace && + strlen(mds_namespace) == namelen && + !strncmp(mds_namespace, (char *)info_p, namelen)) { + mount_fscid = fscid; + break; + } + } + + ceph_monc_got_map(&fsc->client->monc, CEPH_SUB_FSMAP, epoch); + if (mount_fscid != (u32)-1) { + fsc->client->monc.fs_cluster_id = mount_fscid; + ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, + 0, true); + ceph_monc_renew_subs(&fsc->client->monc); + } else { + err = -ENOENT; + goto err_out; + } + return; +bad: + pr_err("error decoding fsmap\n"); +err_out: + mutex_lock(&mdsc->mutex); + mdsc->mdsmap_err = -ENOENT; + __wake_requests(mdsc, &mdsc->waiting_for_map); + mutex_unlock(&mdsc->mutex); + return; +} /* * handle mds map update. */ -void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg) +void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) { u32 epoch; u32 maplen; @@ -3840,7 +3857,10 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) switch (type) { case CEPH_MSG_MDS_MAP: - ceph_mdsc_handle_map(mdsc, msg); + ceph_mdsc_handle_mdsmap(mdsc, msg); + break; + case CEPH_MSG_FS_MAP_USER: + ceph_mdsc_handle_fsmap(mdsc, msg); break; case CEPH_MSG_CLIENT_SESSION: handle_session(s, msg); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index e7d38aac7109..6b3679737d4a 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -45,6 +45,7 @@ struct ceph_mds_reply_info_in { u32 inline_len; char *inline_data; u32 pool_ns_len; + char *pool_ns_data; }; struct ceph_mds_reply_dir_entry { @@ -151,7 +152,6 @@ struct ceph_mds_session { /* protected by mutex */ struct list_head s_cap_flushing; /* inodes w/ flushing caps */ - struct list_head s_cap_snaps_flushing; unsigned long s_renew_requested; /* last time we sent a renew req */ u64 s_renew_seq; @@ -275,8 +275,10 @@ struct ceph_mds_request { struct ceph_pool_perm { struct rb_node node; - u32 pool; int perm; + s64 pool; + size_t pool_ns_len; + char pool_ns[]; }; /* @@ -290,6 +292,7 @@ struct ceph_mds_client { struct completion safe_umount_waiters; wait_queue_head_t session_close_wq; struct list_head waiting_for_map; + int mdsmap_err; struct ceph_mds_session **sessions; /* NULL for mds if no session */ atomic_t num_sessions; @@ -321,7 +324,7 @@ struct ceph_mds_client { spinlock_t snap_flush_lock; u64 last_cap_flush_tid; - struct rb_root cap_flush_tree; + struct list_head cap_flush_list; struct list_head cap_dirty; /* inodes with dirty caps */ struct list_head cap_dirty_migrating; /* ...that are migration... */ int num_cap_flushing; /* # caps we are flushing */ @@ -382,10 +385,6 @@ extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc); extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc); -extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, - struct inode *inode, - struct dentry *dn); - extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, struct inode *dir); @@ -420,8 +419,10 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, struct dentry *dentry, char action, u32 seq); -extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, - struct ceph_msg *msg); +extern void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, + struct ceph_msg *msg); +extern void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, + struct ceph_msg *msg); extern struct ceph_mds_session * ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target); diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 9caaa7ffc93f..9ff5219d849e 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -520,9 +520,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) ihold(inode); atomic_set(&capsnap->nref, 1); - capsnap->ci = ci; INIT_LIST_HEAD(&capsnap->ci_item); - INIT_LIST_HEAD(&capsnap->flushing_item); capsnap->follows = old_snapc->seq; capsnap->issued = __ceph_caps_issued(ci, NULL); @@ -551,7 +549,6 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) ci->i_wrbuffer_ref_head = 0; capsnap->context = old_snapc; list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); - old_snapc = NULL; if (used & CEPH_CAP_FILE_WR) { dout("queue_cap_snap %p cap_snap %p snapc %p" @@ -563,6 +560,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) __ceph_finish_cap_snap(ci, capsnap); } capsnap = NULL; + old_snapc = NULL; update_snapc: if (ci->i_head_snapc) { @@ -603,6 +601,8 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, capsnap->dirty_pages); return 0; } + + ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n", inode, capsnap, capsnap->context, capsnap->context->seq, ceph_cap_string(capsnap->dirty), @@ -799,9 +799,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc) inode = &ci->vfs_inode; ihold(inode); spin_unlock(&mdsc->snap_flush_lock); - spin_lock(&ci->i_ceph_lock); - __ceph_flush_snaps(ci, &session, 0); - spin_unlock(&ci->i_ceph_lock); + ceph_flush_snaps(ci, &session); iput(inode); spin_lock(&mdsc->snap_flush_lock); } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 91e02481ce06..e247f6f0feb7 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -108,7 +108,6 @@ static int ceph_sync_fs(struct super_block *sb, int wait) * mount options */ enum { - Opt_mds_namespace, Opt_wsize, Opt_rsize, Opt_rasize, @@ -121,6 +120,7 @@ enum { Opt_last_int, /* int args above */ Opt_snapdirname, + Opt_mds_namespace, Opt_last_string, /* string args above */ Opt_dirstat, @@ -144,7 +144,6 @@ enum { }; static match_table_t fsopt_tokens = { - {Opt_mds_namespace, "mds_namespace=%d"}, {Opt_wsize, "wsize=%d"}, {Opt_rsize, "rsize=%d"}, {Opt_rasize, "rasize=%d"}, @@ -156,6 +155,7 @@ static match_table_t fsopt_tokens = { {Opt_congestion_kb, "write_congestion_kb=%d"}, /* int args above */ {Opt_snapdirname, "snapdirname=%s"}, + {Opt_mds_namespace, "mds_namespace=%s"}, /* string args above */ {Opt_dirstat, "dirstat"}, {Opt_nodirstat, "nodirstat"}, @@ -212,11 +212,14 @@ static int parse_fsopt_token(char *c, void *private) if (!fsopt->snapdir_name) return -ENOMEM; break; - - /* misc */ case Opt_mds_namespace: - fsopt->mds_namespace = intval; + fsopt->mds_namespace = kstrndup(argstr[0].from, + argstr[0].to-argstr[0].from, + GFP_KERNEL); + if (!fsopt->mds_namespace) + return -ENOMEM; break; + /* misc */ case Opt_wsize: fsopt->wsize = intval; break; @@ -302,6 +305,7 @@ static void destroy_mount_options(struct ceph_mount_options *args) { dout("destroy_mount_options %p\n", args); kfree(args->snapdir_name); + kfree(args->mds_namespace); kfree(args->server_path); kfree(args); } @@ -333,6 +337,9 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt, ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name); if (ret) return ret; + ret = strcmp_null(fsopt1->mds_namespace, fsopt2->mds_namespace); + if (ret) + return ret; ret = strcmp_null(fsopt1->server_path, fsopt2->server_path); if (ret) @@ -376,7 +383,6 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; fsopt->congestion_kb = default_congestion_kb(); - fsopt->mds_namespace = CEPH_FS_CLUSTER_ID_NONE; /* * Distinguish the server list from the path in "dev_name". @@ -469,8 +475,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",noacl"); #endif - if (fsopt->mds_namespace != CEPH_FS_CLUSTER_ID_NONE) - seq_printf(m, ",mds_namespace=%d", fsopt->mds_namespace); + if (fsopt->mds_namespace) + seq_printf(m, ",mds_namespace=%s", fsopt->mds_namespace); if (fsopt->wsize) seq_printf(m, ",wsize=%d", fsopt->wsize); if (fsopt->rsize != CEPH_RSIZE_DEFAULT) @@ -509,9 +515,11 @@ static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg) switch (type) { case CEPH_MSG_MDS_MAP: - ceph_mdsc_handle_map(fsc->mdsc, msg); + ceph_mdsc_handle_mdsmap(fsc->mdsc, msg); + return 0; + case CEPH_MSG_FS_MAP_USER: + ceph_mdsc_handle_fsmap(fsc->mdsc, msg); return 0; - default: return -1; } @@ -543,8 +551,14 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, goto fail; } fsc->client->extra_mon_dispatch = extra_mon_dispatch; - fsc->client->monc.fs_cluster_id = fsopt->mds_namespace; - ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true); + + if (fsopt->mds_namespace == NULL) { + ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, + 0, true); + } else { + ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_FSMAP, + 0, false); + } fsc->mount_options = fsopt; @@ -672,8 +686,8 @@ static int __init init_caches(void) if (ceph_dentry_cachep == NULL) goto bad_dentry; - ceph_file_cachep = KMEM_CACHE(ceph_file_info, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + ceph_file_cachep = KMEM_CACHE(ceph_file_info, SLAB_MEM_SPREAD); + if (ceph_file_cachep == NULL) goto bad_file; @@ -731,6 +745,7 @@ static const struct super_operations ceph_super_ops = { .destroy_inode = ceph_destroy_inode, .write_inode = ceph_write_inode, .drop_inode = ceph_drop_inode, + .evict_inode = ceph_evict_inode, .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, .show_options = ceph_show_options, diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 0168b49fb6ad..3e3fa9163059 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -62,7 +62,6 @@ struct ceph_mount_options { int cap_release_safety; int max_readdir; /* max readdir result (entires) */ int max_readdir_bytes; /* max readdir result (bytes) */ - int mds_namespace; /* * everything above this point can be memcmp'd; everything below @@ -70,6 +69,7 @@ struct ceph_mount_options { */ char *snapdir_name; /* default ".snap" */ + char *mds_namespace; /* default NULL */ char *server_path; /* default "/" */ }; @@ -147,6 +147,14 @@ struct ceph_cap { #define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */ #define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */ +struct ceph_cap_flush { + u64 tid; + int caps; /* 0 means capsnap */ + bool wake; /* wake up flush waiters when finish ? */ + struct list_head g_list; // global + struct list_head i_list; // per inode +}; + /* * Snapped cap state that is pending flush to mds. When a snapshot occurs, * we first complete any in-process sync writes and writeback any dirty @@ -154,10 +162,11 @@ struct ceph_cap { */ struct ceph_cap_snap { atomic_t nref; - struct ceph_inode_info *ci; - struct list_head ci_item, flushing_item; + struct list_head ci_item; + + struct ceph_cap_flush cap_flush; - u64 follows, flush_tid; + u64 follows; int issued, dirty; struct ceph_snap_context *context; @@ -186,16 +195,6 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) } } -struct ceph_cap_flush { - u64 tid; - int caps; - struct rb_node g_node; // global - union { - struct rb_node i_node; // inode - struct list_head list; - }; -}; - /* * The frag tree describes how a directory is fragmented, potentially across * multiple metadata servers. It is also used to indicate points where @@ -246,7 +245,7 @@ struct ceph_dentry_info { unsigned long lease_renew_after, lease_renew_from; struct list_head lru; struct dentry *dentry; - u64 time; + unsigned long time; u64 offset; }; @@ -287,7 +286,6 @@ struct ceph_inode_info { struct ceph_dir_layout i_dir_layout; struct ceph_file_layout i_layout; - size_t i_pool_ns_len; char *i_symlink; /* for dirs */ @@ -311,7 +309,7 @@ struct ceph_inode_info { * overlapping, pipelined cap flushes to the mds. we can probably * reduce the tid to 8 bits if we're concerned about inode size. */ struct ceph_cap_flush *i_prealloc_cap_flush; - struct rb_root i_cap_flush_tree; + struct list_head i_cap_flush_list; wait_queue_head_t i_cap_wq; /* threads waiting on a capability */ unsigned long i_hold_caps_min; /* jiffies */ unsigned long i_hold_caps_max; /* jiffies */ @@ -322,7 +320,7 @@ struct ceph_inode_info { dirty|flushing caps */ unsigned i_snap_caps; /* cap bits for snapped files */ - int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ + int i_nr_by_mode[CEPH_FILE_MODE_BITS]; /* open file counts */ struct mutex i_truncate_mutex; u32 i_truncate_seq; /* last truncate to smaller size */ @@ -471,6 +469,8 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ #define CEPH_I_SEC_INITED (1 << 7) /* security initialized */ #define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */ +#define CEPH_I_KICK_FLUSH (1 << 9) /* kick flushing caps */ +#define CEPH_I_FLUSH_SNAPS (1 << 10) /* need flush snapss */ static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, long long release_count, @@ -750,6 +750,7 @@ extern const struct inode_operations ceph_file_iops; extern struct inode *ceph_alloc_inode(struct super_block *sb); extern void ceph_destroy_inode(struct inode *inode); extern int ceph_drop_inode(struct inode *inode); +extern void ceph_evict_inode(struct inode *inode); extern struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino); @@ -890,9 +891,8 @@ extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc); -extern void __ceph_flush_snaps(struct ceph_inode_info *ci, - struct ceph_mds_session **psession, - int again); +extern void ceph_flush_snaps(struct ceph_inode_info *ci, + struct ceph_mds_session **psession); extern void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_session *session); extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc); @@ -907,10 +907,7 @@ extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, loff_t endoff, int *got, struct page **pinned_page); /* for counting open files by mode */ -static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode) -{ - ci->i_nr_by_mode[mode]++; -} +extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode); extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode); /* addr.c */ @@ -931,6 +928,7 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, extern int ceph_release(struct inode *inode, struct file *filp); extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, char *data, size_t len); +extern void ceph_sync_write_wait(struct inode *inode); /* dir.c */ extern const struct file_operations ceph_dir_fops; extern const struct file_operations ceph_snapdir_fops; diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 4870b29df224..adc231892b0d 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -57,81 +57,88 @@ struct ceph_vxattr { static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci) { - size_t s; - char *p = (char *)&ci->i_layout; - - for (s = 0; s < sizeof(ci->i_layout); s++, p++) - if (*p) - return true; - return false; + struct ceph_file_layout *fl = &ci->i_layout; + return (fl->stripe_unit > 0 || fl->stripe_count > 0 || + fl->object_size > 0 || fl->pool_id >= 0 || + rcu_dereference_raw(fl->pool_ns) != NULL); } static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, size_t size) { - int ret; struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); struct ceph_osd_client *osdc = &fsc->client->osdc; - s64 pool = ceph_file_layout_pg_pool(ci->i_layout); + struct ceph_string *pool_ns; + s64 pool = ci->i_layout.pool_id; const char *pool_name; + const char *ns_field = " pool_namespace="; char buf[128]; + size_t len, total_len = 0; + int ret; + + pool_ns = ceph_try_get_string(ci->i_layout.pool_ns); dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); down_read(&osdc->lock); pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); if (pool_name) { - size_t len = strlen(pool_name); - ret = snprintf(buf, sizeof(buf), - "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=", - (unsigned long long)ceph_file_layout_su(ci->i_layout), - (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), - (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); - if (!size) { - ret += len; - } else if (ret + len > size) { - ret = -ERANGE; - } else { - memcpy(val, buf, ret); + len = snprintf(buf, sizeof(buf), + "stripe_unit=%u stripe_count=%u object_size=%u pool=", + ci->i_layout.stripe_unit, ci->i_layout.stripe_count, + ci->i_layout.object_size); + total_len = len + strlen(pool_name); + } else { + len = snprintf(buf, sizeof(buf), + "stripe_unit=%u stripe_count=%u object_size=%u pool=%lld", + ci->i_layout.stripe_unit, ci->i_layout.stripe_count, + ci->i_layout.object_size, (unsigned long long)pool); + total_len = len; + } + + if (pool_ns) + total_len += strlen(ns_field) + pool_ns->len; + + if (!size) { + ret = total_len; + } else if (total_len > size) { + ret = -ERANGE; + } else { + memcpy(val, buf, len); + ret = len; + if (pool_name) { + len = strlen(pool_name); memcpy(val + ret, pool_name, len); ret += len; } - } else { - ret = snprintf(buf, sizeof(buf), - "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld", - (unsigned long long)ceph_file_layout_su(ci->i_layout), - (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), - (unsigned long long)ceph_file_layout_object_size(ci->i_layout), - (unsigned long long)pool); - if (size) { - if (ret <= size) - memcpy(val, buf, ret); - else - ret = -ERANGE; + if (pool_ns) { + len = strlen(ns_field); + memcpy(val + ret, ns_field, len); + ret += len; + memcpy(val + ret, pool_ns->str, pool_ns->len); + ret += pool_ns->len; } } up_read(&osdc->lock); + ceph_put_string(pool_ns); return ret; } static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci, char *val, size_t size) { - return snprintf(val, size, "%lld", - (unsigned long long)ceph_file_layout_su(ci->i_layout)); + return snprintf(val, size, "%u", ci->i_layout.stripe_unit); } static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci, char *val, size_t size) { - return snprintf(val, size, "%lld", - (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout)); + return snprintf(val, size, "%u", ci->i_layout.stripe_count); } static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci, char *val, size_t size) { - return snprintf(val, size, "%lld", - (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); + return snprintf(val, size, "%u", ci->i_layout.object_size); } static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, @@ -140,7 +147,7 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, int ret; struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); struct ceph_osd_client *osdc = &fsc->client->osdc; - s64 pool = ceph_file_layout_pg_pool(ci->i_layout); + s64 pool = ci->i_layout.pool_id; const char *pool_name; down_read(&osdc->lock); @@ -153,6 +160,18 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, return ret; } +static size_t ceph_vxattrcb_layout_pool_namespace(struct ceph_inode_info *ci, + char *val, size_t size) +{ + int ret = 0; + struct ceph_string *ns = ceph_try_get_string(ci->i_layout.pool_ns); + if (ns) { + ret = snprintf(val, size, "%.*s", (int)ns->len, ns->str); + ceph_put_string(ns); + } + return ret; +} + /* directories */ static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val, @@ -241,6 +260,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { XATTR_LAYOUT_FIELD(dir, layout, stripe_count), XATTR_LAYOUT_FIELD(dir, layout, object_size), XATTR_LAYOUT_FIELD(dir, layout, pool), + XATTR_LAYOUT_FIELD(dir, layout, pool_namespace), XATTR_NAME_CEPH(dir, entries), XATTR_NAME_CEPH(dir, files), XATTR_NAME_CEPH(dir, subdirs), @@ -268,6 +288,7 @@ static struct ceph_vxattr ceph_file_vxattrs[] = { XATTR_LAYOUT_FIELD(file, layout, stripe_count), XATTR_LAYOUT_FIELD(file, layout, object_size), XATTR_LAYOUT_FIELD(file, layout, pool), + XATTR_LAYOUT_FIELD(file, layout, pool_namespace), { .name = NULL, 0 } /* Required table terminator */ }; static size_t ceph_file_vxattrs_name_size; /* total size of all names */ diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 788e19195991..6c58e13fed2f 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -244,7 +244,6 @@ static int cifs_debug_data_proc_open(struct inode *inode, struct file *file) } static const struct file_operations cifs_debug_data_proc_fops = { - .owner = THIS_MODULE, .open = cifs_debug_data_proc_open, .read = seq_read, .llseek = seq_lseek, @@ -361,7 +360,6 @@ static int cifs_stats_proc_open(struct inode *inode, struct file *file) } static const struct file_operations cifs_stats_proc_fops = { - .owner = THIS_MODULE, .open = cifs_stats_proc_open, .read = seq_read, .llseek = seq_lseek, @@ -447,7 +445,6 @@ static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer, } static const struct file_operations cifsFYI_proc_fops = { - .owner = THIS_MODULE, .open = cifsFYI_proc_open, .read = seq_read, .llseek = seq_lseek, @@ -479,7 +476,6 @@ static ssize_t cifs_linux_ext_proc_write(struct file *file, } static const struct file_operations cifs_linux_ext_proc_fops = { - .owner = THIS_MODULE, .open = cifs_linux_ext_proc_open, .read = seq_read, .llseek = seq_lseek, @@ -511,7 +507,6 @@ static ssize_t cifs_lookup_cache_proc_write(struct file *file, } static const struct file_operations cifs_lookup_cache_proc_fops = { - .owner = THIS_MODULE, .open = cifs_lookup_cache_proc_open, .read = seq_read, .llseek = seq_lseek, @@ -543,7 +538,6 @@ static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer, } static const struct file_operations traceSMB_proc_fops = { - .owner = THIS_MODULE, .open = traceSMB_proc_open, .read = seq_read, .llseek = seq_lseek, @@ -655,7 +649,6 @@ static ssize_t cifs_security_flags_proc_write(struct file *file, } static const struct file_operations cifs_security_flags_proc_fops = { - .owner = THIS_MODULE, .open = cifs_security_flags_proc_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 3182273a3407..1418daa03d95 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -46,6 +46,9 @@ #define CIFS_MOUNT_CIFS_BACKUPUID 0x200000 /* backup intent bit for a user */ #define CIFS_MOUNT_CIFS_BACKUPGID 0x400000 /* backup intent bit for a group */ #define CIFS_MOUNT_MAP_SFM_CHR 0x800000 /* SFM/MAC mapping for illegal chars */ +#define CIFS_MOUNT_USE_PREFIX_PATH 0x1000000 /* make subpath with unaccessible + * root mountable + */ struct cifs_sb_info { struct rb_root tlink_tree; @@ -67,5 +70,6 @@ struct cifs_sb_info { struct backing_dev_info bdi; struct delayed_work prune_tlinks; struct rcu_head rcu; + char *prepath; }; #endif /* _CIFS_FS_SB_H */ diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 6aeb8d4616a4..8347c90cf483 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -743,24 +743,26 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) memcpy(ses->auth_key.response + baselen, tiblob, tilen); + mutex_lock(&ses->server->srv_mutex); + rc = crypto_hmacmd5_alloc(ses->server); if (rc) { cifs_dbg(VFS, "could not crypto alloc hmacmd5 rc %d\n", rc); - goto setup_ntlmv2_rsp_ret; + goto unlock; } /* calculate ntlmv2_hash */ rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp); if (rc) { cifs_dbg(VFS, "could not get v2 hash rc %d\n", rc); - goto setup_ntlmv2_rsp_ret; + goto unlock; } /* calculate first part of the client response (CR1) */ rc = CalcNTLMv2_response(ses, ntlmv2_hash); if (rc) { cifs_dbg(VFS, "Could not calculate CR1 rc: %d\n", rc); - goto setup_ntlmv2_rsp_ret; + goto unlock; } /* now calculate the session key for NTLMv2 */ @@ -769,13 +771,13 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) if (rc) { cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n", __func__); - goto setup_ntlmv2_rsp_ret; + goto unlock; } rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); if (rc) { cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__); - goto setup_ntlmv2_rsp_ret; + goto unlock; } rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, @@ -783,7 +785,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) CIFS_HMAC_MD5_HASH_SIZE); if (rc) { cifs_dbg(VFS, "%s: Could not update with response\n", __func__); - goto setup_ntlmv2_rsp_ret; + goto unlock; } rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, @@ -791,6 +793,8 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) if (rc) cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); +unlock: + mutex_unlock(&ses->server->srv_mutex); setup_ntlmv2_rsp_ret: kfree(tiblob); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 5d841f39c4b7..6bbec5e784cd 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -689,6 +689,14 @@ cifs_do_mount(struct file_system_type *fs_type, goto out_cifs_sb; } + if (volume_info->prepath) { + cifs_sb->prepath = kstrdup(volume_info->prepath, GFP_KERNEL); + if (cifs_sb->prepath == NULL) { + root = ERR_PTR(-ENOMEM); + goto out_cifs_sb; + } + } + cifs_setup_cifs_sb(volume_info, cifs_sb); rc = cifs_mount(cifs_sb, volume_info); @@ -727,7 +735,11 @@ cifs_do_mount(struct file_system_type *fs_type, sb->s_flags |= MS_ACTIVE; } - root = cifs_get_root(volume_info, sb); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) + root = dget(sb->s_root); + else + root = cifs_get_root(volume_info, sb); + if (IS_ERR(root)) goto out_super; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 7d2b15c06090..7ae03283bd61 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1228,6 +1228,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->ops = &smb1_operations; vol->vals = &smb1_values; + vol->echo_interval = SMB_ECHO_INTERVAL_DEFAULT; + if (!mountdata) goto cifs_parse_mount_err; @@ -2049,7 +2051,7 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol) if (!match_security(server, vol)) return 0; - if (server->echo_interval != vol->echo_interval) + if (server->echo_interval != vol->echo_interval * HZ) return 0; return 1; @@ -3483,6 +3485,44 @@ cifs_get_volume_info(char *mount_data, const char *devname) return volume_info; } +static int +cifs_are_all_path_components_accessible(struct TCP_Server_Info *server, + unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + char *full_path) +{ + int rc; + char *s; + char sep, tmp; + + sep = CIFS_DIR_SEP(cifs_sb); + s = full_path; + + rc = server->ops->is_path_accessible(xid, tcon, cifs_sb, ""); + while (rc == 0) { + /* skip separators */ + while (*s == sep) + s++; + if (!*s) + break; + /* next separator */ + while (*s && *s != sep) + s++; + + /* + * temporarily null-terminate the path at the end of + * the current component + */ + tmp = *s; + *s = 0; + rc = server->ops->is_path_accessible(xid, tcon, cifs_sb, + full_path); + *s = tmp; + } + return rc; +} + int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) { @@ -3620,6 +3660,16 @@ remote_path_check: kfree(full_path); goto mount_fail_check; } + + rc = cifs_are_all_path_components_accessible(server, + xid, tcon, cifs_sb, + full_path); + if (rc != 0) { + cifs_dbg(VFS, "cannot query dirs between root and final path, " + "enabling CIFS_MOUNT_USE_PREFIX_PATH\n"); + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; + rc = 0; + } kfree(full_path); } @@ -3889,6 +3939,7 @@ cifs_umount(struct cifs_sb_info *cifs_sb) bdi_destroy(&cifs_sb->bdi); kfree(cifs_sb->mountdata); + kfree(cifs_sb->prepath); call_rcu(&cifs_sb->rcu, delayed_free); } diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index fb0903fffc22..4e532536cbc6 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -84,6 +84,7 @@ build_path_from_dentry(struct dentry *direntry) struct dentry *temp; int namelen; int dfsplen; + int pplen = 0; char *full_path; char dirsep; struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); @@ -95,8 +96,12 @@ build_path_from_dentry(struct dentry *direntry) dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); else dfsplen = 0; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) + pplen = cifs_sb->prepath ? strlen(cifs_sb->prepath) + 1 : 0; + cifs_bp_rename_retry: - namelen = dfsplen; + namelen = dfsplen + pplen; seq = read_seqbegin(&rename_lock); rcu_read_lock(); for (temp = direntry; !IS_ROOT(temp);) { @@ -137,7 +142,7 @@ cifs_bp_rename_retry: } } rcu_read_unlock(); - if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) { + if (namelen != dfsplen + pplen || read_seqretry(&rename_lock, seq)) { cifs_dbg(FYI, "did not end path lookup where expected. namelen=%ddfsplen=%d\n", namelen, dfsplen); /* presumably this is only possible if racing with a rename @@ -153,6 +158,17 @@ cifs_bp_rename_retry: those safely to '/' if any are found in the middle of the prepath */ /* BB test paths to Windows with '/' in the midst of prepath */ + if (pplen) { + int i; + + cifs_dbg(FYI, "using cifs_sb prepath <%s>\n", cifs_sb->prepath); + memcpy(full_path+dfsplen+1, cifs_sb->prepath, pplen-1); + full_path[dfsplen] = '\\'; + for (i = 0; i < pplen-1; i++) + if (full_path[dfsplen+1+i] == '/') + full_path[dfsplen+1+i] = CIFS_DIR_SEP(cifs_sb); + } + if (dfsplen) { strncpy(full_path, tcon->treeName, dfsplen); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) { @@ -229,6 +245,13 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, goto cifs_create_get_file_info; } + if (S_ISDIR(newinode->i_mode)) { + CIFSSMBClose(xid, tcon, fid->netfid); + iput(newinode); + rc = -EISDIR; + goto out; + } + if (!S_ISREG(newinode->i_mode)) { /* * The server may allow us to open things like @@ -399,10 +422,14 @@ cifs_create_set_dentry: if (rc != 0) { cifs_dbg(FYI, "Create worked, get_inode_info failed rc = %d\n", rc); - if (server->ops->close) - server->ops->close(xid, tcon, fid); - goto out; + goto out_err; } + + if (S_ISDIR(newinode->i_mode)) { + rc = -EISDIR; + goto out_err; + } + d_drop(direntry); d_add(direntry, newinode); @@ -410,6 +437,13 @@ out: kfree(buf); kfree(full_path); return rc; + +out_err: + if (server->ops->close) + server->ops->close(xid, tcon, fid); + if (newinode) + iput(newinode); + goto out; } int @@ -856,7 +890,7 @@ static int cifs_ci_hash(const struct dentry *dentry, struct qstr *q) wchar_t c; int i, charlen; - hash = init_name_hash(); + hash = init_name_hash(dentry); for (i = 0; i < q->len; i += charlen) { charlen = codepage->char2uni(&q->name[i], q->len - i, &c); /* error out if we can't convert the character */ diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 514dadb0575d..b87efd0c92d6 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1002,10 +1002,26 @@ struct inode *cifs_root_iget(struct super_block *sb) struct inode *inode = NULL; long rc; struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + char *path = NULL; + int len; + + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) + && cifs_sb->prepath) { + len = strlen(cifs_sb->prepath); + path = kzalloc(len + 2 /* leading sep + null */, GFP_KERNEL); + if (path == NULL) + return ERR_PTR(-ENOMEM); + path[0] = '/'; + memcpy(path+1, cifs_sb->prepath, len); + } else { + path = kstrdup("", GFP_KERNEL); + if (path == NULL) + return ERR_PTR(-ENOMEM); + } xid = get_xid(); if (tcon->unix_ext) { - rc = cifs_get_inode_info_unix(&inode, "", sb, xid); + rc = cifs_get_inode_info_unix(&inode, path, sb, xid); /* some servers mistakenly claim POSIX support */ if (rc != -EOPNOTSUPP) goto iget_no_retry; @@ -1013,7 +1029,8 @@ struct inode *cifs_root_iget(struct super_block *sb) tcon->unix_ext = false; } - rc = cifs_get_inode_info(&inode, "", NULL, sb, xid, NULL); + convert_delimiter(path, CIFS_DIR_SEP(cifs_sb)); + rc = cifs_get_inode_info(&inode, path, NULL, sb, xid, NULL); iget_no_retry: if (!inode) { @@ -1042,6 +1059,7 @@ iget_no_retry: } out: + kfree(path); /* can not call macro free_xid here since in a void func * TODO: This is no longer true */ diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 3525ed756173..d203c0329626 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1044,6 +1044,9 @@ smb2_new_lease_key(struct cifs_fid *fid) get_random_bytes(fid->lease_key, SMB2_LEASE_KEY_SIZE); } +#define SMB2_SYMLINK_STRUCT_SIZE \ + (sizeof(struct smb2_err_rsp) - 1 + sizeof(struct smb2_symlink_err_rsp)) + static int smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, const char *full_path, char **target_path, @@ -1056,7 +1059,10 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid fid; struct smb2_err_rsp *err_buf = NULL; struct smb2_symlink_err_rsp *symlink; - unsigned int sub_len, sub_offset; + unsigned int sub_len; + unsigned int sub_offset; + unsigned int print_len; + unsigned int print_offset; cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path); @@ -1077,11 +1083,33 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, kfree(utf16_path); return -ENOENT; } + + if (le32_to_cpu(err_buf->ByteCount) < sizeof(struct smb2_symlink_err_rsp) || + get_rfc1002_length(err_buf) + 4 < SMB2_SYMLINK_STRUCT_SIZE) { + kfree(utf16_path); + return -ENOENT; + } + /* open must fail on symlink - reset rc */ rc = 0; symlink = (struct smb2_symlink_err_rsp *)err_buf->ErrorData; sub_len = le16_to_cpu(symlink->SubstituteNameLength); sub_offset = le16_to_cpu(symlink->SubstituteNameOffset); + print_len = le16_to_cpu(symlink->PrintNameLength); + print_offset = le16_to_cpu(symlink->PrintNameOffset); + + if (get_rfc1002_length(err_buf) + 4 < + SMB2_SYMLINK_STRUCT_SIZE + sub_offset + sub_len) { + kfree(utf16_path); + return -ENOENT; + } + + if (get_rfc1002_length(err_buf) + 4 < + SMB2_SYMLINK_STRUCT_SIZE + print_offset + print_len) { + kfree(utf16_path); + return -ENOENT; + } + *target_path = cifs_strndup_from_utf16( (char *)symlink->PathBuffer + sub_offset, sub_len, true, cifs_sb->local_nls); @@ -1515,6 +1543,8 @@ struct smb_version_operations smb20_operations = { .rename = smb2_rename_path, .create_hardlink = smb2_create_hardlink, .query_symlink = smb2_query_symlink, + .query_mf_symlink = smb3_query_mf_symlink, + .create_mf_symlink = smb3_create_mf_symlink, .open = smb2_open_file, .set_fid = smb2_set_fid, .close = smb2_close_file, diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index f36a4040afb8..b0b9cda41928 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -35,7 +35,6 @@ const struct inode_operations coda_ioctl_inode_operations = { }; const struct file_operations coda_ioctl_operations = { - .owner = THIS_MODULE, .unlocked_ioctl = coda_pioctl, .llseek = noop_llseek, }; diff --git a/fs/configfs/file.c b/fs/configfs/file.c index bbc1252a59f5..c30cf49b69d2 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -80,11 +80,11 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf count = attr->show(item, buffer->page); - buffer->needs_read_fill = 0; BUG_ON(count > (ssize_t)SIMPLE_ATTR_SIZE); - if (count >= 0) + if (count >= 0) { + buffer->needs_read_fill = 0; buffer->count = count; - else + } else ret = count; return ret; } @@ -75,13 +75,13 @@ static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) struct request_queue *q = bdev->bd_queue; long rc = -EIO; - dax->addr = (void __pmem *) ERR_PTR(-EIO); + dax->addr = ERR_PTR(-EIO); if (blk_queue_enter(q, true) != 0) return rc; rc = bdev_direct_access(bdev, dax); if (rc < 0) { - dax->addr = (void __pmem *) ERR_PTR(rc); + dax->addr = ERR_PTR(rc); blk_queue_exit(q); return rc; } @@ -147,12 +147,12 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, struct buffer_head *bh) { loff_t pos = start, max = start, bh_max = start; - bool hole = false, need_wmb = false; + bool hole = false; struct block_device *bdev = NULL; int rw = iov_iter_rw(iter), rc; long map_len = 0; struct blk_dax_ctl dax = { - .addr = (void __pmem *) ERR_PTR(-EIO), + .addr = ERR_PTR(-EIO), }; unsigned blkbits = inode->i_blkbits; sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1) @@ -218,7 +218,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, if (iov_iter_rw(iter) == WRITE) { len = copy_from_iter_pmem(dax.addr, max - pos, iter); - need_wmb = true; } else if (!hole) len = copy_to_iter((void __force *) dax.addr, max - pos, iter); @@ -235,8 +234,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, dax.addr += len; } - if (need_wmb) - wmb_pmem(); dax_unmap_atomic(bdev, &dax); return (pos == start) ? rc : pos - start; @@ -788,7 +785,6 @@ int dax_writeback_mapping_range(struct address_space *mapping, return ret; } } - wmb_pmem(); return 0; } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); @@ -1187,7 +1183,6 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector, if (dax_map_atomic(bdev, &dax) < 0) return PTR_ERR(dax.addr); clear_pmem(dax.addr + offset, length); - wmb_pmem(); dax_unmap_atomic(bdev, &dax); } return 0; diff --git a/fs/dcache.c b/fs/dcache.c index d6847d7b123d..b90cf8e09d5b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -104,11 +104,9 @@ static unsigned int d_hash_shift __read_mostly; static struct hlist_bl_head *dentry_hashtable __read_mostly; -static inline struct hlist_bl_head *d_hash(const struct dentry *parent, - unsigned int hash) +static inline struct hlist_bl_head *d_hash(unsigned int hash) { - hash += (unsigned long) parent / L1_CACHE_BYTES; - return dentry_hashtable + hash_32(hash, d_hash_shift); + return dentry_hashtable + (hash >> (32 - d_hash_shift)); } #define IN_LOOKUP_SHIFT 10 @@ -226,10 +224,9 @@ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount) { - const unsigned char *cs; /* * Be careful about RCU walk racing with rename: - * use ACCESS_ONCE to fetch the name pointer. + * use 'lockless_dereference' to fetch the name pointer. * * NOTE! Even if a rename will mean that the length * was not loaded atomically, we don't care. The @@ -243,8 +240,8 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c * early because the data cannot match (there can * be no NUL in the ct/tcount data) */ - cs = ACCESS_ONCE(dentry->d_name.name); - smp_read_barrier_depends(); + const unsigned char *cs = lockless_dereference(dentry->d_name.name); + return dentry_string_cmp(cs, ct, tcount); } @@ -335,44 +332,21 @@ static inline void dentry_rcuwalk_invalidate(struct dentry *dentry) /* * Release the dentry's inode, using the filesystem - * d_iput() operation if defined. Dentry has no refcount - * and is unhashed. - */ -static void dentry_iput(struct dentry * dentry) - __releases(dentry->d_lock) - __releases(dentry->d_inode->i_lock) -{ - struct inode *inode = dentry->d_inode; - if (inode) { - __d_clear_type_and_inode(dentry); - hlist_del_init(&dentry->d_u.d_alias); - spin_unlock(&dentry->d_lock); - spin_unlock(&inode->i_lock); - if (!inode->i_nlink) - fsnotify_inoderemove(inode); - if (dentry->d_op && dentry->d_op->d_iput) - dentry->d_op->d_iput(dentry, inode); - else - iput(inode); - } else { - spin_unlock(&dentry->d_lock); - } -} - -/* - * Release the dentry's inode, using the filesystem - * d_iput() operation if defined. dentry remains in-use. + * d_iput() operation if defined. */ static void dentry_unlink_inode(struct dentry * dentry) __releases(dentry->d_lock) __releases(dentry->d_inode->i_lock) { struct inode *inode = dentry->d_inode; + bool hashed = !d_unhashed(dentry); - raw_write_seqcount_begin(&dentry->d_seq); + if (hashed) + raw_write_seqcount_begin(&dentry->d_seq); __d_clear_type_and_inode(dentry); hlist_del_init(&dentry->d_u.d_alias); - raw_write_seqcount_end(&dentry->d_seq); + if (hashed) + raw_write_seqcount_end(&dentry->d_seq); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); if (!inode->i_nlink) @@ -488,7 +462,7 @@ void __d_drop(struct dentry *dentry) if (unlikely(IS_ROOT(dentry))) b = &dentry->d_sb->s_anon; else - b = d_hash(dentry->d_parent, dentry->d_name.hash); + b = d_hash(dentry->d_name.hash); hlist_bl_lock(b); __hlist_bl_del(&dentry->d_hash); @@ -573,12 +547,10 @@ static void __dentry_kill(struct dentry *dentry) dentry_unlist(dentry, parent); if (parent) spin_unlock(&parent->d_lock); - dentry_iput(dentry); - /* - * dentry_iput drops the locks, at which point nobody (except - * transient RCU lookups) can reach this dentry. - */ - BUG_ON(dentry->d_lockref.count > 0); + if (dentry->d_inode) + dentry_unlink_inode(dentry); + else + spin_unlock(&dentry->d_lock); this_cpu_dec(nr_dentry); if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); @@ -622,7 +594,6 @@ static struct dentry *dentry_kill(struct dentry *dentry) failed: spin_unlock(&dentry->d_lock); - cpu_relax(); return dentry; /* try again with same dentry */ } @@ -796,6 +767,8 @@ void dput(struct dentry *dentry) return; repeat: + might_sleep(); + rcu_read_lock(); if (likely(fast_dput(dentry))) { rcu_read_unlock(); @@ -829,8 +802,10 @@ repeat: kill_it: dentry = dentry_kill(dentry); - if (dentry) + if (dentry) { + cond_resched(); goto repeat; + } } EXPORT_SYMBOL(dput); @@ -1595,6 +1570,7 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) { struct dentry *dentry; char *dname; + int err; dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); if (!dentry) @@ -1653,6 +1629,16 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) INIT_LIST_HEAD(&dentry->d_child); d_set_d_op(dentry, dentry->d_sb->s_d_op); + if (dentry->d_op && dentry->d_op->d_init) { + err = dentry->d_op->d_init(dentry); + if (err) { + if (dname_external(dentry)) + kfree(external_name(dentry)); + kmem_cache_free(dentry_cache, dentry); + return NULL; + } + } + this_cpu_inc(nr_dentry); return dentry; @@ -1716,7 +1702,7 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name) struct qstr q; q.name = name; - q.hash_len = hashlen_string(name); + q.hash_len = hashlen_string(parent, name); return d_alloc(parent, &q); } EXPORT_SYMBOL(d_alloc_name); @@ -1729,7 +1715,6 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_DELETE | - DCACHE_OP_SELECT_INODE | DCACHE_OP_REAL)); dentry->d_op = op; if (!op) @@ -1746,8 +1731,6 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) dentry->d_flags |= DCACHE_OP_DELETE; if (op->d_prune) dentry->d_flags |= DCACHE_OP_PRUNE; - if (op->d_select_inode) - dentry->d_flags |= DCACHE_OP_SELECT_INODE; if (op->d_real) dentry->d_flags |= DCACHE_OP_REAL; @@ -1815,7 +1798,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); raw_write_seqcount_end(&dentry->d_seq); - __fsnotify_d_instantiate(dentry); + fsnotify_update_flags(dentry); spin_unlock(&dentry->d_lock); } @@ -2067,42 +2050,19 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, } EXPORT_SYMBOL(d_add_ci); -/* - * Do the slow-case of the dentry name compare. - * - * Unlike the dentry_cmp() function, we need to atomically - * load the name and length information, so that the - * filesystem can rely on them, and can use the 'name' and - * 'len' information without worrying about walking off the - * end of memory etc. - * - * Thus the read_seqcount_retry() and the "duplicate" info - * in arguments (the low-level filesystem should not look - * at the dentry inode or name contents directly, since - * rename can change them while we're in RCU mode). - */ -enum slow_d_compare { - D_COMP_OK, - D_COMP_NOMATCH, - D_COMP_SEQRETRY, -}; -static noinline enum slow_d_compare slow_dentry_cmp( - const struct dentry *parent, - struct dentry *dentry, - unsigned int seq, - const struct qstr *name) +static inline bool d_same_name(const struct dentry *dentry, + const struct dentry *parent, + const struct qstr *name) { - int tlen = dentry->d_name.len; - const char *tname = dentry->d_name.name; - - if (read_seqcount_retry(&dentry->d_seq, seq)) { - cpu_relax(); - return D_COMP_SEQRETRY; + if (likely(!(parent->d_flags & DCACHE_OP_COMPARE))) { + if (dentry->d_name.len != name->len) + return false; + return dentry_cmp(dentry, name->name, name->len) == 0; } - if (parent->d_op->d_compare(parent, dentry, tlen, tname, name)) - return D_COMP_NOMATCH; - return D_COMP_OK; + return parent->d_op->d_compare(parent, dentry, + dentry->d_name.len, dentry->d_name.name, + name) == 0; } /** @@ -2140,7 +2100,7 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent, { u64 hashlen = name->hash_len; const unsigned char *str = name->name; - struct hlist_bl_head *b = d_hash(parent, hashlen_hash(hashlen)); + struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen)); struct hlist_bl_node *node; struct dentry *dentry; @@ -2181,6 +2141,9 @@ seqretry: * dentry compare, we will do seqretries until it is stable, * and if we end up with a successful lookup, we actually * want to exit RCU lookup anyway. + * + * Note that raw_seqcount_begin still *does* smp_rmb(), so + * we are still guaranteed NUL-termination of ->d_name.name. */ seq = raw_seqcount_begin(&dentry->d_seq); if (dentry->d_parent != parent) @@ -2189,24 +2152,28 @@ seqretry: continue; if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) { + int tlen; + const char *tname; if (dentry->d_name.hash != hashlen_hash(hashlen)) continue; - *seqp = seq; - switch (slow_dentry_cmp(parent, dentry, seq, name)) { - case D_COMP_OK: - return dentry; - case D_COMP_NOMATCH: - continue; - default: + tlen = dentry->d_name.len; + tname = dentry->d_name.name; + /* we want a consistent (name,len) pair */ + if (read_seqcount_retry(&dentry->d_seq, seq)) { + cpu_relax(); goto seqretry; } + if (parent->d_op->d_compare(parent, dentry, + tlen, tname, name) != 0) + continue; + } else { + if (dentry->d_name.hash_len != hashlen) + continue; + if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0) + continue; } - - if (dentry->d_name.hash_len != hashlen) - continue; *seqp = seq; - if (!dentry_cmp(dentry, str, hashlen_len(hashlen))) - return dentry; + return dentry; } return NULL; } @@ -2254,10 +2221,8 @@ EXPORT_SYMBOL(d_lookup); */ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name) { - unsigned int len = name->len; unsigned int hash = name->hash; - const unsigned char *str = name->name; - struct hlist_bl_head *b = d_hash(parent, hash); + struct hlist_bl_head *b = d_hash(hash); struct hlist_bl_node *node; struct dentry *found = NULL; struct dentry *dentry; @@ -2295,21 +2260,8 @@ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name) if (d_unhashed(dentry)) goto next; - /* - * It is safe to compare names since d_move() cannot - * change the qstr (protected by d_lock). - */ - if (parent->d_flags & DCACHE_OP_COMPARE) { - int tlen = dentry->d_name.len; - const char *tname = dentry->d_name.name; - if (parent->d_op->d_compare(parent, dentry, tlen, tname, name)) - goto next; - } else { - if (dentry->d_name.len != len) - goto next; - if (dentry_cmp(dentry, str, len)) - goto next; - } + if (!d_same_name(dentry, parent, name)) + goto next; dentry->d_lockref.count++; found = dentry; @@ -2337,7 +2289,7 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) * calculate the standard hash first, as the d_op->d_hash() * routine may choose to leave the hash value unchanged. */ - name->hash = full_name_hash(name->name, name->len); + name->hash = full_name_hash(dir, name->name, name->len); if (dir->d_flags & DCACHE_OP_HASH) { int err = dir->d_op->d_hash(dir, name); if (unlikely(err < 0)) @@ -2410,7 +2362,7 @@ static void __d_rehash(struct dentry * entry, struct hlist_bl_head *b) static void _d_rehash(struct dentry * entry) { - __d_rehash(entry, d_hash(entry->d_parent, entry->d_name.hash)); + __d_rehash(entry, d_hash(entry->d_name.hash)); } /** @@ -2462,9 +2414,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent, const struct qstr *name, wait_queue_head_t *wq) { - unsigned int len = name->len; unsigned int hash = name->hash; - const unsigned char *str = name->name; struct hlist_bl_head *b = in_lookup_hash(parent, hash); struct hlist_bl_node *node; struct dentry *new = d_alloc(parent, name); @@ -2515,17 +2465,8 @@ retry: continue; if (dentry->d_parent != parent) continue; - if (parent->d_flags & DCACHE_OP_COMPARE) { - int tlen = dentry->d_name.len; - const char *tname = dentry->d_name.name; - if (parent->d_op->d_compare(parent, dentry, tlen, tname, name)) - continue; - } else { - if (dentry->d_name.len != len) - continue; - if (dentry_cmp(dentry, str, len)) - continue; - } + if (!d_same_name(dentry, parent, name)) + continue; hlist_bl_unlock(b); /* now we can try to grab a reference */ if (!lockref_get_not_dead(&dentry->d_lockref)) { @@ -2552,17 +2493,8 @@ retry: goto mismatch; if (unlikely(d_unhashed(dentry))) goto mismatch; - if (parent->d_flags & DCACHE_OP_COMPARE) { - int tlen = dentry->d_name.len; - const char *tname = dentry->d_name.name; - if (parent->d_op->d_compare(parent, dentry, tlen, tname, name)) - goto mismatch; - } else { - if (unlikely(dentry->d_name.len != len)) - goto mismatch; - if (unlikely(dentry_cmp(dentry, str, len))) - goto mismatch; - } + if (unlikely(!d_same_name(dentry, parent, name))) + goto mismatch; /* OK, it *is* a hashed match; return it */ spin_unlock(&dentry->d_lock); dput(new); @@ -2615,7 +2547,7 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode) raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); raw_write_seqcount_end(&dentry->d_seq); - __fsnotify_d_instantiate(dentry); + fsnotify_update_flags(dentry); } _d_rehash(dentry); if (dir) @@ -2658,8 +2590,6 @@ EXPORT_SYMBOL(d_add); struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode) { struct dentry *alias; - int len = entry->d_name.len; - const char *name = entry->d_name.name; unsigned int hash = entry->d_name.hash; spin_lock(&inode->i_lock); @@ -2673,9 +2603,7 @@ struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode) continue; if (alias->d_parent != entry->d_parent) continue; - if (alias->d_name.len != len) - continue; - if (dentry_cmp(alias, name, len)) + if (!d_same_name(alias, entry->d_parent, &entry->d_name)) continue; spin_lock(&alias->d_lock); if (!d_unhashed(alias)) { @@ -2874,7 +2802,7 @@ static void __d_move(struct dentry *dentry, struct dentry *target, * for the same hash queue because of how unlikely it is. */ __d_drop(dentry); - __d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash)); + __d_rehash(dentry, d_hash(target->d_name.hash)); /* * Unhash the target (d_delete() is not usable here). If exchanging @@ -2882,8 +2810,7 @@ static void __d_move(struct dentry *dentry, struct dentry *target, */ __d_drop(target); if (exchange) { - __d_rehash(target, - d_hash(dentry->d_parent, dentry->d_name.hash)); + __d_rehash(target, d_hash(dentry->d_name.hash)); } /* Switch the names.. */ @@ -2906,8 +2833,8 @@ static void __d_move(struct dentry *dentry, struct dentry *target, list_move(&target->d_child, &target->d_parent->d_subdirs); list_move(&dentry->d_child, &dentry->d_parent->d_subdirs); if (exchange) - fsnotify_d_move(target); - fsnotify_d_move(dentry); + fsnotify_update_flags(target); + fsnotify_update_flags(dentry); } write_seqcount_end(&target->d_seq); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 4bc1f68243c1..72361baf9da7 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -621,9 +621,6 @@ void debugfs_remove(struct dentry *dentry) return; parent = dentry->d_parent; - if (!parent || d_really_is_negative(parent)) - return; - inode_lock(d_inode(parent)); ret = __debugfs_remove(dentry, parent); inode_unlock(d_inode(parent)); @@ -654,10 +651,6 @@ void debugfs_remove_recursive(struct dentry *dentry) if (IS_ERR_OR_NULL(dentry)) return; - parent = dentry->d_parent; - if (!parent || d_really_is_negative(parent)) - return; - parent = dentry; down: inode_lock(d_inode(parent)); diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 37c134a132c7..d116453b0276 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -396,6 +396,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent) { struct inode *inode; + s->s_iflags &= ~SB_I_NODEV; s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = DEVPTS_SUPER_MAGIC; @@ -480,7 +481,7 @@ static struct file_system_type devpts_fs_type = { .name = "devpts", .mount = devpts_mount, .kill_sb = devpts_kill_sb, - .fs_flags = FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT, + .fs_flags = FS_USERNS_MOUNT, }; /* diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 9cb54a38832d..a5e607e8f056 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -65,7 +65,7 @@ static int efivarfs_d_compare(const struct dentry *parent, static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr) { - unsigned long hash = init_name_hash(); + unsigned long hash = init_name_hash(dentry); const unsigned char *s = qstr->name; unsigned int len = qstr->len; @@ -98,7 +98,7 @@ static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name) q.name = name; q.len = strlen(name); - err = efivarfs_d_hash(NULL, &q); + err = efivarfs_d_hash(parent, &q); if (err) return ERR_PTR(err); diff --git a/fs/exec.c b/fs/exec.c index 887c1c955df8..6fcfb3f7b137 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -762,6 +762,39 @@ out_unlock: } EXPORT_SYMBOL(setup_arg_pages); +#else + +/* + * Transfer the program arguments and environment from the holding pages + * onto the stack. The provided stack pointer is adjusted accordingly. + */ +int transfer_args_to_stack(struct linux_binprm *bprm, + unsigned long *sp_location) +{ + unsigned long index, stop, sp; + int ret = 0; + + stop = bprm->p >> PAGE_SHIFT; + sp = *sp_location; + + for (index = MAX_ARG_PAGES - 1; index >= stop; index--) { + unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0; + char *src = kmap(bprm->page[index]) + offset; + sp -= PAGE_SIZE - offset; + if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0) + ret = -EFAULT; + kunmap(bprm->page[index]); + if (ret) + goto out; + } + + *sp_location = sp; + +out: + return ret; +} +EXPORT_SYMBOL(transfer_args_to_stack); + #endif /* CONFIG_MMU */ static struct file *do_open_execat(int fd, struct filename *name, int flags) @@ -866,7 +899,8 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size, goto out; } - *buf = vmalloc(i_size); + if (id != READING_FIRMWARE_PREALLOC_BUFFER) + *buf = vmalloc(i_size); if (!*buf) { ret = -ENOMEM; goto out; @@ -897,8 +931,10 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size, out_free: if (ret < 0) { - vfree(*buf); - *buf = NULL; + if (id != READING_FIRMWARE_PREALLOC_BUFFER) { + vfree(*buf); + *buf = NULL; + } } out: @@ -1411,7 +1447,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm) bprm->cred->euid = current_euid(); bprm->cred->egid = current_egid(); - if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) + if (!mnt_may_suid(bprm->file->f_path.mnt)) return; if (task_no_new_privs(current)) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 11562161e24a..f418f55c2bbe 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2350,7 +2350,6 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) } const struct file_operations ext4_seq_mb_groups_fops = { - .owner = THIS_MODULE, .open = ext4_mb_seq_groups_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 1420a3c614af..73bcfd41f5f2 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -359,7 +359,6 @@ static int name##_open(struct inode *inode, struct file *file) \ } \ \ static const struct file_operations ext4_seq_##name##_fops = { \ - .owner = THIS_MODULE, \ .open = name##_open, \ .read = seq_read, \ .llseek = seq_lseek, \ diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e2624275d828..d64d2a515cb2 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -245,7 +245,6 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) bio_put(bio); return -EFAULT; } - bio->bi_rw = fio->op_flags; bio_set_op_attrs(bio, fio->op, fio->op_flags); __submit_bio(fio->sbi, bio, fio->type); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b97c065cbe74..1b86d3f638ef 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -961,7 +961,6 @@ static int _name##_open_fs(struct inode *inode, struct file *file) \ } \ \ static const struct file_operations f2fs_seq_##_name##_fops = { \ - .owner = THIS_MODULE, \ .open = _name##_open_fs, \ .read = seq_read, \ .llseek = seq_lseek, \ diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 3bcf57925dca..da04c0298fab 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1589,7 +1589,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, /* * GFP_KERNEL is ok here, because while we do hold the - * supeblock lock, memory pressure can't call back into + * superblock lock, memory pressure can't call back into * the filesystem, since we're only just about to mount * it and have no inodes etc active! */ @@ -1726,7 +1726,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, sbi->dir_entries = bpb.fat_dir_entries; if (sbi->dir_entries & (sbi->dir_per_block - 1)) { if (!silent) - fat_msg(sb, KERN_ERR, "bogus directory-entries per block" + fat_msg(sb, KERN_ERR, "bogus number of directory entries" " (%u)", sbi->dir_entries); goto out_invalid; } diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index b7e2b33aa793..1337c0c7527d 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -154,7 +154,7 @@ static int msdos_hash(const struct dentry *dentry, struct qstr *qstr) error = msdos_format_name(qstr->name, qstr->len, msdos_name, options); if (!error) - qstr->hash = full_name_hash(msdos_name, MSDOS_NAME); + qstr->hash = full_name_hash(dentry, msdos_name, MSDOS_NAME); return 0; } diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 7092584f424a..6ccdf3f34f90 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -107,7 +107,7 @@ static unsigned int vfat_striptail_len(const struct qstr *qstr) */ static int vfat_hash(const struct dentry *dentry, struct qstr *qstr) { - qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr)); + qstr->hash = full_name_hash(dentry, qstr->name, vfat_striptail_len(qstr)); return 0; } @@ -127,7 +127,7 @@ static int vfat_hashi(const struct dentry *dentry, struct qstr *qstr) name = qstr->name; len = vfat_striptail_len(qstr); - hash = init_name_hash(); + hash = init_name_hash(dentry); while (len--) hash = partial_name_hash(nls_tolower(t, *name++), hash); qstr->hash = end_name_hash(hash); diff --git a/fs/freevxfs/Kconfig b/fs/freevxfs/Kconfig index 8dc1cd5c1efe..ce49df1020dd 100644 --- a/fs/freevxfs/Kconfig +++ b/fs/freevxfs/Kconfig @@ -5,12 +5,21 @@ config VXFS_FS FreeVxFS is a file system driver that support the VERITAS VxFS(TM) file system format. VERITAS VxFS(TM) is the standard file system of SCO UnixWare (and possibly others) and optionally available - for Sunsoft Solaris, HP-UX and many other operating systems. - Currently only readonly access is supported. + for Sunsoft Solaris, HP-UX and many other operating systems. However + these particular OS implementations of vxfs may differ in on-disk + data endianess and/or superblock offset. The vxfs module has been + tested with SCO UnixWare and HP-UX B.10.20 (pa-risc 1.1 arch.) + Currently only readonly access is supported and VxFX versions + 2, 3 and 4. Tests were performed with HP-UX VxFS version 3. NOTE: the file system type as used by mount(1), mount(2) and fstab(5) is 'vxfs' as it describes the file system format, not the actual driver. + There is a userspace utility for HP-UX logical volumes which makes + creating HP-UX logical volumes easy from HP-UX disk block device file + or regular file with image of the disk. See: + https://sourceforge.net/projects/linux-vxfs/ + To compile this as a module, choose M here: the module will be called freevxfs. If unsure, say N. diff --git a/fs/freevxfs/vxfs.h b/fs/freevxfs/vxfs.h index c8a92652612a..a41ea0ba6943 100644 --- a/fs/freevxfs/vxfs.h +++ b/fs/freevxfs/vxfs.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2001 Christoph Hellwig. + * Copyright (c) 2016 Krzysztof Blaszkowski * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -38,13 +39,6 @@ */ #include <linux/types.h> - -/* - * Data types for use with the VxFS ondisk format. - */ -typedef int32_t vx_daddr_t; -typedef int32_t vx_ino_t; - /* * Superblock magic number (vxfs_super->vs_magic). */ @@ -60,6 +54,14 @@ typedef int32_t vx_ino_t; */ #define VXFS_NEFREE 32 +enum vxfs_byte_order { + VXFS_BO_LE, + VXFS_BO_BE, +}; + +typedef __u16 __bitwise __fs16; +typedef __u32 __bitwise __fs32; +typedef __u64 __bitwise __fs64; /* * VxFS superblock (disk). @@ -71,83 +73,83 @@ struct vxfs_sb { * Lots of this fields are no more used by version 2 * and never filesystems. */ - u_int32_t vs_magic; /* Magic number */ - int32_t vs_version; /* VxFS version */ - u_int32_t vs_ctime; /* create time - secs */ - u_int32_t vs_cutime; /* create time - usecs */ - int32_t __unused1; /* unused */ - int32_t __unused2; /* unused */ - vx_daddr_t vs_old_logstart; /* obsolete */ - vx_daddr_t vs_old_logend; /* obsolete */ - int32_t vs_bsize; /* block size */ - int32_t vs_size; /* number of blocks */ - int32_t vs_dsize; /* number of data blocks */ - u_int32_t vs_old_ninode; /* obsolete */ - int32_t vs_old_nau; /* obsolete */ - int32_t __unused3; /* unused */ - int32_t vs_old_defiextsize; /* obsolete */ - int32_t vs_old_ilbsize; /* obsolete */ - int32_t vs_immedlen; /* size of immediate data area */ - int32_t vs_ndaddr; /* number of direct extentes */ - vx_daddr_t vs_firstau; /* address of first AU */ - vx_daddr_t vs_emap; /* offset of extent map in AU */ - vx_daddr_t vs_imap; /* offset of inode map in AU */ - vx_daddr_t vs_iextop; /* offset of ExtOp. map in AU */ - vx_daddr_t vs_istart; /* offset of inode list in AU */ - vx_daddr_t vs_bstart; /* offset of fdblock in AU */ - vx_daddr_t vs_femap; /* aufirst + emap */ - vx_daddr_t vs_fimap; /* aufirst + imap */ - vx_daddr_t vs_fiextop; /* aufirst + iextop */ - vx_daddr_t vs_fistart; /* aufirst + istart */ - vx_daddr_t vs_fbstart; /* aufirst + bstart */ - int32_t vs_nindir; /* number of entries in indir */ - int32_t vs_aulen; /* length of AU in blocks */ - int32_t vs_auimlen; /* length of imap in blocks */ - int32_t vs_auemlen; /* length of emap in blocks */ - int32_t vs_auilen; /* length of ilist in blocks */ - int32_t vs_aupad; /* length of pad in blocks */ - int32_t vs_aublocks; /* data blocks in AU */ - int32_t vs_maxtier; /* log base 2 of aublocks */ - int32_t vs_inopb; /* number of inodes per blk */ - int32_t vs_old_inopau; /* obsolete */ - int32_t vs_old_inopilb; /* obsolete */ - int32_t vs_old_ndiripau; /* obsolete */ - int32_t vs_iaddrlen; /* size of indirect addr ext. */ - int32_t vs_bshift; /* log base 2 of bsize */ - int32_t vs_inoshift; /* log base 2 of inobp */ - int32_t vs_bmask; /* ~( bsize - 1 ) */ - int32_t vs_boffmask; /* bsize - 1 */ - int32_t vs_old_inomask; /* old_inopilb - 1 */ - int32_t vs_checksum; /* checksum of V1 data */ + __fs32 vs_magic; /* Magic number */ + __fs32 vs_version; /* VxFS version */ + __fs32 vs_ctime; /* create time - secs */ + __fs32 vs_cutime; /* create time - usecs */ + __fs32 __unused1; /* unused */ + __fs32 __unused2; /* unused */ + __fs32 vs_old_logstart; /* obsolete */ + __fs32 vs_old_logend; /* obsolete */ + __fs32 vs_bsize; /* block size */ + __fs32 vs_size; /* number of blocks */ + __fs32 vs_dsize; /* number of data blocks */ + __fs32 vs_old_ninode; /* obsolete */ + __fs32 vs_old_nau; /* obsolete */ + __fs32 __unused3; /* unused */ + __fs32 vs_old_defiextsize; /* obsolete */ + __fs32 vs_old_ilbsize; /* obsolete */ + __fs32 vs_immedlen; /* size of immediate data area */ + __fs32 vs_ndaddr; /* number of direct extentes */ + __fs32 vs_firstau; /* address of first AU */ + __fs32 vs_emap; /* offset of extent map in AU */ + __fs32 vs_imap; /* offset of inode map in AU */ + __fs32 vs_iextop; /* offset of ExtOp. map in AU */ + __fs32 vs_istart; /* offset of inode list in AU */ + __fs32 vs_bstart; /* offset of fdblock in AU */ + __fs32 vs_femap; /* aufirst + emap */ + __fs32 vs_fimap; /* aufirst + imap */ + __fs32 vs_fiextop; /* aufirst + iextop */ + __fs32 vs_fistart; /* aufirst + istart */ + __fs32 vs_fbstart; /* aufirst + bstart */ + __fs32 vs_nindir; /* number of entries in indir */ + __fs32 vs_aulen; /* length of AU in blocks */ + __fs32 vs_auimlen; /* length of imap in blocks */ + __fs32 vs_auemlen; /* length of emap in blocks */ + __fs32 vs_auilen; /* length of ilist in blocks */ + __fs32 vs_aupad; /* length of pad in blocks */ + __fs32 vs_aublocks; /* data blocks in AU */ + __fs32 vs_maxtier; /* log base 2 of aublocks */ + __fs32 vs_inopb; /* number of inodes per blk */ + __fs32 vs_old_inopau; /* obsolete */ + __fs32 vs_old_inopilb; /* obsolete */ + __fs32 vs_old_ndiripau; /* obsolete */ + __fs32 vs_iaddrlen; /* size of indirect addr ext. */ + __fs32 vs_bshift; /* log base 2 of bsize */ + __fs32 vs_inoshift; /* log base 2 of inobp */ + __fs32 vs_bmask; /* ~( bsize - 1 ) */ + __fs32 vs_boffmask; /* bsize - 1 */ + __fs32 vs_old_inomask; /* old_inopilb - 1 */ + __fs32 vs_checksum; /* checksum of V1 data */ /* * Version 1, writable */ - int32_t vs_free; /* number of free blocks */ - int32_t vs_ifree; /* number of free inodes */ - int32_t vs_efree[VXFS_NEFREE]; /* number of free extents by size */ - int32_t vs_flags; /* flags ?!? */ - u_int8_t vs_mod; /* filesystem has been changed */ - u_int8_t vs_clean; /* clean FS */ - u_int16_t __unused4; /* unused */ - u_int32_t vs_firstlogid; /* mount time log ID */ - u_int32_t vs_wtime; /* last time written - sec */ - u_int32_t vs_wutime; /* last time written - usec */ - u_int8_t vs_fname[6]; /* FS name */ - u_int8_t vs_fpack[6]; /* FS pack name */ - int32_t vs_logversion; /* log format version */ - int32_t __unused5; /* unused */ + __fs32 vs_free; /* number of free blocks */ + __fs32 vs_ifree; /* number of free inodes */ + __fs32 vs_efree[VXFS_NEFREE]; /* number of free extents by size */ + __fs32 vs_flags; /* flags ?!? */ + __u8 vs_mod; /* filesystem has been changed */ + __u8 vs_clean; /* clean FS */ + __fs16 __unused4; /* unused */ + __fs32 vs_firstlogid; /* mount time log ID */ + __fs32 vs_wtime; /* last time written - sec */ + __fs32 vs_wutime; /* last time written - usec */ + __u8 vs_fname[6]; /* FS name */ + __u8 vs_fpack[6]; /* FS pack name */ + __fs32 vs_logversion; /* log format version */ + __u32 __unused5; /* unused */ /* * Version 2, Read-only */ - vx_daddr_t vs_oltext[2]; /* OLT extent and replica */ - int32_t vs_oltsize; /* OLT extent size */ - int32_t vs_iauimlen; /* size of inode map */ - int32_t vs_iausize; /* size of IAU in blocks */ - int32_t vs_dinosize; /* size of inode in bytes */ - int32_t vs_old_dniaddr; /* indir levels per inode */ - int32_t vs_checksum2; /* checksum of V2 RO */ + __fs32 vs_oltext[2]; /* OLT extent and replica */ + __fs32 vs_oltsize; /* OLT extent size */ + __fs32 vs_iauimlen; /* size of inode map */ + __fs32 vs_iausize; /* size of IAU in blocks */ + __fs32 vs_dinosize; /* size of inode in bytes */ + __fs32 vs_old_dniaddr; /* indir levels per inode */ + __fs32 vs_checksum2; /* checksum of V2 RO */ /* * Actually much more... @@ -168,8 +170,32 @@ struct vxfs_sb_info { ino_t vsi_fshino; /* fileset header inode */ daddr_t vsi_oltext; /* OLT extent */ daddr_t vsi_oltsize; /* OLT size */ + enum vxfs_byte_order byte_order; }; +static inline u16 fs16_to_cpu(struct vxfs_sb_info *sbi, __fs16 a) +{ + if (sbi->byte_order == VXFS_BO_BE) + return be16_to_cpu((__force __be16)a); + else + return le16_to_cpu((__force __le16)a); +} + +static inline u32 fs32_to_cpu(struct vxfs_sb_info *sbi, __fs32 a) +{ + if (sbi->byte_order == VXFS_BO_BE) + return be32_to_cpu((__force __be32)a); + else + return le32_to_cpu((__force __le32)a); +} + +static inline u64 fs64_to_cpu(struct vxfs_sb_info *sbi, __fs64 a) +{ + if (sbi->byte_order == VXFS_BO_BE) + return be64_to_cpu((__force __be64)a); + else + return le64_to_cpu((__force __le64)a); +} /* * File modes. File types above 0xf000 are vxfs internal only, they should @@ -247,13 +273,6 @@ enum { #define VXFS_ISIMMED(ip) VXFS_IS_ORG((ip), VXFS_ORG_IMMED) #define VXFS_ISTYPED(ip) VXFS_IS_ORG((ip), VXFS_ORG_TYPED) - -/* - * Get filesystem private data from VFS inode. - */ -#define VXFS_INO(ip) \ - ((struct vxfs_inode_info *)(ip)->i_private) - /* * Get filesystem private data from VFS superblock. */ diff --git a/fs/freevxfs/vxfs_bmap.c b/fs/freevxfs/vxfs_bmap.c index f86fd3cacd5a..1fd41cf98b9f 100644 --- a/fs/freevxfs/vxfs_bmap.c +++ b/fs/freevxfs/vxfs_bmap.c @@ -68,8 +68,9 @@ vxfs_bmap_ext4(struct inode *ip, long bn) { struct super_block *sb = ip->i_sb; struct vxfs_inode_info *vip = VXFS_INO(ip); + struct vxfs_sb_info *sbi = VXFS_SBI(sb); unsigned long bsize = sb->s_blocksize; - u32 indsize = vip->vii_ext4.ve4_indsize; + u32 indsize = fs32_to_cpu(sbi, vip->vii_ext4.ve4_indsize); int i; if (indsize > sb->s_blocksize) @@ -77,22 +78,24 @@ vxfs_bmap_ext4(struct inode *ip, long bn) for (i = 0; i < VXFS_NDADDR; i++) { struct direct *d = vip->vii_ext4.ve4_direct + i; - if (bn >= 0 && bn < d->size) - return (bn + d->extent); - bn -= d->size; + if (bn >= 0 && bn < fs32_to_cpu(sbi, d->size)) + return (bn + fs32_to_cpu(sbi, d->extent)); + bn -= fs32_to_cpu(sbi, d->size); } if ((bn / (indsize * indsize * bsize / 4)) == 0) { struct buffer_head *buf; daddr_t bno; - u32 *indir; + __fs32 *indir; - buf = sb_bread(sb, vip->vii_ext4.ve4_indir[0]); + buf = sb_bread(sb, + fs32_to_cpu(sbi, vip->vii_ext4.ve4_indir[0])); if (!buf || !buffer_mapped(buf)) goto fail_buf; - indir = (u32 *)buf->b_data; - bno = indir[(bn/indsize) % (indsize*bn)] + (bn%indsize); + indir = (__fs32 *)buf->b_data; + bno = fs32_to_cpu(sbi, indir[(bn / indsize) % (indsize * bn)]) + + (bn % indsize); brelse(buf); return bno; @@ -127,6 +130,7 @@ fail_buf: static daddr_t vxfs_bmap_indir(struct inode *ip, long indir, int size, long block) { + struct vxfs_sb_info *sbi = VXFS_SBI(ip->i_sb); struct buffer_head *bp = NULL; daddr_t pblock = 0; int i; @@ -142,24 +146,27 @@ vxfs_bmap_indir(struct inode *ip, long indir, int size, long block) typ = ((struct vxfs_typed *)bp->b_data) + (i % VXFS_TYPED_PER_BLOCK(ip->i_sb)); - off = (typ->vt_hdr & VXFS_TYPED_OFFSETMASK); + off = fs64_to_cpu(sbi, typ->vt_hdr) & VXFS_TYPED_OFFSETMASK; if (block < off) { brelse(bp); continue; } - switch ((u_int32_t)(typ->vt_hdr >> VXFS_TYPED_TYPESHIFT)) { + switch ((u_int32_t)(fs64_to_cpu(sbi, typ->vt_hdr) >> + VXFS_TYPED_TYPESHIFT)) { case VXFS_TYPED_INDIRECT: - pblock = vxfs_bmap_indir(ip, typ->vt_block, - typ->vt_size, block - off); + pblock = vxfs_bmap_indir(ip, + fs32_to_cpu(sbi, typ->vt_block), + fs32_to_cpu(sbi, typ->vt_size), + block - off); if (pblock == -2) break; goto out; case VXFS_TYPED_DATA: - if ((block - off) >= typ->vt_size) + if ((block - off) >= fs32_to_cpu(sbi, typ->vt_size)) break; - pblock = (typ->vt_block + block - off); + pblock = fs32_to_cpu(sbi, typ->vt_block) + block - off; goto out; case VXFS_TYPED_INDIRECT_DEV4: case VXFS_TYPED_DATA_DEV4: { @@ -167,13 +174,15 @@ vxfs_bmap_indir(struct inode *ip, long indir, int size, long block) (struct vxfs_typed_dev4 *)typ; printk(KERN_INFO "\n\nTYPED_DEV4 detected!\n"); - printk(KERN_INFO "block: %Lu\tsize: %Ld\tdev: %d\n", - (unsigned long long) typ4->vd4_block, - (unsigned long long) typ4->vd4_size, - typ4->vd4_dev); + printk(KERN_INFO "block: %llu\tsize: %lld\tdev: %d\n", + fs64_to_cpu(sbi, typ4->vd4_block), + fs64_to_cpu(sbi, typ4->vd4_size), + fs32_to_cpu(sbi, typ4->vd4_dev)); goto fail; } default: + printk(KERN_ERR "%s:%d vt_hdr %llu\n", __func__, + __LINE__, fs64_to_cpu(sbi, typ->vt_hdr)); BUG(); } brelse(bp); @@ -201,28 +210,33 @@ static daddr_t vxfs_bmap_typed(struct inode *ip, long iblock) { struct vxfs_inode_info *vip = VXFS_INO(ip); + struct vxfs_sb_info *sbi = VXFS_SBI(ip->i_sb); daddr_t pblock = 0; int i; for (i = 0; i < VXFS_NTYPED; i++) { struct vxfs_typed *typ = vip->vii_org.typed + i; - int64_t off = (typ->vt_hdr & VXFS_TYPED_OFFSETMASK); + u64 hdr = fs64_to_cpu(sbi, typ->vt_hdr); + int64_t off = (hdr & VXFS_TYPED_OFFSETMASK); #ifdef DIAGNOSTIC vxfs_typdump(typ); #endif if (iblock < off) continue; - switch ((u_int32_t)(typ->vt_hdr >> VXFS_TYPED_TYPESHIFT)) { + switch ((u32)(hdr >> VXFS_TYPED_TYPESHIFT)) { case VXFS_TYPED_INDIRECT: - pblock = vxfs_bmap_indir(ip, typ->vt_block, - typ->vt_size, iblock - off); + pblock = vxfs_bmap_indir(ip, + fs32_to_cpu(sbi, typ->vt_block), + fs32_to_cpu(sbi, typ->vt_size), + iblock - off); if (pblock == -2) break; return (pblock); case VXFS_TYPED_DATA: - if ((iblock - off) < typ->vt_size) - return (typ->vt_block + iblock - off); + if ((iblock - off) < fs32_to_cpu(sbi, typ->vt_size)) + return (fs32_to_cpu(sbi, typ->vt_block) + + iblock - off); break; case VXFS_TYPED_INDIRECT_DEV4: case VXFS_TYPED_DATA_DEV4: { @@ -230,10 +244,10 @@ vxfs_bmap_typed(struct inode *ip, long iblock) (struct vxfs_typed_dev4 *)typ; printk(KERN_INFO "\n\nTYPED_DEV4 detected!\n"); - printk(KERN_INFO "block: %Lu\tsize: %Ld\tdev: %d\n", - (unsigned long long) typ4->vd4_block, - (unsigned long long) typ4->vd4_size, - typ4->vd4_dev); + printk(KERN_INFO "block: %llu\tsize: %lld\tdev: %d\n", + fs64_to_cpu(sbi, typ4->vd4_block), + fs64_to_cpu(sbi, typ4->vd4_size), + fs32_to_cpu(sbi, typ4->vd4_dev)); return 0; } default: diff --git a/fs/freevxfs/vxfs_dir.h b/fs/freevxfs/vxfs_dir.h index aaf1fb098639..acc5477b3f23 100644 --- a/fs/freevxfs/vxfs_dir.h +++ b/fs/freevxfs/vxfs_dir.h @@ -48,9 +48,9 @@ * Linux driver for now. */ struct vxfs_dirblk { - u_int16_t d_free; /* free space in dirblock */ - u_int16_t d_nhash; /* no of hash chains */ - u_int16_t d_hash[1]; /* hash chain */ + __fs16 d_free; /* free space in dirblock */ + __fs16 d_nhash; /* no of hash chains */ + __fs16 d_hash[1]; /* hash chain */ }; /* @@ -63,10 +63,10 @@ struct vxfs_dirblk { * VxFS directory entry. */ struct vxfs_direct { - vx_ino_t d_ino; /* inode number */ - u_int16_t d_reclen; /* record length */ - u_int16_t d_namelen; /* d_name length */ - u_int16_t d_hashnext; /* next hash entry */ + __fs32 d_ino; /* inode number */ + __fs16 d_reclen; /* record length */ + __fs16 d_namelen; /* d_name length */ + __fs16 d_hashnext; /* next hash entry */ char d_name[VXFS_NAMELEN]; /* name */ }; @@ -87,6 +87,7 @@ struct vxfs_direct { /* * VXFS_DIRBLKOV is the overhead of a specific dirblock. */ -#define VXFS_DIRBLKOV(dbp) ((sizeof(short) * dbp->d_nhash) + 4) +#define VXFS_DIRBLKOV(sbi, dbp) \ + ((sizeof(short) * fs16_to_cpu(sbi, dbp->d_nhash)) + 4) #endif /* _VXFS_DIR_H_ */ diff --git a/fs/freevxfs/vxfs_extern.h b/fs/freevxfs/vxfs_extern.h index e3dcb4467d92..f5c428e21024 100644 --- a/fs/freevxfs/vxfs_extern.h +++ b/fs/freevxfs/vxfs_extern.h @@ -52,14 +52,10 @@ extern int vxfs_read_fshead(struct super_block *); /* vxfs_inode.c */ extern const struct address_space_operations vxfs_immed_aops; -extern struct kmem_cache *vxfs_inode_cachep; extern void vxfs_dumpi(struct vxfs_inode_info *, ino_t); -extern struct inode * vxfs_get_fake_inode(struct super_block *, - struct vxfs_inode_info *); -extern void vxfs_put_fake_inode(struct inode *); -extern struct vxfs_inode_info * vxfs_blkiget(struct super_block *, u_long, ino_t); -extern struct vxfs_inode_info * vxfs_stiget(struct super_block *, ino_t); -extern struct inode * vxfs_iget(struct super_block *, ino_t); +extern struct inode *vxfs_blkiget(struct super_block *, u_long, ino_t); +extern struct inode *vxfs_stiget(struct super_block *, ino_t); +extern struct inode *vxfs_iget(struct super_block *, ino_t); extern void vxfs_evict_inode(struct inode *); /* vxfs_lookup.c */ diff --git a/fs/freevxfs/vxfs_fshead.c b/fs/freevxfs/vxfs_fshead.c index c9a6a94e58e9..a4610a77649e 100644 --- a/fs/freevxfs/vxfs_fshead.c +++ b/fs/freevxfs/vxfs_fshead.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2001 Christoph Hellwig. + * Copyright (c) 2016 Krzysztof Blaszkowski * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -108,31 +109,26 @@ vxfs_read_fshead(struct super_block *sbp) { struct vxfs_sb_info *infp = VXFS_SBI(sbp); struct vxfs_fsh *pfp, *sfp; - struct vxfs_inode_info *vip, *tip; + struct vxfs_inode_info *vip; - vip = vxfs_blkiget(sbp, infp->vsi_iext, infp->vsi_fshino); - if (!vip) { + infp->vsi_fship = vxfs_blkiget(sbp, infp->vsi_iext, infp->vsi_fshino); + if (!infp->vsi_fship) { printk(KERN_ERR "vxfs: unable to read fsh inode\n"); return -EINVAL; } + + vip = VXFS_INO(infp->vsi_fship); if (!VXFS_ISFSH(vip)) { printk(KERN_ERR "vxfs: fsh list inode is of wrong type (%x)\n", vip->vii_mode & VXFS_TYPE_MASK); - goto out_free_fship; + goto out_iput_fship; } - #ifdef DIAGNOSTIC printk("vxfs: fsh inode dump:\n"); vxfs_dumpi(vip, infp->vsi_fshino); #endif - infp->vsi_fship = vxfs_get_fake_inode(sbp, vip); - if (!infp->vsi_fship) { - printk(KERN_ERR "vxfs: unable to get fsh inode\n"); - goto out_free_fship; - } - sfp = vxfs_getfsh(infp->vsi_fship, 0); if (!sfp) { printk(KERN_ERR "vxfs: unable to get structural fsh\n"); @@ -153,14 +149,10 @@ vxfs_read_fshead(struct super_block *sbp) vxfs_dumpfsh(pfp); #endif - tip = vxfs_blkiget(sbp, infp->vsi_iext, sfp->fsh_ilistino[0]); - if (!tip) - goto out_free_pfp; - - infp->vsi_stilist = vxfs_get_fake_inode(sbp, tip); + infp->vsi_stilist = vxfs_blkiget(sbp, infp->vsi_iext, + fs32_to_cpu(infp, sfp->fsh_ilistino[0])); if (!infp->vsi_stilist) { printk(KERN_ERR "vxfs: unable to get structural list inode\n"); - kfree(tip); goto out_free_pfp; } if (!VXFS_ISILT(VXFS_INO(infp->vsi_stilist))) { @@ -169,13 +161,9 @@ vxfs_read_fshead(struct super_block *sbp) goto out_iput_stilist; } - tip = vxfs_stiget(sbp, pfp->fsh_ilistino[0]); - if (!tip) - goto out_iput_stilist; - infp->vsi_ilist = vxfs_get_fake_inode(sbp, tip); + infp->vsi_ilist = vxfs_stiget(sbp, fs32_to_cpu(infp, pfp->fsh_ilistino[0])); if (!infp->vsi_ilist) { printk(KERN_ERR "vxfs: unable to get inode list inode\n"); - kfree(tip); goto out_iput_stilist; } if (!VXFS_ISILT(VXFS_INO(infp->vsi_ilist))) { @@ -184,6 +172,8 @@ vxfs_read_fshead(struct super_block *sbp) goto out_iput_ilist; } + kfree(pfp); + kfree(sfp); return 0; out_iput_ilist: @@ -197,7 +187,4 @@ vxfs_read_fshead(struct super_block *sbp) out_iput_fship: iput(infp->vsi_fship); return -EINVAL; - out_free_fship: - kfree(vip); - return -EINVAL; } diff --git a/fs/freevxfs/vxfs_fshead.h b/fs/freevxfs/vxfs_fshead.h index ead0d640c181..e026f0c49159 100644 --- a/fs/freevxfs/vxfs_fshead.h +++ b/fs/freevxfs/vxfs_fshead.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2001 Christoph Hellwig. + * Copyright (c) 2016 Krzysztof Blaszkowski * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -42,20 +43,20 @@ * Fileset header */ struct vxfs_fsh { - u_int32_t fsh_version; /* fileset header version */ - u_int32_t fsh_fsindex; /* fileset index */ - u_int32_t fsh_time; /* modification time - sec */ - u_int32_t fsh_utime; /* modification time - usec */ - u_int32_t fsh_extop; /* extop flags */ - vx_ino_t fsh_ninodes; /* allocated inodes */ - u_int32_t fsh_nau; /* number of IAUs */ - u_int32_t fsh_old_ilesize; /* old size of ilist */ - u_int32_t fsh_dflags; /* flags */ - u_int32_t fsh_quota; /* quota limit */ - vx_ino_t fsh_maxinode; /* maximum inode number */ - vx_ino_t fsh_iauino; /* IAU inode */ - vx_ino_t fsh_ilistino[2]; /* ilist inodes */ - vx_ino_t fsh_lctino; /* link count table inode */ + __fs32 fsh_version; /* fileset header version */ + __fs32 fsh_fsindex; /* fileset index */ + __fs32 fsh_time; /* modification time - sec */ + __fs32 fsh_utime; /* modification time - usec */ + __fs32 fsh_extop; /* extop flags */ + __fs32 fsh_ninodes; /* allocated inodes */ + __fs32 fsh_nau; /* number of IAUs */ + __fs32 fsh_old_ilesize; /* old size of ilist */ + __fs32 fsh_dflags; /* flags */ + __fs32 fsh_quota; /* quota limit */ + __fs32 fsh_maxinode; /* maximum inode number */ + __fs32 fsh_iauino; /* IAU inode */ + __fs32 fsh_ilistino[2]; /* ilist inodes */ + __fs32 fsh_lctino; /* link count table inode */ /* * Slightly more fields follow, but they diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index 3e2ccade61ed..1f41b25ef38b 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2001 Christoph Hellwig. + * Copyright (c) 2016 Krzysztof Blaszkowski * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -42,9 +43,6 @@ #include "vxfs_extern.h" -struct kmem_cache *vxfs_inode_cachep; - - #ifdef DIAGNOSTIC /* * Dump inode contents (partially). @@ -68,6 +66,83 @@ vxfs_dumpi(struct vxfs_inode_info *vip, ino_t ino) } #endif +/** + * vxfs_transmod - mode for a VxFS inode + * @vip: VxFS inode + * + * Description: + * vxfs_transmod returns a Linux mode_t for a given + * VxFS inode structure. + */ +static __inline__ umode_t +vxfs_transmod(struct vxfs_inode_info *vip) +{ + umode_t ret = vip->vii_mode & ~VXFS_TYPE_MASK; + + if (VXFS_ISFIFO(vip)) + ret |= S_IFIFO; + if (VXFS_ISCHR(vip)) + ret |= S_IFCHR; + if (VXFS_ISDIR(vip)) + ret |= S_IFDIR; + if (VXFS_ISBLK(vip)) + ret |= S_IFBLK; + if (VXFS_ISLNK(vip)) + ret |= S_IFLNK; + if (VXFS_ISREG(vip)) + ret |= S_IFREG; + if (VXFS_ISSOC(vip)) + ret |= S_IFSOCK; + + return (ret); +} + +static inline void dip2vip_cpy(struct vxfs_sb_info *sbi, + struct vxfs_inode_info *vip, struct vxfs_dinode *dip) +{ + struct inode *inode = &vip->vfs_inode; + + vip->vii_mode = fs32_to_cpu(sbi, dip->vdi_mode); + vip->vii_nlink = fs32_to_cpu(sbi, dip->vdi_nlink); + vip->vii_uid = fs32_to_cpu(sbi, dip->vdi_uid); + vip->vii_gid = fs32_to_cpu(sbi, dip->vdi_gid); + vip->vii_size = fs64_to_cpu(sbi, dip->vdi_size); + vip->vii_atime = fs32_to_cpu(sbi, dip->vdi_atime); + vip->vii_autime = fs32_to_cpu(sbi, dip->vdi_autime); + vip->vii_mtime = fs32_to_cpu(sbi, dip->vdi_mtime); + vip->vii_mutime = fs32_to_cpu(sbi, dip->vdi_mutime); + vip->vii_ctime = fs32_to_cpu(sbi, dip->vdi_ctime); + vip->vii_cutime = fs32_to_cpu(sbi, dip->vdi_cutime); + vip->vii_orgtype = dip->vdi_orgtype; + + vip->vii_blocks = fs32_to_cpu(sbi, dip->vdi_blocks); + vip->vii_gen = fs32_to_cpu(sbi, dip->vdi_gen); + + if (VXFS_ISDIR(vip)) + vip->vii_dotdot = fs32_to_cpu(sbi, dip->vdi_dotdot); + else if (!VXFS_ISREG(vip) && !VXFS_ISLNK(vip)) + vip->vii_rdev = fs32_to_cpu(sbi, dip->vdi_rdev); + + /* don't endian swap the fields that differ by orgtype */ + memcpy(&vip->vii_org, &dip->vdi_org, sizeof(vip->vii_org)); + + inode->i_mode = vxfs_transmod(vip); + i_uid_write(inode, (uid_t)vip->vii_uid); + i_gid_write(inode, (gid_t)vip->vii_gid); + + set_nlink(inode, vip->vii_nlink); + inode->i_size = vip->vii_size; + + inode->i_atime.tv_sec = vip->vii_atime; + inode->i_ctime.tv_sec = vip->vii_ctime; + inode->i_mtime.tv_sec = vip->vii_mtime; + inode->i_atime.tv_nsec = 0; + inode->i_ctime.tv_nsec = 0; + inode->i_mtime.tv_nsec = 0; + + inode->i_blocks = vip->vii_blocks; + inode->i_generation = vip->vii_gen; +} /** * vxfs_blkiget - find inode based on extent # @@ -85,50 +160,55 @@ vxfs_dumpi(struct vxfs_inode_info *vip, ino_t ino) * buffercache. This function should not be used outside the * read_super() method, otherwise the data may be incoherent. */ -struct vxfs_inode_info * +struct inode * vxfs_blkiget(struct super_block *sbp, u_long extent, ino_t ino) { struct buffer_head *bp; + struct inode *inode; u_long block, offset; + inode = new_inode(sbp); + if (!inode) + return NULL; + inode->i_ino = get_next_ino(); + block = extent + ((ino * VXFS_ISIZE) / sbp->s_blocksize); offset = ((ino % (sbp->s_blocksize / VXFS_ISIZE)) * VXFS_ISIZE); bp = sb_bread(sbp, block); if (bp && buffer_mapped(bp)) { - struct vxfs_inode_info *vip; + struct vxfs_inode_info *vip = VXFS_INO(inode); struct vxfs_dinode *dip; - if (!(vip = kmem_cache_alloc(vxfs_inode_cachep, GFP_KERNEL))) - goto fail; dip = (struct vxfs_dinode *)(bp->b_data + offset); - memcpy(vip, dip, sizeof(*vip)); + dip2vip_cpy(VXFS_SBI(sbp), vip, dip); + vip->vfs_inode.i_mapping->a_ops = &vxfs_aops; #ifdef DIAGNOSTIC vxfs_dumpi(vip, ino); #endif brelse(bp); - return (vip); + return inode; } -fail: printk(KERN_WARNING "vxfs: unable to read block %ld\n", block); brelse(bp); + iput(inode); return NULL; } /** * __vxfs_iget - generic find inode facility - * @sbp: VFS superblock - * @ino: inode number * @ilistp: inode list + * @vip: VxFS inode to fill in + * @ino: inode number * * Description: * Search the for inode number @ino in the filesystem * described by @sbp. Use the specified inode table (@ilistp). - * Returns the matching VxFS inode on success, else an error code. + * Returns the matching inode on success, else an error code. */ -static struct vxfs_inode_info * -__vxfs_iget(ino_t ino, struct inode *ilistp) +static int +__vxfs_iget(struct inode *ilistp, struct vxfs_inode_info *vip, ino_t ino) { struct page *pp; u_long offset; @@ -137,28 +217,22 @@ __vxfs_iget(ino_t ino, struct inode *ilistp) pp = vxfs_get_page(ilistp->i_mapping, ino * VXFS_ISIZE / PAGE_SIZE); if (!IS_ERR(pp)) { - struct vxfs_inode_info *vip; struct vxfs_dinode *dip; caddr_t kaddr = (char *)page_address(pp); - if (!(vip = kmem_cache_alloc(vxfs_inode_cachep, GFP_KERNEL))) - goto fail; dip = (struct vxfs_dinode *)(kaddr + offset); - memcpy(vip, dip, sizeof(*vip)); + dip2vip_cpy(VXFS_SBI(ilistp->i_sb), vip, dip); + vip->vfs_inode.i_mapping->a_ops = &vxfs_aops; #ifdef DIAGNOSTIC vxfs_dumpi(vip, ino); #endif vxfs_put_page(pp); - return (vip); + return 0; } - printk(KERN_WARNING "vxfs: error on page %p\n", pp); - return ERR_CAST(pp); - -fail: - printk(KERN_WARNING "vxfs: unable to read inode %ld\n", (unsigned long)ino); - vxfs_put_page(pp); - return ERR_PTR(-ENOMEM); + printk(KERN_WARNING "vxfs: error on page 0x%p for inode %ld\n", + pp, (unsigned long)ino); + return PTR_ERR(pp); } /** @@ -169,116 +243,26 @@ fail: * Description: * Find inode @ino in the filesystem described by @sbp using * the structural inode list. - * Returns the matching VxFS inode on success, else a NULL pointer. - */ -struct vxfs_inode_info * -vxfs_stiget(struct super_block *sbp, ino_t ino) -{ - struct vxfs_inode_info *vip; - - vip = __vxfs_iget(ino, VXFS_SBI(sbp)->vsi_stilist); - return IS_ERR(vip) ? NULL : vip; -} - -/** - * vxfs_transmod - mode for a VxFS inode - * @vip: VxFS inode - * - * Description: - * vxfs_transmod returns a Linux mode_t for a given - * VxFS inode structure. - */ -static __inline__ umode_t -vxfs_transmod(struct vxfs_inode_info *vip) -{ - umode_t ret = vip->vii_mode & ~VXFS_TYPE_MASK; - - if (VXFS_ISFIFO(vip)) - ret |= S_IFIFO; - if (VXFS_ISCHR(vip)) - ret |= S_IFCHR; - if (VXFS_ISDIR(vip)) - ret |= S_IFDIR; - if (VXFS_ISBLK(vip)) - ret |= S_IFBLK; - if (VXFS_ISLNK(vip)) - ret |= S_IFLNK; - if (VXFS_ISREG(vip)) - ret |= S_IFREG; - if (VXFS_ISSOC(vip)) - ret |= S_IFSOCK; - - return (ret); -} - -/** - * vxfs_iinit- helper to fill inode fields - * @ip: VFS inode - * @vip: VxFS inode - * - * Description: - * vxfs_instino is a helper function to fill in all relevant - * fields in @ip from @vip. - */ -static void -vxfs_iinit(struct inode *ip, struct vxfs_inode_info *vip) -{ - - ip->i_mode = vxfs_transmod(vip); - i_uid_write(ip, (uid_t)vip->vii_uid); - i_gid_write(ip, (gid_t)vip->vii_gid); - - set_nlink(ip, vip->vii_nlink); - ip->i_size = vip->vii_size; - - ip->i_atime.tv_sec = vip->vii_atime; - ip->i_ctime.tv_sec = vip->vii_ctime; - ip->i_mtime.tv_sec = vip->vii_mtime; - ip->i_atime.tv_nsec = 0; - ip->i_ctime.tv_nsec = 0; - ip->i_mtime.tv_nsec = 0; - - ip->i_blocks = vip->vii_blocks; - ip->i_generation = vip->vii_gen; - - ip->i_private = vip; - -} - -/** - * vxfs_get_fake_inode - get fake inode structure - * @sbp: filesystem superblock - * @vip: fspriv inode - * - * Description: - * vxfs_fake_inode gets a fake inode (not in the inode hash) for a - * superblock, vxfs_inode pair. - * Returns the filled VFS inode. + * Returns the matching inode on success, else a NULL pointer. */ struct inode * -vxfs_get_fake_inode(struct super_block *sbp, struct vxfs_inode_info *vip) +vxfs_stiget(struct super_block *sbp, ino_t ino) { - struct inode *ip = NULL; - - if ((ip = new_inode(sbp))) { - ip->i_ino = get_next_ino(); - vxfs_iinit(ip, vip); - ip->i_mapping->a_ops = &vxfs_aops; + struct inode *inode; + int error; + + inode = new_inode(sbp); + if (!inode) + return NULL; + inode->i_ino = get_next_ino(); + + error = __vxfs_iget(VXFS_SBI(sbp)->vsi_stilist, VXFS_INO(inode), ino); + if (error) { + iput(inode); + return NULL; } - return (ip); -} -/** - * vxfs_put_fake_inode - free faked inode - * *ip: VFS inode - * - * Description: - * vxfs_put_fake_inode frees all data associated with @ip. - */ -void -vxfs_put_fake_inode(struct inode *ip) -{ - iput(ip); + return inode; } /** @@ -296,6 +280,7 @@ vxfs_iget(struct super_block *sbp, ino_t ino) struct vxfs_inode_info *vip; const struct address_space_operations *aops; struct inode *ip; + int error; ip = iget_locked(sbp, ino); if (!ip) @@ -303,14 +288,13 @@ vxfs_iget(struct super_block *sbp, ino_t ino) if (!(ip->i_state & I_NEW)) return ip; - vip = __vxfs_iget(ino, VXFS_SBI(sbp)->vsi_ilist); - if (IS_ERR(vip)) { + vip = VXFS_INO(ip); + error = __vxfs_iget(VXFS_SBI(sbp)->vsi_ilist, vip, ino); + if (error) { iget_failed(ip); - return ERR_CAST(vip); + return ERR_PTR(error); } - vxfs_iinit(ip, vip); - if (VXFS_ISIMMED(vip)) aops = &vxfs_immed_aops; else @@ -341,12 +325,6 @@ vxfs_iget(struct super_block *sbp, ino_t ino) return ip; } -static void vxfs_i_callback(struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - kmem_cache_free(vxfs_inode_cachep, inode->i_private); -} - /** * vxfs_evict_inode - remove inode from main memory * @ip: inode to discard. @@ -360,5 +338,4 @@ vxfs_evict_inode(struct inode *ip) { truncate_inode_pages_final(&ip->i_data); clear_inode(ip); - call_rcu(&ip->i_rcu, vxfs_i_callback); } diff --git a/fs/freevxfs/vxfs_inode.h b/fs/freevxfs/vxfs_inode.h index 240aeb11263f..f012abed125d 100644 --- a/fs/freevxfs/vxfs_inode.h +++ b/fs/freevxfs/vxfs_inode.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2001 Christoph Hellwig. + * Copyright (c) 2016 Krzysztof Blaszkowski * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -66,74 +67,74 @@ enum { * Data stored immediately in the inode. */ struct vxfs_immed { - u_int8_t vi_immed[VXFS_NIMMED]; + __u8 vi_immed[VXFS_NIMMED]; }; struct vxfs_ext4 { - u_int32_t ve4_spare; /* ?? */ - u_int32_t ve4_indsize; /* Indirect extent size */ - vx_daddr_t ve4_indir[VXFS_NIADDR]; /* Indirect extents */ + __fs32 ve4_spare; /* ?? */ + __fs32 ve4_indsize; /* Indirect extent size */ + __fs32 ve4_indir[VXFS_NIADDR]; /* Indirect extents */ struct direct { /* Direct extents */ - vx_daddr_t extent; /* Extent number */ - int32_t size; /* Size of extent */ + __fs32 extent; /* Extent number */ + __fs32 size; /* Size of extent */ } ve4_direct[VXFS_NDADDR]; }; struct vxfs_typed { - u_int64_t vt_hdr; /* Header, 0xTTOOOOOOOOOOOOOO; T=type,O=offs */ - vx_daddr_t vt_block; /* Extent block */ - int32_t vt_size; /* Size in blocks */ + __fs64 vt_hdr; /* Header, 0xTTOOOOOOOOOOOOOO; T=type,O=offs */ + __fs32 vt_block; /* Extent block */ + __fs32 vt_size; /* Size in blocks */ }; struct vxfs_typed_dev4 { - u_int64_t vd4_hdr; /* Header, 0xTTOOOOOOOOOOOOOO; T=type,O=offs */ - u_int64_t vd4_block; /* Extent block */ - u_int64_t vd4_size; /* Size in blocks */ - int32_t vd4_dev; /* Device ID */ - u_int32_t __pad1; + __fs64 vd4_hdr; /* Header, 0xTTOOOOOOOOOOOOOO; T=type,O=offs */ + __fs64 vd4_block; /* Extent block */ + __fs64 vd4_size; /* Size in blocks */ + __fs32 vd4_dev; /* Device ID */ + __u8 __pad1; }; /* * The inode as contained on the physical device. */ struct vxfs_dinode { - int32_t vdi_mode; - u_int32_t vdi_nlink; /* Link count */ - u_int32_t vdi_uid; /* UID */ - u_int32_t vdi_gid; /* GID */ - u_int64_t vdi_size; /* Inode size in bytes */ - u_int32_t vdi_atime; /* Last time accessed - sec */ - u_int32_t vdi_autime; /* Last time accessed - usec */ - u_int32_t vdi_mtime; /* Last modify time - sec */ - u_int32_t vdi_mutime; /* Last modify time - usec */ - u_int32_t vdi_ctime; /* Create time - sec */ - u_int32_t vdi_cutime; /* Create time - usec */ - u_int8_t vdi_aflags; /* Allocation flags */ - u_int8_t vdi_orgtype; /* Organisation type */ - u_int16_t vdi_eopflags; - u_int32_t vdi_eopdata; + __fs32 vdi_mode; + __fs32 vdi_nlink; /* Link count */ + __fs32 vdi_uid; /* UID */ + __fs32 vdi_gid; /* GID */ + __fs64 vdi_size; /* Inode size in bytes */ + __fs32 vdi_atime; /* Last time accessed - sec */ + __fs32 vdi_autime; /* Last time accessed - usec */ + __fs32 vdi_mtime; /* Last modify time - sec */ + __fs32 vdi_mutime; /* Last modify time - usec */ + __fs32 vdi_ctime; /* Create time - sec */ + __fs32 vdi_cutime; /* Create time - usec */ + __u8 vdi_aflags; /* Allocation flags */ + __u8 vdi_orgtype; /* Organisation type */ + __fs16 vdi_eopflags; + __fs32 vdi_eopdata; union { - u_int32_t rdev; - u_int32_t dotdot; + __fs32 rdev; + __fs32 dotdot; struct { - u_int32_t reserved; - u_int32_t fixextsize; + __u32 reserved; + __fs32 fixextsize; } i_regular; struct { - u_int32_t matchino; - u_int32_t fsetindex; + __fs32 matchino; + __fs32 fsetindex; } i_vxspec; - u_int64_t align; + __u64 align; } vdi_ftarea; - u_int32_t vdi_blocks; /* How much blocks does inode occupy */ - u_int32_t vdi_gen; /* Inode generation */ - u_int64_t vdi_version; /* Version */ + __fs32 vdi_blocks; /* How much blocks does inode occupy */ + __fs32 vdi_gen; /* Inode generation */ + __fs64 vdi_version; /* Version */ union { struct vxfs_immed immed; struct vxfs_ext4 ext4; struct vxfs_typed typed[VXFS_NTYPED]; } vdi_org; - u_int32_t vdi_iattrino; + __fs32 vdi_iattrino; }; #define vdi_rdev vdi_ftarea.rdev @@ -149,32 +150,45 @@ struct vxfs_dinode { /* * The inode as represented in the main memory. - * - * TBD: This should become a separate structure... */ -#define vxfs_inode_info vxfs_dinode - -#define vii_mode vdi_mode -#define vii_uid vdi_uid -#define vii_gid vdi_gid -#define vii_nlink vdi_nlink -#define vii_size vdi_size -#define vii_atime vdi_atime -#define vii_ctime vdi_ctime -#define vii_mtime vdi_mtime -#define vii_blocks vdi_blocks -#define vii_org vdi_org -#define vii_orgtype vdi_orgtype -#define vii_gen vdi_gen - -#define vii_rdev vdi_ftarea.rdev -#define vii_dotdot vdi_ftarea.dotdot -#define vii_fixextsize vdi_ftarea.regular.fixextsize -#define vii_matchino vdi_ftarea.vxspec.matchino -#define vii_fsetindex vdi_ftarea.vxspec.fsetindex - -#define vii_immed vdi_org.immed -#define vii_ext4 vdi_org.ext4 -#define vii_typed vdi_org.typed +struct vxfs_inode_info { + struct inode vfs_inode; + + __u32 vii_mode; + __u32 vii_nlink; /* Link count */ + __u32 vii_uid; /* UID */ + __u32 vii_gid; /* GID */ + __u64 vii_size; /* Inode size in bytes */ + __u32 vii_atime; /* Last time accessed - sec */ + __u32 vii_autime; /* Last time accessed - usec */ + __u32 vii_mtime; /* Last modify time - sec */ + __u32 vii_mutime; /* Last modify time - usec */ + __u32 vii_ctime; /* Create time - sec */ + __u32 vii_cutime; /* Create time - usec */ + __u8 vii_orgtype; /* Organisation type */ + union { + __u32 rdev; + __u32 dotdot; + } vii_ftarea; + __u32 vii_blocks; /* How much blocks does inode occupy */ + __u32 vii_gen; /* Inode generation */ + union { + struct vxfs_immed immed; + struct vxfs_ext4 ext4; + struct vxfs_typed typed[VXFS_NTYPED]; + } vii_org; +}; + +#define vii_rdev vii_ftarea.rdev +#define vii_dotdot vii_ftarea.dotdot + +#define vii_immed vii_org.immed +#define vii_ext4 vii_org.ext4 +#define vii_typed vii_org.typed + +static inline struct vxfs_inode_info *VXFS_INO(struct inode *inode) +{ + return container_of(inode, struct vxfs_inode_info, vfs_inode); +} #endif /* _VXFS_INODE_H_ */ diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index 6d576b97f2c8..ce4785fd81c6 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2001 Christoph Hellwig. + * Copyright (c) 2016 Krzysztof Blaszkowski * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -61,33 +62,6 @@ const struct file_operations vxfs_dir_operations = { .iterate_shared = vxfs_readdir, }; -static inline u_long -dir_blocks(struct inode *ip) -{ - u_long bsize = ip->i_sb->s_blocksize; - return (ip->i_size + bsize - 1) & ~(bsize - 1); -} - -/* - * NOTE! unlike strncmp, vxfs_match returns 1 for success, 0 for failure. - * - * len <= VXFS_NAMELEN and de != NULL are guaranteed by caller. - */ -static inline int -vxfs_match(int len, const char * const name, struct vxfs_direct *de) -{ - if (len != de->d_namelen) - return 0; - if (!de->d_ino) - return 0; - return !memcmp(name, de->d_name, len); -} - -static inline struct vxfs_direct * -vxfs_next_entry(struct vxfs_direct *de) -{ - return ((struct vxfs_direct *)((char*)de + de->d_reclen)); -} /** * vxfs_find_entry - find a mathing directory entry for a dentry @@ -106,50 +80,64 @@ vxfs_next_entry(struct vxfs_direct *de) static struct vxfs_direct * vxfs_find_entry(struct inode *ip, struct dentry *dp, struct page **ppp) { - u_long npages, page, nblocks, pblocks, block; - u_long bsize = ip->i_sb->s_blocksize; - const char *name = dp->d_name.name; - int namelen = dp->d_name.len; - - npages = dir_pages(ip); - nblocks = dir_blocks(ip); - pblocks = VXFS_BLOCK_PER_PAGE(ip->i_sb); - - for (page = 0; page < npages; page++) { - caddr_t kaddr; - struct page *pp; + u_long bsize = ip->i_sb->s_blocksize; + const char *name = dp->d_name.name; + int namelen = dp->d_name.len; + loff_t limit = VXFS_DIRROUND(ip->i_size); + struct vxfs_direct *de_exit = NULL; + loff_t pos = 0; + struct vxfs_sb_info *sbi = VXFS_SBI(ip->i_sb); - pp = vxfs_get_page(ip->i_mapping, page); + while (pos < limit) { + struct page *pp; + char *kaddr; + int pg_ofs = pos & ~PAGE_MASK; + + pp = vxfs_get_page(ip->i_mapping, pos >> PAGE_SHIFT); if (IS_ERR(pp)) - continue; - kaddr = (caddr_t)page_address(pp); - - for (block = 0; block <= nblocks && block <= pblocks; block++) { - caddr_t baddr, limit; - struct vxfs_dirblk *dbp; - struct vxfs_direct *de; - - baddr = kaddr + (block * bsize); - limit = baddr + bsize - VXFS_DIRLEN(1); - - dbp = (struct vxfs_dirblk *)baddr; - de = (struct vxfs_direct *)(baddr + VXFS_DIRBLKOV(dbp)); - - for (; (caddr_t)de <= limit; de = vxfs_next_entry(de)) { - if (!de->d_reclen) - break; - if (!de->d_ino) - continue; - if (vxfs_match(namelen, name, de)) { - *ppp = pp; - return (de); - } + return NULL; + kaddr = (char *)page_address(pp); + + while (pg_ofs < PAGE_SIZE && pos < limit) { + struct vxfs_direct *de; + + if ((pos & (bsize - 1)) < 4) { + struct vxfs_dirblk *dbp = + (struct vxfs_dirblk *) + (kaddr + (pos & ~PAGE_MASK)); + int overhead = VXFS_DIRBLKOV(sbi, dbp); + + pos += overhead; + pg_ofs += overhead; + } + de = (struct vxfs_direct *)(kaddr + pg_ofs); + + if (!de->d_reclen) { + pos += bsize - 1; + pos &= ~(bsize - 1); + break; + } + + pg_ofs += fs16_to_cpu(sbi, de->d_reclen); + pos += fs16_to_cpu(sbi, de->d_reclen); + if (!de->d_ino) + continue; + + if (namelen != fs16_to_cpu(sbi, de->d_namelen)) + continue; + if (!memcmp(name, de->d_name, namelen)) { + *ppp = pp; + de_exit = de; + break; } } - vxfs_put_page(pp); + if (!de_exit) + vxfs_put_page(pp); + else + break; } - return NULL; + return de_exit; } /** @@ -173,7 +161,7 @@ vxfs_inode_by_name(struct inode *dip, struct dentry *dp) de = vxfs_find_entry(dip, dp, &pp); if (de) { - ino = de->d_ino; + ino = fs32_to_cpu(VXFS_SBI(dip->i_sb), de->d_ino); kunmap(pp); put_page(pp); } @@ -233,74 +221,80 @@ vxfs_readdir(struct file *fp, struct dir_context *ctx) struct inode *ip = file_inode(fp); struct super_block *sbp = ip->i_sb; u_long bsize = sbp->s_blocksize; - u_long page, npages, block, pblocks, nblocks, offset; - loff_t pos; + loff_t pos, limit; + struct vxfs_sb_info *sbi = VXFS_SBI(sbp); if (ctx->pos == 0) { if (!dir_emit_dot(fp, ctx)) - return 0; - ctx->pos = 1; + goto out; + ctx->pos++; } if (ctx->pos == 1) { if (!dir_emit(ctx, "..", 2, VXFS_INO(ip)->vii_dotdot, DT_DIR)) - return 0; - ctx->pos = 2; + goto out; + ctx->pos++; } - pos = ctx->pos - 2; - - if (pos > VXFS_DIRROUND(ip->i_size)) - return 0; - npages = dir_pages(ip); - nblocks = dir_blocks(ip); - pblocks = VXFS_BLOCK_PER_PAGE(sbp); + limit = VXFS_DIRROUND(ip->i_size); + if (ctx->pos > limit) + goto out; - page = pos >> PAGE_SHIFT; - offset = pos & ~PAGE_MASK; - block = (u_long)(pos >> sbp->s_blocksize_bits) % pblocks; + pos = ctx->pos & ~3L; - for (; page < npages; page++, block = 0) { - char *kaddr; - struct page *pp; + while (pos < limit) { + struct page *pp; + char *kaddr; + int pg_ofs = pos & ~PAGE_MASK; + int rc = 0; - pp = vxfs_get_page(ip->i_mapping, page); + pp = vxfs_get_page(ip->i_mapping, pos >> PAGE_SHIFT); if (IS_ERR(pp)) - continue; + return -ENOMEM; + kaddr = (char *)page_address(pp); - for (; block <= nblocks && block <= pblocks; block++) { - char *baddr, *limit; - struct vxfs_dirblk *dbp; - struct vxfs_direct *de; + while (pg_ofs < PAGE_SIZE && pos < limit) { + struct vxfs_direct *de; - baddr = kaddr + (block * bsize); - limit = baddr + bsize - VXFS_DIRLEN(1); - - dbp = (struct vxfs_dirblk *)baddr; - de = (struct vxfs_direct *) - (offset ? - (kaddr + offset) : - (baddr + VXFS_DIRBLKOV(dbp))); - - for (; (char *)de <= limit; de = vxfs_next_entry(de)) { - if (!de->d_reclen) - break; - if (!de->d_ino) - continue; - - offset = (char *)de - kaddr; - ctx->pos = ((page << PAGE_SHIFT) | offset) + 2; - if (!dir_emit(ctx, de->d_name, de->d_namelen, - de->d_ino, DT_UNKNOWN)) { - vxfs_put_page(pp); - return 0; - } + if ((pos & (bsize - 1)) < 4) { + struct vxfs_dirblk *dbp = + (struct vxfs_dirblk *) + (kaddr + (pos & ~PAGE_MASK)); + int overhead = VXFS_DIRBLKOV(sbi, dbp); + + pos += overhead; + pg_ofs += overhead; + } + de = (struct vxfs_direct *)(kaddr + pg_ofs); + + if (!de->d_reclen) { + pos += bsize - 1; + pos &= ~(bsize - 1); + break; + } + + pg_ofs += fs16_to_cpu(sbi, de->d_reclen); + pos += fs16_to_cpu(sbi, de->d_reclen); + if (!de->d_ino) + continue; + + rc = dir_emit(ctx, de->d_name, + fs16_to_cpu(sbi, de->d_namelen), + fs32_to_cpu(sbi, de->d_ino), + DT_UNKNOWN); + if (!rc) { + /* the dir entry was not read, fix pos. */ + pos -= fs16_to_cpu(sbi, de->d_reclen); + break; } - offset = 0; } vxfs_put_page(pp); - offset = 0; + if (!rc) + break; } - ctx->pos = ((page << PAGE_SHIFT) | offset) + 2; + + ctx->pos = pos | 2; + +out: return 0; } diff --git a/fs/freevxfs/vxfs_olt.c b/fs/freevxfs/vxfs_olt.c index 049500847903..813da6685151 100644 --- a/fs/freevxfs/vxfs_olt.c +++ b/fs/freevxfs/vxfs_olt.c @@ -43,14 +43,14 @@ static inline void vxfs_get_fshead(struct vxfs_oltfshead *fshp, struct vxfs_sb_info *infp) { BUG_ON(infp->vsi_fshino); - infp->vsi_fshino = fshp->olt_fsino[0]; + infp->vsi_fshino = fs32_to_cpu(infp, fshp->olt_fsino[0]); } static inline void vxfs_get_ilist(struct vxfs_oltilist *ilistp, struct vxfs_sb_info *infp) { BUG_ON(infp->vsi_iext); - infp->vsi_iext = ilistp->olt_iext[0]; + infp->vsi_iext = fs32_to_cpu(infp, ilistp->olt_iext[0]); } static inline u_long @@ -81,13 +81,12 @@ vxfs_read_olt(struct super_block *sbp, u_long bsize) struct vxfs_olt *op; char *oaddr, *eaddr; - bp = sb_bread(sbp, vxfs_oblock(sbp, infp->vsi_oltext, bsize)); if (!bp || !bp->b_data) goto fail; op = (struct vxfs_olt *)bp->b_data; - if (op->olt_magic != VXFS_OLT_MAGIC) { + if (fs32_to_cpu(infp, op->olt_magic) != VXFS_OLT_MAGIC) { printk(KERN_NOTICE "vxfs: ivalid olt magic number\n"); goto fail; } @@ -102,14 +101,14 @@ vxfs_read_olt(struct super_block *sbp, u_long bsize) goto fail; } - oaddr = bp->b_data + op->olt_size; + oaddr = bp->b_data + fs32_to_cpu(infp, op->olt_size); eaddr = bp->b_data + (infp->vsi_oltsize * sbp->s_blocksize); while (oaddr < eaddr) { struct vxfs_oltcommon *ocp = (struct vxfs_oltcommon *)oaddr; - switch (ocp->olt_type) { + switch (fs32_to_cpu(infp, ocp->olt_type)) { case VXFS_OLT_FSHEAD: vxfs_get_fshead((struct vxfs_oltfshead *)oaddr, infp); break; @@ -118,11 +117,11 @@ vxfs_read_olt(struct super_block *sbp, u_long bsize) break; } - oaddr += ocp->olt_size; + oaddr += fs32_to_cpu(infp, ocp->olt_size); } brelse(bp); - return 0; + return (infp->vsi_fshino && infp->vsi_iext) ? 0 : -EINVAL; fail: brelse(bp); diff --git a/fs/freevxfs/vxfs_olt.h b/fs/freevxfs/vxfs_olt.h index b7b3af502615..0c0b0c9fa557 100644 --- a/fs/freevxfs/vxfs_olt.h +++ b/fs/freevxfs/vxfs_olt.h @@ -63,83 +63,83 @@ enum { * the initial inode list, the fileset header or the device configuration. */ struct vxfs_olt { - u_int32_t olt_magic; /* magic number */ - u_int32_t olt_size; /* size of this entry */ - u_int32_t olt_checksum; /* checksum of extent */ - u_int32_t __unused1; /* ??? */ - u_int32_t olt_mtime; /* time of last mod. (sec) */ - u_int32_t olt_mutime; /* time of last mod. (usec) */ - u_int32_t olt_totfree; /* free space in OLT extent */ - vx_daddr_t olt_extents[2]; /* addr of this extent, replica */ - u_int32_t olt_esize; /* size of this extent */ - vx_daddr_t olt_next[2]; /* addr of next extent, replica */ - u_int32_t olt_nsize; /* size of next extent */ - u_int32_t __unused2; /* align to 8 byte boundary */ + __fs32 olt_magic; /* magic number */ + __fs32 olt_size; /* size of this entry */ + __fs32 olt_checksum; /* checksum of extent */ + __u32 __unused1; /* ??? */ + __fs32 olt_mtime; /* time of last mod. (sec) */ + __fs32 olt_mutime; /* time of last mod. (usec) */ + __fs32 olt_totfree; /* free space in OLT extent */ + __fs32 olt_extents[2]; /* addr of this extent, replica */ + __fs32 olt_esize; /* size of this extent */ + __fs32 olt_next[2]; /* addr of next extent, replica */ + __fs32 olt_nsize; /* size of next extent */ + __u32 __unused2; /* align to 8 byte boundary */ }; /* * VxFS common OLT entry (on disk). */ struct vxfs_oltcommon { - u_int32_t olt_type; /* type of this record */ - u_int32_t olt_size; /* size of this record */ + __fs32 olt_type; /* type of this record */ + __fs32 olt_size; /* size of this record */ }; /* * VxFS free OLT entry (on disk). */ struct vxfs_oltfree { - u_int32_t olt_type; /* type of this record */ - u_int32_t olt_fsize; /* size of this free record */ + __fs32 olt_type; /* type of this record */ + __fs32 olt_fsize; /* size of this free record */ }; /* * VxFS initial-inode list (on disk). */ struct vxfs_oltilist { - u_int32_t olt_type; /* type of this record */ - u_int32_t olt_size; /* size of this record */ - vx_ino_t olt_iext[2]; /* initial inode list, replica */ + __fs32 olt_type; /* type of this record */ + __fs32 olt_size; /* size of this record */ + __fs32 olt_iext[2]; /* initial inode list, replica */ }; /* * Current Usage Table */ struct vxfs_oltcut { - u_int32_t olt_type; /* type of this record */ - u_int32_t olt_size; /* size of this record */ - vx_ino_t olt_cutino; /* inode of current usage table */ - u_int32_t __pad; /* unused, 8 byte align */ + __fs32 olt_type; /* type of this record */ + __fs32 olt_size; /* size of this record */ + __fs32 olt_cutino; /* inode of current usage table */ + __u8 __pad; /* unused, 8 byte align */ }; /* * Inodes containing Superblock, Intent log and OLTs */ struct vxfs_oltsb { - u_int32_t olt_type; /* type of this record */ - u_int32_t olt_size; /* size of this record */ - vx_ino_t olt_sbino; /* inode of superblock file */ - u_int32_t __unused1; /* ??? */ - vx_ino_t olt_logino[2]; /* inode of log file,replica */ - vx_ino_t olt_oltino[2]; /* inode of OLT, replica */ + __fs32 olt_type; /* type of this record */ + __fs32 olt_size; /* size of this record */ + __fs32 olt_sbino; /* inode of superblock file */ + __u32 __unused1; /* ??? */ + __fs32 olt_logino[2]; /* inode of log file,replica */ + __fs32 olt_oltino[2]; /* inode of OLT, replica */ }; /* * Inode containing device configuration + it's replica */ struct vxfs_oltdev { - u_int32_t olt_type; /* type of this record */ - u_int32_t olt_size; /* size of this record */ - vx_ino_t olt_devino[2]; /* inode of device config files */ + __fs32 olt_type; /* type of this record */ + __fs32 olt_size; /* size of this record */ + __fs32 olt_devino[2]; /* inode of device config files */ }; /* * Fileset header */ struct vxfs_oltfshead { - u_int32_t olt_type; /* type number */ - u_int32_t olt_size; /* size of this record */ - vx_ino_t olt_fsino[2]; /* inodes of fileset header */ + __fs32 olt_type; /* type number */ + __fs32 olt_size; /* size of this record */ + __fs32 olt_fsino[2]; /* inodes of fileset header */ }; #endif /* _VXFS_OLT_H_ */ diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index 7ca8c75d50d3..455ce5b77e9b 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2001 Christoph Hellwig. + * Copyright (c) 2016 Krzysztof Blaszkowski * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -48,22 +49,11 @@ #include "vxfs_inode.h" -MODULE_AUTHOR("Christoph Hellwig"); +MODULE_AUTHOR("Christoph Hellwig, Krzysztof Blaszkowski"); MODULE_DESCRIPTION("Veritas Filesystem (VxFS) driver"); MODULE_LICENSE("Dual BSD/GPL"); - - -static void vxfs_put_super(struct super_block *); -static int vxfs_statfs(struct dentry *, struct kstatfs *); -static int vxfs_remount(struct super_block *, int *, char *); - -static const struct super_operations vxfs_super_ops = { - .evict_inode = vxfs_evict_inode, - .put_super = vxfs_put_super, - .statfs = vxfs_statfs, - .remount_fs = vxfs_remount, -}; +static struct kmem_cache *vxfs_inode_cachep; /** * vxfs_put_super - free superblock resources @@ -79,9 +69,9 @@ vxfs_put_super(struct super_block *sbp) { struct vxfs_sb_info *infp = VXFS_SBI(sbp); - vxfs_put_fake_inode(infp->vsi_fship); - vxfs_put_fake_inode(infp->vsi_ilist); - vxfs_put_fake_inode(infp->vsi_stilist); + iput(infp->vsi_fship); + iput(infp->vsi_ilist); + iput(infp->vsi_stilist); brelse(infp->vsi_bp); kfree(infp); @@ -109,14 +99,15 @@ static int vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp) { struct vxfs_sb_info *infp = VXFS_SBI(dentry->d_sb); + struct vxfs_sb *raw_sb = infp->vsi_raw; bufp->f_type = VXFS_SUPER_MAGIC; bufp->f_bsize = dentry->d_sb->s_blocksize; - bufp->f_blocks = infp->vsi_raw->vs_dsize; - bufp->f_bfree = infp->vsi_raw->vs_free; + bufp->f_blocks = fs32_to_cpu(infp, raw_sb->vs_dsize); + bufp->f_bfree = fs32_to_cpu(infp, raw_sb->vs_free); bufp->f_bavail = 0; bufp->f_files = 0; - bufp->f_ffree = infp->vsi_raw->vs_ifree; + bufp->f_ffree = fs32_to_cpu(infp, raw_sb->vs_ifree); bufp->f_namelen = VXFS_NAMELEN; return 0; @@ -129,6 +120,81 @@ static int vxfs_remount(struct super_block *sb, int *flags, char *data) return 0; } +static struct inode *vxfs_alloc_inode(struct super_block *sb) +{ + struct vxfs_inode_info *vi; + + vi = kmem_cache_alloc(vxfs_inode_cachep, GFP_KERNEL); + if (!vi) + return NULL; + inode_init_once(&vi->vfs_inode); + return &vi->vfs_inode; +} + +static void vxfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + + kmem_cache_free(vxfs_inode_cachep, VXFS_INO(inode)); +} + +static void vxfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, vxfs_i_callback); +} + +static const struct super_operations vxfs_super_ops = { + .alloc_inode = vxfs_alloc_inode, + .destroy_inode = vxfs_destroy_inode, + .evict_inode = vxfs_evict_inode, + .put_super = vxfs_put_super, + .statfs = vxfs_statfs, + .remount_fs = vxfs_remount, +}; + +static int vxfs_try_sb_magic(struct super_block *sbp, int silent, + unsigned blk, __fs32 magic) +{ + struct buffer_head *bp; + struct vxfs_sb *rsbp; + struct vxfs_sb_info *infp = VXFS_SBI(sbp); + int rc = -ENOMEM; + + bp = sb_bread(sbp, blk); + do { + if (!bp || !buffer_mapped(bp)) { + if (!silent) { + printk(KERN_WARNING + "vxfs: unable to read disk superblock at %u\n", + blk); + } + break; + } + + rc = -EINVAL; + rsbp = (struct vxfs_sb *)bp->b_data; + if (rsbp->vs_magic != magic) { + if (!silent) + printk(KERN_NOTICE + "vxfs: WRONG superblock magic %08x at %u\n", + rsbp->vs_magic, blk); + break; + } + + rc = 0; + infp->vsi_raw = rsbp; + infp->vsi_bp = bp; + } while (0); + + if (rc) { + infp->vsi_raw = NULL; + infp->vsi_bp = NULL; + brelse(bp); + } + + return rc; +} + /** * vxfs_read_super - read superblock into memory and initialize filesystem * @sbp: VFS superblock (to fill) @@ -149,10 +215,10 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent) { struct vxfs_sb_info *infp; struct vxfs_sb *rsbp; - struct buffer_head *bp = NULL; u_long bsize; struct inode *root; int ret = -EINVAL; + u32 j; sbp->s_flags |= MS_RDONLY; @@ -168,42 +234,43 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent) goto out; } - bp = sb_bread(sbp, 1); - if (!bp || !buffer_mapped(bp)) { - if (!silent) { - printk(KERN_WARNING - "vxfs: unable to read disk superblock\n"); - } - goto out; - } + sbp->s_op = &vxfs_super_ops; + sbp->s_fs_info = infp; - rsbp = (struct vxfs_sb *)bp->b_data; - if (rsbp->vs_magic != VXFS_SUPER_MAGIC) { + if (!vxfs_try_sb_magic(sbp, silent, 1, + (__force __fs32)cpu_to_le32(VXFS_SUPER_MAGIC))) { + /* Unixware, x86 */ + infp->byte_order = VXFS_BO_LE; + } else if (!vxfs_try_sb_magic(sbp, silent, 8, + (__force __fs32)cpu_to_be32(VXFS_SUPER_MAGIC))) { + /* HP-UX, parisc */ + infp->byte_order = VXFS_BO_BE; + } else { if (!silent) - printk(KERN_NOTICE "vxfs: WRONG superblock magic\n"); + printk(KERN_NOTICE "vxfs: can't find superblock.\n"); goto out; } - if ((rsbp->vs_version < 2 || rsbp->vs_version > 4) && !silent) { - printk(KERN_NOTICE "vxfs: unsupported VxFS version (%d)\n", - rsbp->vs_version); + rsbp = infp->vsi_raw; + j = fs32_to_cpu(infp, rsbp->vs_version); + if ((j < 2 || j > 4) && !silent) { + printk(KERN_NOTICE "vxfs: unsupported VxFS version (%d)\n", j); goto out; } #ifdef DIAGNOSTIC - printk(KERN_DEBUG "vxfs: supported VxFS version (%d)\n", rsbp->vs_version); - printk(KERN_DEBUG "vxfs: blocksize: %d\n", rsbp->vs_bsize); + printk(KERN_DEBUG "vxfs: supported VxFS version (%d)\n", j); + printk(KERN_DEBUG "vxfs: blocksize: %d\n", + fs32_to_cpu(infp, rsbp->vs_bsize)); #endif - sbp->s_magic = rsbp->vs_magic; - sbp->s_fs_info = infp; + sbp->s_magic = fs32_to_cpu(infp, rsbp->vs_magic); - infp->vsi_raw = rsbp; - infp->vsi_bp = bp; - infp->vsi_oltext = rsbp->vs_oltext[0]; - infp->vsi_oltsize = rsbp->vs_oltsize; + infp->vsi_oltext = fs32_to_cpu(infp, rsbp->vs_oltext[0]); + infp->vsi_oltsize = fs32_to_cpu(infp, rsbp->vs_oltsize); - if (!sb_set_blocksize(sbp, rsbp->vs_bsize)) { + j = fs32_to_cpu(infp, rsbp->vs_bsize); + if (!sb_set_blocksize(sbp, j)) { printk(KERN_WARNING "vxfs: unable to set final block size\n"); goto out; } @@ -218,7 +285,6 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent) goto out; } - sbp->s_op = &vxfs_super_ops; root = vxfs_iget(sbp, VXFS_ROOT_INO); if (IS_ERR(root)) { ret = PTR_ERR(root); @@ -233,11 +299,11 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent) return 0; out_free_ilist: - vxfs_put_fake_inode(infp->vsi_fship); - vxfs_put_fake_inode(infp->vsi_ilist); - vxfs_put_fake_inode(infp->vsi_stilist); + iput(infp->vsi_fship); + iput(infp->vsi_ilist); + iput(infp->vsi_stilist); out: - brelse(bp); + brelse(infp->vsi_bp); kfree(infp); return ret; } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 6f9c9f6f5157..4d09d4441e3e 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1327,6 +1327,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) dirty = inode->i_state & I_DIRTY; if (inode->i_state & I_DIRTY_TIME) { if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) || + wbc->sync_mode == WB_SYNC_ALL || unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) || unlikely(time_after(jiffies, (inode->dirtied_time_when + @@ -1807,8 +1808,8 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb) */ static unsigned long get_nr_dirty_pages(void) { - return global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) + + return global_node_page_state(NR_FILE_DIRTY) + + global_node_page_state(NR_UNSTABLE_NFS) + get_nr_dirty_inodes(); } diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c index 7d637e2335fd..15a3d042247e 100644 --- a/fs/fscache/histogram.c +++ b/fs/fscache/histogram.c @@ -99,7 +99,6 @@ static int fscache_histogram_open(struct inode *inode, struct file *file) } const struct file_operations fscache_histogram_fops = { - .owner = THIS_MODULE, .open = fscache_histogram_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index 6b028b7c4250..5d5ddaa84b21 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -404,7 +404,6 @@ static int fscache_objlist_release(struct inode *inode, struct file *file) } const struct file_operations fscache_objlist_fops = { - .owner = THIS_MODULE, .open = fscache_objlist_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 7cfa0aacdf6d..7ac6e839b065 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -295,7 +295,6 @@ static int fscache_stats_open(struct inode *inode, struct file *file) } const struct file_operations fscache_stats_fops = { - .owner = THIS_MODULE, .open = fscache_stats_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index cbece1221417..a94d2ed81ab4 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -99,19 +99,6 @@ void fuse_request_free(struct fuse_req *req) kmem_cache_free(fuse_req_cachep, req); } -static void block_sigs(sigset_t *oldset) -{ - sigset_t mask; - - siginitsetinv(&mask, sigmask(SIGKILL)); - sigprocmask(SIG_BLOCK, &mask, oldset); -} - -static void restore_sigs(sigset_t *oldset) -{ - sigprocmask(SIG_SETMASK, oldset, NULL); -} - void __fuse_get_request(struct fuse_req *req) { atomic_inc(&req->count); @@ -151,15 +138,9 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, atomic_inc(&fc->num_waiting); if (fuse_block_alloc(fc, for_background)) { - sigset_t oldset; - int intr; - - block_sigs(&oldset); - intr = wait_event_interruptible_exclusive(fc->blocked_waitq, - !fuse_block_alloc(fc, for_background)); - restore_sigs(&oldset); err = -EINTR; - if (intr) + if (wait_event_killable_exclusive(fc->blocked_waitq, + !fuse_block_alloc(fc, for_background))) goto out; } /* Matches smp_wmb() in fuse_set_initialized() */ @@ -446,14 +427,9 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) } if (!test_bit(FR_FORCE, &req->flags)) { - sigset_t oldset; - /* Only fatal signals may interrupt this */ - block_sigs(&oldset); - err = wait_event_interruptible(req->waitq, + err = wait_event_killable(req->waitq, test_bit(FR_FINISHED, &req->flags)); - restore_sigs(&oldset); - if (!err) return; @@ -1525,7 +1501,6 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, goto err; fuse_copy_finish(cs); buf[outarg.namelen] = 0; - name.hash = full_name_hash(name.name, name.len); down_read(&fc->killsb); err = -ENOENT; @@ -1576,7 +1551,6 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, goto err; fuse_copy_finish(cs); buf[outarg.namelen] = 0; - name.hash = full_name_hash(name.name, name.len); down_read(&fc->killsb); err = -ENOENT; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index cca7b048c07b..5f1627725791 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -955,6 +955,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, if (!dir) goto unlock; + name->hash = full_name_hash(dir, name->name, name->len); entry = d_lookup(dir, name); dput(dir); if (!entry) @@ -1204,7 +1205,7 @@ static int fuse_direntplus_link(struct file *file, fc = get_fuse_conn(dir); - name.hash = full_name_hash(name.name, name.len); + name.hash = full_name_hash(parent, name.name, name.len); dentry = d_lookup(parent, &name); if (!dentry) { retry: diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 9154f8679024..f394aff59c36 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -417,6 +417,10 @@ static int fuse_flush(struct file *file, fl_owner_t id) fuse_sync_writes(inode); inode_unlock(inode); + err = filemap_check_errors(file->f_mapping); + if (err) + return err; + req = fuse_get_req_nofail_nopages(fc, file); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; @@ -462,6 +466,16 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, goto out; fuse_sync_writes(inode); + + /* + * Due to implementation of fuse writeback + * filemap_write_and_wait_range() does not catch errors. + * We have to do this directly after fuse_sync_writes() + */ + err = filemap_check_errors(file->f_mapping); + if (err) + goto out; + err = sync_inode_metadata(inode, 1); if (err) goto out; @@ -562,7 +576,6 @@ static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io) */ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) { - bool is_sync = is_sync_kiocb(io->iocb); int left; spin_lock(&io->lock); @@ -572,11 +585,11 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) io->bytes = pos; left = --io->reqs; - if (!left && is_sync) + if (!left && io->blocking) complete(io->done); spin_unlock(&io->lock); - if (!left && !is_sync) { + if (!left && !io->blocking) { ssize_t res = fuse_get_res_by_io(io); if (res >= 0) { @@ -1452,7 +1465,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) list_del(&req->writepages_entry); for (i = 0; i < req->num_pages; i++) { dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP); + dec_node_page_state(req->pages[i], NR_WRITEBACK_TEMP); wb_writeout_inc(&bdi->wb); } wake_up(&fi->page_waitq); @@ -1642,7 +1655,7 @@ static int fuse_writepage_locked(struct page *page) req->inode = inode; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); + inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); spin_lock(&fc->lock); list_add(&req->writepages_entry, &fi->writepages); @@ -1756,7 +1769,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req, spin_unlock(&fc->lock); dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_zone_page_state(page, NR_WRITEBACK_TEMP); + dec_node_page_state(page, NR_WRITEBACK_TEMP); wb_writeout_inc(&bdi->wb); fuse_writepage_free(fc, new_req); fuse_request_free(new_req); @@ -1855,7 +1868,7 @@ static int fuse_writepages_fill(struct page *page, req->page_descs[req->num_pages].length = PAGE_SIZE; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); + inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); err = 0; if (is_writeback && fuse_writepage_in_flight(req, page)) { @@ -2850,7 +2863,6 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) size_t count = iov_iter_count(iter); loff_t offset = iocb->ki_pos; struct fuse_io_priv *io; - bool is_sync = is_sync_kiocb(iocb); pos = offset; inode = file->f_mapping->host; @@ -2885,17 +2897,16 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) */ io->async = async_dio; io->iocb = iocb; + io->blocking = is_sync_kiocb(iocb); /* - * We cannot asynchronously extend the size of a file. We have no method - * to wait on real async I/O requests, so we must submit this request - * synchronously. + * We cannot asynchronously extend the size of a file. + * In such case the aio will behave exactly like sync io. */ - if (!is_sync && (offset + count > i_size) && - iov_iter_rw(iter) == WRITE) - io->async = false; + if ((offset + count > i_size) && iov_iter_rw(iter) == WRITE) + io->blocking = true; - if (io->async && is_sync) { + if (io->async && io->blocking) { /* * Additional reference to keep io around after * calling fuse_aio_complete() @@ -2915,7 +2926,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) fuse_aio_complete(io, ret < 0 ? ret : 0, -1); /* we have a non-extending, async request, so return */ - if (!is_sync) + if (!io->blocking) return -EIOCBQUEUED; wait_for_completion(&wait); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 929c383432b0..5db5d24f91a5 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -259,6 +259,7 @@ struct fuse_io_priv { struct kiocb *iocb; struct file *file; struct completion *done; + bool blocking; }; #define FUSE_IO_PRIV_SYNC(f) \ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 9961d8432ce3..9b7cb37b4ba8 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -942,7 +942,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | - FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | + FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT | FUSE_PARALLEL_DIROPS; diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 8eed66af5b82..02a3845363f7 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -128,7 +128,7 @@ static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; - struct inode *inode = file_inode(file)->i_mapping->host; + struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); ssize_t ret; diff --git a/fs/hfs/string.c b/fs/hfs/string.c index 85b610c3909f..ec9f164c35a5 100644 --- a/fs/hfs/string.c +++ b/fs/hfs/string.c @@ -59,7 +59,7 @@ int hfs_hash_dentry(const struct dentry *dentry, struct qstr *this) if (len > HFS_NAMELEN) len = HFS_NAMELEN; - hash = init_name_hash(); + hash = init_name_hash(dentry); for (; len; len--) hash = partial_name_hash(caseorder[*name++], hash); this->hash = end_name_hash(hash); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index ef9fefe364a6..19462d773fe2 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -126,7 +126,7 @@ static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; - struct inode *inode = file_inode(file)->i_mapping->host; + struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); ssize_t ret; diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index e8ef121a4d8b..c13c8a240be3 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -346,7 +346,7 @@ int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str) casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); - hash = init_name_hash(); + hash = init_name_hash(dentry); astr = str->name; len = str->len; while (len > 0) { diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 5c57654927a6..90e46cd752fe 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -959,10 +959,11 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) if (S_ISLNK(root_inode->i_mode)) { char *name = follow_link(host_root_path); - if (IS_ERR(name)) + if (IS_ERR(name)) { err = PTR_ERR(name); - else - err = read_name(root_inode, name); + goto out_put; + } + err = read_name(root_inode, name); kfree(name); if (err) goto out_put; diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c index fa27980f2229..60e6d334d79a 100644 --- a/fs/hpfs/dentry.c +++ b/fs/hpfs/dentry.c @@ -26,7 +26,7 @@ static int hpfs_hash_dentry(const struct dentry *dentry, struct qstr *qstr) /*return -ENOENT;*/ x: - hash = init_name_hash(); + hash = init_name_hash(dentry); for (i = 0; i < l; i++) hash = partial_name_hash(hpfs_upcase(hpfs_sb(dentry->d_sb)->sb_cp_table,qstr->name[i]), hash); qstr->hash = end_name_hash(hash); diff --git a/fs/inode.c b/fs/inode.c index e171f7b5f9e4..ad445542c285 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -345,7 +345,7 @@ EXPORT_SYMBOL(inc_nlink); void address_space_init_once(struct address_space *mapping) { memset(mapping, 0, sizeof(*mapping)); - INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); + INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC | __GFP_ACCOUNT); spin_lock_init(&mapping->tree_lock); init_rwsem(&mapping->i_mmap_rwsem); INIT_LIST_HEAD(&mapping->private_list); @@ -1619,6 +1619,13 @@ bool atime_needs_update(const struct path *path, struct inode *inode) if (inode->i_flags & S_NOATIME) return false; + + /* Atime updates will likely cause i_uid and i_gid to be written + * back improprely if their true value is unknown to the vfs. + */ + if (HAS_UNMAPPED_ID(inode)) + return false; + if (IS_NOATIME(inode)) return false; if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) diff --git a/fs/ioctl.c b/fs/ioctl.c index 116a333e9c77..0f56deb24ce6 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -590,6 +590,7 @@ static long ioctl_file_dedupe_range(struct file *file, void __user *arg) goto out; } + same->dest_count = count; ret = vfs_dedupe_file_range(file, same); if (ret) goto out; diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index 2ce5b75ee9a5..44af14b2e916 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -361,7 +361,6 @@ static int zisofs_readpage(struct file *file, struct page *page) const struct address_space_operations zisofs_aops = { .readpage = zisofs_readpage, - /* No sync_page operation supported? */ /* No bmap operation supported */ }; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 131dedc920d8..761fade7680f 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -174,7 +174,7 @@ struct iso9660_options{ * Compute the hash for the isofs name corresponding to the dentry. */ static int -isofs_hashi_common(struct qstr *qstr, int ms) +isofs_hashi_common(const struct dentry *dentry, struct qstr *qstr, int ms) { const char *name; int len; @@ -188,7 +188,7 @@ isofs_hashi_common(struct qstr *qstr, int ms) len--; } - hash = init_name_hash(); + hash = init_name_hash(dentry); while (len--) { c = tolower(*name++); hash = partial_name_hash(c, hash); @@ -231,7 +231,7 @@ static int isofs_dentry_cmp_common( static int isofs_hashi(const struct dentry *dentry, struct qstr *qstr) { - return isofs_hashi_common(qstr, 0); + return isofs_hashi_common(dentry, qstr, 0); } static int @@ -246,7 +246,7 @@ isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry, * Compute the hash for the isofs name corresponding to the dentry. */ static int -isofs_hash_common(struct qstr *qstr, int ms) +isofs_hash_common(const struct dentry *dentry, struct qstr *qstr, int ms) { const char *name; int len; @@ -258,7 +258,7 @@ isofs_hash_common(struct qstr *qstr, int ms) len--; } - qstr->hash = full_name_hash(name, len); + qstr->hash = full_name_hash(dentry, name, len); return 0; } @@ -266,13 +266,13 @@ isofs_hash_common(struct qstr *qstr, int ms) static int isofs_hash_ms(const struct dentry *dentry, struct qstr *qstr) { - return isofs_hash_common(qstr, 1); + return isofs_hash_common(dentry, qstr, 1); } static int isofs_hashi_ms(const struct dentry *dentry, struct qstr *qstr) { - return isofs_hashi_common(qstr, 1); + return isofs_hashi_common(dentry, qstr, 1); } static int diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 84c4bf3631a2..30eb33ff8189 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -81,6 +81,7 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target, struct jffs2_full_dirent *fd = NULL, *fd_list; uint32_t ino = 0; struct inode *inode = NULL; + unsigned int nhash; jffs2_dbg(1, "jffs2_lookup()\n"); @@ -89,11 +90,14 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target, dir_f = JFFS2_INODE_INFO(dir_i); + /* The 'nhash' on the fd_list is not the same as the dentry hash */ + nhash = full_name_hash(NULL, target->d_name.name, target->d_name.len); + mutex_lock(&dir_f->sem); /* NB: The 2.2 backport will need to explicitly check for '.' and '..' here */ - for (fd_list = dir_f->dents; fd_list && fd_list->nhash <= target->d_name.hash; fd_list = fd_list->next) { - if (fd_list->nhash == target->d_name.hash && + for (fd_list = dir_f->dents; fd_list && fd_list->nhash <= nhash; fd_list = fd_list->next) { + if (fd_list->nhash == nhash && (!fd || fd_list->version > fd->version) && strlen(fd_list->name) == target->d_name.len && !strncmp(fd_list->name, target->d_name.name, target->d_name.len)) { diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index bfebbf13698c..06a71dbd4833 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -674,7 +674,7 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r } } - fd->nhash = full_name_hash(fd->name, rd->nsize); + fd->nhash = full_name_hash(NULL, fd->name, rd->nsize); fd->next = NULL; fd->name[rd->nsize] = '\0'; diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index 9ad5ba4b299b..90431dd613b8 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -1100,7 +1100,7 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblo fd->next = NULL; fd->version = je32_to_cpu(rd->version); fd->ino = je32_to_cpu(rd->ino); - fd->nhash = full_name_hash(fd->name, checkedlen); + fd->nhash = full_name_hash(NULL, fd->name, checkedlen); fd->type = rd->type; jffs2_add_fd_to_list(c, fd, &ic->scan_dents); diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c index bc5385471a6e..be7c8a6a5748 100644 --- a/fs/jffs2/summary.c +++ b/fs/jffs2/summary.c @@ -476,7 +476,7 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras fd->next = NULL; fd->version = je32_to_cpu(spd->version); fd->ino = je32_to_cpu(spd->ino); - fd->nhash = full_name_hash(fd->name, checkedlen); + fd->nhash = full_name_hash(NULL, fd->name, checkedlen); fd->type = spd->type; jffs2_add_fd_to_list(c, fd, &ic->scan_dents); diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c index 7fb187ab2682..cda9a361368e 100644 --- a/fs/jffs2/write.c +++ b/fs/jffs2/write.c @@ -245,7 +245,7 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff fd->version = je32_to_cpu(rd->version); fd->ino = je32_to_cpu(rd->ino); - fd->nhash = full_name_hash(name, namelen); + fd->nhash = full_name_hash(NULL, name, namelen); fd->type = rd->type; memcpy(fd->name, name, namelen); fd->name[namelen]=0; @@ -598,7 +598,7 @@ int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, jffs2_add_fd_to_list(c, fd, &dir_f->dents); mutex_unlock(&dir_f->sem); } else { - uint32_t nhash = full_name_hash(name, namelen); + uint32_t nhash = full_name_hash(NULL, name, namelen); fd = dir_f->dents; /* We don't actually want to reserve any space, but we do diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c index dd824d9b0b1a..a37eb5f8cbc0 100644 --- a/fs/jfs/jfs_debug.c +++ b/fs/jfs/jfs_debug.c @@ -58,7 +58,6 @@ static ssize_t jfs_loglevel_proc_write(struct file *file, } static const struct file_operations jfs_loglevel_proc_fops = { - .owner = THIS_MODULE, .open = jfs_loglevel_proc_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index a74752146ec9..a21ea8b3e5fa 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -2517,7 +2517,6 @@ static int jfs_lmstats_proc_open(struct inode *inode, struct file *file) } const struct file_operations jfs_lmstats_proc_fops = { - .owner = THIS_MODULE, .open = jfs_lmstats_proc_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index e7fa9e513040..489aaa1403e5 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -830,7 +830,6 @@ static int jfs_mpstat_proc_open(struct inode *inode, struct file *file) } const struct file_operations jfs_mpstat_proc_fops = { - .owner = THIS_MODULE, .open = jfs_mpstat_proc_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index eddf2b6eda85..2e58978d6f45 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -3040,7 +3040,6 @@ static int jfs_txanchor_proc_open(struct inode *inode, struct file *file) } const struct file_operations jfs_txanchor_proc_fops = { - .owner = THIS_MODULE, .open = jfs_txanchor_proc_open, .read = seq_read, .llseek = seq_lseek, @@ -3081,7 +3080,6 @@ static int jfs_txstats_proc_open(struct inode *inode, struct file *file) } const struct file_operations jfs_txstats_proc_fops = { - .owner = THIS_MODULE, .open = jfs_txstats_proc_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c index 5ad7748860ce..5cde6d2fcfca 100644 --- a/fs/jfs/jfs_xtree.c +++ b/fs/jfs/jfs_xtree.c @@ -3894,7 +3894,6 @@ static int jfs_xtstat_proc_open(struct inode *inode, struct file *file) } const struct file_operations jfs_xtstat_proc_fops = { - .owner = THIS_MODULE, .open = jfs_xtstat_proc_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 539deddecbb0..04baf0dfc40c 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1564,7 +1564,7 @@ static int jfs_ci_hash(const struct dentry *dir, struct qstr *this) unsigned long hash; int i; - hash = init_name_hash(); + hash = init_name_hash(dir); for (i=0; i < this->len; i++) hash = partial_name_hash(tolower(this->name[i]), hash); this->hash = end_name_hash(hash); diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 8a652404eef6..e57174d43683 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -336,11 +336,11 @@ struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) */ static unsigned int kernfs_name_hash(const char *name, const void *ns) { - unsigned long hash = init_name_hash(); + unsigned long hash = init_name_hash(ns); unsigned int len = strlen(name); while (len--) hash = partial_name_hash(*name++, hash); - hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31)); + hash = end_name_hash(hash); hash &= 0x7fffffffU; /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ if (hash < 2) diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 63534f5f9073..b3d73ad52b22 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -152,6 +152,8 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) struct dentry *root; info->sb = sb; + /* Userspace would break if executables or devices appear on sysfs */ + sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = magic; @@ -241,7 +243,8 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, info->root = root; info->ns = ns; - sb = sget(fs_type, kernfs_test_super, kernfs_set_super, flags, info); + sb = sget_userns(fs_type, kernfs_test_super, kernfs_set_super, flags, + &init_user_ns, info); if (IS_ERR(sb) || sb->s_fs_info != info) kfree(info); if (IS_ERR(sb)) diff --git a/fs/lockd/procfs.c b/fs/lockd/procfs.c index 2a0a98480e39..8f72cb237ef3 100644 --- a/fs/lockd/procfs.c +++ b/fs/lockd/procfs.c @@ -64,7 +64,6 @@ static const struct file_operations lockd_end_grace_operations = { .read = nlm_end_grace_read, .llseek = default_llseek, .release = simple_transaction_release, - .owner = THIS_MODULE, }; int __init diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index 2d5336bd4efd..bcd754d216bd 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -95,7 +95,7 @@ static int beyond_eof(struct inode *inode, loff_t bix) * of each character and pick a prime nearby, preferably a bit-sparse * one. */ -static u32 hash_32(const char *s, int len, u32 seed) +static u32 logfs_hash_32(const char *s, int len, u32 seed) { u32 hash = seed; int i; @@ -159,7 +159,7 @@ static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry) struct qstr *name = &dentry->d_name; struct page *page; struct logfs_disk_dentry *dd; - u32 hash = hash_32(name->name, name->len, 0); + u32 hash = logfs_hash_32(name->name, name->len, 0); pgoff_t index; int round; @@ -370,7 +370,7 @@ static int logfs_write_dir(struct inode *dir, struct dentry *dentry, { struct page *page; struct logfs_disk_dentry *dd; - u32 hash = hash_32(dentry->d_name.name, dentry->d_name.len, 0); + u32 hash = logfs_hash_32(dentry->d_name.name, dentry->d_name.len, 0); pgoff_t index; int round, err; diff --git a/fs/mpage.c b/fs/mpage.c index 2ca1f39c8cba..7a09c55b4bd0 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -50,7 +50,7 @@ static void mpage_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - page_endio(page, bio_data_dir(bio), bio->bi_error); + page_endio(page, bio_op(bio), bio->bi_error); } bio_put(bio); diff --git a/fs/namei.c b/fs/namei.c index 70580ab1445c..c386a329ab20 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -36,6 +36,7 @@ #include <linux/posix_acl.h> #include <linux/hash.h> #include <linux/bitops.h> +#include <linux/init_task.h> #include <asm/uaccess.h> #include "internal.h" @@ -410,6 +411,14 @@ int __inode_permission(struct inode *inode, int mask) */ if (IS_IMMUTABLE(inode)) return -EACCES; + + /* + * Updating mtime will likely cause i_uid and i_gid to be + * written back improperly if their true value is unknown + * to the vfs. + */ + if (HAS_UNMAPPED_ID(inode)) + return -EACCES; } retval = do_inode_permission(inode, mask); @@ -901,6 +910,7 @@ static inline int may_follow_link(struct nameidata *nd) { const struct inode *inode; const struct inode *parent; + kuid_t puid; if (!sysctl_protected_symlinks) return 0; @@ -916,7 +926,8 @@ static inline int may_follow_link(struct nameidata *nd) return 0; /* Allowed if parent directory and link owner match. */ - if (uid_eq(parent->i_uid, inode->i_uid)) + puid = parent->i_uid; + if (uid_valid(puid) && uid_eq(puid, inode->i_uid)) return 0; if (nd->flags & LOOKUP_RCU) @@ -1089,6 +1100,7 @@ static int follow_automount(struct path *path, struct nameidata *nd, bool *need_mntput) { struct vfsmount *mnt; + const struct cred *old_cred; int err; if (!path->dentry->d_op || !path->dentry->d_op->d_automount) @@ -1110,11 +1122,16 @@ static int follow_automount(struct path *path, struct nameidata *nd, path->dentry->d_inode) return -EISDIR; + if (path->dentry->d_sb->s_user_ns != &init_user_ns) + return -EACCES; + nd->total_link_count++; if (nd->total_link_count >= 40) return -ELOOP; + old_cred = override_creds(&init_cred); mnt = path->dentry->d_op->d_automount(path); + revert_creds(old_cred); if (IS_ERR(mnt)) { /* * The filesystem is allowed to return -EISDIR here to indicate @@ -1449,9 +1466,8 @@ static int follow_dotdot(struct nameidata *nd) } /* - * This looks up the name in dcache, possibly revalidates the old dentry and - * allocates a new one if not found or not valid. In the need_lookup argument - * returns whether i_op->lookup is necessary. + * This looks up the name in dcache and possibly revalidates the found dentry. + * NULL is returned if the dentry does not exist in the cache. */ static struct dentry *lookup_dcache(const struct qstr *name, struct dentry *dir, @@ -1890,9 +1906,9 @@ static inline unsigned int fold_hash(unsigned long x, unsigned long y) * payload bytes, to match the way that hash_name() iterates until it * finds the delimiter after the name. */ -unsigned int full_name_hash(const char *name, unsigned int len) +unsigned int full_name_hash(const void *salt, const char *name, unsigned int len) { - unsigned long a, x = 0, y = 0; + unsigned long a, x = 0, y = (unsigned long)salt; for (;;) { if (!len) @@ -1911,15 +1927,19 @@ done: EXPORT_SYMBOL(full_name_hash); /* Return the "hash_len" (hash and length) of a null-terminated string */ -u64 hashlen_string(const char *name) +u64 hashlen_string(const void *salt, const char *name) { - unsigned long a = 0, x = 0, y = 0, adata, mask, len; + unsigned long a = 0, x = 0, y = (unsigned long)salt; + unsigned long adata, mask, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; - len = -sizeof(unsigned long); + len = 0; + goto inside; + do { HASH_MIX(x, y, a); len += sizeof(unsigned long); +inside: a = load_unaligned_zeropad(name+len); } while (!has_zero(a, &adata, &constants)); @@ -1935,15 +1955,19 @@ EXPORT_SYMBOL(hashlen_string); * Calculate the length and hash of the path component, and * return the "hash_len" as the result. */ -static inline u64 hash_name(const char *name) +static inline u64 hash_name(const void *salt, const char *name) { - unsigned long a = 0, b, x = 0, y = 0, adata, bdata, mask, len; + unsigned long a = 0, b, x = 0, y = (unsigned long)salt; + unsigned long adata, bdata, mask, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; - len = -sizeof(unsigned long); + len = 0; + goto inside; + do { HASH_MIX(x, y, a); len += sizeof(unsigned long); +inside: a = load_unaligned_zeropad(name+len); b = a ^ REPEAT_BYTE('/'); } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants))); @@ -1959,9 +1983,9 @@ static inline u64 hash_name(const char *name) #else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */ /* Return the hash of a string of known length */ -unsigned int full_name_hash(const char *name, unsigned int len) +unsigned int full_name_hash(const void *salt, const char *name, unsigned int len) { - unsigned long hash = init_name_hash(); + unsigned long hash = init_name_hash(salt); while (len--) hash = partial_name_hash((unsigned char)*name++, hash); return end_name_hash(hash); @@ -1969,9 +1993,9 @@ unsigned int full_name_hash(const char *name, unsigned int len) EXPORT_SYMBOL(full_name_hash); /* Return the "hash_len" (hash and length) of a null-terminated string */ -u64 hashlen_string(const char *name) +u64 hashlen_string(const void *salt, const char *name) { - unsigned long hash = init_name_hash(); + unsigned long hash = init_name_hash(salt); unsigned long len = 0, c; c = (unsigned char)*name; @@ -1988,9 +2012,9 @@ EXPORT_SYMBOL(hashlen_string); * We know there's a real path component here of at least * one character. */ -static inline u64 hash_name(const char *name) +static inline u64 hash_name(const void *salt, const char *name) { - unsigned long hash = init_name_hash(); + unsigned long hash = init_name_hash(salt); unsigned long len = 0, c; c = (unsigned char)*name; @@ -2030,7 +2054,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) if (err) return err; - hash_len = hash_name(name); + hash_len = hash_name(nd->path.dentry, name); type = LAST_NORM; if (name[0] == '.') switch (hashlen_len(hash_len)) { @@ -2389,33 +2413,6 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, EXPORT_SYMBOL(vfs_path_lookup); /** - * lookup_hash - lookup single pathname component on already hashed name - * @name: name and hash to lookup - * @base: base directory to lookup from - * - * The name must have been verified and hashed (see lookup_one_len()). Using - * this after just full_name_hash() is unsafe. - * - * This function also doesn't check for search permission on base directory. - * - * Use lookup_one_len_unlocked() instead, unless you really know what you are - * doing. - * - * Do not hold i_mutex; this helper takes i_mutex if necessary. - */ -struct dentry *lookup_hash(const struct qstr *name, struct dentry *base) -{ - struct dentry *ret; - - ret = lookup_dcache(name, base, 0); - if (!ret) - ret = lookup_slow(name, base, 0); - - return ret; -} -EXPORT_SYMBOL(lookup_hash); - -/** * lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from @@ -2436,7 +2433,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) this.name = name; this.len = len; - this.hash = full_name_hash(name, len); + this.hash = full_name_hash(base, name, len); if (!len) return ERR_PTR(-EACCES); @@ -2486,10 +2483,11 @@ struct dentry *lookup_one_len_unlocked(const char *name, struct qstr this; unsigned int c; int err; + struct dentry *ret; this.name = name; this.len = len; - this.hash = full_name_hash(name, len); + this.hash = full_name_hash(base, name, len); if (!len) return ERR_PTR(-EACCES); @@ -2517,7 +2515,10 @@ struct dentry *lookup_one_len_unlocked(const char *name, if (err) return ERR_PTR(err); - return lookup_hash(&this, base); + ret = lookup_dcache(&this, base, 0); + if (!ret) + ret = lookup_slow(&this, base, 0); + return ret; } EXPORT_SYMBOL(lookup_one_len_unlocked); @@ -2757,10 +2758,11 @@ EXPORT_SYMBOL(__check_sticky); * c. have CAP_FOWNER capability * 6. If the victim is append-only or immutable we can't do antyhing with * links pointing to it. - * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR. - * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR. - * 9. We can't remove a root or mountpoint. - * 10. We don't allow removal of NFS sillyrenamed files; it's handled by + * 7. If the victim has an unknown uid or gid we can't change the inode. + * 8. If we were asked to remove a directory and victim isn't one - ENOTDIR. + * 9. If we were asked to remove a non-directory and victim isn't one - EISDIR. + * 10. We can't remove a root or mountpoint. + * 11. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). */ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) @@ -2782,7 +2784,7 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) return -EPERM; if (check_sticky(dir, inode) || IS_APPEND(inode) || - IS_IMMUTABLE(inode) || IS_SWAPFILE(inode)) + IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(inode)) return -EPERM; if (isdir) { if (!d_is_dir(victim)) @@ -2803,16 +2805,22 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) * 1. We can't do it if child already exists (open has special treatment for * this case, but since we are inlined it's OK) * 2. We can't do it if dir is read-only (done in permission()) - * 3. We should have write and exec permissions on dir - * 4. We can't do it if dir is immutable (done in permission()) + * 3. We can't do it if the fs can't represent the fsuid or fsgid. + * 4. We should have write and exec permissions on dir + * 5. We can't do it if dir is immutable (done in permission()) */ static inline int may_create(struct inode *dir, struct dentry *child) { + struct user_namespace *s_user_ns; audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE); if (child->d_inode) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; + s_user_ns = dir->i_sb->s_user_ns; + if (!kuid_has_mapping(s_user_ns, current_fsuid()) || + !kgid_has_mapping(s_user_ns, current_fsgid())) + return -EOVERFLOW; return inode_permission(dir, MAY_WRITE | MAY_EXEC); } @@ -2881,6 +2889,12 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, } EXPORT_SYMBOL(vfs_create); +bool may_open_dev(const struct path *path) +{ + return !(path->mnt->mnt_flags & MNT_NODEV) && + !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV); +} + static int may_open(struct path *path, int acc_mode, int flag) { struct dentry *dentry = path->dentry; @@ -2899,7 +2913,7 @@ static int may_open(struct path *path, int acc_mode, int flag) break; case S_IFBLK: case S_IFCHR: - if (path->mnt->mnt_flags & MNT_NODEV) + if (!may_open_dev(path)) return -EACCES; /*FALLTHRU*/ case S_IFIFO: @@ -4151,6 +4165,13 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return -EPERM; + /* + * Updating the link count will likely cause i_uid and i_gid to + * be writen back improperly if their true value is unknown to + * the vfs. + */ + if (HAS_UNMAPPED_ID(inode)) + return -EPERM; if (!dir->i_op->link) return -EPERM; if (S_ISDIR(inode->i_mode)) @@ -4328,7 +4349,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, * Check source == target. * On overlayfs need to look at underlying inodes. */ - if (vfs_select_inode(old_dentry, 0) == vfs_select_inode(new_dentry, 0)) + if (d_real_inode(old_dentry) == d_real_inode(new_dentry)) return 0; error = may_delete(old_dir, old_dentry, is_dir); diff --git a/fs/namespace.c b/fs/namespace.c index 419f746d851d..7bb2cda3bfef 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2186,13 +2186,7 @@ static int do_remount(struct path *path, int flags, int mnt_flags, } if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) && !(mnt_flags & MNT_NODEV)) { - /* Was the nodev implicitly added in mount? */ - if ((mnt->mnt_ns->user_ns != &init_user_ns) && - !(sb->s_type->fs_flags & FS_USERNS_DEV_MOUNT)) { - mnt_flags |= MNT_NODEV; - } else { - return -EPERM; - } + return -EPERM; } if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) && !(mnt_flags & MNT_NOSUID)) { @@ -2376,7 +2370,7 @@ unlock: return err; } -static bool fs_fully_visible(struct file_system_type *fs_type, int *new_mnt_flags); +static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags); /* * create a new mount for userspace and request it to be added into the @@ -2386,7 +2380,6 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, int mnt_flags, const char *name, void *data) { struct file_system_type *type; - struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; struct vfsmount *mnt; int err; @@ -2397,26 +2390,6 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, if (!type) return -ENODEV; - if (user_ns != &init_user_ns) { - if (!(type->fs_flags & FS_USERNS_MOUNT)) { - put_filesystem(type); - return -EPERM; - } - /* Only in special cases allow devices from mounts - * created outside the initial user namespace. - */ - if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) { - flags |= MS_NODEV; - mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV; - } - if (type->fs_flags & FS_USERNS_VISIBLE) { - if (!fs_fully_visible(type, &mnt_flags)) { - put_filesystem(type); - return -EPERM; - } - } - } - mnt = vfs_kern_mount(type, flags, name, data); if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && !mnt->mnt_sb->s_subtype) @@ -2426,6 +2399,11 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, if (IS_ERR(mnt)) return PTR_ERR(mnt); + if (mount_too_revealing(mnt, &mnt_flags)) { + mntput(mnt); + return -EPERM; + } + err = do_add_mount(real_mount(mnt), path, mnt_flags); if (err) mntput(mnt); @@ -3217,22 +3195,19 @@ bool current_chrooted(void) return chrooted; } -static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags) +static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new, + int *new_mnt_flags) { - struct mnt_namespace *ns = current->nsproxy->mnt_ns; int new_flags = *new_mnt_flags; struct mount *mnt; bool visible = false; - if (unlikely(!ns)) - return false; - down_read(&namespace_sem); list_for_each_entry(mnt, &ns->list, mnt_list) { struct mount *child; int mnt_flags; - if (mnt->mnt.mnt_sb->s_type != type) + if (mnt->mnt.mnt_sb->s_type != new->mnt_sb->s_type) continue; /* This mount is not fully visible if it's root directory @@ -3241,12 +3216,8 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags) if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root) continue; - /* Read the mount flags and filter out flags that - * may safely be ignored. - */ + /* A local view of the mount flags */ mnt_flags = mnt->mnt.mnt_flags; - if (mnt->mnt.mnt_sb->s_iflags & SB_I_NOEXEC) - mnt_flags &= ~(MNT_LOCK_NOSUID | MNT_LOCK_NOEXEC); /* Don't miss readonly hidden in the superblock flags */ if (mnt->mnt.mnt_sb->s_flags & MS_RDONLY) @@ -3258,15 +3229,6 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags) if ((mnt_flags & MNT_LOCK_READONLY) && !(new_flags & MNT_READONLY)) continue; - if ((mnt_flags & MNT_LOCK_NODEV) && - !(new_flags & MNT_NODEV)) - continue; - if ((mnt_flags & MNT_LOCK_NOSUID) && - !(new_flags & MNT_NOSUID)) - continue; - if ((mnt_flags & MNT_LOCK_NOEXEC) && - !(new_flags & MNT_NOEXEC)) - continue; if ((mnt_flags & MNT_LOCK_ATIME) && ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK))) continue; @@ -3286,9 +3248,6 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags) } /* Preserve the locked attributes */ *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \ - MNT_LOCK_NODEV | \ - MNT_LOCK_NOSUID | \ - MNT_LOCK_NOEXEC | \ MNT_LOCK_ATIME); visible = true; goto found; @@ -3299,6 +3258,42 @@ found: return visible; } +static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags) +{ + const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV; + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + unsigned long s_iflags; + + if (ns->user_ns == &init_user_ns) + return false; + + /* Can this filesystem be too revealing? */ + s_iflags = mnt->mnt_sb->s_iflags; + if (!(s_iflags & SB_I_USERNS_VISIBLE)) + return false; + + if ((s_iflags & required_iflags) != required_iflags) { + WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n", + required_iflags); + return true; + } + + return !mnt_already_visible(ns, mnt, new_mnt_flags); +} + +bool mnt_may_suid(struct vfsmount *mnt) +{ + /* + * Foreign mounts (accessed via fchdir or through /proc + * symlinks) are always treated as if they are nosuid. This + * prevents namespaces from trusting potentially unsafe + * suid/sgid bits, file caps, or security labels that originate + * in other namespaces. + */ + return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) && + current_in_userns(mnt->mnt_sb->s_user_ns); +} + static struct ns_common *mntns_get(struct task_struct *task) { struct ns_common *ns = NULL; diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index bfdad003ee56..9add7ab747a5 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -139,7 +139,7 @@ ncp_hash_dentry(const struct dentry *dentry, struct qstr *this) int i; t = NCP_IO_TABLE(sb); - hash = init_name_hash(); + hash = init_name_hash(dentry); for (i=0; i<this->len ; i++) hash = partial_name_hash(ncp_tolower(t, this->name[i]), hash); diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 8664417955a2..6abdda209642 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o CFLAGS_nfstrace.o += -I$(src) nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ - direct.o pagelist.o read.o symlink.o unlink.o \ + io.o direct.o pagelist.o read.o symlink.o unlink.o \ write.o namespace.o mount_clnt.o nfstrace.o nfs-$(CONFIG_ROOT_NFS) += nfsroot.o nfs-$(CONFIG_SYSCTL) += sysctl.o diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index e5b89675263e..a69ef4e9c24c 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -65,8 +65,8 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) if (!p) return -EIO; b->simple.nr_sigs = be32_to_cpup(p++); - if (!b->simple.nr_sigs) { - dprintk("no signature\n"); + if (!b->simple.nr_sigs || b->simple.nr_sigs > PNFS_BLOCK_MAX_UUIDS) { + dprintk("Bad signature count: %d\n", b->simple.nr_sigs); return -EIO; } @@ -89,7 +89,8 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) memcpy(&b->simple.sigs[i].sig, p, b->simple.sigs[i].sig_len); - b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len; + b->simple.len += 8 + 4 + \ + (XDR_QUADLEN(b->simple.sigs[i].sig_len) << 2); } break; case PNFS_BLOCK_VOLUME_SLICE: @@ -104,7 +105,12 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) p = xdr_inline_decode(xdr, 4); if (!p) return -EIO; + b->concat.volumes_count = be32_to_cpup(p++); + if (b->concat.volumes_count > PNFS_BLOCK_MAX_DEVICES) { + dprintk("Too many volumes: %d\n", b->concat.volumes_count); + return -EIO; + } p = xdr_inline_decode(xdr, b->concat.volumes_count * 4); if (!p) @@ -116,8 +122,13 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) p = xdr_inline_decode(xdr, 8 + 4); if (!p) return -EIO; + p = xdr_decode_hyper(p, &b->stripe.chunk_size); b->stripe.volumes_count = be32_to_cpup(p++); + if (b->stripe.volumes_count > PNFS_BLOCK_MAX_DEVICES) { + dprintk("Too many volumes: %d\n", b->stripe.volumes_count); + return -EIO; + } p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4); if (!p) @@ -224,18 +235,20 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d, struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) { struct pnfs_block_volume *v = &volumes[idx]; + struct block_device *bdev; dev_t dev; dev = bl_resolve_deviceid(server, v, gfp_mask); if (!dev) return -EIO; - d->bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL); - if (IS_ERR(d->bdev)) { + bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL); + if (IS_ERR(bdev)) { printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", - MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev)); - return PTR_ERR(d->bdev); + MAJOR(dev), MINOR(dev), PTR_ERR(bdev)); + return PTR_ERR(bdev); } + d->bdev = bdev; d->len = i_size_read(d->bdev->bd_inode); @@ -287,44 +300,71 @@ bl_validate_designator(struct pnfs_block_volume *v) } } +/* + * Try to open the udev path for the WWN. At least on Debian the udev + * by-id path will always point to the dm-multipath device if one exists. + */ +static struct block_device * +bl_open_udev_path(struct pnfs_block_volume *v) +{ + struct block_device *bdev; + const char *devname; + + devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%*phN", + v->scsi.designator_len, v->scsi.designator); + if (!devname) + return ERR_PTR(-ENOMEM); + + bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL); + if (IS_ERR(bdev)) { + pr_warn("pNFS: failed to open device %s (%ld)\n", + devname, PTR_ERR(bdev)); + } + + kfree(devname); + return bdev; +} + +/* + * Try to open the RH/Fedora specific dm-mpath udev path for this WWN, as the + * wwn- links will only point to the first discovered SCSI device there. + */ +static struct block_device * +bl_open_dm_mpath_udev_path(struct pnfs_block_volume *v) +{ + struct block_device *bdev; + const char *devname; + + devname = kasprintf(GFP_KERNEL, + "/dev/disk/by-id/dm-uuid-mpath-%d%*phN", + v->scsi.designator_type, + v->scsi.designator_len, v->scsi.designator); + if (!devname) + return ERR_PTR(-ENOMEM); + + bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL); + kfree(devname); + return bdev; +} + static int bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) { struct pnfs_block_volume *v = &volumes[idx]; + struct block_device *bdev; const struct pr_ops *ops; - const char *devname; int error; if (!bl_validate_designator(v)) return -EINVAL; - switch (v->scsi.designator_len) { - case 8: - devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%8phN", - v->scsi.designator); - break; - case 12: - devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%12phN", - v->scsi.designator); - break; - case 16: - devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%16phN", - v->scsi.designator); - break; - default: - return -EINVAL; - } - - d->bdev = blkdev_get_by_path(devname, FMODE_READ, NULL); - if (IS_ERR(d->bdev)) { - pr_warn("pNFS: failed to open device %s (%ld)\n", - devname, PTR_ERR(d->bdev)); - kfree(devname); - return PTR_ERR(d->bdev); - } - - kfree(devname); + bdev = bl_open_dm_mpath_udev_path(v); + if (IS_ERR(bdev)) + bdev = bl_open_udev_path(v); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + d->bdev = bdev; d->len = i_size_read(d->bdev->bd_inode); d->map = bl_map_simple; @@ -352,7 +392,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, return 0; out_blkdev_put: - blkdev_put(d->bdev, FMODE_READ); + blkdev_put(d->bdev, FMODE_READ | FMODE_WRITE); return error; } diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c index 720b3ff55fa9..992bcb19c11e 100644 --- a/fs/nfs/blocklayout/extent_tree.c +++ b/fs/nfs/blocklayout/extent_tree.c @@ -121,6 +121,16 @@ ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be) return be; } +static void __ext_put_deviceids(struct list_head *head) +{ + struct pnfs_block_extent *be, *tmp; + + list_for_each_entry_safe(be, tmp, head, be_list) { + nfs4_put_deviceid_node(be->be_device); + kfree(be); + } +} + static void __ext_tree_insert(struct rb_root *root, struct pnfs_block_extent *new, bool merge_ok) @@ -163,7 +173,8 @@ free_new: } static int -__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end) +__ext_tree_remove(struct rb_root *root, + sector_t start, sector_t end, struct list_head *tmp) { struct pnfs_block_extent *be; sector_t len1 = 0, len2 = 0; @@ -223,8 +234,7 @@ __ext_tree_remove(struct rb_root *root, sector_t start, sector_t end) struct pnfs_block_extent *next = ext_tree_next(be); rb_erase(&be->be_node, root); - nfs4_put_deviceid_node(be->be_device); - kfree(be); + list_add_tail(&be->be_list, tmp); be = next; } @@ -350,16 +360,18 @@ int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start, sector_t end) { int err, err2; + LIST_HEAD(tmp); spin_lock(&bl->bl_ext_lock); - err = __ext_tree_remove(&bl->bl_ext_ro, start, end); + err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp); if (rw) { - err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end); + err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end, &tmp); if (!err) err = err2; } spin_unlock(&bl->bl_ext_lock); + __ext_put_deviceids(&tmp); return err; } @@ -396,12 +408,13 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, sector_t end = start + len; struct pnfs_block_extent *be; int err = 0; + LIST_HEAD(tmp); spin_lock(&bl->bl_ext_lock); /* * First remove all COW extents or holes from written to range. */ - err = __ext_tree_remove(&bl->bl_ext_ro, start, end); + err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp); if (err) goto out; @@ -459,6 +472,8 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, } out: spin_unlock(&bl->bl_ext_lock); + + __ext_put_deviceids(&tmp); return err; } diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index aaa2e8d3df6f..c92a75e066a6 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -119,27 +119,30 @@ out: * hashed by filehandle. */ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, - struct nfs_fh *fh, nfs4_stateid *stateid) + struct nfs_fh *fh) { struct nfs_server *server; + struct nfs_inode *nfsi; struct inode *ino; struct pnfs_layout_hdr *lo; +restart: list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { list_for_each_entry(lo, &server->layouts, plh_layouts) { - if (!nfs4_stateid_match_other(&lo->plh_stateid, stateid)) + nfsi = NFS_I(lo->plh_inode); + if (nfs_compare_fh(fh, &nfsi->fh)) continue; - if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh)) + if (nfsi->layout != lo) continue; ino = igrab(lo->plh_inode); if (!ino) break; spin_lock(&ino->i_lock); /* Is this layout in the process of being freed? */ - if (NFS_I(ino)->layout != lo) { + if (nfsi->layout != lo) { spin_unlock(&ino->i_lock); iput(ino); - break; + goto restart; } pnfs_get_layout_hdr(lo); spin_unlock(&ino->i_lock); @@ -151,13 +154,13 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, } static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, - struct nfs_fh *fh, nfs4_stateid *stateid) + struct nfs_fh *fh) { struct pnfs_layout_hdr *lo; spin_lock(&clp->cl_lock); rcu_read_lock(); - lo = get_layout_by_fh_locked(clp, fh, stateid); + lo = get_layout_by_fh_locked(clp, fh); rcu_read_unlock(); spin_unlock(&clp->cl_lock); @@ -167,17 +170,39 @@ static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, /* * Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing) */ -static bool pnfs_check_stateid_sequence(struct pnfs_layout_hdr *lo, +static u32 pnfs_check_callback_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new) { u32 oldseq, newseq; - oldseq = be32_to_cpu(lo->plh_stateid.seqid); + /* Is the stateid still not initialised? */ + if (!pnfs_layout_is_valid(lo)) + return NFS4ERR_DELAY; + + /* Mismatched stateid? */ + if (!nfs4_stateid_match_other(&lo->plh_stateid, new)) + return NFS4ERR_BAD_STATEID; + newseq = be32_to_cpu(new->seqid); + /* Are we already in a layout recall situation? */ + if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) && + lo->plh_return_seq != 0) { + if (newseq < lo->plh_return_seq) + return NFS4ERR_OLD_STATEID; + if (newseq > lo->plh_return_seq) + return NFS4ERR_DELAY; + goto out; + } + /* Check that the stateid matches what we think it should be. */ + oldseq = be32_to_cpu(lo->plh_stateid.seqid); if (newseq > oldseq + 1) - return false; - return true; + return NFS4ERR_DELAY; + /* Crazy server! */ + if (newseq <= oldseq) + return NFS4ERR_OLD_STATEID; +out: + return NFS_OK; } static u32 initiate_file_draining(struct nfs_client *clp, @@ -188,7 +213,7 @@ static u32 initiate_file_draining(struct nfs_client *clp, u32 rv = NFS4ERR_NOMATCHING_LAYOUT; LIST_HEAD(free_me_list); - lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid); + lo = get_layout_by_fh(clp, &args->cbl_fh); if (!lo) { trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL, &args->cbl_stateid, -rv); @@ -196,18 +221,15 @@ static u32 initiate_file_draining(struct nfs_client *clp, } ino = lo->plh_inode; + pnfs_layoutcommit_inode(ino, false); + spin_lock(&ino->i_lock); - if (!pnfs_check_stateid_sequence(lo, &args->cbl_stateid)) { - rv = NFS4ERR_DELAY; + rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid); + if (rv != NFS_OK) goto unlock; - } pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); - spin_unlock(&ino->i_lock); - - pnfs_layoutcommit_inode(ino, false); - spin_lock(&ino->i_lock); /* * Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return) */ @@ -223,11 +245,13 @@ static u32 initiate_file_draining(struct nfs_client *clp, goto unlock; } + /* Embrace your forgetfulness! */ + rv = NFS4ERR_NOMATCHING_LAYOUT; + if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &args->cbl_range); } - pnfs_mark_layout_returned_if_empty(lo); unlock: spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&free_me_list); diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index d81f96aacd51..656f68f7fe53 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -925,7 +925,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r if (hdr_arg.minorversion == 0) { cps.clp = nfs4_find_client_ident(SVC_NET(rqstp), hdr_arg.cb_ident); if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp)) - return rpc_drop_reply; + goto out_invalidcred; } cps.minorversion = hdr_arg.minorversion; @@ -953,6 +953,10 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r nfs_put_client(cps.clp); dprintk("%s: done, status = %u\n", __func__, ntohl(status)); return rpc_success; + +out_invalidcred: + pr_warn_ratelimited("NFS: NFSv4 callback contains invalid cred\n"); + return rpc_autherr_badcred; } /* diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 0c96528db94a..003ebce4bbc4 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -367,8 +367,6 @@ nfs_found_client(const struct nfs_client_initdata *cl_init, */ struct nfs_client * nfs_get_client(const struct nfs_client_initdata *cl_init, - const struct rpc_timeout *timeparms, - const char *ip_addr, rpc_authflavor_t authflavour) { struct nfs_client *clp, *new = NULL; @@ -399,7 +397,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init, &nn->nfs_client_list); spin_unlock(&nn->nfs_client_lock); new->cl_flags = cl_init->init_flags; - return rpc_ops->init_client(new, timeparms, ip_addr); + return rpc_ops->init_client(new, cl_init); } spin_unlock(&nn->nfs_client_lock); @@ -470,7 +468,7 @@ EXPORT_SYMBOL_GPL(nfs_init_timeout_values); * Create an RPC client handle */ int nfs_create_rpc_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, + const struct nfs_client_initdata *cl_init, rpc_authflavor_t flavor) { struct rpc_clnt *clnt = NULL; @@ -479,8 +477,9 @@ int nfs_create_rpc_client(struct nfs_client *clp, .protocol = clp->cl_proto, .address = (struct sockaddr *)&clp->cl_addr, .addrsize = clp->cl_addrlen, - .timeout = timeparms, + .timeout = cl_init->timeparms, .servername = clp->cl_hostname, + .nodename = cl_init->nodename, .program = &nfs_program, .version = clp->rpc_ops->version, .authflavor = flavor, @@ -591,14 +590,12 @@ EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient); * nfs_init_client - Initialise an NFS2 or NFS3 client * * @clp: nfs_client to initialise - * @timeparms: timeout parameters for underlying RPC transport - * @ip_addr: IP presentation address (not used) + * @cl_init: Initialisation parameters * * Returns pointer to an NFS client, or an ERR_PTR value. */ struct nfs_client *nfs_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const char *ip_addr) + const struct nfs_client_initdata *cl_init) { int error; @@ -612,7 +609,7 @@ struct nfs_client *nfs_init_client(struct nfs_client *clp, * Create a client RPC handle for doing FSSTAT with UNIX auth only * - RFC 2623, sec 2.3.2 */ - error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); + error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); if (error < 0) goto error; nfs_mark_client_ready(clp, NFS_CS_READY); @@ -633,6 +630,7 @@ static int nfs_init_server(struct nfs_server *server, const struct nfs_parsed_mount_data *data, struct nfs_subversion *nfs_mod) { + struct rpc_timeout timeparms; struct nfs_client_initdata cl_init = { .hostname = data->nfs_server.hostname, .addr = (const struct sockaddr *)&data->nfs_server.address, @@ -640,8 +638,8 @@ static int nfs_init_server(struct nfs_server *server, .nfs_mod = nfs_mod, .proto = data->nfs_server.protocol, .net = data->net, + .timeparms = &timeparms, }; - struct rpc_timeout timeparms; struct nfs_client *clp; int error; @@ -653,7 +651,7 @@ static int nfs_init_server(struct nfs_server *server, set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); /* Allocate or find a client reference we can use */ - clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX); + clp = nfs_get_client(&cl_init, RPC_AUTH_UNIX); if (IS_ERR(clp)) { dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); return PTR_ERR(clp); @@ -1102,7 +1100,6 @@ static const struct file_operations nfs_server_list_fops = { .read = seq_read, .llseek = seq_lseek, .release = seq_release_net, - .owner = THIS_MODULE, }; static int nfs_volume_list_open(struct inode *inode, struct file *file); @@ -1123,7 +1120,6 @@ static const struct file_operations nfs_volume_list_fops = { .read = seq_read, .llseek = seq_lseek, .release = seq_release_net, - .owner = THIS_MODULE, }; /* diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 19d93d0cd400..177fefb26c18 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -232,7 +232,7 @@ int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int le * in a page cache page which kmemleak does not scan. */ kmemleak_not_leak(string->name); - string->hash = full_name_hash(name, len); + string->hash = full_name_hash(NULL, name, len); return 0; } @@ -502,7 +502,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) if (filename.len == 2 && filename.name[1] == '.') return; } - filename.hash = full_name_hash(filename.name, filename.len); + filename.hash = full_name_hash(parent, filename.name, filename.len); dentry = d_lookup(parent, &filename); again: @@ -734,7 +734,7 @@ struct page *get_cache_page(nfs_readdir_descriptor_t *desc) struct page *page; for (;;) { - page = read_cache_page(file_inode(desc->file)->i_mapping, + page = read_cache_page(desc->file->f_mapping, desc->page_index, (filler_t *)nfs_readdir_filler, desc); if (IS_ERR(page) || grab_page(page)) break; @@ -1397,19 +1397,18 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in if (IS_ERR(label)) goto out; - /* Protect against concurrent sillydeletes */ trace_nfs_lookup_enter(dir, dentry, flags); error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); if (error == -ENOENT) goto no_entry; if (error < 0) { res = ERR_PTR(error); - goto out_unblock_sillyrename; + goto out_label; } inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label); res = ERR_CAST(inode); if (IS_ERR(res)) - goto out_unblock_sillyrename; + goto out_label; /* Success: notify readdir to use READDIRPLUS */ nfs_advise_use_readdirplus(dir); @@ -1418,11 +1417,11 @@ no_entry: res = d_splice_alias(inode, dentry); if (res != NULL) { if (IS_ERR(res)) - goto out_unblock_sillyrename; + goto out_label; dentry = res; } nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); -out_unblock_sillyrename: +out_label: trace_nfs_lookup_exit(dir, dentry, flags, error); nfs4_label_free(label); out: @@ -2253,21 +2252,37 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, st return NULL; } -static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res) +static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res, bool may_block) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_access_entry *cache; - int err = -ENOENT; + bool retry = true; + int err; spin_lock(&inode->i_lock); - if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) - goto out_zap; - cache = nfs_access_search_rbtree(inode, cred); - if (cache == NULL) - goto out; - if (!nfs_have_delegated_attributes(inode) && - !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) - goto out_stale; + for(;;) { + if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) + goto out_zap; + cache = nfs_access_search_rbtree(inode, cred); + err = -ENOENT; + if (cache == NULL) + goto out; + /* Found an entry, is our attribute cache valid? */ + if (!nfs_attribute_cache_expired(inode) && + !(nfsi->cache_validity & NFS_INO_INVALID_ATTR)) + break; + err = -ECHILD; + if (!may_block) + goto out; + if (!retry) + goto out_zap; + spin_unlock(&inode->i_lock); + err = __nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (err) + return err; + spin_lock(&inode->i_lock); + retry = false; + } res->jiffies = cache->jiffies; res->cred = cache->cred; res->mask = cache->mask; @@ -2276,12 +2291,6 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str out: spin_unlock(&inode->i_lock); return err; -out_stale: - rb_erase(&cache->rb_node, &nfsi->access_cache); - list_del(&cache->lru); - spin_unlock(&inode->i_lock); - nfs_access_free_entry(cache); - return -ENOENT; out_zap: spin_unlock(&inode->i_lock); nfs_access_zap_cache(inode); @@ -2308,13 +2317,12 @@ static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred, cache = NULL; if (cache == NULL) goto out; - if (!nfs_have_delegated_attributes(inode) && - !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) + err = nfs_revalidate_inode_rcu(NFS_SERVER(inode), inode); + if (err) goto out; res->jiffies = cache->jiffies; res->cred = cache->cred; res->mask = cache->mask; - err = 0; out: rcu_read_unlock(); return err; @@ -2403,18 +2411,19 @@ EXPORT_SYMBOL_GPL(nfs_access_set_mask); static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) { struct nfs_access_entry cache; + bool may_block = (mask & MAY_NOT_BLOCK) == 0; int status; trace_nfs_access_enter(inode); status = nfs_access_get_cached_rcu(inode, cred, &cache); if (status != 0) - status = nfs_access_get_cached(inode, cred, &cache); + status = nfs_access_get_cached(inode, cred, &cache, may_block); if (status == 0) goto out_cached; status = -ECHILD; - if (mask & MAY_NOT_BLOCK) + if (!may_block) goto out; /* Be clever: ask server to check for all possible rights */ diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index c7326c2af2c3..72b7d13ee3c6 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -196,6 +196,12 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq, WARN_ON_ONCE(verfp->committed < 0); } +static int nfs_direct_cmp_verf(const struct nfs_writeverf *v1, + const struct nfs_writeverf *v2) +{ + return nfs_write_verifier_cmp(&v1->verifier, &v2->verifier); +} + /* * nfs_direct_cmp_hdr_verf - compare verifier for pgio header * @dreq - direct request possibly spanning multiple servers @@ -215,7 +221,7 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq, nfs_direct_set_hdr_verf(dreq, hdr); return 0; } - return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); + return nfs_direct_cmp_verf(verfp, &hdr->verf); } /* @@ -238,15 +244,13 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq, if (verfp->committed < 0) return 1; - return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf)); + return nfs_direct_cmp_verf(verfp, &data->verf); } /** * nfs_direct_IO - NFS address space operation for direct I/O * @iocb: target I/O control block - * @iov: array of vectors that define I/O buffer - * @pos: offset in file to begin the operation - * @nr_segs: size of iovec array + * @iter: I/O buffer * * The presence of this routine in the address space ops vector means * the NFS client supports direct I/O. However, for most direct IO, we @@ -368,22 +372,10 @@ out: * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust * the iocb is still valid here if this is a synchronous request. */ -static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) +static void nfs_direct_complete(struct nfs_direct_req *dreq) { struct inode *inode = dreq->inode; - if (dreq->iocb && write) { - loff_t pos = dreq->iocb->ki_pos + dreq->count; - - spin_lock(&inode->i_lock); - if (i_size_read(inode) < pos) - i_size_write(inode, pos); - spin_unlock(&inode->i_lock); - } - - if (write) - nfs_zap_mapping(inode, inode->i_mapping); - inode_dio_end(inode); if (dreq->iocb) { @@ -438,7 +430,7 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) } out_put: if (put_dreq(dreq)) - nfs_direct_complete(dreq, false); + nfs_direct_complete(dreq); hdr->release(hdr); } @@ -544,7 +536,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, } if (put_dreq(dreq)) - nfs_direct_complete(dreq, false); + nfs_direct_complete(dreq); return 0; } @@ -585,17 +577,12 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) if (!count) goto out; - inode_lock(inode); - result = nfs_sync_mapping(mapping); - if (result) - goto out_unlock; - task_io_account_read(count); result = -ENOMEM; dreq = nfs_direct_req_alloc(); if (dreq == NULL) - goto out_unlock; + goto out; dreq->inode = inode; dreq->bytes_left = dreq->max_count = count; @@ -610,10 +597,12 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; + nfs_start_io_direct(inode); + NFS_I(inode)->read_io += count; result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); - inode_unlock(inode); + nfs_end_io_direct(inode); if (!result) { result = nfs_direct_wait(dreq); @@ -621,13 +610,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) iocb->ki_pos += result; } - nfs_direct_req_release(dreq); - return result; - out_release: nfs_direct_req_release(dreq); -out_unlock: - inode_unlock(inode); out: return result; } @@ -659,6 +643,8 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); dreq->count = 0; + dreq->verf.committed = NFS_INVALID_STABLE_HOW; + nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo); for (i = 0; i < dreq->mirror_count; i++) dreq->mirrors[i].count = 0; get_dreq(dreq); @@ -777,7 +763,8 @@ static void nfs_direct_write_schedule_work(struct work_struct *work) nfs_direct_write_reschedule(dreq); break; default: - nfs_direct_complete(dreq, true); + nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping); + nfs_direct_complete(dreq); } } @@ -993,6 +980,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) { ssize_t result = -EINVAL; + size_t count; struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; @@ -1003,34 +991,24 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", file, iov_iter_count(iter), (long long) iocb->ki_pos); - nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, - iov_iter_count(iter)); + result = generic_write_checks(iocb, iter); + if (result <= 0) + return result; + count = result; + nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); pos = iocb->ki_pos; end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT; - inode_lock(inode); - - result = nfs_sync_mapping(mapping); - if (result) - goto out_unlock; - - if (mapping->nrpages) { - result = invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); - if (result) - goto out_unlock; - } - - task_io_account_write(iov_iter_count(iter)); + task_io_account_write(count); result = -ENOMEM; dreq = nfs_direct_req_alloc(); if (!dreq) - goto out_unlock; + goto out; dreq->inode = inode; - dreq->bytes_left = dreq->max_count = iov_iter_count(iter); + dreq->bytes_left = dreq->max_count = count; dreq->io_start = pos; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); l_ctx = nfs_get_lock_context(dreq->ctx); @@ -1042,6 +1020,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; + nfs_start_io_direct(inode); + result = nfs_direct_write_schedule_iovec(dreq, iter, pos); if (mapping->nrpages) { @@ -1049,30 +1029,19 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) pos >> PAGE_SHIFT, end); } - inode_unlock(inode); + nfs_end_io_direct(inode); if (!result) { result = nfs_direct_wait(dreq); if (result > 0) { - struct inode *inode = mapping->host; - iocb->ki_pos = pos + result; - spin_lock(&inode->i_lock); - if (i_size_read(inode) < iocb->ki_pos) - i_size_write(inode, iocb->ki_pos); - spin_unlock(&inode->i_lock); - /* XXX: should check the generic_write_sync retval */ generic_write_sync(iocb, result); } } - nfs_direct_req_release(dreq); - return result; - out_release: nfs_direct_req_release(dreq); -out_unlock: - inode_unlock(inode); +out: return result; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 717a8d6af52d..7d620970f2e1 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -170,12 +170,14 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) iocb->ki_filp, iov_iter_count(to), (unsigned long) iocb->ki_pos); - result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping); + nfs_start_io_read(inode); + result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); if (!result) { result = generic_file_read_iter(iocb, to); if (result > 0) nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); } + nfs_end_io_read(inode); return result; } EXPORT_SYMBOL_GPL(nfs_file_read); @@ -191,12 +193,14 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos, dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n", filp, (unsigned long) count, (unsigned long long) *ppos); - res = nfs_revalidate_mapping_protected(inode, filp->f_mapping); + nfs_start_io_read(inode); + res = nfs_revalidate_mapping(inode, filp->f_mapping); if (!res) { res = generic_file_splice_read(filp, ppos, pipe, count, flags); if (res > 0) nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res); } + nfs_end_io_read(inode); return res; } EXPORT_SYMBOL_GPL(nfs_file_splice_read); @@ -272,16 +276,13 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) trace_nfs_fsync_enter(inode); - inode_dio_wait(inode); do { ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) break; - inode_lock(inode); ret = nfs_file_fsync_commit(file, start, end, datasync); if (!ret) ret = pnfs_sync_inode(inode, !!datasync); - inode_unlock(inode); /* * If nfs_file_fsync_commit detected a server reboot, then * resend all dirty pages that might have been covered by @@ -359,19 +360,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, file, mapping->host->i_ino, len, (long long) pos); start: - /* - * Prevent starvation issues if someone is doing a consistency - * sync-to-disk - */ - ret = wait_on_bit_action(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, - nfs_wait_bit_killable, TASK_KILLABLE); - if (ret) - return ret; - /* - * Wait for O_DIRECT to complete - */ - inode_dio_wait(mapping->host); - page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; @@ -432,7 +420,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, return status; NFS_I(mapping->host)->write_io += copied; - if (nfs_ctx_key_to_expire(ctx)) { + if (nfs_ctx_key_to_expire(ctx, mapping->host)) { status = nfs_wb_all(mapping->host); if (status < 0) return status; @@ -470,31 +458,8 @@ static void nfs_invalidate_page(struct page *page, unsigned int offset, */ static int nfs_release_page(struct page *page, gfp_t gfp) { - struct address_space *mapping = page->mapping; - dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); - /* Always try to initiate a 'commit' if relevant, but only - * wait for it if the caller allows blocking. Even then, - * only wait 1 second and only if the 'bdi' is not congested. - * Waiting indefinitely can cause deadlocks when the NFS - * server is on this machine, when a new TCP connection is - * needed and in other rare cases. There is no particular - * need to wait extensively here. A short wait has the - * benefit that someone else can worry about the freezer. - */ - if (mapping) { - struct nfs_server *nfss = NFS_SERVER(mapping->host); - nfs_commit_inode(mapping->host, 0); - if (gfpflags_allow_blocking(gfp) && - !bdi_write_congested(&nfss->backing_dev_info)) { - wait_on_page_bit_killable_timeout(page, PG_private, - HZ); - if (PagePrivate(page)) - set_bdi_congested(&nfss->backing_dev_info, - BLK_RW_ASYNC); - } - } /* If PagePrivate() is set, then the page is not freeable */ if (PagePrivate(page)) return 0; @@ -604,6 +569,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) filp, filp->f_mapping->host->i_ino, (long long)page_offset(page)); + sb_start_pagefault(inode->i_sb); + /* make sure the cache has finished storing the page */ nfs_fscache_wait_on_page_write(NFS_I(inode), page); @@ -630,6 +597,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) out_unlock: unlock_page(page); out: + sb_end_pagefault(inode->i_sb); return ret; } @@ -645,7 +613,7 @@ static int nfs_need_check_write(struct file *filp, struct inode *inode) ctx = nfs_file_open_context(filp); if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) || - nfs_ctx_key_to_expire(ctx)) + nfs_ctx_key_to_expire(ctx, inode)) return 1; return 0; } @@ -656,23 +624,17 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(file); unsigned long written = 0; ssize_t result; - size_t count = iov_iter_count(from); result = nfs_key_timeout_notify(file, inode); if (result) return result; - if (iocb->ki_flags & IOCB_DIRECT) { - result = generic_write_checks(iocb, from); - if (result <= 0) - return result; + if (iocb->ki_flags & IOCB_DIRECT) return nfs_file_direct_write(iocb, from); - } dprintk("NFS: write(%pD2, %zu@%Ld)\n", - file, count, (long long) iocb->ki_pos); + file, iov_iter_count(from), (long long) iocb->ki_pos); - result = -EBUSY; if (IS_SWAPFILE(inode)) goto out_swapfile; /* @@ -684,28 +646,33 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) goto out; } - result = count; - if (!count) + nfs_start_io_write(inode); + result = generic_write_checks(iocb, from); + if (result > 0) { + current->backing_dev_info = inode_to_bdi(inode); + result = generic_perform_write(file, from, iocb->ki_pos); + current->backing_dev_info = NULL; + } + nfs_end_io_write(inode); + if (result <= 0) goto out; - result = generic_file_write_iter(iocb, from); - if (result > 0) - written = result; + written = generic_write_sync(iocb, result); + iocb->ki_pos += written; /* Return error values */ - if (result >= 0 && nfs_need_check_write(file, inode)) { + if (nfs_need_check_write(file, inode)) { int err = vfs_fsync(file, 0); if (err < 0) result = err; } - if (result > 0) - nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); + nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); out: return result; out_swapfile: printk(KERN_INFO "NFS: attempt to write to active swap file!\n"); - goto out; + return -EBUSY; } EXPORT_SYMBOL_GPL(nfs_file_write); @@ -780,11 +747,6 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) } static int -is_time_granular(struct timespec *ts) { - return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000)); -} - -static int do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { struct inode *inode = filp->f_mapping->host; @@ -817,12 +779,8 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) * This makes locking act as a cache coherency point. */ nfs_sync_mapping(filp->f_mapping); - if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) { - if (is_time_granular(&NFS_SERVER(inode)->time_delta)) - __nfs_revalidate_inode(NFS_SERVER(inode), inode); - else - nfs_zap_caches(inode); - } + if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) + nfs_zap_mapping(inode, filp->f_mapping); out: return status; } diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index aa59757389dc..a3fc48ba4931 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -255,13 +255,16 @@ static int filelayout_read_done_cb(struct rpc_task *task, static void filelayout_set_layoutcommit(struct nfs_pgio_header *hdr) { + loff_t end_offs = 0; if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds || - hdr->res.verf->committed != NFS_DATA_SYNC) + hdr->res.verf->committed == NFS_FILE_SYNC) return; + if (hdr->res.verf->committed == NFS_DATA_SYNC) + end_offs = hdr->mds_offset + (loff_t)hdr->res.count; - pnfs_set_layoutcommit(hdr->inode, hdr->lseg, - hdr->mds_offset + hdr->res.count); + /* Note: if the write is unstable, don't set end_offs until commit */ + pnfs_set_layoutcommit(hdr->inode, hdr->lseg, end_offs); dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); } @@ -354,6 +357,12 @@ static int filelayout_write_done_cb(struct rpc_task *task, } filelayout_set_layoutcommit(hdr); + + /* zero out the fattr */ + hdr->fattr.valid = 0; + if (task->tk_status >= 0) + nfs_writeback_update_inode(hdr); + return 0; } @@ -375,8 +384,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task, return -EAGAIN; } - if (data->verf.committed == NFS_UNSTABLE) - pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); + pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); return 0; } diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 0e8018bc9880..e6206eaf2bdf 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1325,15 +1325,16 @@ ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg) * we always send layoutcommit after DS writes. */ static void -ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr) +ff_layout_set_layoutcommit(struct inode *inode, + struct pnfs_layout_segment *lseg, + loff_t end_offset) { - if (!ff_layout_need_layoutcommit(hdr->lseg)) + if (!ff_layout_need_layoutcommit(lseg)) return; - pnfs_set_layoutcommit(hdr->inode, hdr->lseg, - hdr->mds_offset + hdr->res.count); - dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, - (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); + pnfs_set_layoutcommit(inode, lseg, end_offset); + dprintk("%s inode %lu pls_end_pos %llu\n", __func__, inode->i_ino, + (unsigned long long) NFS_I(inode)->layout->plh_lwb); } static bool @@ -1469,6 +1470,7 @@ static void ff_layout_read_release(void *data) static int ff_layout_write_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { + loff_t end_offs = 0; int err; trace_nfs4_pnfs_write(hdr, task->tk_status); @@ -1494,7 +1496,10 @@ static int ff_layout_write_done_cb(struct rpc_task *task, if (hdr->res.verf->committed == NFS_FILE_SYNC || hdr->res.verf->committed == NFS_DATA_SYNC) - ff_layout_set_layoutcommit(hdr); + end_offs = hdr->mds_offset + (loff_t)hdr->res.count; + + /* Note: if the write is unstable, don't set end_offs until commit */ + ff_layout_set_layoutcommit(hdr->inode, hdr->lseg, end_offs); /* zero out fattr since we don't care DS attr at all */ hdr->fattr.valid = 0; @@ -1530,9 +1535,7 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, return -EAGAIN; } - if (data->verf.committed == NFS_UNSTABLE - && ff_layout_need_layoutcommit(data->lseg)) - pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); + ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb); return 0; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index dda689d7a8a7..bf4ec5ecc97e 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -662,9 +662,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) trace_nfs_getattr_enter(inode); /* Flush out writes to the server in order to update c/mtime. */ if (S_ISREG(inode->i_mode)) { - inode_lock(inode); - err = nfs_sync_inode(inode); - inode_unlock(inode); + err = filemap_write_and_wait(inode->i_mapping); if (err) goto out; } @@ -879,7 +877,10 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx) struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); - list_add(&ctx->list, &nfsi->open_files); + if (ctx->mode & FMODE_WRITE) + list_add(&ctx->list, &nfsi->open_files); + else + list_add_tail(&ctx->list, &nfsi->open_files); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context); @@ -972,6 +973,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) if (NFS_STALE(inode)) goto out; + /* pNFS: Attributes aren't updated until we layoutcommit */ + if (S_ISREG(inode->i_mode)) { + status = pnfs_sync_inode(inode, false); + if (status) + goto out; + } + status = -ENOMEM; fattr = nfs_alloc_fattr(); if (fattr == NULL) @@ -1122,14 +1130,12 @@ out: } /** - * __nfs_revalidate_mapping - Revalidate the pagecache + * nfs_revalidate_mapping - Revalidate the pagecache * @inode - pointer to host inode * @mapping - pointer to mapping - * @may_lock - take inode->i_mutex? */ -static int __nfs_revalidate_mapping(struct inode *inode, - struct address_space *mapping, - bool may_lock) +int nfs_revalidate_mapping(struct inode *inode, + struct address_space *mapping) { struct nfs_inode *nfsi = NFS_I(inode); unsigned long *bitlock = &nfsi->flags; @@ -1178,12 +1184,7 @@ static int __nfs_revalidate_mapping(struct inode *inode, nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; spin_unlock(&inode->i_lock); trace_nfs_invalidate_mapping_enter(inode); - if (may_lock) { - inode_lock(inode); - ret = nfs_invalidate_mapping(inode, mapping); - inode_unlock(inode); - } else - ret = nfs_invalidate_mapping(inode, mapping); + ret = nfs_invalidate_mapping(inode, mapping); trace_nfs_invalidate_mapping_exit(inode, ret); clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); @@ -1193,27 +1194,28 @@ out: return ret; } -/** - * nfs_revalidate_mapping - Revalidate the pagecache - * @inode - pointer to host inode - * @mapping - pointer to mapping - */ -int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) +static bool nfs_file_has_writers(struct nfs_inode *nfsi) { - return __nfs_revalidate_mapping(inode, mapping, false); + struct inode *inode = &nfsi->vfs_inode; + + assert_spin_locked(&inode->i_lock); + + if (!S_ISREG(inode->i_mode)) + return false; + if (list_empty(&nfsi->open_files)) + return false; + /* Note: This relies on nfsi->open_files being ordered with writers + * being placed at the head of the list. + * See nfs_inode_attach_open_context() + */ + return (list_first_entry(&nfsi->open_files, + struct nfs_open_context, + list)->mode & FMODE_WRITE) == FMODE_WRITE; } -/** - * nfs_revalidate_mapping_protected - Revalidate the pagecache - * @inode - pointer to host inode - * @mapping - pointer to mapping - * - * Differs from nfs_revalidate_mapping() in that it grabs the inode->i_mutex - * while invalidating the mapping. - */ -int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping) +static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi) { - return __nfs_revalidate_mapping(inode, mapping, true); + return nfs_file_has_writers(nfsi) && nfs_file_io_is_buffered(nfsi); } static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) @@ -1280,22 +1282,24 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) return -EIO; - if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && - inode->i_version != fattr->change_attr) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + if (!nfs_file_has_buffered_writers(nfsi)) { + /* Verify a few of the more important attributes */ + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && inode->i_version != fattr->change_attr) + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE; - /* Verify a few of the more important attributes */ - if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) - invalid |= NFS_INO_INVALID_ATTR; + if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) + invalid |= NFS_INO_INVALID_ATTR; - if (fattr->valid & NFS_ATTR_FATTR_SIZE) { - cur_size = i_size_read(inode); - new_isize = nfs_size_to_loff_t(fattr->size); - if (cur_size != new_isize) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec_equal(&inode->i_ctime, &fattr->ctime)) + invalid |= NFS_INO_INVALID_ATTR; + + if (fattr->valid & NFS_ATTR_FATTR_SIZE) { + cur_size = i_size_read(inode); + new_isize = nfs_size_to_loff_t(fattr->size); + if (cur_size != new_isize) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + } } - if (nfsi->nrequests != 0) - invalid &= ~NFS_INO_REVAL_PAGECACHE; /* Have any file permissions changed? */ if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) @@ -1470,28 +1474,12 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); } -/* - * Don't trust the change_attribute, mtime, ctime or size if - * a pnfs LAYOUTCOMMIT is outstanding - */ -static void nfs_inode_attrs_handle_layoutcommit(struct inode *inode, - struct nfs_fattr *fattr) -{ - if (pnfs_layoutcommit_outstanding(inode)) - fattr->valid &= ~(NFS_ATTR_FATTR_CHANGE | - NFS_ATTR_FATTR_MTIME | - NFS_ATTR_FATTR_CTIME | - NFS_ATTR_FATTR_SIZE); -} - static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) { int ret; trace_nfs_refresh_inode_enter(inode); - nfs_inode_attrs_handle_layoutcommit(inode, fattr); - if (nfs_inode_attrs_need_update(inode, fattr)) ret = nfs_update_inode(inode, fattr); else @@ -1527,7 +1515,7 @@ EXPORT_SYMBOL_GPL(nfs_refresh_inode); static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) { - unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + unsigned long invalid = NFS_INO_INVALID_ATTR; /* * Don't revalidate the pagecache if we hold a delegation, but do @@ -1676,6 +1664,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) unsigned long invalid = 0; unsigned long now = jiffies; unsigned long save_cache_validity; + bool have_writers = nfs_file_has_buffered_writers(nfsi); bool cache_revalidated = true; dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", @@ -1725,17 +1714,25 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* Do atomic weak cache consistency updates */ invalid |= nfs_wcc_update_inode(inode, fattr); + if (pnfs_layoutcommit_outstanding(inode)) { + nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATTR; + cache_revalidated = false; + } + /* More cache consistency checks */ if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { if (inode->i_version != fattr->change_attr) { dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); - invalid |= NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_DATA - | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL; - if (S_ISDIR(inode->i_mode)) - nfs_force_lookup_revalidate(inode); + /* Could it be a race with writeback? */ + if (!have_writers) { + invalid |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; + if (S_ISDIR(inode->i_mode)) + nfs_force_lookup_revalidate(inode); + } inode->i_version = fattr->change_attr; } } else { @@ -1768,9 +1765,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (new_isize != cur_isize) { /* Do we perhaps have any outstanding writes, or has * the file grown beyond our last write? */ - if ((nfsi->nrequests == 0) || new_isize > cur_isize) { + if (nfsi->nrequests == 0 || new_isize > cur_isize) { i_size_write(inode, new_isize); - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + if (!have_writers) + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; } dprintk("NFS: isize change on server for file %s/%ld " "(%Ld to %Ld)\n", diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 5154fa65a2f2..7ce5e023c3c3 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -66,13 +66,16 @@ struct nfs_clone_mount { struct nfs_client_initdata { unsigned long init_flags; - const char *hostname; - const struct sockaddr *addr; + const char *hostname; /* Hostname of the server */ + const struct sockaddr *addr; /* Address of the server */ + const char *nodename; /* Hostname of the client */ + const char *ip_addr; /* IP address of the client */ size_t addrlen; struct nfs_subversion *nfs_mod; int proto; u32 minorversion; struct net *net; + const struct rpc_timeout *timeparms; }; /* @@ -147,9 +150,8 @@ extern void nfs_umount(const struct nfs_mount_request *info); extern const struct rpc_program nfs_program; extern void nfs_clients_init(struct net *net); extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *); -int nfs_create_rpc_client(struct nfs_client *, const struct rpc_timeout *, rpc_authflavor_t); +int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t); struct nfs_client *nfs_get_client(const struct nfs_client_initdata *, - const struct rpc_timeout *, const char *, rpc_authflavor_t); int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *); void nfs_server_insert_lists(struct nfs_server *); @@ -184,7 +186,7 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *, rpc_authflavor_t); extern int nfs_wait_client_init_complete(const struct nfs_client *clp); extern void nfs_mark_client_ready(struct nfs_client *clp, int state); -extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, +extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, @@ -193,7 +195,7 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, rpc_authflavor_t au_flavor); extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, struct inode *); -extern struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, +extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, rpc_authflavor_t au_flavor); @@ -338,8 +340,7 @@ nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src) /* proc.c */ void nfs_close_context(struct nfs_open_context *ctx, int is_sync); extern struct nfs_client *nfs_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const char *ip_addr); + const struct nfs_client_initdata *); /* dir.c */ extern void nfs_force_use_readdirplus(struct inode *dir); @@ -411,6 +412,19 @@ extern void __exit unregister_nfs_fs(void); extern bool nfs_sb_active(struct super_block *sb); extern void nfs_sb_deactive(struct super_block *sb); +/* io.c */ +extern void nfs_start_io_read(struct inode *inode); +extern void nfs_end_io_read(struct inode *inode); +extern void nfs_start_io_write(struct inode *inode); +extern void nfs_end_io_write(struct inode *inode); +extern void nfs_start_io_direct(struct inode *inode); +extern void nfs_end_io_direct(struct inode *inode); + +static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi) +{ + return test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0; +} + /* namespace.c */ #define NFS_PATH_CANONICAL 1 extern char *nfs_path(char **p, struct dentry *dentry, @@ -496,9 +510,29 @@ void nfs_init_cinfo(struct nfs_commit_info *cinfo, struct inode *inode, struct nfs_direct_req *dreq); int nfs_key_timeout_notify(struct file *filp, struct inode *inode); -bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx); +bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode); void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio); +int nfs_filemap_write_and_wait_range(struct address_space *mapping, + loff_t lstart, loff_t lend); + +#ifdef CONFIG_NFS_V4_1 +static inline +void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) +{ + int i; + + for (i = 0; i < cinfo->nbuckets; i++) + cinfo->buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; +} +#else +static inline +void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) +{ +} +#endif + + #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); @@ -506,6 +540,13 @@ extern int nfs_migrate_page(struct address_space *, #define nfs_migrate_page NULL #endif +static inline int +nfs_write_verifier_cmp(const struct nfs_write_verifier *v1, + const struct nfs_write_verifier *v2) +{ + return memcmp(v1->data, v2->data, sizeof(v1->data)); +} + /* unlink.c */ extern struct rpc_task * nfs_async_rename(struct inode *old_dir, struct inode *new_dir, @@ -521,8 +562,7 @@ extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); /* nfs4proc.c */ extern void __nfs4_read_done_cb(struct nfs_pgio_header *); extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const char *ip_addr); + const struct nfs_client_initdata *); extern int nfs40_walk_client_list(struct nfs_client *clp, struct nfs_client **result, struct rpc_cred *cred); @@ -623,7 +663,7 @@ void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo) if (!cinfo->dreq) { struct inode *inode = page_file_mapping(page)->host; - inc_zone_page_state(page, NR_UNSTABLE_NFS); + inc_node_page_state(page, NR_UNSTABLE_NFS); inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } diff --git a/fs/nfs/io.c b/fs/nfs/io.c new file mode 100644 index 000000000000..1fc5d1ce327e --- /dev/null +++ b/fs/nfs/io.c @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2016 Trond Myklebust + * + * I/O and data path helper functionality. + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/bitops.h> +#include <linux/rwsem.h> +#include <linux/fs.h> +#include <linux/nfs_fs.h> + +#include "internal.h" + +/* Call with exclusively locked inode->i_rwsem */ +static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode) +{ + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { + clear_bit(NFS_INO_ODIRECT, &nfsi->flags); + inode_dio_wait(inode); + } +} + +/** + * nfs_start_io_read - declare the file is being used for buffered reads + * @inode - file inode + * + * Declare that a buffered read operation is about to start, and ensure + * that we block all direct I/O. + * On exit, the function ensures that the NFS_INO_ODIRECT flag is unset, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that buffered read operations are allowed to + * execute in parallel, thanks to the shared lock, whereas direct I/O + * operations need to wait to grab an exclusive lock in order to set + * NFS_INO_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. the reads. + */ +void +nfs_start_io_read(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + /* Be an optimist! */ + down_read(&inode->i_rwsem); + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0) + return; + up_read(&inode->i_rwsem); + /* Slow path.... */ + down_write(&inode->i_rwsem); + nfs_block_o_direct(nfsi, inode); + downgrade_write(&inode->i_rwsem); +} + +/** + * nfs_end_io_read - declare that the buffered read operation is done + * @inode - file inode + * + * Declare that a buffered read operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void +nfs_end_io_read(struct inode *inode) +{ + up_read(&inode->i_rwsem); +} + +/** + * nfs_start_io_write - declare the file is being used for buffered writes + * @inode - file inode + * + * Declare that a buffered read operation is about to start, and ensure + * that we block all direct I/O. + */ +void +nfs_start_io_write(struct inode *inode) +{ + down_write(&inode->i_rwsem); + nfs_block_o_direct(NFS_I(inode), inode); +} + +/** + * nfs_end_io_write - declare that the buffered write operation is done + * @inode - file inode + * + * Declare that a buffered write operation is done, and release the + * lock on inode->i_rwsem. + */ +void +nfs_end_io_write(struct inode *inode) +{ + up_write(&inode->i_rwsem); +} + +/* Call with exclusively locked inode->i_rwsem */ +static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode) +{ + if (!test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { + set_bit(NFS_INO_ODIRECT, &nfsi->flags); + nfs_wb_all(inode); + } +} + +/** + * nfs_end_io_direct - declare the file is being used for direct i/o + * @inode - file inode + * + * Declare that a direct I/O operation is about to start, and ensure + * that we block all buffered I/O. + * On exit, the function ensures that the NFS_INO_ODIRECT flag is set, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that direct I/O operations are allowed to + * execute in parallel, thanks to the shared lock, whereas buffered I/O + * operations need to wait to grab an exclusive lock in order to clear + * NFS_INO_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT. + */ +void +nfs_start_io_direct(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + /* Be an optimist! */ + down_read(&inode->i_rwsem); + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) != 0) + return; + up_read(&inode->i_rwsem); + /* Slow path.... */ + down_write(&inode->i_rwsem); + nfs_block_buffered(nfsi, inode); + downgrade_write(&inode->i_rwsem); +} + +/** + * nfs_end_io_direct - declare that the direct i/o operation is done + * @inode - file inode + * + * Declare that a direct I/O operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void +nfs_end_io_direct(struct inode *inode) +{ + up_read(&inode->i_rwsem); +} diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index 9e9fa347a948..ee753547fb0a 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -76,19 +76,23 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source, * low timeout interval so that if a connection is lost, we retry through * the MDS. */ -struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, +struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, rpc_authflavor_t au_flavor) { + struct rpc_timeout ds_timeout; + struct nfs_client *mds_clp = mds_srv->nfs_client; struct nfs_client_initdata cl_init = { .addr = ds_addr, .addrlen = ds_addrlen, + .nodename = mds_clp->cl_rpcclient->cl_nodename, + .ip_addr = mds_clp->cl_ipaddr, .nfs_mod = &nfs_v3, .proto = ds_proto, .net = mds_clp->cl_net, + .timeparms = &ds_timeout, }; - struct rpc_timeout ds_timeout; struct nfs_client *clp; char buf[INET6_ADDRSTRLEN + 1]; @@ -97,10 +101,12 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, return ERR_PTR(-EINVAL); cl_init.hostname = buf; + if (mds_srv->flags & NFS_MOUNT_NORESVPORT) + set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + /* Use the MDS nfs_client cl_ipaddr. */ nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); - clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, - au_flavor); + clp = nfs_get_client(&cl_init, au_flavor); return clp; } diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index aa03ed09ba06..33da841a21bb 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -113,15 +113,17 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE)) return -EOPNOTSUPP; - nfs_wb_all(inode); inode_lock(inode); + err = nfs_sync_inode(inode); + if (err) + goto out_unlock; err = nfs42_proc_fallocate(&msg, filep, offset, len); if (err == 0) truncate_pagecache_range(inode, offset, (offset + len) -1); if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; - +out_unlock: inode_unlock(inode); return err; } @@ -154,11 +156,20 @@ static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src, if (status) return status; + status = nfs_filemap_write_and_wait_range(file_inode(src)->i_mapping, + pos_src, pos_src + (loff_t)count - 1); + if (status) + return status; + status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context, dst_lock, FMODE_WRITE); if (status) return status; + status = nfs_sync_inode(dst_inode); + if (status) + return status; + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == -ENOTSUPP) @@ -258,7 +269,11 @@ static loff_t _nfs42_proc_llseek(struct file *filep, if (status) return status; - nfs_wb_all(inode); + status = nfs_filemap_write_and_wait_range(inode->i_mapping, + offset, LLONG_MAX); + if (status) + return status; + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == -ENOTSUPP) @@ -336,8 +351,7 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) * Mark the bad layout state as invalid, then retry * with the current stateid. */ - set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0); + pnfs_mark_layout_stateid_invalid(lo, &head); spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); } else diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 6dc6f2aea0d6..8b2605882a20 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -330,13 +330,21 @@ static int decode_write_response(struct xdr_stream *xdr, struct nfs42_write_res *res) { __be32 *p; - int stateids; p = xdr_inline_decode(xdr, 4 + 8 + 4); if (unlikely(!p)) goto out_overflow; - stateids = be32_to_cpup(p++); + /* + * We never use asynchronous mode, so warn if a server returns + * a stateid. + */ + if (unlikely(*p != 0)) { + pr_err_once("%s: server has set unrequested " + "asynchronous mode\n", __func__); + return -EREMOTEIO; + } + p++; p = xdr_decode_hyper(p, &res->count); res->verifier.committed = be32_to_cpup(p); return decode_verifier(xdr, &res->verifier.verifier); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 768456fa1b17..4be567a54958 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -185,6 +185,7 @@ struct nfs4_state { struct nfs4_exception { struct nfs4_state *state; struct inode *inode; + nfs4_stateid *stateid; long timeout; unsigned char delay : 1, recovering : 1, diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 10410e8b5853..8d7d08d4f95f 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -349,10 +349,10 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp) * Returns pointer to an NFS client, or an ERR_PTR value. */ struct nfs_client *nfs4_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const char *ip_addr) + const struct nfs_client_initdata *cl_init) { char buf[INET6_ADDRSTRLEN + 1]; + const char *ip_addr = cl_init->ip_addr; struct nfs_client *old; int error; @@ -370,9 +370,9 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); - error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); + error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_GSS_KRB5I); if (error == -EINVAL) - error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); + error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); if (error < 0) goto error; @@ -793,10 +793,12 @@ static int nfs4_set_client(struct nfs_server *server, .hostname = hostname, .addr = addr, .addrlen = addrlen, + .ip_addr = ip_addr, .nfs_mod = &nfs_v4, .proto = proto, .minorversion = minorversion, .net = net, + .timeparms = timeparms, }; struct nfs_client *clp; int error; @@ -809,7 +811,7 @@ static int nfs4_set_client(struct nfs_server *server, set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); /* Allocate or find a client reference we can use */ - clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour); + clp = nfs_get_client(&cl_init, authflavour); if (IS_ERR(clp)) { error = PTR_ERR(clp); goto error; @@ -842,20 +844,24 @@ error: * low timeout interval so that if a connection is lost, we retry through * the MDS. */ -struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, +struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, u32 minor_version, rpc_authflavor_t au_flavor) { + struct rpc_timeout ds_timeout; + struct nfs_client *mds_clp = mds_srv->nfs_client; struct nfs_client_initdata cl_init = { .addr = ds_addr, .addrlen = ds_addrlen, + .nodename = mds_clp->cl_rpcclient->cl_nodename, + .ip_addr = mds_clp->cl_ipaddr, .nfs_mod = &nfs_v4, .proto = ds_proto, .minorversion = minor_version, .net = mds_clp->cl_net, + .timeparms = &ds_timeout, }; - struct rpc_timeout ds_timeout; struct nfs_client *clp; char buf[INET6_ADDRSTRLEN + 1]; @@ -863,14 +869,16 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, return ERR_PTR(-EINVAL); cl_init.hostname = buf; + if (mds_srv->flags & NFS_MOUNT_NORESVPORT) + __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + /* * Set an authflavor equual to the MDS value. Use the MDS nfs_client * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS * (section 13.1 RFC 5661). */ nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); - clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, - au_flavor); + clp = nfs_get_client(&cl_init, au_flavor); dprintk("<-- %s %p\n", __func__, clp); return clp; diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 014b0e41ace5..d085ad794884 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -66,7 +66,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) if (openflags & O_TRUNC) { attr.ia_valid |= ATTR_SIZE; attr.ia_size = 0; - nfs_sync_inode(inode); + filemap_write_and_wait(inode->i_mapping); } inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, NULL); @@ -133,21 +133,9 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t count, unsigned int flags) { - struct inode *in_inode = file_inode(file_in); - struct inode *out_inode = file_inode(file_out); - int ret; - - if (in_inode == out_inode) + if (file_inode(file_in) == file_inode(file_out)) return -EINVAL; - /* flush any pending writes */ - ret = nfs_sync_inode(in_inode); - if (ret) - return ret; - ret = nfs_sync_inode(out_inode); - if (ret) - return ret; - return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ff416d0e24bc..da5c9e58e907 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -363,6 +363,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server, { struct nfs_client *clp = server->nfs_client; struct nfs4_state *state = exception->state; + const nfs4_stateid *stateid = exception->stateid; struct inode *inode = exception->inode; int ret = errorcode; @@ -376,9 +377,18 @@ static int nfs4_do_handle_exception(struct nfs_server *server, case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: - if (inode && nfs_async_inode_return_delegation(inode, - NULL) == 0) - goto wait_on_recovery; + if (inode) { + int err; + + err = nfs_async_inode_return_delegation(inode, + stateid); + if (err == 0) + goto wait_on_recovery; + if (stateid != NULL && stateid->type == NFS4_DELEGATION_STATEID_TYPE) { + exception->retry = 1; + break; + } + } if (state == NULL) break; ret = nfs4_schedule_stateid_recovery(server, state); @@ -427,6 +437,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server, case -NFS4ERR_DELAY: nfs_inc_server_stats(server, NFSIOS_DELAY); case -NFS4ERR_GRACE: + case -NFS4ERR_LAYOUTTRYLATER: case -NFS4ERR_RECALLCONFLICT: exception->delay = 1; return 0; @@ -2669,28 +2680,17 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, return res; } -static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, - struct nfs_fattr *fattr, struct iattr *sattr, - struct nfs4_state *state, struct nfs4_label *ilabel, - struct nfs4_label *olabel) +static int _nfs4_do_setattr(struct inode *inode, + struct nfs_setattrargs *arg, + struct nfs_setattrres *res, + struct rpc_cred *cred, + struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(inode); - struct nfs_setattrargs arg = { - .fh = NFS_FH(inode), - .iap = sattr, - .server = server, - .bitmask = server->attr_bitmask, - .label = ilabel, - }; - struct nfs_setattrres res = { - .fattr = fattr, - .label = olabel, - .server = server, - }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], - .rpc_argp = &arg, - .rpc_resp = &res, + .rpc_argp = arg, + .rpc_resp = res, .rpc_cred = cred, }; struct rpc_cred *delegation_cred = NULL; @@ -2699,17 +2699,13 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, bool truncate; int status; - arg.bitmask = nfs4_bitmask(server, ilabel); - if (ilabel) - arg.bitmask = nfs4_bitmask(server, olabel); - - nfs_fattr_init(fattr); + nfs_fattr_init(res->fattr); /* Servers should only apply open mode checks for file size changes */ - truncate = (sattr->ia_valid & ATTR_SIZE) ? true : false; + truncate = (arg->iap->ia_valid & ATTR_SIZE) ? true : false; fmode = truncate ? FMODE_WRITE : FMODE_READ; - if (nfs4_copy_delegation_stateid(inode, fmode, &arg.stateid, &delegation_cred)) { + if (nfs4_copy_delegation_stateid(inode, fmode, &arg->stateid, &delegation_cred)) { /* Use that stateid */ } else if (truncate && state != NULL) { struct nfs_lockowner lockowner = { @@ -2719,19 +2715,19 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, if (!nfs4_valid_open_stateid(state)) return -EBADF; if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner, - &arg.stateid, &delegation_cred) == -EIO) + &arg->stateid, &delegation_cred) == -EIO) return -EBADF; } else - nfs4_stateid_copy(&arg.stateid, &zero_stateid); + nfs4_stateid_copy(&arg->stateid, &zero_stateid); if (delegation_cred) msg.rpc_cred = delegation_cred; - status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); + status = nfs4_call_sync(server->client, server, &msg, &arg->seq_args, &res->seq_res, 1); put_rpccred(delegation_cred); if (status == 0 && state != NULL) renew_lease(server, timestamp); - trace_nfs4_setattr(inode, &arg.stateid, status); + trace_nfs4_setattr(inode, &arg->stateid, status); return status; } @@ -2741,13 +2737,31 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs4_label *olabel) { struct nfs_server *server = NFS_SERVER(inode); + struct nfs_setattrargs arg = { + .fh = NFS_FH(inode), + .iap = sattr, + .server = server, + .bitmask = server->attr_bitmask, + .label = ilabel, + }; + struct nfs_setattrres res = { + .fattr = fattr, + .label = olabel, + .server = server, + }; struct nfs4_exception exception = { .state = state, .inode = inode, + .stateid = &arg.stateid, }; int err; + + arg.bitmask = nfs4_bitmask(server, ilabel); + if (ilabel) + arg.bitmask = nfs4_bitmask(server, olabel); + do { - err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel); + err = _nfs4_do_setattr(inode, &arg, &res, cred, state); switch (err) { case -NFS4ERR_OPENMODE: if (!(sattr->ia_valid & ATTR_SIZE)) { @@ -3267,13 +3281,6 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, return status; } -static int nfs4_do_find_root_sec(struct nfs_server *server, - struct nfs_fh *fhandle, struct nfs_fsinfo *info) -{ - int mv = server->nfs_client->cl_minorversion; - return nfs_v4_minor_ops[mv]->find_root_sec(server, fhandle, info); -} - /** * nfs4_proc_get_rootfh - get file handle for server's pseudoroot * @server: initialized nfs_server handle @@ -3293,7 +3300,8 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle, status = nfs4_lookup_root(server, fhandle, info); if (auth_probe || status == NFS4ERR_WRONGSEC) - status = nfs4_do_find_root_sec(server, fhandle, info); + status = server->nfs_client->cl_mvops->find_root_sec(server, + fhandle, info); if (status == 0) status = nfs4_server_capabilities(server, fhandle); @@ -4392,7 +4400,8 @@ static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr, struct rpc_message *msg) { hdr->timestamp = jiffies; - hdr->pgio_done_cb = nfs4_read_done_cb; + if (!hdr->pgio_done_cb) + hdr->pgio_done_cb = nfs4_read_done_cb; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0); } @@ -7869,11 +7878,13 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, struct inode *inode = lgp->args.inode; struct nfs_server *server = NFS_SERVER(inode); struct pnfs_layout_hdr *lo; - int status = task->tk_status; + int nfs4err = task->tk_status; + int err, status = 0; + LIST_HEAD(head); dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status); - switch (status) { + switch (nfs4err) { case 0: goto out; @@ -7905,45 +7916,42 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, status = -EOVERFLOW; goto out; } - /* Fallthrough */ + status = -EBUSY; + break; case -NFS4ERR_RECALLCONFLICT: - nfs4_handle_exception(server, -NFS4ERR_RECALLCONFLICT, - exception); status = -ERECALLCONFLICT; - goto out; + break; case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: exception->timeout = 0; spin_lock(&inode->i_lock); - if (nfs4_stateid_match(&lgp->args.stateid, + lo = NFS_I(inode)->layout; + /* If the open stateid was bad, then recover it. */ + if (!lo || test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) || + nfs4_stateid_match_other(&lgp->args.stateid, &lgp->args.ctx->state->stateid)) { spin_unlock(&inode->i_lock); - /* If the open stateid was bad, then recover it. */ exception->state = lgp->args.ctx->state; break; } - lo = NFS_I(inode)->layout; - if (lo && !test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) && - nfs4_stateid_match_other(&lgp->args.stateid, &lo->plh_stateid)) { - LIST_HEAD(head); - - /* - * Mark the bad layout state as invalid, then retry - * with the current stateid. - */ - set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0); - spin_unlock(&inode->i_lock); - pnfs_free_lseg_list(&head); - status = -EAGAIN; - goto out; - } else - spin_unlock(&inode->i_lock); - } - status = nfs4_handle_exception(server, status, exception); - if (exception->retry) + /* + * Mark the bad layout state as invalid, then retry + */ + pnfs_mark_layout_stateid_invalid(lo, &head); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&head); status = -EAGAIN; + goto out; + } + + err = nfs4_handle_exception(server, nfs4err, exception); + if (!status) { + if (exception->retry) + status = -EAGAIN; + else + status = err; + } out: dprintk("<-- %s\n", __func__); return status; @@ -8129,8 +8137,7 @@ static void nfs4_layoutreturn_release(void *calldata) spin_lock(&lo->plh_inode->i_lock); pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range, be32_to_cpu(lrp->args.stateid.seqid)); - pnfs_mark_layout_returned_if_empty(lo); - if (lrp->res.lrs_present) + if (lrp->res.lrs_present && pnfs_layout_is_valid(lo)) pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); pnfs_clear_layoutreturn_waitbit(lo); spin_unlock(&lo->plh_inode->i_lock); @@ -8835,7 +8842,7 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { #endif }; -ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) +static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) { ssize_t error, error2; diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 9c150b153782..cfb8f7ce5cf6 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -1235,8 +1235,8 @@ DECLARE_EVENT_CLASS(nfs4_idmap_event, len = 0; __entry->error = error < 0 ? error : 0; __entry->id = id; - memcpy(__get_dynamic_array(name), name, len); - ((char *)__get_dynamic_array(name))[len] = 0; + memcpy(__get_str(name), name, len); + __get_str(name)[len] = 0; ), TP_printk( diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 661e753fe1c9..7bd3a5c09d31 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1985,9 +1985,14 @@ encode_layoutcommit(struct xdr_stream *xdr, p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */ *p = cpu_to_be32(0); /* reclaim */ encode_nfs4_stateid(xdr, &args->stateid); - p = reserve_space(xdr, 20); - *p++ = cpu_to_be32(1); /* newoffset = TRUE */ - p = xdr_encode_hyper(p, args->lastbytewritten); + if (args->lastbytewritten != U64_MAX) { + p = reserve_space(xdr, 20); + *p++ = cpu_to_be32(1); /* newoffset = TRUE */ + p = xdr_encode_hyper(p, args->lastbytewritten); + } else { + p = reserve_space(xdr, 12); + *p++ = cpu_to_be32(0); /* newoffset = FALSE */ + } *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 0b9e5cc9a747..2ca9167bc97d 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -37,7 +37,6 @@ { 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \ { 1 << NFS_INO_STALE, "STALE" }, \ { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \ - { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \ { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \ { 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \ { 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" }) @@ -707,9 +706,9 @@ TRACE_EVENT(nfs_sillyrename_unlink, __entry->dev = dir->i_sb->s_dev; __entry->dir = NFS_FILEID(dir); __entry->error = error; - memcpy(__get_dynamic_array(name), + memcpy(__get_str(name), data->args.name.name, len); - ((char *)__get_dynamic_array(name))[len] = 0; + __get_str(name)[len] = 0; ), TP_printk( diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 0fbe734cc38c..70806cae0d36 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -259,7 +259,7 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) * is required. * Note that caller must hold inode->i_lock. */ -static int +int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, struct list_head *lseg_list) { @@ -334,14 +334,17 @@ pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode) } static void -init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) +pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg, + const struct pnfs_layout_range *range, + const nfs4_stateid *stateid) { INIT_LIST_HEAD(&lseg->pls_list); INIT_LIST_HEAD(&lseg->pls_lc_list); atomic_set(&lseg->pls_refcount, 1); - smp_mb(); set_bit(NFS_LSEG_VALID, &lseg->pls_flags); lseg->pls_layout = lo; + lseg->pls_range = *range; + lseg->pls_seq = be32_to_cpu(stateid->seqid); } static void pnfs_free_lseg(struct pnfs_layout_segment *lseg) @@ -486,15 +489,6 @@ pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1, (end2 == NFS4_MAX_UINT64 || end2 > start1); } -static bool -should_free_lseg(const struct pnfs_layout_range *lseg_range, - const struct pnfs_layout_range *recall_range) -{ - return (recall_range->iomode == IOMODE_ANY || - lseg_range->iomode == recall_range->iomode) && - pnfs_lseg_range_intersecting(lseg_range, recall_range); -} - static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, struct list_head *tmp_list) { @@ -533,6 +527,27 @@ static bool pnfs_seqid_is_newer(u32 s1, u32 s2) return (s32)(s1 - s2) > 0; } +static bool +pnfs_should_free_range(const struct pnfs_layout_range *lseg_range, + const struct pnfs_layout_range *recall_range) +{ + return (recall_range->iomode == IOMODE_ANY || + lseg_range->iomode == recall_range->iomode) && + pnfs_lseg_range_intersecting(lseg_range, recall_range); +} + +static bool +pnfs_match_lseg_recall(const struct pnfs_layout_segment *lseg, + const struct pnfs_layout_range *recall_range, + u32 seq) +{ + if (seq != 0 && pnfs_seqid_is_newer(lseg->pls_seq, seq)) + return false; + if (recall_range == NULL) + return true; + return pnfs_should_free_range(&lseg->pls_range, recall_range); +} + /** * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later * @lo: layout header containing the lsegs @@ -562,10 +577,7 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, if (list_empty(&lo->plh_segs)) return 0; list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) - if (!recall_range || - should_free_lseg(&lseg->pls_range, recall_range)) { - if (seq && pnfs_seqid_is_newer(lseg->pls_seq, seq)) - continue; + if (pnfs_match_lseg_recall(lseg, recall_range, seq)) { dprintk("%s: freeing lseg %p iomode %d seq %u" "offset %llu length %llu\n", __func__, lseg, lseg->pls_range.iomode, lseg->pls_seq, @@ -761,24 +773,25 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, bool update_barrier) { - u32 oldseq, newseq, new_barrier; - int empty = list_empty(&lo->plh_segs); + u32 oldseq, newseq, new_barrier = 0; + bool invalid = !pnfs_layout_is_valid(lo); oldseq = be32_to_cpu(lo->plh_stateid.seqid); newseq = be32_to_cpu(new->seqid); - if (empty || pnfs_seqid_is_newer(newseq, oldseq)) { + if (invalid || pnfs_seqid_is_newer(newseq, oldseq)) { nfs4_stateid_copy(&lo->plh_stateid, new); - if (update_barrier) { - new_barrier = be32_to_cpu(new->seqid); - } else { - /* Because of wraparound, we want to keep the barrier - * "close" to the current seqids. - */ - new_barrier = newseq - atomic_read(&lo->plh_outstanding); - } - if (empty || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) - lo->plh_barrier = new_barrier; + /* + * Because of wraparound, we want to keep the barrier + * "close" to the current seqids. + */ + new_barrier = newseq - atomic_read(&lo->plh_outstanding); } + if (update_barrier) + new_barrier = be32_to_cpu(new->seqid); + else if (new_barrier == 0) + return; + if (invalid || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) + lo->plh_barrier = new_barrier; } static bool @@ -873,15 +886,37 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); } +static void +pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo) +{ + lo->plh_return_iomode = 0; + lo->plh_return_seq = 0; + clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); +} + static bool -pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo) +pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, + nfs4_stateid *stateid, + enum pnfs_iomode *iomode) { if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) return false; - lo->plh_return_iomode = 0; - lo->plh_return_seq = 0; pnfs_get_layout_hdr(lo); - clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); + if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) { + if (stateid != NULL) { + nfs4_stateid_copy(stateid, &lo->plh_stateid); + if (lo->plh_return_seq != 0) + stateid->seqid = cpu_to_be32(lo->plh_return_seq); + } + if (iomode != NULL) + *iomode = lo->plh_return_iomode; + pnfs_clear_layoutreturn_info(lo); + return true; + } + if (stateid != NULL) + nfs4_stateid_copy(stateid, &lo->plh_stateid); + if (iomode != NULL) + *iomode = IOMODE_ANY; return true; } @@ -949,10 +984,7 @@ static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo) enum pnfs_iomode iomode; bool send; - nfs4_stateid_copy(&stateid, &lo->plh_stateid); - stateid.seqid = cpu_to_be32(lo->plh_return_seq); - iomode = lo->plh_return_iomode; - send = pnfs_prepare_layoutreturn(lo); + send = pnfs_prepare_layoutreturn(lo, &stateid, &iomode); spin_unlock(&inode->i_lock); if (send) { /* Send an async layoutreturn so we dont deadlock */ @@ -989,7 +1021,6 @@ _pnfs_return_layout(struct inode *ino) dprintk("NFS: %s no layout to return\n", __func__); goto out; } - nfs4_stateid_copy(&stateid, &nfsi->layout->plh_stateid); /* Reference matched in nfs4_layoutreturn_release */ pnfs_get_layout_hdr(lo); empty = list_empty(&lo->plh_segs); @@ -1012,8 +1043,7 @@ _pnfs_return_layout(struct inode *ino) goto out_put_layout_hdr; } - set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - send = pnfs_prepare_layoutreturn(lo); + send = pnfs_prepare_layoutreturn(lo, &stateid, NULL); spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); if (send) @@ -1080,11 +1110,10 @@ bool pnfs_roc(struct inode *ino) goto out_noroc; } - nfs4_stateid_copy(&stateid, &lo->plh_stateid); /* always send layoutreturn if being marked so */ - if (test_and_clear_bit(NFS_LAYOUT_RETURN_REQUESTED, - &lo->plh_flags)) - layoutreturn = pnfs_prepare_layoutreturn(lo); + if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) + layoutreturn = pnfs_prepare_layoutreturn(lo, + &stateid, NULL); list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) /* If we are sending layoutreturn, invalidate all valid lsegs */ @@ -1132,7 +1161,6 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) spin_lock(&ino->i_lock); lo = NFS_I(ino)->layout; - pnfs_mark_layout_returned_if_empty(lo); if (pnfs_seqid_is_newer(barrier, lo->plh_barrier)) lo->plh_barrier = barrier; spin_unlock(&ino->i_lock); @@ -1505,7 +1533,7 @@ pnfs_update_layout(struct inode *ino, struct pnfs_layout_segment *lseg = NULL; nfs4_stateid stateid; long timeout = 0; - unsigned long giveup = jiffies + rpc_get_timeout(server->client); + unsigned long giveup = jiffies + (clp->cl_lease_time << 1); bool first; if (!pnfs_enabled_sb(NFS_SERVER(ino))) { @@ -1645,33 +1673,44 @@ lookup_again: lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags); trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); + atomic_dec(&lo->plh_outstanding); if (IS_ERR(lseg)) { switch(PTR_ERR(lseg)) { - case -ERECALLCONFLICT: + case -EBUSY: if (time_after(jiffies, giveup)) lseg = NULL; - /* Fallthrough */ - case -EAGAIN: - pnfs_put_layout_hdr(lo); - if (first) - pnfs_clear_first_layoutget(lo); - if (lseg) { - trace_pnfs_update_layout(ino, pos, count, - iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY); - goto lookup_again; + break; + case -ERECALLCONFLICT: + /* Huh? We hold no layouts, how is there a recall? */ + if (first) { + lseg = NULL; + break; } + /* Destroy the existing layout and start over */ + if (time_after(jiffies, giveup)) + pnfs_destroy_layout(NFS_I(ino)); /* Fallthrough */ + case -EAGAIN: + break; default: if (!nfs_error_is_fatal(PTR_ERR(lseg))) { pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); lseg = NULL; } + goto out_put_layout_hdr; + } + if (lseg) { + if (first) + pnfs_clear_first_layoutget(lo); + trace_pnfs_update_layout(ino, pos, count, + iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY); + pnfs_put_layout_hdr(lo); + goto lookup_again; } } else { pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); } - atomic_dec(&lo->plh_outstanding); out_put_layout_hdr: if (first) pnfs_clear_first_layoutget(lo); @@ -1735,9 +1774,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) return lseg; } - init_lseg(lo, lseg); - lseg->pls_range = res->range; - lseg->pls_seq = be32_to_cpu(res->stateid.seqid); + pnfs_init_lseg(lo, lseg, &res->range, &res->stateid); spin_lock(&ino->i_lock); if (pnfs_layoutgets_blocked(lo)) { @@ -1758,16 +1795,19 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) * inode invalid, and don't bother validating the stateid * sequence number. */ - pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL, 0); + pnfs_mark_layout_stateid_invalid(lo, &free_me); nfs4_stateid_copy(&lo->plh_stateid, &res->stateid); lo->plh_barrier = be32_to_cpu(res->stateid.seqid); } - clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - pnfs_get_lseg(lseg); pnfs_layout_insert_lseg(lo, lseg, &free_me); + if (!pnfs_layout_is_valid(lo)) { + pnfs_clear_layoutreturn_info(lo); + clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); + } + if (res->return_on_close) set_bit(NFS_LSEG_ROC, &lseg->pls_flags); @@ -1787,14 +1827,14 @@ static void pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, u32 seq) { - if (lo->plh_return_iomode == iomode) - return; - if (lo->plh_return_iomode != 0) + if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode) iomode = IOMODE_ANY; lo->plh_return_iomode = iomode; set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); - if (!lo->plh_return_seq || pnfs_seqid_is_newer(seq, lo->plh_return_seq)) + if (seq != 0) { + WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq); lo->plh_return_seq = seq; + } } /** @@ -1824,7 +1864,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, assert_spin_locked(&lo->plh_inode->i_lock); list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) - if (should_free_lseg(&lseg->pls_range, return_range)) { + if (pnfs_match_lseg_recall(lseg, return_range, seq)) { dprintk("%s: marking lseg %p iomode %d " "offset %llu length %llu\n", __func__, lseg, lseg->pls_range.iomode, @@ -1855,19 +1895,17 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, bool return_now = false; spin_lock(&inode->i_lock); - pnfs_set_plh_return_info(lo, range.iomode, lseg->pls_seq); + pnfs_set_plh_return_info(lo, range.iomode, 0); /* * mark all matching lsegs so that we are sure to have no live * segments at hand when sending layoutreturn. See pnfs_put_lseg() * for how it works. */ - if (!pnfs_mark_matching_lsegs_return(lo, &free_me, - &range, lseg->pls_seq)) { + if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range, 0)) { nfs4_stateid stateid; - enum pnfs_iomode iomode = lo->plh_return_iomode; + enum pnfs_iomode iomode; - nfs4_stateid_copy(&stateid, &lo->plh_stateid); - return_now = pnfs_prepare_layoutreturn(lo); + return_now = pnfs_prepare_layoutreturn(lo, &stateid, &iomode); spin_unlock(&inode->i_lock); if (return_now) pnfs_send_layoutreturn(lo, &stateid, iomode, false); @@ -2382,7 +2420,10 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) nfs_fattr_init(&data->fattr); data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; data->res.fattr = &data->fattr; - data->args.lastbytewritten = end_pos - 1; + if (end_pos != 0) + data->args.lastbytewritten = end_pos - 1; + else + data->args.lastbytewritten = U64_MAX; data->res.server = NFS_SERVER(inode); if (ld->prepare_layoutcommit) { diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index b21bd0bee784..31d99b2927b0 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -268,6 +268,8 @@ int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, const struct pnfs_layout_range *recall_range, u32 seq); +int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, + struct list_head *lseg_list); bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); @@ -375,6 +377,11 @@ static inline bool nfs_have_layout(struct inode *inode) return NFS_I(inode)->layout != NULL; } +static inline bool pnfs_layout_is_valid(const struct pnfs_layout_hdr *lo) +{ + return test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) == 0; +} + static inline struct nfs4_deviceid_node * nfs4_get_deviceid(struct nfs4_deviceid_node *d) { @@ -545,19 +552,6 @@ pnfs_calc_offset_length(u64 offset, u64 end) return 1 + end - offset; } -/** - * pnfs_mark_layout_returned_if_empty - marks the layout as returned - * @lo: layout header - * - * Note: Caller must hold inode->i_lock - */ -static inline void -pnfs_mark_layout_returned_if_empty(struct pnfs_layout_hdr *lo) -{ - if (list_empty(&lo->plh_segs)) - set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); -} - static inline void pnfs_copy_range(struct pnfs_layout_range *dst, const struct pnfs_layout_range *src) @@ -629,6 +623,13 @@ pnfs_sync_inode(struct inode *inode, bool datasync) } static inline bool +pnfs_layoutcommit_outstanding(struct inode *inode) +{ + return false; +} + + +static inline bool pnfs_roc(struct inode *ino) { return false; @@ -716,13 +717,6 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src, return false; } -static inline bool -pnfs_layoutcommit_outstanding(struct inode *inode) -{ - return false; -} - - static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) { return NULL; diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index b38e3c0dc790..f3468b57a32a 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -595,7 +595,7 @@ static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) } static struct nfs_client *(*get_v3_ds_connect)( - struct nfs_client *mds_clp, + struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, int ds_proto, @@ -654,7 +654,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, rpc_clnt_test_and_add_xprt, NULL); } else - clp = get_v3_ds_connect(mds_srv->nfs_client, + clp = get_v3_ds_connect(mds_srv, (struct sockaddr *)&da->da_addr, da->da_addrlen, IPPROTO_TCP, timeo, retrans, au_flavor); @@ -690,7 +690,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, dprintk("%s: DS %s: trying address %s\n", __func__, ds->ds_remotestr, da->da_remotestr); - clp = nfs4_set_ds_client(mds_srv->nfs_client, + clp = nfs4_set_ds_client(mds_srv, (struct sockaddr *)&da->da_addr, da->da_addrlen, IPPROTO_TCP, timeo, retrans, minor_version, @@ -940,6 +940,13 @@ EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); int pnfs_nfs_generic_sync(struct inode *inode, bool datasync) { + int ret; + + if (!pnfs_layoutcommit_outstanding(inode)) + return 0; + ret = nfs_commit_inode(inode, FLUSH_SYNC); + if (ret < 0) + return ret; if (datasync) return 0; return pnfs_layoutcommit_inode(inode, true); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2137e0202f25..18d446e1a82b 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1684,6 +1684,7 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args, { rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR; unsigned int i; + int use_auth_null = false; /* * If the sec= mount option is used, the specified flavor or AUTH_NULL @@ -1691,14 +1692,21 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args, * * AUTH_NULL has a special meaning when it's in the server list - it * means that the server will ignore the rpc creds, so any flavor - * can be used. + * can be used but still use the sec= that was specified. */ for (i = 0; i < count; i++) { flavor = server_authlist[i]; - if (nfs_auth_info_match(&args->auth_info, flavor) || - flavor == RPC_AUTH_NULL) + if (nfs_auth_info_match(&args->auth_info, flavor)) goto out; + + if (flavor == RPC_AUTH_NULL) + use_auth_null = true; + } + + if (use_auth_null) { + flavor = RPC_AUTH_NULL; + goto out; } dfprintk(MOUNT, diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e1c74d3db64d..3a6724c6eb5f 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -625,7 +625,7 @@ static int nfs_writepage_locked(struct page *page, int err; nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); - nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), + nfs_pageio_init_write(&pgio, inode, 0, false, &nfs_async_write_completion_ops); err = nfs_do_writepage(page, wbc, &pgio, launder); nfs_pageio_complete(&pgio); @@ -657,16 +657,9 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control * int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; - unsigned long *bitlock = &NFS_I(inode)->flags; struct nfs_pageio_descriptor pgio; int err; - /* Stop dirtying of new pages while we sync */ - err = wait_on_bit_lock_action(bitlock, NFS_INO_FLUSHING, - nfs_wait_bit_killable, TASK_KILLABLE); - if (err) - goto out_err; - nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false, @@ -674,10 +667,6 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); nfs_pageio_complete(&pgio); - clear_bit_unlock(NFS_INO_FLUSHING, bitlock); - smp_mb__after_atomic(); - wake_up_bit(bitlock, NFS_INO_FLUSHING); - if (err < 0) goto out_err; err = pgio.pg_error; @@ -898,7 +887,7 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, static void nfs_clear_page_commit(struct page *page) { - dec_zone_page_state(page, NR_UNSTABLE_NFS); + dec_node_page_state(page, NR_UNSTABLE_NFS); dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb, WB_RECLAIMABLE); } @@ -1195,9 +1184,11 @@ nfs_key_timeout_notify(struct file *filp, struct inode *inode) /* * Test if the open context credential key is marked to expire soon. */ -bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx) +bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode) { - return rpcauth_cred_key_to_expire(ctx->cred); + struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth; + + return rpcauth_cred_key_to_expire(auth, ctx->cred); } /* @@ -1289,6 +1280,9 @@ int nfs_updatepage(struct file *file, struct page *page, dprintk("NFS: nfs_updatepage(%pD2 %d@%lld)\n", file, count, (long long)(page_file_offset(page) + offset)); + if (!count) + goto out; + if (nfs_can_extend_write(file, page, inode)) { count = max(count + offset, nfs_page_length(page)); offset = 0; @@ -1299,7 +1293,7 @@ int nfs_updatepage(struct file *file, struct page *page, nfs_set_pageerror(page); else __set_page_dirty_nobuffers(page); - +out: dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", status, (long long)i_size_read(inode)); return status; @@ -1800,7 +1794,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) /* Okay, COMMIT succeeded, apparently. Check the verifier * returned by the server against all stored verfs. */ - if (!memcmp(&req->wb_verf, &data->verf.verifier, sizeof(req->wb_verf))) { + if (!nfs_write_verifier_cmp(&req->wb_verf, &data->verf.verifier)) { /* We have a match */ nfs_inode_remove_request(req); dprintk(" OK\n"); @@ -1924,6 +1918,24 @@ out_mark_dirty: EXPORT_SYMBOL_GPL(nfs_write_inode); /* + * Wrapper for filemap_write_and_wait_range() + * + * Needed for pNFS in order to ensure data becomes visible to the + * client. + */ +int nfs_filemap_write_and_wait_range(struct address_space *mapping, + loff_t lstart, loff_t lend) +{ + int ret; + + ret = filemap_write_and_wait_range(mapping, lstart, lend); + if (ret == 0) + ret = pnfs_sync_inode(mapping->host, true); + return ret; +} +EXPORT_SYMBOL_GPL(nfs_filemap_write_and_wait_range); + +/* * flush the inode to disk. */ int nfs_wb_all(struct inode *inode) diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index c9f583d7bac8..47febcf99185 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -90,6 +90,7 @@ config NFSD_BLOCKLAYOUT bool "NFSv4.1 server support for pNFS block layouts" depends on NFSD_V4 && BLOCK select NFSD_PNFS + select EXPORTFS_BLOCK_OPS help This option enables support for the exporting pNFS block layouts in the kernel's NFS server. The pNFS block layout enables NFS @@ -102,6 +103,7 @@ config NFSD_SCSILAYOUT bool "NFSv4.1 server support for pNFS SCSI layouts" depends on NFSD_V4 && BLOCK select NFSD_PNFS + select EXPORTFS_BLOCK_OPS help This option enables support for the exporting pNFS SCSI layouts in the kernel's NFS server. The pNFS SCSI layout enables NFS @@ -111,6 +113,23 @@ config NFSD_SCSILAYOUT If unsure, say N. +config NFSD_FLEXFILELAYOUT + bool "NFSv4.1 server support for pNFS Flex File layouts" + depends on NFSD_V4 + select NFSD_PNFS + help + This option enables support for the exporting pNFS Flex File + layouts in the kernel's NFS server. The pNFS Flex File layout + enables NFS clients to directly perform I/O to NFSv3 devices + accesible to both the server and the clients. See + draft-ietf-nfsv4-flex-files for more details. + + Warning, this server implements the bare minimum functionality + to be a flex file server - it is for testing the client, + not for use in production. + + If unsure, say N. + config NFSD_V4_SECURITY_LABEL bool "Provide Security Label support for NFSv4 server" depends on NFSD_V4 && SECURITY diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index 3ae5f3c77e28..5f5d3a76980c 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -20,3 +20,4 @@ nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o +nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index ad2c05e80a83..5a1708441510 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -163,6 +163,7 @@ nfsd4_block_get_device_info_simple(struct super_block *sb, static __be32 nfsd4_block_proc_getdeviceinfo(struct super_block *sb, + struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_getdeviceinfo *gdp) { @@ -355,6 +356,7 @@ nfsd4_block_get_device_info_scsi(struct super_block *sb, static __be32 nfsd4_scsi_proc_getdeviceinfo(struct super_block *sb, + struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_getdeviceinfo *gdp) { diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c index 4ebaaf4b8d8a..ac6f54546fdd 100644 --- a/fs/nfsd/blocklayoutxdr.c +++ b/fs/nfsd/blocklayoutxdr.c @@ -44,7 +44,7 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) switch (b->type) { case PNFS_BLOCK_VOLUME_SIMPLE: - len = 4 + 4 + 8 + 4 + b->simple.sig_len; + len = 4 + 4 + 8 + 4 + (XDR_QUADLEN(b->simple.sig_len) << 2); p = xdr_reserve_space(xdr, len); if (!p) return -ETOOSMALL; @@ -55,7 +55,7 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len); break; case PNFS_BLOCK_VOLUME_SCSI: - len = 4 + 4 + 4 + 4 + b->scsi.designator_len + 8; + len = 4 + 4 + 4 + 4 + (XDR_QUADLEN(b->scsi.designator_len) << 2) + 8; p = xdr_reserve_space(xdr, len); if (!p) return -ETOOSMALL; diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index b4d84b579f20..43e109cc0ccc 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -706,7 +706,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) new->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = 0; - new->ex_layout_type = 0; + new->ex_layout_types = 0; new->ex_uuid = NULL; new->cd = item->cd; } @@ -731,7 +731,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) item->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = item->ex_fslocs.migrated; item->ex_fslocs.migrated = 0; - new->ex_layout_type = item->ex_layout_type; + new->ex_layout_types = item->ex_layout_types; new->ex_nflavors = item->ex_nflavors; for (i = 0; i < MAX_SECINFO_LIST; i++) { new->ex_flavors[i] = item->ex_flavors[i]; @@ -954,6 +954,16 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp) rqstp->rq_cred.cr_flavor == RPC_AUTH_UNIX) return 0; } + + /* If the compound op contains a spo_must_allowed op, + * it will be sent with integrity/protection which + * will have to be expressly allowed on mounts that + * don't support it + */ + + if (nfsd4_spo_must_allow(rqstp)) + return 0; + return nfserr_wrongsec; } diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index 2e315072bf3f..730f15eeb7ed 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -57,7 +57,7 @@ struct svc_export { struct nfsd4_fs_locations ex_fslocs; uint32_t ex_nflavors; struct exp_flavor_info ex_flavors[MAX_SECINFO_LIST]; - enum pnfs_layouttype ex_layout_type; + u32 ex_layout_types; struct nfsd4_deviceid_map *ex_devid_map; struct cache_detail *cd; }; diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c new file mode 100644 index 000000000000..df880e9fa71f --- /dev/null +++ b/fs/nfsd/flexfilelayout.c @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2016 Tom Haynes <loghyr@primarydata.com> + * + * The following implements a super-simple flex-file server + * where the NFSv4.1 mds is also the ds. And the storage is + * the same. I.e., writing to the mds via a NFSv4.1 WRITE + * goes to the same location as the NFSv3 WRITE. + */ +#include <linux/slab.h> + +#include <linux/nfsd/debug.h> + +#include <linux/sunrpc/addr.h> + +#include "flexfilelayoutxdr.h" +#include "pnfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_PNFS + +static __be32 +nfsd4_ff_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, + struct nfsd4_layoutget *args) +{ + struct nfsd4_layout_seg *seg = &args->lg_seg; + u32 device_generation = 0; + int error; + uid_t u; + + struct pnfs_ff_layout *fl; + + /* + * The super simple flex file server has 1 mirror, 1 data server, + * and 1 file handle. So instead of 4 allocs, do 1 for now. + * Zero it out for the stateid - don't want junk in there! + */ + error = -ENOMEM; + fl = kzalloc(sizeof(*fl), GFP_KERNEL); + if (!fl) + goto out_error; + args->lg_content = fl; + + /* + * Avoid layout commit, try to force the I/O to the DS, + * and for fun, cause all IOMODE_RW layout segments to + * effectively be WRITE only. + */ + fl->flags = FF_FLAGS_NO_LAYOUTCOMMIT | FF_FLAGS_NO_IO_THRU_MDS | + FF_FLAGS_NO_READ_IO; + + /* Do not allow a IOMODE_READ segment to have write pemissions */ + if (seg->iomode == IOMODE_READ) { + u = from_kuid(&init_user_ns, inode->i_uid) + 1; + fl->uid = make_kuid(&init_user_ns, u); + } else + fl->uid = inode->i_uid; + fl->gid = inode->i_gid; + + error = nfsd4_set_deviceid(&fl->deviceid, fhp, device_generation); + if (error) + goto out_error; + + fl->fh.size = fhp->fh_handle.fh_size; + memcpy(fl->fh.data, &fhp->fh_handle.fh_base, fl->fh.size); + + /* Give whole file layout segments */ + seg->offset = 0; + seg->length = NFS4_MAX_UINT64; + + dprintk("GET: 0x%llx:0x%llx %d\n", seg->offset, seg->length, + seg->iomode); + return 0; + +out_error: + seg->length = 0; + return nfserrno(error); +} + +static __be32 +nfsd4_ff_proc_getdeviceinfo(struct super_block *sb, struct svc_rqst *rqstp, + struct nfs4_client *clp, struct nfsd4_getdeviceinfo *gdp) +{ + struct pnfs_ff_device_addr *da; + + u16 port; + char addr[INET6_ADDRSTRLEN]; + + da = kzalloc(sizeof(struct pnfs_ff_device_addr), GFP_KERNEL); + if (!da) + return nfserrno(-ENOMEM); + + gdp->gd_device = da; + + da->version = 3; + da->minor_version = 0; + + da->rsize = svc_max_payload(rqstp); + da->wsize = da->rsize; + + rpc_ntop((struct sockaddr *)&rqstp->rq_daddr, + addr, INET6_ADDRSTRLEN); + if (rqstp->rq_daddr.ss_family == AF_INET) { + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)&rqstp->rq_daddr; + port = ntohs(sin->sin_port); + snprintf(da->netaddr.netid, FF_NETID_LEN + 1, "tcp"); + da->netaddr.netid_len = 3; + } else { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&rqstp->rq_daddr; + port = ntohs(sin6->sin6_port); + snprintf(da->netaddr.netid, FF_NETID_LEN + 1, "tcp6"); + da->netaddr.netid_len = 4; + } + + da->netaddr.addr_len = + snprintf(da->netaddr.addr, FF_ADDR_LEN + 1, + "%s.%hhu.%hhu", addr, port >> 8, port & 0xff); + + da->tightly_coupled = false; + + return 0; +} + +const struct nfsd4_layout_ops ff_layout_ops = { + .notify_types = + NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE, + .proc_getdeviceinfo = nfsd4_ff_proc_getdeviceinfo, + .encode_getdeviceinfo = nfsd4_ff_encode_getdeviceinfo, + .proc_layoutget = nfsd4_ff_proc_layoutget, + .encode_layoutget = nfsd4_ff_encode_layoutget, +}; diff --git a/fs/nfsd/flexfilelayoutxdr.c b/fs/nfsd/flexfilelayoutxdr.c new file mode 100644 index 000000000000..5e3fd7fc1a9f --- /dev/null +++ b/fs/nfsd/flexfilelayoutxdr.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2016 Tom Haynes <loghyr@primarydata.com> + */ +#include <linux/sunrpc/svc.h> +#include <linux/nfs4.h> + +#include "nfsd.h" +#include "flexfilelayoutxdr.h" + +#define NFSDDBG_FACILITY NFSDDBG_PNFS + +struct ff_idmap { + char buf[11]; + int len; +}; + +__be32 +nfsd4_ff_encode_layoutget(struct xdr_stream *xdr, + struct nfsd4_layoutget *lgp) +{ + struct pnfs_ff_layout *fl = lgp->lg_content; + int len, mirror_len, ds_len, fh_len; + __be32 *p; + + /* + * Unlike nfsd4_encode_user, we know these will + * always be stringified. + */ + struct ff_idmap uid; + struct ff_idmap gid; + + fh_len = 4 + fl->fh.size; + + uid.len = sprintf(uid.buf, "%u", from_kuid(&init_user_ns, fl->uid)); + gid.len = sprintf(gid.buf, "%u", from_kgid(&init_user_ns, fl->gid)); + + /* 8 + len for recording the length, name, and padding */ + ds_len = 20 + sizeof(stateid_opaque_t) + 4 + fh_len + + 8 + uid.len + 8 + gid.len; + + mirror_len = 4 + ds_len; + + /* The layout segment */ + len = 20 + mirror_len; + + p = xdr_reserve_space(xdr, sizeof(__be32) + len); + if (!p) + return nfserr_toosmall; + + *p++ = cpu_to_be32(len); + p = xdr_encode_hyper(p, 0); /* stripe unit of 1 */ + + *p++ = cpu_to_be32(1); /* single mirror */ + *p++ = cpu_to_be32(1); /* single data server */ + + p = xdr_encode_opaque_fixed(p, &fl->deviceid, + sizeof(struct nfsd4_deviceid)); + + *p++ = cpu_to_be32(1); /* efficiency */ + + *p++ = cpu_to_be32(fl->stateid.si_generation); + p = xdr_encode_opaque_fixed(p, &fl->stateid.si_opaque, + sizeof(stateid_opaque_t)); + + *p++ = cpu_to_be32(1); /* single file handle */ + p = xdr_encode_opaque(p, fl->fh.data, fl->fh.size); + + p = xdr_encode_opaque(p, uid.buf, uid.len); + p = xdr_encode_opaque(p, gid.buf, gid.len); + + *p++ = cpu_to_be32(fl->flags); + *p++ = cpu_to_be32(0); /* No stats collect hint */ + + return 0; +} + +__be32 +nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr, + struct nfsd4_getdeviceinfo *gdp) +{ + struct pnfs_ff_device_addr *da = gdp->gd_device; + int len; + int ver_len; + int addr_len; + __be32 *p; + + /* len + padding for two strings */ + addr_len = 16 + da->netaddr.netid_len + da->netaddr.addr_len; + ver_len = 20; + + len = 4 + ver_len + 4 + addr_len; + + p = xdr_reserve_space(xdr, len + sizeof(__be32)); + if (!p) + return nfserr_resource; + + /* + * Fill in the overall length and number of volumes at the beginning + * of the layout. + */ + *p++ = cpu_to_be32(len); + *p++ = cpu_to_be32(1); /* 1 netaddr */ + p = xdr_encode_opaque(p, da->netaddr.netid, da->netaddr.netid_len); + p = xdr_encode_opaque(p, da->netaddr.addr, da->netaddr.addr_len); + + *p++ = cpu_to_be32(1); /* 1 versions */ + + *p++ = cpu_to_be32(da->version); + *p++ = cpu_to_be32(da->minor_version); + *p++ = cpu_to_be32(da->rsize); + *p++ = cpu_to_be32(da->wsize); + *p++ = cpu_to_be32(da->tightly_coupled); + + return 0; +} diff --git a/fs/nfsd/flexfilelayoutxdr.h b/fs/nfsd/flexfilelayoutxdr.h new file mode 100644 index 000000000000..467defd4e563 --- /dev/null +++ b/fs/nfsd/flexfilelayoutxdr.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2016 Tom Haynes <loghyr@primarydata.com> + */ +#ifndef _NFSD_FLEXFILELAYOUTXDR_H +#define _NFSD_FLEXFILELAYOUTXDR_H 1 + +#include <linux/inet.h> +#include "xdr4.h" + +#define FF_FLAGS_NO_LAYOUTCOMMIT 1 +#define FF_FLAGS_NO_IO_THRU_MDS 2 +#define FF_FLAGS_NO_READ_IO 4 + +struct xdr_stream; + +#define FF_NETID_LEN (4) +#define FF_ADDR_LEN (INET6_ADDRSTRLEN + 8) +struct pnfs_ff_netaddr { + char netid[FF_NETID_LEN + 1]; + char addr[FF_ADDR_LEN + 1]; + u32 netid_len; + u32 addr_len; +}; + +struct pnfs_ff_device_addr { + struct pnfs_ff_netaddr netaddr; + u32 version; + u32 minor_version; + u32 rsize; + u32 wsize; + bool tightly_coupled; +}; + +struct pnfs_ff_layout { + u32 flags; + u32 stats_collect_hint; + kuid_t uid; + kgid_t gid; + struct nfsd4_deviceid deviceid; + stateid_t stateid; + struct nfs_fh fh; +}; + +__be32 nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr, + struct nfsd4_getdeviceinfo *gdp); +__be32 nfsd4_ff_encode_layoutget(struct xdr_stream *xdr, + struct nfsd4_layoutget *lgp); + +#endif /* _NFSD_FLEXFILELAYOUTXDR_H */ diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 953c0755cb37..2be9602b0221 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -27,6 +27,9 @@ static const struct nfsd4_callback_ops nfsd4_cb_layout_ops; static const struct lock_manager_operations nfsd4_layouts_lm_ops; const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = { +#ifdef CONFIG_NFSD_FLEXFILELAYOUT + [LAYOUT_FLEX_FILES] = &ff_layout_ops, +#endif #ifdef CONFIG_NFSD_BLOCKLAYOUT [LAYOUT_BLOCK_VOLUME] = &bl_layout_ops, #endif @@ -122,28 +125,35 @@ nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp, void nfsd4_setup_layout_type(struct svc_export *exp) { +#if defined(CONFIG_NFSD_BLOCKLAYOUT) || defined(CONFIG_NFSD_SCSILAYOUT) struct super_block *sb = exp->ex_path.mnt->mnt_sb; +#endif if (!(exp->ex_flags & NFSEXP_PNFS)) return; /* - * Check if the file system supports exporting a block-like layout. + * If flex file is configured, use it by default. Otherwise + * check if the file system supports exporting a block-like layout. * If the block device supports reservations prefer the SCSI layout, * otherwise advertise the block layout. */ +#ifdef CONFIG_NFSD_FLEXFILELAYOUT + exp->ex_layout_types |= 1 << LAYOUT_FLEX_FILES; +#endif #ifdef CONFIG_NFSD_BLOCKLAYOUT + /* overwrite flex file layout selection if needed */ if (sb->s_export_op->get_uuid && sb->s_export_op->map_blocks && sb->s_export_op->commit_blocks) - exp->ex_layout_type = LAYOUT_BLOCK_VOLUME; + exp->ex_layout_types |= 1 << LAYOUT_BLOCK_VOLUME; #endif #ifdef CONFIG_NFSD_SCSILAYOUT /* overwrite block layout selection if needed */ if (sb->s_export_op->map_blocks && sb->s_export_op->commit_blocks && sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops) - exp->ex_layout_type = LAYOUT_SCSI; + exp->ex_layout_types |= 1 << LAYOUT_SCSI; #endif } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index de1ff1d98bb1..1fb222752b2b 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -605,8 +605,7 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fh_init(&resfh, NFS4_FHSIZE); - status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, - NFSD_MAY_CREATE); + status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_NOP); if (status) return status; @@ -1219,12 +1218,12 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, static const struct nfsd4_layout_ops * nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type) { - if (!exp->ex_layout_type) { + if (!exp->ex_layout_types) { dprintk("%s: export does not support pNFS\n", __func__); return NULL; } - if (exp->ex_layout_type != layout_type) { + if (!(exp->ex_layout_types & (1 << layout_type))) { dprintk("%s: layout type %d not supported\n", __func__, layout_type); return NULL; @@ -1270,7 +1269,7 @@ nfsd4_getdeviceinfo(struct svc_rqst *rqstp, nfserr = nfs_ok; if (gdp->gd_maxcount != 0) { nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, - cstate->session->se_client, gdp); + rqstp, cstate->session->se_client, gdp); } gdp->gd_notify_types &= ops->notify_types; @@ -2335,6 +2334,45 @@ static struct nfsd4_operation nfsd4_ops[] = { }, }; +/** + * nfsd4_spo_must_allow - Determine if the compound op contains an + * operation that is allowed to be sent with machine credentials + * + * @rqstp: a pointer to the struct svc_rqst + * + * Checks to see if the compound contains a spo_must_allow op + * and confirms that it was sent with the proper machine creds. + */ + +bool nfsd4_spo_must_allow(struct svc_rqst *rqstp) +{ + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfsd4_compoundargs *argp = rqstp->rq_argp; + struct nfsd4_op *this = &argp->ops[resp->opcnt - 1]; + struct nfsd4_compound_state *cstate = &resp->cstate; + struct nfs4_op_map *allow = &cstate->clp->cl_spo_must_allow; + u32 opiter; + + if (!cstate->minorversion) + return false; + + if (cstate->spo_must_allowed == true) + return true; + + opiter = resp->opcnt; + while (opiter < argp->opcnt) { + this = &argp->ops[opiter++]; + if (test_bit(this->opnum, allow->u.longs) && + cstate->clp->cl_mach_cred && + nfsd4_mach_creds_match(cstate->clp, rqstp)) { + cstate->spo_must_allowed = true; + return true; + } + } + cstate->spo_must_allowed = false; + return false; +} + int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op) { struct nfsd4_operation *opdesc; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 70d0b9b33031..8410ca275db1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1200,27 +1200,6 @@ free_ol_stateid_reaplist(struct list_head *reaplist) } } -static void release_lockowner(struct nfs4_lockowner *lo) -{ - struct nfs4_client *clp = lo->lo_owner.so_client; - struct nfs4_ol_stateid *stp; - struct list_head reaplist; - - INIT_LIST_HEAD(&reaplist); - - spin_lock(&clp->cl_lock); - unhash_lockowner_locked(lo); - while (!list_empty(&lo->lo_owner.so_stateids)) { - stp = list_first_entry(&lo->lo_owner.so_stateids, - struct nfs4_ol_stateid, st_perstateowner); - WARN_ON(!unhash_lock_stateid(stp)); - put_ol_stateid_locked(stp, &reaplist); - } - spin_unlock(&clp->cl_lock); - free_ol_stateid_reaplist(&reaplist); - nfs4_put_stateowner(&lo->lo_owner); -} - static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp, struct list_head *reaplist) { @@ -1972,7 +1951,7 @@ static bool svc_rqst_integrity_protected(struct svc_rqst *rqstp) service == RPC_GSS_SVC_PRIVACY; } -static bool mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp) +bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp) { struct svc_cred *cr = &rqstp->rq_cred; @@ -2388,6 +2367,22 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, switch (exid->spa_how) { case SP4_MACH_CRED: + exid->spo_must_enforce[0] = 0; + exid->spo_must_enforce[1] = ( + 1 << (OP_BIND_CONN_TO_SESSION - 32) | + 1 << (OP_EXCHANGE_ID - 32) | + 1 << (OP_CREATE_SESSION - 32) | + 1 << (OP_DESTROY_SESSION - 32) | + 1 << (OP_DESTROY_CLIENTID - 32)); + + exid->spo_must_allow[0] &= (1 << (OP_CLOSE) | + 1 << (OP_OPEN_DOWNGRADE) | + 1 << (OP_LOCKU) | + 1 << (OP_DELEGRETURN)); + + exid->spo_must_allow[1] &= ( + 1 << (OP_TEST_STATEID - 32) | + 1 << (OP_FREE_STATEID - 32)); if (!svc_rqst_integrity_protected(rqstp)) { status = nfserr_inval; goto out_nolock; @@ -2424,7 +2419,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, status = nfserr_inval; goto out; } - if (!mach_creds_match(conf, rqstp)) { + if (!nfsd4_mach_creds_match(conf, rqstp)) { status = nfserr_wrong_cred; goto out; } @@ -2473,6 +2468,8 @@ out_new: goto out; } new->cl_minorversion = cstate->minorversion; + new->cl_spo_must_allow.u.words[0] = exid->spo_must_allow[0]; + new->cl_spo_must_allow.u.words[1] = exid->spo_must_allow[1]; gen_clid(new, nn); add_to_unconfirmed(new); @@ -2676,7 +2673,7 @@ nfsd4_create_session(struct svc_rqst *rqstp, if (conf) { status = nfserr_wrong_cred; - if (!mach_creds_match(conf, rqstp)) + if (!nfsd4_mach_creds_match(conf, rqstp)) goto out_free_conn; cs_slot = &conf->cl_cs_slot; status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); @@ -2692,7 +2689,7 @@ nfsd4_create_session(struct svc_rqst *rqstp, goto out_free_conn; } status = nfserr_wrong_cred; - if (!mach_creds_match(unconf, rqstp)) + if (!nfsd4_mach_creds_match(unconf, rqstp)) goto out_free_conn; cs_slot = &unconf->cl_cs_slot; status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); @@ -2801,7 +2798,7 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp, if (!session) goto out_no_session; status = nfserr_wrong_cred; - if (!mach_creds_match(session->se_client, rqstp)) + if (!nfsd4_mach_creds_match(session->se_client, rqstp)) goto out; status = nfsd4_map_bcts_dir(&bcts->dir); if (status) @@ -2848,7 +2845,7 @@ nfsd4_destroy_session(struct svc_rqst *r, if (!ses) goto out_client_lock; status = nfserr_wrong_cred; - if (!mach_creds_match(ses->se_client, r)) + if (!nfsd4_mach_creds_match(ses->se_client, r)) goto out_put_session; status = mark_session_dead_locked(ses, 1 + ref_held_by_me); if (status) @@ -3087,7 +3084,7 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta status = nfserr_stale_clientid; goto out; } - if (!mach_creds_match(clp, rqstp)) { + if (!nfsd4_mach_creds_match(clp, rqstp)) { clp = NULL; status = nfserr_wrong_cred; goto out; @@ -3112,7 +3109,7 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta * We don't take advantage of the rca_one_fs case. * That's OK, it's optional, we can safely ignore it. */ - return nfs_ok; + return nfs_ok; } status = nfserr_complete_already; @@ -5945,6 +5942,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct nfs4_client *clp; + LIST_HEAD (reaplist); dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", clid->cl_boot, clid->cl_id); @@ -5975,9 +5973,23 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, nfs4_get_stateowner(sop); break; } + if (!lo) { + spin_unlock(&clp->cl_lock); + return status; + } + + unhash_lockowner_locked(lo); + while (!list_empty(&lo->lo_owner.so_stateids)) { + stp = list_first_entry(&lo->lo_owner.so_stateids, + struct nfs4_ol_stateid, + st_perstateowner); + WARN_ON(!unhash_lock_stateid(stp)); + put_ol_stateid_locked(stp, &reaplist); + } spin_unlock(&clp->cl_lock); - if (lo) - release_lockowner(lo); + free_ol_stateid_reaplist(&reaplist); + nfs4_put_stateowner(&lo->lo_owner); + return status; } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 9df898ba648f..0aa0236a1429 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1299,16 +1299,14 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, break; case SP4_MACH_CRED: /* spo_must_enforce */ - READ_BUF(4); - dummy = be32_to_cpup(p++); - READ_BUF(dummy * 4); - p += dummy; - + status = nfsd4_decode_bitmap(argp, + exid->spo_must_enforce); + if (status) + goto out; /* spo_must_allow */ - READ_BUF(4); - dummy = be32_to_cpup(p++); - READ_BUF(dummy * 4); - p += dummy; + status = nfsd4_decode_bitmap(argp, exid->spo_must_allow); + if (status) + goto out; break; case SP4_SSV: /* ssp_ops */ @@ -2164,22 +2162,20 @@ nfsd4_encode_aclname(struct xdr_stream *xdr, struct svc_rqst *rqstp, } static inline __be32 -nfsd4_encode_layout_type(struct xdr_stream *xdr, enum pnfs_layouttype layout_type) +nfsd4_encode_layout_types(struct xdr_stream *xdr, u32 layout_types) { - __be32 *p; + __be32 *p; + unsigned long i = hweight_long(layout_types); - if (layout_type) { - p = xdr_reserve_space(xdr, 8); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(1); - *p++ = cpu_to_be32(layout_type); - } else { - p = xdr_reserve_space(xdr, 4); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(0); - } + p = xdr_reserve_space(xdr, 4 + 4 * i); + if (!p) + return nfserr_resource; + + *p++ = cpu_to_be32(i); + + for (i = LAYOUT_NFSV4_1_FILES; i < LAYOUT_TYPE_MAX; ++i) + if (layout_types & (1 << i)) + *p++ = cpu_to_be32(i); return 0; } @@ -2754,13 +2750,13 @@ out_acl: } #ifdef CONFIG_NFSD_PNFS if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) { - status = nfsd4_encode_layout_type(xdr, exp->ex_layout_type); + status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types); if (status) goto out; } if (bmval2 & FATTR4_WORD2_LAYOUT_TYPES) { - status = nfsd4_encode_layout_type(xdr, exp->ex_layout_type); + status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types); if (status) goto out; } @@ -3867,14 +3863,6 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w return nfserr; } -static const u32 nfs4_minimal_spo_must_enforce[2] = { - [1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) | - 1 << (OP_EXCHANGE_ID - 32) | - 1 << (OP_CREATE_SESSION - 32) | - 1 << (OP_DESTROY_SESSION - 32) | - 1 << (OP_DESTROY_CLIENTID - 32) -}; - static __be32 nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_exchange_id *exid) @@ -3885,6 +3873,7 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, char *server_scope; int major_id_sz; int server_scope_sz; + int status = 0; uint64_t minor_id = 0; if (nfserr) @@ -3913,18 +3902,20 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, case SP4_NONE: break; case SP4_MACH_CRED: - /* spo_must_enforce, spo_must_allow */ - p = xdr_reserve_space(xdr, 16); - if (!p) - return nfserr_resource; - /* spo_must_enforce bitmap: */ - *p++ = cpu_to_be32(2); - *p++ = cpu_to_be32(nfs4_minimal_spo_must_enforce[0]); - *p++ = cpu_to_be32(nfs4_minimal_spo_must_enforce[1]); - /* empty spo_must_allow bitmap: */ - *p++ = cpu_to_be32(0); - + status = nfsd4_encode_bitmap(xdr, + exid->spo_must_enforce[0], + exid->spo_must_enforce[1], + exid->spo_must_enforce[2]); + if (status) + goto out; + /* spo_must_allow bitmap: */ + status = nfsd4_encode_bitmap(xdr, + exid->spo_must_allow[0], + exid->spo_must_allow[1], + exid->spo_must_allow[2]); + if (status) + goto out; break; default: WARN_ON_ONCE(1); @@ -3951,6 +3942,8 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, /* Implementation id */ *p++ = cpu_to_be32(0); /* zero length nfs_impl_id4 array */ return 0; +out: + return status; } static __be32 diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 9690cb4dd588..65ad0165a94f 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -158,7 +158,6 @@ static const struct file_operations exports_proc_operations = { .read = seq_read, .llseek = seq_lseek, .release = seq_release, - .owner = THIS_MODULE, }; static int exports_nfsd_open(struct inode *inode, struct file *file) @@ -171,7 +170,6 @@ static const struct file_operations exports_nfsd_operations = { .read = seq_read, .llseek = seq_lseek, .release = seq_release, - .owner = THIS_MODULE, }; static int export_features_show(struct seq_file *m, void *v) @@ -217,7 +215,6 @@ static const struct file_operations pool_stats_operations = { .read = seq_read, .llseek = seq_lseek, .release = nfsd_pool_stats_release, - .owner = THIS_MODULE, }; static struct file_operations reply_cache_stats_operations = { @@ -1154,20 +1151,15 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) #endif /* last one */ {""} }; - struct net *net = data; - int ret; - - ret = simple_fill_super(sb, 0x6e667364, nfsd_files); - if (ret) - return ret; - sb->s_fs_info = get_net(net); - return 0; + get_net(sb->s_fs_info); + return simple_fill_super(sb, 0x6e667364, nfsd_files); } static struct dentry *nfsd_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_ns(fs_type, flags, current->nsproxy->net_ns, nfsd_fill_super); + struct net *net = current->nsproxy->net_ns; + return mount_ns(fs_type, flags, data, net, net->user_ns, nfsd_fill_super); } static void nfsd_umount(struct super_block *sb) diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index cf980523898b..9446849888d5 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -124,6 +124,7 @@ void nfs4_state_shutdown_net(struct net *net); void nfs4_reset_lease(time_t leasetime); int nfs4_reset_recoverydir(char *recdir); char * nfs4_recoverydir(void); +bool nfsd4_spo_must_allow(struct svc_rqst *rqstp); #else static inline int nfsd4_init_slabs(void) { return 0; } static inline void nfsd4_free_slabs(void) { } @@ -134,6 +135,10 @@ static inline void nfs4_state_shutdown_net(struct net *net) { } static inline void nfs4_reset_lease(time_t leasetime) { } static inline int nfs4_reset_recoverydir(char *recdir) { return 0; } static inline char * nfs4_recoverydir(void) {return NULL; } +static inline bool nfsd4_spo_must_allow(struct svc_rqst *rqstp) +{ + return false; +} #endif /* diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index a8919444c460..cfe7500d5847 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -59,14 +59,20 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry) * the write call). */ static inline __be32 -nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, umode_t requested) +nfsd_mode_check(struct svc_rqst *rqstp, struct dentry *dentry, + umode_t requested) { - mode &= S_IFMT; + umode_t mode = d_inode(dentry)->i_mode & S_IFMT; if (requested == 0) /* the caller doesn't care */ return nfs_ok; - if (mode == requested) + if (mode == requested) { + if (mode == S_IFDIR && !d_can_lookup(dentry)) { + WARN_ON_ONCE(1); + return nfserr_notdir; + } return nfs_ok; + } /* * v4 has an error more specific than err_notdir which we should * return in preference to err_notdir: @@ -298,7 +304,7 @@ out: * that it expects something not of the given type. * * @access is formed from the NFSD_MAY_* constants defined in - * include/linux/nfsd/nfsd.h. + * fs/nfsd/vfs.h. */ __be32 fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) @@ -340,7 +346,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) if (error) goto out; - error = nfsd_mode_check(rqstp, d_inode(dentry)->i_mode, type); + error = nfsd_mode_check(rqstp, dentry, type); if (error) goto out; @@ -533,7 +539,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, * the reference filehandle (if it is in the same export) * or the export options. */ - set_version_and_fsid_type(fhp, exp, ref_fh); + set_version_and_fsid_type(fhp, exp, ref_fh); if (ref_fh == fhp) fh_put(ref_fh); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 4cd78ef4c95c..e9214768cde9 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -251,9 +251,6 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, /* Check for NFSD_MAY_WRITE in nfsd_create if necessary */ - nfserr = nfserr_acces; - if (!argp->len) - goto done; nfserr = nfserr_exist; if (isdotent(argp->name, argp->len)) goto done; @@ -362,8 +359,8 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, nfserr = 0; if (!inode) { /* File doesn't exist. Create it and set attrs */ - nfserr = nfsd_create(rqstp, dirfhp, argp->name, argp->len, - attr, type, rdev, newfhp); + nfserr = nfsd_create_locked(rqstp, dirfhp, argp->name, + argp->len, attr, type, rdev, newfhp); } else if (type == S_IFREG) { dprintk("nfsd: existing %s, valid=%x, size=%ld\n", argp->name, attr->ia_valid, (long) attr->ia_size); diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 79d964aa8079..41b468a6a90f 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -240,7 +240,7 @@ nfssvc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p, || !(p = decode_filename(p, &args->name, &args->len))) return 0; - return xdr_argsize_check(rqstp, p); + return xdr_argsize_check(rqstp, p); } int diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h index 7d073b9b1553..0c2a716e8741 100644 --- a/fs/nfsd/pnfs.h +++ b/fs/nfsd/pnfs.h @@ -21,6 +21,7 @@ struct nfsd4_layout_ops { u32 notify_types; __be32 (*proc_getdeviceinfo)(struct super_block *sb, + struct svc_rqst *rqstp, struct nfs4_client *clp, struct nfsd4_getdeviceinfo *gdevp); __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr, @@ -44,6 +45,9 @@ extern const struct nfsd4_layout_ops bl_layout_ops; #ifdef CONFIG_NFSD_SCSILAYOUT extern const struct nfsd4_layout_ops scsi_layout_ops; #endif +#ifdef CONFIG_NFSD_FLEXFILELAYOUT +extern const struct nfsd4_layout_ops ff_layout_ops; +#endif __be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stateid_t *stateid, diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 64053eadeb81..b95adf9a1595 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -345,6 +345,7 @@ struct nfs4_client { u32 cl_exchange_flags; /* number of rpc's in progress over an associated session: */ atomic_t cl_refcount; + struct nfs4_op_map cl_spo_must_allow; /* for nfs41 callbacks */ /* We currently support a single back channel with a single slot */ diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index cd90878a76aa..d97338bb6a39 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -84,7 +84,6 @@ static int nfsd_proc_open(struct inode *inode, struct file *file) } static const struct file_operations nfsd_proc_fops = { - .owner = THIS_MODULE, .open = nfsd_proc_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 6fbd81ecb410..ba944123167b 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1135,96 +1135,37 @@ nfsd_check_ignore_resizing(struct iattr *iap) iap->ia_valid &= ~ATTR_SIZE; } -/* - * Create a file (regular, directory, device, fifo); UNIX sockets - * not yet implemented. - * If the response fh has been verified, the parent directory should - * already be locked. Note that the parent directory is left locked. - * - * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp - */ +/* The parent directory should already be locked: */ __be32 -nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, +nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, char *fname, int flen, struct iattr *iap, int type, dev_t rdev, struct svc_fh *resfhp) { - struct dentry *dentry, *dchild = NULL; + struct dentry *dentry, *dchild; struct inode *dirp; __be32 err; __be32 err2; int host_err; - err = nfserr_perm; - if (!flen) - goto out; - err = nfserr_exist; - if (isdotent(fname, flen)) - goto out; - - err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); - if (err) - goto out; - dentry = fhp->fh_dentry; dirp = d_inode(dentry); - err = nfserr_notdir; - if (!dirp->i_op->lookup) - goto out; - /* - * Check whether the response file handle has been verified yet. - * If it has, the parent directory should already be locked. - */ - if (!resfhp->fh_dentry) { - host_err = fh_want_write(fhp); - if (host_err) - goto out_nfserr; - - /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */ - fh_lock_nested(fhp, I_MUTEX_PARENT); - dchild = lookup_one_len(fname, dentry, flen); - host_err = PTR_ERR(dchild); - if (IS_ERR(dchild)) - goto out_nfserr; - err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); - if (err) - goto out; - } else { - /* called from nfsd_proc_create */ - dchild = dget(resfhp->fh_dentry); - if (!fhp->fh_locked) { - /* not actually possible */ - printk(KERN_ERR - "nfsd_create: parent %pd2 not locked!\n", + dchild = dget(resfhp->fh_dentry); + if (!fhp->fh_locked) { + WARN_ONCE(1, "nfsd_create: parent %pd2 not locked!\n", dentry); - err = nfserr_io; - goto out; - } - } - /* - * Make sure the child dentry is still negative ... - */ - err = nfserr_exist; - if (d_really_is_positive(dchild)) { - dprintk("nfsd_create: dentry %pd/%pd not negative!\n", - dentry, dchild); - goto out; + err = nfserr_io; + goto out; } + err = nfsd_permission(rqstp, fhp->fh_export, dentry, NFSD_MAY_CREATE); + if (err) + goto out; + if (!(iap->ia_valid & ATTR_MODE)) iap->ia_mode = 0; iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type; - err = nfserr_inval; - if (!S_ISREG(type) && !S_ISDIR(type) && !special_file(type)) { - printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n", - type); - goto out; - } - - /* - * Get the dir op function pointer. - */ err = 0; host_err = 0; switch (type) { @@ -1242,6 +1183,10 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, case S_IFSOCK: host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); break; + default: + printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n", + type); + host_err = -EINVAL; } if (host_err < 0) goto out_nfserr; @@ -1251,7 +1196,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, /* * nfsd_create_setattr already committed the child. Transactional * filesystems had a chance to commit changes for both parent and - * child * simultaneously making the following commit_metadata a + * child simultaneously making the following commit_metadata a * noop. */ err2 = nfserrno(commit_metadata(fhp)); @@ -1263,8 +1208,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, if (!err) err = fh_update(resfhp); out: - if (dchild && !IS_ERR(dchild)) - dput(dchild); + dput(dchild); return err; out_nfserr: @@ -1272,6 +1216,50 @@ out_nfserr: goto out; } +/* + * Create a filesystem object (regular, directory, special). + * Note that the parent directory is left locked. + * + * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp + */ +__be32 +nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, + char *fname, int flen, struct iattr *iap, + int type, dev_t rdev, struct svc_fh *resfhp) +{ + struct dentry *dentry, *dchild = NULL; + struct inode *dirp; + __be32 err; + int host_err; + + if (isdotent(fname, flen)) + return nfserr_exist; + + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_NOP); + if (err) + return err; + + dentry = fhp->fh_dentry; + dirp = d_inode(dentry); + + host_err = fh_want_write(fhp); + if (host_err) + return nfserrno(host_err); + + fh_lock_nested(fhp, I_MUTEX_PARENT); + dchild = lookup_one_len(fname, dentry, flen); + host_err = PTR_ERR(dchild); + if (IS_ERR(dchild)) + return nfserrno(host_err); + err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); + if (err) { + dput(dchild); + return err; + } + return nfsd_create_locked(rqstp, fhp, fname, flen, iap, type, + rdev, resfhp); +} + #ifdef CONFIG_NFSD_V3 /* @@ -1304,12 +1292,6 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, dentry = fhp->fh_dentry; dirp = d_inode(dentry); - /* Get all the sanity checks out of the way before - * we lock the parent. */ - err = nfserr_notdir; - if (!dirp->i_op->lookup) - goto out; - host_err = fh_want_write(fhp); if (host_err) goto out_nfserr; diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 2d573ec057f8..3cbb1b33777b 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -59,6 +59,9 @@ __be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *, __be32 nfsd4_clone_file_range(struct file *, u64, struct file *, u64, u64); #endif /* CONFIG_NFSD_V4 */ +__be32 nfsd_create_locked(struct svc_rqst *, struct svc_fh *, + char *name, int len, struct iattr *attrs, + int type, dev_t rdev, struct svc_fh *res); __be32 nfsd_create(struct svc_rqst *, struct svc_fh *, char *name, int len, struct iattr *attrs, int type, dev_t rdev, struct svc_fh *res); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index d9554813e58a..beea0c5edc51 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -59,6 +59,7 @@ struct nfsd4_compound_state { struct nfsd4_session *session; struct nfsd4_slot *slot; int data_offset; + bool spo_must_allowed; size_t iovlen; u32 minorversion; __be32 status; @@ -403,6 +404,8 @@ struct nfsd4_exchange_id { clientid_t clientid; u32 seqid; int spa_how; + u32 spo_must_enforce[3]; + u32 spo_must_allow[3]; }; struct nfsd4_sequence { @@ -654,6 +657,8 @@ set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) } + +bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp); int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *); int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *, struct nfsd4_compoundargs *); diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index 1a85d94f5b25..2c90e285d7c6 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -622,10 +622,10 @@ void nilfs_palloc_commit_free_entry(struct inode *inode, lock = nilfs_mdt_bgl_lock(inode, group); if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) - nilfs_warning(inode->i_sb, __func__, - "entry number %llu already freed: ino=%lu", - (unsigned long long)req->pr_entry_nr, - (unsigned long)inode->i_ino); + nilfs_msg(inode->i_sb, KERN_WARNING, + "%s (ino=%lu): entry number %llu already freed", + __func__, inode->i_ino, + (unsigned long long)req->pr_entry_nr); else nilfs_palloc_group_desc_add_entries(desc, lock, 1); @@ -663,10 +663,10 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, lock = nilfs_mdt_bgl_lock(inode, group); if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) - nilfs_warning(inode->i_sb, __func__, - "entry number %llu already freed: ino=%lu", - (unsigned long long)req->pr_entry_nr, - (unsigned long)inode->i_ino); + nilfs_msg(inode->i_sb, KERN_WARNING, + "%s (ino=%lu): entry number %llu already freed", + __func__, inode->i_ino, + (unsigned long long)req->pr_entry_nr); else nilfs_palloc_group_desc_add_entries(desc, lock, 1); @@ -772,10 +772,10 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) do { if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) { - nilfs_warning(inode->i_sb, __func__, - "entry number %llu already freed: ino=%lu", - (unsigned long long)entry_nrs[j], - (unsigned long)inode->i_ino); + nilfs_msg(inode->i_sb, KERN_WARNING, + "%s (ino=%lu): entry number %llu already freed", + __func__, inode->i_ino, + (unsigned long long)entry_nrs[j]); } else { n++; } @@ -816,12 +816,11 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) for (k = 0; k < nempties; k++) { ret = nilfs_palloc_delete_entry_block(inode, last_nrs[k]); - if (ret && ret != -ENOENT) { - nilfs_warning(inode->i_sb, __func__, - "failed to delete block of entry %llu: ino=%lu, err=%d", - (unsigned long long)last_nrs[k], - (unsigned long)inode->i_ino, ret); - } + if (ret && ret != -ENOENT) + nilfs_msg(inode->i_sb, KERN_WARNING, + "error %d deleting block that object (entry=%llu, ino=%lu) belongs to", + ret, (unsigned long long)last_nrs[k], + inode->i_ino); } desc_kaddr = kmap_atomic(desc_bh->b_page); @@ -835,12 +834,10 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) if (nfree == nilfs_palloc_entries_per_group(inode)) { ret = nilfs_palloc_delete_bitmap_block(inode, group); - if (ret && ret != -ENOENT) { - nilfs_warning(inode->i_sb, __func__, - "failed to delete bitmap block of group %lu: ino=%lu, err=%d", - group, - (unsigned long)inode->i_ino, ret); - } + if (ret && ret != -ENOENT) + nilfs_msg(inode->i_sb, KERN_WARNING, + "error %d deleting bitmap block of group=%lu, ino=%lu", + ret, group, inode->i_ino); } } return 0; diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index f2a7877e0c8c..01fb1831ca25 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -41,8 +41,8 @@ static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap, struct inode *inode = bmap->b_inode; if (err == -EINVAL) { - nilfs_error(inode->i_sb, fname, - "broken bmap (inode number=%lu)", inode->i_ino); + __nilfs_error(inode->i_sb, fname, + "broken bmap (inode number=%lu)", inode->i_ino); err = -EIO; } return err; diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h index b6a4c8f93ac8..2b6ffbe5997a 100644 --- a/fs/nilfs2/bmap.h +++ b/fs/nilfs2/bmap.h @@ -22,7 +22,7 @@ #include <linux/types.h> #include <linux/fs.h> #include <linux/buffer_head.h> -#include <linux/nilfs2_fs.h> +#include <linux/nilfs2_ondisk.h> /* nilfs_binfo, nilfs_inode, etc */ #include "alloc.h" #include "dat.h" diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 4cca998ec7a0..d5c23da43513 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -41,7 +41,7 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) struct inode *inode = NILFS_BTNC_I(btnc); struct buffer_head *bh; - bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node); + bh = nilfs_grab_buffer(inode, btnc, blocknr, BIT(BH_NILFS_Node)); if (unlikely(!bh)) return NULL; @@ -70,7 +70,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, struct page *page; int err; - bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node); + bh = nilfs_grab_buffer(inode, btnc, blocknr, BIT(BH_NILFS_Node)); if (unlikely(!bh)) return -ENOMEM; diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 982d1e3df3a5..2e315f9f2e51 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -339,12 +339,14 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node, * nilfs_btree_node_broken - verify consistency of btree node * @node: btree node block to be examined * @size: node size (in bytes) + * @inode: host inode of btree * @blocknr: block number * * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned. */ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node, - size_t size, sector_t blocknr) + size_t size, struct inode *inode, + sector_t blocknr) { int level, flags, nchildren; int ret = 0; @@ -358,9 +360,10 @@ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node, (flags & NILFS_BTREE_NODE_ROOT) || nchildren < 0 || nchildren > NILFS_BTREE_NODE_NCHILDREN_MAX(size))) { - printk(KERN_CRIT "NILFS: bad btree node (blocknr=%llu): " - "level = %d, flags = 0x%x, nchildren = %d\n", - (unsigned long long)blocknr, level, flags, nchildren); + nilfs_msg(inode->i_sb, KERN_CRIT, + "bad btree node (ino=%lu, blocknr=%llu): level = %d, flags = 0x%x, nchildren = %d", + inode->i_ino, (unsigned long long)blocknr, level, + flags, nchildren); ret = 1; } return ret; @@ -369,12 +372,12 @@ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node, /** * nilfs_btree_root_broken - verify consistency of btree root node * @node: btree root node to be examined - * @ino: inode number + * @inode: host inode of btree * * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned. */ static int nilfs_btree_root_broken(const struct nilfs_btree_node *node, - unsigned long ino) + struct inode *inode) { int level, flags, nchildren; int ret = 0; @@ -387,8 +390,9 @@ static int nilfs_btree_root_broken(const struct nilfs_btree_node *node, level >= NILFS_BTREE_LEVEL_MAX || nchildren < 0 || nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX)) { - pr_crit("NILFS: bad btree root (inode number=%lu): level = %d, flags = 0x%x, nchildren = %d\n", - ino, level, flags, nchildren); + nilfs_msg(inode->i_sb, KERN_CRIT, + "bad btree root (ino=%lu): level = %d, flags = 0x%x, nchildren = %d", + inode->i_ino, level, flags, nchildren); ret = 1; } return ret; @@ -396,13 +400,15 @@ static int nilfs_btree_root_broken(const struct nilfs_btree_node *node, int nilfs_btree_broken_node_block(struct buffer_head *bh) { + struct inode *inode; int ret; if (buffer_nilfs_checked(bh)) return 0; + inode = bh->b_page->mapping->host; ret = nilfs_btree_node_broken((struct nilfs_btree_node *)bh->b_data, - bh->b_size, bh->b_blocknr); + bh->b_size, inode, bh->b_blocknr); if (likely(!ret)) set_buffer_nilfs_checked(bh); return ret; @@ -448,13 +454,15 @@ nilfs_btree_get_node(const struct nilfs_bmap *btree, return node; } -static int -nilfs_btree_bad_node(struct nilfs_btree_node *node, int level) +static int nilfs_btree_bad_node(const struct nilfs_bmap *btree, + struct nilfs_btree_node *node, int level) { if (unlikely(nilfs_btree_node_get_level(node) != level)) { dump_stack(); - printk(KERN_CRIT "NILFS: btree level mismatch: %d != %d\n", - nilfs_btree_node_get_level(node), level); + nilfs_msg(btree->b_inode->i_sb, KERN_CRIT, + "btree level mismatch (ino=%lu): %d != %d", + btree->b_inode->i_ino, + nilfs_btree_node_get_level(node), level); return 1; } return 0; @@ -509,6 +517,9 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr, out_no_wait: if (!buffer_uptodate(bh)) { + nilfs_msg(btree->b_inode->i_sb, KERN_ERR, + "I/O error reading b-tree node block (ino=%lu, blocknr=%llu)", + btree->b_inode->i_ino, (unsigned long long)ptr); brelse(bh); return -EIO; } @@ -568,7 +579,7 @@ static int nilfs_btree_do_lookup(const struct nilfs_bmap *btree, return ret; node = nilfs_btree_get_nonroot_node(path, level); - if (nilfs_btree_bad_node(node, level)) + if (nilfs_btree_bad_node(btree, node, level)) return -EINVAL; if (!found) found = nilfs_btree_node_lookup(node, key, &index); @@ -616,7 +627,7 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_bmap *btree, if (ret < 0) return ret; node = nilfs_btree_get_nonroot_node(path, level); - if (nilfs_btree_bad_node(node, level)) + if (nilfs_btree_bad_node(btree, node, level)) return -EINVAL; index = nilfs_btree_node_get_nchildren(node) - 1; ptr = nilfs_btree_node_get_ptr(node, index, ncmax); @@ -2072,8 +2083,10 @@ static int nilfs_btree_propagate(struct nilfs_bmap *btree, ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0); if (ret < 0) { if (unlikely(ret == -ENOENT)) - printk(KERN_CRIT "%s: key = %llu, level == %d\n", - __func__, (unsigned long long)key, level); + nilfs_msg(btree->b_inode->i_sb, KERN_CRIT, + "writing node/leaf block does not appear in b-tree (ino=%lu) at key=%llu, level=%d", + btree->b_inode->i_ino, + (unsigned long long)key, level); goto out; } @@ -2110,12 +2123,11 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree, if (level < NILFS_BTREE_LEVEL_NODE_MIN || level >= NILFS_BTREE_LEVEL_MAX) { dump_stack(); - printk(KERN_WARNING - "%s: invalid btree level: %d (key=%llu, ino=%lu, " - "blocknr=%llu)\n", - __func__, level, (unsigned long long)key, - NILFS_BMAP_I(btree)->vfs_inode.i_ino, - (unsigned long long)bh->b_blocknr); + nilfs_msg(btree->b_inode->i_sb, KERN_WARNING, + "invalid btree level: %d (key=%llu, ino=%lu, blocknr=%llu)", + level, (unsigned long long)key, + btree->b_inode->i_ino, + (unsigned long long)bh->b_blocknr); return; } @@ -2394,8 +2406,7 @@ int nilfs_btree_init(struct nilfs_bmap *bmap) __nilfs_btree_init(bmap); - if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap), - bmap->b_inode->i_ino)) + if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap), bmap->b_inode)) ret = -EIO; return ret; } diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h index df1a25faa83b..2184e47fa4bf 100644 --- a/fs/nilfs2/btree.h +++ b/fs/nilfs2/btree.h @@ -22,7 +22,7 @@ #include <linux/types.h> #include <linux/buffer_head.h> #include <linux/list.h> -#include <linux/nilfs2_fs.h> +#include <linux/nilfs2_ondisk.h> /* nilfs_btree_node */ #include "btnode.h" #include "bmap.h" diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 8a3d3b65af3f..a15a1601e931 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -21,7 +21,6 @@ #include <linux/string.h> #include <linux/buffer_head.h> #include <linux/errno.h> -#include <linux/nilfs2_fs.h> #include "mdt.h" #include "cpfile.h" @@ -332,9 +331,9 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, int ret, ncps, nicps, nss, count, i; if (unlikely(start == 0 || start > end)) { - printk(KERN_ERR "%s: invalid range of checkpoint numbers: " - "[%llu, %llu)\n", __func__, - (unsigned long long)start, (unsigned long long)end); + nilfs_msg(cpfile->i_sb, KERN_ERR, + "cannot delete checkpoints: invalid range [%llu, %llu)", + (unsigned long long)start, (unsigned long long)end); return -EINVAL; } @@ -386,9 +385,9 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, cpfile, cno); if (ret == 0) continue; - printk(KERN_ERR - "%s: cannot delete block\n", - __func__); + nilfs_msg(cpfile->i_sb, KERN_ERR, + "error %d deleting checkpoint block", + ret); break; } } @@ -991,14 +990,12 @@ int nilfs_cpfile_read(struct super_block *sb, size_t cpsize, int err; if (cpsize > sb->s_blocksize) { - printk(KERN_ERR - "NILFS: too large checkpoint size: %zu bytes.\n", - cpsize); + nilfs_msg(sb, KERN_ERR, + "too large checkpoint size: %zu bytes", cpsize); return -EINVAL; } else if (cpsize < NILFS_MIN_CHECKPOINT_SIZE) { - printk(KERN_ERR - "NILFS: too small checkpoint size: %zu bytes.\n", - cpsize); + nilfs_msg(sb, KERN_ERR, + "too small checkpoint size: %zu bytes", cpsize); return -EINVAL; } diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h index 0249744ae234..6eca972f9673 100644 --- a/fs/nilfs2/cpfile.h +++ b/fs/nilfs2/cpfile.h @@ -21,7 +21,8 @@ #include <linux/fs.h> #include <linux/buffer_head.h> -#include <linux/nilfs2_fs.h> +#include <linux/nilfs2_api.h> /* nilfs_cpstat */ +#include <linux/nilfs2_ondisk.h> /* nilfs_inode, nilfs_checkpoint */ int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 7367610ea807..dffedb2f8817 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -349,10 +349,11 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) kaddr = kmap_atomic(entry_bh->b_page); entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); if (unlikely(entry->de_blocknr == cpu_to_le64(0))) { - printk(KERN_CRIT "%s: vbn = %llu, [%llu, %llu)\n", __func__, - (unsigned long long)vblocknr, - (unsigned long long)le64_to_cpu(entry->de_start), - (unsigned long long)le64_to_cpu(entry->de_end)); + nilfs_msg(dat->i_sb, KERN_CRIT, + "%s: invalid vblocknr = %llu, [%llu, %llu)", + __func__, (unsigned long long)vblocknr, + (unsigned long long)le64_to_cpu(entry->de_start), + (unsigned long long)le64_to_cpu(entry->de_end)); kunmap_atomic(kaddr); brelse(entry_bh); return -EINVAL; @@ -479,14 +480,12 @@ int nilfs_dat_read(struct super_block *sb, size_t entry_size, int err; if (entry_size > sb->s_blocksize) { - printk(KERN_ERR - "NILFS: too large DAT entry size: %zu bytes.\n", - entry_size); + nilfs_msg(sb, KERN_ERR, "too large DAT entry size: %zu bytes", + entry_size); return -EINVAL; } else if (entry_size < NILFS_MIN_DAT_ENTRY_SIZE) { - printk(KERN_ERR - "NILFS: too small DAT entry size: %zu bytes.\n", - entry_size); + nilfs_msg(sb, KERN_ERR, "too small DAT entry size: %zu bytes", + entry_size); return -EINVAL; } diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h index abbfdabcabea..57dc6cf466d0 100644 --- a/fs/nilfs2/dat.h +++ b/fs/nilfs2/dat.h @@ -22,6 +22,7 @@ #include <linux/types.h> #include <linux/buffer_head.h> #include <linux/fs.h> +#include <linux/nilfs2_ondisk.h> /* nilfs_inode, nilfs_checkpoint */ struct nilfs_palloc_req; diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index e506f4f7120a..908ebbf0ac7e 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -42,6 +42,28 @@ #include "nilfs.h" #include "page.h" +static inline unsigned int nilfs_rec_len_from_disk(__le16 dlen) +{ + unsigned int len = le16_to_cpu(dlen); + +#if (PAGE_SIZE >= 65536) + if (len == NILFS_MAX_REC_LEN) + return 1 << 16; +#endif + return len; +} + +static inline __le16 nilfs_rec_len_to_disk(unsigned int len) +{ +#if (PAGE_SIZE >= 65536) + if (len == (1 << 16)) + return cpu_to_le16(NILFS_MAX_REC_LEN); + + BUG_ON(len > (1 << 16)); +#endif + return cpu_to_le16(len); +} + /* * nilfs uses block-sized chunks. Arguably, sector-sized ones would be * more robust, but we have what we have @@ -140,10 +162,9 @@ out: /* Too bad, we had an error */ Ebadsize: - nilfs_error(sb, "nilfs_check_page", + nilfs_error(sb, "size of directory #%lu is not a multiple of chunk size", - dir->i_ino - ); + dir->i_ino); goto fail; Eshort: error = "rec_len is smaller than minimal"; @@ -157,19 +178,18 @@ Enamelen: Espan: error = "directory entry across blocks"; bad_entry: - nilfs_error(sb, "nilfs_check_page", "bad entry in directory #%lu: %s - " - "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", - dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs, - (unsigned long) le64_to_cpu(p->inode), + nilfs_error(sb, + "bad entry in directory #%lu: %s - offset=%lu, inode=%lu, rec_len=%d, name_len=%d", + dir->i_ino, error, (page->index << PAGE_SHIFT) + offs, + (unsigned long)le64_to_cpu(p->inode), rec_len, p->name_len); goto fail; Eend: p = (struct nilfs_dir_entry *)(kaddr + offs); - nilfs_error(sb, "nilfs_check_page", - "entry in directory #%lu spans the page boundary" - "offset=%lu, inode=%lu", - dir->i_ino, (page->index<<PAGE_SHIFT)+offs, - (unsigned long) le64_to_cpu(p->inode)); + nilfs_error(sb, + "entry in directory #%lu spans the page boundary offset=%lu, inode=%lu", + dir->i_ino, (page->index << PAGE_SHIFT) + offs, + (unsigned long)le64_to_cpu(p->inode)); fail: SetPageError(page); return false; @@ -267,8 +287,7 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx) struct page *page = nilfs_get_page(inode, n); if (IS_ERR(page)) { - nilfs_error(sb, __func__, "bad page in #%lu", - inode->i_ino); + nilfs_error(sb, "bad page in #%lu", inode->i_ino); ctx->pos += PAGE_SIZE - offset; return -EIO; } @@ -278,8 +297,7 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx) NILFS_DIR_REC_LEN(1); for ( ; (char *)de <= limit; de = nilfs_next_entry(de)) { if (de->rec_len == 0) { - nilfs_error(sb, __func__, - "zero-length directory entry"); + nilfs_error(sb, "zero-length directory entry"); nilfs_put_page(page); return -EIO; } @@ -345,7 +363,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr, kaddr += nilfs_last_byte(dir, n) - reclen; while ((char *) de <= kaddr) { if (de->rec_len == 0) { - nilfs_error(dir->i_sb, __func__, + nilfs_error(dir->i_sb, "zero-length directory entry"); nilfs_put_page(page); goto out; @@ -360,7 +378,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr, n = 0; /* next page is past the blocks we've got */ if (unlikely(n > (dir->i_blocks >> (PAGE_SHIFT - 9)))) { - nilfs_error(dir->i_sb, __func__, + nilfs_error(dir->i_sb, "dir %lu size %lld exceeds block count %llu", dir->i_ino, dir->i_size, (unsigned long long)dir->i_blocks); @@ -469,7 +487,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode) goto got_it; } if (de->rec_len == 0) { - nilfs_error(dir->i_sb, __func__, + nilfs_error(dir->i_sb, "zero-length directory entry"); err = -EIO; goto out_unlock; @@ -541,7 +559,7 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) while ((char *)de < (char *)dir) { if (de->rec_len == 0) { - nilfs_error(inode->i_sb, __func__, + nilfs_error(inode->i_sb, "zero-length directory entry"); err = -EIO; goto out; @@ -628,7 +646,7 @@ int nilfs_empty_dir(struct inode *inode) while ((char *)de <= kaddr) { if (de->rec_len == 0) { - nilfs_error(inode->i_sb, __func__, + nilfs_error(inode->i_sb, "zero-length directory entry (kaddr=%p, de=%p)", kaddr, de); goto not_empty; diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c index 251a44928405..96e3ed0d9652 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c @@ -337,14 +337,16 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap, key = nilfs_bmap_data_get_key(bmap, *bh); if (unlikely(key > NILFS_DIRECT_KEY_MAX)) { - printk(KERN_CRIT "%s: invalid key: %llu\n", __func__, - (unsigned long long)key); + nilfs_msg(bmap->b_inode->i_sb, KERN_CRIT, + "%s (ino=%lu): invalid key: %llu", __func__, + bmap->b_inode->i_ino, (unsigned long long)key); return -EINVAL; } ptr = nilfs_direct_get_ptr(bmap, key); if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) { - printk(KERN_CRIT "%s: invalid pointer: %llu\n", __func__, - (unsigned long long)ptr); + nilfs_msg(bmap->b_inode->i_sb, KERN_CRIT, + "%s (ino=%lu): invalid pointer: %llu", __func__, + bmap->b_inode->i_ino, (unsigned long long)ptr); return -EINVAL; } diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h index 3015a6e78724..cfe85e848bba 100644 --- a/fs/nilfs2/direct.h +++ b/fs/nilfs2/direct.h @@ -24,16 +24,6 @@ #include "bmap.h" -/** - * struct nilfs_direct_node - direct node - * @dn_flags: flags - * @dn_pad: padding - */ -struct nilfs_direct_node { - __u8 dn_flags; - __u8 pad[7]; -}; - #define NILFS_DIRECT_NBLOCKS (NILFS_BMAP_SIZE / sizeof(__le64) - 1) #define NILFS_DIRECT_KEY_MIN 0 #define NILFS_DIRECT_KEY_MAX (NILFS_DIRECT_NBLOCKS - 1) diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index e9148f94d696..853a831dcde0 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -148,8 +148,15 @@ int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn, int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh) { wait_on_buffer(bh); - if (!buffer_uptodate(bh)) + if (!buffer_uptodate(bh)) { + struct inode *inode = bh->b_page->mapping->host; + + nilfs_msg(inode->i_sb, KERN_ERR, + "I/O error reading %s block for GC (ino=%lu, vblocknr=%llu)", + buffer_nilfs_node(bh) ? "node" : "data", + inode->i_ino, (unsigned long long)bh->b_blocknr); return -EIO; + } if (buffer_dirty(bh)) return -EEXIST; diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index 1d2b1805327a..b8fa45c20c63 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -145,15 +145,14 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino, int err; if (unlikely(!NILFS_VALID_INODE(sb, ino))) { - nilfs_error(sb, __func__, "bad inode number: %lu", - (unsigned long) ino); + nilfs_error(sb, "bad inode number: %lu", (unsigned long)ino); return -EINVAL; } err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh); if (unlikely(err)) - nilfs_warning(sb, __func__, "unable to read inode: %lu", - (unsigned long) ino); + nilfs_msg(sb, KERN_WARNING, "error %d reading inode: ino=%lu", + err, (unsigned long)ino); return err; } diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h index 23ad2f091e76..188b94fe0ec5 100644 --- a/fs/nilfs2/ifile.h +++ b/fs/nilfs2/ifile.h @@ -23,7 +23,6 @@ #include <linux/fs.h> #include <linux/buffer_head.h> -#include <linux/nilfs2_fs.h> #include "mdt.h" #include "alloc.h" diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index a0ebdb17e912..af04f553d7c9 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -112,13 +112,10 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff, * However, the page having this block must * be locked in this case. */ - printk(KERN_WARNING - "nilfs_get_block: a race condition " - "while inserting a data block. " - "(inode number=%lu, file block " - "offset=%llu)\n", - inode->i_ino, - (unsigned long long)blkoff); + nilfs_msg(inode->i_sb, KERN_WARNING, + "%s (ino=%lu): a race condition while inserting a data block at offset=%llu", + __func__, inode->i_ino, + (unsigned long long)blkoff); err = 0; } nilfs_transaction_abort(inode->i_sb); @@ -359,7 +356,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) root = NILFS_I(dir)->i_root; ii = NILFS_I(inode); - ii->i_state = 1 << NILFS_I_NEW; + ii->i_state = BIT(NILFS_I_NEW); ii->i_root = root; err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh); @@ -558,7 +555,7 @@ static int nilfs_iget_set(struct inode *inode, void *opaque) inode->i_ino = args->ino; if (args->for_gc) { - NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE; + NILFS_I(inode)->i_state = BIT(NILFS_I_GCINODE); NILFS_I(inode)->i_cno = args->cno; NILFS_I(inode)->i_root = NULL; } else { @@ -726,9 +723,9 @@ repeat: goto repeat; failed: - nilfs_warning(ii->vfs_inode.i_sb, __func__, - "failed to truncate bmap (ino=%lu, err=%d)", - ii->vfs_inode.i_ino, ret); + nilfs_msg(ii->vfs_inode.i_sb, KERN_WARNING, + "error %d truncating bmap (ino=%lu)", ret, + ii->vfs_inode.i_ino); } void nilfs_truncate(struct inode *inode) @@ -939,9 +936,9 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty) * This will happen when somebody is freeing * this inode. */ - nilfs_warning(inode->i_sb, __func__, - "cannot get inode (ino=%lu)", - inode->i_ino); + nilfs_msg(inode->i_sb, KERN_WARNING, + "cannot set file dirty (ino=%lu): the file is being freed", + inode->i_ino); spin_unlock(&nilfs->ns_inode_lock); return -EINVAL; /* * NILFS_I_DIRTY may remain for @@ -962,8 +959,9 @@ int __nilfs_mark_inode_dirty(struct inode *inode, int flags) err = nilfs_load_inode_block(inode, &ibh); if (unlikely(err)) { - nilfs_warning(inode->i_sb, __func__, - "failed to reget inode block."); + nilfs_msg(inode->i_sb, KERN_WARNING, + "cannot mark inode dirty (ino=%lu): error %d loading inode block", + inode->i_ino, err); return err; } nilfs_update_inode(inode, ibh, flags); @@ -989,8 +987,8 @@ void nilfs_dirty_inode(struct inode *inode, int flags) struct nilfs_mdt_info *mdi = NILFS_MDT(inode); if (is_bad_inode(inode)) { - nilfs_warning(inode->i_sb, __func__, - "tried to mark bad_inode dirty. ignored."); + nilfs_msg(inode->i_sb, KERN_WARNING, + "tried to mark bad_inode dirty. ignored."); dump_stack(); return; } diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 358b57e2cdf9..f1d7989459fd 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -25,7 +25,6 @@ #include <linux/compat.h> /* compat_ptr() */ #include <linux/mount.h> /* mnt_want_write_file(), mnt_drop_write_file() */ #include <linux/buffer_head.h> -#include <linux/nilfs2_fs.h> #include "nilfs.h" #include "segment.h" #include "bmap.h" @@ -584,27 +583,25 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode, if (unlikely(ret < 0)) { if (ret == -ENOENT) - printk(KERN_CRIT - "%s: invalid virtual block address (%s): " - "ino=%llu, cno=%llu, offset=%llu, " - "blocknr=%llu, vblocknr=%llu\n", - __func__, vdesc->vd_flags ? "node" : "data", - (unsigned long long)vdesc->vd_ino, - (unsigned long long)vdesc->vd_cno, - (unsigned long long)vdesc->vd_offset, - (unsigned long long)vdesc->vd_blocknr, - (unsigned long long)vdesc->vd_vblocknr); + nilfs_msg(inode->i_sb, KERN_CRIT, + "%s: invalid virtual block address (%s): ino=%llu, cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu", + __func__, vdesc->vd_flags ? "node" : "data", + (unsigned long long)vdesc->vd_ino, + (unsigned long long)vdesc->vd_cno, + (unsigned long long)vdesc->vd_offset, + (unsigned long long)vdesc->vd_blocknr, + (unsigned long long)vdesc->vd_vblocknr); return ret; } if (unlikely(!list_empty(&bh->b_assoc_buffers))) { - printk(KERN_CRIT "%s: conflicting %s buffer: ino=%llu, " - "cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu\n", - __func__, vdesc->vd_flags ? "node" : "data", - (unsigned long long)vdesc->vd_ino, - (unsigned long long)vdesc->vd_cno, - (unsigned long long)vdesc->vd_offset, - (unsigned long long)vdesc->vd_blocknr, - (unsigned long long)vdesc->vd_vblocknr); + nilfs_msg(inode->i_sb, KERN_CRIT, + "%s: conflicting %s buffer: ino=%llu, cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu", + __func__, vdesc->vd_flags ? "node" : "data", + (unsigned long long)vdesc->vd_ino, + (unsigned long long)vdesc->vd_cno, + (unsigned long long)vdesc->vd_offset, + (unsigned long long)vdesc->vd_blocknr, + (unsigned long long)vdesc->vd_vblocknr); brelse(bh); return -EEXIST; } @@ -854,8 +851,8 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs, return 0; failed: - printk(KERN_ERR "NILFS: GC failed during preparation: %s: err=%d\n", - msg, ret); + nilfs_msg(nilfs->ns_sb, KERN_ERR, "error %d preparing GC: %s", ret, + msg); return ret; } @@ -963,10 +960,11 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, } ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]); - if (ret < 0) - printk(KERN_ERR "NILFS: GC failed during preparation: " - "cannot read source blocks: err=%d\n", ret); - else { + if (ret < 0) { + nilfs_msg(inode->i_sb, KERN_ERR, + "error %d preparing GC: cannot read source blocks", + ret); + } else { if (nilfs_sb_need_update(nilfs)) set_nilfs_discontinued(nilfs); ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 0d7b71fbeff8..d56d3a5bea88 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -207,8 +207,12 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block, out_no_wait: err = -EIO; - if (!buffer_uptodate(first_bh)) + if (!buffer_uptodate(first_bh)) { + nilfs_msg(inode->i_sb, KERN_ERR, + "I/O error reading meta-data file (ino=%lu, block-offset=%lu)", + inode->i_ino, block); goto failed_bh; + } out: *out_bh = first_bh; return 0; diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 1ec8ae5995a5..dbcf1dc93a51 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -283,9 +283,9 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) goto out; if (!inode->i_nlink) { - nilfs_warning(inode->i_sb, __func__, - "deleting nonexistent file (%lu), %d", - inode->i_ino, inode->i_nlink); + nilfs_msg(inode->i_sb, KERN_WARNING, + "deleting nonexistent file (ino=%lu), %d", + inode->i_ino, inode->i_nlink); set_nlink(inode, 1); } err = nilfs_delete_entry(de, page); diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index b1d48bc0532d..33f8c8fc96e8 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -23,7 +23,8 @@ #include <linux/buffer_head.h> #include <linux/spinlock.h> #include <linux/blkdev.h> -#include <linux/nilfs2_fs.h> +#include <linux/nilfs2_api.h> +#include <linux/nilfs2_ondisk.h> #include "the_nilfs.h" #include "bmap.h" @@ -119,20 +120,19 @@ enum { /* * Macros to check inode numbers */ -#define NILFS_MDT_INO_BITS \ - ((unsigned int)(1 << NILFS_DAT_INO | 1 << NILFS_CPFILE_INO | \ - 1 << NILFS_SUFILE_INO | 1 << NILFS_IFILE_INO | \ - 1 << NILFS_ATIME_INO | 1 << NILFS_SKETCH_INO)) +#define NILFS_MDT_INO_BITS \ + (BIT(NILFS_DAT_INO) | BIT(NILFS_CPFILE_INO) | \ + BIT(NILFS_SUFILE_INO) | BIT(NILFS_IFILE_INO) | \ + BIT(NILFS_ATIME_INO) | BIT(NILFS_SKETCH_INO)) -#define NILFS_SYS_INO_BITS \ - ((unsigned int)(1 << NILFS_ROOT_INO) | NILFS_MDT_INO_BITS) +#define NILFS_SYS_INO_BITS (BIT(NILFS_ROOT_INO) | NILFS_MDT_INO_BITS) #define NILFS_FIRST_INO(sb) (((struct the_nilfs *)sb->s_fs_info)->ns_first_ino) #define NILFS_MDT_INODE(sb, ino) \ - ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & (1 << (ino)))) + ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & BIT(ino))) #define NILFS_VALID_INODE(sb, ino) \ - ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & (1 << (ino)))) + ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & BIT(ino))) /** * struct nilfs_transaction_info: context information for synchronization @@ -299,10 +299,36 @@ static inline int nilfs_mark_inode_dirty_sync(struct inode *inode) /* super.c */ extern struct inode *nilfs_alloc_inode(struct super_block *); extern void nilfs_destroy_inode(struct inode *); + extern __printf(3, 4) -void nilfs_error(struct super_block *, const char *, const char *, ...); +void __nilfs_msg(struct super_block *sb, const char *level, + const char *fmt, ...); extern __printf(3, 4) -void nilfs_warning(struct super_block *, const char *, const char *, ...); +void __nilfs_error(struct super_block *sb, const char *function, + const char *fmt, ...); + +#ifdef CONFIG_PRINTK + +#define nilfs_msg(sb, level, fmt, ...) \ + __nilfs_msg(sb, level, fmt, ##__VA_ARGS__) +#define nilfs_error(sb, fmt, ...) \ + __nilfs_error(sb, __func__, fmt, ##__VA_ARGS__) + +#else + +#define nilfs_msg(sb, level, fmt, ...) \ + do { \ + no_printk(fmt, ##__VA_ARGS__); \ + (void)(sb); \ + } while (0) +#define nilfs_error(sb, fmt, ...) \ + do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __nilfs_error(sb, "", " "); \ + } while (0) + +#endif /* CONFIG_PRINTK */ + extern struct nilfs_super_block * nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **); extern int nilfs_store_magic_and_option(struct super_block *, diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index d97ba5f11b77..f11a3ad2df0c 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -30,9 +30,9 @@ #include "mdt.h" -#define NILFS_BUFFER_INHERENT_BITS \ - ((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \ - (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Checked)) +#define NILFS_BUFFER_INHERENT_BITS \ + (BIT(BH_Uptodate) | BIT(BH_Mapped) | BIT(BH_NILFS_Node) | \ + BIT(BH_NILFS_Volatile) | BIT(BH_NILFS_Checked)) static struct buffer_head * __nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index, @@ -85,9 +85,9 @@ void nilfs_forget_buffer(struct buffer_head *bh) { struct page *page = bh->b_page; const unsigned long clear_bits = - (1 << BH_Uptodate | 1 << BH_Dirty | 1 << BH_Mapped | - 1 << BH_Async_Write | 1 << BH_NILFS_Volatile | - 1 << BH_NILFS_Checked | 1 << BH_NILFS_Redirected); + (BIT(BH_Uptodate) | BIT(BH_Dirty) | BIT(BH_Mapped) | + BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) | + BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected)); lock_buffer(bh); set_mask_bits(&bh->b_state, clear_bits, 0); @@ -124,17 +124,17 @@ void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh) dbh->b_bdev = sbh->b_bdev; bh = dbh; - bits = sbh->b_state & ((1UL << BH_Uptodate) | (1UL << BH_Mapped)); + bits = sbh->b_state & (BIT(BH_Uptodate) | BIT(BH_Mapped)); while ((bh = bh->b_this_page) != dbh) { lock_buffer(bh); bits &= bh->b_state; unlock_buffer(bh); } - if (bits & (1UL << BH_Uptodate)) + if (bits & BIT(BH_Uptodate)) SetPageUptodate(dpage); else ClearPageUptodate(dpage); - if (bits & (1UL << BH_Mapped)) + if (bits & BIT(BH_Mapped)) SetPageMappedToDisk(dpage); else ClearPageMappedToDisk(dpage); @@ -215,7 +215,7 @@ static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty) create_empty_buffers(dst, sbh->b_size, 0); if (copy_dirty) - mask |= (1UL << BH_Dirty); + mask |= BIT(BH_Dirty); dbh = dbufs = page_buffers(dst); do { @@ -403,11 +403,10 @@ void nilfs_clear_dirty_page(struct page *page, bool silent) BUG_ON(!PageLocked(page)); - if (!silent) { - nilfs_warning(sb, __func__, - "discard page: offset %lld, ino %lu", - page_offset(page), inode->i_ino); - } + if (!silent) + nilfs_msg(sb, KERN_WARNING, + "discard dirty page: offset=%lld, ino=%lu", + page_offset(page), inode->i_ino); ClearPageUptodate(page); ClearPageMappedToDisk(page); @@ -415,18 +414,18 @@ void nilfs_clear_dirty_page(struct page *page, bool silent) if (page_has_buffers(page)) { struct buffer_head *bh, *head; const unsigned long clear_bits = - (1 << BH_Uptodate | 1 << BH_Dirty | 1 << BH_Mapped | - 1 << BH_Async_Write | 1 << BH_NILFS_Volatile | - 1 << BH_NILFS_Checked | 1 << BH_NILFS_Redirected); + (BIT(BH_Uptodate) | BIT(BH_Dirty) | BIT(BH_Mapped) | + BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) | + BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected)); bh = head = page_buffers(page); do { lock_buffer(bh); - if (!silent) { - nilfs_warning(sb, __func__, - "discard block %llu, size %zu", - (u64)bh->b_blocknr, bh->b_size); - } + if (!silent) + nilfs_msg(sb, KERN_WARNING, + "discard dirty block: blocknr=%llu, size=%zu", + (u64)bh->b_blocknr, bh->b_size); + set_mask_bits(&bh->b_state, clear_bits, 0); unlock_buffer(bh); } while (bh = bh->b_this_page, bh != head); diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index d893dc912b62..5139efed1888 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -54,38 +54,37 @@ struct nilfs_recovery_block { }; -static int nilfs_warn_segment_error(int err) +static int nilfs_warn_segment_error(struct super_block *sb, int err) { + const char *msg = NULL; + switch (err) { case NILFS_SEG_FAIL_IO: - printk(KERN_WARNING - "NILFS warning: I/O error on loading last segment\n"); + nilfs_msg(sb, KERN_ERR, "I/O error reading segment"); return -EIO; case NILFS_SEG_FAIL_MAGIC: - printk(KERN_WARNING - "NILFS warning: Segment magic number invalid\n"); + msg = "Magic number mismatch"; break; case NILFS_SEG_FAIL_SEQ: - printk(KERN_WARNING - "NILFS warning: Sequence number mismatch\n"); + msg = "Sequence number mismatch"; break; case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT: - printk(KERN_WARNING - "NILFS warning: Checksum error in super root\n"); + msg = "Checksum error in super root"; break; case NILFS_SEG_FAIL_CHECKSUM_FULL: - printk(KERN_WARNING - "NILFS warning: Checksum error in segment payload\n"); + msg = "Checksum error in segment payload"; break; case NILFS_SEG_FAIL_CONSISTENCY: - printk(KERN_WARNING - "NILFS warning: Inconsistent segment\n"); + msg = "Inconsistency found"; break; case NILFS_SEG_NO_SUPER_ROOT: - printk(KERN_WARNING - "NILFS warning: No super root in the last segment\n"); + msg = "No super root in the last segment"; break; + default: + nilfs_msg(sb, KERN_ERR, "unrecognized segment error %d", err); + return -EINVAL; } + nilfs_msg(sb, KERN_WARNING, "invalid segment: %s", msg); return -EINVAL; } @@ -178,7 +177,7 @@ int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block, brelse(bh_sr); failed: - return nilfs_warn_segment_error(ret); + return nilfs_warn_segment_error(nilfs->ns_sb, ret); } /** @@ -553,11 +552,10 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, put_page(page); failed_inode: - printk(KERN_WARNING - "NILFS warning: error recovering data block " - "(err=%d, ino=%lu, block-offset=%llu)\n", - err, (unsigned long)rb->ino, - (unsigned long long)rb->blkoff); + nilfs_msg(sb, KERN_WARNING, + "error %d recovering data block (ino=%lu, block-offset=%llu)", + err, (unsigned long)rb->ino, + (unsigned long long)rb->blkoff); if (!err2) err2 = err; next: @@ -680,8 +678,8 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, } if (nsalvaged_blocks) { - printk(KERN_INFO "NILFS (device %s): salvaged %lu blocks\n", - sb->s_id, nsalvaged_blocks); + nilfs_msg(sb, KERN_INFO, "salvaged %lu blocks", + nsalvaged_blocks); ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE; } out: @@ -692,10 +690,9 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, confused: err = -EINVAL; failed: - printk(KERN_ERR - "NILFS (device %s): Error roll-forwarding " - "(err=%d, pseg block=%llu). ", - sb->s_id, err, (unsigned long long)pseg_start); + nilfs_msg(sb, KERN_ERR, + "error %d roll-forwarding partial segment at blocknr = %llu", + err, (unsigned long long)pseg_start); goto out; } @@ -715,9 +712,8 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs, set_buffer_dirty(bh); err = sync_dirty_buffer(bh); if (unlikely(err)) - printk(KERN_WARNING - "NILFS warning: buffer sync write failed during " - "post-cleaning of recovery.\n"); + nilfs_msg(nilfs->ns_sb, KERN_WARNING, + "buffer sync write failed during post-cleaning of recovery."); brelse(bh); } @@ -752,8 +748,8 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, err = nilfs_attach_checkpoint(sb, ri->ri_cno, true, &root); if (unlikely(err)) { - printk(KERN_ERR - "NILFS: error loading the latest checkpoint.\n"); + nilfs_msg(sb, KERN_ERR, + "error %d loading the latest checkpoint", err); return err; } @@ -764,8 +760,9 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) { err = nilfs_prepare_segment_for_recovery(nilfs, sb, ri); if (unlikely(err)) { - printk(KERN_ERR "NILFS: Error preparing segments for " - "recovery.\n"); + nilfs_msg(sb, KERN_ERR, + "error %d preparing segment for recovery", + err); goto failed; } @@ -778,8 +775,9 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, nilfs_detach_log_writer(sb); if (unlikely(err)) { - printk(KERN_ERR "NILFS: Oops! recovery failed. " - "(err=%d)\n", err); + nilfs_msg(sb, KERN_ERR, + "error %d writing segment for recovery", + err); goto failed; } @@ -961,5 +959,5 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, failed: brelse(bh_sum); nilfs_dispose_segment_list(&segments); - return (ret < 0) ? ret : nilfs_warn_segment_error(ret); + return ret < 0 ? ret : nilfs_warn_segment_error(nilfs->ns_sb, ret); } diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index a962d7d83447..6f87b2ac1aeb 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -514,7 +514,11 @@ static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf) } while (--segbuf->sb_nbio > 0); if (unlikely(atomic_read(&segbuf->sb_err) > 0)) { - printk(KERN_ERR "NILFS: IO error writing segment\n"); + nilfs_msg(segbuf->sb_super, KERN_ERR, + "I/O error writing log (start-blocknr=%llu, block-count=%lu) in segment %llu", + (unsigned long long)segbuf->sb_pseg_start, + segbuf->sb_sum.nblocks, + (unsigned long long)segbuf->sb_segnum); err = -EIO; } return err; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index e78b68a81aec..bedcae2c28e6 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -150,7 +150,8 @@ static void nilfs_dispose_list(struct the_nilfs *, struct list_head *, int); #define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a) #define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a) -static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti) +static int nilfs_prepare_segment_lock(struct super_block *sb, + struct nilfs_transaction_info *ti) { struct nilfs_transaction_info *cur_ti = current->journal_info; void *save = NULL; @@ -164,8 +165,7 @@ static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti) * it is saved and will be restored on * nilfs_transaction_commit(). */ - printk(KERN_WARNING - "NILFS warning: journal info from a different FS\n"); + nilfs_msg(sb, KERN_WARNING, "journal info from a different FS"); save = current->journal_info; } if (!ti) { @@ -215,7 +215,7 @@ int nilfs_transaction_begin(struct super_block *sb, int vacancy_check) { struct the_nilfs *nilfs; - int ret = nilfs_prepare_segment_lock(ti); + int ret = nilfs_prepare_segment_lock(sb, ti); struct nilfs_transaction_info *trace_ti; if (unlikely(ret < 0)) @@ -373,7 +373,7 @@ static void nilfs_transaction_lock(struct super_block *sb, nilfs_segctor_do_immediate_flush(sci); up_write(&nilfs->ns_segctor_sem); - yield(); + cond_resched(); } if (gcflag) ti->ti_flags |= NILFS_TI_GC; @@ -1858,11 +1858,11 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) */ list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { - const unsigned long set_bits = (1 << BH_Uptodate); + const unsigned long set_bits = BIT(BH_Uptodate); const unsigned long clear_bits = - (1 << BH_Dirty | 1 << BH_Async_Write | - 1 << BH_Delay | 1 << BH_NILFS_Volatile | - 1 << BH_NILFS_Redirected); + (BIT(BH_Dirty) | BIT(BH_Async_Write) | + BIT(BH_Delay) | BIT(BH_NILFS_Volatile) | + BIT(BH_NILFS_Redirected)); set_mask_bits(&bh->b_state, clear_bits, set_bits); if (bh == segbuf->sb_super_root) { @@ -1951,8 +1951,9 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, err = nilfs_ifile_get_inode_block( ifile, ii->vfs_inode.i_ino, &ibh); if (unlikely(err)) { - nilfs_warning(sci->sc_super, __func__, - "failed to get inode block."); + nilfs_msg(sci->sc_super, KERN_WARNING, + "log writer: error %d getting inode block (ino=%lu)", + err, ii->vfs_inode.i_ino); return err; } mark_buffer_dirty(ibh); @@ -2131,10 +2132,10 @@ static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci) static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn) { spin_lock(&sci->sc_state_lock); - if (!(sci->sc_flush_request & (1 << bn))) { + if (!(sci->sc_flush_request & BIT(bn))) { unsigned long prev_req = sci->sc_flush_request; - sci->sc_flush_request |= (1 << bn); + sci->sc_flush_request |= BIT(bn); if (!prev_req) wake_up(&sci->sc_wait_daemon); } @@ -2318,7 +2319,7 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode, } #define FLUSH_FILE_BIT (0x1) /* data file only */ -#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */ +#define FLUSH_DAT_BIT BIT(NILFS_DAT_INO) /* DAT only */ /** * nilfs_segctor_accept - record accepted sequence count of log-write requests @@ -2458,8 +2459,7 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv, if (likely(!err)) break; - nilfs_warning(sb, __func__, - "segment construction failed. (err=%d)", err); + nilfs_msg(sb, KERN_WARNING, "error %d cleaning segments", err); set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(sci->sc_interval); } @@ -2467,9 +2467,9 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv, int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs, sci->sc_nfreesegs); if (ret) { - printk(KERN_WARNING - "NILFS warning: error %d on discard request, " - "turning discards off for the device\n", ret); + nilfs_msg(sb, KERN_WARNING, + "error %d on discard request, turning discards off for the device", + ret); nilfs_clear_opt(nilfs, DISCARD); } } @@ -2551,10 +2551,9 @@ static int nilfs_segctor_thread(void *arg) /* start sync. */ sci->sc_task = current; wake_up(&sci->sc_wait_task); /* for nilfs_segctor_start_thread() */ - printk(KERN_INFO - "segctord starting. Construction interval = %lu seconds, " - "CP frequency < %lu seconds\n", - sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ); + nilfs_msg(sci->sc_super, KERN_INFO, + "segctord starting. Construction interval = %lu seconds, CP frequency < %lu seconds", + sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ); spin_lock(&sci->sc_state_lock); loop: @@ -2628,8 +2627,8 @@ static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci) if (IS_ERR(t)) { int err = PTR_ERR(t); - printk(KERN_ERR "NILFS: error %d creating segctord thread\n", - err); + nilfs_msg(sci->sc_super, KERN_ERR, + "error %d creating segctord thread", err); return err; } wait_event(sci->sc_wait_task, sci->sc_task != NULL); @@ -2739,14 +2738,14 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) nilfs_segctor_write_out(sci); if (!list_empty(&sci->sc_dirty_files)) { - nilfs_warning(sci->sc_super, __func__, - "dirty file(s) after the final construction"); + nilfs_msg(sci->sc_super, KERN_WARNING, + "disposed unprocessed dirty file(s) when stopping log writer"); nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1); } if (!list_empty(&sci->sc_iput_queue)) { - nilfs_warning(sci->sc_super, __func__, - "iput queue is not empty"); + nilfs_msg(sci->sc_super, KERN_WARNING, + "disposed unprocessed inode(s) in iput queue when stopping log writer"); nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1); } @@ -2822,8 +2821,8 @@ void nilfs_detach_log_writer(struct super_block *sb) spin_lock(&nilfs->ns_inode_lock); if (!list_empty(&nilfs->ns_dirty_files)) { list_splice_init(&nilfs->ns_dirty_files, &garbage_list); - nilfs_warning(sb, __func__, - "Hit dirty file after stopped log writer"); + nilfs_msg(sb, KERN_WARNING, + "disposed unprocessed dirty file(s) when detaching log writer"); } spin_unlock(&nilfs->ns_inode_lock); up_write(&nilfs->ns_segctor_sem); diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 6565c10b7b76..1060949d7dd2 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -23,7 +23,6 @@ #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/workqueue.h> -#include <linux/nilfs2_fs.h> #include "nilfs.h" struct nilfs_root; diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 1963595a1580..1541a1e9221a 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -22,7 +22,6 @@ #include <linux/string.h> #include <linux/buffer_head.h> #include <linux/errno.h> -#include <linux/nilfs2_fs.h> #include "mdt.h" #include "sufile.h" @@ -181,9 +180,9 @@ int nilfs_sufile_updatev(struct inode *sufile, __u64 *segnumv, size_t nsegs, down_write(&NILFS_MDT(sufile)->mi_sem); for (seg = segnumv; seg < segnumv + nsegs; seg++) { if (unlikely(*seg >= nilfs_sufile_get_nsegments(sufile))) { - printk(KERN_WARNING - "%s: invalid segment number: %llu\n", __func__, - (unsigned long long)*seg); + nilfs_msg(sufile->i_sb, KERN_WARNING, + "%s: invalid segment number: %llu", + __func__, (unsigned long long)*seg); nerr++; } } @@ -240,8 +239,9 @@ int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create, int ret; if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) { - printk(KERN_WARNING "%s: invalid segment number: %llu\n", - __func__, (unsigned long long)segnum); + nilfs_msg(sufile->i_sb, KERN_WARNING, + "%s: invalid segment number: %llu", + __func__, (unsigned long long)segnum); return -EINVAL; } down_write(&NILFS_MDT(sufile)->mi_sem); @@ -419,8 +419,9 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum, kaddr = kmap_atomic(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); if (unlikely(!nilfs_segment_usage_clean(su))) { - printk(KERN_WARNING "%s: segment %llu must be clean\n", - __func__, (unsigned long long)segnum); + nilfs_msg(sufile->i_sb, KERN_WARNING, + "%s: segment %llu must be clean", __func__, + (unsigned long long)segnum); kunmap_atomic(kaddr); return; } @@ -444,7 +445,7 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum, kaddr = kmap_atomic(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); - if (su->su_flags == cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY) && + if (su->su_flags == cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY)) && su->su_nblocks == cpu_to_le32(0)) { kunmap_atomic(kaddr); return; @@ -455,7 +456,7 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum, /* make the segment garbage */ su->su_lastmod = cpu_to_le64(0); su->su_nblocks = cpu_to_le32(0); - su->su_flags = cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY); + su->su_flags = cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY)); kunmap_atomic(kaddr); nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1); @@ -476,8 +477,9 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, kaddr = kmap_atomic(su_bh->b_page); su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); if (nilfs_segment_usage_clean(su)) { - printk(KERN_WARNING "%s: segment %llu is already clean\n", - __func__, (unsigned long long)segnum); + nilfs_msg(sufile->i_sb, KERN_WARNING, + "%s: segment %llu is already clean", + __func__, (unsigned long long)segnum); kunmap_atomic(kaddr); return; } @@ -692,7 +694,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile, su2 = su; for (j = 0; j < n; j++, su = (void *)su + susz) { if ((le32_to_cpu(su->su_flags) & - ~(1UL << NILFS_SEGMENT_USAGE_ERROR)) || + ~BIT(NILFS_SEGMENT_USAGE_ERROR)) || nilfs_segment_is_active(nilfs, segnum + j)) { ret = -EBUSY; kunmap_atomic(kaddr); @@ -859,10 +861,10 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, si->sui_lastmod = le64_to_cpu(su->su_lastmod); si->sui_nblocks = le32_to_cpu(su->su_nblocks); si->sui_flags = le32_to_cpu(su->su_flags) & - ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE); + ~BIT(NILFS_SEGMENT_USAGE_ACTIVE); if (nilfs_segment_is_active(nilfs, segnum + j)) si->sui_flags |= - (1UL << NILFS_SEGMENT_USAGE_ACTIVE); + BIT(NILFS_SEGMENT_USAGE_ACTIVE); } kunmap_atomic(kaddr); brelse(su_bh); @@ -950,7 +952,7 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, * disk. */ sup->sup_sui.sui_flags &= - ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE); + ~BIT(NILFS_SEGMENT_USAGE_ACTIVE); cleansi = nilfs_suinfo_clean(&sup->sup_sui); cleansu = nilfs_segment_usage_clean(su); @@ -1175,14 +1177,12 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize, int err; if (susize > sb->s_blocksize) { - printk(KERN_ERR - "NILFS: too large segment usage size: %zu bytes.\n", - susize); + nilfs_msg(sb, KERN_ERR, + "too large segment usage size: %zu bytes", susize); return -EINVAL; } else if (susize < NILFS_MIN_SEGMENT_USAGE_SIZE) { - printk(KERN_ERR - "NILFS: too small segment usage size: %zu bytes.\n", - susize); + nilfs_msg(sb, KERN_ERR, + "too small segment usage size: %zu bytes", susize); return -EINVAL; } diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index 46e89872294c..158a9190c8ec 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -21,7 +21,6 @@ #include <linux/fs.h> #include <linux/buffer_head.h> -#include <linux/nilfs2_fs.h> #include "mdt.h" diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 666107a18a22..c95d369e90aa 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -71,6 +71,22 @@ struct kmem_cache *nilfs_btree_path_cache; static int nilfs_setup_super(struct super_block *sb, int is_mount); static int nilfs_remount(struct super_block *sb, int *flags, char *data); +void __nilfs_msg(struct super_block *sb, const char *level, const char *fmt, + ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + if (sb) + printk("%sNILFS (%s): %pV\n", level, sb->s_id, &vaf); + else + printk("%sNILFS: %pV\n", level, &vaf); + va_end(args); +} + static void nilfs_set_error(struct super_block *sb) { struct the_nilfs *nilfs = sb->s_fs_info; @@ -91,19 +107,20 @@ static void nilfs_set_error(struct super_block *sb) } /** - * nilfs_error() - report failure condition on a filesystem + * __nilfs_error() - report failure condition on a filesystem + * + * __nilfs_error() sets an ERROR_FS flag on the superblock as well as + * reporting an error message. This function should be called when + * NILFS detects incoherences or defects of meta data on disk. * - * nilfs_error() sets an ERROR_FS flag on the superblock as well as - * reporting an error message. It should be called when NILFS detects - * incoherences or defects of meta data on disk. As for sustainable - * errors such as a single-shot I/O error, nilfs_warning() or the printk() - * function should be used instead. + * This implements the body of nilfs_error() macro. Normally, + * nilfs_error() should be used. As for sustainable errors such as a + * single-shot I/O error, nilfs_msg() should be used instead. * - * The segment constructor must not call this function because it can - * kill itself. + * Callers should not add a trailing newline since this will do it. */ -void nilfs_error(struct super_block *sb, const char *function, - const char *fmt, ...) +void __nilfs_error(struct super_block *sb, const char *function, + const char *fmt, ...) { struct the_nilfs *nilfs = sb->s_fs_info; struct va_format vaf; @@ -133,24 +150,6 @@ void nilfs_error(struct super_block *sb, const char *function, sb->s_id); } -void nilfs_warning(struct super_block *sb, const char *function, - const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - printk(KERN_WARNING "NILFS warning (device %s): %s: %pV\n", - sb->s_id, function, &vaf); - - va_end(args); -} - - struct inode *nilfs_alloc_inode(struct super_block *sb) { struct nilfs_inode_info *ii; @@ -196,8 +195,8 @@ static int nilfs_sync_super(struct super_block *sb, int flag) } if (unlikely(err)) { - printk(KERN_ERR - "NILFS: unable to write superblock (err=%d)\n", err); + nilfs_msg(sb, KERN_ERR, "unable to write superblock: err=%d", + err); if (err == -EIO && nilfs->ns_sbh[1]) { /* * sbp[0] points to newer log than sbp[1], @@ -267,8 +266,7 @@ struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb, sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) { memcpy(sbp[0], sbp[1], nilfs->ns_sbsize); } else { - printk(KERN_CRIT "NILFS: superblock broke on dev %s\n", - sb->s_id); + nilfs_msg(sb, KERN_CRIT, "superblock broke"); return NULL; } } else if (sbp[1] && @@ -378,9 +376,9 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off) offset = sb2off & (nilfs->ns_blocksize - 1); nsbh = sb_getblk(sb, newblocknr); if (!nsbh) { - printk(KERN_WARNING - "NILFS warning: unable to move secondary superblock " - "to block %llu\n", (unsigned long long)newblocknr); + nilfs_msg(sb, KERN_WARNING, + "unable to move secondary superblock to block %llu", + (unsigned long long)newblocknr); ret = -EIO; goto out; } @@ -543,10 +541,9 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt, up_read(&nilfs->ns_segctor_sem); if (unlikely(err)) { if (err == -ENOENT || err == -EINVAL) { - printk(KERN_ERR - "NILFS: Invalid checkpoint " - "(checkpoint number=%llu)\n", - (unsigned long long)cno); + nilfs_msg(sb, KERN_ERR, + "Invalid checkpoint (checkpoint number=%llu)", + (unsigned long long)cno); err = -EINVAL; } goto failed; @@ -642,9 +639,8 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) err = nilfs_ifile_count_free_inodes(root->ifile, &nmaxinodes, &nfreeinodes); if (unlikely(err)) { - printk(KERN_WARNING - "NILFS warning: fail to count free inodes: err %d.\n", - err); + nilfs_msg(sb, KERN_WARNING, + "failed to count free inodes: err=%d", err); if (err == -ERANGE) { /* * If nilfs_palloc_count_max_entries() returns @@ -776,9 +772,9 @@ static int parse_options(char *options, struct super_block *sb, int is_remount) break; case Opt_snapshot: if (is_remount) { - printk(KERN_ERR - "NILFS: \"%s\" option is invalid " - "for remount.\n", p); + nilfs_msg(sb, KERN_ERR, + "\"%s\" option is invalid for remount", + p); return 0; } break; @@ -792,8 +788,8 @@ static int parse_options(char *options, struct super_block *sb, int is_remount) nilfs_clear_opt(nilfs, DISCARD); break; default: - printk(KERN_ERR - "NILFS: Unrecognized mount option \"%s\"\n", p); + nilfs_msg(sb, KERN_ERR, + "unrecognized mount option \"%s\"", p); return 0; } } @@ -829,12 +825,10 @@ static int nilfs_setup_super(struct super_block *sb, int is_mount) mnt_count = le16_to_cpu(sbp[0]->s_mnt_count); if (nilfs->ns_mount_state & NILFS_ERROR_FS) { - printk(KERN_WARNING - "NILFS warning: mounting fs with errors\n"); + nilfs_msg(sb, KERN_WARNING, "mounting fs with errors"); #if 0 } else if (max_mnt_count >= 0 && mnt_count >= max_mnt_count) { - printk(KERN_WARNING - "NILFS warning: maximal mount count reached\n"); + nilfs_msg(sb, KERN_WARNING, "maximal mount count reached"); #endif } if (!max_mnt_count) @@ -897,17 +891,17 @@ int nilfs_check_feature_compatibility(struct super_block *sb, features = le64_to_cpu(sbp->s_feature_incompat) & ~NILFS_FEATURE_INCOMPAT_SUPP; if (features) { - printk(KERN_ERR "NILFS: couldn't mount because of unsupported " - "optional features (%llx)\n", - (unsigned long long)features); + nilfs_msg(sb, KERN_ERR, + "couldn't mount because of unsupported optional features (%llx)", + (unsigned long long)features); return -EINVAL; } features = le64_to_cpu(sbp->s_feature_compat_ro) & ~NILFS_FEATURE_COMPAT_RO_SUPP; if (!(sb->s_flags & MS_RDONLY) && features) { - printk(KERN_ERR "NILFS: couldn't mount RDWR because of " - "unsupported optional features (%llx)\n", - (unsigned long long)features); + nilfs_msg(sb, KERN_ERR, + "couldn't mount RDWR because of unsupported optional features (%llx)", + (unsigned long long)features); return -EINVAL; } return 0; @@ -923,13 +917,13 @@ static int nilfs_get_root_dentry(struct super_block *sb, inode = nilfs_iget(sb, root, NILFS_ROOT_INO); if (IS_ERR(inode)) { - printk(KERN_ERR "NILFS: get root inode failed\n"); ret = PTR_ERR(inode); + nilfs_msg(sb, KERN_ERR, "error %d getting root inode", ret); goto out; } if (!S_ISDIR(inode->i_mode) || !inode->i_blocks || !inode->i_size) { iput(inode); - printk(KERN_ERR "NILFS: corrupt root inode.\n"); + nilfs_msg(sb, KERN_ERR, "corrupt root inode"); ret = -EINVAL; goto out; } @@ -957,7 +951,7 @@ static int nilfs_get_root_dentry(struct super_block *sb, return ret; failed_dentry: - printk(KERN_ERR "NILFS: get root dentry failed\n"); + nilfs_msg(sb, KERN_ERR, "error %d getting root dentry", ret); goto out; } @@ -977,18 +971,18 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno, ret = (ret == -ENOENT) ? -EINVAL : ret; goto out; } else if (!ret) { - printk(KERN_ERR "NILFS: The specified checkpoint is " - "not a snapshot (checkpoint number=%llu).\n", - (unsigned long long)cno); + nilfs_msg(s, KERN_ERR, + "The specified checkpoint is not a snapshot (checkpoint number=%llu)", + (unsigned long long)cno); ret = -EINVAL; goto out; } ret = nilfs_attach_checkpoint(s, cno, false, &root); if (ret) { - printk(KERN_ERR "NILFS: error loading snapshot " - "(checkpoint number=%llu).\n", - (unsigned long long)cno); + nilfs_msg(s, KERN_ERR, + "error %d while loading snapshot (checkpoint number=%llu)", + ret, (unsigned long long)cno); goto out; } ret = nilfs_get_root_dentry(s, root, root_dentry); @@ -1058,7 +1052,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent) __u64 cno; int err; - nilfs = alloc_nilfs(sb->s_bdev); + nilfs = alloc_nilfs(sb); if (!nilfs) return -ENOMEM; @@ -1083,8 +1077,9 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent) cno = nilfs_last_cno(nilfs); err = nilfs_attach_checkpoint(sb, cno, true, &fsroot); if (err) { - printk(KERN_ERR "NILFS: error loading last checkpoint " - "(checkpoint number=%llu).\n", (unsigned long long)cno); + nilfs_msg(sb, KERN_ERR, + "error %d while loading last checkpoint (checkpoint number=%llu)", + err, (unsigned long long)cno); goto failed_unload; } @@ -1144,9 +1139,8 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) err = -EINVAL; if (!nilfs_valid_fs(nilfs)) { - printk(KERN_WARNING "NILFS (device %s): couldn't " - "remount because the filesystem is in an " - "incomplete recovery state.\n", sb->s_id); + nilfs_msg(sb, KERN_WARNING, + "couldn't remount because the filesystem is in an incomplete recovery state"); goto restore_opts; } @@ -1178,10 +1172,9 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) ~NILFS_FEATURE_COMPAT_RO_SUPP; up_read(&nilfs->ns_sem); if (features) { - printk(KERN_WARNING "NILFS (device %s): couldn't " - "remount RDWR because of unsupported optional " - "features (%llx)\n", - sb->s_id, (unsigned long long)features); + nilfs_msg(sb, KERN_WARNING, + "couldn't remount RDWR because of unsupported optional features (%llx)", + (unsigned long long)features); err = -EROFS; goto restore_opts; } @@ -1212,6 +1205,38 @@ struct nilfs_super_data { int flags; }; +static int nilfs_parse_snapshot_option(const char *option, + const substring_t *arg, + struct nilfs_super_data *sd) +{ + unsigned long long val; + const char *msg = NULL; + int err; + + if (!(sd->flags & MS_RDONLY)) { + msg = "read-only option is not specified"; + goto parse_error; + } + + err = kstrtoull(arg->from, 0, &val); + if (err) { + if (err == -ERANGE) + msg = "too large checkpoint number"; + else + msg = "malformed argument"; + goto parse_error; + } else if (val == 0) { + msg = "invalid checkpoint number 0"; + goto parse_error; + } + sd->cno = val; + return 0; + +parse_error: + nilfs_msg(NULL, KERN_ERR, "invalid option \"%s\": %s", option, msg); + return 1; +} + /** * nilfs_identify - pre-read mount options needed to identify mount instance * @data: mount options @@ -1228,24 +1253,9 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd) p = strsep(&options, ","); if (p != NULL && *p) { token = match_token(p, tokens, args); - if (token == Opt_snapshot) { - if (!(sd->flags & MS_RDONLY)) { - ret++; - } else { - sd->cno = simple_strtoull(args[0].from, - NULL, 0); - /* - * No need to see the end pointer; - * match_token() has done syntax - * checking. - */ - if (sd->cno == 0) - ret++; - } - } - if (ret) - printk(KERN_ERR - "NILFS: invalid mount option: %s\n", p); + if (token == Opt_snapshot) + ret = nilfs_parse_snapshot_option(p, &args[0], + sd); } if (!options) break; @@ -1326,10 +1336,10 @@ nilfs_mount(struct file_system_type *fs_type, int flags, } else if (!sd.cno) { if (nilfs_tree_is_busy(s->s_root)) { if ((flags ^ s->s_flags) & MS_RDONLY) { - printk(KERN_ERR "NILFS: the device already " - "has a %s mount.\n", - (s->s_flags & MS_RDONLY) ? - "read-only" : "read/write"); + nilfs_msg(s, KERN_ERR, + "the device already has a %s mount.", + (s->s_flags & MS_RDONLY) ? + "read-only" : "read/write"); err = -EBUSY; goto failed_super; } diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index 8ffa42b704d8..490303e3d517 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -272,8 +272,8 @@ nilfs_checkpoints_checkpoints_number_show(struct nilfs_checkpoints_attr *attr, err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat); up_read(&nilfs->ns_segctor_sem); if (err < 0) { - printk(KERN_ERR "NILFS: unable to get checkpoint stat: err=%d\n", - err); + nilfs_msg(nilfs->ns_sb, KERN_ERR, + "unable to get checkpoint stat: err=%d", err); return err; } @@ -295,8 +295,8 @@ nilfs_checkpoints_snapshots_number_show(struct nilfs_checkpoints_attr *attr, err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat); up_read(&nilfs->ns_segctor_sem); if (err < 0) { - printk(KERN_ERR "NILFS: unable to get checkpoint stat: err=%d\n", - err); + nilfs_msg(nilfs->ns_sb, KERN_ERR, + "unable to get checkpoint stat: err=%d", err); return err; } @@ -326,9 +326,9 @@ nilfs_checkpoints_next_checkpoint_show(struct nilfs_checkpoints_attr *attr, { __u64 cno; - down_read(&nilfs->ns_sem); + down_read(&nilfs->ns_segctor_sem); cno = nilfs->ns_cno; - up_read(&nilfs->ns_sem); + up_read(&nilfs->ns_segctor_sem); return snprintf(buf, PAGE_SIZE, "%llu\n", cno); } @@ -414,8 +414,8 @@ nilfs_segments_dirty_segments_show(struct nilfs_segments_attr *attr, err = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat); up_read(&nilfs->ns_segctor_sem); if (err < 0) { - printk(KERN_ERR "NILFS: unable to get segment stat: err=%d\n", - err); + nilfs_msg(nilfs->ns_sb, KERN_ERR, + "unable to get segment stat: err=%d", err); return err; } @@ -511,9 +511,9 @@ nilfs_segctor_current_seg_sequence_show(struct nilfs_segctor_attr *attr, { u64 seg_seq; - down_read(&nilfs->ns_sem); + down_read(&nilfs->ns_segctor_sem); seg_seq = nilfs->ns_seg_seq; - up_read(&nilfs->ns_sem); + up_read(&nilfs->ns_segctor_sem); return snprintf(buf, PAGE_SIZE, "%llu\n", seg_seq); } @@ -525,9 +525,9 @@ nilfs_segctor_current_last_full_seg_show(struct nilfs_segctor_attr *attr, { __u64 segnum; - down_read(&nilfs->ns_sem); + down_read(&nilfs->ns_segctor_sem); segnum = nilfs->ns_segnum; - up_read(&nilfs->ns_sem); + up_read(&nilfs->ns_segctor_sem); return snprintf(buf, PAGE_SIZE, "%llu\n", segnum); } @@ -539,9 +539,9 @@ nilfs_segctor_next_full_seg_show(struct nilfs_segctor_attr *attr, { __u64 nextnum; - down_read(&nilfs->ns_sem); + down_read(&nilfs->ns_segctor_sem); nextnum = nilfs->ns_nextnum; - up_read(&nilfs->ns_sem); + up_read(&nilfs->ns_segctor_sem); return snprintf(buf, PAGE_SIZE, "%llu\n", nextnum); } @@ -553,9 +553,9 @@ nilfs_segctor_next_pseg_offset_show(struct nilfs_segctor_attr *attr, { unsigned long pseg_offset; - down_read(&nilfs->ns_sem); + down_read(&nilfs->ns_segctor_sem); pseg_offset = nilfs->ns_pseg_offset; - up_read(&nilfs->ns_sem); + up_read(&nilfs->ns_segctor_sem); return snprintf(buf, PAGE_SIZE, "%lu\n", pseg_offset); } @@ -567,9 +567,9 @@ nilfs_segctor_next_checkpoint_show(struct nilfs_segctor_attr *attr, { __u64 cno; - down_read(&nilfs->ns_sem); + down_read(&nilfs->ns_segctor_sem); cno = nilfs->ns_cno; - up_read(&nilfs->ns_sem); + up_read(&nilfs->ns_segctor_sem); return snprintf(buf, PAGE_SIZE, "%llu\n", cno); } @@ -581,9 +581,9 @@ nilfs_segctor_last_seg_write_time_show(struct nilfs_segctor_attr *attr, { time_t ctime; - down_read(&nilfs->ns_sem); + down_read(&nilfs->ns_segctor_sem); ctime = nilfs->ns_ctime; - up_read(&nilfs->ns_sem); + up_read(&nilfs->ns_segctor_sem); return NILFS_SHOW_TIME(ctime, buf); } @@ -595,9 +595,9 @@ nilfs_segctor_last_seg_write_time_secs_show(struct nilfs_segctor_attr *attr, { time_t ctime; - down_read(&nilfs->ns_sem); + down_read(&nilfs->ns_segctor_sem); ctime = nilfs->ns_ctime; - up_read(&nilfs->ns_sem); + up_read(&nilfs->ns_segctor_sem); return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)ctime); } @@ -609,9 +609,9 @@ nilfs_segctor_last_nongc_write_time_show(struct nilfs_segctor_attr *attr, { time_t nongc_ctime; - down_read(&nilfs->ns_sem); + down_read(&nilfs->ns_segctor_sem); nongc_ctime = nilfs->ns_nongc_ctime; - up_read(&nilfs->ns_sem); + up_read(&nilfs->ns_segctor_sem); return NILFS_SHOW_TIME(nongc_ctime, buf); } @@ -623,9 +623,9 @@ nilfs_segctor_last_nongc_write_time_secs_show(struct nilfs_segctor_attr *attr, { time_t nongc_ctime; - down_read(&nilfs->ns_sem); + down_read(&nilfs->ns_segctor_sem); nongc_ctime = nilfs->ns_nongc_ctime; - up_read(&nilfs->ns_sem); + up_read(&nilfs->ns_segctor_sem); return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)nongc_ctime); @@ -638,9 +638,9 @@ nilfs_segctor_dirty_data_blocks_count_show(struct nilfs_segctor_attr *attr, { u32 ndirtyblks; - down_read(&nilfs->ns_sem); + down_read(&nilfs->ns_segctor_sem); ndirtyblks = atomic_read(&nilfs->ns_ndirtyblks); - up_read(&nilfs->ns_sem); + up_read(&nilfs->ns_segctor_sem); return snprintf(buf, PAGE_SIZE, "%u\n", ndirtyblks); } @@ -789,14 +789,15 @@ nilfs_superblock_sb_update_frequency_store(struct nilfs_superblock_attr *attr, err = kstrtouint(skip_spaces(buf), 0, &val); if (err) { - printk(KERN_ERR "NILFS: unable to convert string: err=%d\n", - err); + nilfs_msg(nilfs->ns_sb, KERN_ERR, + "unable to convert string: err=%d", err); return err; } if (val < NILFS_SB_FREQ) { val = NILFS_SB_FREQ; - printk(KERN_WARNING "NILFS: superblock update frequency cannot be lesser than 10 seconds\n"); + nilfs_msg(nilfs->ns_sb, KERN_WARNING, + "superblock update frequency cannot be lesser than 10 seconds"); } down_write(&nilfs->ns_sem); @@ -999,7 +1000,8 @@ int nilfs_sysfs_create_device_group(struct super_block *sb) nilfs->ns_dev_subgroups = kzalloc(devgrp_size, GFP_KERNEL); if (unlikely(!nilfs->ns_dev_subgroups)) { err = -ENOMEM; - printk(KERN_ERR "NILFS: unable to allocate memory for device group\n"); + nilfs_msg(sb, KERN_ERR, + "unable to allocate memory for device group"); goto failed_create_device_group; } @@ -1109,15 +1111,15 @@ int __init nilfs_sysfs_init(void) nilfs_kset = kset_create_and_add(NILFS_ROOT_GROUP_NAME, NULL, fs_kobj); if (!nilfs_kset) { err = -ENOMEM; - printk(KERN_ERR "NILFS: unable to create sysfs entry: err %d\n", - err); + nilfs_msg(NULL, KERN_ERR, + "unable to create sysfs entry: err=%d", err); goto failed_sysfs_init; } err = sysfs_create_group(&nilfs_kset->kobj, &nilfs_feature_attr_group); if (unlikely(err)) { - printk(KERN_ERR "NILFS: unable to create feature group: err %d\n", - err); + nilfs_msg(NULL, KERN_ERR, + "unable to create feature group: err=%d", err); goto cleanup_sysfs_init; } diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index e9fd241b9a0a..2dd75bf619ad 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -56,12 +56,12 @@ void nilfs_set_last_segment(struct the_nilfs *nilfs, /** * alloc_nilfs - allocate a nilfs object - * @bdev: block device to which the_nilfs is related + * @sb: super block instance * * Return Value: On success, pointer to the_nilfs is returned. * On error, NULL is returned. */ -struct the_nilfs *alloc_nilfs(struct block_device *bdev) +struct the_nilfs *alloc_nilfs(struct super_block *sb) { struct the_nilfs *nilfs; @@ -69,7 +69,8 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev) if (!nilfs) return NULL; - nilfs->ns_bdev = bdev; + nilfs->ns_sb = sb; + nilfs->ns_bdev = sb->s_bdev; atomic_set(&nilfs->ns_ndirtyblks, 0); init_rwsem(&nilfs->ns_sem); mutex_init(&nilfs->ns_snapshot_mount_mutex); @@ -191,7 +192,10 @@ static int nilfs_store_log_cursor(struct the_nilfs *nilfs, nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg); nilfs->ns_cno = nilfs->ns_last_cno + 1; if (nilfs->ns_segnum >= nilfs->ns_nsegments) { - printk(KERN_ERR "NILFS invalid last segment number.\n"); + nilfs_msg(nilfs->ns_sb, KERN_ERR, + "pointed segment number is out of range: segnum=%llu, nsegments=%lu", + (unsigned long long)nilfs->ns_segnum, + nilfs->ns_nsegments); ret = -EINVAL; } return ret; @@ -215,12 +219,12 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) int err; if (!valid_fs) { - printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n"); + nilfs_msg(sb, KERN_WARNING, "mounting unchecked fs"); if (s_flags & MS_RDONLY) { - printk(KERN_INFO "NILFS: INFO: recovery " - "required for readonly filesystem.\n"); - printk(KERN_INFO "NILFS: write access will " - "be enabled during recovery.\n"); + nilfs_msg(sb, KERN_INFO, + "recovery required for readonly filesystem"); + nilfs_msg(sb, KERN_INFO, + "write access will be enabled during recovery"); } } @@ -235,13 +239,12 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) goto scan_error; if (!nilfs_valid_sb(sbp[1])) { - printk(KERN_WARNING - "NILFS warning: unable to fall back to spare" - "super block\n"); + nilfs_msg(sb, KERN_WARNING, + "unable to fall back to spare super block"); goto scan_error; } - printk(KERN_INFO - "NILFS: try rollback from an earlier position\n"); + nilfs_msg(sb, KERN_INFO, + "trying rollback from an earlier position"); /* * restore super block with its spare and reconfigure @@ -254,10 +257,9 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) /* verify consistency between two super blocks */ blocksize = BLOCK_SIZE << le32_to_cpu(sbp[0]->s_log_block_size); if (blocksize != nilfs->ns_blocksize) { - printk(KERN_WARNING - "NILFS warning: blocksize differs between " - "two super blocks (%d != %d)\n", - blocksize, nilfs->ns_blocksize); + nilfs_msg(sb, KERN_WARNING, + "blocksize differs between two super blocks (%d != %d)", + blocksize, nilfs->ns_blocksize); goto scan_error; } @@ -276,7 +278,8 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) err = nilfs_load_super_root(nilfs, sb, ri.ri_super_root); if (unlikely(err)) { - printk(KERN_ERR "NILFS: error loading super root.\n"); + nilfs_msg(sb, KERN_ERR, "error %d while loading super root", + err); goto failed; } @@ -287,30 +290,29 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) __u64 features; if (nilfs_test_opt(nilfs, NORECOVERY)) { - printk(KERN_INFO "NILFS: norecovery option specified. " - "skipping roll-forward recovery\n"); + nilfs_msg(sb, KERN_INFO, + "norecovery option specified, skipping roll-forward recovery"); goto skip_recovery; } features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) & ~NILFS_FEATURE_COMPAT_RO_SUPP; if (features) { - printk(KERN_ERR "NILFS: couldn't proceed with " - "recovery because of unsupported optional " - "features (%llx)\n", - (unsigned long long)features); + nilfs_msg(sb, KERN_ERR, + "couldn't proceed with recovery because of unsupported optional features (%llx)", + (unsigned long long)features); err = -EROFS; goto failed_unload; } if (really_read_only) { - printk(KERN_ERR "NILFS: write access " - "unavailable, cannot proceed.\n"); + nilfs_msg(sb, KERN_ERR, + "write access unavailable, cannot proceed"); err = -EROFS; goto failed_unload; } sb->s_flags &= ~MS_RDONLY; } else if (nilfs_test_opt(nilfs, NORECOVERY)) { - printk(KERN_ERR "NILFS: recovery cancelled because norecovery " - "option was specified for a read/write mount\n"); + nilfs_msg(sb, KERN_ERR, + "recovery cancelled because norecovery option was specified for a read/write mount"); err = -EINVAL; goto failed_unload; } @@ -325,11 +327,12 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) up_write(&nilfs->ns_sem); if (err) { - printk(KERN_ERR "NILFS: failed to update super block. " - "recovery unfinished.\n"); + nilfs_msg(sb, KERN_ERR, + "error %d updating super block. recovery unfinished.", + err); goto failed_unload; } - printk(KERN_INFO "NILFS: recovery complete.\n"); + nilfs_msg(sb, KERN_INFO, "recovery complete"); skip_recovery: nilfs_clear_recovery_info(&ri); @@ -337,7 +340,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) return 0; scan_error: - printk(KERN_ERR "NILFS: error searching super root.\n"); + nilfs_msg(sb, KERN_ERR, "error %d while searching super root", err); goto failed; failed_unload: @@ -384,12 +387,11 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs, struct nilfs_super_block *sbp) { if (le32_to_cpu(sbp->s_rev_level) < NILFS_MIN_SUPP_REV) { - printk(KERN_ERR "NILFS: unsupported revision " - "(superblock rev.=%d.%d, current rev.=%d.%d). " - "Please check the version of mkfs.nilfs.\n", - le32_to_cpu(sbp->s_rev_level), - le16_to_cpu(sbp->s_minor_rev_level), - NILFS_CURRENT_REV, NILFS_MINOR_REV); + nilfs_msg(nilfs->ns_sb, KERN_ERR, + "unsupported revision (superblock rev.=%d.%d, current rev.=%d.%d). Please check the version of mkfs.nilfs(2).", + le32_to_cpu(sbp->s_rev_level), + le16_to_cpu(sbp->s_minor_rev_level), + NILFS_CURRENT_REV, NILFS_MINOR_REV); return -EINVAL; } nilfs->ns_sbsize = le16_to_cpu(sbp->s_bytes); @@ -398,12 +400,14 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs, nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size); if (nilfs->ns_inode_size > nilfs->ns_blocksize) { - printk(KERN_ERR "NILFS: too large inode size: %d bytes.\n", - nilfs->ns_inode_size); + nilfs_msg(nilfs->ns_sb, KERN_ERR, + "too large inode size: %d bytes", + nilfs->ns_inode_size); return -EINVAL; } else if (nilfs->ns_inode_size < NILFS_MIN_INODE_SIZE) { - printk(KERN_ERR "NILFS: too small inode size: %d bytes.\n", - nilfs->ns_inode_size); + nilfs_msg(nilfs->ns_sb, KERN_ERR, + "too small inode size: %d bytes", + nilfs->ns_inode_size); return -EINVAL; } @@ -411,7 +415,9 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs, nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) { - printk(KERN_ERR "NILFS: too short segment.\n"); + nilfs_msg(nilfs->ns_sb, KERN_ERR, + "too short segment: %lu blocks", + nilfs->ns_blocks_per_segment); return -EINVAL; } @@ -420,7 +426,9 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs, le32_to_cpu(sbp->s_r_segments_percentage); if (nilfs->ns_r_segments_percentage < 1 || nilfs->ns_r_segments_percentage > 99) { - printk(KERN_ERR "NILFS: invalid reserved segments percentage.\n"); + nilfs_msg(nilfs->ns_sb, KERN_ERR, + "invalid reserved segments percentage: %lu", + nilfs->ns_r_segments_percentage); return -EINVAL; } @@ -504,16 +512,16 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, if (!sbp[0]) { if (!sbp[1]) { - printk(KERN_ERR "NILFS: unable to read superblock\n"); + nilfs_msg(sb, KERN_ERR, "unable to read superblock"); return -EIO; } - printk(KERN_WARNING - "NILFS warning: unable to read primary superblock " - "(blocksize = %d)\n", blocksize); + nilfs_msg(sb, KERN_WARNING, + "unable to read primary superblock (blocksize = %d)", + blocksize); } else if (!sbp[1]) { - printk(KERN_WARNING - "NILFS warning: unable to read secondary superblock " - "(blocksize = %d)\n", blocksize); + nilfs_msg(sb, KERN_WARNING, + "unable to read secondary superblock (blocksize = %d)", + blocksize); } /* @@ -535,14 +543,14 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, } if (!valid[swp]) { nilfs_release_super_block(nilfs); - printk(KERN_ERR "NILFS: Can't find nilfs on dev %s.\n", - sb->s_id); + nilfs_msg(sb, KERN_ERR, "couldn't find nilfs on the device"); return -EINVAL; } if (!valid[!swp]) - printk(KERN_WARNING "NILFS warning: broken superblock. " - "using spare superblock (blocksize = %d).\n", blocksize); + nilfs_msg(sb, KERN_WARNING, + "broken superblock, retrying with spare superblock (blocksize = %d)", + blocksize); if (swp) nilfs_swap_super_block(nilfs); @@ -576,7 +584,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE); if (!blocksize) { - printk(KERN_ERR "NILFS: unable to set blocksize\n"); + nilfs_msg(sb, KERN_ERR, "unable to set blocksize"); err = -EINVAL; goto out; } @@ -595,8 +603,9 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); if (blocksize < NILFS_MIN_BLOCK_SIZE || blocksize > NILFS_MAX_BLOCK_SIZE) { - printk(KERN_ERR "NILFS: couldn't mount because of unsupported " - "filesystem blocksize %d\n", blocksize); + nilfs_msg(sb, KERN_ERR, + "couldn't mount because of unsupported filesystem blocksize %d", + blocksize); err = -EINVAL; goto failed_sbh; } @@ -604,10 +613,9 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) int hw_blocksize = bdev_logical_block_size(sb->s_bdev); if (blocksize < hw_blocksize) { - printk(KERN_ERR - "NILFS: blocksize %d too small for device " - "(sector-size = %d).\n", - blocksize, hw_blocksize); + nilfs_msg(sb, KERN_ERR, + "blocksize %d too small for device (sector-size = %d)", + blocksize, hw_blocksize); err = -EINVAL; goto failed_sbh; } diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 79369fd6b13b..b305c6f033e7 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -43,6 +43,7 @@ enum { * struct the_nilfs - struct to supervise multiple nilfs mount points * @ns_flags: flags * @ns_flushed_device: flag indicating if all volatile data was flushed + * @ns_sb: back pointer to super block instance * @ns_bdev: block device * @ns_sem: semaphore for shared states * @ns_snapshot_mount_mutex: mutex to protect snapshot mounts @@ -102,6 +103,7 @@ struct the_nilfs { unsigned long ns_flags; int ns_flushed_device; + struct super_block *ns_sb; struct block_device *ns_bdev; struct rw_semaphore ns_sem; struct mutex ns_snapshot_mount_mutex; @@ -120,11 +122,8 @@ struct the_nilfs { unsigned int ns_sb_update_freq; /* - * Following fields are dedicated to a writable FS-instance. - * Except for the period seeking checkpoint, code outside the segment - * constructor must lock a segment semaphore while accessing these - * fields. - * The writable FS-instance is sole during a lifetime of the_nilfs. + * The following fields are updated by a writable FS-instance. + * These fields are protected by ns_segctor_sem outside load_nilfs(). */ u64 ns_seg_seq; __u64 ns_segnum; @@ -281,7 +280,7 @@ static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs) } void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); -struct the_nilfs *alloc_nilfs(struct block_device *bdev); +struct the_nilfs *alloc_nilfs(struct super_block *sb); void destroy_nilfs(struct the_nilfs *nilfs); int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data); int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb); diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index f40972d6df90..e01287c964a8 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1854,7 +1854,7 @@ int ntfs_read_inode_mount(struct inode *vi) /* Need this to sanity check attribute list references to $MFT. */ vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); - /* Provides readpage() and sync_page() for map_mft_record(). */ + /* Provides readpage() for map_mft_record(). */ vi->i_mapping->a_ops = &ntfs_mst_aops; ctx = ntfs_attr_get_search_ctx(ni, m); diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index 443abecf01b7..358258364616 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -253,7 +253,7 @@ handle_name: err = (signed)nls_name.len; goto err_out; } - nls_name.hash = full_name_hash(nls_name.name, nls_name.len); + nls_name.hash = full_name_hash(dent, nls_name.name, nls_name.len); dent = d_add_ci(dent, dent_inode, &nls_name); kfree(nls_name.name); diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 460c0cedab3a..7dabbc31060e 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6106,6 +6106,43 @@ void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, } } +/* + * Try to flush truncate logs if we can free enough clusters from it. + * As for return value, "< 0" means error, "0" no space and "1" means + * we have freed enough spaces and let the caller try to allocate again. + */ +int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb, + unsigned int needed) +{ + tid_t target; + int ret = 0; + unsigned int truncated_clusters; + + inode_lock(osb->osb_tl_inode); + truncated_clusters = osb->truncated_clusters; + inode_unlock(osb->osb_tl_inode); + + /* + * Check whether we can succeed in allocating if we free + * the truncate log. + */ + if (truncated_clusters < needed) + goto out; + + ret = ocfs2_flush_truncate_log(osb); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) { + jbd2_log_wait_commit(osb->journal->j_journal, target); + ret = 1; + } +out: + return ret; +} + static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, int slot_num, struct inode **tl_inode, diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index f3dc1b0dfffc..4a5152ec88a3 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -188,6 +188,8 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, u64 start_blk, unsigned int num_clusters); int __ocfs2_flush_truncate_log(struct ocfs2_super *osb); +int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb, + unsigned int needed); /* * Process local structure which describes the block unlinks done diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index e97a37179614..98d36548153d 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1645,43 +1645,6 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh, return ret; } -/* - * Try to flush truncate logs if we can free enough clusters from it. - * As for return value, "< 0" means error, "0" no space and "1" means - * we have freed enough spaces and let the caller try to allocate again. - */ -static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb, - unsigned int needed) -{ - tid_t target; - int ret = 0; - unsigned int truncated_clusters; - - inode_lock(osb->osb_tl_inode); - truncated_clusters = osb->truncated_clusters; - inode_unlock(osb->osb_tl_inode); - - /* - * Check whether we can succeed in allocating if we free - * the truncate log. - */ - if (truncated_clusters < needed) - goto out; - - ret = ocfs2_flush_truncate_log(osb); - if (ret) { - mlog_errno(ret); - goto out; - } - - if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) { - jbd2_log_wait_commit(osb->journal->j_journal, target); - ret = 1; - } -out: - return ret; -} - int ocfs2_write_begin_nolock(struct address_space *mapping, loff_t pos, unsigned len, ocfs2_write_type_t type, struct page **pagep, void **fsdata, @@ -2426,7 +2389,7 @@ static int ocfs2_dio_end_io(struct kiocb *iocb, static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file)->i_mapping->host; + struct inode *inode = file->f_mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); get_block_t *get_block; diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 004f2cbe8f71..e9f3705c4c9f 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -47,7 +47,7 @@ #define DLM_HASH_BUCKETS (DLM_HASH_PAGES * DLM_BUCKETS_PER_PAGE) /* Intended to make it easier for us to switch out hash functions */ -#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l) +#define dlm_lockid_hash(_n, _l) full_name_hash(NULL, _n, _l) enum dlm_mle_type { DLM_MLE_BLOCK = 0, @@ -1004,6 +1004,8 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data, int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, u8 nodenum, u8 *real_master); +void __dlm_do_purge_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 13719d3f35f8..6ea06f8a7d29 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2276,9 +2276,12 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n", dlm->name, namelen, lockname, res->owner, r); dlm_print_one_lock_resource(res); - BUG(); - } - return ret ? ret : r; + if (r == -ENOMEM) + BUG(); + } else + ret = r; + + return ret; } int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, @@ -2416,48 +2419,26 @@ int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data, } spin_lock(&res->spinlock); - BUG_ON(!(res->state & DLM_LOCK_RES_DROPPING_REF)); - if (!list_empty(&res->purge)) { - mlog(0, "%s: Removing res %.*s from purgelist\n", - dlm->name, res->lockname.len, res->lockname.name); - list_del_init(&res->purge); - dlm_lockres_put(res); - dlm->purge_count--; - } - - if (!__dlm_lockres_unused(res)) { - mlog(ML_ERROR, "%s: res %.*s in use after deref\n", - dlm->name, res->lockname.len, res->lockname.name); - __dlm_print_one_lock_resource(res); - BUG(); - } - - __dlm_unhash_lockres(dlm, res); - - spin_lock(&dlm->track_lock); - if (!list_empty(&res->tracking)) - list_del_init(&res->tracking); - else { - mlog(ML_ERROR, "%s: Resource %.*s not on the Tracking list\n", - dlm->name, res->lockname.len, res->lockname.name); - __dlm_print_one_lock_resource(res); + if (!(res->state & DLM_LOCK_RES_DROPPING_REF)) { + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + mlog(ML_NOTICE, "%s:%.*s: node %u sends deref done " + "but it is already derefed!\n", dlm->name, + res->lockname.len, res->lockname.name, node); + ret = 0; + goto done; } - spin_unlock(&dlm->track_lock); - /* lockres is not in the hash now. drop the flag and wake up - * any processes waiting in dlm_get_lock_resource. - */ - res->state &= ~DLM_LOCK_RES_DROPPING_REF; + __dlm_do_purge_lockres(dlm, res); spin_unlock(&res->spinlock); wake_up(&res->wq); - dlm_lockres_put(res); - spin_unlock(&dlm->spinlock); ret = 0; - done: + if (res) + dlm_lockres_put(res); dlm_put(dlm); return ret; } diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index f6b313898763..dd5cb8bcefd1 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2343,6 +2343,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) struct dlm_lock_resource *res; int i; struct hlist_head *bucket; + struct hlist_node *tmp; struct dlm_lock *lock; @@ -2365,7 +2366,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) */ for (i = 0; i < DLM_HASH_BUCKETS; i++) { bucket = dlm_lockres_hash(dlm, i); - hlist_for_each_entry(res, bucket, hash_node) { + hlist_for_each_entry_safe(res, tmp, bucket, hash_node) { /* always prune any $RECOVERY entries for dead nodes, * otherwise hangs can occur during later recovery */ if (dlm_is_recovery_lock(res->lockname.name, @@ -2386,8 +2387,17 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) break; } } - dlm_lockres_clear_refmap_bit(dlm, res, - dead_node); + + if ((res->owner == dead_node) && + (res->state & DLM_LOCK_RES_DROPPING_REF)) { + dlm_lockres_get(res); + __dlm_do_purge_lockres(dlm, res); + spin_unlock(&res->spinlock); + wake_up(&res->wq); + dlm_lockres_put(res); + continue; + } else if (res->owner == dlm->node_num) + dlm_lockres_clear_refmap_bit(dlm, res, dead_node); spin_unlock(&res->spinlock); continue; } @@ -2398,14 +2408,17 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) if (res->state & DLM_LOCK_RES_DROPPING_REF) { mlog(0, "%s:%.*s: owned by " "dead node %u, this node was " - "dropping its ref when it died. " - "continue, dropping the flag.\n", + "dropping its ref when master died. " + "continue, purging the lockres.\n", dlm->name, res->lockname.len, res->lockname.name, dead_node); + dlm_lockres_get(res); + __dlm_do_purge_lockres(dlm, res); + spin_unlock(&res->spinlock); + wake_up(&res->wq); + dlm_lockres_put(res); + continue; } - res->state &= ~DLM_LOCK_RES_DROPPING_REF; - dlm_move_lockres_to_recovery_list(dlm, - res); } else if (res->owner == dlm->node_num) { dlm_free_dead_locks(dlm, res, dead_node); __dlm_lockres_calc_usage(dlm, res); diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 68d239ba0c63..838a06d4066a 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -160,6 +160,52 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, spin_unlock(&dlm->spinlock); } +/* + * Do the real purge work: + * unhash the lockres, and + * clear flag DLM_LOCK_RES_DROPPING_REF. + * It requires dlm and lockres spinlock to be taken. + */ +void __dlm_do_purge_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); + + if (!list_empty(&res->purge)) { + mlog(0, "%s: Removing res %.*s from purgelist\n", + dlm->name, res->lockname.len, res->lockname.name); + list_del_init(&res->purge); + dlm_lockres_put(res); + dlm->purge_count--; + } + + if (!__dlm_lockres_unused(res)) { + mlog(ML_ERROR, "%s: res %.*s in use after deref\n", + dlm->name, res->lockname.len, res->lockname.name); + __dlm_print_one_lock_resource(res); + BUG(); + } + + __dlm_unhash_lockres(dlm, res); + + spin_lock(&dlm->track_lock); + if (!list_empty(&res->tracking)) + list_del_init(&res->tracking); + else { + mlog(ML_ERROR, "%s: Resource %.*s not on the Tracking list\n", + dlm->name, res->lockname.len, res->lockname.name); + __dlm_print_one_lock_resource(res); + } + spin_unlock(&dlm->track_lock); + + /* + * lockres is not in the hash now. drop the flag and wake up + * any processes waiting in dlm_get_lock_resource. + */ + res->state &= ~DLM_LOCK_RES_DROPPING_REF; +} + static void dlm_purge_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { @@ -175,6 +221,13 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, res->lockname.len, res->lockname.name, master); if (!master) { + if (res->state & DLM_LOCK_RES_DROPPING_REF) { + mlog(ML_NOTICE, "%s: res %.*s already in DLM_LOCK_RES_DROPPING_REF state\n", + dlm->name, res->lockname.len, res->lockname.name); + spin_unlock(&res->spinlock); + return; + } + res->state |= DLM_LOCK_RES_DROPPING_REF; /* drop spinlock... retake below */ spin_unlock(&res->spinlock); @@ -203,8 +256,8 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, dlm->purge_count--; } - if (!master && ret != 0) { - mlog(0, "%s: deref %.*s in progress or master goes down\n", + if (!master && ret == DLM_DEREF_RESPONSE_INPROG) { + mlog(0, "%s: deref %.*s in progress\n", dlm->name, res->lockname.len, res->lockname.name); spin_unlock(&res->spinlock); return; diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index ab6a6cdcf91c..87e577a49b0d 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -483,7 +483,7 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing) struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; struct ocfs2_global_disk_dqblk dqblk; s64 spacechange, inodechange; - time_t olditime, oldbtime; + time64_t olditime, oldbtime; err = sb->s_op->quota_read(sb, type, (char *)&dqblk, sizeof(struct ocfs2_global_disk_dqblk), diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index ced70c8139f7..c9e828ec3c8e 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -1007,10 +1007,17 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) lc->oc_type = NO_CONTROLD; rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name, - DLM_LSFL_FS, DLM_LVB_LEN, + DLM_LSFL_FS | DLM_LSFL_NEWEXCL, DLM_LVB_LEN, &ocfs2_ls_ops, conn, &ops_rv, &fsdlm); - if (rc) + if (rc) { + if (rc == -EEXIST || rc == -EPROTO) + printk(KERN_ERR "ocfs2: Unable to create the " + "lockspace %s (%d), because a ocfs2-tools " + "program is running on this file system " + "with the same name lockspace\n", + conn->cc_name, rc); goto out; + } if (ops_rv == -EOPNOTSUPP) { lc->oc_type = WITH_CONTROLD; diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 2f19aeec5482..ea47120a85ff 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1164,7 +1164,8 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, int flags, struct ocfs2_alloc_context **ac) { - int status; + int status, ret = 0; + int retried = 0; *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); if (!(*ac)) { @@ -1189,7 +1190,24 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, } if (status == -ENOSPC) { +retry: status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); + /* Retry if there is sufficient space cached in truncate log */ + if (status == -ENOSPC && !retried) { + retried = 1; + ocfs2_inode_unlock((*ac)->ac_inode, 1); + inode_unlock((*ac)->ac_inode); + + ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted); + if (ret == 1) + goto retry; + + if (ret < 0) + mlog_errno(ret); + + inode_lock((*ac)->ac_inode); + ocfs2_inode_lock((*ac)->ac_inode, NULL, 1); + } if (status < 0) { if (status != -ENOSPC) mlog_errno(status); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index d2053853951e..5bb44f7a78ee 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7344,7 +7344,7 @@ const struct xattr_handler ocfs2_xattr_trusted_handler = { * 'user' attributes support */ static int ocfs2_xattr_user_get(const struct xattr_handler *handler, - struct dentry *unusde, struct inode *inode, + struct dentry *unused, struct inode *inode, const char *name, void *buffer, size_t size) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); diff --git a/fs/open.c b/fs/open.c index 93ae3cdee4ab..bf66cf1a9f5c 100644 --- a/fs/open.c +++ b/fs/open.c @@ -840,13 +840,13 @@ EXPORT_SYMBOL(file_path); int vfs_open(const struct path *path, struct file *file, const struct cred *cred) { - struct inode *inode = vfs_select_inode(path->dentry, file->f_flags); + struct dentry *dentry = d_real(path->dentry, NULL, file->f_flags); - if (IS_ERR(inode)) - return PTR_ERR(inode); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); file->f_path = *path; - return do_dentry_open(file, inode, NULL, cred); + return do_dentry_open(file, d_backing_inode(dentry), NULL, cred); } struct file *dentry_open(const struct path *path, int flags, diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c index 5dfc4f3cfe68..00235bf644dc 100644 --- a/fs/orangefs/dcache.c +++ b/fs/orangefs/dcache.c @@ -73,6 +73,7 @@ static int orangefs_revalidate_lookup(struct dentry *dentry) } } + dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000; ret = 1; out_release_op: op_release(new_op); @@ -94,6 +95,9 @@ static int orangefs_d_revalidate(struct dentry *dentry, unsigned int flags) { int ret; + if (time_before(jiffies, dentry->d_time)) + return 1; + if (flags & LOOKUP_RCU) return -ECHILD; diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 8f2fa94cc4f6..28a0557a69be 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -262,7 +262,7 @@ int orangefs_getattr(struct vfsmount *mnt, "orangefs_getattr: called on %s\n", dentry->d_name.name); - ret = orangefs_inode_getattr(inode, 0, 1); + ret = orangefs_inode_getattr(inode, 0, 0); if (ret == 0) { generic_fillattr(inode, kstat); @@ -291,7 +291,7 @@ int orangefs_permission(struct inode *inode, int mask) } /* ORANGEDS2 implementation of VFS inode operations for files */ -struct inode_operations orangefs_file_inode_operations = { +const struct inode_operations orangefs_file_inode_operations = { .get_acl = orangefs_get_acl, .set_acl = orangefs_set_acl, .setattr = orangefs_setattr, @@ -384,7 +384,7 @@ struct inode *orangefs_iget(struct super_block *sb, struct orangefs_object_kref if (!inode || !(inode->i_state & I_NEW)) return inode; - error = orangefs_inode_getattr(inode, 1, 0); + error = orangefs_inode_getattr(inode, 1, 1); if (error) { iget_failed(inode); return ERR_PTR(error); @@ -429,7 +429,7 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, orangefs_set_inode(inode, ref); inode->i_ino = hash; /* needed for stat etc */ - error = orangefs_inode_getattr(inode, 1, 0); + error = orangefs_inode_getattr(inode, 1, 1); if (error) goto out_iput; diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 5a60c508af4e..62c525936ee8 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -72,6 +72,8 @@ static int orangefs_create(struct inode *dir, d_instantiate(dentry, inode); unlock_new_inode(inode); + dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000; + ORANGEFS_I(inode)->getattr_time = jiffies - 1; gossip_debug(GOSSIP_NAME_DEBUG, "%s: dentry instantiated for %s\n", @@ -181,6 +183,8 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry, goto out; } + dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000; + inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn); if (IS_ERR(inode)) { gossip_debug(GOSSIP_NAME_DEBUG, @@ -189,6 +193,8 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry, goto out; } + ORANGEFS_I(inode)->getattr_time = jiffies - 1; + gossip_debug(GOSSIP_NAME_DEBUG, "%s:%s:%d " "Found good inode [%lu] with count [%d]\n", @@ -316,6 +322,8 @@ static int orangefs_symlink(struct inode *dir, d_instantiate(dentry, inode); unlock_new_inode(inode); + dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000; + ORANGEFS_I(inode)->getattr_time = jiffies - 1; gossip_debug(GOSSIP_NAME_DEBUG, "Inode (Symlink) %pU -> %s\n", @@ -378,6 +386,8 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode d_instantiate(dentry, inode); unlock_new_inode(inode); + dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000; + ORANGEFS_I(inode)->getattr_time = jiffies - 1; gossip_debug(GOSSIP_NAME_DEBUG, "Inode (Directory) %pU -> %s\n", @@ -405,12 +415,10 @@ static int orangefs_rename(struct inode *old_dir, int ret; gossip_debug(GOSSIP_NAME_DEBUG, - "orangefs_rename: called (%s/%s => %s/%s) ct=%d\n", - old_dentry->d_parent->d_name.name, - old_dentry->d_name.name, - new_dentry->d_parent->d_name.name, - new_dentry->d_name.name, - d_count(new_dentry)); + "orangefs_rename: called (%pd2 => %pd2) ct=%d\n", + old_dentry, new_dentry, d_count(new_dentry)); + + ORANGEFS_I(new_dentry->d_parent->d_inode)->getattr_time = jiffies - 1; new_op = op_alloc(ORANGEFS_VFS_OP_RENAME); if (!new_op) @@ -442,7 +450,7 @@ static int orangefs_rename(struct inode *old_dir, } /* ORANGEFS implementation of VFS inode operations for directories */ -struct inode_operations orangefs_dir_inode_operations = { +const struct inode_operations orangefs_dir_inode_operations = { .lookup = orangefs_lookup, .get_acl = orangefs_get_acl, .set_acl = orangefs_set_acl, diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index c1181e5529af..633c07a6e3d8 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -246,6 +246,8 @@ struct orangefs_inode_s { * with this object */ unsigned long pinode_flags; + + unsigned long getattr_time; }; #define P_ATIME_FLAG 0 @@ -527,7 +529,7 @@ int orangefs_inode_setxattr(struct inode *inode, size_t size, int flags); -int orangefs_inode_getattr(struct inode *inode, int new, int size); +int orangefs_inode_getattr(struct inode *inode, int new, int bypass); int orangefs_inode_check_changed(struct inode *inode); @@ -546,6 +548,8 @@ extern struct mutex request_mutex; extern int debug; extern int op_timeout_secs; extern int slot_timeout_secs; +extern int dcache_timeout_msecs; +extern int getattr_timeout_msecs; extern struct list_head orangefs_superblocks; extern spinlock_t orangefs_superblocks_lock; extern struct list_head orangefs_request_list; @@ -557,10 +561,10 @@ extern int hash_table_size; extern const struct address_space_operations orangefs_address_operations; extern struct backing_dev_info orangefs_backing_dev_info; -extern struct inode_operations orangefs_file_inode_operations; +extern const struct inode_operations orangefs_file_inode_operations; extern const struct file_operations orangefs_file_operations; -extern struct inode_operations orangefs_symlink_inode_operations; -extern struct inode_operations orangefs_dir_inode_operations; +extern const struct inode_operations orangefs_symlink_inode_operations; +extern const struct inode_operations orangefs_dir_inode_operations; extern const struct file_operations orangefs_dir_operations; extern const struct dentry_operations orangefs_dentry_operations; extern const struct file_operations orangefs_devreq_file_operations; diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c index 6f072a8c0de1..e9fd5755c05f 100644 --- a/fs/orangefs/orangefs-mod.c +++ b/fs/orangefs/orangefs-mod.c @@ -47,6 +47,8 @@ struct client_debug_mask client_debug_mask = { NULL, 0, 0 }; unsigned int kernel_mask_set_mod_init; /* implicitly false */ int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS; int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS; +int dcache_timeout_msecs = 50; +int getattr_timeout_msecs = 50; MODULE_LICENSE("GPL"); MODULE_AUTHOR("ORANGEFS Development Team"); diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c index 5c03113e3ad2..375708c2db87 100644 --- a/fs/orangefs/orangefs-sysfs.c +++ b/fs/orangefs/orangefs-sysfs.c @@ -61,10 +61,21 @@ * Slots are requested and waited for, * the wait times out after slot_timeout_secs. * + * What: /sys/fs/orangefs/dcache_timeout_msecs + * Date: Jul 2016 + * Contact: Martin Brandenburg <martin@omnibond.com> + * Description: + * Time lookup is valid in milliseconds. + * + * What: /sys/fs/orangefs/getattr_timeout_msecs + * Date: Jul 2016 + * Contact: Martin Brandenburg <martin@omnibond.com> + * Description: + * Time getattr is valid in milliseconds. * * What: /sys/fs/orangefs/acache/... * Date: Jun 2015 - * Contact: Mike Marshall <hubcap@omnibond.com> + * Contact: Martin Brandenburg <martin@omnibond.com> * Description: * Attribute cache configurable settings. * @@ -117,6 +128,8 @@ struct orangefs_obj { int perf_history_size; int perf_time_interval_secs; int slot_timeout_secs; + int dcache_timeout_msecs; + int getattr_timeout_msecs; }; struct acache_orangefs_obj { @@ -658,6 +671,20 @@ static ssize_t sysfs_int_show(char *kobj_id, char *buf, void *attr) "%d\n", slot_timeout_secs); goto out; + } else if (!strcmp(orangefs_attr->attr.name, + "dcache_timeout_msecs")) { + rc = scnprintf(buf, + PAGE_SIZE, + "%d\n", + dcache_timeout_msecs); + goto out; + } else if (!strcmp(orangefs_attr->attr.name, + "getattr_timeout_msecs")) { + rc = scnprintf(buf, + PAGE_SIZE, + "%d\n", + getattr_timeout_msecs); + goto out; } else { goto out; } @@ -734,6 +761,12 @@ static ssize_t int_store(struct orangefs_obj *orangefs_obj, } else if (!strcmp(attr->attr.name, "slot_timeout_secs")) { rc = kstrtoint(buf, 0, &slot_timeout_secs); goto out; + } else if (!strcmp(attr->attr.name, "dcache_timeout_msecs")) { + rc = kstrtoint(buf, 0, &dcache_timeout_msecs); + goto out; + } else if (!strcmp(attr->attr.name, "getattr_timeout_msecs")) { + rc = kstrtoint(buf, 0, &getattr_timeout_msecs); + goto out; } else { goto out; } @@ -1361,6 +1394,12 @@ static struct orangefs_attribute op_timeout_secs_attribute = static struct orangefs_attribute slot_timeout_secs_attribute = __ATTR(slot_timeout_secs, 0664, int_orangefs_show, int_store); +static struct orangefs_attribute dcache_timeout_msecs_attribute = + __ATTR(dcache_timeout_msecs, 0664, int_orangefs_show, int_store); + +static struct orangefs_attribute getattr_timeout_msecs_attribute = + __ATTR(getattr_timeout_msecs, 0664, int_orangefs_show, int_store); + static struct orangefs_attribute perf_counter_reset_attribute = __ATTR(perf_counter_reset, 0664, @@ -1382,6 +1421,8 @@ static struct orangefs_attribute perf_time_interval_secs_attribute = static struct attribute *orangefs_default_attrs[] = { &op_timeout_secs_attribute.attr, &slot_timeout_secs_attribute.attr, + &dcache_timeout_msecs_attribute.attr, + &getattr_timeout_msecs_attribute.attr, &perf_counter_reset_attribute.attr, &perf_history_size_attribute.attr, &perf_time_interval_secs_attribute.attr, diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index c5fbc62357c6..d13c7291fd05 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -251,7 +251,7 @@ static int orangefs_inode_is_stale(struct inode *inode, int new, return 0; } -int orangefs_inode_getattr(struct inode *inode, int new, int size) +int orangefs_inode_getattr(struct inode *inode, int new, int bypass) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_kernel_op_s *new_op; @@ -261,12 +261,16 @@ int orangefs_inode_getattr(struct inode *inode, int new, int size) gossip_debug(GOSSIP_UTILS_DEBUG, "%s: called on inode %pU\n", __func__, get_khandle_from_ino(inode)); + if (!new && !bypass) { + if (time_before(jiffies, orangefs_inode->getattr_time)) + return 0; + } + new_op = op_alloc(ORANGEFS_VFS_OP_GETATTR); if (!new_op) return -ENOMEM; new_op->upcall.req.getattr.refn = orangefs_inode->refn; - new_op->upcall.req.getattr.mask = size ? - ORANGEFS_ATTR_SYS_ALL_NOHINT : ORANGEFS_ATTR_SYS_ALL_NOHINT_NOSIZE; + new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_ALL_NOHINT; ret = service_operation(new_op, __func__, get_interruptible_flag(inode)); @@ -287,20 +291,18 @@ int orangefs_inode_getattr(struct inode *inode, int new, int size) case S_IFREG: inode->i_flags = orangefs_inode_flags(&new_op-> downcall.resp.getattr.attributes); - if (size) { - inode_size = (loff_t)new_op-> - downcall.resp.getattr.attributes.size; - rounded_up_size = - (inode_size + (4096 - (inode_size % 4096))); - inode->i_size = inode_size; - orangefs_inode->blksize = - new_op->downcall.resp.getattr.attributes.blksize; - spin_lock(&inode->i_lock); - inode->i_bytes = inode_size; - inode->i_blocks = - (unsigned long)(rounded_up_size / 512); - spin_unlock(&inode->i_lock); - } + inode_size = (loff_t)new_op-> + downcall.resp.getattr.attributes.size; + rounded_up_size = + (inode_size + (4096 - (inode_size % 4096))); + inode->i_size = inode_size; + orangefs_inode->blksize = + new_op->downcall.resp.getattr.attributes.blksize; + spin_lock(&inode->i_lock); + inode->i_bytes = inode_size; + inode->i_blocks = + (unsigned long)(rounded_up_size / 512); + spin_unlock(&inode->i_lock); break; case S_IFDIR: inode->i_size = PAGE_SIZE; @@ -345,6 +347,7 @@ int orangefs_inode_getattr(struct inode *inode, int new, int size) inode->i_mode = type | (is_root_handle(inode) ? S_ISVTX : 0) | orangefs_inode_perms(&new_op->downcall.resp.getattr.attributes); + orangefs_inode->getattr_time = jiffies + getattr_timeout_msecs*HZ/1000; ret = 0; out: op_release(new_op); @@ -418,6 +421,7 @@ int orangefs_inode_setattr(struct inode *inode, struct iattr *iattr) ClearMtimeFlag(orangefs_inode); ClearCtimeFlag(orangefs_inode); ClearModeFlag(orangefs_inode); + orangefs_inode->getattr_time = jiffies - 1; } return ret; diff --git a/fs/orangefs/protocol.h b/fs/orangefs/protocol.h index 1efc6f8a5224..3d7418c728f5 100644 --- a/fs/orangefs/protocol.h +++ b/fs/orangefs/protocol.h @@ -207,14 +207,6 @@ typedef __s64 ORANGEFS_offset; ORANGEFS_ATTR_SYS_DIRENT_COUNT | \ ORANGEFS_ATTR_SYS_BLKSIZE) -#define ORANGEFS_ATTR_SYS_ALL_NOHINT_NOSIZE \ - (ORANGEFS_ATTR_SYS_COMMON_ALL | \ - ORANGEFS_ATTR_SYS_LNK_TARGET | \ - ORANGEFS_ATTR_SYS_DFILE_COUNT | \ - ORANGEFS_ATTR_SYS_MIRROR_COPIES_COUNT | \ - ORANGEFS_ATTR_SYS_DIRENT_COUNT | \ - ORANGEFS_ATTR_SYS_BLKSIZE) - #define ORANGEFS_XATTR_REPLACE 0x2 #define ORANGEFS_XATTR_CREATE 0x1 #define ORANGEFS_MAX_SERVER_ADDR_LEN 256 diff --git a/fs/orangefs/symlink.c b/fs/orangefs/symlink.c index 6418dd638680..8fecf823f5ba 100644 --- a/fs/orangefs/symlink.c +++ b/fs/orangefs/symlink.c @@ -8,7 +8,7 @@ #include "orangefs-kernel.h" #include "orangefs-bufmap.h" -struct inode_operations orangefs_symlink_inode_operations = { +const struct inode_operations orangefs_symlink_inode_operations = { .readlink = generic_readlink, .get_link = simple_get_link, .setattr = orangefs_setattr, diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 80aa6f1eb336..54e5d6681786 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -292,6 +292,7 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, goto out_cleanup; ovl_dentry_update(dentry, newdentry); + ovl_inode_update(d_inode(dentry), d_inode(newdentry)); newdentry = NULL; /* diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 5c9d2d80ff70..12bcd07b9e32 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -138,9 +138,12 @@ static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry, int err; enum ovl_path_type type; struct path realpath; + const struct cred *old_cred; type = ovl_path_real(dentry, &realpath); + old_cred = ovl_override_creds(dentry->d_sb); err = vfs_getattr(&realpath, stat); + revert_creds(old_cred); if (err) return err; @@ -158,6 +161,22 @@ static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry, return 0; } +/* Common operations required to be done after creation of file on upper */ +static void ovl_instantiate(struct dentry *dentry, struct inode *inode, + struct dentry *newdentry, bool hardlink) +{ + ovl_dentry_version_inc(dentry->d_parent); + ovl_dentry_update(dentry, newdentry); + if (!hardlink) { + ovl_inode_update(inode, d_inode(newdentry)); + ovl_copyattr(newdentry->d_inode, inode); + } else { + WARN_ON(ovl_inode_real(inode, NULL) != d_inode(newdentry)); + inc_nlink(inode); + } + d_instantiate(dentry, inode); +} + static int ovl_create_upper(struct dentry *dentry, struct inode *inode, struct kstat *stat, const char *link, struct dentry *hardlink) @@ -177,10 +196,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, if (err) goto out_dput; - ovl_dentry_version_inc(dentry->d_parent); - ovl_dentry_update(dentry, newdentry); - ovl_copyattr(newdentry->d_inode, inode); - d_instantiate(dentry, inode); + ovl_instantiate(dentry, inode, newdentry, !!hardlink); newdentry = NULL; out_dput: dput(newdentry); @@ -291,23 +307,29 @@ static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry) { int err; struct dentry *ret = NULL; + enum ovl_path_type type = ovl_path_type(dentry); LIST_HEAD(list); err = ovl_check_empty_dir(dentry, &list); - if (err) + if (err) { ret = ERR_PTR(err); - else { - /* - * If no upperdentry then skip clearing whiteouts. - * - * Can race with copy-up, since we don't hold the upperdir - * mutex. Doesn't matter, since copy-up can't create a - * non-empty directory from an empty one. - */ - if (ovl_dentry_upper(dentry)) - ret = ovl_clear_empty(dentry, &list); + goto out_free; } + /* + * When removing an empty opaque directory, then it makes no sense to + * replace it with an exact replica of itself. + * + * If no upperdentry then skip clearing whiteouts. + * + * Can race with copy-up, since we don't hold the upperdir mutex. + * Doesn't matter, since copy-up can't create a non-empty directory + * from an empty one. + */ + if (OVL_TYPE_UPPER(type) && OVL_TYPE_MERGE(type)) + ret = ovl_clear_empty(dentry, &list); + +out_free: ovl_cache_free(&list); return ret; @@ -347,7 +369,23 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (err) goto out_dput2; - if (S_ISDIR(stat->mode)) { + /* + * mode could have been mutilated due to umask (e.g. sgid directory) + */ + if (!hardlink && + !S_ISLNK(stat->mode) && newdentry->d_inode->i_mode != stat->mode) { + struct iattr attr = { + .ia_valid = ATTR_MODE, + .ia_mode = stat->mode, + }; + inode_lock(newdentry->d_inode); + err = notify_change(newdentry, &attr, NULL); + inode_unlock(newdentry->d_inode); + if (err) + goto out_cleanup; + } + + if (!hardlink && S_ISDIR(stat->mode)) { err = ovl_set_opaque(newdentry); if (err) goto out_cleanup; @@ -363,10 +401,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (err) goto out_cleanup; } - ovl_dentry_version_inc(dentry->d_parent); - ovl_dentry_update(dentry, newdentry); - ovl_copyattr(newdentry->d_inode, inode); - d_instantiate(dentry, inode); + ovl_instantiate(dentry, inode, newdentry, !!hardlink); newdentry = NULL; out_dput2: dput(upper); @@ -382,52 +417,42 @@ out_cleanup: goto out_dput2; } -static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev, - const char *link, struct dentry *hardlink) +static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, + struct kstat *stat, const char *link, + struct dentry *hardlink) { int err; - struct inode *inode; - struct kstat stat = { - .mode = mode, - .rdev = rdev, - }; - - err = -ENOMEM; - inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata); - if (!inode) - goto out; + const struct cred *old_cred; + struct cred *override_cred; err = ovl_copy_up(dentry->d_parent); if (err) - goto out_iput; - - if (!ovl_dentry_is_opaque(dentry)) { - err = ovl_create_upper(dentry, inode, &stat, link, hardlink); - } else { - const struct cred *old_cred; - struct cred *override_cred; - - old_cred = ovl_override_creds(dentry->d_sb); - - err = -ENOMEM; - override_cred = prepare_creds(); - if (override_cred) { - override_cred->fsuid = old_cred->fsuid; - override_cred->fsgid = old_cred->fsgid; - put_cred(override_creds(override_cred)); - put_cred(override_cred); + return err; - err = ovl_create_over_whiteout(dentry, inode, &stat, - link, hardlink); - } - revert_creds(old_cred); + old_cred = ovl_override_creds(dentry->d_sb); + err = -ENOMEM; + override_cred = prepare_creds(); + if (override_cred) { + override_cred->fsuid = inode->i_uid; + override_cred->fsgid = inode->i_gid; + put_cred(override_creds(override_cred)); + put_cred(override_cred); + + if (!ovl_dentry_is_opaque(dentry)) + err = ovl_create_upper(dentry, inode, stat, link, + hardlink); + else + err = ovl_create_over_whiteout(dentry, inode, stat, + link, hardlink); } + revert_creds(old_cred); + if (!err) { + struct inode *realinode = d_inode(ovl_dentry_upper(dentry)); - if (!err) - inode = NULL; -out_iput: - iput(inode); -out: + WARN_ON(inode->i_mode != realinode->i_mode); + WARN_ON(!uid_eq(inode->i_uid, realinode->i_uid)); + WARN_ON(!gid_eq(inode->i_gid, realinode->i_gid)); + } return err; } @@ -435,13 +460,30 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, const char *link) { int err; + struct inode *inode; + struct kstat stat = { + .rdev = rdev, + }; err = ovl_want_write(dentry); - if (!err) { - err = ovl_create_or_link(dentry, mode, rdev, link, NULL); - ovl_drop_write(dentry); - } + if (err) + goto out; + + err = -ENOMEM; + inode = ovl_new_inode(dentry->d_sb, mode); + if (!inode) + goto out_drop_write; + + inode_init_owner(inode, dentry->d_parent->d_inode, mode); + stat.mode = inode->i_mode; + err = ovl_create_or_link(dentry, inode, &stat, link, NULL); + if (err) + iput(inode); + +out_drop_write: + ovl_drop_write(dentry); +out: return err; } @@ -476,7 +518,7 @@ static int ovl_link(struct dentry *old, struct inode *newdir, struct dentry *new) { int err; - struct dentry *upper; + struct inode *inode; err = ovl_want_write(old); if (err) @@ -486,8 +528,12 @@ static int ovl_link(struct dentry *old, struct inode *newdir, if (err) goto out_drop_write; - upper = ovl_dentry_upper(old); - err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper); + inode = d_inode(old); + ihold(inode); + + err = ovl_create_or_link(new, inode, NULL, NULL, ovl_dentry_upper(old)); + if (err) + iput(inode); out_drop_write: ovl_drop_write(old); @@ -511,24 +557,10 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir) return -EROFS; if (is_dir) { - if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) { - opaquedir = ovl_check_empty_and_clear(dentry); - err = PTR_ERR(opaquedir); - if (IS_ERR(opaquedir)) - goto out; - } else { - LIST_HEAD(list); - - /* - * When removing an empty opaque directory, then it - * makes no sense to replace it with an exact replica of - * itself. But emptiness still needs to be checked. - */ - err = ovl_check_empty_dir(dentry, &list); - ovl_cache_free(&list); - if (err) - goto out; - } + opaquedir = ovl_check_empty_and_clear(dentry); + err = PTR_ERR(opaquedir); + if (IS_ERR(opaquedir)) + goto out; } err = ovl_lock_rename_workdir(workdir, upperdir); @@ -633,6 +665,8 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) { enum ovl_path_type type; int err; + const struct cred *old_cred; + err = ovl_check_sticky(dentry); if (err) @@ -647,14 +681,18 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) goto out_drop_write; type = ovl_path_type(dentry); - if (OVL_TYPE_PURE_UPPER(type)) { - err = ovl_remove_upper(dentry, is_dir); - } else { - const struct cred *old_cred = ovl_override_creds(dentry->d_sb); + old_cred = ovl_override_creds(dentry->d_sb); + if (OVL_TYPE_PURE_UPPER(type)) + err = ovl_remove_upper(dentry, is_dir); + else err = ovl_remove_and_whiteout(dentry, is_dir); - - revert_creds(old_cred); + revert_creds(old_cred); + if (!err) { + if (is_dir) + clear_nlink(dentry->d_inode); + else + drop_nlink(dentry->d_inode); } out_drop_write: ovl_drop_write(dentry); @@ -760,8 +798,7 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, old_opaque = !OVL_TYPE_PURE_UPPER(old_type); new_opaque = !OVL_TYPE_PURE_UPPER(new_type); - if (old_opaque || new_opaque) - old_cred = ovl_override_creds(old->d_sb); + old_cred = ovl_override_creds(old->d_sb); if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) { opaquedir = ovl_check_empty_and_clear(new); @@ -891,8 +928,7 @@ out_dput_old: out_unlock: unlock_rename(new_upperdir, old_upperdir); out_revert_creds: - if (old_opaque || new_opaque) - revert_creds(old_cred); + revert_creds(old_cred); out_drop_write: ovl_drop_write(old); out: @@ -913,8 +949,10 @@ const struct inode_operations ovl_dir_inode_operations = { .mknod = ovl_mknod, .permission = ovl_permission, .getattr = ovl_dir_getattr, - .setxattr = ovl_setxattr, + .setxattr = generic_setxattr, .getxattr = ovl_getxattr, .listxattr = ovl_listxattr, .removexattr = ovl_removexattr, + .get_acl = ovl_get_acl, + .update_time = ovl_update_time, }; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index d1cdc60dd68f..1b885c156028 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -41,6 +41,7 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) { int err; struct dentry *upperdentry; + const struct cred *old_cred; /* * Check for permissions before trying to copy-up. This is redundant @@ -84,7 +85,9 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_valid &= ~ATTR_MODE; inode_lock(upperdentry->d_inode); + old_cred = ovl_override_creds(dentry->d_sb); err = notify_change(upperdentry, attr, NULL); + revert_creds(old_cred); if (!err) ovl_copyattr(upperdentry->d_inode, dentry->d_inode); inode_unlock(upperdentry->d_inode); @@ -102,96 +105,46 @@ static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct path realpath; + const struct cred *old_cred; + int err; ovl_path_real(dentry, &realpath); - return vfs_getattr(&realpath, stat); + old_cred = ovl_override_creds(dentry->d_sb); + err = vfs_getattr(&realpath, stat); + revert_creds(old_cred); + return err; } int ovl_permission(struct inode *inode, int mask) { - struct ovl_entry *oe; - struct dentry *alias = NULL; - struct inode *realinode; - struct dentry *realdentry; bool is_upper; + struct inode *realinode = ovl_inode_real(inode, &is_upper); + const struct cred *old_cred; int err; - if (S_ISDIR(inode->i_mode)) { - oe = inode->i_private; - } else if (mask & MAY_NOT_BLOCK) { - return -ECHILD; - } else { - /* - * For non-directories find an alias and get the info - * from there. - */ - alias = d_find_any_alias(inode); - if (WARN_ON(!alias)) - return -ENOENT; - - oe = alias->d_fsdata; - } - - realdentry = ovl_entry_real(oe, &is_upper); - - if (ovl_is_default_permissions(inode)) { - struct kstat stat; - struct path realpath = { .dentry = realdentry }; - - if (mask & MAY_NOT_BLOCK) - return -ECHILD; - - realpath.mnt = ovl_entry_mnt_real(oe, inode, is_upper); - - err = vfs_getattr(&realpath, &stat); - if (err) - goto out_dput; - - err = -ESTALE; - if ((stat.mode ^ inode->i_mode) & S_IFMT) - goto out_dput; - - inode->i_mode = stat.mode; - inode->i_uid = stat.uid; - inode->i_gid = stat.gid; - - err = generic_permission(inode, mask); - goto out_dput; - } - /* Careful in RCU walk mode */ - realinode = ACCESS_ONCE(realdentry->d_inode); if (!realinode) { WARN_ON(!(mask & MAY_NOT_BLOCK)); - err = -ENOENT; - goto out_dput; + return -ECHILD; } - if (mask & MAY_WRITE) { - umode_t mode = realinode->i_mode; - - /* - * Writes will always be redirected to upper layer, so - * ignore lower layer being read-only. - * - * If the overlay itself is read-only then proceed - * with the permission check, don't return EROFS. - * This will only happen if this is the lower layer of - * another overlayfs. - * - * If upper fs becomes read-only after the overlay was - * constructed return EROFS to prevent modification of - * upper layer. - */ - err = -EROFS; - if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) && - (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) - goto out_dput; + /* + * Check overlay inode with the creds of task and underlying inode + * with creds of mounter + */ + err = generic_permission(inode, mask); + if (err) + return err; + + old_cred = ovl_override_creds(inode->i_sb); + if (!is_upper && !special_file(realinode->i_mode) && mask & MAY_WRITE) { + mask &= ~(MAY_WRITE | MAY_APPEND); + /* Make sure mounter can read file for copy up later */ + mask |= MAY_READ; } + err = inode_permission(realinode, mask); + revert_creds(old_cred); - err = __inode_permission(realinode, mask); -out_dput: - dput(alias); return err; } @@ -201,6 +154,8 @@ static const char *ovl_get_link(struct dentry *dentry, { struct dentry *realdentry; struct inode *realinode; + const struct cred *old_cred; + const char *p; if (!dentry) return ERR_PTR(-ECHILD); @@ -211,13 +166,18 @@ static const char *ovl_get_link(struct dentry *dentry, if (WARN_ON(!realinode->i_op->get_link)) return ERR_PTR(-EPERM); - return realinode->i_op->get_link(realdentry, realinode, done); + old_cred = ovl_override_creds(dentry->d_sb); + p = realinode->i_op->get_link(realdentry, realinode, done); + revert_creds(old_cred); + return p; } static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz) { struct path realpath; struct inode *realinode; + const struct cred *old_cred; + int err; ovl_path_real(dentry, &realpath); realinode = realpath.dentry->d_inode; @@ -225,15 +185,17 @@ static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz) if (!realinode->i_op->readlink) return -EINVAL; - touch_atime(&realpath); - - return realinode->i_op->readlink(realpath.dentry, buf, bufsiz); + old_cred = ovl_override_creds(dentry->d_sb); + err = realinode->i_op->readlink(realpath.dentry, buf, bufsiz); + revert_creds(old_cred); + return err; } - static bool ovl_is_private_xattr(const char *name) { - return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0; +#define OVL_XATTR_PRE_NAME OVL_XATTR_PREFIX "." + return strncmp(name, OVL_XATTR_PRE_NAME, + sizeof(OVL_XATTR_PRE_NAME) - 1) == 0; } int ovl_setxattr(struct dentry *dentry, struct inode *inode, @@ -242,21 +204,20 @@ int ovl_setxattr(struct dentry *dentry, struct inode *inode, { int err; struct dentry *upperdentry; + const struct cred *old_cred; err = ovl_want_write(dentry); if (err) goto out; - err = -EPERM; - if (ovl_is_private_xattr(name)) - goto out_drop_write; - err = ovl_copy_up(dentry); if (err) goto out_drop_write; upperdentry = ovl_dentry_upper(dentry); + old_cred = ovl_override_creds(dentry->d_sb); err = vfs_setxattr(upperdentry, name, value, size, flags); + revert_creds(old_cred); out_drop_write: ovl_drop_write(dentry); @@ -268,11 +229,16 @@ ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size) { struct dentry *realdentry = ovl_dentry_real(dentry); + ssize_t res; + const struct cred *old_cred; if (ovl_is_private_xattr(name)) return -ENODATA; - return vfs_getxattr(realdentry, name, value, size); + old_cred = ovl_override_creds(dentry->d_sb); + res = vfs_getxattr(realdentry, name, value, size); + revert_creds(old_cred); + return res; } ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) @@ -280,8 +246,11 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) struct dentry *realdentry = ovl_dentry_real(dentry); ssize_t res; int off; + const struct cred *old_cred; + old_cred = ovl_override_creds(dentry->d_sb); res = vfs_listxattr(realdentry, list, size); + revert_creds(old_cred); if (res <= 0 || size == 0) return res; @@ -308,6 +277,7 @@ int ovl_removexattr(struct dentry *dentry, const char *name) int err; struct path realpath; enum ovl_path_type type = ovl_path_real(dentry, &realpath); + const struct cred *old_cred; err = ovl_want_write(dentry); if (err) @@ -329,13 +299,34 @@ int ovl_removexattr(struct dentry *dentry, const char *name) ovl_path_upper(dentry, &realpath); } + old_cred = ovl_override_creds(dentry->d_sb); err = vfs_removexattr(realpath.dentry, name); + revert_creds(old_cred); out_drop_write: ovl_drop_write(dentry); out: return err; } +struct posix_acl *ovl_get_acl(struct inode *inode, int type) +{ + struct inode *realinode = ovl_inode_real(inode, NULL); + const struct cred *old_cred; + struct posix_acl *acl; + + if (!IS_POSIXACL(realinode)) + return NULL; + + if (!realinode->i_op->get_acl) + return NULL; + + old_cred = ovl_override_creds(inode->i_sb); + acl = realinode->i_op->get_acl(realinode, type); + revert_creds(old_cred); + + return acl; +} + static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type, struct dentry *realdentry) { @@ -351,46 +342,60 @@ static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type, return true; } -struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags) +int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags) { - int err; + int err = 0; struct path realpath; enum ovl_path_type type; - if (d_is_dir(dentry)) - return d_backing_inode(dentry); - type = ovl_path_real(dentry, &realpath); if (ovl_open_need_copy_up(file_flags, type, realpath.dentry)) { err = ovl_want_write(dentry); - if (err) - return ERR_PTR(err); + if (!err) { + if (file_flags & O_TRUNC) + err = ovl_copy_up_truncate(dentry); + else + err = ovl_copy_up(dentry); + ovl_drop_write(dentry); + } + } - if (file_flags & O_TRUNC) - err = ovl_copy_up_truncate(dentry); - else - err = ovl_copy_up(dentry); - ovl_drop_write(dentry); - if (err) - return ERR_PTR(err); + return err; +} - ovl_path_upper(dentry, &realpath); +int ovl_update_time(struct inode *inode, struct timespec *ts, int flags) +{ + struct dentry *alias; + struct path upperpath; + + if (!(flags & S_ATIME)) + return 0; + + alias = d_find_any_alias(inode); + if (!alias) + return 0; + + ovl_path_upper(alias, &upperpath); + if (upperpath.dentry) { + touch_atime(&upperpath); + inode->i_atime = d_inode(upperpath.dentry)->i_atime; } - if (realpath.dentry->d_flags & DCACHE_OP_SELECT_INODE) - return realpath.dentry->d_op->d_select_inode(realpath.dentry, file_flags); + dput(alias); - return d_backing_inode(realpath.dentry); + return 0; } static const struct inode_operations ovl_file_inode_operations = { .setattr = ovl_setattr, .permission = ovl_permission, .getattr = ovl_getattr, - .setxattr = ovl_setxattr, + .setxattr = generic_setxattr, .getxattr = ovl_getxattr, .listxattr = ovl_listxattr, .removexattr = ovl_removexattr, + .get_acl = ovl_get_acl, + .update_time = ovl_update_time, }; static const struct inode_operations ovl_symlink_inode_operations = { @@ -398,29 +403,22 @@ static const struct inode_operations ovl_symlink_inode_operations = { .get_link = ovl_get_link, .readlink = ovl_readlink, .getattr = ovl_getattr, - .setxattr = ovl_setxattr, + .setxattr = generic_setxattr, .getxattr = ovl_getxattr, .listxattr = ovl_listxattr, .removexattr = ovl_removexattr, + .update_time = ovl_update_time, }; -struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, - struct ovl_entry *oe) +static void ovl_fill_inode(struct inode *inode, umode_t mode) { - struct inode *inode; - - inode = new_inode(sb); - if (!inode) - return NULL; - inode->i_ino = get_next_ino(); inode->i_mode = mode; - inode->i_flags |= S_NOATIME | S_NOCMTIME; + inode->i_flags |= S_NOCMTIME; mode &= S_IFMT; switch (mode) { case S_IFDIR: - inode->i_private = oe; inode->i_op = &ovl_dir_inode_operations; inode->i_fop = &ovl_dir_operations; break; @@ -429,6 +427,10 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, inode->i_op = &ovl_symlink_inode_operations; break; + default: + WARN(1, "illegal file type: %i\n", mode); + /* Fall through */ + case S_IFREG: case S_IFSOCK: case S_IFBLK: @@ -436,11 +438,42 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, case S_IFIFO: inode->i_op = &ovl_file_inode_operations; break; + } +} - default: - WARN(1, "illegal file type: %i\n", mode); - iput(inode); - inode = NULL; +struct inode *ovl_new_inode(struct super_block *sb, umode_t mode) +{ + struct inode *inode; + + inode = new_inode(sb); + if (inode) + ovl_fill_inode(inode, mode); + + return inode; +} + +static int ovl_inode_test(struct inode *inode, void *data) +{ + return ovl_inode_real(inode, NULL) == data; +} + +static int ovl_inode_set(struct inode *inode, void *data) +{ + inode->i_private = (void *) (((unsigned long) data) | OVL_ISUPPER_MASK); + return 0; +} + +struct inode *ovl_get_inode(struct super_block *sb, struct inode *realinode) + +{ + struct inode *inode; + + inode = iget5_locked(sb, (unsigned long) realinode, + ovl_inode_test, ovl_inode_set, realinode); + if (inode && inode->i_state & I_NEW) { + ovl_fill_inode(inode, realinode->i_mode); + set_nlink(inode, realinode->i_nlink); + unlock_new_inode(inode); } return inode; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index cfbca53590d0..e4f5c9536bfe 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -23,9 +23,11 @@ enum ovl_path_type { #define OVL_TYPE_MERGE_OR_LOWER(type) \ (OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type)) -#define OVL_XATTR_PRE_NAME "trusted.overlay." -#define OVL_XATTR_PRE_LEN 16 -#define OVL_XATTR_OPAQUE OVL_XATTR_PRE_NAME"opaque" + +#define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay" +#define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX ".opaque" + +#define OVL_ISUPPER_MASK 1UL static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry) { @@ -131,6 +133,16 @@ static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry) return err; } +static inline struct inode *ovl_inode_real(struct inode *inode, bool *is_upper) +{ + unsigned long x = (unsigned long) READ_ONCE(inode->i_private); + + if (is_upper) + *is_upper = x & OVL_ISUPPER_MASK; + + return (struct inode *) (x & ~OVL_ISUPPER_MASK); +} + enum ovl_path_type ovl_path_type(struct dentry *dentry); u64 ovl_dentry_version_get(struct dentry *dentry); void ovl_dentry_version_inc(struct dentry *dentry); @@ -141,11 +153,9 @@ int ovl_path_next(int idx, struct dentry *dentry, struct path *path); struct dentry *ovl_dentry_upper(struct dentry *dentry); struct dentry *ovl_dentry_lower(struct dentry *dentry); struct dentry *ovl_dentry_real(struct dentry *dentry); -struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper); struct vfsmount *ovl_entry_mnt_real(struct ovl_entry *oe, struct inode *inode, bool is_upper); struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry); -bool ovl_is_default_permissions(struct inode *inode); void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache); struct dentry *ovl_workdir(struct dentry *dentry); int ovl_want_write(struct dentry *dentry); @@ -155,6 +165,7 @@ void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque); bool ovl_is_whiteout(struct dentry *dentry); const struct cred *ovl_override_creds(struct super_block *sb); void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry); +void ovl_inode_update(struct inode *inode, struct inode *upperinode); struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); struct file *ovl_path_open(struct path *path, int flags); @@ -179,15 +190,20 @@ ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size); ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); int ovl_removexattr(struct dentry *dentry, const char *name); -struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags); +struct posix_acl *ovl_get_acl(struct inode *inode, int type); +int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags); +int ovl_update_time(struct inode *inode, struct timespec *ts, int flags); -struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, - struct ovl_entry *oe); +struct inode *ovl_new_inode(struct super_block *sb, umode_t mode); +struct inode *ovl_get_inode(struct super_block *sb, struct inode *realinode); static inline void ovl_copyattr(struct inode *from, struct inode *to) { to->i_uid = from->i_uid; to->i_gid = from->i_gid; to->i_mode = from->i_mode; + to->i_atime = from->i_atime; + to->i_mtime = from->i_mtime; + to->i_ctime = from->i_ctime; } /* dir.c */ diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 9a7693d5f8ff..4036132842b5 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -16,10 +16,10 @@ #include <linux/slab.h> #include <linux/parser.h> #include <linux/module.h> -#include <linux/pagemap.h> #include <linux/sched.h> #include <linux/statfs.h> #include <linux/seq_file.h> +#include <linux/posix_acl_xattr.h> #include "overlayfs.h" MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); @@ -145,18 +145,11 @@ struct dentry *ovl_dentry_real(struct dentry *dentry) return realdentry; } -struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper) +static void ovl_inode_init(struct inode *inode, struct inode *realinode, + bool is_upper) { - struct dentry *realdentry; - - realdentry = ovl_upperdentry_dereference(oe); - if (realdentry) { - *is_upper = true; - } else { - realdentry = __ovl_dentry_lower(oe); - *is_upper = false; - } - return realdentry; + WRITE_ONCE(inode->i_private, (unsigned long) realinode | + (is_upper ? OVL_ISUPPER_MASK : 0)); } struct vfsmount *ovl_entry_mnt_real(struct ovl_entry *oe, struct inode *inode, @@ -178,13 +171,6 @@ struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry) return oe->cache; } -bool ovl_is_default_permissions(struct inode *inode) -{ - struct ovl_fs *ofs = inode->i_sb->s_fs_info; - - return ofs->config.default_permissions; -} - void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache) { struct ovl_entry *oe = dentry->d_fsdata; @@ -235,7 +221,6 @@ void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry) WARN_ON(!inode_is_locked(upperdentry->d_parent->d_inode)); WARN_ON(oe->__upperdentry); - BUG_ON(!upperdentry->d_inode); /* * Make sure upperdentry is consistent before making it visible to * ovl_upperdentry_dereference(). @@ -244,6 +229,16 @@ void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry) oe->__upperdentry = upperdentry; } +void ovl_inode_update(struct inode *inode, struct inode *upperinode) +{ + WARN_ON(!upperinode); + WARN_ON(!inode_unhashed(inode)); + WRITE_ONCE(inode->i_private, + (unsigned long) upperinode | OVL_ISUPPER_MASK); + if (!S_ISDIR(upperinode->i_mode)) + __insert_inode_hash(inode, (unsigned long) upperinode); +} + void ovl_dentry_version_inc(struct dentry *dentry) { struct ovl_entry *oe = dentry->d_fsdata; @@ -304,7 +299,9 @@ static void ovl_dentry_release(struct dentry *dentry) } } -static struct dentry *ovl_d_real(struct dentry *dentry, struct inode *inode) +static struct dentry *ovl_d_real(struct dentry *dentry, + const struct inode *inode, + unsigned int open_flags) { struct dentry *real; @@ -314,6 +311,16 @@ static struct dentry *ovl_d_real(struct dentry *dentry, struct inode *inode) goto bug; } + if (d_is_negative(dentry)) + return dentry; + + if (open_flags) { + int err = ovl_open_maybe_copy_up(dentry, open_flags); + + if (err) + return ERR_PTR(err); + } + real = ovl_dentry_upper(dentry); if (real && (!inode || inode == d_inode(real))) return real; @@ -326,11 +333,9 @@ static struct dentry *ovl_d_real(struct dentry *dentry, struct inode *inode) return real; /* Handle recursion */ - if (real->d_flags & DCACHE_OP_REAL) - return real->d_op->d_real(real, inode); - + return d_real(real, inode, open_flags); bug: - WARN(1, "ovl_d_real(%pd4, %s:%lu\n): real dentry not found\n", dentry, + WARN(1, "ovl_d_real(%pd4, %s:%lu): real dentry not found\n", dentry, inode ? inode->i_sb->s_id : "NULL", inode ? inode->i_ino : 0); return dentry; } @@ -378,13 +383,11 @@ static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags) static const struct dentry_operations ovl_dentry_operations = { .d_release = ovl_dentry_release, - .d_select_inode = ovl_d_select_inode, .d_real = ovl_d_real, }; static const struct dentry_operations ovl_reval_dentry_operations = { .d_release = ovl_dentry_release, - .d_select_inode = ovl_d_select_inode, .d_real = ovl_d_real, .d_revalidate = ovl_dentry_revalidate, .d_weak_revalidate = ovl_dentry_weak_revalidate, @@ -404,7 +407,8 @@ static struct ovl_entry *ovl_alloc_entry(unsigned int numlower) static bool ovl_dentry_remote(struct dentry *dentry) { return dentry->d_flags & - (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE); + (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE | + DCACHE_OP_REAL); } static bool ovl_dentry_weird(struct dentry *dentry) @@ -415,12 +419,16 @@ static bool ovl_dentry_weird(struct dentry *dentry) DCACHE_OP_COMPARE); } -static inline struct dentry *ovl_lookup_real(struct dentry *dir, - struct qstr *name) +static inline struct dentry *ovl_lookup_real(struct super_block *ovl_sb, + struct dentry *dir, + const struct qstr *name) { + const struct cred *old_cred; struct dentry *dentry; - dentry = lookup_hash(name, dir); + old_cred = ovl_override_creds(ovl_sb); + dentry = lookup_one_len_unlocked(name->name, dir, name->len); + revert_creds(old_cred); if (IS_ERR(dentry)) { if (PTR_ERR(dentry) == -ENOENT) @@ -473,7 +481,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, upperdir = ovl_upperdentry_dereference(poe); if (upperdir) { - this = ovl_lookup_real(upperdir, &dentry->d_name); + this = ovl_lookup_real(dentry->d_sb, upperdir, &dentry->d_name); err = PTR_ERR(this); if (IS_ERR(this)) goto out; @@ -506,7 +514,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, bool opaque = false; struct path lowerpath = poe->lowerstack[i]; - this = ovl_lookup_real(lowerpath.dentry, &dentry->d_name); + this = ovl_lookup_real(dentry->d_sb, + lowerpath.dentry, &dentry->d_name); err = PTR_ERR(this); if (IS_ERR(this)) { /* @@ -561,12 +570,19 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (upperdentry || ctr) { struct dentry *realdentry; + struct inode *realinode; realdentry = upperdentry ? upperdentry : stack[0].dentry; + realinode = d_inode(realdentry); err = -ENOMEM; - inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode, - oe); + if (upperdentry && !d_is_dir(upperdentry)) { + inode = ovl_get_inode(dentry->d_sb, realinode); + } else { + inode = ovl_new_inode(dentry->d_sb, realinode->i_mode); + if (inode) + ovl_inode_init(inode, realinode, !!upperdentry); + } if (!inode) goto out_free_oe; ovl_copyattr(realdentry->d_inode, inode); @@ -595,7 +611,7 @@ out: struct file *ovl_path_open(struct path *path, int flags) { - return dentry_open(path, flags, current_cred()); + return dentry_open(path, flags | O_NOATIME, current_cred()); } static void ovl_put_super(struct super_block *sb) @@ -678,6 +694,7 @@ static const struct super_operations ovl_super_operations = { .statfs = ovl_statfs, .show_options = ovl_show_options, .remount_fs = ovl_remount, + .drop_inode = generic_delete_inode, }; enum { @@ -950,11 +967,102 @@ static unsigned int ovl_split_lowerdirs(char *str) return ctr; } +static int ovl_posix_acl_xattr_set(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + struct dentry *workdir = ovl_workdir(dentry); + struct inode *realinode = ovl_inode_real(inode, NULL); + struct posix_acl *acl = NULL; + int err; + + /* Check that everything is OK before copy-up */ + if (value) { + acl = posix_acl_from_xattr(&init_user_ns, value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + err = -EOPNOTSUPP; + if (!IS_POSIXACL(d_inode(workdir))) + goto out_acl_release; + if (!realinode->i_op->set_acl) + goto out_acl_release; + if (handler->flags == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) { + err = acl ? -EACCES : 0; + goto out_acl_release; + } + err = -EPERM; + if (!inode_owner_or_capable(inode)) + goto out_acl_release; + + posix_acl_release(acl); + + return ovl_setxattr(dentry, inode, handler->name, value, size, flags); + +out_acl_release: + posix_acl_release(acl); + return err; +} + +static int ovl_other_xattr_set(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + return ovl_setxattr(dentry, inode, name, value, size, flags); +} + +static int ovl_own_xattr_set(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + return -EPERM; +} + +static const struct xattr_handler ovl_posix_acl_access_xattr_handler = { + .name = XATTR_NAME_POSIX_ACL_ACCESS, + .flags = ACL_TYPE_ACCESS, + .set = ovl_posix_acl_xattr_set, +}; + +static const struct xattr_handler ovl_posix_acl_default_xattr_handler = { + .name = XATTR_NAME_POSIX_ACL_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .set = ovl_posix_acl_xattr_set, +}; + +static const struct xattr_handler ovl_own_xattr_handler = { + .prefix = OVL_XATTR_PREFIX, + .set = ovl_own_xattr_set, +}; + +static const struct xattr_handler ovl_other_xattr_handler = { + .prefix = "", /* catch all */ + .set = ovl_other_xattr_set, +}; + +static const struct xattr_handler *ovl_xattr_handlers[] = { + &ovl_posix_acl_access_xattr_handler, + &ovl_posix_acl_default_xattr_handler, + &ovl_own_xattr_handler, + &ovl_other_xattr_handler, + NULL +}; + +static const struct xattr_handler *ovl_xattr_noacl_handlers[] = { + &ovl_own_xattr_handler, + &ovl_other_xattr_handler, + NULL, +}; + static int ovl_fill_super(struct super_block *sb, void *data, int silent) { struct path upperpath = { NULL, NULL }; struct path workpath = { NULL, NULL }; struct dentry *root_dentry; + struct inode *realinode; struct ovl_entry *oe; struct ovl_fs *ufs; struct path *stack = NULL; @@ -1061,6 +1169,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) pr_err("overlayfs: failed to clone upperpath\n"); goto out_put_lowerpath; } + /* Don't inherit atime flags */ + ufs->upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME); + + sb->s_time_gran = ufs->upper_mnt->mnt_sb->s_time_gran; ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry); err = PTR_ERR(ufs->workdir); @@ -1108,7 +1220,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) * Make lower_mnt R/O. That way fchmod/fchown on lower file * will fail instead of modifying lower fs. */ - mnt->mnt_flags |= MNT_READONLY; + mnt->mnt_flags |= MNT_READONLY | MNT_NOATIME; ufs->lower_mnt[ufs->numlower] = mnt; ufs->numlower++; @@ -1132,7 +1244,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (!oe) goto out_put_cred; - root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, oe)); + root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR)); if (!root_dentry) goto out_free_oe; @@ -1151,13 +1263,19 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) root_dentry->d_fsdata = oe; - ovl_copyattr(ovl_dentry_real(root_dentry)->d_inode, - root_dentry->d_inode); + realinode = d_inode(ovl_dentry_real(root_dentry)); + ovl_inode_init(d_inode(root_dentry), realinode, !!upperpath.dentry); + ovl_copyattr(realinode, d_inode(root_dentry)); sb->s_magic = OVERLAYFS_SUPER_MAGIC; sb->s_op = &ovl_super_operations; + if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) + sb->s_xattr = ovl_xattr_handlers; + else + sb->s_xattr = ovl_xattr_noacl_handlers; sb->s_root = root_dentry; sb->s_fs_info = ufs; + sb->s_flags |= MS_POSIXACL; return 0; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index edc452c2a563..59d47ab0791a 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -205,7 +205,7 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags) * Check if an acl is valid. Returns 0 if it is, or -E... otherwise. */ int -posix_acl_valid(const struct posix_acl *acl) +posix_acl_valid(struct user_namespace *user_ns, const struct posix_acl *acl) { const struct posix_acl_entry *pa, *pe; int state = ACL_USER_OBJ; @@ -225,7 +225,7 @@ posix_acl_valid(const struct posix_acl *acl) case ACL_USER: if (state != ACL_USER) return -EINVAL; - if (!uid_valid(pa->e_uid)) + if (!kuid_has_mapping(user_ns, pa->e_uid)) return -EINVAL; needs_mask = 1; break; @@ -240,7 +240,7 @@ posix_acl_valid(const struct posix_acl *acl) case ACL_GROUP: if (state != ACL_GROUP) return -EINVAL; - if (!gid_valid(pa->e_gid)) + if (!kgid_has_mapping(user_ns, pa->e_gid)) return -EINVAL; needs_mask = 1; break; @@ -834,7 +834,7 @@ set_posix_acl(struct inode *inode, int type, struct posix_acl *acl) return -EPERM; if (acl) { - int ret = posix_acl_valid(acl); + int ret = posix_acl_valid(inode->i_sb->s_user_ns, acl); if (ret) return ret; } diff --git a/fs/proc/Makefile b/fs/proc/Makefile index 7151ea428041..12c6922c913c 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -4,6 +4,7 @@ obj-y += proc.o +CFLAGS_task_mmu.o += $(call cc-option,-Wno-override-init,) proc-y := nommu.o task_nommu.o proc-$(CONFIG_MMU) := task_mmu.o diff --git a/fs/proc/base.c b/fs/proc/base.c index a11eb7196ec8..54e270262979 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -579,11 +579,8 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, unsigned long totalpages = totalram_pages + total_swap_pages; unsigned long points = 0; - read_lock(&tasklist_lock); - if (pid_alive(task)) - points = oom_badness(task, NULL, NULL, totalpages) * - 1000 / totalpages; - read_unlock(&tasklist_lock); + points = oom_badness(task, NULL, NULL, totalpages) * + 1000 / totalpages; seq_printf(m, "%lu\n", points); return 0; @@ -1024,23 +1021,107 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, char buffer[PROC_NUMBUF]; int oom_adj = OOM_ADJUST_MIN; size_t len; - unsigned long flags; if (!task) return -ESRCH; - if (lock_task_sighand(task, &flags)) { - if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX) - oom_adj = OOM_ADJUST_MAX; - else - oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) / - OOM_SCORE_ADJ_MAX; - unlock_task_sighand(task, &flags); - } + if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX) + oom_adj = OOM_ADJUST_MAX; + else + oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) / + OOM_SCORE_ADJ_MAX; put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj); return simple_read_from_buffer(buf, count, ppos, buffer, len); } +static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) +{ + static DEFINE_MUTEX(oom_adj_mutex); + struct mm_struct *mm = NULL; + struct task_struct *task; + int err = 0; + + task = get_proc_task(file_inode(file)); + if (!task) + return -ESRCH; + + mutex_lock(&oom_adj_mutex); + if (legacy) { + if (oom_adj < task->signal->oom_score_adj && + !capable(CAP_SYS_RESOURCE)) { + err = -EACCES; + goto err_unlock; + } + /* + * /proc/pid/oom_adj is provided for legacy purposes, ask users to use + * /proc/pid/oom_score_adj instead. + */ + pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", + current->comm, task_pid_nr(current), task_pid_nr(task), + task_pid_nr(task)); + } else { + if ((short)oom_adj < task->signal->oom_score_adj_min && + !capable(CAP_SYS_RESOURCE)) { + err = -EACCES; + goto err_unlock; + } + } + + /* + * Make sure we will check other processes sharing the mm if this is + * not vfrok which wants its own oom_score_adj. + * pin the mm so it doesn't go away and get reused after task_unlock + */ + if (!task->vfork_done) { + struct task_struct *p = find_lock_task_mm(task); + + if (p) { + if (atomic_read(&p->mm->mm_users) > 1) { + mm = p->mm; + atomic_inc(&mm->mm_count); + } + task_unlock(p); + } + } + + task->signal->oom_score_adj = oom_adj; + if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE)) + task->signal->oom_score_adj_min = (short)oom_adj; + trace_oom_score_adj_update(task); + + if (mm) { + struct task_struct *p; + + rcu_read_lock(); + for_each_process(p) { + if (same_thread_group(task, p)) + continue; + + /* do not touch kernel threads or the global init */ + if (p->flags & PF_KTHREAD || is_global_init(p)) + continue; + + task_lock(p); + if (!p->vfork_done && process_shares_mm(p, mm)) { + pr_info("updating oom_score_adj for %d (%s) from %d to %d because it shares mm with %d (%s). Report if this is unexpected.\n", + task_pid_nr(p), p->comm, + p->signal->oom_score_adj, oom_adj, + task_pid_nr(task), task->comm); + p->signal->oom_score_adj = oom_adj; + if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE)) + p->signal->oom_score_adj_min = (short)oom_adj; + } + task_unlock(p); + } + rcu_read_unlock(); + mmdrop(mm); + } +err_unlock: + mutex_unlock(&oom_adj_mutex); + put_task_struct(task); + return err; +} + /* * /proc/pid/oom_adj exists solely for backwards compatibility with previous * kernels. The effective policy is defined by oom_score_adj, which has a @@ -1054,10 +1135,8 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, static ssize_t oom_adj_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task; char buffer[PROC_NUMBUF]; int oom_adj; - unsigned long flags; int err; memset(buffer, 0, sizeof(buffer)); @@ -1077,23 +1156,6 @@ static ssize_t oom_adj_write(struct file *file, const char __user *buf, goto out; } - task = get_proc_task(file_inode(file)); - if (!task) { - err = -ESRCH; - goto out; - } - - task_lock(task); - if (!task->mm) { - err = -EINVAL; - goto err_task_lock; - } - - if (!lock_task_sighand(task, &flags)) { - err = -ESRCH; - goto err_task_lock; - } - /* * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum * value is always attainable. @@ -1103,27 +1165,7 @@ static ssize_t oom_adj_write(struct file *file, const char __user *buf, else oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; - if (oom_adj < task->signal->oom_score_adj && - !capable(CAP_SYS_RESOURCE)) { - err = -EACCES; - goto err_sighand; - } - - /* - * /proc/pid/oom_adj is provided for legacy purposes, ask users to use - * /proc/pid/oom_score_adj instead. - */ - pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", - current->comm, task_pid_nr(current), task_pid_nr(task), - task_pid_nr(task)); - - task->signal->oom_score_adj = oom_adj; - trace_oom_score_adj_update(task); -err_sighand: - unlock_task_sighand(task, &flags); -err_task_lock: - task_unlock(task); - put_task_struct(task); + err = __set_oom_adj(file, oom_adj, true); out: return err < 0 ? err : count; } @@ -1140,15 +1182,11 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf, struct task_struct *task = get_proc_task(file_inode(file)); char buffer[PROC_NUMBUF]; short oom_score_adj = OOM_SCORE_ADJ_MIN; - unsigned long flags; size_t len; if (!task) return -ESRCH; - if (lock_task_sighand(task, &flags)) { - oom_score_adj = task->signal->oom_score_adj; - unlock_task_sighand(task, &flags); - } + oom_score_adj = task->signal->oom_score_adj; put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj); return simple_read_from_buffer(buf, count, ppos, buffer, len); @@ -1157,9 +1195,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf, static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task; char buffer[PROC_NUMBUF]; - unsigned long flags; int oom_score_adj; int err; @@ -1180,39 +1216,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, goto out; } - task = get_proc_task(file_inode(file)); - if (!task) { - err = -ESRCH; - goto out; - } - - task_lock(task); - if (!task->mm) { - err = -EINVAL; - goto err_task_lock; - } - - if (!lock_task_sighand(task, &flags)) { - err = -ESRCH; - goto err_task_lock; - } - - if ((short)oom_score_adj < task->signal->oom_score_adj_min && - !capable(CAP_SYS_RESOURCE)) { - err = -EACCES; - goto err_sighand; - } - - task->signal->oom_score_adj = (short)oom_score_adj; - if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) - task->signal->oom_score_adj_min = (short)oom_score_adj; - trace_oom_score_adj_update(task); - -err_sighand: - unlock_task_sighand(task, &flags); -err_task_lock: - task_unlock(task); - put_task_struct(task); + err = __set_oom_adj(file, oom_score_adj, false); out: return err < 0 ? err : count; } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 42305ddcbaa0..c1b72388e571 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -457,17 +457,30 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) return inode; } -int proc_fill_super(struct super_block *s) +int proc_fill_super(struct super_block *s, void *data, int silent) { + struct pid_namespace *ns = get_pid_ns(s->s_fs_info); struct inode *root_inode; int ret; + if (!proc_parse_options(data, ns)) + return -EINVAL; + + /* User space would break if executables or devices appear on proc */ + s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = PROC_SUPER_MAGIC; s->s_op = &proc_sops; s->s_time_gran = 1; + + /* + * procfs isn't actually a stacking filesystem; however, there is + * too much magic going on inside it to permit stacking things on + * top of it + */ + s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; pde_get(&proc_root); root_inode = proc_get_inode(s, &proc_root); diff --git a/fs/proc/internal.h b/fs/proc/internal.h index aa2781095bd1..7931c558c192 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -212,7 +212,7 @@ extern const struct inode_operations proc_pid_link_inode_operations; extern void proc_init_inodecache(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); -extern int proc_fill_super(struct super_block *); +extern int proc_fill_super(struct super_block *, void *data, int flags); extern void proc_entry_rundown(struct proc_dir_entry *); /* @@ -268,6 +268,7 @@ static inline void proc_tty_init(void) {} * root.c */ extern struct proc_dir_entry proc_root; +extern int proc_parse_options(char *options, struct pid_namespace *pid); extern void proc_self_init(void); extern int proc_remount(struct super_block *, int *, char *); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index cf301a9ef512..09e18fdf61e5 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -40,7 +40,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) si_swapinfo(&i); committed = percpu_counter_read_positive(&vm_committed_as); - cached = global_page_state(NR_FILE_PAGES) - + cached = global_node_page_state(NR_FILE_PAGES) - total_swapcache_pages() - i.bufferram; if (cached < 0) cached = 0; @@ -138,23 +138,23 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #endif K(i.totalswap), K(i.freeswap), - K(global_page_state(NR_FILE_DIRTY)), - K(global_page_state(NR_WRITEBACK)), - K(global_page_state(NR_ANON_PAGES)), - K(global_page_state(NR_FILE_MAPPED)), + K(global_node_page_state(NR_FILE_DIRTY)), + K(global_node_page_state(NR_WRITEBACK)), + K(global_node_page_state(NR_ANON_MAPPED)), + K(global_node_page_state(NR_FILE_MAPPED)), K(i.sharedram), K(global_page_state(NR_SLAB_RECLAIMABLE) + global_page_state(NR_SLAB_UNRECLAIMABLE)), K(global_page_state(NR_SLAB_RECLAIMABLE)), K(global_page_state(NR_SLAB_UNRECLAIMABLE)), - global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024, + global_page_state(NR_KERNEL_STACK_KB), K(global_page_state(NR_PAGETABLE)), #ifdef CONFIG_QUICKLIST K(quicklist_total_size()), #endif - K(global_page_state(NR_UNSTABLE_NFS)), + K(global_node_page_state(NR_UNSTABLE_NFS)), K(global_page_state(NR_BOUNCE)), - K(global_page_state(NR_WRITEBACK_TEMP)), + K(global_node_page_state(NR_WRITEBACK_TEMP)), K(vm_commit_limit()), K(committed), (unsigned long)VMALLOC_TOTAL >> 10, @@ -164,9 +164,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10) #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE - , K(global_page_state(NR_ANON_THPS) * HPAGE_PMD_NR) - , K(global_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR) - , K(global_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR) + , K(global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR) + , K(global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR) + , K(global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR) #endif #ifdef CONFIG_CMA , K(totalcma_pages) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 5e57c3e46e1d..b59db94d2ff4 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -623,7 +623,7 @@ static bool proc_sys_fill_cache(struct file *file, qname.name = table->procname; qname.len = strlen(table->procname); - qname.hash = full_name_hash(qname.name, qname.len); + qname.hash = full_name_hash(dir, qname.name, qname.len); child = d_lookup(dir, &qname); if (!child) { diff --git a/fs/proc/root.c b/fs/proc/root.c index 06702783bf40..8d3e484055a6 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -23,21 +23,6 @@ #include "internal.h" -static int proc_test_super(struct super_block *sb, void *data) -{ - return sb->s_fs_info == data; -} - -static int proc_set_super(struct super_block *sb, void *data) -{ - int err = set_anon_super(sb, NULL); - if (!err) { - struct pid_namespace *ns = (struct pid_namespace *)data; - sb->s_fs_info = get_pid_ns(ns); - } - return err; -} - enum { Opt_gid, Opt_hidepid, Opt_err, }; @@ -48,7 +33,7 @@ static const match_table_t tokens = { {Opt_err, NULL}, }; -static int proc_parse_options(char *options, struct pid_namespace *pid) +int proc_parse_options(char *options, struct pid_namespace *pid) { char *p; substring_t args[MAX_OPT_ARGS]; @@ -100,52 +85,16 @@ int proc_remount(struct super_block *sb, int *flags, char *data) static struct dentry *proc_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - int err; - struct super_block *sb; struct pid_namespace *ns; - char *options; if (flags & MS_KERNMOUNT) { - ns = (struct pid_namespace *)data; - options = NULL; + ns = data; + data = NULL; } else { ns = task_active_pid_ns(current); - options = data; - - /* Does the mounter have privilege over the pid namespace? */ - if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - } - - sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns); - if (IS_ERR(sb)) - return ERR_CAST(sb); - - /* - * procfs isn't actually a stacking filesystem; however, there is - * too much magic going on inside it to permit stacking things on - * top of it - */ - sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; - - if (!proc_parse_options(options, ns)) { - deactivate_locked_super(sb); - return ERR_PTR(-EINVAL); - } - - if (!sb->s_root) { - err = proc_fill_super(sb); - if (err) { - deactivate_locked_super(sb); - return ERR_PTR(err); - } - - sb->s_flags |= MS_ACTIVE; - /* User space would break if executables appear on proc */ - sb->s_iflags |= SB_I_NOEXEC; } - return dget(sb->s_root); + return mount_ns(fs_type, flags, data, ns, ns->user_ns, proc_fill_super); } static void proc_kill_sb(struct super_block *sb) @@ -165,7 +114,7 @@ static struct file_system_type proc_fs_type = { .name = "proc", .mount = proc_mount, .kill_sb = proc_kill_sb, - .fs_flags = FS_USERNS_VISIBLE | FS_USERNS_MOUNT, + .fs_flags = FS_USERNS_MOUNT, }; void __init proc_root_init(void) diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 510413eb25b8..7907e456ac4f 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -80,19 +80,17 @@ static u64 get_iowait_time(int cpu) static int show_stat(struct seq_file *p, void *v) { int i, j; - unsigned long jif; u64 user, nice, system, idle, iowait, irq, softirq, steal; u64 guest, guest_nice; u64 sum = 0; u64 sum_softirq = 0; unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; - struct timespec boottime; + struct timespec64 boottime; user = nice = system = idle = iowait = irq = softirq = steal = 0; guest = guest_nice = 0; - getboottime(&boottime); - jif = boottime.tv_sec; + getboottime64(&boottime); for_each_possible_cpu(i) { user += kcpustat_cpu(i).cpustat[CPUTIME_USER]; @@ -163,12 +161,12 @@ static int show_stat(struct seq_file *p, void *v) seq_printf(p, "\nctxt %llu\n" - "btime %lu\n" + "btime %llu\n" "processes %lu\n" "procs_running %lu\n" "procs_blocked %lu\n", nr_context_switches(), - (unsigned long)jif, + (unsigned long long)boottime.tv_sec, total_forks, nr_running(), nr_iowait()); diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 47516a794011..7a034d62cf8c 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -486,30 +486,21 @@ static int ramoops_parse_dt(struct platform_device *pdev, struct ramoops_platform_data *pdata) { struct device_node *of_node = pdev->dev.of_node; - struct device_node *mem_region; - struct resource res; + struct resource *res; u32 value; int ret; dev_dbg(&pdev->dev, "using Device Tree\n"); - mem_region = of_parse_phandle(of_node, "memory-region", 0); - if (!mem_region) { - dev_err(&pdev->dev, "no memory-region phandle\n"); - return -ENODEV; - } - - ret = of_address_to_resource(mem_region, 0, &res); - of_node_put(mem_region); - if (ret) { + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) { dev_err(&pdev->dev, - "failed to translate memory-region to resource: %d\n", - ret); - return ret; + "failed to locate DT /reserved-memory resource\n"); + return -EINVAL; } - pdata->mem_size = resource_size(&res); - pdata->mem_address = res.start; + pdata->mem_size = resource_size(res); + pdata->mem_address = res->start; pdata->mem_type = of_property_read_bool(of_node, "unbuffered"); pdata->dump_oops = !of_property_read_bool(of_node, "no-dump-oops"); @@ -652,11 +643,11 @@ fail_buf: kfree(cxt->pstore.buf); fail_clear: cxt->pstore.bufsize = 0; - kfree(cxt->mprz); + persistent_ram_free(cxt->mprz); fail_init_mprz: - kfree(cxt->fprz); + persistent_ram_free(cxt->fprz); fail_init_fprz: - kfree(cxt->cprz); + persistent_ram_free(cxt->cprz); fail_init_cprz: ramoops_free_przs(cxt); fail_out: diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index ff21980d0119..1bfac28b7e7d 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -841,6 +841,9 @@ struct dquot *dqget(struct super_block *sb, struct kqid qid) unsigned int hashent = hashfn(sb, qid); struct dquot *dquot, *empty = NULL; + if (!qid_has_mapping(sb->s_user_ns, qid)) + return ERR_PTR(-EINVAL); + if (!sb_has_quota_active(sb, qid.type)) return ERR_PTR(-ESRCH); we_slept: @@ -1133,7 +1136,7 @@ static void dquot_decr_inodes(struct dquot *dquot, qsize_t number) else dquot->dq_dqb.dqb_curinodes = 0; if (dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit) - dquot->dq_dqb.dqb_itime = (time_t) 0; + dquot->dq_dqb.dqb_itime = (time64_t) 0; clear_bit(DQ_INODES_B, &dquot->dq_flags); } @@ -1145,7 +1148,7 @@ static void dquot_decr_space(struct dquot *dquot, qsize_t number) else dquot->dq_dqb.dqb_curspace = 0; if (dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit) - dquot->dq_dqb.dqb_btime = (time_t) 0; + dquot->dq_dqb.dqb_btime = (time64_t) 0; clear_bit(DQ_BLKS_B, &dquot->dq_flags); } @@ -1292,7 +1295,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, if (dquot->dq_dqb.dqb_isoftlimit && newinodes > dquot->dq_dqb.dqb_isoftlimit && dquot->dq_dqb.dqb_itime && - get_seconds() >= dquot->dq_dqb.dqb_itime && + ktime_get_real_seconds() >= dquot->dq_dqb.dqb_itime && !ignore_hardlimit(dquot)) { prepare_warning(warn, dquot, QUOTA_NL_ISOFTLONGWARN); return -EDQUOT; @@ -1302,7 +1305,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, newinodes > dquot->dq_dqb.dqb_isoftlimit && dquot->dq_dqb.dqb_itime == 0) { prepare_warning(warn, dquot, QUOTA_NL_ISOFTWARN); - dquot->dq_dqb.dqb_itime = get_seconds() + + dquot->dq_dqb.dqb_itime = ktime_get_real_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type].dqi_igrace; } @@ -1334,7 +1337,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, if (dquot->dq_dqb.dqb_bsoftlimit && tspace > dquot->dq_dqb.dqb_bsoftlimit && dquot->dq_dqb.dqb_btime && - get_seconds() >= dquot->dq_dqb.dqb_btime && + ktime_get_real_seconds() >= dquot->dq_dqb.dqb_btime && !ignore_hardlimit(dquot)) { if (!prealloc) prepare_warning(warn, dquot, QUOTA_NL_BSOFTLONGWARN); @@ -1346,7 +1349,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, dquot->dq_dqb.dqb_btime == 0) { if (!prealloc) { prepare_warning(warn, dquot, QUOTA_NL_BSOFTWARN); - dquot->dq_dqb.dqb_btime = get_seconds() + + dquot->dq_dqb.dqb_btime = ktime_get_real_seconds() + sb_dqopt(sb)->info[dquot->dq_id.type].dqi_bgrace; } else @@ -2268,6 +2271,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, error = -EINVAL; goto out_fmt; } + /* Filesystems outside of init_user_ns not yet supported */ + if (sb->s_user_ns != &init_user_ns) { + error = -EINVAL; + goto out_fmt; + } /* Usage always has to be set... */ if (!(flags & DQUOT_USAGE_ENABLED)) { error = -EINVAL; @@ -2695,7 +2703,7 @@ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di) clear_bit(DQ_BLKS_B, &dquot->dq_flags); } else if (!(di->d_fieldmask & QC_SPC_TIMER)) /* Set grace only if user hasn't provided his own... */ - dm->dqb_btime = get_seconds() + dqi->dqi_bgrace; + dm->dqb_btime = ktime_get_real_seconds() + dqi->dqi_bgrace; } if (check_ilim) { if (!dm->dqb_isoftlimit || @@ -2704,7 +2712,7 @@ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di) clear_bit(DQ_INODES_B, &dquot->dq_flags); } else if (!(di->d_fieldmask & QC_INO_TIMER)) /* Set grace only if user hasn't provided his own... */ - dm->dqb_itime = get_seconds() + dqi->dqi_igrace; + dm->dqb_itime = ktime_get_real_seconds() + dqi->dqi_igrace; } if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit || dm->dqb_isoftlimit) diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 0f10ee9892ce..35df08ee9c97 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -211,7 +211,7 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->get_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; ret = sb->s_qcop->get_dqblk(sb, qid, &fdq); if (ret) @@ -237,7 +237,7 @@ static int quota_getnextquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->get_nextdqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; ret = sb->s_qcop->get_nextdqblk(sb, &qid, &fdq); if (ret) @@ -288,7 +288,7 @@ static int quota_setquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->set_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; copy_from_if_dqblk(&fdq, &idq); return sb->s_qcop->set_dqblk(sb, qid, &fdq); @@ -581,10 +581,10 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->set_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; /* Are we actually setting timer / warning limits for all users? */ - if (from_kqid(&init_user_ns, qid) == 0 && + if (from_kqid(sb->s_user_ns, qid) == 0 && fdq.d_fieldmask & (FS_DQ_WARNS_MASK | FS_DQ_TIMER_MASK)) { struct qc_info qinfo; int ret; @@ -642,7 +642,7 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->get_dqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; ret = sb->s_qcop->get_dqblk(sb, qid, &qdq); if (ret) @@ -669,7 +669,7 @@ static int quota_getnextxquota(struct super_block *sb, int type, qid_t id, if (!sb->s_qcop->get_nextdqblk) return -ENOSYS; qid = make_kqid(current_user_ns(), type, id); - if (!qid_valid(qid)) + if (!qid_has_mapping(sb->s_user_ns, qid)) return -EINVAL; ret = sb->s_qcop->get_nextdqblk(sb, &qid, &qdq); if (ret) diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c index b751eea32e20..5db6f45b3fed 100644 --- a/fs/reiserfs/ibalance.c +++ b/fs/reiserfs/ibalance.c @@ -1153,8 +1153,9 @@ int balance_internal(struct tree_balance *tb, insert_ptr); } - memcpy(new_insert_key_addr, &new_insert_key, KEY_SIZE); insert_ptr[0] = new_insert_ptr; + if (new_insert_ptr) + memcpy(new_insert_key_addr, &new_insert_key, KEY_SIZE); return order; } diff --git a/fs/super.c b/fs/super.c index 5806ffd45563..c2ff475c1711 100644 --- a/fs/super.c +++ b/fs/super.c @@ -33,6 +33,7 @@ #include <linux/cleancache.h> #include <linux/fsnotify.h> #include <linux/lockdep.h> +#include <linux/user_namespace.h> #include "internal.h" @@ -165,6 +166,7 @@ static void destroy_super(struct super_block *s) list_lru_destroy(&s->s_inode_lru); security_sb_free(s); WARN_ON(!list_empty(&s->s_mounts)); + put_user_ns(s->s_user_ns); kfree(s->s_subtype); kfree(s->s_options); call_rcu(&s->rcu, destroy_super_rcu); @@ -174,11 +176,13 @@ static void destroy_super(struct super_block *s) * alloc_super - create new superblock * @type: filesystem type superblock should belong to * @flags: the mount flags + * @user_ns: User namespace for the super_block * * Allocates and initializes a new &struct super_block. alloc_super() * returns a pointer new superblock or %NULL if allocation had failed. */ -static struct super_block *alloc_super(struct file_system_type *type, int flags) +static struct super_block *alloc_super(struct file_system_type *type, int flags, + struct user_namespace *user_ns) { struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER); static const struct super_operations default_op; @@ -188,6 +192,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) return NULL; INIT_LIST_HEAD(&s->s_mounts); + s->s_user_ns = get_user_ns(user_ns); if (security_sb_alloc(s)) goto fail; @@ -201,6 +206,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) init_waitqueue_head(&s->s_writers.wait_unfrozen); s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; + if (s->s_user_ns != &init_user_ns) + s->s_iflags |= SB_I_NODEV; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_anon); mutex_init(&s->s_sync_lock); @@ -445,29 +452,42 @@ void generic_shutdown_super(struct super_block *sb) EXPORT_SYMBOL(generic_shutdown_super); /** - * sget - find or create a superblock + * sget_userns - find or create a superblock * @type: filesystem type superblock should belong to * @test: comparison callback * @set: setup callback * @flags: mount flags + * @user_ns: User namespace for the super_block * @data: argument to each of them */ -struct super_block *sget(struct file_system_type *type, +struct super_block *sget_userns(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), - int flags, + int flags, struct user_namespace *user_ns, void *data) { struct super_block *s = NULL; struct super_block *old; int err; + if (!(flags & MS_KERNMOUNT) && + !(type->fs_flags & FS_USERNS_MOUNT) && + !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); retry: spin_lock(&sb_lock); if (test) { hlist_for_each_entry(old, &type->fs_supers, s_instances) { if (!test(old, data)) continue; + if (user_ns != old->s_user_ns) { + spin_unlock(&sb_lock); + if (s) { + up_write(&s->s_umount); + destroy_super(s); + } + return ERR_PTR(-EBUSY); + } if (!grab_super(old)) goto retry; if (s) { @@ -480,7 +500,7 @@ retry: } if (!s) { spin_unlock(&sb_lock); - s = alloc_super(type, flags); + s = alloc_super(type, flags, user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; @@ -503,6 +523,31 @@ retry: return s; } +EXPORT_SYMBOL(sget_userns); + +/** + * sget - find or create a superblock + * @type: filesystem type superblock should belong to + * @test: comparison callback + * @set: setup callback + * @flags: mount flags + * @data: argument to each of them + */ +struct super_block *sget(struct file_system_type *type, + int (*test)(struct super_block *,void *), + int (*set)(struct super_block *,void *), + int flags, + void *data) +{ + struct user_namespace *user_ns = current_user_ns(); + + /* Ensure the requestor has permissions over the target filesystem */ + if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + return sget_userns(type, test, set, flags, user_ns, data); +} + EXPORT_SYMBOL(sget); void drop_super(struct super_block *sb) @@ -920,12 +965,20 @@ static int ns_set_super(struct super_block *sb, void *data) return set_anon_super(sb, NULL); } -struct dentry *mount_ns(struct file_system_type *fs_type, int flags, - void *data, int (*fill_super)(struct super_block *, void *, int)) +struct dentry *mount_ns(struct file_system_type *fs_type, + int flags, void *data, void *ns, struct user_namespace *user_ns, + int (*fill_super)(struct super_block *, void *, int)) { struct super_block *sb; - sb = sget(fs_type, ns_test_super, ns_set_super, flags, data); + /* Don't allow mounting unless the caller has CAP_SYS_ADMIN + * over the namespace. + */ + if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + sb = sget_userns(fs_type, ns_test_super, ns_set_super, flags, + user_ns, ns); if (IS_ERR(sb)) return ERR_CAST(sb); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index f3db82071cfb..20b8f82e115b 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -41,8 +41,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, if (IS_ERR(root) || !new_sb) kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); else if (new_sb) - /* Userspace would break if executables appear on sysfs */ - root->d_sb->s_iflags |= SB_I_NOEXEC; + root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE; return root; } @@ -59,7 +58,7 @@ static struct file_system_type sysfs_fs_type = { .name = "sysfs", .mount = sysfs_mount, .kill_sb = sysfs_kill_sb, - .fs_flags = FS_USERNS_VISIBLE | FS_USERNS_MOUNT, + .fs_flags = FS_USERNS_MOUNT, }; int __init sysfs_init(void) diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index 90b60c03b588..a42de45ce40d 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -33,7 +33,7 @@ static int sysv_hash(const struct dentry *dentry, struct qstr *qstr) function. */ if (qstr->len > SYSV_NAMELEN) { qstr->len = SYSV_NAMELEN; - qstr->hash = full_name_hash(qstr->name, qstr->len); + qstr->hash = full_name_hash(dentry, qstr->name, qstr->len); } return 0; } diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 4a0e48f92104..ad40b64c5e2f 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -541,9 +541,6 @@ void tracefs_remove(struct dentry *dentry) return; parent = dentry->d_parent; - if (!parent || !parent->d_inode) - return; - inode_lock(parent->d_inode); ret = __tracefs_remove(dentry, parent); inode_unlock(parent->d_inode); @@ -566,10 +563,6 @@ void tracefs_remove_recursive(struct dentry *dentry) if (IS_ERR_OR_NULL(dentry)) return; - parent = dentry->d_parent; - if (!parent || !parent->d_inode) - return; - parent = dentry; down: inode_lock(parent->d_inode); diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 9718da86ad01..821b34816976 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -100,10 +100,6 @@ static int switch_gc_head(struct ubifs_info *c) if (err) return err; - err = ubifs_wbuf_sync_nolock(wbuf); - if (err) - return err; - err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0); if (err) return err; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 70349954e78b..4ec051089186 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -520,19 +520,19 @@ static int init_constants_early(struct ubifs_info *c) c->max_write_shift = fls(c->max_write_size) - 1; if (c->leb_size < UBIFS_MIN_LEB_SZ) { - ubifs_err(c, "too small LEBs (%d bytes), min. is %d bytes", - c->leb_size, UBIFS_MIN_LEB_SZ); + ubifs_errc(c, "too small LEBs (%d bytes), min. is %d bytes", + c->leb_size, UBIFS_MIN_LEB_SZ); return -EINVAL; } if (c->leb_cnt < UBIFS_MIN_LEB_CNT) { - ubifs_err(c, "too few LEBs (%d), min. is %d", - c->leb_cnt, UBIFS_MIN_LEB_CNT); + ubifs_errc(c, "too few LEBs (%d), min. is %d", + c->leb_cnt, UBIFS_MIN_LEB_CNT); return -EINVAL; } if (!is_power_of_2(c->min_io_size)) { - ubifs_err(c, "bad min. I/O size %d", c->min_io_size); + ubifs_errc(c, "bad min. I/O size %d", c->min_io_size); return -EINVAL; } @@ -543,8 +543,8 @@ static int init_constants_early(struct ubifs_info *c) if (c->max_write_size < c->min_io_size || c->max_write_size % c->min_io_size || !is_power_of_2(c->max_write_size)) { - ubifs_err(c, "bad write buffer size %d for %d min. I/O unit", - c->max_write_size, c->min_io_size); + ubifs_errc(c, "bad write buffer size %d for %d min. I/O unit", + c->max_write_size, c->min_io_size); return -EINVAL; } @@ -2108,8 +2108,9 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, */ ubi = open_ubi(name, UBI_READONLY); if (IS_ERR(ubi)) { - pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d", - current->pid, name, (int)PTR_ERR(ubi)); + if (!(flags & MS_SILENT)) + pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d", + current->pid, name, (int)PTR_ERR(ubi)); return ERR_CAST(ubi); } diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index ddf9f6b9eee2..4617d459022a 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1783,8 +1783,8 @@ void ubifs_err(const struct ubifs_info *c, const char *fmt, ...); __printf(2, 3) void ubifs_warn(const struct ubifs_info *c, const char *fmt, ...); /* - * A variant of 'ubifs_err()' which takes the UBIFS file-sytem description - * object as an argument. + * A conditional variant of 'ubifs_err()' which doesn't output anything + * if probing (ie. MS_SILENT set). */ #define ubifs_errc(c, fmt, ...) \ do { \ diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index b5fc27969e9d..e237811f09ce 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -592,19 +592,19 @@ static int ubifs_xattr_set(const struct xattr_handler *handler, return __ubifs_removexattr(inode, name); } -const struct xattr_handler ubifs_user_xattr_handler = { +static const struct xattr_handler ubifs_user_xattr_handler = { .prefix = XATTR_USER_PREFIX, .get = ubifs_xattr_get, .set = ubifs_xattr_set, }; -const struct xattr_handler ubifs_trusted_xattr_handler = { +static const struct xattr_handler ubifs_trusted_xattr_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = ubifs_xattr_get, .set = ubifs_xattr_set, }; -const struct xattr_handler ubifs_security_xattr_handler = { +static const struct xattr_handler ubifs_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = ubifs_xattr_get, .set = ubifs_xattr_set, diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 57dcceda17d6..fa3bda1a860f 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -279,12 +279,6 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr, de = (struct ufs_dir_entry *) kaddr; kaddr += ufs_last_byte(dir, n) - reclen; while ((char *) de <= kaddr) { - if (de->d_reclen == 0) { - ufs_error(dir->i_sb, __func__, - "zero-length directory entry"); - ufs_put_page(page); - goto out; - } if (ufs_match(sb, namelen, name, de)) goto found; de = ufs_next_entry(sb, de); @@ -414,11 +408,8 @@ ufs_validate_entry(struct super_block *sb, char *base, { struct ufs_dir_entry *de = (struct ufs_dir_entry*)(base + offset); struct ufs_dir_entry *p = (struct ufs_dir_entry*)(base + (offset&mask)); - while ((char*)p < (char*)de) { - if (p->d_reclen == 0) - break; + while ((char*)p < (char*)de) p = ufs_next_entry(sb, p); - } return (char *)p - base; } @@ -469,12 +460,6 @@ ufs_readdir(struct file *file, struct dir_context *ctx) de = (struct ufs_dir_entry *)(kaddr+offset); limit = kaddr + ufs_last_byte(inode, n) - UFS_DIR_REC_LEN(1); for ( ;(char*)de <= limit; de = ufs_next_entry(sb, de)) { - if (de->d_reclen == 0) { - ufs_error(sb, __func__, - "zero-length directory entry"); - ufs_put_page(page); - return -EIO; - } if (de->d_ino) { unsigned char d_type = DT_UNKNOWN; diff --git a/fs/xattr.c b/fs/xattr.c index 4beafc43daa5..c243905835ab 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -38,6 +38,13 @@ xattr_permission(struct inode *inode, const char *name, int mask) if (mask & MAY_WRITE) { if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; + /* + * Updating an xattr will likely cause i_uid and i_gid + * to be writen back improperly if their true value is + * unknown to the vfs. + */ + if (HAS_UNMAPPED_ID(inode)) + return -EPERM; } /* diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 3542d94fddce..52c288514be1 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -121,5 +121,4 @@ xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o -xfs-$(CONFIG_NFSD_BLOCKLAYOUT) += xfs_pnfs.o -xfs-$(CONFIG_NFSD_SCSILAYOUT) += xfs_pnfs.o +xfs-$(CONFIG_EXPORTFS_BLOCK_OPS) += xfs_pnfs.o diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index a1b2dd828b9d..fe1bfee35898 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -246,7 +246,7 @@ const struct export_operations xfs_export_operations = { .fh_to_parent = xfs_fs_fh_to_parent, .get_parent = xfs_fs_get_parent, .commit_metadata = xfs_fs_nfs_commit_metadata, -#ifdef CONFIG_NFSD_BLOCKLAYOUT +#ifdef CONFIG_EXPORTFS_BLOCK_OPS .get_uuid = xfs_fs_get_uuid, .map_blocks = xfs_fs_map_blocks, .commit_blocks = xfs_fs_commit_blocks, diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h index 93f74853961b..e8339f74966b 100644 --- a/fs/xfs/xfs_pnfs.h +++ b/fs/xfs/xfs_pnfs.h @@ -1,7 +1,7 @@ #ifndef _XFS_PNFS_H #define _XFS_PNFS_H 1 -#if defined(CONFIG_NFSD_BLOCKLAYOUT) || defined(CONFIG_NFSD_SCSILAYOUT) +#ifdef CONFIG_EXPORTFS_BLOCK_OPS int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset); int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length, struct iomap *iomap, bool write, u32 *device_generation); @@ -15,5 +15,5 @@ xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex) { return 0; } -#endif /* CONFIG_NFSD_PNFS */ +#endif /* CONFIG_EXPORTFS_BLOCK_OPS */ #endif /* _XFS_PNFS_H */ diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 8686df6c7609..d266e835ecc3 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -128,7 +128,6 @@ static int xqm_proc_open(struct inode *inode, struct file *file) } static const struct file_operations xqm_proc_fops = { - .owner = THIS_MODULE, .open = xqm_proc_open, .read = seq_read, .llseek = seq_lseek, |