diff options
Diffstat (limited to 'fs')
211 files changed, 7820 insertions, 3025 deletions
diff --git a/fs/Makefile b/fs/Makefile index 1148c555c4d3..98be354fdb61 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -37,7 +37,7 @@ obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FS_ENCRYPTION) += crypto/ obj-$(CONFIG_FS_VERITY) += verity/ obj-$(CONFIG_FILE_LOCKING) += locks.o -obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o +obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o diff --git a/fs/attr.c b/fs/attr.c index df28035aa23e..b4bbdbd4c8ca 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -183,18 +183,12 @@ void setattr_copy(struct inode *inode, const struct iattr *attr) inode->i_uid = attr->ia_uid; if (ia_valid & ATTR_GID) inode->i_gid = attr->ia_gid; - if (ia_valid & ATTR_ATIME) { - inode->i_atime = timestamp_truncate(attr->ia_atime, - inode); - } - if (ia_valid & ATTR_MTIME) { - inode->i_mtime = timestamp_truncate(attr->ia_mtime, - inode); - } - if (ia_valid & ATTR_CTIME) { - inode->i_ctime = timestamp_truncate(attr->ia_ctime, - inode); - } + if (ia_valid & ATTR_ATIME) + inode->i_atime = attr->ia_atime; + if (ia_valid & ATTR_MTIME) + inode->i_mtime = attr->ia_mtime; + if (ia_valid & ATTR_CTIME) + inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; @@ -268,8 +262,13 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de attr->ia_ctime = now; if (!(ia_valid & ATTR_ATIME_SET)) attr->ia_atime = now; + else + attr->ia_atime = timestamp_truncate(attr->ia_atime, inode); if (!(ia_valid & ATTR_MTIME_SET)) attr->ia_mtime = now; + else + attr->ia_mtime = timestamp_truncate(attr->ia_mtime, inode); + if (ia_valid & ATTR_KILL_PRIV) { error = security_inode_need_killpriv(dentry); if (error < 0) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index ecd8d2698515..f4713ea76e82 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -97,7 +97,7 @@ static struct linux_binfmt elf_format = { .min_coredump = ELF_EXEC_PAGESIZE, }; -#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) +#define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE)) static int set_brk(unsigned long start, unsigned long end, int prot) { @@ -161,9 +161,11 @@ static int padzero(unsigned long elf_bss) #endif static int -create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, - unsigned long load_addr, unsigned long interp_load_addr) +create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, + unsigned long load_addr, unsigned long interp_load_addr, + unsigned long e_entry) { + struct mm_struct *mm = current->mm; unsigned long p = bprm->p; int argc = bprm->argc; int envc = bprm->envc; @@ -176,7 +178,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, unsigned char k_rand_bytes[16]; int items; elf_addr_t *elf_info; - int ei_index = 0; + int ei_index; const struct cred *cred = current_cred(); struct vm_area_struct *vma; @@ -226,12 +228,12 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, return -EFAULT; /* Create the ELF interpreter info */ - elf_info = (elf_addr_t *)current->mm->saved_auxv; + elf_info = (elf_addr_t *)mm->saved_auxv; /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */ #define NEW_AUX_ENT(id, val) \ do { \ - elf_info[ei_index++] = id; \ - elf_info[ei_index++] = val; \ + *elf_info++ = id; \ + *elf_info++ = val; \ } while (0) #ifdef ARCH_DLINFO @@ -251,7 +253,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, NEW_AUX_ENT(AT_PHNUM, exec->e_phnum); NEW_AUX_ENT(AT_BASE, interp_load_addr); NEW_AUX_ENT(AT_FLAGS, 0); - NEW_AUX_ENT(AT_ENTRY, exec->e_entry); + NEW_AUX_ENT(AT_ENTRY, e_entry); NEW_AUX_ENT(AT_UID, from_kuid_munged(cred->user_ns, cred->uid)); NEW_AUX_ENT(AT_EUID, from_kuid_munged(cred->user_ns, cred->euid)); NEW_AUX_ENT(AT_GID, from_kgid_munged(cred->user_ns, cred->gid)); @@ -275,12 +277,13 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, } #undef NEW_AUX_ENT /* AT_NULL is zero; clear the rest too */ - memset(&elf_info[ei_index], 0, - sizeof current->mm->saved_auxv - ei_index * sizeof elf_info[0]); + memset(elf_info, 0, (char *)mm->saved_auxv + + sizeof(mm->saved_auxv) - (char *)elf_info); /* And advance past the AT_NULL entry. */ - ei_index += 2; + elf_info += 2; + ei_index = elf_info - (elf_addr_t *)mm->saved_auxv; sp = STACK_ADD(p, ei_index); items = (argc + 1) + (envc + 1) + 1; @@ -299,7 +302,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, * Grow the stack manually; some architectures have a limit on how * far ahead a user-space access may be in order to grow the stack. */ - vma = find_extend_vma(current->mm, bprm->p); + vma = find_extend_vma(mm, bprm->p); if (!vma) return -EFAULT; @@ -308,7 +311,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, return -EFAULT; /* Populate list of argv pointers back to argv strings. */ - p = current->mm->arg_end = current->mm->arg_start; + p = mm->arg_end = mm->arg_start; while (argc-- > 0) { size_t len; if (__put_user((elf_addr_t)p, sp++)) @@ -320,10 +323,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, } if (__put_user(0, sp++)) return -EFAULT; - current->mm->arg_end = p; + mm->arg_end = p; /* Populate list of envp pointers back to envp strings. */ - current->mm->env_end = current->mm->env_start = p; + mm->env_end = mm->env_start = p; while (envc-- > 0) { size_t len; if (__put_user((elf_addr_t)p, sp++)) @@ -335,10 +338,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, } if (__put_user(0, sp++)) return -EFAULT; - current->mm->env_end = p; + mm->env_end = p; /* Put the elf_info on the stack in the right place. */ - if (copy_to_user(sp, elf_info, ei_index * sizeof(elf_addr_t))) + if (copy_to_user(sp, mm->saved_auxv, ei_index * sizeof(elf_addr_t))) return -EFAULT; return 0; } @@ -689,15 +692,17 @@ static int load_elf_binary(struct linux_binprm *bprm) int bss_prot = 0; int retval, i; unsigned long elf_entry; + unsigned long e_entry; unsigned long interp_load_addr = 0; unsigned long start_code, end_code, start_data, end_data; unsigned long reloc_func_desc __maybe_unused = 0; int executable_stack = EXSTACK_DEFAULT; + struct elfhdr *elf_ex = (struct elfhdr *)bprm->buf; struct { - struct elfhdr elf_ex; struct elfhdr interp_elf_ex; } *loc; struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE; + struct mm_struct *mm; struct pt_regs *regs; loc = kmalloc(sizeof(*loc), GFP_KERNEL); @@ -705,30 +710,27 @@ static int load_elf_binary(struct linux_binprm *bprm) retval = -ENOMEM; goto out_ret; } - - /* Get the exec-header */ - loc->elf_ex = *((struct elfhdr *)bprm->buf); retval = -ENOEXEC; /* First of all, some simple consistency checks */ - if (memcmp(loc->elf_ex.e_ident, ELFMAG, SELFMAG) != 0) + if (memcmp(elf_ex->e_ident, ELFMAG, SELFMAG) != 0) goto out; - if (loc->elf_ex.e_type != ET_EXEC && loc->elf_ex.e_type != ET_DYN) + if (elf_ex->e_type != ET_EXEC && elf_ex->e_type != ET_DYN) goto out; - if (!elf_check_arch(&loc->elf_ex)) + if (!elf_check_arch(elf_ex)) goto out; - if (elf_check_fdpic(&loc->elf_ex)) + if (elf_check_fdpic(elf_ex)) goto out; if (!bprm->file->f_op->mmap) goto out; - elf_phdata = load_elf_phdrs(&loc->elf_ex, bprm->file); + elf_phdata = load_elf_phdrs(elf_ex, bprm->file); if (!elf_phdata) goto out; elf_ppnt = elf_phdata; - for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { + for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++) { char *elf_interpreter; if (elf_ppnt->p_type != PT_INTERP) @@ -782,7 +784,7 @@ out_free_interp: } elf_ppnt = elf_phdata; - for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) + for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++) switch (elf_ppnt->p_type) { case PT_GNU_STACK: if (elf_ppnt->p_flags & PF_X) @@ -792,7 +794,7 @@ out_free_interp: break; case PT_LOPROC ... PT_HIPROC: - retval = arch_elf_pt_proc(&loc->elf_ex, elf_ppnt, + retval = arch_elf_pt_proc(elf_ex, elf_ppnt, bprm->file, false, &arch_state); if (retval) @@ -836,7 +838,7 @@ out_free_interp: * still possible to return an error to the code that invoked * the exec syscall. */ - retval = arch_check_elf(&loc->elf_ex, + retval = arch_check_elf(elf_ex, !!interpreter, &loc->interp_elf_ex, &arch_state); if (retval) @@ -849,8 +851,8 @@ out_free_interp: /* Do this immediately, since STACK_TOP as used in setup_arg_pages may depend on the personality. */ - SET_PERSONALITY2(loc->elf_ex, &arch_state); - if (elf_read_implies_exec(loc->elf_ex, executable_stack)) + SET_PERSONALITY2(*elf_ex, &arch_state); + if (elf_read_implies_exec(*elf_ex, executable_stack)) current->personality |= READ_IMPLIES_EXEC; if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) @@ -877,7 +879,7 @@ out_free_interp: /* Now we do a little grungy work by mmapping the ELF image into the correct location in memory. */ for(i = 0, elf_ppnt = elf_phdata; - i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { + i < elf_ex->e_phnum; i++, elf_ppnt++) { int elf_prot, elf_flags; unsigned long k, vaddr; unsigned long total_size = 0; @@ -921,9 +923,9 @@ out_free_interp: * If we are loading ET_EXEC or we have already performed * the ET_DYN load_addr calculations, proceed normally. */ - if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) { + if (elf_ex->e_type == ET_EXEC || load_addr_set) { elf_flags |= MAP_FIXED; - } else if (loc->elf_ex.e_type == ET_DYN) { + } else if (elf_ex->e_type == ET_DYN) { /* * This logic is run once for the first LOAD Program * Header for ET_DYN binaries to calculate the @@ -972,7 +974,7 @@ out_free_interp: load_bias = ELF_PAGESTART(load_bias - vaddr); total_size = total_mapping_size(elf_phdata, - loc->elf_ex.e_phnum); + elf_ex->e_phnum); if (!total_size) { retval = -EINVAL; goto out_free_dentry; @@ -990,7 +992,7 @@ out_free_interp: if (!load_addr_set) { load_addr_set = 1; load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset); - if (loc->elf_ex.e_type == ET_DYN) { + if (elf_ex->e_type == ET_DYN) { load_bias += error - ELF_PAGESTART(load_bias + vaddr); load_addr += load_bias; @@ -998,7 +1000,7 @@ out_free_interp: } } k = elf_ppnt->p_vaddr; - if (k < start_code) + if ((elf_ppnt->p_flags & PF_X) && k < start_code) start_code = k; if (start_data < k) start_data = k; @@ -1031,7 +1033,7 @@ out_free_interp: } } - loc->elf_ex.e_entry += load_bias; + e_entry = elf_ex->e_entry + load_bias; elf_bss += load_bias; elf_brk += load_bias; start_code += load_bias; @@ -1074,7 +1076,7 @@ out_free_interp: allow_write_access(interpreter); fput(interpreter); } else { - elf_entry = loc->elf_ex.e_entry; + elf_entry = e_entry; if (BAD_ADDR(elf_entry)) { retval = -EINVAL; goto out_free_dentry; @@ -1092,15 +1094,17 @@ out_free_interp: goto out; #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ - retval = create_elf_tables(bprm, &loc->elf_ex, - load_addr, interp_load_addr); + retval = create_elf_tables(bprm, elf_ex, + load_addr, interp_load_addr, e_entry); if (retval < 0) goto out; - current->mm->end_code = end_code; - current->mm->start_code = start_code; - current->mm->start_data = start_data; - current->mm->end_data = end_data; - current->mm->start_stack = bprm->p; + + mm = current->mm; + mm->end_code = end_code; + mm->start_code = start_code; + mm->start_data = start_data; + mm->end_data = end_data; + mm->start_stack = bprm->p; if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) { /* @@ -1111,12 +1115,11 @@ out_free_interp: * growing down), and into the unused ELF_ET_DYN_BASE region. */ if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && - loc->elf_ex.e_type == ET_DYN && !interpreter) - current->mm->brk = current->mm->start_brk = - ELF_ET_DYN_BASE; + elf_ex->e_type == ET_DYN && !interpreter) { + mm->brk = mm->start_brk = ELF_ET_DYN_BASE; + } - current->mm->brk = current->mm->start_brk = - arch_randomize_brk(current->mm); + mm->brk = mm->start_brk = arch_randomize_brk(mm); #ifdef compat_brk_randomized current->brk_randomized = 1; #endif @@ -1574,6 +1577,7 @@ static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, */ static int fill_files_note(struct memelfnote *note) { + struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned count, size, names_ofs, remaining, n; user_long_t *data; @@ -1581,7 +1585,7 @@ static int fill_files_note(struct memelfnote *note) char *name_base, *name_curpos; /* *Estimated* file count and total data size needed */ - count = current->mm->map_count; + count = mm->map_count; if (count > UINT_MAX / 64) return -EINVAL; size = count * 64; @@ -1591,6 +1595,10 @@ static int fill_files_note(struct memelfnote *note) if (size >= MAX_FILE_NOTE_SIZE) /* paranoia check */ return -EINVAL; size = round_up(size, PAGE_SIZE); + /* + * "size" can be 0 here legitimately. + * Let it ENOMEM and omit NT_FILE section which will be empty anyway. + */ data = kvmalloc(size, GFP_KERNEL); if (ZERO_OR_NULL_PTR(data)) return -ENOMEM; @@ -1599,7 +1607,7 @@ static int fill_files_note(struct memelfnote *note) name_base = name_curpos = ((char *)data) + names_ofs; remaining = size - names_ofs; count = 0; - for (vma = current->mm->mmap; vma != NULL; vma = vma->vm_next) { + for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { struct file *file; const char *filename; @@ -1633,10 +1641,10 @@ static int fill_files_note(struct memelfnote *note) data[0] = count; data[1] = PAGE_SIZE; /* - * Count usually is less than current->mm->map_count, + * Count usually is less than mm->map_count, * we need to move filenames down. */ - n = current->mm->map_count - count; + n = mm->map_count - count; if (n != 0) { unsigned shift_bytes = n * 3 * sizeof(data[0]); memmove(name_base - shift_bytes, name_base, @@ -2182,7 +2190,7 @@ static int elf_core_dump(struct coredump_params *cprm) int segs, i; size_t vma_data_size = 0; struct vm_area_struct *vma, *gate_vma; - struct elfhdr *elf = NULL; + struct elfhdr elf; loff_t offset = 0, dataoff; struct elf_note_info info = { }; struct elf_phdr *phdr4note = NULL; @@ -2203,10 +2211,6 @@ static int elf_core_dump(struct coredump_params *cprm) * exists while dumping the mm->vm_next areas to the core file. */ - /* alloc memory for large data structures: too large to be on stack */ - elf = kmalloc(sizeof(*elf), GFP_KERNEL); - if (!elf) - goto out; /* * The number of segs are recored into ELF header as 16bit value. * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here. @@ -2230,7 +2234,7 @@ static int elf_core_dump(struct coredump_params *cprm) * Collect all the non-memory information about the process for the * notes. This also sets up the file header. */ - if (!fill_note_info(elf, e_phnum, &info, cprm->siginfo, cprm->regs)) + if (!fill_note_info(&elf, e_phnum, &info, cprm->siginfo, cprm->regs)) goto cleanup; has_dumped = 1; @@ -2238,7 +2242,7 @@ static int elf_core_dump(struct coredump_params *cprm) fs = get_fs(); set_fs(KERNEL_DS); - offset += sizeof(*elf); /* Elf header */ + offset += sizeof(elf); /* Elf header */ offset += segs * sizeof(struct elf_phdr); /* Program headers */ /* Write notes phdr entry */ @@ -2257,11 +2261,13 @@ static int elf_core_dump(struct coredump_params *cprm) dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - if (segs - 1 > ULONG_MAX / sizeof(*vma_filesz)) - goto end_coredump; + /* + * Zero vma process will get ZERO_SIZE_PTR here. + * Let coredump continue for register state at least. + */ vma_filesz = kvmalloc(array_size(sizeof(*vma_filesz), (segs - 1)), GFP_KERNEL); - if (ZERO_OR_NULL_PTR(vma_filesz)) + if (!vma_filesz) goto end_coredump; for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; @@ -2281,12 +2287,12 @@ static int elf_core_dump(struct coredump_params *cprm) shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL); if (!shdr4extnum) goto end_coredump; - fill_extnum_info(elf, shdr4extnum, e_shoff, segs); + fill_extnum_info(&elf, shdr4extnum, e_shoff, segs); } offset = dataoff; - if (!dump_emit(cprm, elf, sizeof(*elf))) + if (!dump_emit(cprm, &elf, sizeof(elf))) goto end_coredump; if (!dump_emit(cprm, phdr4note, sizeof(*phdr4note))) @@ -2370,8 +2376,6 @@ cleanup: kfree(shdr4extnum); kvfree(vma_filesz); kfree(phdr4note); - kfree(elf); -out: return has_dumped; } diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 14851584e245..404e050ce8ee 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1191,7 +1191,6 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) { struct btrfs_space_info *sinfo = cache->space_info; u64 num_bytes; - u64 sinfo_used; int ret = -ENOSPC; spin_lock(&sinfo->lock); @@ -1205,19 +1204,38 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) num_bytes = cache->length - cache->reserved - cache->pinned - cache->bytes_super - cache->used; - sinfo_used = btrfs_space_info_used(sinfo, true); /* - * sinfo_used + num_bytes should always <= sinfo->total_bytes. - * - * Here we make sure if we mark this bg RO, we still have enough - * free space as buffer. + * Data never overcommits, even in mixed mode, so do just the straight + * check of left over space in how much we have allocated. */ - if (sinfo_used + num_bytes <= sinfo->total_bytes) { + if (force) { + ret = 0; + } else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) { + u64 sinfo_used = btrfs_space_info_used(sinfo, true); + + /* + * Here we make sure if we mark this bg RO, we still have enough + * free space as buffer. + */ + if (sinfo_used + num_bytes <= sinfo->total_bytes) + ret = 0; + } else { + /* + * We overcommit metadata, so we need to do the + * btrfs_can_overcommit check here, and we need to pass in + * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of + * leeway to allow us to mark this block group as read only. + */ + if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes, + BTRFS_RESERVE_NO_FLUSH)) + ret = 0; + } + + if (!ret) { sinfo->bytes_readonly += num_bytes; cache->ro++; list_add_tail(&cache->ro_list, &sinfo->ro_bgs); - ret = 0; } out: spin_unlock(&cache->lock); @@ -1225,9 +1243,6 @@ out: if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) { btrfs_info(cache->fs_info, "unable to make block group %llu ro", cache->start); - btrfs_info(cache->fs_info, - "sinfo_used=%llu bg_num_bytes=%llu", - sinfo_used, num_bytes); btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0); } return ret; @@ -2225,7 +2240,7 @@ again: } } - ret = inc_block_group_ro(cache, !do_chunk_alloc); + ret = inc_block_group_ro(cache, 0); if (!do_chunk_alloc) goto unlock_out; if (!ret) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index de95ad27722f..9ab610cc9114 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -1290,7 +1290,7 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, /* copy bytes from the working buffer into the pages */ while (working_bytes > 0) { bytes = min_t(unsigned long, bvec.bv_len, - PAGE_SIZE - buf_offset); + PAGE_SIZE - (buf_offset % PAGE_SIZE)); bytes = min(bytes, working_bytes); kaddr = kmap_atomic(bvec.bv_page); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 24658b5a5787..f2ec1a9bae28 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -326,12 +326,10 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem) { write_lock(&fs_info->tree_mod_log_lock); - spin_lock(&fs_info->tree_mod_seq_lock); if (!elem->seq) { elem->seq = btrfs_inc_tree_mod_seq(fs_info); list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); } - spin_unlock(&fs_info->tree_mod_seq_lock); write_unlock(&fs_info->tree_mod_log_lock); return elem->seq; @@ -351,7 +349,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, if (!seq_putting) return; - spin_lock(&fs_info->tree_mod_seq_lock); + write_lock(&fs_info->tree_mod_log_lock); list_del(&elem->list); elem->seq = 0; @@ -362,19 +360,17 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, * blocker with lower sequence number exists, we * cannot remove anything from the log */ - spin_unlock(&fs_info->tree_mod_seq_lock); + write_unlock(&fs_info->tree_mod_log_lock); return; } min_seq = cur_elem->seq; } } - spin_unlock(&fs_info->tree_mod_seq_lock); /* * anything that's lower than the lowest existing (read: blocked) * sequence number can be removed from the tree. */ - write_lock(&fs_info->tree_mod_log_lock); tm_root = &fs_info->tree_mod_log; for (node = rb_first(tm_root); node; node = next) { next = rb_next(node); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f90b82050d2d..36df977b64d9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -714,14 +714,12 @@ struct btrfs_fs_info { atomic_t nr_delayed_iputs; wait_queue_head_t delayed_iputs_wait; - /* this protects tree_mod_seq_list */ - spinlock_t tree_mod_seq_lock; atomic64_t tree_mod_seq; - struct list_head tree_mod_seq_list; - /* this protects tree_mod_log */ + /* this protects tree_mod_log and tree_mod_seq_list */ rwlock_t tree_mod_log_lock; struct rb_root tree_mod_log; + struct list_head tree_mod_seq_list; atomic_t async_delalloc_pages; diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index df3bd880061d..dfdb7d4f8406 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -492,7 +492,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, if (head->is_data) return; - spin_lock(&fs_info->tree_mod_seq_lock); + read_lock(&fs_info->tree_mod_log_lock); if (!list_empty(&fs_info->tree_mod_seq_list)) { struct seq_list *elem; @@ -500,7 +500,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, struct seq_list, list); seq = elem->seq; } - spin_unlock(&fs_info->tree_mod_seq_lock); + read_unlock(&fs_info->tree_mod_log_lock); again: for (node = rb_first_cached(&head->ref_tree); node; @@ -518,7 +518,7 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) struct seq_list *elem; int ret = 0; - spin_lock(&fs_info->tree_mod_seq_lock); + read_lock(&fs_info->tree_mod_log_lock); if (!list_empty(&fs_info->tree_mod_seq_list)) { elem = list_first_entry(&fs_info->tree_mod_seq_list, struct seq_list, list); @@ -531,7 +531,7 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) } } - spin_unlock(&fs_info->tree_mod_seq_lock); + read_unlock(&fs_info->tree_mod_log_lock); return ret; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index aea48d6ddc0c..7fa9bb79ad08 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2697,7 +2697,6 @@ int __cold open_ctree(struct super_block *sb, spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); - spin_lock_init(&fs_info->tree_mod_seq_lock); spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e2d30287e2d5..c0f202741e09 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1593,21 +1593,25 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, /* Find first extent with bits cleared */ while (1) { node = __etree_search(tree, start, &next, &prev, NULL, NULL); - if (!node) { + if (!node && !next && !prev) { + /* + * Tree is completely empty, send full range and let + * caller deal with it + */ + *start_ret = 0; + *end_ret = -1; + goto out; + } else if (!node && !next) { + /* + * We are past the last allocated chunk, set start at + * the end of the last extent. + */ + state = rb_entry(prev, struct extent_state, rb_node); + *start_ret = state->end + 1; + *end_ret = -1; + goto out; + } else if (!node) { node = next; - if (!node) { - /* - * We are past the last allocated chunk, - * set start at the end of the last extent. The - * device alloc tree should never be empty so - * prev is always set. - */ - ASSERT(prev); - state = rb_entry(prev, struct extent_state, rb_node); - *start_ret = state->end + 1; - *end_ret = -1; - goto out; - } } /* * At this point 'node' either contains 'start' or start is @@ -3438,11 +3442,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, ret = btrfs_writepage_cow_fixup(page, start, page_end); if (ret) { /* Fixup worker will requeue */ - if (ret == -EBUSY) - wbc->pages_skipped++; - else - redirty_page_for_writepage(wbc, page); - + redirty_page_for_writepage(wbc, page); update_nr_written(wbc, nr_written); unlock_page(page); return 1; @@ -4166,7 +4166,16 @@ retry: */ scanned = 1; index = 0; - goto retry; + + /* + * If we're looping we could run into a page that is locked by a + * writer and that writer could be waiting on writeback for a + * page in our current bio, and thus deadlock, so flush the + * write bio here. + */ + ret = flush_write_bio(epd); + if (!ret) + goto retry; } if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6d2bb58d277a..5b3ec93ff911 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2189,6 +2189,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, /* see btrfs_writepage_start_hook for details on why this is required */ struct btrfs_writepage_fixup { struct page *page; + struct inode *inode; struct btrfs_work work; }; @@ -2202,27 +2203,71 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct inode *inode; u64 page_start; u64 page_end; - int ret; + int ret = 0; + bool free_delalloc_space = true; fixup = container_of(work, struct btrfs_writepage_fixup, work); page = fixup->page; + inode = fixup->inode; + page_start = page_offset(page); + page_end = page_offset(page) + PAGE_SIZE - 1; + + /* + * This is similar to page_mkwrite, we need to reserve the space before + * we take the page lock. + */ + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, + PAGE_SIZE); again: lock_page(page); + + /* + * Before we queued this fixup, we took a reference on the page. + * page->mapping may go NULL, but it shouldn't be moved to a different + * address space. + */ if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { - ClearPageChecked(page); + /* + * Unfortunately this is a little tricky, either + * + * 1) We got here and our page had already been dealt with and + * we reserved our space, thus ret == 0, so we need to just + * drop our space reservation and bail. This can happen the + * first time we come into the fixup worker, or could happen + * while waiting for the ordered extent. + * 2) Our page was already dealt with, but we happened to get an + * ENOSPC above from the btrfs_delalloc_reserve_space. In + * this case we obviously don't have anything to release, but + * because the page was already dealt with we don't want to + * mark the page with an error, so make sure we're resetting + * ret to 0. This is why we have this check _before_ the ret + * check, because we do not want to have a surprise ENOSPC + * when the page was already properly dealt with. + */ + if (!ret) { + btrfs_delalloc_release_extents(BTRFS_I(inode), + PAGE_SIZE); + btrfs_delalloc_release_space(inode, data_reserved, + page_start, PAGE_SIZE, + true); + } + ret = 0; goto out_page; } - inode = page->mapping->host; - page_start = page_offset(page); - page_end = page_offset(page) + PAGE_SIZE - 1; + /* + * We can't mess with the page state unless it is locked, so now that + * it is locked bail if we failed to make our space reservation. + */ + if (ret) + goto out_page; lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ if (PagePrivate2(page)) - goto out; + goto out_reserved; ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE); @@ -2235,39 +2280,49 @@ again: goto again; } - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, - PAGE_SIZE); - if (ret) { - mapping_set_error(page->mapping, ret); - end_extent_writepage(page, ret, page_start, page_end); - ClearPageChecked(page); - goto out; - } - ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, &cached_state); - if (ret) { - mapping_set_error(page->mapping, ret); - end_extent_writepage(page, ret, page_start, page_end); - ClearPageChecked(page); + if (ret) goto out_reserved; - } - ClearPageChecked(page); - set_page_dirty(page); + /* + * Everything went as planned, we're now the owner of a dirty page with + * delayed allocation bits set and space reserved for our COW + * destination. + * + * The page was dirty when we started, nothing should have cleaned it. + */ + BUG_ON(!PageDirty(page)); + free_delalloc_space = false; out_reserved: btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); - if (ret) + if (free_delalloc_space) btrfs_delalloc_release_space(inode, data_reserved, page_start, PAGE_SIZE, true); -out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state); out_page: + if (ret) { + /* + * We hit ENOSPC or other errors. Update the mapping and page + * to reflect the errors and clean the page. + */ + mapping_set_error(page->mapping, ret); + end_extent_writepage(page, ret, page_start, page_end); + clear_page_dirty_for_io(page); + SetPageError(page); + } + ClearPageChecked(page); unlock_page(page); put_page(page); kfree(fixup); extent_changeset_free(data_reserved); + /* + * As a precaution, do a delayed iput in case it would be the last iput + * that could need flushing space. Recursing back to fixup worker would + * deadlock. + */ + btrfs_add_delayed_iput(inode); } /* @@ -2291,6 +2346,13 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) if (TestClearPagePrivate2(page)) return 0; + /* + * PageChecked is set below when we create a fixup worker for this page, + * don't try to create another one if we're already PageChecked() + * + * The extent_io writepage code will redirty the page if we send back + * EAGAIN. + */ if (PageChecked(page)) return -EAGAIN; @@ -2298,12 +2360,21 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) if (!fixup) return -EAGAIN; + /* + * We are already holding a reference to this inode from + * write_cache_pages. We need to hold it because the space reservation + * takes place outside of the page lock, and we can't trust + * page->mapping outside of the page lock. + */ + ihold(inode); SetPageChecked(page); get_page(page); btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; + fixup->inode = inode; btrfs_queue_work(fs_info->fixup_workers, &fixup->work); - return -EBUSY; + + return -EAGAIN; } static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 091e5bc8c7ea..a055b657cb85 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1269,7 +1269,8 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) * destination of the stream. */ if (ino == bctx->cur_objectid && - offset >= bctx->sctx->cur_inode_next_write_offset) + offset + bctx->extent_len > + bctx->sctx->cur_inode_next_write_offset) return 0; } diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 537bc310a673..01297c5b2666 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -159,9 +159,9 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) return (global->size << 1); } -static int can_overcommit(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 bytes, - enum btrfs_reserve_flush_enum flush) +int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush) { u64 profile; u64 avail; @@ -226,7 +226,8 @@ again: /* Check and see if our ticket can be satisified now. */ if ((used + ticket->bytes <= space_info->total_bytes) || - can_overcommit(fs_info, space_info, ticket->bytes, flush)) { + btrfs_can_overcommit(fs_info, space_info, ticket->bytes, + flush)) { btrfs_space_info_update_bytes_may_use(fs_info, space_info, ticket->bytes); @@ -639,13 +640,14 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, return to_reclaim; to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - if (can_overcommit(fs_info, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL)) + if (btrfs_can_overcommit(fs_info, space_info, to_reclaim, + BTRFS_RESERVE_FLUSH_ALL)) return 0; used = btrfs_space_info_used(space_info, true); - if (can_overcommit(fs_info, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL)) + if (btrfs_can_overcommit(fs_info, space_info, SZ_1M, + BTRFS_RESERVE_FLUSH_ALL)) expected = div_factor_fine(space_info->total_bytes, 95); else expected = div_factor_fine(space_info->total_bytes, 90); @@ -1004,7 +1006,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, */ if (!pending_tickets && ((used + orig_bytes <= space_info->total_bytes) || - can_overcommit(fs_info, space_info, orig_bytes, flush))) { + btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) { btrfs_space_info_update_bytes_may_use(fs_info, space_info, orig_bytes); ret = 0; diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 1a349e3f9cc1..24514cd2c6c1 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -127,6 +127,9 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root, enum btrfs_reserve_flush_enum flush); void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info); +int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush); static inline void btrfs_space_info_free_bytes_may_use( struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a906315efd19..0616a5434793 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2135,7 +2135,15 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) */ thresh = SZ_4M; - if (!mixed && total_free_meta - thresh < block_rsv->size) + /* + * We only want to claim there's no available space if we can no longer + * allocate chunks for our metadata profile and our global reserve will + * not fit in the free metadata space. If we aren't ->full then we + * still can allocate chunks and thus are fine using the currently + * calculated f_bavail. + */ + if (!mixed && block_rsv->space_info->full && + total_free_meta - thresh < block_rsv->size) buf->f_bavail = 0; buf->f_type = BTRFS_SUPER_MAGIC; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index c12b91ff5f56..84fb3fa940a6 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -142,7 +142,6 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) spin_lock_init(&fs_info->qgroup_lock); spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); - spin_lock_init(&fs_info->tree_mod_seq_lock); mutex_init(&fs_info->qgroup_ioctl_lock); mutex_init(&fs_info->qgroup_rescan_lock); rwlock_init(&fs_info->tree_mod_log_lock); diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 123d9a614357..df7ce874a74b 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -441,8 +441,17 @@ static int test_find_first_clear_extent_bit(void) int ret = -EINVAL; test_msg("running find_first_clear_extent_bit test"); + extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL); + /* Test correct handling of empty tree */ + find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED); + if (start != 0 || end != -1) { + test_err( + "error getting a range from completely empty tree: start %llu end %llu", + start, end); + goto out; + } /* * Set 1M-4M alloc/discard and 32M-64M thus leaving a hole between * 4M-32M diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index a6c90a003c12..05615a1099db 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -20,9 +20,13 @@ #include <linux/refcount.h> #include "compression.h" +/* workspace buffer size for s390 zlib hardware support */ +#define ZLIB_DFLTCC_BUF_SIZE (4 * PAGE_SIZE) + struct workspace { z_stream strm; char *buf; + unsigned int buf_size; struct list_head list; int level; }; @@ -61,7 +65,21 @@ struct list_head *zlib_alloc_workspace(unsigned int level) zlib_inflate_workspacesize()); workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL); workspace->level = level; - workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + workspace->buf = NULL; + /* + * In case of s390 zlib hardware support, allocate lager workspace + * buffer. If allocator fails, fall back to a single page buffer. + */ + if (zlib_deflate_dfltcc_enabled()) { + workspace->buf = kmalloc(ZLIB_DFLTCC_BUF_SIZE, + __GFP_NOMEMALLOC | __GFP_NORETRY | + __GFP_NOWARN | GFP_NOIO); + workspace->buf_size = ZLIB_DFLTCC_BUF_SIZE; + } + if (!workspace->buf) { + workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + workspace->buf_size = PAGE_SIZE; + } if (!workspace->strm.workspace || !workspace->buf) goto fail; @@ -85,6 +103,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, struct page *in_page = NULL; struct page *out_page = NULL; unsigned long bytes_left; + unsigned int in_buf_pages; unsigned long len = *total_out; unsigned long nr_dest_pages = *out_pages; const unsigned long max_out = nr_dest_pages * PAGE_SIZE; @@ -102,9 +121,6 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, workspace->strm.total_in = 0; workspace->strm.total_out = 0; - in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = kmap(in_page); - out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (out_page == NULL) { ret = -ENOMEM; @@ -114,12 +130,51 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, pages[0] = out_page; nr_pages = 1; - workspace->strm.next_in = data_in; + workspace->strm.next_in = workspace->buf; + workspace->strm.avail_in = 0; workspace->strm.next_out = cpage_out; workspace->strm.avail_out = PAGE_SIZE; - workspace->strm.avail_in = min(len, PAGE_SIZE); while (workspace->strm.total_in < len) { + /* + * Get next input pages and copy the contents to + * the workspace buffer if required. + */ + if (workspace->strm.avail_in == 0) { + bytes_left = len - workspace->strm.total_in; + in_buf_pages = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE), + workspace->buf_size / PAGE_SIZE); + if (in_buf_pages > 1) { + int i; + + for (i = 0; i < in_buf_pages; i++) { + if (in_page) { + kunmap(in_page); + put_page(in_page); + } + in_page = find_get_page(mapping, + start >> PAGE_SHIFT); + data_in = kmap(in_page); + memcpy(workspace->buf + i * PAGE_SIZE, + data_in, PAGE_SIZE); + start += PAGE_SIZE; + } + workspace->strm.next_in = workspace->buf; + } else { + if (in_page) { + kunmap(in_page); + put_page(in_page); + } + in_page = find_get_page(mapping, + start >> PAGE_SHIFT); + data_in = kmap(in_page); + start += PAGE_SIZE; + workspace->strm.next_in = data_in; + } + workspace->strm.avail_in = min(bytes_left, + (unsigned long) workspace->buf_size); + } + ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH); if (ret != Z_OK) { pr_debug("BTRFS: deflate in loop returned %d\n", @@ -161,33 +216,43 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, /* we're all done */ if (workspace->strm.total_in >= len) break; - - /* we've read in a full page, get a new one */ - if (workspace->strm.avail_in == 0) { - if (workspace->strm.total_out > max_out) - break; - - bytes_left = len - workspace->strm.total_in; - kunmap(in_page); - put_page(in_page); - - start += PAGE_SIZE; - in_page = find_get_page(mapping, - start >> PAGE_SHIFT); - data_in = kmap(in_page); - workspace->strm.avail_in = min(bytes_left, - PAGE_SIZE); - workspace->strm.next_in = data_in; - } + if (workspace->strm.total_out > max_out) + break; } workspace->strm.avail_in = 0; - ret = zlib_deflate(&workspace->strm, Z_FINISH); - zlib_deflateEnd(&workspace->strm); - - if (ret != Z_STREAM_END) { - ret = -EIO; - goto out; + /* + * Call deflate with Z_FINISH flush parameter providing more output + * space but no more input data, until it returns with Z_STREAM_END. + */ + while (ret != Z_STREAM_END) { + ret = zlib_deflate(&workspace->strm, Z_FINISH); + if (ret == Z_STREAM_END) + break; + if (ret != Z_OK && ret != Z_BUF_ERROR) { + zlib_deflateEnd(&workspace->strm); + ret = -EIO; + goto out; + } else if (workspace->strm.avail_out == 0) { + /* get another page for the stream end */ + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -E2BIG; + goto out; + } + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + cpage_out = kmap(out_page); + pages[nr_pages] = out_page; + nr_pages++; + workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.next_out = cpage_out; + } } + zlib_deflateEnd(&workspace->strm); if (workspace->strm.total_out >= workspace->strm.total_in) { ret = -E2BIG; @@ -231,7 +296,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) workspace->strm.total_out = 0; workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_out = workspace->buf_size; /* If it's deflate, and it's got no preset dictionary, then we can tell zlib to skip the adler32 check. */ @@ -270,7 +335,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) } workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_out = workspace->buf_size; if (workspace->strm.avail_in == 0) { unsigned long tmp; @@ -320,7 +385,7 @@ int zlib_decompress(struct list_head *ws, unsigned char *data_in, workspace->strm.total_in = 0; workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_out = workspace->buf_size; workspace->strm.total_out = 0; /* If it's deflate, and it's got no preset dictionary, then we can tell zlib to skip the adler32 check. */ @@ -364,7 +429,7 @@ int zlib_decompress(struct list_head *ws, unsigned char *data_in, buf_offset = 0; bytes = min(PAGE_SIZE - pg_offset, - PAGE_SIZE - buf_offset); + PAGE_SIZE - (buf_offset % PAGE_SIZE)); bytes = min(bytes, bytes_left); kaddr = kmap_atomic(dest_page); @@ -375,7 +440,7 @@ int zlib_decompress(struct list_head *ws, unsigned char *data_in, bytes_left -= bytes; next: workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_out = workspace->buf_size; } if (ret != Z_STREAM_END && bytes_left != 0) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 145d46ba25ae..21ada2c4a88c 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2072,7 +2072,6 @@ struct ceph_mds_request * ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) { struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS); - struct timespec64 ts; if (!req) return ERR_PTR(-ENOMEM); @@ -2091,8 +2090,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) init_completion(&req->r_safe_completion); INIT_LIST_HEAD(&req->r_unsafe_item); - ktime_get_coarse_real_ts64(&ts); - req->r_stamp = timespec64_trunc(ts, mdsc->fsc->sb->s_time_gran); + ktime_get_coarse_real_ts64(&req->r_stamp); req->r_op = op; req->r_direct_mode = mode; diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 19f6e592b941..276e4b5ea8e0 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -611,12 +611,12 @@ static int cifs_stats_proc_open(struct inode *inode, struct file *file) return single_open(file, cifs_stats_proc_show, NULL); } -static const struct file_operations cifs_stats_proc_fops = { - .open = cifs_stats_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = cifs_stats_proc_write, +static const struct proc_ops cifs_stats_proc_ops = { + .proc_open = cifs_stats_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = cifs_stats_proc_write, }; #ifdef CONFIG_CIFS_SMB_DIRECT @@ -640,12 +640,12 @@ static int name##_open(struct inode *inode, struct file *file) \ return single_open(file, name##_proc_show, NULL); \ } \ \ -static const struct file_operations cifs_##name##_proc_fops = { \ - .open = name##_open, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ - .write = name##_write, \ +static const struct proc_ops cifs_##name##_proc_fops = { \ + .proc_open = name##_open, \ + .proc_read = seq_read, \ + .proc_lseek = seq_lseek, \ + .proc_release = single_release, \ + .proc_write = name##_write, \ } PROC_FILE_DEFINE(rdma_readwrite_threshold); @@ -659,11 +659,11 @@ PROC_FILE_DEFINE(smbd_receive_credit_max); #endif static struct proc_dir_entry *proc_fs_cifs; -static const struct file_operations cifsFYI_proc_fops; -static const struct file_operations cifs_lookup_cache_proc_fops; -static const struct file_operations traceSMB_proc_fops; -static const struct file_operations cifs_security_flags_proc_fops; -static const struct file_operations cifs_linux_ext_proc_fops; +static const struct proc_ops cifsFYI_proc_ops; +static const struct proc_ops cifs_lookup_cache_proc_ops; +static const struct proc_ops traceSMB_proc_ops; +static const struct proc_ops cifs_security_flags_proc_ops; +static const struct proc_ops cifs_linux_ext_proc_ops; void cifs_proc_init(void) @@ -678,18 +678,18 @@ cifs_proc_init(void) proc_create_single("open_files", 0400, proc_fs_cifs, cifs_debug_files_proc_show); - proc_create("Stats", 0644, proc_fs_cifs, &cifs_stats_proc_fops); - proc_create("cifsFYI", 0644, proc_fs_cifs, &cifsFYI_proc_fops); - proc_create("traceSMB", 0644, proc_fs_cifs, &traceSMB_proc_fops); + proc_create("Stats", 0644, proc_fs_cifs, &cifs_stats_proc_ops); + proc_create("cifsFYI", 0644, proc_fs_cifs, &cifsFYI_proc_ops); + proc_create("traceSMB", 0644, proc_fs_cifs, &traceSMB_proc_ops); proc_create("LinuxExtensionsEnabled", 0644, proc_fs_cifs, - &cifs_linux_ext_proc_fops); + &cifs_linux_ext_proc_ops); proc_create("SecurityFlags", 0644, proc_fs_cifs, - &cifs_security_flags_proc_fops); + &cifs_security_flags_proc_ops); proc_create("LookupCacheEnabled", 0644, proc_fs_cifs, - &cifs_lookup_cache_proc_fops); + &cifs_lookup_cache_proc_ops); #ifdef CONFIG_CIFS_DFS_UPCALL - proc_create("dfscache", 0644, proc_fs_cifs, &dfscache_proc_fops); + proc_create("dfscache", 0644, proc_fs_cifs, &dfscache_proc_ops); #endif #ifdef CONFIG_CIFS_SMB_DIRECT @@ -774,12 +774,12 @@ static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer, return count; } -static const struct file_operations cifsFYI_proc_fops = { - .open = cifsFYI_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = cifsFYI_proc_write, +static const struct proc_ops cifsFYI_proc_ops = { + .proc_open = cifsFYI_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = cifsFYI_proc_write, }; static int cifs_linux_ext_proc_show(struct seq_file *m, void *v) @@ -805,12 +805,12 @@ static ssize_t cifs_linux_ext_proc_write(struct file *file, return count; } -static const struct file_operations cifs_linux_ext_proc_fops = { - .open = cifs_linux_ext_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = cifs_linux_ext_proc_write, +static const struct proc_ops cifs_linux_ext_proc_ops = { + .proc_open = cifs_linux_ext_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = cifs_linux_ext_proc_write, }; static int cifs_lookup_cache_proc_show(struct seq_file *m, void *v) @@ -836,12 +836,12 @@ static ssize_t cifs_lookup_cache_proc_write(struct file *file, return count; } -static const struct file_operations cifs_lookup_cache_proc_fops = { - .open = cifs_lookup_cache_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = cifs_lookup_cache_proc_write, +static const struct proc_ops cifs_lookup_cache_proc_ops = { + .proc_open = cifs_lookup_cache_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = cifs_lookup_cache_proc_write, }; static int traceSMB_proc_show(struct seq_file *m, void *v) @@ -867,12 +867,12 @@ static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer, return count; } -static const struct file_operations traceSMB_proc_fops = { - .open = traceSMB_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = traceSMB_proc_write, +static const struct proc_ops traceSMB_proc_ops = { + .proc_open = traceSMB_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = traceSMB_proc_write, }; static int cifs_security_flags_proc_show(struct seq_file *m, void *v) @@ -978,12 +978,12 @@ static ssize_t cifs_security_flags_proc_write(struct file *file, return count; } -static const struct file_operations cifs_security_flags_proc_fops = { - .open = cifs_security_flags_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = cifs_security_flags_proc_write, +static const struct proc_ops cifs_security_flags_proc_ops = { + .proc_open = cifs_security_flags_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = cifs_security_flags_proc_write, }; #else inline void cifs_proc_init(void) diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 096a4c18fbd0..b87456bae1a1 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -156,5 +156,5 @@ extern int cifs_truncate_page(struct address_space *mapping, loff_t from); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.24" +#define CIFS_VERSION "2.25" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index 9a384d1e27b4..43c1b43a07ec 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -8,6 +8,7 @@ #include <linux/jhash.h> #include <linux/ktime.h> #include <linux/slab.h> +#include <linux/proc_fs.h> #include <linux/nls.h> #include <linux/workqueue.h> #include "cifsglob.h" @@ -211,12 +212,12 @@ static int dfscache_proc_open(struct inode *inode, struct file *file) return single_open(file, dfscache_proc_show, NULL); } -const struct file_operations dfscache_proc_fops = { - .open = dfscache_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = dfscache_proc_write, +const struct proc_ops dfscache_proc_ops = { + .proc_open = dfscache_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = dfscache_proc_write, }; #ifdef CONFIG_CIFS_DEBUG2 diff --git a/fs/cifs/dfs_cache.h b/fs/cifs/dfs_cache.h index 76c732943f5f..99ee44f8ad07 100644 --- a/fs/cifs/dfs_cache.h +++ b/fs/cifs/dfs_cache.h @@ -24,7 +24,7 @@ struct dfs_cache_tgt_iterator { extern int dfs_cache_init(void); extern void dfs_cache_destroy(void); -extern const struct file_operations dfscache_proc_fops; +extern const struct proc_ops dfscache_proc_ops; extern int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, const struct nls_table *nls_codepage, int remap, diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 9b547f7f5f5d..676e96a7a9f0 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -113,6 +113,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) } /* revalidate if mtime or size have changed */ + fattr->cf_mtime = timestamp_truncate(fattr->cf_mtime, inode); if (timespec64_equal(&inode->i_mtime, &fattr->cf_mtime) && cifs_i->server_eof == fattr->cf_eof) { cifs_dbg(FYI, "%s: inode %llu is unchanged\n", @@ -162,6 +163,9 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) cifs_revalidate_cache(inode, fattr); spin_lock(&inode->i_lock); + fattr->cf_mtime = timestamp_truncate(fattr->cf_mtime, inode); + fattr->cf_atime = timestamp_truncate(fattr->cf_atime, inode); + fattr->cf_ctime = timestamp_truncate(fattr->cf_ctime, inode); /* we do not want atime to be less than mtime, it broke some apps */ if (timespec64_compare(&fattr->cf_atime, &fattr->cf_mtime) < 0) inode->i_atime = fattr->cf_mtime; @@ -329,8 +333,7 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb) fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU; fattr->cf_uid = cifs_sb->mnt_uid; fattr->cf_gid = cifs_sb->mnt_gid; - ktime_get_real_ts64(&fattr->cf_mtime); - fattr->cf_mtime = timespec64_trunc(fattr->cf_mtime, sb->s_time_gran); + ktime_get_coarse_real_ts64(&fattr->cf_mtime); fattr->cf_atime = fattr->cf_ctime = fattr->cf_mtime; fattr->cf_nlink = 2; fattr->cf_flags = CIFS_FATTR_DFS_REFERRAL; @@ -609,10 +612,8 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, if (info->LastAccessTime) fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime); - else { - ktime_get_real_ts64(&fattr->cf_atime); - fattr->cf_atime = timespec64_trunc(fattr->cf_atime, sb->s_time_gran); - } + else + ktime_get_coarse_real_ts64(&fattr->cf_atime); fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime); fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime); diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 7edba3e6d5e6..14f209f7376f 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -312,7 +312,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) if (server->tcpStatus != CifsNeedReconnect) break; - if (--retries) + if (retries && --retries) continue; /* diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c deleted file mode 100644 index 358ea2ecf36b..000000000000 --- a/fs/compat_ioctl.c +++ /dev/null @@ -1,261 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * ioctl32.c: Conversion between 32bit and 64bit native ioctls. - * - * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) - * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) - * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs - * Copyright (C) 2003 Pavel Machek (pavel@ucw.cz) - * - * These routines maintain argument size conversion between 32bit and 64bit - * ioctls. - */ - -#include <linux/types.h> -#include <linux/compat.h> -#include <linux/kernel.h> -#include <linux/capability.h> -#include <linux/compiler.h> -#include <linux/sched.h> -#include <linux/smp.h> -#include <linux/ioctl.h> -#include <linux/if.h> -#include <linux/raid/md_u.h> -#include <linux/falloc.h> -#include <linux/file.h> -#include <linux/ppp-ioctl.h> -#include <linux/if_pppox.h> -#include <linux/tty.h> -#include <linux/vt_kern.h> -#include <linux/blkdev.h> -#include <linux/serial.h> -#include <linux/ctype.h> -#include <linux/syscalls.h> -#include <linux/gfp.h> -#include <linux/cec.h> - -#include "internal.h" - -#ifdef CONFIG_BLOCK -#include <linux/cdrom.h> -#include <linux/fd.h> -#include <scsi/scsi.h> -#include <scsi/scsi_ioctl.h> -#include <scsi/sg.h> -#endif - -#include <linux/uaccess.h> -#include <linux/watchdog.h> - -#include <linux/hiddev.h> - - -#include <linux/sort.h> - -/* - * simple reversible transform to make our table more evenly - * distributed after sorting. - */ -#define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff) - -#define COMPATIBLE_IOCTL(cmd) XFORM((u32)cmd), -static unsigned int ioctl_pointer[] = { -#ifdef CONFIG_BLOCK -/* Big S */ -COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN) -COMPATIBLE_IOCTL(SCSI_IOCTL_DOORLOCK) -COMPATIBLE_IOCTL(SCSI_IOCTL_DOORUNLOCK) -COMPATIBLE_IOCTL(SCSI_IOCTL_TEST_UNIT_READY) -COMPATIBLE_IOCTL(SCSI_IOCTL_GET_BUS_NUMBER) -COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND) -COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST) -COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI) -#endif -#ifdef CONFIG_BLOCK -/* SG stuff */ -COMPATIBLE_IOCTL(SG_IO) -COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE) -COMPATIBLE_IOCTL(SG_SET_TIMEOUT) -COMPATIBLE_IOCTL(SG_GET_TIMEOUT) -COMPATIBLE_IOCTL(SG_EMULATED_HOST) -COMPATIBLE_IOCTL(SG_GET_TRANSFORM) -COMPATIBLE_IOCTL(SG_SET_RESERVED_SIZE) -COMPATIBLE_IOCTL(SG_GET_RESERVED_SIZE) -COMPATIBLE_IOCTL(SG_GET_SCSI_ID) -COMPATIBLE_IOCTL(SG_SET_FORCE_LOW_DMA) -COMPATIBLE_IOCTL(SG_GET_LOW_DMA) -COMPATIBLE_IOCTL(SG_SET_FORCE_PACK_ID) -COMPATIBLE_IOCTL(SG_GET_PACK_ID) -COMPATIBLE_IOCTL(SG_GET_NUM_WAITING) -COMPATIBLE_IOCTL(SG_SET_DEBUG) -COMPATIBLE_IOCTL(SG_GET_SG_TABLESIZE) -COMPATIBLE_IOCTL(SG_GET_COMMAND_Q) -COMPATIBLE_IOCTL(SG_SET_COMMAND_Q) -COMPATIBLE_IOCTL(SG_GET_VERSION_NUM) -COMPATIBLE_IOCTL(SG_NEXT_CMD_LEN) -COMPATIBLE_IOCTL(SG_SCSI_RESET) -COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE) -COMPATIBLE_IOCTL(SG_SET_KEEP_ORPHAN) -COMPATIBLE_IOCTL(SG_GET_KEEP_ORPHAN) -#endif -}; - -/* - * Convert common ioctl arguments based on their command number - * - * Please do not add any code in here. Instead, implement - * a compat_ioctl operation in the place that handleÑ• the - * ioctl for the native case. - */ -static long do_ioctl_trans(unsigned int cmd, - unsigned long arg, struct file *file) -{ - return -ENOIOCTLCMD; -} - -static int compat_ioctl_check_table(unsigned int xcmd) -{ -#ifdef CONFIG_BLOCK - int i; - const int max = ARRAY_SIZE(ioctl_pointer) - 1; - - BUILD_BUG_ON(max >= (1 << 16)); - - /* guess initial offset into table, assuming a - normalized distribution */ - i = ((xcmd >> 16) * max) >> 16; - - /* do linear search up first, until greater or equal */ - while (ioctl_pointer[i] < xcmd && i < max) - i++; - - /* then do linear search down */ - while (ioctl_pointer[i] > xcmd && i > 0) - i--; - - return ioctl_pointer[i] == xcmd; -#else - return 0; -#endif -} - -COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, - compat_ulong_t, arg32) -{ - unsigned long arg = arg32; - struct fd f = fdget(fd); - int error = -EBADF; - if (!f.file) - goto out; - - /* RED-PEN how should LSM module know it's handling 32bit? */ - error = security_file_ioctl(f.file, cmd, arg); - if (error) - goto out_fput; - - switch (cmd) { - /* these are never seen by ->ioctl(), no argument or int argument */ - case FIOCLEX: - case FIONCLEX: - case FIFREEZE: - case FITHAW: - case FICLONE: - goto do_ioctl; - /* these are never seen by ->ioctl(), pointer argument */ - case FIONBIO: - case FIOASYNC: - case FIOQSIZE: - case FS_IOC_FIEMAP: - case FIGETBSZ: - case FICLONERANGE: - case FIDEDUPERANGE: - goto found_handler; - /* - * The next group is the stuff handled inside file_ioctl(). - * For regular files these never reach ->ioctl(); for - * devices, sockets, etc. they do and one (FIONREAD) is - * even accepted in some cases. In all those cases - * argument has the same type, so we can handle these - * here, shunting them towards do_vfs_ioctl(). - * ->compat_ioctl() will never see any of those. - */ - /* pointer argument, never actually handled by ->ioctl() */ - case FIBMAP: - goto found_handler; - /* handled by some ->ioctl(); always a pointer to int */ - case FIONREAD: - goto found_handler; - /* these get messy on amd64 due to alignment differences */ -#if defined(CONFIG_X86_64) - case FS_IOC_RESVSP_32: - case FS_IOC_RESVSP64_32: - error = compat_ioctl_preallocate(f.file, 0, compat_ptr(arg)); - goto out_fput; - case FS_IOC_UNRESVSP_32: - case FS_IOC_UNRESVSP64_32: - error = compat_ioctl_preallocate(f.file, FALLOC_FL_PUNCH_HOLE, - compat_ptr(arg)); - goto out_fput; - case FS_IOC_ZERO_RANGE_32: - error = compat_ioctl_preallocate(f.file, FALLOC_FL_ZERO_RANGE, - compat_ptr(arg)); - goto out_fput; -#else - case FS_IOC_RESVSP: - case FS_IOC_RESVSP64: - case FS_IOC_UNRESVSP: - case FS_IOC_UNRESVSP64: - case FS_IOC_ZERO_RANGE: - goto found_handler; -#endif - - default: - if (f.file->f_op->compat_ioctl) { - error = f.file->f_op->compat_ioctl(f.file, cmd, arg); - if (error != -ENOIOCTLCMD) - goto out_fput; - } - - if (!f.file->f_op->unlocked_ioctl) - goto do_ioctl; - break; - } - - if (compat_ioctl_check_table(XFORM(cmd))) - goto found_handler; - - error = do_ioctl_trans(cmd, arg, f.file); - if (error == -ENOIOCTLCMD) - error = -ENOTTY; - - goto out_fput; - - found_handler: - arg = (unsigned long)compat_ptr(arg); - do_ioctl: - error = do_vfs_ioctl(f.file, fd, cmd, arg); - out_fput: - fdput(f); - out: - return error; -} - -static int __init init_sys32_ioctl_cmp(const void *p, const void *q) -{ - unsigned int a, b; - a = *(unsigned int *)p; - b = *(unsigned int *)q; - if (a > b) - return 1; - if (a < b) - return -1; - return 0; -} - -static int __init init_sys32_ioctl(void) -{ - sort(ioctl_pointer, ARRAY_SIZE(ioctl_pointer), sizeof(*ioctl_pointer), - init_sys32_ioctl_cmp, NULL); - return 0; -} -__initcall(init_sys32_ioctl); diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 680aba9c00d5..fd0b5dd68f9e 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -76,14 +76,11 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr) if (ia_valid & ATTR_GID) sd_iattr->ia_gid = iattr->ia_gid; if (ia_valid & ATTR_ATIME) - sd_iattr->ia_atime = timestamp_truncate(iattr->ia_atime, - inode); + sd_iattr->ia_atime = iattr->ia_atime; if (ia_valid & ATTR_MTIME) - sd_iattr->ia_mtime = timestamp_truncate(iattr->ia_mtime, - inode); + sd_iattr->ia_mtime = iattr->ia_mtime; if (ia_valid & ATTR_CTIME) - sd_iattr->ia_ctime = timestamp_truncate(iattr->ia_ctime, - inode); + sd_iattr->ia_ctime = iattr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = iattr->ia_mode; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index dc6cffc4feba..e742dfc66933 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -332,7 +332,10 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) parent = debugfs_mount->mnt_root; inode_lock(d_inode(parent)); - dentry = lookup_one_len(name, parent, strlen(name)); + if (unlikely(IS_DEADDIR(d_inode(parent)))) + dentry = ERR_PTR(-ENOENT); + else + dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry) && d_really_is_positive(dentry)) { if (d_is_dir(dentry)) pr_err("Directory '%s' with parent '%s' already present!\n", @@ -681,62 +684,15 @@ static void __debugfs_file_removed(struct dentry *dentry) wait_for_completion(&fsd->active_users_drained); } -static int __debugfs_remove(struct dentry *dentry, struct dentry *parent) -{ - int ret = 0; - - if (simple_positive(dentry)) { - dget(dentry); - if (d_is_dir(dentry)) { - ret = simple_rmdir(d_inode(parent), dentry); - if (!ret) - fsnotify_rmdir(d_inode(parent), dentry); - } else { - simple_unlink(d_inode(parent), dentry); - fsnotify_unlink(d_inode(parent), dentry); - } - if (!ret) - d_delete(dentry); - if (d_is_reg(dentry)) - __debugfs_file_removed(dentry); - dput(dentry); - } - return ret; -} - -/** - * debugfs_remove - removes a file or directory from the debugfs filesystem - * @dentry: a pointer to a the dentry of the file or directory to be - * removed. If this parameter is NULL or an error value, nothing - * will be done. - * - * This function removes a file or directory in debugfs that was previously - * created with a call to another debugfs function (like - * debugfs_create_file() or variants thereof.) - * - * This function is required to be called in order for the file to be - * removed, no automatic cleanup of files will happen when a module is - * removed, you are responsible here. - */ -void debugfs_remove(struct dentry *dentry) +static void remove_one(struct dentry *victim) { - struct dentry *parent; - int ret; - - if (IS_ERR_OR_NULL(dentry)) - return; - - parent = dentry->d_parent; - inode_lock(d_inode(parent)); - ret = __debugfs_remove(dentry, parent); - inode_unlock(d_inode(parent)); - if (!ret) - simple_release_fs(&debugfs_mount, &debugfs_mount_count); + if (d_is_reg(victim)) + __debugfs_file_removed(victim); + simple_release_fs(&debugfs_mount, &debugfs_mount_count); } -EXPORT_SYMBOL_GPL(debugfs_remove); /** - * debugfs_remove_recursive - recursively removes a directory + * debugfs_remove - recursively removes a directory * @dentry: a pointer to a the dentry of the directory to be removed. If this * parameter is NULL or an error value, nothing will be done. * @@ -748,65 +704,16 @@ EXPORT_SYMBOL_GPL(debugfs_remove); * removed, no automatic cleanup of files will happen when a module is * removed, you are responsible here. */ -void debugfs_remove_recursive(struct dentry *dentry) +void debugfs_remove(struct dentry *dentry) { - struct dentry *child, *parent; - if (IS_ERR_OR_NULL(dentry)) return; - parent = dentry; - down: - inode_lock(d_inode(parent)); - loop: - /* - * The parent->d_subdirs is protected by the d_lock. Outside that - * lock, the child can be unlinked and set to be freed which can - * use the d_u.d_child as the rcu head and corrupt this list. - */ - spin_lock(&parent->d_lock); - list_for_each_entry(child, &parent->d_subdirs, d_child) { - if (!simple_positive(child)) - continue; - - /* perhaps simple_empty(child) makes more sense */ - if (!list_empty(&child->d_subdirs)) { - spin_unlock(&parent->d_lock); - inode_unlock(d_inode(parent)); - parent = child; - goto down; - } - - spin_unlock(&parent->d_lock); - - if (!__debugfs_remove(child, parent)) - simple_release_fs(&debugfs_mount, &debugfs_mount_count); - - /* - * The parent->d_lock protects agaist child from unlinking - * from d_subdirs. When releasing the parent->d_lock we can - * no longer trust that the next pointer is valid. - * Restart the loop. We'll skip this one with the - * simple_positive() check. - */ - goto loop; - } - spin_unlock(&parent->d_lock); - - inode_unlock(d_inode(parent)); - child = parent; - parent = parent->d_parent; - inode_lock(d_inode(parent)); - - if (child != dentry) - /* go up */ - goto loop; - - if (!__debugfs_remove(child, parent)) - simple_release_fs(&debugfs_mount, &debugfs_mount_count); - inode_unlock(d_inode(parent)); + simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count); + simple_recursive_removal(dentry, remove_one); + simple_release_fs(&debugfs_mount, &debugfs_mount_count); } -EXPORT_SYMBOL_GPL(debugfs_remove_recursive); +EXPORT_SYMBOL_GPL(debugfs_remove); /** * debugfs_rename - rename a file/directory in the debugfs filesystem diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 67a395039268..b041b66002db 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -354,12 +354,6 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p) return container_of(p, struct ep_pqueue, pt)->epi; } -/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ -static inline int ep_op_has_event(int op) -{ - return op != EPOLL_CTL_DEL; -} - /* Initialize the poll safe wake up structure */ static void ep_nested_calls_init(struct nested_calls *ncalls) { @@ -2074,27 +2068,28 @@ SYSCALL_DEFINE1(epoll_create, int, size) return do_epoll_create(0); } -/* - * The following function implements the controller interface for - * the eventpoll file that enables the insertion/removal/change of - * file descriptors inside the interest set. - */ -SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, - struct epoll_event __user *, event) +static inline int epoll_mutex_lock(struct mutex *mutex, int depth, + bool nonblock) +{ + if (!nonblock) { + mutex_lock_nested(mutex, depth); + return 0; + } + if (mutex_trylock(mutex)) + return 0; + return -EAGAIN; +} + +int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, + bool nonblock) { int error; int full_check = 0; struct fd f, tf; struct eventpoll *ep; struct epitem *epi; - struct epoll_event epds; struct eventpoll *tep = NULL; - error = -EFAULT; - if (ep_op_has_event(op) && - copy_from_user(&epds, event, sizeof(struct epoll_event))) - goto error_return; - error = -EBADF; f = fdget(epfd); if (!f.file) @@ -2112,7 +2107,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, /* Check if EPOLLWAKEUP is allowed */ if (ep_op_has_event(op)) - ep_take_care_of_epollwakeup(&epds); + ep_take_care_of_epollwakeup(epds); /* * We have to check that the file structure underneath the file descriptor @@ -2128,11 +2123,11 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation. * Also, we do not currently supported nested exclusive wakeups. */ - if (ep_op_has_event(op) && (epds.events & EPOLLEXCLUSIVE)) { + if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) { if (op == EPOLL_CTL_MOD) goto error_tgt_fput; if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) || - (epds.events & ~EPOLLEXCLUSIVE_OK_BITS))) + (epds->events & ~EPOLLEXCLUSIVE_OK_BITS))) goto error_tgt_fput; } @@ -2157,13 +2152,17 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, * deep wakeup paths from forming in parallel through multiple * EPOLL_CTL_ADD operations. */ - mutex_lock_nested(&ep->mtx, 0); + error = epoll_mutex_lock(&ep->mtx, 0, nonblock); + if (error) + goto error_tgt_fput; if (op == EPOLL_CTL_ADD) { if (!list_empty(&f.file->f_ep_links) || is_file_epoll(tf.file)) { - full_check = 1; mutex_unlock(&ep->mtx); - mutex_lock(&epmutex); + error = epoll_mutex_lock(&epmutex, 0, nonblock); + if (error) + goto error_tgt_fput; + full_check = 1; if (is_file_epoll(tf.file)) { error = -ELOOP; if (ep_loop_check(ep, tf.file) != 0) { @@ -2173,10 +2172,19 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, } else list_add(&tf.file->f_tfile_llink, &tfile_check_list); - mutex_lock_nested(&ep->mtx, 0); + error = epoll_mutex_lock(&ep->mtx, 0, nonblock); + if (error) { +out_del: + list_del(&tf.file->f_tfile_llink); + goto error_tgt_fput; + } if (is_file_epoll(tf.file)) { tep = tf.file->private_data; - mutex_lock_nested(&tep->mtx, 1); + error = epoll_mutex_lock(&tep->mtx, 1, nonblock); + if (error) { + mutex_unlock(&ep->mtx); + goto out_del; + } } } } @@ -2192,8 +2200,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, switch (op) { case EPOLL_CTL_ADD: if (!epi) { - epds.events |= EPOLLERR | EPOLLHUP; - error = ep_insert(ep, &epds, tf.file, fd, full_check); + epds->events |= EPOLLERR | EPOLLHUP; + error = ep_insert(ep, epds, tf.file, fd, full_check); } else error = -EEXIST; if (full_check) @@ -2208,8 +2216,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, case EPOLL_CTL_MOD: if (epi) { if (!(epi->event.events & EPOLLEXCLUSIVE)) { - epds.events |= EPOLLERR | EPOLLHUP; - error = ep_modify(ep, epi, &epds); + epds->events |= EPOLLERR | EPOLLHUP; + error = ep_modify(ep, epi, epds); } } else error = -ENOENT; @@ -2232,6 +2240,23 @@ error_return: } /* + * The following function implements the controller interface for + * the eventpoll file that enables the insertion/removal/change of + * file descriptors inside the interest set. + */ +SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, + struct epoll_event __user *, event) +{ + struct epoll_event epds; + + if (ep_op_has_event(op) && + copy_from_user(&epds, event, sizeof(struct epoll_event))) + return -EFAULT; + + return do_epoll_ctl(epfd, op, fd, &epds, false); +} + +/* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_wait(2). */ diff --git a/fs/exec.c b/fs/exec.c index 74d88dab98dd..db17be51b112 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -272,7 +272,6 @@ static int __bprm_mm_init(struct linux_binprm *bprm) goto err; mm->stack_vm = mm->total_vm = 1; - arch_bprm_mm_init(mm, vma); up_write(&mm->mmap_sem); bprm->p = vma->vm_end - sizeof(void *); return 0; @@ -761,6 +760,11 @@ int setup_arg_pages(struct linux_binprm *bprm, goto out_unlock; BUG_ON(prev != vma); + if (unlikely(vm_flags & VM_EXEC)) { + pr_warn_once("process '%pD4' started with executable stack\n", + bprm->file); + } + /* Move stack pages down in memory. */ if (stack_shift) { ret = shift_arg_pages(vma, stack_shift); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index bcffe25da2f0..4a4ab683250d 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1073,9 +1073,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) if (EXT2_BLOCKS_PER_GROUP(sb) == 0) goto cantfind_ext2; - sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - - le32_to_cpu(es->s_first_data_block) - 1) - / EXT2_BLOCKS_PER_GROUP(sb)) + 1; + sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - + le32_to_cpu(es->s_first_data_block) - 1) + / EXT2_BLOCKS_PER_GROUP(sb)) + 1; db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) / EXT2_DESC_PER_BLOCK(sb); sbi->s_group_desc = kmalloc_array (db_count, @@ -1138,6 +1138,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) ext2_count_dirs(sb), GFP_KERNEL); } if (err) { + ret = err; ext2_msg(sb, KERN_ERR, "error: insufficient memory"); goto failed_mount3; } diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 2de970cfc33c..2a592e38cdfe 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -4,12 +4,7 @@ # kernels after the removal of ext3 driver. config EXT3_FS tristate "The Extended 3 (ext3) filesystem" - # These must match EXT4_FS selects... select EXT4_FS - select JBD2 - select CRC16 - select CRYPTO - select CRYPTO_CRC32C help This config option is here only for backward compatibility. ext3 filesystem is now handled by the ext4 driver. @@ -33,7 +28,6 @@ config EXT3_FS_SECURITY config EXT4_FS tristate "The Extended 4 (ext4) filesystem" - # Please update EXT3_FS selects when changing these select JBD2 select CRC16 select CRYPTO diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 0b202e00d93f..5f993a411251 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -371,7 +371,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb, if (buffer_verified(bh)) goto verified; if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group, - desc, bh))) { + desc, bh) || + ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); ext4_mark_group_bitmap_corrupted(sb, block_group, @@ -505,7 +506,9 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, if (!desc) return -EFSCORRUPTED; wait_on_buffer(bh); + ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO); if (!buffer_uptodate(bh)) { + ext4_set_errno(sb, EIO); ext4_error(sb, "Cannot read block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, (unsigned long long) bh->b_blocknr); diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 4e093277c8bf..1f340743c9a8 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -462,7 +462,6 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, new_fn->name_len = ent_name->len; new_fn->file_type = dirent->file_type; memcpy(new_fn->name, ent_name->name, ent_name->len); - new_fn->name[ent_name->len] = 0; while (*p) { parent = *p; @@ -672,9 +671,11 @@ static int ext4_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { struct qstr qstr = {.name = str, .len = len }; - struct inode *inode = dentry->d_parent->d_inode; + const struct dentry *parent = READ_ONCE(dentry->d_parent); + const struct inode *inode = READ_ONCE(parent->d_inode); - if (!IS_CASEFOLDED(inode) || !EXT4_SB(inode->i_sb)->s_encoding) { + if (!inode || !IS_CASEFOLDED(inode) || + !EXT4_SB(inode->i_sb)->s_encoding) { if (len != name->len) return -1; return memcmp(str, name->name, len); @@ -687,10 +688,11 @@ static int ext4_d_hash(const struct dentry *dentry, struct qstr *str) { const struct ext4_sb_info *sbi = EXT4_SB(dentry->d_sb); const struct unicode_map *um = sbi->s_encoding; + const struct inode *inode = READ_ONCE(dentry->d_inode); unsigned char *norm; int len, ret = 0; - if (!IS_CASEFOLDED(dentry->d_inode) || !um) + if (!inode || !IS_CASEFOLDED(inode) || !um) return 0; norm = kmalloc(PATH_MAX, GFP_ATOMIC); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f8578caba40d..9a2ee2428ecc 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1052,8 +1052,6 @@ struct ext4_inode_info { /* allocation reservation info for delalloc */ /* In case of bigalloc, this refer to clusters rather than blocks */ unsigned int i_reserved_data_blocks; - ext4_lblk_t i_da_metadata_calc_last_lblock; - int i_da_metadata_calc_len; /* pending cluster reservations for bigalloc file systems */ struct ext4_pending_tree i_pending_tree; @@ -1343,7 +1341,8 @@ struct ext4_super_block { __u8 s_lastcheck_hi; __u8 s_first_error_time_hi; __u8 s_last_error_time_hi; - __u8 s_pad[2]; + __u8 s_first_error_errcode; + __u8 s_last_error_errcode; __le16 s_encoding; /* Filename charset encoding */ __le16 s_encoding_flags; /* Filename charset encoding flags */ __le32 s_reserved[95]; /* Padding to the end of the block */ @@ -1556,6 +1555,9 @@ struct ext4_sb_info { /* Barrier between changing inodes' journal flags and writepages ops. */ struct percpu_rw_semaphore s_journal_flag_rwsem; struct dax_device *s_daxdev; +#ifdef CONFIG_EXT4_DEBUG + unsigned long s_simulate_fail; +#endif }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1575,6 +1577,66 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) } /* + * Simulate_fail codes + */ +#define EXT4_SIM_BBITMAP_EIO 1 +#define EXT4_SIM_BBITMAP_CRC 2 +#define EXT4_SIM_IBITMAP_EIO 3 +#define EXT4_SIM_IBITMAP_CRC 4 +#define EXT4_SIM_INODE_EIO 5 +#define EXT4_SIM_INODE_CRC 6 +#define EXT4_SIM_DIRBLOCK_EIO 7 +#define EXT4_SIM_DIRBLOCK_CRC 8 + +static inline bool ext4_simulate_fail(struct super_block *sb, + unsigned long code) +{ +#ifdef CONFIG_EXT4_DEBUG + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (unlikely(sbi->s_simulate_fail == code)) { + sbi->s_simulate_fail = 0; + return true; + } +#endif + return false; +} + +static inline void ext4_simulate_fail_bh(struct super_block *sb, + struct buffer_head *bh, + unsigned long code) +{ + if (!IS_ERR(bh) && ext4_simulate_fail(sb, code)) + clear_buffer_uptodate(bh); +} + +/* + * Error number codes for s_{first,last}_error_errno + * + * Linux errno numbers are architecture specific, so we need to translate + * them into something which is architecture independent. We don't define + * codes for all errno's; just the ones which are most likely to be the cause + * of an ext4_error() call. + */ +#define EXT4_ERR_UNKNOWN 1 +#define EXT4_ERR_EIO 2 +#define EXT4_ERR_ENOMEM 3 +#define EXT4_ERR_EFSBADCRC 4 +#define EXT4_ERR_EFSCORRUPTED 5 +#define EXT4_ERR_ENOSPC 6 +#define EXT4_ERR_ENOKEY 7 +#define EXT4_ERR_EROFS 8 +#define EXT4_ERR_EFBIG 9 +#define EXT4_ERR_EEXIST 10 +#define EXT4_ERR_ERANGE 11 +#define EXT4_ERR_EOVERFLOW 12 +#define EXT4_ERR_EBUSY 13 +#define EXT4_ERR_ENOTDIR 14 +#define EXT4_ERR_ENOTEMPTY 15 +#define EXT4_ERR_ESHUTDOWN 16 +#define EXT4_ERR_EFAULT 17 + +/* * Inode dynamic state flags */ enum { @@ -2628,7 +2690,6 @@ extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); -extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); extern void ext4_ind_truncate(handle_t *, struct inode *inode); extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, @@ -2679,8 +2740,6 @@ extern struct buffer_head *ext4_sb_bread(struct super_block *sb, extern int ext4_seq_options_show(struct seq_file *seq, void *offset); extern int ext4_calculate_overhead(struct super_block *sb); extern void ext4_superblock_csum_set(struct super_block *sb); -extern void *ext4_kvmalloc(size_t size, gfp_t flags); -extern void *ext4_kvzalloc(size_t size, gfp_t flags); extern int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup); extern const char *ext4_decode_error(struct super_block *sb, int errno, @@ -2688,6 +2747,7 @@ extern const char *ext4_decode_error(struct super_block *sb, int errno, extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, ext4_group_t block_group, unsigned int flags); +extern void ext4_set_errno(struct super_block *sb, int err); extern __printf(4, 5) void __ext4_error(struct super_block *, const char *, unsigned int, @@ -3254,7 +3314,6 @@ struct ext4_extent; #define EXT_MAX_BLOCKS 0xffffffff extern int ext4_ext_tree_init(handle_t *handle, struct inode *); -extern int ext4_ext_writepage_trans_blocks(struct inode *, int); extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); @@ -3271,14 +3330,9 @@ extern int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); -extern int ext4_ext_calc_metadata_amount(struct inode *inode, - ext4_lblk_t lblocks); extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, struct ext4_ext_path *path); -extern int ext4_can_extents_be_merged(struct inode *inode, - struct ext4_extent *ex1, - struct ext4_extent *ex2); extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path **, struct ext4_extent *, int); @@ -3294,8 +3348,6 @@ extern int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); extern int ext4_ext_precache(struct inode *inode); -extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); -extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, ext4_lblk_t count, @@ -3390,6 +3442,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) } extern const struct iomap_ops ext4_iomap_ops; +extern const struct iomap_ops ext4_iomap_overwrite_ops; extern const struct iomap_ops ext4_iomap_report_ops; static inline int ext4_buffer_uptodate(struct buffer_head *bh) diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 98bd0e9ee7df..1c216fcc202a 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -267,10 +267,5 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, 0xffff); } -#define ext4_ext_dirty(handle, inode, path) \ - __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) -int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, - struct inode *inode, struct ext4_ext_path *path); - #endif /* _EXT4_EXTENTS */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index d3b8cdea5df7..1f53d64e42a5 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -7,6 +7,28 @@ #include <trace/events/ext4.h> +int ext4_inode_journal_mode(struct inode *inode) +{ + if (EXT4_JOURNAL(inode) == NULL) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + /* We do not support data journalling with delayed allocation */ + if (!S_ISREG(inode->i_mode) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || + test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && + !test_opt(inode->i_sb, DELALLOC))) { + /* We do not support data journalling for encrypted data */ + if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ + return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + } + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ + BUG(); +} + /* Just increment the non-pointer handle value */ static handle_t *ext4_get_nojournal(void) { @@ -58,6 +80,7 @@ static int ext4_journal_check_start(struct super_block *sb) * take the FS itself readonly cleanly. */ if (journal && is_journal_aborted(journal)) { + ext4_set_errno(sb, -journal->j_errno); ext4_abort(sb, "Detected aborted journal"); return -EROFS; } @@ -249,6 +272,7 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle, if (err) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); + ext4_set_errno(inode->i_sb, -err); __ext4_abort(inode->i_sb, where, line, "error %d when attempting revoke", err); } @@ -320,6 +344,7 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, es = EXT4_SB(inode->i_sb)->s_es; es->s_last_error_block = cpu_to_le64(bh->b_blocknr); + ext4_set_errno(inode->i_sb, EIO); ext4_error_inode(inode, where, line, bh->b_blocknr, "IO error syncing itable block"); diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index a6b9b66dbfad..7ea4f6fa173b 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -463,27 +463,7 @@ int ext4_force_commit(struct super_block *sb); #define EXT4_INODE_ORDERED_DATA_MODE 0x02 /* ordered data mode */ #define EXT4_INODE_WRITEBACK_DATA_MODE 0x04 /* writeback data mode */ -static inline int ext4_inode_journal_mode(struct inode *inode) -{ - if (EXT4_JOURNAL(inode) == NULL) - return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ - /* We do not support data journalling with delayed allocation */ - if (!S_ISREG(inode->i_mode) || - ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || - test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || - (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && - !test_opt(inode->i_sb, DELALLOC))) { - /* We do not support data journalling for encrypted data */ - if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) - return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ - return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ - } - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) - return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ - if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) - return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ - BUG(); -} +int ext4_inode_journal_mode(struct inode *inode); static inline int ext4_should_journal_data(struct inode *inode) { diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0e8708b77da6..954013d6076b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -161,8 +161,9 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode, * - ENOMEM * - EIO */ -int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, - struct inode *inode, struct ext4_ext_path *path) +static int __ext4_ext_dirty(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) { int err; @@ -179,6 +180,9 @@ int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle, return err; } +#define ext4_ext_dirty(handle, inode, path) \ + __ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path)) + static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) @@ -309,53 +313,6 @@ ext4_force_split_extent_at(handle_t *handle, struct inode *inode, (nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0)); } -/* - * Calculate the number of metadata blocks needed - * to allocate @blocks - * Worse case is one block per extent - */ -int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - int idxs; - - idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) - / sizeof(struct ext4_extent_idx)); - - /* - * If the new delayed allocation block is contiguous with the - * previous da block, it can share index blocks with the - * previous block, so we only need to allocate a new index - * block every idxs leaf blocks. At ldxs**2 blocks, we need - * an additional index block, and at ldxs**3 blocks, yet - * another index blocks. - */ - if (ei->i_da_metadata_calc_len && - ei->i_da_metadata_calc_last_lblock+1 == lblock) { - int num = 0; - - if ((ei->i_da_metadata_calc_len % idxs) == 0) - num++; - if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0) - num++; - if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) { - num++; - ei->i_da_metadata_calc_len = 0; - } else - ei->i_da_metadata_calc_len++; - ei->i_da_metadata_calc_last_lblock++; - return num; - } - - /* - * In the worst case we need a new set of index blocks at - * every level of the inode's extent tree. - */ - ei->i_da_metadata_calc_len = 1; - ei->i_da_metadata_calc_last_lblock = lblock; - return ext_depth(inode) + 1; -} - static int ext4_ext_max_entries(struct inode *inode, int depth) { @@ -492,6 +449,7 @@ static int __ext4_ext_check(const char *function, unsigned int line, return 0; corrupted: + ext4_set_errno(inode->i_sb, -err); ext4_error_inode(inode, function, line, 0, "pblk %llu bad header/extent: %s - magic %x, " "entries %u, max %u(%u), depth %u(%u)", @@ -510,6 +468,30 @@ int ext4_ext_check_inode(struct inode *inode) return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0); } +static void ext4_cache_extents(struct inode *inode, + struct ext4_extent_header *eh) +{ + struct ext4_extent *ex = EXT_FIRST_EXTENT(eh); + ext4_lblk_t prev = 0; + int i; + + for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) { + unsigned int status = EXTENT_STATUS_WRITTEN; + ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); + int len = ext4_ext_get_actual_len(ex); + + if (prev && (prev != lblk)) + ext4_es_cache_extent(inode, prev, lblk - prev, ~0, + EXTENT_STATUS_HOLE); + + if (ext4_ext_is_unwritten(ex)) + status = EXTENT_STATUS_UNWRITTEN; + ext4_es_cache_extent(inode, lblk, len, + ext4_ext_pblock(ex), status); + prev = lblk + len; + } +} + static struct buffer_head * __read_extent_tree_block(const char *function, unsigned int line, struct inode *inode, ext4_fsblk_t pblk, int depth, @@ -544,26 +526,7 @@ __read_extent_tree_block(const char *function, unsigned int line, */ if (!(flags & EXT4_EX_NOCACHE) && depth == 0) { struct ext4_extent_header *eh = ext_block_hdr(bh); - struct ext4_extent *ex = EXT_FIRST_EXTENT(eh); - ext4_lblk_t prev = 0; - int i; - - for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) { - unsigned int status = EXTENT_STATUS_WRITTEN; - ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); - int len = ext4_ext_get_actual_len(ex); - - if (prev && (prev != lblk)) - ext4_es_cache_extent(inode, prev, - lblk - prev, ~0, - EXTENT_STATUS_HOLE); - - if (ext4_ext_is_unwritten(ex)) - status = EXTENT_STATUS_UNWRITTEN; - ext4_es_cache_extent(inode, lblk, len, - ext4_ext_pblock(ex), status); - prev = lblk + len; - } + ext4_cache_extents(inode, eh); } return bh; errout: @@ -649,8 +612,9 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) ext_debug("path:"); for (k = 0; k <= l; k++, path++) { if (path->p_idx) { - ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), - ext4_idx_pblock(path->p_idx)); + ext_debug(" %d->%llu", + le32_to_cpu(path->p_idx->ei_block), + ext4_idx_pblock(path->p_idx)); } else if (path->p_ext) { ext_debug(" %d:[%d]%d:%llu ", le32_to_cpu(path->p_ext->ee_block), @@ -731,11 +695,12 @@ void ext4_ext_drop_refs(struct ext4_ext_path *path) if (!path) return; depth = path->p_depth; - for (i = 0; i <= depth; i++, path++) + for (i = 0; i <= depth; i++, path++) { if (path->p_bh) { brelse(path->p_bh); path->p_bh = NULL; } + } } /* @@ -777,8 +742,8 @@ ext4_ext_binsearch_idx(struct inode *inode, chix = ix = EXT_FIRST_INDEX(eh); for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { - if (k != 0 && - le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { + if (k != 0 && le32_to_cpu(ix->ei_block) <= + le32_to_cpu(ix[-1].ei_block)) { printk(KERN_DEBUG "k=%d, ix=0x%p, " "first=0x%p\n", k, ix, EXT_FIRST_INDEX(eh)); @@ -911,6 +876,8 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, path[0].p_bh = NULL; i = depth; + if (!(flags & EXT4_EX_NOCACHE) && depth == 0) + ext4_cache_extents(inode, eh); /* walk through the tree */ while (i) { ext_debug("depth %d: num %d, max %d\n", @@ -1632,17 +1599,16 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path) return EXT_MAX_BLOCKS; while (depth >= 0) { + struct ext4_ext_path *p = &path[depth]; + if (depth == path->p_depth) { /* leaf */ - if (path[depth].p_ext && - path[depth].p_ext != - EXT_LAST_EXTENT(path[depth].p_hdr)) - return le32_to_cpu(path[depth].p_ext[1].ee_block); + if (p->p_ext && p->p_ext != EXT_LAST_EXTENT(p->p_hdr)) + return le32_to_cpu(p->p_ext[1].ee_block); } else { /* index */ - if (path[depth].p_idx != - EXT_LAST_INDEX(path[depth].p_hdr)) - return le32_to_cpu(path[depth].p_idx[1].ei_block); + if (p->p_idx != EXT_LAST_INDEX(p->p_hdr)) + return le32_to_cpu(p->p_idx[1].ei_block); } depth--; } @@ -1742,9 +1708,9 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, return err; } -int -ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, - struct ext4_extent *ex2) +static int ext4_can_extents_be_merged(struct inode *inode, + struct ext4_extent *ex1, + struct ext4_extent *ex2) { unsigned short ext1_ee_len, ext2_ee_len; @@ -1758,11 +1724,6 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, le32_to_cpu(ex2->ee_block)) return 0; - /* - * To allow future support for preallocated extents to be added - * as an RO_COMPAT feature, refuse to merge to extents if - * this can result in the top bit of ee_len being set. - */ if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) return 0; @@ -1870,13 +1831,14 @@ static void ext4_ext_try_to_merge_up(handle_t *handle, } /* - * This function tries to merge the @ex extent to neighbours in the tree. - * return 1 if merge left else 0. + * This function tries to merge the @ex extent to neighbours in the tree, then + * tries to collapse the extent tree into the inode. */ static void ext4_ext_try_to_merge(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, - struct ext4_extent *ex) { + struct ext4_extent *ex) +{ struct ext4_extent_header *eh; unsigned int depth; int merge_done = 0; @@ -3718,9 +3680,6 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, max_zeroout = sbi->s_extent_max_zeroout_kb >> (inode->i_sb->s_blocksize_bits - 10); - if (IS_ENCRYPTED(inode)) - max_zeroout = 0; - /* * five cases: * 1. split the extent into three extents. @@ -4706,6 +4665,10 @@ retry: return ret > 0 ? ret2 : ret; } +static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); + +static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); + static long ext4_zero_range(struct file *file, loff_t offset, loff_t len, int mode) { @@ -4723,9 +4686,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, trace_ext4_zero_range(inode, offset, len, mode); - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - /* Call ext4_force_commit to flush all data in case of data=journal. */ if (ext4_should_journal_data(inode)) { ret = ext4_force_commit(inode->i_sb); @@ -4765,7 +4725,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, } if (!(mode & FALLOC_FL_KEEP_SIZE) && - (offset + len > i_size_read(inode) || + (offset + len > inode->i_size || offset + len > EXT4_I(inode)->i_disksize)) { new_size = offset + len; ret = inode_newsize_ok(inode, new_size); @@ -4849,7 +4809,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, * Mark that we allocate beyond EOF so the subsequent truncate * can proceed even if the new size is the same as i_size. */ - if ((offset + len) > i_size_read(inode)) + if (offset + len > inode->i_size) ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); } ext4_mark_inode_dirty(handle, inode); @@ -4890,14 +4850,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) * range since we would need to re-encrypt blocks with a * different IV or XTS tweak (which are based on the logical * block number). - * - * XXX It's not clear why zero range isn't working, but we'll - * leave it disabled for encrypted inodes for now. This is a - * bug we should fix.... */ if (IS_ENCRYPTED(inode) && - (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE | - FALLOC_FL_ZERO_RANGE))) + (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; /* Return error if mode is not supported */ @@ -4941,7 +4896,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) } if (!(mode & FALLOC_FL_KEEP_SIZE) && - (offset + len > i_size_read(inode) || + (offset + len > inode->i_size || offset + len > EXT4_I(inode)->i_disksize)) { new_size = offset + len; ret = inode_newsize_ok(inode, new_size); @@ -5268,7 +5223,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, { int depth, err = 0; struct ext4_extent *ex_start, *ex_last; - bool update = 0; + bool update = false; depth = path->p_depth; while (depth >= 0) { @@ -5284,7 +5239,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, goto out; if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) - update = 1; + update = true; while (ex_start <= ex_last) { if (SHIFT == SHIFT_LEFT) { @@ -5472,7 +5427,7 @@ out: * This implements the fallocate's collapse range functionality for ext4 * Returns: 0 and non-zero on error. */ -int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) +static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) { struct super_block *sb = inode->i_sb; ext4_lblk_t punch_start, punch_stop; @@ -5489,12 +5444,8 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; - /* Collapse range works only on fs block size aligned offsets. */ - if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) || - len & (EXT4_CLUSTER_SIZE(sb) - 1)) - return -EINVAL; - - if (!S_ISREG(inode->i_mode)) + /* Collapse range works only on fs cluster size aligned regions. */ + if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; trace_ext4_collapse_range(inode, offset, len); @@ -5514,7 +5465,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) * There is no need to overlap collapse range with EOF, in which case * it is effectively a truncate operation */ - if (offset + len >= i_size_read(inode)) { + if (offset + len >= inode->i_size) { ret = -EINVAL; goto out_mutex; } @@ -5592,7 +5543,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) goto out_stop; } - new_size = i_size_read(inode) - len; + new_size = inode->i_size - len; i_size_write(inode, new_size); EXT4_I(inode)->i_disksize = new_size; @@ -5620,7 +5571,7 @@ out_mutex: * by len bytes. * Returns 0 on success, error otherwise. */ -int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) +static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) { struct super_block *sb = inode->i_sb; handle_t *handle; @@ -5639,14 +5590,10 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; - /* Insert range works only on fs block size aligned offsets. */ - if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) || - len & (EXT4_CLUSTER_SIZE(sb) - 1)) + /* Insert range works only on fs cluster size aligned regions. */ + if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; - if (!S_ISREG(inode->i_mode)) - return -EOPNOTSUPP; - trace_ext4_insert_range(inode, offset, len); offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb); @@ -5666,14 +5613,14 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) goto out_mutex; } - /* Check for wrap through zero */ - if (inode->i_size + len > inode->i_sb->s_maxbytes) { + /* Check whether the maximum file size would be exceeded */ + if (len > inode->i_sb->s_maxbytes - inode->i_size) { ret = -EFBIG; goto out_mutex; } - /* Offset should be less than i_size */ - if (offset >= i_size_read(inode)) { + /* Offset must be less than i_size */ + if (offset >= inode->i_size) { ret = -EINVAL; goto out_mutex; } diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 825313c59752..4ec30a798260 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -209,6 +209,12 @@ static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) return es->es_pblk & ~ES_MASK; } +static inline ext4_fsblk_t ext4_es_show_pblock(struct extent_status *es) +{ + ext4_fsblk_t pblock = ext4_es_pblock(es); + return pblock == ~ES_MASK ? 0 : pblock; +} + static inline void ext4_es_store_pblock(struct extent_status *es, ext4_fsblk_t pb) { diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 6a7293a5cda2..5f225881176b 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -88,9 +88,10 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; - if (!inode_trylock_shared(inode)) { - if (iocb->ki_flags & IOCB_NOWAIT) + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock_shared(inode)) return -EAGAIN; + } else { inode_lock_shared(inode); } /* @@ -165,19 +166,25 @@ static int ext4_release_file(struct inode *inode, struct file *filp) * threads are at work on the same unwritten block, they must be synchronized * or one thread will zero the other's data, causing corruption. */ -static int -ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) +static bool +ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos) { struct super_block *sb = inode->i_sb; - int blockmask = sb->s_blocksize - 1; - - if (pos >= ALIGN(i_size_read(inode), sb->s_blocksize)) - return 0; + unsigned long blockmask = sb->s_blocksize - 1; if ((pos | iov_iter_alignment(from)) & blockmask) - return 1; + return true; - return 0; + return false; +} + +static bool +ext4_extending_io(struct inode *inode, loff_t offset, size_t len) +{ + if (offset + len > i_size_read(inode) || + offset + len > EXT4_I(inode)->i_disksize) + return true; + return false; } /* Is IO overwriting allocated and initialized blocks? */ @@ -203,7 +210,8 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len) return err == blklen && (map.m_flags & EXT4_MAP_MAPPED); } -static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) +static ssize_t ext4_generic_write_checks(struct kiocb *iocb, + struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; @@ -227,11 +235,21 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); } + return iov_iter_count(from); +} + +static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) +{ + ssize_t ret, count; + + count = ext4_generic_write_checks(iocb, from); + if (count <= 0) + return count; + ret = file_modified(iocb->ki_filp); if (ret) return ret; - - return iov_iter_count(from); + return count; } static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, @@ -363,62 +381,137 @@ static const struct iomap_dio_ops ext4_dio_write_ops = { .end_io = ext4_dio_write_end_io, }; +/* + * The intention here is to start with shared lock acquired then see if any + * condition requires an exclusive inode lock. If yes, then we restart the + * whole operation by releasing the shared lock and acquiring exclusive lock. + * + * - For unaligned_io we never take shared lock as it may cause data corruption + * when two unaligned IO tries to modify the same block e.g. while zeroing. + * + * - For extending writes case we don't take the shared lock, since it requires + * updating inode i_disksize and/or orphan handling with exclusive lock. + * + * - shared locking will only be true mostly with overwrites. Otherwise we will + * switch to exclusive i_rwsem lock. + */ +static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, + bool *ilock_shared, bool *extend) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + loff_t offset; + size_t count; + ssize_t ret; + +restart: + ret = ext4_generic_write_checks(iocb, from); + if (ret <= 0) + goto out; + + offset = iocb->ki_pos; + count = ret; + if (ext4_extending_io(inode, offset, count)) + *extend = true; + /* + * Determine whether the IO operation will overwrite allocated + * and initialized blocks. + * We need exclusive i_rwsem for changing security info + * in file_modified(). + */ + if (*ilock_shared && (!IS_NOSEC(inode) || *extend || + !ext4_overwrite_io(inode, offset, count))) { + inode_unlock_shared(inode); + *ilock_shared = false; + inode_lock(inode); + goto restart; + } + + ret = file_modified(file); + if (ret < 0) + goto out; + + return count; +out: + if (*ilock_shared) + inode_unlock_shared(inode); + else + inode_unlock(inode); + return ret; +} + static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret; - size_t count; - loff_t offset; handle_t *handle; struct inode *inode = file_inode(iocb->ki_filp); - bool extend = false, overwrite = false, unaligned_aio = false; + loff_t offset = iocb->ki_pos; + size_t count = iov_iter_count(from); + const struct iomap_ops *iomap_ops = &ext4_iomap_ops; + bool extend = false, unaligned_io = false; + bool ilock_shared = true; + + /* + * We initially start with shared inode lock unless it is + * unaligned IO which needs exclusive lock anyways. + */ + if (ext4_unaligned_io(inode, from, offset)) { + unaligned_io = true; + ilock_shared = false; + } + /* + * Quick check here without any i_rwsem lock to see if it is extending + * IO. A more reliable check is done in ext4_dio_write_checks() with + * proper locking in place. + */ + if (offset + count > i_size_read(inode)) + ilock_shared = false; if (iocb->ki_flags & IOCB_NOWAIT) { - if (!inode_trylock(inode)) - return -EAGAIN; + if (ilock_shared) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + } else { + if (!inode_trylock(inode)) + return -EAGAIN; + } } else { - inode_lock(inode); + if (ilock_shared) + inode_lock_shared(inode); + else + inode_lock(inode); } + /* Fallback to buffered I/O if the inode does not support direct I/O. */ if (!ext4_dio_supported(inode)) { - inode_unlock(inode); - /* - * Fallback to buffered I/O if the inode does not support - * direct I/O. - */ + if (ilock_shared) + inode_unlock_shared(inode); + else + inode_unlock(inode); return ext4_buffered_write_iter(iocb, from); } - ret = ext4_write_checks(iocb, from); - if (ret <= 0) { - inode_unlock(inode); + ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend); + if (ret <= 0) return ret; - } - /* - * Unaligned asynchronous direct I/O must be serialized among each - * other as the zeroing of partial blocks of two competing unaligned - * asynchronous direct I/O writes can result in data corruption. - */ offset = iocb->ki_pos; - count = iov_iter_count(from); - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && - !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) { - unaligned_aio = true; - inode_dio_wait(inode); - } + count = ret; /* - * Determine whether the I/O will overwrite allocated and initialized - * blocks. If so, check to see whether it is possible to take the - * dioread_nolock path. + * Unaligned direct IO must be serialized among each other as zeroing + * of partial blocks of two competing unaligned IOs can result in data + * corruption. + * + * So we make sure we don't allow any unaligned IO in flight. + * For IOs where we need not wait (like unaligned non-AIO DIO), + * below inode_dio_wait() may anyway become a no-op, since we start + * with exclusive lock. */ - if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) && - ext4_should_dioread_nolock(inode)) { - overwrite = true; - downgrade_write(&inode->i_rwsem); - } + if (unaligned_io) + inode_dio_wait(inode); - if (offset + count > EXT4_I(inode)->i_disksize) { + if (extend) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { ret = PTR_ERR(handle); @@ -431,18 +524,19 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out; } - extend = true; ext4_journal_stop(handle); } - ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops, - is_sync_kiocb(iocb) || unaligned_aio || extend); + if (ilock_shared) + iomap_ops = &ext4_iomap_overwrite_ops; + ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, + is_sync_kiocb(iocb) || unaligned_io || extend); if (extend) ret = ext4_handle_inode_extension(inode, offset, ret, count); out: - if (overwrite) + if (ilock_shared) inode_unlock_shared(inode); else inode_unlock(inode); @@ -487,9 +581,10 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) bool extend = false; struct inode *inode = file_inode(iocb->ki_filp); - if (!inode_trylock(inode)) { - if (iocb->ki_flags & IOCB_NOWAIT) + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) return -EAGAIN; + } else { inode_lock(inode); } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 8ca4a23129aa..c66e8f9451a2 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -94,7 +94,8 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, goto verified; blk = ext4_inode_bitmap(sb, desc); if (!ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh, - EXT4_INODES_PER_GROUP(sb) / 8)) { + EXT4_INODES_PER_GROUP(sb) / 8) || + ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) { ext4_unlock_group(sb, block_group); ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " "inode_bitmap = %llu", block_group, blk); @@ -192,8 +193,10 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) get_bh(bh); submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); + ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO); if (!buffer_uptodate(bh)) { put_bh(bh); + ext4_set_errno(sb, EIO); ext4_error(sb, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); @@ -1223,6 +1226,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { err = PTR_ERR(inode); + ext4_set_errno(sb, -err); ext4_error(sb, "couldn't read orphan inode %lu (err %d)", ino, err); return inode; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 3a4ab70fe9e0..569fc68e8975 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -660,32 +660,6 @@ out: } /* - * Calculate the number of metadata blocks need to reserve - * to allocate a new block at @lblocks for non extent file based file - */ -int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); - int blk_bits; - - if (lblock < EXT4_NDIR_BLOCKS) - return 0; - - lblock -= EXT4_NDIR_BLOCKS; - - if (ei->i_da_metadata_calc_len && - (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { - ei->i_da_metadata_calc_len++; - return 0; - } - ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; - ei->i_da_metadata_calc_len = 1; - blk_bits = order_base_2(lblock); - return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; -} - -/* * Calculate number of indirect blocks touched by mapping @nrblocks logically * contiguous blocks */ diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 2fec62d764fa..fad82d08fca5 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -98,6 +98,7 @@ int ext4_get_max_inline_size(struct inode *inode) error = ext4_get_inode_loc(inode, &iloc); if (error) { + ext4_set_errno(inode->i_sb, -error); ext4_error_inode(inode, __func__, __LINE__, 0, "can't get inode location %lu", inode->i_ino); @@ -849,7 +850,7 @@ out: /* * Prepare the write for the inline data. - * If the the data can be written into the inode, we just read + * If the data can be written into the inode, we just read * the page and make it uptodate, and start the journal. * Otherwise read the page, makes it dirty so that it can be * handle in writepages(the i_disksize update is left to the @@ -1761,6 +1762,7 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data) err = ext4_get_inode_loc(dir, &iloc); if (err) { + ext4_set_errno(dir->i_sb, -err); EXT4_ERROR_INODE(dir, "error %d getting inode %lu block", err, dir->i_ino); return true; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 629a25d999f0..3313168b680f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -48,8 +48,6 @@ #include <trace/events/ext4.h> -#define MPAGE_DA_EXTENT_TAIL 0x01 - static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { @@ -271,6 +269,7 @@ void ext4_evict_inode(struct inode *inode) if (inode->i_blocks) { err = ext4_truncate(inode); if (err) { + ext4_set_errno(inode->i_sb, -err); ext4_error(inode->i_sb, "couldn't truncate inode %lu (err %d)", inode->i_ino, err); @@ -402,7 +401,7 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, { int ret; - if (IS_ENCRYPTED(inode)) + if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) return fscrypt_zeroout_range(inode, lblk, pblk, len); ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); @@ -2478,10 +2477,12 @@ update_disksize: EXT4_I(inode)->i_disksize = disksize; up_write(&EXT4_I(inode)->i_data_sem); err2 = ext4_mark_inode_dirty(handle, inode); - if (err2) + if (err2) { + ext4_set_errno(inode->i_sb, -err2); ext4_error(inode->i_sb, "Failed to mark inode %lu dirty", inode->i_ino); + } if (!err) err = err2; } @@ -3448,6 +3449,22 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, return 0; } +static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, + loff_t length, unsigned flags, struct iomap *iomap, + struct iomap *srcmap) +{ + int ret; + + /* + * Even for writes we don't need to allocate blocks, so just pretend + * we are reading to save overhead of starting a transaction. + */ + flags &= ~IOMAP_WRITE; + ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap); + WARN_ON_ONCE(iomap->type != IOMAP_MAPPED); + return ret; +} + static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { @@ -3469,6 +3486,11 @@ const struct iomap_ops ext4_iomap_ops = { .iomap_end = ext4_iomap_end, }; +const struct iomap_ops ext4_iomap_overwrite_ops = { + .iomap_begin = ext4_iomap_overwrite_begin, + .iomap_end = ext4_iomap_end, +}; + static bool ext4_iomap_is_delalloc(struct inode *inode, struct ext4_map_blocks *map) { @@ -3701,8 +3723,12 @@ static int __ext4_block_zero_page_range(handle_t *handle, if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) { /* We expect the key to be set. */ BUG_ON(!fscrypt_has_encryption_key(inode)); - WARN_ON_ONCE(fscrypt_decrypt_pagecache_blocks( - page, blocksize, bh_offset(bh))); + err = fscrypt_decrypt_pagecache_blocks(page, blocksize, + bh_offset(bh)); + if (err) { + clear_buffer_uptodate(bh); + goto unlock; + } } } if (ext4_should_journal_data(inode)) { @@ -3912,9 +3938,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) unsigned int credits; int ret = 0; - if (!S_ISREG(inode->i_mode)) - return -EOPNOTSUPP; - trace_ext4_punch_hole(inode, offset, length, 0); ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); @@ -4240,6 +4263,8 @@ static int __ext4_get_inode_loc(struct inode *inode, bh = sb_getblk(sb, block); if (unlikely(!bh)) return -ENOMEM; + if (ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO)) + goto simulate_eio; if (!buffer_uptodate(bh)) { lock_buffer(bh); @@ -4338,6 +4363,8 @@ make_io: blk_finish_plug(&plug); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { + simulate_eio: + ext4_set_errno(inode->i_sb, EIO); EXT4_ERROR_INODE_BLOCK(inode, block, "unable to read itable block"); brelse(bh); @@ -4551,7 +4578,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, sizeof(gen)); } - if (!ext4_inode_csum_verify(inode, raw_inode, ei)) { + if (!ext4_inode_csum_verify(inode, raw_inode, ei) || + ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) { + ext4_set_errno(inode->i_sb, EFSBADCRC); ext4_error_inode(inode, function, line, 0, "iget: checksum invalid"); ret = -EFSBADCRC; @@ -5090,6 +5119,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { + ext4_set_errno(inode->i_sb, EIO); EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, "IO error syncing inode"); err = -EIO; @@ -5368,7 +5398,8 @@ int ext4_getattr(const struct path *path, struct kstat *stat, struct ext4_inode_info *ei = EXT4_I(inode); unsigned int flags; - if (EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) { + if ((request_mask & STATX_BTIME) && + EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) { stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = ei->i_crtime.tv_sec; stat->btime.tv_nsec = ei->i_crtime.tv_nsec; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index e8870fff8224..a0ec750018dd 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1377,6 +1377,8 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_CLEAR_ES_CACHE: case EXT4_IOC_GETSTATE: case EXT4_IOC_GET_ES_CACHE: + case EXT4_IOC_FSGETXATTR: + case EXT4_IOC_FSSETXATTR: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index a3e2767bdf2f..f64838187559 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3895,6 +3895,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); + ext4_set_errno(sb, -err); ext4_error(sb, "Error %d reading block bitmap for %u", err, group); return 0; @@ -4063,6 +4064,7 @@ repeat: err = ext4_mb_load_buddy_gfp(sb, group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) { + ext4_set_errno(sb, -err); ext4_error(sb, "Error %d loading buddy information for %u", err, group); continue; @@ -4071,6 +4073,7 @@ repeat: bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); + ext4_set_errno(sb, -err); ext4_error(sb, "Error %d reading block bitmap for %u", err, group); ext4_mb_unload_buddy(&e4b); @@ -4325,6 +4328,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, err = ext4_mb_load_buddy_gfp(sb, group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) { + ext4_set_errno(sb, -err); ext4_error(sb, "Error %d loading buddy information for %u", err, group); continue; diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 2305b4374fd3..1c44b1a32001 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -173,8 +173,10 @@ static int kmmpd(void *data) * (s_mmp_update_interval * 60) seconds. */ if (retval) { - if ((failed_writes % 60) == 0) + if ((failed_writes % 60) == 0) { + ext4_set_errno(sb, -retval); ext4_error(sb, "Error writing to MMP block"); + } failed_writes++; } @@ -205,6 +207,7 @@ static int kmmpd(void *data) retval = read_mmp_block(sb, &bh_check, mmp_block); if (retval) { + ext4_set_errno(sb, -retval); ext4_error(sb, "error reading MMP data: %d", retval); goto exit_thread; @@ -218,6 +221,7 @@ static int kmmpd(void *data) "Error while updating MMP info. " "The filesystem seems to have been" " multiply mounted."); + ext4_set_errno(sb, EBUSY); ext4_error(sb, "abort"); put_bh(bh_check); retval = -EBUSY; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 1cb42d940784..129d2ebae00d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -109,7 +109,10 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, struct ext4_dir_entry *dirent; int is_dx_block = 0; - bh = ext4_bread(NULL, inode, block, 0); + if (ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_EIO)) + bh = ERR_PTR(-EIO); + else + bh = ext4_bread(NULL, inode, block, 0); if (IS_ERR(bh)) { __ext4_warning(inode->i_sb, func, line, "inode #%lu: lblock %lu: comm %s: " @@ -153,9 +156,11 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, * caller is sure it should be an index block. */ if (is_dx_block && type == INDEX) { - if (ext4_dx_csum_verify(inode, dirent)) + if (ext4_dx_csum_verify(inode, dirent) && + !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC)) set_buffer_verified(bh); else { + ext4_set_errno(inode->i_sb, EFSBADCRC); ext4_error_inode(inode, func, line, block, "Directory index failed checksum"); brelse(bh); @@ -163,9 +168,11 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, } } if (!is_dx_block) { - if (ext4_dirblock_csum_verify(inode, bh)) + if (ext4_dirblock_csum_verify(inode, bh) && + !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC)) set_buffer_verified(bh); else { + ext4_set_errno(inode->i_sb, EFSBADCRC); ext4_error_inode(inode, func, line, block, "Directory block failed checksum"); brelse(bh); @@ -1002,7 +1009,6 @@ static int htree_dirblock_to_tree(struct file *dir_file, top = (struct ext4_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0)); -#ifdef CONFIG_FS_ENCRYPTION /* Check if the directory is encrypted */ if (IS_ENCRYPTED(dir)) { err = fscrypt_get_encryption_info(dir); @@ -1017,7 +1023,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, return err; } } -#endif + for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, bh->b_size, @@ -1065,9 +1071,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, } errout: brelse(bh); -#ifdef CONFIG_FS_ENCRYPTION fscrypt_fname_free_buffer(&fname_crypto_str); -#endif return count; } @@ -1527,6 +1531,7 @@ restart: goto next; wait_on_buffer(bh); if (!buffer_uptodate(bh)) { + ext4_set_errno(sb, EIO); EXT4_ERROR_INODE(dir, "reading directory lblock %lu", (unsigned long) block); brelse(bh); @@ -1537,6 +1542,7 @@ restart: !is_dx_internal_node(dir, block, (struct ext4_dir_entry *)bh->b_data) && !ext4_dirblock_csum_verify(dir, bh)) { + ext4_set_errno(sb, EFSBADCRC); EXT4_ERROR_INODE(dir, "checksumming directory " "block %lu", (unsigned long)block); brelse(bh); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 24aeedb8fc75..68b39e75446a 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -512,17 +512,26 @@ int ext4_bio_write_page(struct ext4_io_submit *io, gfp_t gfp_flags = GFP_NOFS; unsigned int enc_bytes = round_up(len, i_blocksize(inode)); + /* + * Since bounce page allocation uses a mempool, we can only use + * a waiting mask (i.e. request guaranteed allocation) on the + * first page of the bio. Otherwise it can deadlock. + */ + if (io->io_bio) + gfp_flags = GFP_NOWAIT | __GFP_NOWARN; retry_encrypt: bounce_page = fscrypt_encrypt_pagecache_blocks(page, enc_bytes, 0, gfp_flags); if (IS_ERR(bounce_page)) { ret = PTR_ERR(bounce_page); - if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) { - if (io->io_bio) { + if (ret == -ENOMEM && + (io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) { + gfp_flags = GFP_NOFS; + if (io->io_bio) ext4_io_submit(io); - congestion_wait(BLK_RW_ASYNC, HZ/50); - } - gfp_flags |= __GFP_NOFAIL; + else + gfp_flags |= __GFP_NOFAIL; + congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry_encrypt; } diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index fef7755300c3..c1769afbf799 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -57,6 +57,7 @@ enum bio_post_read_step { STEP_INITIAL = 0, STEP_DECRYPT, STEP_VERITY, + STEP_MAX, }; struct bio_post_read_ctx { @@ -106,10 +107,22 @@ static void verity_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); + struct bio *bio = ctx->bio; - fsverity_verify_bio(ctx->bio); + /* + * fsverity_verify_bio() may call readpages() again, and although verity + * will be disabled for that, decryption may still be needed, causing + * another bio_post_read_ctx to be allocated. So to guarantee that + * mempool_alloc() never deadlocks we must free the current ctx first. + * This is safe because verity is the last post-read step. + */ + BUILD_BUG_ON(STEP_VERITY + 1 != STEP_MAX); + mempool_free(ctx, bio_post_read_ctx_pool); + bio->bi_private = NULL; - bio_post_read_processing(ctx); + fsverity_verify_bio(bio); + + __read_end_io(bio); } static void bio_post_read_processing(struct bio_post_read_ctx *ctx) @@ -176,12 +189,11 @@ static inline bool ext4_need_verity(const struct inode *inode, pgoff_t idx) idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); } -static struct bio_post_read_ctx *get_bio_post_read_ctx(struct inode *inode, - struct bio *bio, - pgoff_t first_idx) +static void ext4_set_bio_post_read_ctx(struct bio *bio, + const struct inode *inode, + pgoff_t first_idx) { unsigned int post_read_steps = 0; - struct bio_post_read_ctx *ctx = NULL; if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) post_read_steps |= 1 << STEP_DECRYPT; @@ -190,14 +202,14 @@ static struct bio_post_read_ctx *get_bio_post_read_ctx(struct inode *inode, post_read_steps |= 1 << STEP_VERITY; if (post_read_steps) { - ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); - if (!ctx) - return ERR_PTR(-ENOMEM); + /* Due to the mempool, this never fails. */ + struct bio_post_read_ctx *ctx = + mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); + ctx->bio = bio; ctx->enabled_steps = post_read_steps; bio->bi_private = ctx; } - return ctx; } static inline loff_t ext4_readpage_limit(struct inode *inode) @@ -358,24 +370,16 @@ int ext4_mpage_readpages(struct address_space *mapping, bio = NULL; } if (bio == NULL) { - struct bio_post_read_ctx *ctx; - /* * bio_alloc will _always_ be able to allocate a bio if * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset(). */ bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); - ctx = get_bio_post_read_ctx(inode, bio, page->index); - if (IS_ERR(ctx)) { - bio_put(bio); - bio = NULL; - goto set_error_page; - } + ext4_set_bio_post_read_ctx(bio, inode, page->index); bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; - bio->bi_private = ctx; bio_set_op_attrs(bio, REQ_OP_READ, is_readahead ? REQ_RAHEAD : 0); } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index a8c0f2b5b6e1..86a2500ed292 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -824,9 +824,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, if (unlikely(err)) goto errout; - n_group_desc = ext4_kvmalloc((gdb_num + 1) * - sizeof(struct buffer_head *), - GFP_NOFS); + n_group_desc = kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *), + GFP_KERNEL); if (!n_group_desc) { err = -ENOMEM; ext4_warning(sb, "not enough memory for %lu groups", @@ -900,9 +899,8 @@ static int add_new_gdb_meta_bg(struct super_block *sb, gdb_bh = ext4_sb_bread(sb, gdblock, 0); if (IS_ERR(gdb_bh)) return PTR_ERR(gdb_bh); - n_group_desc = ext4_kvmalloc((gdb_num + 1) * - sizeof(struct buffer_head *), - GFP_NOFS); + n_group_desc = kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *), + GFP_KERNEL); if (!n_group_desc) { brelse(gdb_bh); err = -ENOMEM; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2937a8873fe1..8434217549b3 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -154,7 +154,7 @@ ext4_sb_bread(struct super_block *sb, sector_t block, int op_flags) if (bh == NULL) return ERR_PTR(-ENOMEM); - if (buffer_uptodate(bh)) + if (ext4_buffer_uptodate(bh)) return bh; ll_rw_block(REQ_OP_READ, REQ_META | op_flags, 1, &bh); wait_on_buffer(bh); @@ -204,26 +204,6 @@ void ext4_superblock_csum_set(struct super_block *sb) es->s_checksum = ext4_superblock_csum(sb, es); } -void *ext4_kvmalloc(size_t size, gfp_t flags) -{ - void *ret; - - ret = kmalloc(size, flags | __GFP_NOWARN); - if (!ret) - ret = __vmalloc(size, flags, PAGE_KERNEL); - return ret; -} - -void *ext4_kvzalloc(size_t size, gfp_t flags) -{ - void *ret; - - ret = kzalloc(size, flags | __GFP_NOWARN); - if (!ret) - ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); - return ret; -} - ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, struct ext4_group_desc *bg) { @@ -367,6 +347,8 @@ static void __save_error_info(struct super_block *sb, const char *func, ext4_update_tstamp(es, s_last_error_time); strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); es->s_last_error_line = cpu_to_le32(line); + if (es->s_last_error_errcode == 0) + es->s_last_error_errcode = EXT4_ERR_EFSCORRUPTED; if (!es->s_first_error_time) { es->s_first_error_time = es->s_last_error_time; es->s_first_error_time_hi = es->s_last_error_time_hi; @@ -375,6 +357,7 @@ static void __save_error_info(struct super_block *sb, const char *func, es->s_first_error_line = cpu_to_le32(line); es->s_first_error_ino = es->s_last_error_ino; es->s_first_error_block = es->s_last_error_block; + es->s_first_error_errcode = es->s_last_error_errcode; } /* * Start the daily error reporting function if it hasn't been @@ -631,6 +614,66 @@ const char *ext4_decode_error(struct super_block *sb, int errno, return errstr; } +void ext4_set_errno(struct super_block *sb, int err) +{ + if (err < 0) + err = -err; + + switch (err) { + case EIO: + err = EXT4_ERR_EIO; + break; + case ENOMEM: + err = EXT4_ERR_ENOMEM; + break; + case EFSBADCRC: + err = EXT4_ERR_EFSBADCRC; + break; + case EFSCORRUPTED: + err = EXT4_ERR_EFSCORRUPTED; + break; + case ENOSPC: + err = EXT4_ERR_ENOSPC; + break; + case ENOKEY: + err = EXT4_ERR_ENOKEY; + break; + case EROFS: + err = EXT4_ERR_EROFS; + break; + case EFBIG: + err = EXT4_ERR_EFBIG; + break; + case EEXIST: + err = EXT4_ERR_EEXIST; + break; + case ERANGE: + err = EXT4_ERR_ERANGE; + break; + case EOVERFLOW: + err = EXT4_ERR_EOVERFLOW; + break; + case EBUSY: + err = EXT4_ERR_EBUSY; + break; + case ENOTDIR: + err = EXT4_ERR_ENOTDIR; + break; + case ENOTEMPTY: + err = EXT4_ERR_ENOTEMPTY; + break; + case ESHUTDOWN: + err = EXT4_ERR_ESHUTDOWN; + break; + case EFAULT: + err = EXT4_ERR_EFAULT; + break; + default: + err = EXT4_ERR_UNKNOWN; + } + EXT4_SB(sb)->s_es->s_last_error_errcode = err; +} + /* __ext4_std_error decodes expected errors from journaling functions * automatically and invokes the appropriate error response. */ @@ -655,6 +698,7 @@ void __ext4_std_error(struct super_block *sb, const char *function, sb->s_id, function, line, errstr); } + ext4_set_errno(sb, -errno); save_error_info(sb, function, line); ext4_handle_error(sb); } @@ -982,8 +1026,10 @@ static void ext4_put_super(struct super_block *sb) aborted = is_journal_aborted(sbi->s_journal); err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; - if ((err < 0) && !aborted) + if ((err < 0) && !aborted) { + ext4_set_errno(sb, -err); ext4_abort(sb, "Couldn't clean up the journal"); + } } ext4_unregister_sysfs(sb); @@ -1085,8 +1131,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_es_shk_nr = 0; ei->i_es_shrink_lblk = 0; ei->i_reserved_data_blocks = 0; - ei->i_da_metadata_calc_len = 0; - ei->i_da_metadata_calc_last_lblock = 0; spin_lock_init(&(ei->i_block_reservation_lock)); ext4_init_pending_tree(&ei->i_pending_tree); #ifdef CONFIG_QUOTA @@ -1548,6 +1592,7 @@ static const match_table_t tokens = { {Opt_auto_da_alloc, "auto_da_alloc"}, {Opt_noauto_da_alloc, "noauto_da_alloc"}, {Opt_dioread_nolock, "dioread_nolock"}, + {Opt_dioread_lock, "nodioread_nolock"}, {Opt_dioread_lock, "dioread_lock"}, {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, @@ -3720,6 +3765,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_opt(sb, NO_UID32); /* xattr user namespace & acls are now defaulted on */ set_opt(sb, XATTR_USER); + set_opt(sb, DIOREAD_NOLOCK); #ifdef CONFIG_EXT4_FS_POSIX_ACL set_opt(sb, POSIX_ACL); #endif @@ -3887,9 +3933,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) #endif if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { - printk_once(KERN_WARNING "EXT4-fs: Warning: mounting " - "with data=journal disables delayed " - "allocation and O_DIRECT support!\n"); + printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, and O_DIRECT support!\n"); + clear_opt(sb, DIOREAD_NOLOCK); if (test_opt2(sb, EXPLICIT_DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and delalloc"); @@ -5540,9 +5585,15 @@ static int ext4_statfs_project(struct super_block *sb, return PTR_ERR(dquot); spin_lock(&dquot->dq_dqb_lock); - limit = (dquot->dq_dqb.dqb_bsoftlimit ? - dquot->dq_dqb.dqb_bsoftlimit : - dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits; + limit = 0; + if (dquot->dq_dqb.dqb_bsoftlimit && + (!limit || dquot->dq_dqb.dqb_bsoftlimit < limit)) + limit = dquot->dq_dqb.dqb_bsoftlimit; + if (dquot->dq_dqb.dqb_bhardlimit && + (!limit || dquot->dq_dqb.dqb_bhardlimit < limit)) + limit = dquot->dq_dqb.dqb_bhardlimit; + limit >>= sb->s_blocksize_bits; + if (limit && buf->f_blocks > limit) { curblock = (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; @@ -5552,9 +5603,14 @@ static int ext4_statfs_project(struct super_block *sb, (buf->f_blocks - curblock) : 0; } - limit = dquot->dq_dqb.dqb_isoftlimit ? - dquot->dq_dqb.dqb_isoftlimit : - dquot->dq_dqb.dqb_ihardlimit; + limit = 0; + if (dquot->dq_dqb.dqb_isoftlimit && + (!limit || dquot->dq_dqb.dqb_isoftlimit < limit)) + limit = dquot->dq_dqb.dqb_isoftlimit; + if (dquot->dq_dqb.dqb_ihardlimit && + (!limit || dquot->dq_dqb.dqb_ihardlimit < limit)) + limit = dquot->dq_dqb.dqb_ihardlimit; + if (limit && buf->f_files > limit) { buf->f_files = limit; buf->f_ffree = @@ -5987,7 +6043,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, bh = ext4_bread(handle, inode, blk, EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL); - } while (IS_ERR(bh) && (PTR_ERR(bh) == -ENOSPC) && + } while (PTR_ERR(bh) == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)); if (IS_ERR(bh)) return PTR_ERR(bh); diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index eb1efad0e20a..d218ebdafa4a 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -29,6 +29,10 @@ typedef enum { attr_last_error_time, attr_feature, attr_pointer_ui, + attr_pointer_ul, + attr_pointer_u64, + attr_pointer_u8, + attr_pointer_string, attr_pointer_atomic, attr_journal_task, } attr_id_t; @@ -46,6 +50,7 @@ struct ext4_attr { struct attribute attr; short attr_id; short attr_ptr; + unsigned short attr_size; union { int offset; void *explicit_ptr; @@ -154,12 +159,35 @@ static struct ext4_attr ext4_attr_##_name = { \ }, \ } +#define EXT4_ATTR_STRING(_name,_mode,_size,_struct,_elname) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .attr_id = attr_pointer_string, \ + .attr_size = _size, \ + .attr_ptr = ptr_##_struct##_offset, \ + .u = { \ + .offset = offsetof(struct _struct, _elname),\ + }, \ +} + #define EXT4_RO_ATTR_ES_UI(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0444, pointer_ui, ext4_super_block, _elname) +#define EXT4_RO_ATTR_ES_U8(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0444, pointer_u8, ext4_super_block, _elname) + +#define EXT4_RO_ATTR_ES_U64(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0444, pointer_u64, ext4_super_block, _elname) + +#define EXT4_RO_ATTR_ES_STRING(_name,_elname,_size) \ + EXT4_ATTR_STRING(_name, 0444, _size, ext4_super_block, _elname) + #define EXT4_RW_ATTR_SBI_UI(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname) +#define EXT4_RW_ATTR_SBI_UL(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0644, pointer_ul, ext4_sb_info, _elname) + #define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \ static struct ext4_attr ext4_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ @@ -194,7 +222,20 @@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.int EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); +#ifdef CONFIG_EXT4_DEBUG +EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail); +#endif EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); +EXT4_RO_ATTR_ES_U8(first_error_errcode, s_first_error_errcode); +EXT4_RO_ATTR_ES_U8(last_error_errcode, s_last_error_errcode); +EXT4_RO_ATTR_ES_UI(first_error_ino, s_first_error_ino); +EXT4_RO_ATTR_ES_UI(last_error_ino, s_last_error_ino); +EXT4_RO_ATTR_ES_U64(first_error_block, s_first_error_block); +EXT4_RO_ATTR_ES_U64(last_error_block, s_last_error_block); +EXT4_RO_ATTR_ES_UI(first_error_line, s_first_error_line); +EXT4_RO_ATTR_ES_UI(last_error_line, s_last_error_line); +EXT4_RO_ATTR_ES_STRING(first_error_func, s_first_error_func, 32); +EXT4_RO_ATTR_ES_STRING(last_error_func, s_last_error_func, 32); EXT4_ATTR(first_error_time, 0444, first_error_time); EXT4_ATTR(last_error_time, 0444, last_error_time); EXT4_ATTR(journal_task, 0444, journal_task); @@ -225,9 +266,22 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(msg_ratelimit_interval_ms), ATTR_LIST(msg_ratelimit_burst), ATTR_LIST(errors_count), + ATTR_LIST(first_error_ino), + ATTR_LIST(last_error_ino), + ATTR_LIST(first_error_block), + ATTR_LIST(last_error_block), + ATTR_LIST(first_error_line), + ATTR_LIST(last_error_line), + ATTR_LIST(first_error_func), + ATTR_LIST(last_error_func), + ATTR_LIST(first_error_errcode), + ATTR_LIST(last_error_errcode), ATTR_LIST(first_error_time), ATTR_LIST(last_error_time), ATTR_LIST(journal_task), +#ifdef CONFIG_EXT4_DEBUG + ATTR_LIST(simulate_fail), +#endif NULL, }; ATTRIBUTE_GROUPS(ext4); @@ -280,7 +334,7 @@ static void *calc_ptr(struct ext4_attr *a, struct ext4_sb_info *sbi) static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi) { - return snprintf(buf, PAGE_SIZE, "%lld", + return snprintf(buf, PAGE_SIZE, "%lld\n", ((time64_t)hi << 32) + le32_to_cpu(lo)); } @@ -318,6 +372,30 @@ static ssize_t ext4_attr_show(struct kobject *kobj, else return snprintf(buf, PAGE_SIZE, "%u\n", *((unsigned int *) ptr)); + case attr_pointer_ul: + if (!ptr) + return 0; + return snprintf(buf, PAGE_SIZE, "%lu\n", + *((unsigned long *) ptr)); + case attr_pointer_u8: + if (!ptr) + return 0; + return snprintf(buf, PAGE_SIZE, "%u\n", + *((unsigned char *) ptr)); + case attr_pointer_u64: + if (!ptr) + return 0; + if (a->attr_ptr == ptr_ext4_super_block_offset) + return snprintf(buf, PAGE_SIZE, "%llu\n", + le64_to_cpup(ptr)); + else + return snprintf(buf, PAGE_SIZE, "%llu\n", + *((unsigned long long *) ptr)); + case attr_pointer_string: + if (!ptr) + return 0; + return snprintf(buf, PAGE_SIZE, "%.*s\n", a->attr_size, + (char *) ptr); case attr_pointer_atomic: if (!ptr) return 0; @@ -361,6 +439,14 @@ static ssize_t ext4_attr_store(struct kobject *kobj, else *((unsigned int *) ptr) = t; return len; + case attr_pointer_ul: + if (!ptr) + return 0; + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + *((unsigned long *) ptr) = t; + return len; case attr_inode_readahead: return inode_readahead_blks_store(sbi, buf, len); case attr_trigger_test_error: diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 8966a5439a22..8cac7d95c3ad 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1456,7 +1456,7 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value, if (!ce) return NULL; - ea_data = ext4_kvmalloc(value_len, GFP_NOFS); + ea_data = kvmalloc(value_len, GFP_KERNEL); if (!ea_data) { mb_cache_entry_put(ea_inode_cache, ce); return NULL; @@ -2879,9 +2879,11 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) { error = PTR_ERR(bh); - if (error == -EIO) + if (error == -EIO) { + ext4_set_errno(inode->i_sb, EIO); EXT4_ERROR_INODE(inode, "block %llu read error", EXT4_I(inode)->i_file_acl); + } bh = NULL; goto cleanup; } diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 599fb9194c6a..f0faada30f30 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -22,7 +22,7 @@ config F2FS_FS config F2FS_STAT_FS bool "F2FS Status Information" - depends on F2FS_FS && DEBUG_FS + depends on F2FS_FS default y help /sys/kernel/debug/f2fs/ contains information about all the partitions @@ -93,3 +93,28 @@ config F2FS_FAULT_INJECTION Test F2FS to inject faults such as ENOMEM, ENOSPC, and so on. If unsure, say N. + +config F2FS_FS_COMPRESSION + bool "F2FS compression feature" + depends on F2FS_FS + help + Enable filesystem-level compression on f2fs regular files, + multiple back-end compression algorithms are supported. + +config F2FS_FS_LZO + bool "LZO compression support" + depends on F2FS_FS_COMPRESSION + select LZO_COMPRESS + select LZO_DECOMPRESS + default y + help + Support LZO compress algorithm, if unsure, say Y. + +config F2FS_FS_LZ4 + bool "LZ4 compression support" + depends on F2FS_FS_COMPRESSION + select LZ4_COMPRESS + select LZ4_DECOMPRESS + default y + help + Support LZ4 compress algorithm, if unsure, say Y. diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index 2aaecc63834f..ee7316b42f69 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -9,3 +9,4 @@ f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o f2fs-$(CONFIG_FS_VERITY) += verity.o +f2fs-$(CONFIG_F2FS_FS_COMPRESSION) += compress.o diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index ffdaba0c55d2..44e84ac5c941 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1509,10 +1509,10 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_wait_on_all_pages_writeback(sbi); /* - * invalidate intermediate page cache borrowed from meta inode - * which are used for migration of encrypted inode's blocks. + * invalidate intermediate page cache borrowed from meta inode which are + * used for migration of encrypted or verity inode's blocks. */ - if (f2fs_sb_has_encrypt(sbi)) + if (f2fs_sb_has_encrypt(sbi) || f2fs_sb_has_verity(sbi)) invalidate_mapping_pages(META_MAPPING(sbi), MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1); diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c new file mode 100644 index 000000000000..d8a64be90a50 --- /dev/null +++ b/fs/f2fs/compress.c @@ -0,0 +1,1176 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * f2fs compress support + * + * Copyright (c) 2019 Chao Yu <chao@kernel.org> + */ + +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/writeback.h> +#include <linux/backing-dev.h> +#include <linux/lzo.h> +#include <linux/lz4.h> + +#include "f2fs.h" +#include "node.h" +#include <trace/events/f2fs.h> + +struct f2fs_compress_ops { + int (*init_compress_ctx)(struct compress_ctx *cc); + void (*destroy_compress_ctx)(struct compress_ctx *cc); + int (*compress_pages)(struct compress_ctx *cc); + int (*decompress_pages)(struct decompress_io_ctx *dic); +}; + +static unsigned int offset_in_cluster(struct compress_ctx *cc, pgoff_t index) +{ + return index & (cc->cluster_size - 1); +} + +static pgoff_t cluster_idx(struct compress_ctx *cc, pgoff_t index) +{ + return index >> cc->log_cluster_size; +} + +static pgoff_t start_idx_of_cluster(struct compress_ctx *cc) +{ + return cc->cluster_idx << cc->log_cluster_size; +} + +bool f2fs_is_compressed_page(struct page *page) +{ + if (!PagePrivate(page)) + return false; + if (!page_private(page)) + return false; + if (IS_ATOMIC_WRITTEN_PAGE(page) || IS_DUMMY_WRITTEN_PAGE(page)) + return false; + f2fs_bug_on(F2FS_M_SB(page->mapping), + *((u32 *)page_private(page)) != F2FS_COMPRESSED_PAGE_MAGIC); + return true; +} + +static void f2fs_set_compressed_page(struct page *page, + struct inode *inode, pgoff_t index, void *data, refcount_t *r) +{ + SetPagePrivate(page); + set_page_private(page, (unsigned long)data); + + /* i_crypto_info and iv index */ + page->index = index; + page->mapping = inode->i_mapping; + if (r) + refcount_inc(r); +} + +static void f2fs_put_compressed_page(struct page *page) +{ + set_page_private(page, (unsigned long)NULL); + ClearPagePrivate(page); + page->mapping = NULL; + unlock_page(page); + put_page(page); +} + +static void f2fs_drop_rpages(struct compress_ctx *cc, int len, bool unlock) +{ + int i; + + for (i = 0; i < len; i++) { + if (!cc->rpages[i]) + continue; + if (unlock) + unlock_page(cc->rpages[i]); + else + put_page(cc->rpages[i]); + } +} + +static void f2fs_put_rpages(struct compress_ctx *cc) +{ + f2fs_drop_rpages(cc, cc->cluster_size, false); +} + +static void f2fs_unlock_rpages(struct compress_ctx *cc, int len) +{ + f2fs_drop_rpages(cc, len, true); +} + +static void f2fs_put_rpages_mapping(struct compress_ctx *cc, + struct address_space *mapping, + pgoff_t start, int len) +{ + int i; + + for (i = 0; i < len; i++) { + struct page *page = find_get_page(mapping, start + i); + + put_page(page); + put_page(page); + } +} + +static void f2fs_put_rpages_wbc(struct compress_ctx *cc, + struct writeback_control *wbc, bool redirty, int unlock) +{ + unsigned int i; + + for (i = 0; i < cc->cluster_size; i++) { + if (!cc->rpages[i]) + continue; + if (redirty) + redirty_page_for_writepage(wbc, cc->rpages[i]); + f2fs_put_page(cc->rpages[i], unlock); + } +} + +struct page *f2fs_compress_control_page(struct page *page) +{ + return ((struct compress_io_ctx *)page_private(page))->rpages[0]; +} + +int f2fs_init_compress_ctx(struct compress_ctx *cc) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); + + if (cc->nr_rpages) + return 0; + + cc->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) << + cc->log_cluster_size, GFP_NOFS); + return cc->rpages ? 0 : -ENOMEM; +} + +void f2fs_destroy_compress_ctx(struct compress_ctx *cc) +{ + kfree(cc->rpages); + cc->rpages = NULL; + cc->nr_rpages = 0; + cc->nr_cpages = 0; + cc->cluster_idx = NULL_CLUSTER; +} + +void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page) +{ + unsigned int cluster_ofs; + + if (!f2fs_cluster_can_merge_page(cc, page->index)) + f2fs_bug_on(F2FS_I_SB(cc->inode), 1); + + cluster_ofs = offset_in_cluster(cc, page->index); + cc->rpages[cluster_ofs] = page; + cc->nr_rpages++; + cc->cluster_idx = cluster_idx(cc, page->index); +} + +#ifdef CONFIG_F2FS_FS_LZO +static int lzo_init_compress_ctx(struct compress_ctx *cc) +{ + cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), + LZO1X_MEM_COMPRESS, GFP_NOFS); + if (!cc->private) + return -ENOMEM; + + cc->clen = lzo1x_worst_compress(PAGE_SIZE << cc->log_cluster_size); + return 0; +} + +static void lzo_destroy_compress_ctx(struct compress_ctx *cc) +{ + kvfree(cc->private); + cc->private = NULL; +} + +static int lzo_compress_pages(struct compress_ctx *cc) +{ + int ret; + + ret = lzo1x_1_compress(cc->rbuf, cc->rlen, cc->cbuf->cdata, + &cc->clen, cc->private); + if (ret != LZO_E_OK) { + printk_ratelimited("%sF2FS-fs (%s): lzo compress failed, ret:%d\n", + KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, ret); + return -EIO; + } + return 0; +} + +static int lzo_decompress_pages(struct decompress_io_ctx *dic) +{ + int ret; + + ret = lzo1x_decompress_safe(dic->cbuf->cdata, dic->clen, + dic->rbuf, &dic->rlen); + if (ret != LZO_E_OK) { + printk_ratelimited("%sF2FS-fs (%s): lzo decompress failed, ret:%d\n", + KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, ret); + return -EIO; + } + + if (dic->rlen != PAGE_SIZE << dic->log_cluster_size) { + printk_ratelimited("%sF2FS-fs (%s): lzo invalid rlen:%zu, " + "expected:%lu\n", KERN_ERR, + F2FS_I_SB(dic->inode)->sb->s_id, + dic->rlen, + PAGE_SIZE << dic->log_cluster_size); + return -EIO; + } + return 0; +} + +static const struct f2fs_compress_ops f2fs_lzo_ops = { + .init_compress_ctx = lzo_init_compress_ctx, + .destroy_compress_ctx = lzo_destroy_compress_ctx, + .compress_pages = lzo_compress_pages, + .decompress_pages = lzo_decompress_pages, +}; +#endif + +#ifdef CONFIG_F2FS_FS_LZ4 +static int lz4_init_compress_ctx(struct compress_ctx *cc) +{ + cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), + LZ4_MEM_COMPRESS, GFP_NOFS); + if (!cc->private) + return -ENOMEM; + + cc->clen = LZ4_compressBound(PAGE_SIZE << cc->log_cluster_size); + return 0; +} + +static void lz4_destroy_compress_ctx(struct compress_ctx *cc) +{ + kvfree(cc->private); + cc->private = NULL; +} + +static int lz4_compress_pages(struct compress_ctx *cc) +{ + int len; + + len = LZ4_compress_default(cc->rbuf, cc->cbuf->cdata, cc->rlen, + cc->clen, cc->private); + if (!len) { + printk_ratelimited("%sF2FS-fs (%s): lz4 compress failed\n", + KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id); + return -EIO; + } + cc->clen = len; + return 0; +} + +static int lz4_decompress_pages(struct decompress_io_ctx *dic) +{ + int ret; + + ret = LZ4_decompress_safe(dic->cbuf->cdata, dic->rbuf, + dic->clen, dic->rlen); + if (ret < 0) { + printk_ratelimited("%sF2FS-fs (%s): lz4 decompress failed, ret:%d\n", + KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, ret); + return -EIO; + } + + if (ret != PAGE_SIZE << dic->log_cluster_size) { + printk_ratelimited("%sF2FS-fs (%s): lz4 invalid rlen:%zu, " + "expected:%lu\n", KERN_ERR, + F2FS_I_SB(dic->inode)->sb->s_id, + dic->rlen, + PAGE_SIZE << dic->log_cluster_size); + return -EIO; + } + return 0; +} + +static const struct f2fs_compress_ops f2fs_lz4_ops = { + .init_compress_ctx = lz4_init_compress_ctx, + .destroy_compress_ctx = lz4_destroy_compress_ctx, + .compress_pages = lz4_compress_pages, + .decompress_pages = lz4_decompress_pages, +}; +#endif + +static const struct f2fs_compress_ops *f2fs_cops[COMPRESS_MAX] = { +#ifdef CONFIG_F2FS_FS_LZO + &f2fs_lzo_ops, +#else + NULL, +#endif +#ifdef CONFIG_F2FS_FS_LZ4 + &f2fs_lz4_ops, +#else + NULL, +#endif +}; + +bool f2fs_is_compress_backend_ready(struct inode *inode) +{ + if (!f2fs_compressed_file(inode)) + return true; + return f2fs_cops[F2FS_I(inode)->i_compress_algorithm]; +} + +static struct page *f2fs_grab_page(void) +{ + struct page *page; + + page = alloc_page(GFP_NOFS); + if (!page) + return NULL; + lock_page(page); + return page; +} + +static int f2fs_compress_pages(struct compress_ctx *cc) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); + struct f2fs_inode_info *fi = F2FS_I(cc->inode); + const struct f2fs_compress_ops *cops = + f2fs_cops[fi->i_compress_algorithm]; + unsigned int max_len, nr_cpages; + int i, ret; + + trace_f2fs_compress_pages_start(cc->inode, cc->cluster_idx, + cc->cluster_size, fi->i_compress_algorithm); + + ret = cops->init_compress_ctx(cc); + if (ret) + goto out; + + max_len = COMPRESS_HEADER_SIZE + cc->clen; + cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE); + + cc->cpages = f2fs_kzalloc(sbi, sizeof(struct page *) * + cc->nr_cpages, GFP_NOFS); + if (!cc->cpages) { + ret = -ENOMEM; + goto destroy_compress_ctx; + } + + for (i = 0; i < cc->nr_cpages; i++) { + cc->cpages[i] = f2fs_grab_page(); + if (!cc->cpages[i]) { + ret = -ENOMEM; + goto out_free_cpages; + } + } + + cc->rbuf = vmap(cc->rpages, cc->cluster_size, VM_MAP, PAGE_KERNEL_RO); + if (!cc->rbuf) { + ret = -ENOMEM; + goto out_free_cpages; + } + + cc->cbuf = vmap(cc->cpages, cc->nr_cpages, VM_MAP, PAGE_KERNEL); + if (!cc->cbuf) { + ret = -ENOMEM; + goto out_vunmap_rbuf; + } + + ret = cops->compress_pages(cc); + if (ret) + goto out_vunmap_cbuf; + + max_len = PAGE_SIZE * (cc->cluster_size - 1) - COMPRESS_HEADER_SIZE; + + if (cc->clen > max_len) { + ret = -EAGAIN; + goto out_vunmap_cbuf; + } + + cc->cbuf->clen = cpu_to_le32(cc->clen); + cc->cbuf->chksum = cpu_to_le32(0); + + for (i = 0; i < COMPRESS_DATA_RESERVED_SIZE; i++) + cc->cbuf->reserved[i] = cpu_to_le32(0); + + vunmap(cc->cbuf); + vunmap(cc->rbuf); + + nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE); + + for (i = nr_cpages; i < cc->nr_cpages; i++) { + f2fs_put_compressed_page(cc->cpages[i]); + cc->cpages[i] = NULL; + } + + cc->nr_cpages = nr_cpages; + + trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx, + cc->clen, ret); + return 0; + +out_vunmap_cbuf: + vunmap(cc->cbuf); +out_vunmap_rbuf: + vunmap(cc->rbuf); +out_free_cpages: + for (i = 0; i < cc->nr_cpages; i++) { + if (cc->cpages[i]) + f2fs_put_compressed_page(cc->cpages[i]); + } + kfree(cc->cpages); + cc->cpages = NULL; +destroy_compress_ctx: + cops->destroy_compress_ctx(cc); +out: + trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx, + cc->clen, ret); + return ret; +} + +void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) +{ + struct decompress_io_ctx *dic = + (struct decompress_io_ctx *)page_private(page); + struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); + struct f2fs_inode_info *fi= F2FS_I(dic->inode); + const struct f2fs_compress_ops *cops = + f2fs_cops[fi->i_compress_algorithm]; + int ret; + + dec_page_count(sbi, F2FS_RD_DATA); + + if (bio->bi_status || PageError(page)) + dic->failed = true; + + if (refcount_dec_not_one(&dic->ref)) + return; + + trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx, + dic->cluster_size, fi->i_compress_algorithm); + + /* submit partial compressed pages */ + if (dic->failed) { + ret = -EIO; + goto out_free_dic; + } + + dic->rbuf = vmap(dic->tpages, dic->cluster_size, VM_MAP, PAGE_KERNEL); + if (!dic->rbuf) { + ret = -ENOMEM; + goto out_free_dic; + } + + dic->cbuf = vmap(dic->cpages, dic->nr_cpages, VM_MAP, PAGE_KERNEL_RO); + if (!dic->cbuf) { + ret = -ENOMEM; + goto out_vunmap_rbuf; + } + + dic->clen = le32_to_cpu(dic->cbuf->clen); + dic->rlen = PAGE_SIZE << dic->log_cluster_size; + + if (dic->clen > PAGE_SIZE * dic->nr_cpages - COMPRESS_HEADER_SIZE) { + ret = -EFSCORRUPTED; + goto out_vunmap_cbuf; + } + + ret = cops->decompress_pages(dic); + +out_vunmap_cbuf: + vunmap(dic->cbuf); +out_vunmap_rbuf: + vunmap(dic->rbuf); +out_free_dic: + if (!verity) + f2fs_decompress_end_io(dic->rpages, dic->cluster_size, + ret, false); + + trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx, + dic->clen, ret); + if (!verity) + f2fs_free_dic(dic); +} + +static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index) +{ + if (cc->cluster_idx == NULL_CLUSTER) + return true; + return cc->cluster_idx == cluster_idx(cc, index); +} + +bool f2fs_cluster_is_empty(struct compress_ctx *cc) +{ + return cc->nr_rpages == 0; +} + +static bool f2fs_cluster_is_full(struct compress_ctx *cc) +{ + return cc->cluster_size == cc->nr_rpages; +} + +bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index) +{ + if (f2fs_cluster_is_empty(cc)) + return true; + return is_page_in_cluster(cc, index); +} + +static bool __cluster_may_compress(struct compress_ctx *cc) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); + loff_t i_size = i_size_read(cc->inode); + unsigned nr_pages = DIV_ROUND_UP(i_size, PAGE_SIZE); + int i; + + for (i = 0; i < cc->cluster_size; i++) { + struct page *page = cc->rpages[i]; + + f2fs_bug_on(sbi, !page); + + if (unlikely(f2fs_cp_error(sbi))) + return false; + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + return false; + + /* beyond EOF */ + if (page->index >= nr_pages) + return false; + } + return true; +} + +/* return # of compressed block addresses */ +static int f2fs_compressed_blocks(struct compress_ctx *cc) +{ + struct dnode_of_data dn; + int ret; + + set_new_dnode(&dn, cc->inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, start_idx_of_cluster(cc), + LOOKUP_NODE); + if (ret) { + if (ret == -ENOENT) + ret = 0; + goto fail; + } + + if (dn.data_blkaddr == COMPRESS_ADDR) { + int i; + + ret = 1; + for (i = 1; i < cc->cluster_size; i++) { + block_t blkaddr; + + blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node + i); + if (blkaddr != NULL_ADDR) + ret++; + } + } +fail: + f2fs_put_dnode(&dn); + return ret; +} + +int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index) +{ + struct compress_ctx cc = { + .inode = inode, + .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, + .cluster_size = F2FS_I(inode)->i_cluster_size, + .cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size, + }; + + return f2fs_compressed_blocks(&cc); +} + +static bool cluster_may_compress(struct compress_ctx *cc) +{ + if (!f2fs_compressed_file(cc->inode)) + return false; + if (f2fs_is_atomic_file(cc->inode)) + return false; + if (f2fs_is_mmap_file(cc->inode)) + return false; + if (!f2fs_cluster_is_full(cc)) + return false; + return __cluster_may_compress(cc); +} + +static void set_cluster_writeback(struct compress_ctx *cc) +{ + int i; + + for (i = 0; i < cc->cluster_size; i++) { + if (cc->rpages[i]) + set_page_writeback(cc->rpages[i]); + } +} + +static void set_cluster_dirty(struct compress_ctx *cc) +{ + int i; + + for (i = 0; i < cc->cluster_size; i++) + if (cc->rpages[i]) + set_page_dirty(cc->rpages[i]); +} + +static int prepare_compress_overwrite(struct compress_ctx *cc, + struct page **pagep, pgoff_t index, void **fsdata) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); + struct address_space *mapping = cc->inode->i_mapping; + struct page *page; + struct dnode_of_data dn; + sector_t last_block_in_bio; + unsigned fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT; + pgoff_t start_idx = start_idx_of_cluster(cc); + int i, ret; + bool prealloc; + +retry: + ret = f2fs_compressed_blocks(cc); + if (ret <= 0) + return ret; + + /* compressed case */ + prealloc = (ret < cc->cluster_size); + + ret = f2fs_init_compress_ctx(cc); + if (ret) + return ret; + + /* keep page reference to avoid page reclaim */ + for (i = 0; i < cc->cluster_size; i++) { + page = f2fs_pagecache_get_page(mapping, start_idx + i, + fgp_flag, GFP_NOFS); + if (!page) { + ret = -ENOMEM; + goto unlock_pages; + } + + if (PageUptodate(page)) + unlock_page(page); + else + f2fs_compress_ctx_add_page(cc, page); + } + + if (!f2fs_cluster_is_empty(cc)) { + struct bio *bio = NULL; + + ret = f2fs_read_multi_pages(cc, &bio, cc->cluster_size, + &last_block_in_bio, false); + f2fs_destroy_compress_ctx(cc); + if (ret) + goto release_pages; + if (bio) + f2fs_submit_bio(sbi, bio, DATA); + + ret = f2fs_init_compress_ctx(cc); + if (ret) + goto release_pages; + } + + for (i = 0; i < cc->cluster_size; i++) { + f2fs_bug_on(sbi, cc->rpages[i]); + + page = find_lock_page(mapping, start_idx + i); + f2fs_bug_on(sbi, !page); + + f2fs_wait_on_page_writeback(page, DATA, true, true); + + f2fs_compress_ctx_add_page(cc, page); + f2fs_put_page(page, 0); + + if (!PageUptodate(page)) { + f2fs_unlock_rpages(cc, i + 1); + f2fs_put_rpages_mapping(cc, mapping, start_idx, + cc->cluster_size); + f2fs_destroy_compress_ctx(cc); + goto retry; + } + } + + if (prealloc) { + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + + set_new_dnode(&dn, cc->inode, NULL, NULL, 0); + + for (i = cc->cluster_size - 1; i > 0; i--) { + ret = f2fs_get_block(&dn, start_idx + i); + if (ret) { + i = cc->cluster_size; + break; + } + + if (dn.data_blkaddr != NEW_ADDR) + break; + } + + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); + } + + if (likely(!ret)) { + *fsdata = cc->rpages; + *pagep = cc->rpages[offset_in_cluster(cc, index)]; + return cc->cluster_size; + } + +unlock_pages: + f2fs_unlock_rpages(cc, i); +release_pages: + f2fs_put_rpages_mapping(cc, mapping, start_idx, i); + f2fs_destroy_compress_ctx(cc); + return ret; +} + +int f2fs_prepare_compress_overwrite(struct inode *inode, + struct page **pagep, pgoff_t index, void **fsdata) +{ + struct compress_ctx cc = { + .inode = inode, + .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, + .cluster_size = F2FS_I(inode)->i_cluster_size, + .cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size, + .rpages = NULL, + .nr_rpages = 0, + }; + + return prepare_compress_overwrite(&cc, pagep, index, fsdata); +} + +bool f2fs_compress_write_end(struct inode *inode, void *fsdata, + pgoff_t index, unsigned copied) + +{ + struct compress_ctx cc = { + .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, + .cluster_size = F2FS_I(inode)->i_cluster_size, + .rpages = fsdata, + }; + bool first_index = (index == cc.rpages[0]->index); + + if (copied) + set_cluster_dirty(&cc); + + f2fs_put_rpages_wbc(&cc, NULL, false, 1); + f2fs_destroy_compress_ctx(&cc); + + return first_index; +} + +static int f2fs_write_compressed_pages(struct compress_ctx *cc, + int *submitted, + struct writeback_control *wbc, + enum iostat_type io_type) +{ + struct inode *inode = cc->inode; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_io_info fio = { + .sbi = sbi, + .ino = cc->inode->i_ino, + .type = DATA, + .op = REQ_OP_WRITE, + .op_flags = wbc_to_write_flags(wbc), + .old_blkaddr = NEW_ADDR, + .page = NULL, + .encrypted_page = NULL, + .compressed_page = NULL, + .submitted = false, + .need_lock = LOCK_RETRY, + .io_type = io_type, + .io_wbc = wbc, + .encrypted = f2fs_encrypted_file(cc->inode), + }; + struct dnode_of_data dn; + struct node_info ni; + struct compress_io_ctx *cic; + pgoff_t start_idx = start_idx_of_cluster(cc); + unsigned int last_index = cc->cluster_size - 1; + loff_t psize; + int i, err; + + set_new_dnode(&dn, cc->inode, NULL, NULL, 0); + + f2fs_lock_op(sbi); + + err = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); + if (err) + goto out_unlock_op; + + for (i = 0; i < cc->cluster_size; i++) { + if (datablock_addr(dn.inode, dn.node_page, + dn.ofs_in_node + i) == NULL_ADDR) + goto out_put_dnode; + } + + psize = (loff_t)(cc->rpages[last_index]->index + 1) << PAGE_SHIFT; + + err = f2fs_get_node_info(fio.sbi, dn.nid, &ni); + if (err) + goto out_put_dnode; + + fio.version = ni.version; + + cic = f2fs_kzalloc(sbi, sizeof(struct compress_io_ctx), GFP_NOFS); + if (!cic) + goto out_put_dnode; + + cic->magic = F2FS_COMPRESSED_PAGE_MAGIC; + cic->inode = inode; + refcount_set(&cic->ref, 1); + cic->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) << + cc->log_cluster_size, GFP_NOFS); + if (!cic->rpages) + goto out_put_cic; + + cic->nr_rpages = cc->cluster_size; + + for (i = 0; i < cc->nr_cpages; i++) { + f2fs_set_compressed_page(cc->cpages[i], inode, + cc->rpages[i + 1]->index, + cic, i ? &cic->ref : NULL); + fio.compressed_page = cc->cpages[i]; + if (fio.encrypted) { + fio.page = cc->rpages[i + 1]; + err = f2fs_encrypt_one_page(&fio); + if (err) + goto out_destroy_crypt; + cc->cpages[i] = fio.encrypted_page; + } + } + + set_cluster_writeback(cc); + + for (i = 0; i < cc->cluster_size; i++) + cic->rpages[i] = cc->rpages[i]; + + for (i = 0; i < cc->cluster_size; i++, dn.ofs_in_node++) { + block_t blkaddr; + + blkaddr = datablock_addr(dn.inode, dn.node_page, + dn.ofs_in_node); + fio.page = cic->rpages[i]; + fio.old_blkaddr = blkaddr; + + /* cluster header */ + if (i == 0) { + if (blkaddr == COMPRESS_ADDR) + fio.compr_blocks++; + if (__is_valid_data_blkaddr(blkaddr)) + f2fs_invalidate_blocks(sbi, blkaddr); + f2fs_update_data_blkaddr(&dn, COMPRESS_ADDR); + goto unlock_continue; + } + + if (fio.compr_blocks && __is_valid_data_blkaddr(blkaddr)) + fio.compr_blocks++; + + if (i > cc->nr_cpages) { + if (__is_valid_data_blkaddr(blkaddr)) { + f2fs_invalidate_blocks(sbi, blkaddr); + f2fs_update_data_blkaddr(&dn, NEW_ADDR); + } + goto unlock_continue; + } + + f2fs_bug_on(fio.sbi, blkaddr == NULL_ADDR); + + if (fio.encrypted) + fio.encrypted_page = cc->cpages[i - 1]; + else + fio.compressed_page = cc->cpages[i - 1]; + + cc->cpages[i - 1] = NULL; + f2fs_outplace_write_data(&dn, &fio); + (*submitted)++; +unlock_continue: + inode_dec_dirty_pages(cc->inode); + unlock_page(fio.page); + } + + if (fio.compr_blocks) + f2fs_i_compr_blocks_update(inode, fio.compr_blocks - 1, false); + f2fs_i_compr_blocks_update(inode, cc->nr_cpages, true); + + set_inode_flag(cc->inode, FI_APPEND_WRITE); + if (cc->cluster_idx == 0) + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); + + f2fs_put_dnode(&dn); + f2fs_unlock_op(sbi); + + down_write(&fi->i_sem); + if (fi->last_disk_size < psize) + fi->last_disk_size = psize; + up_write(&fi->i_sem); + + f2fs_put_rpages(cc); + f2fs_destroy_compress_ctx(cc); + return 0; + +out_destroy_crypt: + kfree(cic->rpages); + + for (--i; i >= 0; i--) + fscrypt_finalize_bounce_page(&cc->cpages[i]); + for (i = 0; i < cc->nr_cpages; i++) { + if (!cc->cpages[i]) + continue; + f2fs_put_page(cc->cpages[i], 1); + } +out_put_cic: + kfree(cic); +out_put_dnode: + f2fs_put_dnode(&dn); +out_unlock_op: + f2fs_unlock_op(sbi); + return -EAGAIN; +} + +void f2fs_compress_write_end_io(struct bio *bio, struct page *page) +{ + struct f2fs_sb_info *sbi = bio->bi_private; + struct compress_io_ctx *cic = + (struct compress_io_ctx *)page_private(page); + int i; + + if (unlikely(bio->bi_status)) + mapping_set_error(cic->inode->i_mapping, -EIO); + + f2fs_put_compressed_page(page); + + dec_page_count(sbi, F2FS_WB_DATA); + + if (refcount_dec_not_one(&cic->ref)) + return; + + for (i = 0; i < cic->nr_rpages; i++) { + WARN_ON(!cic->rpages[i]); + clear_cold_data(cic->rpages[i]); + end_page_writeback(cic->rpages[i]); + } + + kfree(cic->rpages); + kfree(cic); +} + +static int f2fs_write_raw_pages(struct compress_ctx *cc, + int *submitted, + struct writeback_control *wbc, + enum iostat_type io_type) +{ + struct address_space *mapping = cc->inode->i_mapping; + int _submitted, compr_blocks, ret; + int i = -1, err = 0; + + compr_blocks = f2fs_compressed_blocks(cc); + if (compr_blocks < 0) { + err = compr_blocks; + goto out_err; + } + + for (i = 0; i < cc->cluster_size; i++) { + if (!cc->rpages[i]) + continue; +retry_write: + if (cc->rpages[i]->mapping != mapping) { + unlock_page(cc->rpages[i]); + continue; + } + + BUG_ON(!PageLocked(cc->rpages[i])); + + ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted, + NULL, NULL, wbc, io_type, + compr_blocks); + if (ret) { + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(cc->rpages[i]); + ret = 0; + } else if (ret == -EAGAIN) { + ret = 0; + cond_resched(); + congestion_wait(BLK_RW_ASYNC, HZ/50); + lock_page(cc->rpages[i]); + clear_page_dirty_for_io(cc->rpages[i]); + goto retry_write; + } + err = ret; + goto out_fail; + } + + *submitted += _submitted; + } + return 0; + +out_fail: + /* TODO: revoke partially updated block addresses */ + BUG_ON(compr_blocks); +out_err: + for (++i; i < cc->cluster_size; i++) { + if (!cc->rpages[i]) + continue; + redirty_page_for_writepage(wbc, cc->rpages[i]); + unlock_page(cc->rpages[i]); + } + return err; +} + +int f2fs_write_multi_pages(struct compress_ctx *cc, + int *submitted, + struct writeback_control *wbc, + enum iostat_type io_type) +{ + struct f2fs_inode_info *fi = F2FS_I(cc->inode); + const struct f2fs_compress_ops *cops = + f2fs_cops[fi->i_compress_algorithm]; + int err; + + *submitted = 0; + if (cluster_may_compress(cc)) { + err = f2fs_compress_pages(cc); + if (err == -EAGAIN) { + goto write; + } else if (err) { + f2fs_put_rpages_wbc(cc, wbc, true, 1); + goto destroy_out; + } + + err = f2fs_write_compressed_pages(cc, submitted, + wbc, io_type); + cops->destroy_compress_ctx(cc); + if (!err) + return 0; + f2fs_bug_on(F2FS_I_SB(cc->inode), err != -EAGAIN); + } +write: + f2fs_bug_on(F2FS_I_SB(cc->inode), *submitted); + + err = f2fs_write_raw_pages(cc, submitted, wbc, io_type); + f2fs_put_rpages_wbc(cc, wbc, false, 0); +destroy_out: + f2fs_destroy_compress_ctx(cc); + return err; +} + +struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); + struct decompress_io_ctx *dic; + pgoff_t start_idx = start_idx_of_cluster(cc); + int i; + + dic = f2fs_kzalloc(sbi, sizeof(struct decompress_io_ctx), GFP_NOFS); + if (!dic) + return ERR_PTR(-ENOMEM); + + dic->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) << + cc->log_cluster_size, GFP_NOFS); + if (!dic->rpages) { + kfree(dic); + return ERR_PTR(-ENOMEM); + } + + dic->magic = F2FS_COMPRESSED_PAGE_MAGIC; + dic->inode = cc->inode; + refcount_set(&dic->ref, 1); + dic->cluster_idx = cc->cluster_idx; + dic->cluster_size = cc->cluster_size; + dic->log_cluster_size = cc->log_cluster_size; + dic->nr_cpages = cc->nr_cpages; + dic->failed = false; + + for (i = 0; i < dic->cluster_size; i++) + dic->rpages[i] = cc->rpages[i]; + dic->nr_rpages = cc->cluster_size; + + dic->cpages = f2fs_kzalloc(sbi, sizeof(struct page *) * + dic->nr_cpages, GFP_NOFS); + if (!dic->cpages) + goto out_free; + + for (i = 0; i < dic->nr_cpages; i++) { + struct page *page; + + page = f2fs_grab_page(); + if (!page) + goto out_free; + + f2fs_set_compressed_page(page, cc->inode, + start_idx + i + 1, + dic, i ? &dic->ref : NULL); + dic->cpages[i] = page; + } + + dic->tpages = f2fs_kzalloc(sbi, sizeof(struct page *) * + dic->cluster_size, GFP_NOFS); + if (!dic->tpages) + goto out_free; + + for (i = 0; i < dic->cluster_size; i++) { + if (cc->rpages[i]) + continue; + + dic->tpages[i] = f2fs_grab_page(); + if (!dic->tpages[i]) + goto out_free; + } + + for (i = 0; i < dic->cluster_size; i++) { + if (dic->tpages[i]) + continue; + dic->tpages[i] = cc->rpages[i]; + } + + return dic; + +out_free: + f2fs_free_dic(dic); + return ERR_PTR(-ENOMEM); +} + +void f2fs_free_dic(struct decompress_io_ctx *dic) +{ + int i; + + if (dic->tpages) { + for (i = 0; i < dic->cluster_size; i++) { + if (dic->rpages[i]) + continue; + f2fs_put_page(dic->tpages[i], 1); + } + kfree(dic->tpages); + } + + if (dic->cpages) { + for (i = 0; i < dic->nr_cpages; i++) { + if (!dic->cpages[i]) + continue; + f2fs_put_compressed_page(dic->cpages[i]); + } + kfree(dic->cpages); + } + + kfree(dic->rpages); + kfree(dic); +} + +void f2fs_decompress_end_io(struct page **rpages, + unsigned int cluster_size, bool err, bool verity) +{ + int i; + + for (i = 0; i < cluster_size; i++) { + struct page *rpage = rpages[i]; + + if (!rpage) + continue; + + if (err || PageError(rpage)) { + ClearPageUptodate(rpage); + ClearPageError(rpage); + } else { + if (!verity || fsverity_verify_page(rpage)) + SetPageUptodate(rpage); + else + SetPageError(rpage); + } + unlock_page(rpage); + } +} diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0fa356e94ef5..8bd9afa81c54 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -31,6 +31,47 @@ static struct kmem_cache *bio_post_read_ctx_cache; static struct kmem_cache *bio_entry_slab; static mempool_t *bio_post_read_ctx_pool; +static struct bio_set f2fs_bioset; + +#define F2FS_BIO_POOL_SIZE NR_CURSEG_TYPE + +int __init f2fs_init_bioset(void) +{ + if (bioset_init(&f2fs_bioset, F2FS_BIO_POOL_SIZE, + 0, BIOSET_NEED_BVECS)) + return -ENOMEM; + return 0; +} + +void f2fs_destroy_bioset(void) +{ + bioset_exit(&f2fs_bioset); +} + +static inline struct bio *__f2fs_bio_alloc(gfp_t gfp_mask, + unsigned int nr_iovecs) +{ + return bio_alloc_bioset(gfp_mask, nr_iovecs, &f2fs_bioset); +} + +struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool no_fail) +{ + struct bio *bio; + + if (no_fail) { + /* No failure on bio allocation */ + bio = __f2fs_bio_alloc(GFP_NOIO, npages); + if (!bio) + bio = __f2fs_bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages); + return bio; + } + if (time_to_inject(sbi, FAULT_ALLOC_BIO)) { + f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO); + return NULL; + } + + return __f2fs_bio_alloc(GFP_KERNEL, npages); +} static bool __is_cp_guaranteed(struct page *page) { @@ -41,6 +82,9 @@ static bool __is_cp_guaranteed(struct page *page) if (!mapping) return false; + if (f2fs_is_compressed_page(page)) + return false; + inode = mapping->host; sbi = F2FS_I_SB(inode); @@ -73,19 +117,19 @@ static enum count_type __read_io_type(struct page *page) /* postprocessing steps for read bios */ enum bio_post_read_step { - STEP_INITIAL = 0, STEP_DECRYPT, + STEP_DECOMPRESS, STEP_VERITY, }; struct bio_post_read_ctx { struct bio *bio; + struct f2fs_sb_info *sbi; struct work_struct work; - unsigned int cur_step; unsigned int enabled_steps; }; -static void __read_end_io(struct bio *bio) +static void __read_end_io(struct bio *bio, bool compr, bool verity) { struct page *page; struct bio_vec *bv; @@ -94,6 +138,13 @@ static void __read_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, iter_all) { page = bv->bv_page; +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (compr && f2fs_is_compressed_page(page)) { + f2fs_decompress_pages(bio, page, verity); + continue; + } +#endif + /* PG_error was set if any post_read step failed */ if (bio->bi_status || PageError(page)) { ClearPageUptodate(page); @@ -105,31 +156,107 @@ static void __read_end_io(struct bio *bio) dec_page_count(F2FS_P_SB(page), __read_io_type(page)); unlock_page(page); } - if (bio->bi_private) - mempool_free(bio->bi_private, bio_post_read_ctx_pool); - bio_put(bio); +} + +static void f2fs_release_read_bio(struct bio *bio); +static void __f2fs_read_end_io(struct bio *bio, bool compr, bool verity) +{ + if (!compr) + __read_end_io(bio, false, verity); + f2fs_release_read_bio(bio); +} + +static void f2fs_decompress_bio(struct bio *bio, bool verity) +{ + __read_end_io(bio, true, verity); } static void bio_post_read_processing(struct bio_post_read_ctx *ctx); -static void decrypt_work(struct work_struct *work) +static void f2fs_decrypt_work(struct bio_post_read_ctx *ctx) +{ + fscrypt_decrypt_bio(ctx->bio); +} + +static void f2fs_decompress_work(struct bio_post_read_ctx *ctx) +{ + f2fs_decompress_bio(ctx->bio, ctx->enabled_steps & (1 << STEP_VERITY)); +} + +#ifdef CONFIG_F2FS_FS_COMPRESSION +static void f2fs_verify_pages(struct page **rpages, unsigned int cluster_size) +{ + f2fs_decompress_end_io(rpages, cluster_size, false, true); +} + +static void f2fs_verify_bio(struct bio *bio) +{ + struct page *page = bio_first_page_all(bio); + struct decompress_io_ctx *dic = + (struct decompress_io_ctx *)page_private(page); + + f2fs_verify_pages(dic->rpages, dic->cluster_size); + f2fs_free_dic(dic); +} +#endif + +static void f2fs_verity_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); + struct bio *bio = ctx->bio; +#ifdef CONFIG_F2FS_FS_COMPRESSION + unsigned int enabled_steps = ctx->enabled_steps; +#endif - fscrypt_decrypt_bio(ctx->bio); + /* + * fsverity_verify_bio() may call readpages() again, and while verity + * will be disabled for this, decryption may still be needed, resulting + * in another bio_post_read_ctx being allocated. So to prevent + * deadlocks we need to release the current ctx to the mempool first. + * This assumes that verity is the last post-read step. + */ + mempool_free(ctx, bio_post_read_ctx_pool); + bio->bi_private = NULL; + +#ifdef CONFIG_F2FS_FS_COMPRESSION + /* previous step is decompression */ + if (enabled_steps & (1 << STEP_DECOMPRESS)) { + f2fs_verify_bio(bio); + f2fs_release_read_bio(bio); + return; + } +#endif - bio_post_read_processing(ctx); + fsverity_verify_bio(bio); + __f2fs_read_end_io(bio, false, false); } -static void verity_work(struct work_struct *work) +static void f2fs_post_read_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); - fsverity_verify_bio(ctx->bio); + if (ctx->enabled_steps & (1 << STEP_DECRYPT)) + f2fs_decrypt_work(ctx); - bio_post_read_processing(ctx); + if (ctx->enabled_steps & (1 << STEP_DECOMPRESS)) + f2fs_decompress_work(ctx); + + if (ctx->enabled_steps & (1 << STEP_VERITY)) { + INIT_WORK(&ctx->work, f2fs_verity_work); + fsverity_enqueue_verify_work(&ctx->work); + return; + } + + __f2fs_read_end_io(ctx->bio, + ctx->enabled_steps & (1 << STEP_DECOMPRESS), false); +} + +static void f2fs_enqueue_post_read_work(struct f2fs_sb_info *sbi, + struct work_struct *work) +{ + queue_work(sbi->post_read_wq, work); } static void bio_post_read_processing(struct bio_post_read_ctx *ctx) @@ -139,31 +266,26 @@ static void bio_post_read_processing(struct bio_post_read_ctx *ctx) * verity may require reading metadata pages that need decryption, and * we shouldn't recurse to the same workqueue. */ - switch (++ctx->cur_step) { - case STEP_DECRYPT: - if (ctx->enabled_steps & (1 << STEP_DECRYPT)) { - INIT_WORK(&ctx->work, decrypt_work); - fscrypt_enqueue_decrypt_work(&ctx->work); - return; - } - ctx->cur_step++; - /* fall-through */ - case STEP_VERITY: - if (ctx->enabled_steps & (1 << STEP_VERITY)) { - INIT_WORK(&ctx->work, verity_work); - fsverity_enqueue_verify_work(&ctx->work); - return; - } - ctx->cur_step++; - /* fall-through */ - default: - __read_end_io(ctx->bio); + + if (ctx->enabled_steps & (1 << STEP_DECRYPT) || + ctx->enabled_steps & (1 << STEP_DECOMPRESS)) { + INIT_WORK(&ctx->work, f2fs_post_read_work); + f2fs_enqueue_post_read_work(ctx->sbi, &ctx->work); + return; } + + if (ctx->enabled_steps & (1 << STEP_VERITY)) { + INIT_WORK(&ctx->work, f2fs_verity_work); + fsverity_enqueue_verify_work(&ctx->work); + return; + } + + __f2fs_read_end_io(ctx->bio, false, false); } static bool f2fs_bio_post_read_required(struct bio *bio) { - return bio->bi_private && !bio->bi_status; + return bio->bi_private; } static void f2fs_read_end_io(struct bio *bio) @@ -178,12 +300,11 @@ static void f2fs_read_end_io(struct bio *bio) if (f2fs_bio_post_read_required(bio)) { struct bio_post_read_ctx *ctx = bio->bi_private; - ctx->cur_step = STEP_INITIAL; bio_post_read_processing(ctx); return; } - __read_end_io(bio); + __f2fs_read_end_io(bio, false, false); } static void f2fs_write_end_io(struct bio *bio) @@ -214,6 +335,13 @@ static void f2fs_write_end_io(struct bio *bio) fscrypt_finalize_bounce_page(&page); +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_is_compressed_page(page)) { + f2fs_compress_write_end_io(bio, page); + continue; + } +#endif + if (unlikely(bio->bi_status)) { mapping_set_error(page->mapping, -EIO); if (type == F2FS_WB_CP_DATA) @@ -358,6 +486,12 @@ submit_io: submit_bio(bio); } +void f2fs_submit_bio(struct f2fs_sb_info *sbi, + struct bio *bio, enum page_type type) +{ + __submit_bio(sbi, bio, type); +} + static void __submit_merged_bio(struct f2fs_bio_info *io) { struct f2fs_io_info *fio = &io->fio; @@ -380,7 +514,6 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode, struct page *page, nid_t ino) { struct bio_vec *bvec; - struct page *target; struct bvec_iter_all iter_all; if (!bio) @@ -390,10 +523,18 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode, return true; bio_for_each_segment_all(bvec, bio, iter_all) { + struct page *target = bvec->bv_page; - target = bvec->bv_page; - if (fscrypt_is_bounce_page(target)) + if (fscrypt_is_bounce_page(target)) { target = fscrypt_pagecache_page(target); + if (IS_ERR(target)) + continue; + } + if (f2fs_is_compressed_page(target)) { + target = f2fs_compress_control_page(target); + if (IS_ERR(target)) + continue; + } if (inode && inode == target->mapping->host) return true; @@ -588,7 +729,8 @@ static int add_ipu_page(struct f2fs_sb_info *sbi, struct bio **bio, found = true; - if (bio_add_page(*bio, page, PAGE_SIZE, 0) == PAGE_SIZE) { + if (bio_add_page(*bio, page, PAGE_SIZE, 0) == + PAGE_SIZE) { ret = 0; break; } @@ -728,7 +870,12 @@ next: verify_fio_blkaddr(fio); - bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; + if (fio->encrypted_page) + bio_page = fio->encrypted_page; + else if (fio->compressed_page) + bio_page = fio->compressed_page; + else + bio_page = fio->page; /* set submitted = true as a return value */ fio->submitted = true; @@ -797,17 +944,16 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, if (f2fs_encrypted_file(inode)) post_read_steps |= 1 << STEP_DECRYPT; - + if (f2fs_compressed_file(inode)) + post_read_steps |= 1 << STEP_DECOMPRESS; if (f2fs_need_verity(inode, first_idx)) post_read_steps |= 1 << STEP_VERITY; if (post_read_steps) { + /* Due to the mempool, this never fails. */ ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); - if (!ctx) { - bio_put(bio); - return ERR_PTR(-ENOMEM); - } ctx->bio = bio; + ctx->sbi = sbi; ctx->enabled_steps = post_read_steps; bio->bi_private = ctx; } @@ -815,6 +961,13 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, return bio; } +static void f2fs_release_read_bio(struct bio *bio) +{ + if (bio->bi_private) + mempool_free(bio->bi_private, bio_post_read_ctx_pool); + bio_put(bio); +} + /* This can handle encryption stuffs */ static int f2fs_submit_page_read(struct inode *inode, struct page *page, block_t blkaddr) @@ -1180,19 +1333,6 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) int err = 0; bool direct_io = iocb->ki_flags & IOCB_DIRECT; - /* convert inline data for Direct I/O*/ - if (direct_io) { - err = f2fs_convert_inline_inode(inode); - if (err) - return err; - } - - if (direct_io && allow_outplace_dio(inode, iocb, from)) - return 0; - - if (is_inode_flag_set(inode, FI_NO_PREALLOC)) - return 0; - map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); if (map.m_len > map.m_lblk) @@ -1872,6 +2012,144 @@ out: return ret; } +#ifdef CONFIG_F2FS_FS_COMPRESSION +int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, + unsigned nr_pages, sector_t *last_block_in_bio, + bool is_readahead) +{ + struct dnode_of_data dn; + struct inode *inode = cc->inode; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct bio *bio = *bio_ret; + unsigned int start_idx = cc->cluster_idx << cc->log_cluster_size; + sector_t last_block_in_file; + const unsigned blkbits = inode->i_blkbits; + const unsigned blocksize = 1 << blkbits; + struct decompress_io_ctx *dic = NULL; + int i; + int ret = 0; + + f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc)); + + last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; + + /* get rid of pages beyond EOF */ + for (i = 0; i < cc->cluster_size; i++) { + struct page *page = cc->rpages[i]; + + if (!page) + continue; + if ((sector_t)page->index >= last_block_in_file) { + zero_user_segment(page, 0, PAGE_SIZE); + if (!PageUptodate(page)) + SetPageUptodate(page); + } else if (!PageUptodate(page)) { + continue; + } + unlock_page(page); + cc->rpages[i] = NULL; + cc->nr_rpages--; + } + + /* we are done since all pages are beyond EOF */ + if (f2fs_cluster_is_empty(cc)) + goto out; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); + if (ret) + goto out; + + /* cluster was overwritten as normal cluster */ + if (dn.data_blkaddr != COMPRESS_ADDR) + goto out; + + for (i = 1; i < cc->cluster_size; i++) { + block_t blkaddr; + + blkaddr = datablock_addr(dn.inode, dn.node_page, + dn.ofs_in_node + i); + + if (!__is_valid_data_blkaddr(blkaddr)) + break; + + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC)) { + ret = -EFAULT; + goto out_put_dnode; + } + cc->nr_cpages++; + } + + /* nothing to decompress */ + if (cc->nr_cpages == 0) { + ret = 0; + goto out_put_dnode; + } + + dic = f2fs_alloc_dic(cc); + if (IS_ERR(dic)) { + ret = PTR_ERR(dic); + goto out_put_dnode; + } + + for (i = 0; i < dic->nr_cpages; i++) { + struct page *page = dic->cpages[i]; + block_t blkaddr; + + blkaddr = datablock_addr(dn.inode, dn.node_page, + dn.ofs_in_node + i + 1); + + if (bio && !page_is_mergeable(sbi, bio, + *last_block_in_bio, blkaddr)) { +submit_and_realloc: + __submit_bio(sbi, bio, DATA); + bio = NULL; + } + + if (!bio) { + bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages, + is_readahead ? REQ_RAHEAD : 0, + page->index); + if (IS_ERR(bio)) { + ret = PTR_ERR(bio); + bio = NULL; + dic->failed = true; + if (refcount_sub_and_test(dic->nr_cpages - i, + &dic->ref)) + f2fs_decompress_end_io(dic->rpages, + cc->cluster_size, true, + false); + f2fs_free_dic(dic); + f2fs_put_dnode(&dn); + *bio_ret = bio; + return ret; + } + } + + f2fs_wait_on_block_writeback(inode, blkaddr); + + if (bio_add_page(bio, page, blocksize, 0) < blocksize) + goto submit_and_realloc; + + inc_page_count(sbi, F2FS_RD_DATA); + ClearPageError(page); + *last_block_in_bio = blkaddr; + } + + f2fs_put_dnode(&dn); + + *bio_ret = bio; + return 0; + +out_put_dnode: + f2fs_put_dnode(&dn); +out: + f2fs_decompress_end_io(cc->rpages, cc->cluster_size, true, false); + *bio_ret = bio; + return ret; +} +#endif + /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. @@ -1889,6 +2167,19 @@ int f2fs_mpage_readpages(struct address_space *mapping, sector_t last_block_in_bio = 0; struct inode *inode = mapping->host; struct f2fs_map_blocks map; +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct compress_ctx cc = { + .inode = inode, + .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, + .cluster_size = F2FS_I(inode)->i_cluster_size, + .cluster_idx = NULL_CLUSTER, + .rpages = NULL, + .cpages = NULL, + .nr_rpages = 0, + .nr_cpages = 0, + }; +#endif + unsigned max_nr_pages = nr_pages; int ret = 0; map.m_pblk = 0; @@ -1912,9 +2203,41 @@ int f2fs_mpage_readpages(struct address_space *mapping, goto next_page; } - ret = f2fs_read_single_page(inode, page, nr_pages, &map, &bio, - &last_block_in_bio, is_readahead); +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + /* there are remained comressed pages, submit them */ + if (!f2fs_cluster_can_merge_page(&cc, page->index)) { + ret = f2fs_read_multi_pages(&cc, &bio, + max_nr_pages, + &last_block_in_bio, + is_readahead); + f2fs_destroy_compress_ctx(&cc); + if (ret) + goto set_error_page; + } + ret = f2fs_is_compressed_cluster(inode, page->index); + if (ret < 0) + goto set_error_page; + else if (!ret) + goto read_single_page; + + ret = f2fs_init_compress_ctx(&cc); + if (ret) + goto set_error_page; + + f2fs_compress_ctx_add_page(&cc, page); + + goto next_page; + } +read_single_page: +#endif + + ret = f2fs_read_single_page(inode, page, max_nr_pages, &map, + &bio, &last_block_in_bio, is_readahead); if (ret) { +#ifdef CONFIG_F2FS_FS_COMPRESSION +set_error_page: +#endif SetPageError(page); zero_user_segment(page, 0, PAGE_SIZE); unlock_page(page); @@ -1922,6 +2245,19 @@ int f2fs_mpage_readpages(struct address_space *mapping, next_page: if (pages) put_page(page); + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + /* last page */ + if (nr_pages == 1 && !f2fs_cluster_is_empty(&cc)) { + ret = f2fs_read_multi_pages(&cc, &bio, + max_nr_pages, + &last_block_in_bio, + is_readahead); + f2fs_destroy_compress_ctx(&cc); + } + } +#endif } BUG_ON(pages && !list_empty(pages)); if (bio) @@ -1936,6 +2272,11 @@ static int f2fs_read_data_page(struct file *file, struct page *page) trace_f2fs_readpage(page, DATA); + if (!f2fs_is_compress_backend_ready(inode)) { + unlock_page(page); + return -EOPNOTSUPP; + } + /* If the file has inline data, try to read it directly */ if (f2fs_has_inline_data(inode)) ret = f2fs_read_inline_data(inode, page); @@ -1954,6 +2295,9 @@ static int f2fs_read_data_pages(struct file *file, trace_f2fs_readpages(inode, page, nr_pages); + if (!f2fs_is_compress_backend_ready(inode)) + return 0; + /* If the file has inline data, skip readpages */ if (f2fs_has_inline_data(inode)) return 0; @@ -1961,22 +2305,23 @@ static int f2fs_read_data_pages(struct file *file, return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages, true); } -static int encrypt_one_page(struct f2fs_io_info *fio) +int f2fs_encrypt_one_page(struct f2fs_io_info *fio) { struct inode *inode = fio->page->mapping->host; - struct page *mpage; + struct page *mpage, *page; gfp_t gfp_flags = GFP_NOFS; if (!f2fs_encrypted_file(inode)) return 0; + page = fio->compressed_page ? fio->compressed_page : fio->page; + /* wait for GCed page writeback via META_MAPPING */ f2fs_wait_on_block_writeback(inode, fio->old_blkaddr); retry_encrypt: - fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(fio->page, - PAGE_SIZE, 0, - gfp_flags); + fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(page, + PAGE_SIZE, 0, gfp_flags); if (IS_ERR(fio->encrypted_page)) { /* flush pending IOs and wait for a while in the ENOMEM case */ if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { @@ -2136,7 +2481,7 @@ got_it: if (ipu_force || (__is_valid_data_blkaddr(fio->old_blkaddr) && need_inplace_update(fio))) { - err = encrypt_one_page(fio); + err = f2fs_encrypt_one_page(fio); if (err) goto out_writepage; @@ -2172,13 +2517,16 @@ got_it: fio->version = ni.version; - err = encrypt_one_page(fio); + err = f2fs_encrypt_one_page(fio); if (err) goto out_writepage; set_page_writeback(page); ClearPageError(page); + if (fio->compr_blocks && fio->old_blkaddr == COMPRESS_ADDR) + f2fs_i_compr_blocks_update(inode, fio->compr_blocks - 1, false); + /* LFS mode write path */ f2fs_outplace_write_data(&dn, fio); trace_f2fs_do_write_data_page(page, OPU); @@ -2193,16 +2541,17 @@ out: return err; } -static int __write_data_page(struct page *page, bool *submitted, +int f2fs_write_single_data_page(struct page *page, int *submitted, struct bio **bio, sector_t *last_block, struct writeback_control *wbc, - enum iostat_type io_type) + enum iostat_type io_type, + int compr_blocks) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); loff_t i_size = i_size_read(inode); - const pgoff_t end_index = ((unsigned long long) i_size) + const pgoff_t end_index = ((unsigned long long)i_size) >> PAGE_SHIFT; loff_t psize = (loff_t)(page->index + 1) << PAGE_SHIFT; unsigned offset = 0; @@ -2218,6 +2567,7 @@ static int __write_data_page(struct page *page, bool *submitted, .page = page, .encrypted_page = NULL, .submitted = false, + .compr_blocks = compr_blocks, .need_lock = LOCK_RETRY, .io_type = io_type, .io_wbc = wbc, @@ -2242,7 +2592,9 @@ static int __write_data_page(struct page *page, bool *submitted, if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; - if (page->index < end_index || f2fs_verity_in_progress(inode)) + if (page->index < end_index || + f2fs_verity_in_progress(inode) || + compr_blocks) goto write; /* @@ -2318,7 +2670,6 @@ out: f2fs_remove_dirty_inode(inode); submitted = NULL; } - unlock_page(page); if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) && !F2FS_I(inode)->cp_task) @@ -2331,7 +2682,7 @@ out: } if (submitted) - *submitted = fio.submitted; + *submitted = fio.submitted ? 1 : 0; return 0; @@ -2352,7 +2703,23 @@ redirty_out: static int f2fs_write_data_page(struct page *page, struct writeback_control *wbc) { - return __write_data_page(page, NULL, NULL, NULL, wbc, FS_DATA_IO); +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct inode *inode = page->mapping->host; + + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + goto out; + + if (f2fs_compressed_file(inode)) { + if (f2fs_is_compressed_cluster(inode, page->index)) { + redirty_page_for_writepage(wbc, page); + return AOP_WRITEPAGE_ACTIVATE; + } + } +out: +#endif + + return f2fs_write_single_data_page(page, NULL, NULL, NULL, + wbc, FS_DATA_IO, 0); } /* @@ -2365,11 +2732,27 @@ static int f2fs_write_cache_pages(struct address_space *mapping, enum iostat_type io_type) { int ret = 0; - int done = 0; + int done = 0, retry = 0; struct pagevec pvec; struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); struct bio *bio = NULL; sector_t last_block; +#ifdef CONFIG_F2FS_FS_COMPRESSION + struct inode *inode = mapping->host; + struct compress_ctx cc = { + .inode = inode, + .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, + .cluster_size = F2FS_I(inode)->i_cluster_size, + .cluster_idx = NULL_CLUSTER, + .rpages = NULL, + .nr_rpages = 0, + .cpages = NULL, + .rbuf = NULL, + .cbuf = NULL, + .rlen = PAGE_SIZE * F2FS_I(inode)->i_cluster_size, + .private = NULL, + }; +#endif int nr_pages; pgoff_t uninitialized_var(writeback_index); pgoff_t index; @@ -2379,6 +2762,8 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int range_whole = 0; xa_mark_t tag; int nwritten = 0; + int submitted = 0; + int i; pagevec_init(&pvec); @@ -2408,12 +2793,11 @@ static int f2fs_write_cache_pages(struct address_space *mapping, else tag = PAGECACHE_TAG_DIRTY; retry: + retry = 0; if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); done_index = index; - while (!done && (index <= end)) { - int i; - + while (!done && !retry && (index <= end)) { nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, tag); if (nr_pages == 0) @@ -2421,15 +2805,62 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - bool submitted = false; + bool need_readd; +readd: + need_readd = false; +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + ret = f2fs_init_compress_ctx(&cc); + if (ret) { + done = 1; + break; + } + + if (!f2fs_cluster_can_merge_page(&cc, + page->index)) { + ret = f2fs_write_multi_pages(&cc, + &submitted, wbc, io_type); + if (!ret) + need_readd = true; + goto result; + } + if (unlikely(f2fs_cp_error(sbi))) + goto lock_page; + + if (f2fs_cluster_is_empty(&cc)) { + void *fsdata = NULL; + struct page *pagep; + int ret2; + + ret2 = f2fs_prepare_compress_overwrite( + inode, &pagep, + page->index, &fsdata); + if (ret2 < 0) { + ret = ret2; + done = 1; + break; + } else if (ret2 && + !f2fs_compress_write_end(inode, + fsdata, page->index, + 1)) { + retry = 1; + break; + } + } else { + goto lock_page; + } + } +#endif /* give a priority to WB_SYNC threads */ if (atomic_read(&sbi->wb_sync_req[DATA]) && wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; } - +#ifdef CONFIG_F2FS_FS_COMPRESSION +lock_page: +#endif done_index = page->index; retry_write: lock_page(page); @@ -2456,45 +2887,71 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = __write_data_page(page, &submitted, &bio, - &last_block, wbc, io_type); +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + get_page(page); + f2fs_compress_ctx_add_page(&cc, page); + continue; + } +#endif + ret = f2fs_write_single_data_page(page, &submitted, + &bio, &last_block, wbc, io_type, 0); + if (ret == AOP_WRITEPAGE_ACTIVATE) + unlock_page(page); +#ifdef CONFIG_F2FS_FS_COMPRESSION +result: +#endif + nwritten += submitted; + wbc->nr_to_write -= submitted; + if (unlikely(ret)) { /* * keep nr_to_write, since vfs uses this to * get # of written pages. */ if (ret == AOP_WRITEPAGE_ACTIVATE) { - unlock_page(page); ret = 0; - continue; + goto next; } else if (ret == -EAGAIN) { ret = 0; if (wbc->sync_mode == WB_SYNC_ALL) { cond_resched(); congestion_wait(BLK_RW_ASYNC, - HZ/50); + HZ/50); goto retry_write; } - continue; + goto next; } done_index = page->index + 1; done = 1; break; - } else if (submitted) { - nwritten++; } - if (--wbc->nr_to_write <= 0 && + if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; } +next: + if (need_readd) + goto readd; } pagevec_release(&pvec); cond_resched(); } - - if (!cycled && !done) { +#ifdef CONFIG_F2FS_FS_COMPRESSION + /* flush remained pages in compress cluster */ + if (f2fs_compressed_file(inode) && !f2fs_cluster_is_empty(&cc)) { + ret = f2fs_write_multi_pages(&cc, &submitted, wbc, io_type); + nwritten += submitted; + wbc->nr_to_write -= submitted; + if (ret) { + done = 1; + retry = 0; + } + } +#endif + if ((!cycled && !done) || retry) { cycled = 1; index = 0; end = writeback_index - 1; @@ -2518,6 +2975,8 @@ static inline bool __should_serialize_io(struct inode *inode, { if (!S_ISREG(inode->i_mode)) return false; + if (f2fs_compressed_file(inode)) + return true; if (IS_NOQUOTA(inode)) return false; /* to avoid deadlock in path of data flush */ @@ -2613,14 +3072,16 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) struct inode *inode = mapping->host; loff_t i_size = i_size_read(inode); + if (IS_NOQUOTA(inode)) + return; + /* In the fs-verity case, f2fs_end_enable_verity() does the truncate */ if (to > i_size && !f2fs_verity_in_progress(inode)) { down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache(inode, i_size); - if (!IS_NOQUOTA(inode)) - f2fs_truncate_blocks(inode, i_size, true); + f2fs_truncate_blocks(inode, i_size, true); up_write(&F2FS_I(inode)->i_mmap_sem); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -2660,6 +3121,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, __do_map_lock(sbi, flag, true); locked = true; } + restart: /* check inline_data */ ipage = f2fs_get_node_page(sbi, inode->i_ino); @@ -2750,6 +3212,24 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, if (err) goto fail; } + +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + int ret; + + *fsdata = NULL; + + ret = f2fs_prepare_compress_overwrite(inode, pagep, + index, fsdata); + if (ret < 0) { + err = ret; + goto fail; + } else if (ret) { + return 0; + } + } +#endif + repeat: /* * Do not use grab_cache_page_write_begin() to avoid deadlock due to @@ -2762,6 +3242,8 @@ repeat: goto fail; } + /* TODO: cluster can be compressed due to race with .writepage */ + *pagep = page; err = prepare_write_begin(sbi, page, pos, len, @@ -2845,6 +3327,16 @@ static int f2fs_write_end(struct file *file, else SetPageUptodate(page); } + +#ifdef CONFIG_F2FS_FS_COMPRESSION + /* overwrite compressed file */ + if (f2fs_compressed_file(inode) && fsdata) { + f2fs_compress_write_end(inode, fsdata, page->index, copied); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + return copied; + } +#endif + if (!copied) goto unlock_out; @@ -3145,7 +3637,8 @@ int f2fs_migrate_page(struct address_space *mapping, #ifdef CONFIG_SWAP /* Copied from generic_swapfile_activate() to check any holes */ -static int check_swap_activate(struct file *swap_file, unsigned int max) +static int check_swap_activate(struct swap_info_struct *sis, + struct file *swap_file, sector_t *span) { struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; @@ -3156,6 +3649,8 @@ static int check_swap_activate(struct file *swap_file, unsigned int max) sector_t last_block; sector_t lowest_block = -1; sector_t highest_block = 0; + int nr_extents = 0; + int ret; blkbits = inode->i_blkbits; blocks_per_page = PAGE_SIZE >> blkbits; @@ -3167,7 +3662,8 @@ static int check_swap_activate(struct file *swap_file, unsigned int max) probe_block = 0; page_no = 0; last_block = i_size_read(inode) >> blkbits; - while ((probe_block + blocks_per_page) <= last_block && page_no < max) { + while ((probe_block + blocks_per_page) <= last_block && + page_no < sis->max) { unsigned block_in_page; sector_t first_block; @@ -3207,13 +3703,27 @@ static int check_swap_activate(struct file *swap_file, unsigned int max) highest_block = first_block; } + /* + * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks + */ + ret = add_swap_extent(sis, page_no, 1, first_block); + if (ret < 0) + goto out; + nr_extents += ret; page_no++; probe_block += blocks_per_page; reprobe: continue; } - return 0; - + ret = nr_extents; + *span = 1 + highest_block - lowest_block; + if (page_no == 0) + page_no = 1; /* force Empty message */ + sis->max = page_no; + sis->pages = page_no - 1; + sis->highest_bit = page_no - 1; +out: + return ret; bad_bmap: pr_err("swapon: swapfile has holes\n"); return -EINVAL; @@ -3235,14 +3745,17 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, if (ret) return ret; - ret = check_swap_activate(file, sis->max); - if (ret) + if (f2fs_disable_compressed_file(inode)) + return -EINVAL; + + ret = check_swap_activate(sis, file, span); + if (ret < 0) return ret; set_inode_flag(inode, FI_PIN_FILE); f2fs_precache_extents(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - return 0; + return ret; } static void f2fs_swap_deactivate(struct file *file) @@ -3319,6 +3832,27 @@ void f2fs_destroy_post_read_processing(void) kmem_cache_destroy(bio_post_read_ctx_cache); } +int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi) +{ + if (!f2fs_sb_has_encrypt(sbi) && + !f2fs_sb_has_verity(sbi) && + !f2fs_sb_has_compression(sbi)) + return 0; + + sbi->post_read_wq = alloc_workqueue("f2fs_post_read_wq", + WQ_UNBOUND | WQ_HIGHPRI, + num_online_cpus()); + if (!sbi->post_read_wq) + return -ENOMEM; + return 0; +} + +void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi) +{ + if (sbi->post_read_wq) + destroy_workqueue(sbi->post_read_wq); +} + int __init f2fs_init_bio_entry_cache(void) { bio_entry_slab = f2fs_kmem_cache_create("bio_entry_slab", @@ -3328,7 +3862,7 @@ int __init f2fs_init_bio_entry_cache(void) return 0; } -void __exit f2fs_destroy_bio_entry_cache(void) +void f2fs_destroy_bio_entry_cache(void) { kmem_cache_destroy(bio_entry_slab); } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 9b0bedd82581..6b89eae5e4ca 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -21,9 +21,45 @@ #include "gc.h" static LIST_HEAD(f2fs_stat_list); -static struct dentry *f2fs_debugfs_root; static DEFINE_MUTEX(f2fs_stat_mutex); +#ifdef CONFIG_DEBUG_FS +static struct dentry *f2fs_debugfs_root; +#endif + +/* + * This function calculates BDF of every segments + */ +void f2fs_update_sit_info(struct f2fs_sb_info *sbi) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + unsigned long long blks_per_sec, hblks_per_sec, total_vblocks; + unsigned long long bimodal, dist; + unsigned int segno, vblocks; + int ndirty = 0; + + bimodal = 0; + total_vblocks = 0; + blks_per_sec = BLKS_PER_SEC(sbi); + hblks_per_sec = blks_per_sec / 2; + for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { + vblocks = get_valid_blocks(sbi, segno, true); + dist = abs(vblocks - hblks_per_sec); + bimodal += dist * dist; + + if (vblocks > 0 && vblocks < blks_per_sec) { + total_vblocks += vblocks; + ndirty++; + } + } + dist = div_u64(MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec, 100); + si->bimodal = div64_u64(bimodal, dist); + if (si->dirty_count) + si->avg_vblocks = div_u64(total_vblocks, ndirty); + else + si->avg_vblocks = 0; +} +#ifdef CONFIG_DEBUG_FS static void update_general_status(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = F2FS_STAT(sbi); @@ -56,7 +92,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->nquota_files = sbi->nquota_files; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); - si->aw_cnt = atomic_read(&sbi->aw_cnt); + si->aw_cnt = sbi->atomic_files; si->vw_cnt = atomic_read(&sbi->vw_cnt); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt); @@ -94,6 +130,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->inline_xattr = atomic_read(&sbi->inline_xattr); si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); + si->compr_inode = atomic_read(&sbi->compr_inode); + si->compr_blocks = atomic_read(&sbi->compr_blocks); si->append = sbi->im[APPEND_INO].ino_num; si->update = sbi->im[UPDATE_INO].ino_num; si->orphans = sbi->im[ORPHAN_INO].ino_num; @@ -114,7 +152,6 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID]; si->avail_nids = NM_I(sbi)->available_nids; si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID]; - si->bg_gc = sbi->bg_gc; si->io_skip_bggc = sbi->io_skip_bggc; si->other_skip_bggc = sbi->other_skip_bggc; si->skipped_atomic_files[BG_GC] = sbi->skipped_atomic_files[BG_GC]; @@ -146,39 +183,6 @@ static void update_general_status(struct f2fs_sb_info *sbi) } /* - * This function calculates BDF of every segments - */ -static void update_sit_info(struct f2fs_sb_info *sbi) -{ - struct f2fs_stat_info *si = F2FS_STAT(sbi); - unsigned long long blks_per_sec, hblks_per_sec, total_vblocks; - unsigned long long bimodal, dist; - unsigned int segno, vblocks; - int ndirty = 0; - - bimodal = 0; - total_vblocks = 0; - blks_per_sec = BLKS_PER_SEC(sbi); - hblks_per_sec = blks_per_sec / 2; - for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { - vblocks = get_valid_blocks(sbi, segno, true); - dist = abs(vblocks - hblks_per_sec); - bimodal += dist * dist; - - if (vblocks > 0 && vblocks < blks_per_sec) { - total_vblocks += vblocks; - ndirty++; - } - } - dist = div_u64(MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec, 100); - si->bimodal = div64_u64(bimodal, dist); - if (si->dirty_count) - si->avg_vblocks = div_u64(total_vblocks, ndirty); - else - si->avg_vblocks = 0; -} - -/* * This function calculates memory footprint. */ static void update_mem_info(struct f2fs_sb_info *sbi) @@ -315,6 +319,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_inode); seq_printf(s, " - Inline_dentry Inode: %u\n", si->inline_dir); + seq_printf(s, " - Compressed Inode: %u, Blocks: %u\n", + si->compr_inode, si->compr_blocks); seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n", si->orphans, si->append, si->update); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", @@ -441,7 +447,7 @@ static int stat_show(struct seq_file *s, void *v) si->block_count[LFS], si->segment_count[LFS]); /* segment usage info */ - update_sit_info(si->sbi); + f2fs_update_sit_info(si->sbi); seq_printf(s, "\nBDF: %u, avg. vblocks: %u\n", si->bimodal, si->avg_vblocks); @@ -461,6 +467,7 @@ static int stat_show(struct seq_file *s, void *v) } DEFINE_SHOW_ATTRIBUTE(stat); +#endif int f2fs_build_stats(struct f2fs_sb_info *sbi) { @@ -491,11 +498,12 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inline_xattr, 0); atomic_set(&sbi->inline_inode, 0); atomic_set(&sbi->inline_dir, 0); + atomic_set(&sbi->compr_inode, 0); + atomic_set(&sbi->compr_blocks, 0); atomic_set(&sbi->inplace_count, 0); for (i = META_CP; i < META_MAX; i++) atomic_set(&sbi->meta_count[i], 0); - atomic_set(&sbi->aw_cnt, 0); atomic_set(&sbi->vw_cnt, 0); atomic_set(&sbi->max_aw_cnt, 0); atomic_set(&sbi->max_vw_cnt, 0); @@ -520,14 +528,18 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) void __init f2fs_create_root_stats(void) { +#ifdef CONFIG_DEBUG_FS f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL, &stat_fops); +#endif } void f2fs_destroy_root_stats(void) { +#ifdef CONFIG_DEBUG_FS debugfs_remove_recursive(f2fs_debugfs_root); f2fs_debugfs_root = NULL; +#endif } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index d9ad842945df..27d0dd7a16d6 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -578,6 +578,20 @@ next: goto next; } +bool f2fs_has_enough_room(struct inode *dir, struct page *ipage, + struct fscrypt_name *fname) +{ + struct f2fs_dentry_ptr d; + unsigned int bit_pos; + int slots = GET_DENTRY_SLOTS(fname_len(fname)); + + make_dentry_ptr_inline(dir, &d, inline_data_addr(dir, ipage)); + + bit_pos = f2fs_room_for_filename(d.bitmap, slots, d.max); + + return bit_pos < d.max; +} + void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, const struct qstr *name, f2fs_hash_t name_hash, unsigned int bit_pos) @@ -1069,24 +1083,27 @@ static int f2fs_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { struct qstr qstr = {.name = str, .len = len }; + const struct dentry *parent = READ_ONCE(dentry->d_parent); + const struct inode *inode = READ_ONCE(parent->d_inode); - if (!IS_CASEFOLDED(dentry->d_parent->d_inode)) { + if (!inode || !IS_CASEFOLDED(inode)) { if (len != name->len) return -1; - return memcmp(str, name, len); + return memcmp(str, name->name, len); } - return f2fs_ci_compare(dentry->d_parent->d_inode, name, &qstr, false); + return f2fs_ci_compare(inode, name, &qstr, false); } static int f2fs_d_hash(const struct dentry *dentry, struct qstr *str) { struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); const struct unicode_map *um = sbi->s_encoding; + const struct inode *inode = READ_ONCE(dentry->d_inode); unsigned char *norm; int len, ret = 0; - if (!IS_CASEFOLDED(dentry->d_inode)) + if (!inode || !IS_CASEFOLDED(inode)) return 0; norm = f2fs_kmalloc(sbi, PATH_MAX, GFP_ATOMIC); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 059ade83bfb1..5355be6b6755 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -116,6 +116,8 @@ typedef u32 block_t; /* */ typedef u32 nid_t; +#define COMPRESS_EXT_NUM 16 + struct f2fs_mount_info { unsigned int opt; int write_io_size_bits; /* Write IO size bits */ @@ -140,6 +142,12 @@ struct f2fs_mount_info { block_t unusable_cap; /* Amount of space allowed to be * unusable when disabling checkpoint */ + + /* For compression */ + unsigned char compress_algorithm; /* algorithm type */ + unsigned compress_log_size; /* cluster log size */ + unsigned char compress_ext_cnt; /* extension count */ + unsigned char extensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ }; #define F2FS_FEATURE_ENCRYPT 0x0001 @@ -155,6 +163,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_VERITY 0x0400 #define F2FS_FEATURE_SB_CHKSUM 0x0800 #define F2FS_FEATURE_CASEFOLD 0x1000 +#define F2FS_FEATURE_COMPRESSION 0x2000 #define __F2FS_HAS_FEATURE(raw_super, mask) \ ((raw_super->feature & cpu_to_le32(mask)) != 0) @@ -712,6 +721,12 @@ struct f2fs_inode_info { int i_inline_xattr_size; /* inline xattr size */ struct timespec64 i_crtime; /* inode creation time */ struct timespec64 i_disk_time[4];/* inode disk times */ + + /* for file compress */ + u64 i_compr_blocks; /* # of compressed blocks */ + unsigned char i_compress_algorithm; /* algorithm type */ + unsigned char i_log_cluster_size; /* log of cluster size */ + unsigned int i_cluster_size; /* cluster size */ }; static inline void get_extent_info(struct extent_info *ext, @@ -1018,6 +1033,7 @@ enum need_lock_type { enum cp_reason_type { CP_NO_NEEDED, CP_NON_REGULAR, + CP_COMPRESSED, CP_HARDLINK, CP_SB_NEED_CP, CP_WRONG_PINO, @@ -1056,12 +1072,15 @@ struct f2fs_io_info { block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ + struct page *compressed_page; /* compressed page */ struct list_head list; /* serialize IOs */ bool submitted; /* indicate IO submission */ int need_lock; /* indicate we need to lock cp_rwsem */ bool in_list; /* indicate fio is in io_list */ bool is_por; /* indicate IO is from recovery or not */ bool retry; /* need to reallocate block address */ + int compr_blocks; /* # of compressed block addresses */ + bool encrypted; /* indicate file is encrypted */ enum iostat_type io_type; /* io type */ struct writeback_control *io_wbc; /* writeback control */ struct bio **bio; /* bio for ipu */ @@ -1169,6 +1188,18 @@ enum fsync_mode { FSYNC_MODE_NOBARRIER, /* fsync behaves nobarrier based on posix */ }; +/* + * this value is set in page as a private data which indicate that + * the page is atomically written, and it is in inmem_pages list. + */ +#define ATOMIC_WRITTEN_PAGE ((unsigned long)-1) +#define DUMMY_WRITTEN_PAGE ((unsigned long)-2) + +#define IS_ATOMIC_WRITTEN_PAGE(page) \ + (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE) +#define IS_DUMMY_WRITTEN_PAGE(page) \ + (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) + #ifdef CONFIG_FS_ENCRYPTION #define DUMMY_ENCRYPTION_ENABLED(sbi) \ (unlikely(F2FS_OPTION(sbi).test_dummy_encryption)) @@ -1176,6 +1207,75 @@ enum fsync_mode { #define DUMMY_ENCRYPTION_ENABLED(sbi) (0) #endif +/* For compression */ +enum compress_algorithm_type { + COMPRESS_LZO, + COMPRESS_LZ4, + COMPRESS_MAX, +}; + +#define COMPRESS_DATA_RESERVED_SIZE 4 +struct compress_data { + __le32 clen; /* compressed data size */ + __le32 chksum; /* checksum of compressed data */ + __le32 reserved[COMPRESS_DATA_RESERVED_SIZE]; /* reserved */ + u8 cdata[]; /* compressed data */ +}; + +#define COMPRESS_HEADER_SIZE (sizeof(struct compress_data)) + +#define F2FS_COMPRESSED_PAGE_MAGIC 0xF5F2C000 + +/* compress context */ +struct compress_ctx { + struct inode *inode; /* inode the context belong to */ + pgoff_t cluster_idx; /* cluster index number */ + unsigned int cluster_size; /* page count in cluster */ + unsigned int log_cluster_size; /* log of cluster size */ + struct page **rpages; /* pages store raw data in cluster */ + unsigned int nr_rpages; /* total page number in rpages */ + struct page **cpages; /* pages store compressed data in cluster */ + unsigned int nr_cpages; /* total page number in cpages */ + void *rbuf; /* virtual mapped address on rpages */ + struct compress_data *cbuf; /* virtual mapped address on cpages */ + size_t rlen; /* valid data length in rbuf */ + size_t clen; /* valid data length in cbuf */ + void *private; /* payload buffer for specified compression algorithm */ +}; + +/* compress context for write IO path */ +struct compress_io_ctx { + u32 magic; /* magic number to indicate page is compressed */ + struct inode *inode; /* inode the context belong to */ + struct page **rpages; /* pages store raw data in cluster */ + unsigned int nr_rpages; /* total page number in rpages */ + refcount_t ref; /* referrence count of raw page */ +}; + +/* decompress io context for read IO path */ +struct decompress_io_ctx { + u32 magic; /* magic number to indicate page is compressed */ + struct inode *inode; /* inode the context belong to */ + pgoff_t cluster_idx; /* cluster index number */ + unsigned int cluster_size; /* page count in cluster */ + unsigned int log_cluster_size; /* log of cluster size */ + struct page **rpages; /* pages store raw data in cluster */ + unsigned int nr_rpages; /* total page number in rpages */ + struct page **cpages; /* pages store compressed data in cluster */ + unsigned int nr_cpages; /* total page number in cpages */ + struct page **tpages; /* temp pages to pad holes in cluster */ + void *rbuf; /* virtual mapped address on rpages */ + struct compress_data *cbuf; /* virtual mapped address on cpages */ + size_t rlen; /* valid data length in rbuf */ + size_t clen; /* valid data length in cbuf */ + refcount_t ref; /* referrence count of compressed page */ + bool failed; /* indicate IO error during decompression */ +}; + +#define NULL_CLUSTER ((unsigned int)(~0)) +#define MIN_COMPRESS_LOG_SIZE 2 +#define MAX_COMPRESS_LOG_SIZE 8 + struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ @@ -1291,7 +1391,10 @@ struct f2fs_sb_info { struct f2fs_mount_info mount_opt; /* mount options */ /* for cleaning operations */ - struct mutex gc_mutex; /* mutex for GC */ + struct rw_semaphore gc_lock; /* + * semaphore for GC, avoid + * race between GC and GC or CP + */ struct f2fs_gc_kthread *gc_thread; /* GC thread */ unsigned int cur_victim_sec; /* current victim section num */ unsigned int gc_mode; /* current GC state */ @@ -1327,11 +1430,11 @@ struct f2fs_sb_info { atomic_t inline_xattr; /* # of inline_xattr inodes */ atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ - atomic_t aw_cnt; /* # of atomic writes */ + atomic_t compr_inode; /* # of compressed inodes */ + atomic_t compr_blocks; /* # of compressed blocks */ atomic_t vw_cnt; /* # of volatile writes */ atomic_t max_aw_cnt; /* max # of atomic writes */ atomic_t max_vw_cnt; /* max # of volatile writes */ - int bg_gc; /* background gc calls */ unsigned int io_skip_bggc; /* skip background gc for in-flight IO */ unsigned int other_skip_bggc; /* skip background gc for other reasons */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ @@ -1365,6 +1468,8 @@ struct f2fs_sb_info { /* Precomputed FS UUID checksum for seeding other checksums */ __u32 s_chksum_seed; + + struct workqueue_struct *post_read_wq; /* post read workqueue */ }; struct f2fs_private_dio { @@ -2222,26 +2327,6 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, return entry; } -static inline struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, - int npages, bool no_fail) -{ - struct bio *bio; - - if (no_fail) { - /* No failure on bio allocation */ - bio = bio_alloc(GFP_NOIO, npages); - if (!bio) - bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages); - return bio; - } - if (time_to_inject(sbi, FAULT_ALLOC_BIO)) { - f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO); - return NULL; - } - - return bio_alloc(GFP_KERNEL, npages); -} - static inline bool is_idle(struct f2fs_sb_info *sbi, int type) { if (sbi->gc_mode == GC_URGENT) @@ -2378,11 +2463,13 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) /* * On-disk inode flags (f2fs_inode::i_flags) */ +#define F2FS_COMPR_FL 0x00000004 /* Compress file */ #define F2FS_SYNC_FL 0x00000008 /* Synchronous updates */ #define F2FS_IMMUTABLE_FL 0x00000010 /* Immutable file */ #define F2FS_APPEND_FL 0x00000020 /* writes to file may only append */ #define F2FS_NODUMP_FL 0x00000040 /* do not dump file */ #define F2FS_NOATIME_FL 0x00000080 /* do not update atime */ +#define F2FS_NOCOMP_FL 0x00000400 /* Don't compress */ #define F2FS_INDEX_FL 0x00001000 /* hash-indexed directory */ #define F2FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ #define F2FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ @@ -2391,7 +2478,7 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) /* Flags that should be inherited by new inodes from their parent. */ #define F2FS_FL_INHERITED (F2FS_SYNC_FL | F2FS_NODUMP_FL | F2FS_NOATIME_FL | \ F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ - F2FS_CASEFOLD_FL) + F2FS_CASEFOLD_FL | F2FS_COMPR_FL | F2FS_NOCOMP_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). */ #define F2FS_REG_FLMASK (~(F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \ @@ -2443,6 +2530,8 @@ enum { FI_PIN_FILE, /* indicate file should not be gced */ FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */ FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ + FI_COMPRESSED_FILE, /* indicate file's data can be compressed */ + FI_MMAP_FILE, /* indicate file was mmapped */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, @@ -2459,6 +2548,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, case FI_DATA_EXIST: case FI_INLINE_DOTS: case FI_PIN_FILE: + case FI_COMPRESSED_FILE: f2fs_mark_inode_dirty_sync(inode, true); } } @@ -2614,16 +2704,27 @@ static inline int f2fs_has_inline_xattr(struct inode *inode) return is_inode_flag_set(inode, FI_INLINE_XATTR); } +static inline int f2fs_compressed_file(struct inode *inode) +{ + return S_ISREG(inode->i_mode) && + is_inode_flag_set(inode, FI_COMPRESSED_FILE); +} + static inline unsigned int addrs_per_inode(struct inode *inode) { unsigned int addrs = CUR_ADDRS_PER_INODE(inode) - get_inline_xattr_addrs(inode); - return ALIGN_DOWN(addrs, 1); + + if (!f2fs_compressed_file(inode)) + return addrs; + return ALIGN_DOWN(addrs, F2FS_I(inode)->i_cluster_size); } static inline unsigned int addrs_per_block(struct inode *inode) { - return ALIGN_DOWN(DEF_ADDRS_PER_BLOCK, 1); + if (!f2fs_compressed_file(inode)) + return DEF_ADDRS_PER_BLOCK; + return ALIGN_DOWN(DEF_ADDRS_PER_BLOCK, F2FS_I(inode)->i_cluster_size); } static inline void *inline_xattr_addr(struct inode *inode, struct page *page) @@ -2656,6 +2757,11 @@ static inline int f2fs_has_inline_dots(struct inode *inode) return is_inode_flag_set(inode, FI_INLINE_DOTS); } +static inline int f2fs_is_mmap_file(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_MMAP_FILE); +} + static inline bool f2fs_is_pinned_file(struct inode *inode) { return is_inode_flag_set(inode, FI_PIN_FILE); @@ -2783,7 +2889,8 @@ static inline bool f2fs_may_extent_tree(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); if (!test_opt(sbi, EXTENT_CACHE) || - is_inode_flag_set(inode, FI_NO_EXTENT)) + is_inode_flag_set(inode, FI_NO_EXTENT) || + is_inode_flag_set(inode, FI_COMPRESSED_FILE)) return false; /* @@ -2903,7 +3010,8 @@ static inline void verify_blkaddr(struct f2fs_sb_info *sbi, static inline bool __is_valid_data_blkaddr(block_t blkaddr) { - if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR || + blkaddr == COMPRESS_ADDR) return false; return true; } @@ -3001,6 +3109,8 @@ ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, struct page **page); void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, struct page *page, struct inode *inode); +bool f2fs_has_enough_room(struct inode *dir, struct page *ipage, + struct fscrypt_name *fname); void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, const struct qstr *name, f2fs_hash_t name_hash, unsigned int bit_pos); @@ -3155,6 +3265,8 @@ void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, unsigned int val, int alloc); void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi); +int f2fs_check_write_pointer(struct f2fs_sb_info *sbi); int f2fs_build_segment_manager(struct f2fs_sb_info *sbi); void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi); int __init f2fs_create_segment_manager_caches(void); @@ -3205,10 +3317,13 @@ void f2fs_destroy_checkpoint_caches(void); /* * data.c */ -int f2fs_init_post_read_processing(void); -void f2fs_destroy_post_read_processing(void); +int __init f2fs_init_bioset(void); +void f2fs_destroy_bioset(void); +struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool no_fail); int f2fs_init_bio_entry_cache(void); void f2fs_destroy_bio_entry_cache(void); +void f2fs_submit_bio(struct f2fs_sb_info *sbi, + struct bio *bio, enum page_type type); void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, @@ -3245,8 +3360,14 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int create, int flag); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); +int f2fs_encrypt_one_page(struct f2fs_io_info *fio); bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio); bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio); +int f2fs_write_single_data_page(struct page *page, int *submitted, + struct bio **bio, sector_t *last_block, + struct writeback_control *wbc, + enum iostat_type io_type, + int compr_blocks); void f2fs_invalidate_page(struct page *page, unsigned int offset, unsigned int length); int f2fs_release_page(struct page *page, gfp_t wait); @@ -3256,6 +3377,10 @@ int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, #endif bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len); void f2fs_clear_page_cache_dirty_tag(struct page *page); +int f2fs_init_post_read_processing(void); +void f2fs_destroy_post_read_processing(void); +int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi); +void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi); /* * gc.c @@ -3302,6 +3427,7 @@ struct f2fs_stat_info { int nr_discard_cmd; unsigned int undiscard_blks; int inline_xattr, inline_inode, inline_dir, append, update, orphans; + int compr_inode, compr_blocks; int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; @@ -3333,7 +3459,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_inc_cp_count(si) ((si)->cp_count++) #define stat_inc_bg_cp_count(si) ((si)->bg_cp_count++) #define stat_inc_call_count(si) ((si)->call_count++) -#define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) +#define stat_inc_bggc_count(si) ((si)->bg_gc++) #define stat_io_skip_bggc_count(sbi) ((sbi)->io_skip_bggc++) #define stat_other_skip_bggc_count(sbi) ((sbi)->other_skip_bggc++) #define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) @@ -3372,6 +3498,20 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) if (f2fs_has_inline_dentry(inode)) \ (atomic_dec(&F2FS_I_SB(inode)->inline_dir)); \ } while (0) +#define stat_inc_compr_inode(inode) \ + do { \ + if (f2fs_compressed_file(inode)) \ + (atomic_inc(&F2FS_I_SB(inode)->compr_inode)); \ + } while (0) +#define stat_dec_compr_inode(inode) \ + do { \ + if (f2fs_compressed_file(inode)) \ + (atomic_dec(&F2FS_I_SB(inode)->compr_inode)); \ + } while (0) +#define stat_add_compr_blocks(inode, blocks) \ + (atomic_add(blocks, &F2FS_I_SB(inode)->compr_blocks)) +#define stat_sub_compr_blocks(inode, blocks) \ + (atomic_sub(blocks, &F2FS_I_SB(inode)->compr_blocks)) #define stat_inc_meta_count(sbi, blkaddr) \ do { \ if (blkaddr < SIT_I(sbi)->sit_base_addr) \ @@ -3389,13 +3529,9 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) ((sbi)->block_count[(curseg)->alloc_type]++) #define stat_inc_inplace_blocks(sbi) \ (atomic_inc(&(sbi)->inplace_count)) -#define stat_inc_atomic_write(inode) \ - (atomic_inc(&F2FS_I_SB(inode)->aw_cnt)) -#define stat_dec_atomic_write(inode) \ - (atomic_dec(&F2FS_I_SB(inode)->aw_cnt)) #define stat_update_max_atomic_write(inode) \ do { \ - int cur = atomic_read(&F2FS_I_SB(inode)->aw_cnt); \ + int cur = F2FS_I_SB(inode)->atomic_files; \ int max = atomic_read(&F2FS_I_SB(inode)->max_aw_cnt); \ if (cur > max) \ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ @@ -3447,6 +3583,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi); void f2fs_destroy_stats(struct f2fs_sb_info *sbi); void __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); +void f2fs_update_sit_info(struct f2fs_sb_info *sbi); #else #define stat_inc_cp_count(si) do { } while (0) #define stat_inc_bg_cp_count(si) do { } while (0) @@ -3456,8 +3593,8 @@ void f2fs_destroy_root_stats(void); #define stat_other_skip_bggc_count(sbi) do { } while (0) #define stat_inc_dirty_inode(sbi, type) do { } while (0) #define stat_dec_dirty_inode(sbi, type) do { } while (0) -#define stat_inc_total_hit(sb) do { } while (0) -#define stat_inc_rbtree_node_hit(sb) do { } while (0) +#define stat_inc_total_hit(sbi) do { } while (0) +#define stat_inc_rbtree_node_hit(sbi) do { } while (0) #define stat_inc_largest_node_hit(sbi) do { } while (0) #define stat_inc_cached_node_hit(sbi) do { } while (0) #define stat_inc_inline_xattr(inode) do { } while (0) @@ -3466,6 +3603,10 @@ void f2fs_destroy_root_stats(void); #define stat_dec_inline_inode(inode) do { } while (0) #define stat_inc_inline_dir(inode) do { } while (0) #define stat_dec_inline_dir(inode) do { } while (0) +#define stat_inc_compr_inode(inode) do { } while (0) +#define stat_dec_compr_inode(inode) do { } while (0) +#define stat_add_compr_blocks(inode, blocks) do { } while (0) +#define stat_sub_compr_blocks(inode, blocks) do { } while (0) #define stat_inc_atomic_write(inode) do { } while (0) #define stat_dec_atomic_write(inode) do { } while (0) #define stat_update_max_atomic_write(inode) do { } while (0) @@ -3485,6 +3626,7 @@ static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } static inline void __init f2fs_create_root_stats(void) { } static inline void f2fs_destroy_root_stats(void) { } +static inline void update_sit_info(struct f2fs_sb_info *sbi) {} #endif extern const struct file_operations f2fs_dir_operations; @@ -3513,6 +3655,7 @@ void f2fs_truncate_inline_inode(struct inode *inode, int f2fs_read_inline_data(struct inode *inode, struct page *page); int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page); int f2fs_convert_inline_inode(struct inode *inode); +int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry); int f2fs_write_inline_data(struct inode *inode, struct page *page); bool f2fs_recover_inline_data(struct inode *inode, struct page *npage); struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, @@ -3605,7 +3748,85 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode) */ static inline bool f2fs_post_read_required(struct inode *inode) { - return f2fs_encrypted_file(inode) || fsverity_active(inode); + return f2fs_encrypted_file(inode) || fsverity_active(inode) || + f2fs_compressed_file(inode); +} + +/* + * compress.c + */ +#ifdef CONFIG_F2FS_FS_COMPRESSION +bool f2fs_is_compressed_page(struct page *page); +struct page *f2fs_compress_control_page(struct page *page); +int f2fs_prepare_compress_overwrite(struct inode *inode, + struct page **pagep, pgoff_t index, void **fsdata); +bool f2fs_compress_write_end(struct inode *inode, void *fsdata, + pgoff_t index, unsigned copied); +void f2fs_compress_write_end_io(struct bio *bio, struct page *page); +bool f2fs_is_compress_backend_ready(struct inode *inode); +void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity); +bool f2fs_cluster_is_empty(struct compress_ctx *cc); +bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); +void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page); +int f2fs_write_multi_pages(struct compress_ctx *cc, + int *submitted, + struct writeback_control *wbc, + enum iostat_type io_type); +int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index); +int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, + unsigned nr_pages, sector_t *last_block_in_bio, + bool is_readahead); +struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); +void f2fs_free_dic(struct decompress_io_ctx *dic); +void f2fs_decompress_end_io(struct page **rpages, + unsigned int cluster_size, bool err, bool verity); +int f2fs_init_compress_ctx(struct compress_ctx *cc); +void f2fs_destroy_compress_ctx(struct compress_ctx *cc); +void f2fs_init_compress_info(struct f2fs_sb_info *sbi); +#else +static inline bool f2fs_is_compressed_page(struct page *page) { return false; } +static inline bool f2fs_is_compress_backend_ready(struct inode *inode) +{ + if (!f2fs_compressed_file(inode)) + return true; + /* not support compression */ + return false; +} +static inline struct page *f2fs_compress_control_page(struct page *page) +{ + WARN_ON_ONCE(1); + return ERR_PTR(-EINVAL); +} +#endif + +static inline void set_compress_context(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + F2FS_I(inode)->i_compress_algorithm = + F2FS_OPTION(sbi).compress_algorithm; + F2FS_I(inode)->i_log_cluster_size = + F2FS_OPTION(sbi).compress_log_size; + F2FS_I(inode)->i_cluster_size = + 1 << F2FS_I(inode)->i_log_cluster_size; + F2FS_I(inode)->i_flags |= F2FS_COMPR_FL; + set_inode_flag(inode, FI_COMPRESSED_FILE); + stat_inc_compr_inode(inode); +} + +static inline u64 f2fs_disable_compressed_file(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + + if (!f2fs_compressed_file(inode)) + return 0; + if (fi->i_compr_blocks) + return fi->i_compr_blocks; + + fi->i_flags &= ~F2FS_COMPR_FL; + clear_inode_flag(inode, FI_COMPRESSED_FILE); + stat_dec_compr_inode(inode); + return 0; } #define F2FS_FEATURE_FUNCS(name, flagname) \ @@ -3626,6 +3847,7 @@ F2FS_FEATURE_FUNCS(lost_found, LOST_FOUND); F2FS_FEATURE_FUNCS(verity, VERITY); F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM); F2FS_FEATURE_FUNCS(casefold, CASEFOLD); +F2FS_FEATURE_FUNCS(compression, COMPRESSION); #ifdef CONFIG_BLK_DEV_ZONED static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, @@ -3707,6 +3929,30 @@ static inline bool f2fs_may_encrypt(struct inode *inode) #endif } +static inline bool f2fs_may_compress(struct inode *inode) +{ + if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) || + f2fs_is_atomic_file(inode) || + f2fs_is_volatile_file(inode)) + return false; + return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode); +} + +static inline void f2fs_i_compr_blocks_update(struct inode *inode, + u64 blocks, bool add) +{ + int diff = F2FS_I(inode)->i_cluster_size - blocks; + + if (add) { + F2FS_I(inode)->i_compr_blocks += diff; + stat_add_compr_blocks(inode, diff); + } else { + F2FS_I(inode)->i_compr_blocks -= diff; + stat_sub_compr_blocks(inode, diff); + } + f2fs_mark_inode_dirty_sync(inode, true); +} + static inline int block_unaligned_IO(struct inode *inode, struct kiocb *iocb, struct iov_iter *iter) { @@ -3738,6 +3984,8 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, return true; if (f2fs_is_multi_device(sbi)) return true; + if (f2fs_compressed_file(inode)) + return true; /* * for blkzoned device, fallback direct IO to buffered IO, so * all IOs can be serialized by log-structured write. diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 85af112e868d..0d4da644df3b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -50,8 +50,9 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct dnode_of_data dn = { .node_changed = false }; - int err; + struct dnode_of_data dn; + bool need_alloc = true; + int err = 0; if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; @@ -63,6 +64,26 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) goto err; } +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (f2fs_compressed_file(inode)) { + int ret = f2fs_is_compressed_cluster(inode, page->index); + + if (ret < 0) { + err = ret; + goto err; + } else if (ret) { + if (ret < F2FS_I(inode)->i_cluster_size) { + err = -EAGAIN; + goto err; + } + need_alloc = false; + } + } +#endif + /* should do out of any locked page */ + if (need_alloc) + f2fs_balance_fs(sbi, true); + sb_start_pagefault(inode->i_sb); f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); @@ -78,15 +99,17 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) goto out_sem; } - /* block allocation */ - __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); - set_new_dnode(&dn, inode, NULL, NULL, 0); - err = f2fs_get_block(&dn, page->index); - f2fs_put_dnode(&dn); - __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); - if (err) { - unlock_page(page); - goto out_sem; + if (need_alloc) { + /* block allocation */ + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_block(&dn, page->index); + f2fs_put_dnode(&dn); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); + if (err) { + unlock_page(page); + goto out_sem; + } } /* fill the page */ @@ -120,8 +143,6 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) out_sem: up_read(&F2FS_I(inode)->i_mmap_sem); - f2fs_balance_fs(sbi, dn.node_changed); - sb_end_pagefault(inode->i_sb); err: return block_page_mkwrite_return(err); @@ -155,6 +176,8 @@ static inline enum cp_reason_type need_do_checkpoint(struct inode *inode) if (!S_ISREG(inode->i_mode)) cp_reason = CP_NON_REGULAR; + else if (f2fs_compressed_file(inode)) + cp_reason = CP_COMPRESSED; else if (inode->i_nlink != 1) cp_reason = CP_HARDLINK; else if (is_sbi_flag_set(sbi, SBI_NEED_CP)) @@ -485,6 +508,9 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; + if (!f2fs_is_compress_backend_ready(inode)) + return -EOPNOTSUPP; + /* we don't need to use inline_data strictly */ err = f2fs_convert_inline_inode(inode); if (err) @@ -492,6 +518,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); vma->vm_ops = &f2fs_file_vm_ops; + set_inode_flag(inode, FI_MMAP_FILE); return 0; } @@ -502,6 +529,9 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) if (err) return err; + if (!f2fs_is_compress_backend_ready(inode)) + return -EOPNOTSUPP; + err = fsverity_file_open(inode, filp); if (err) return err; @@ -518,6 +548,9 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) int nr_free = 0, ofs = dn->ofs_in_node, len = count; __le32 *addr; int base = 0; + bool compressed_cluster = false; + int cluster_index = 0, valid_blocks = 0; + int cluster_size = F2FS_I(dn->inode)->i_cluster_size; if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) base = get_extra_isize(dn->inode); @@ -525,26 +558,43 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) raw_node = F2FS_NODE(dn->node_page); addr = blkaddr_in_node(raw_node) + base + ofs; - for (; count > 0; count--, addr++, dn->ofs_in_node++) { + /* Assumption: truncateion starts with cluster */ + for (; count > 0; count--, addr++, dn->ofs_in_node++, cluster_index++) { block_t blkaddr = le32_to_cpu(*addr); + if (f2fs_compressed_file(dn->inode) && + !(cluster_index & (cluster_size - 1))) { + if (compressed_cluster) + f2fs_i_compr_blocks_update(dn->inode, + valid_blocks, false); + compressed_cluster = (blkaddr == COMPRESS_ADDR); + valid_blocks = 0; + } + if (blkaddr == NULL_ADDR) continue; dn->data_blkaddr = NULL_ADDR; f2fs_set_data_blkaddr(dn); - if (__is_valid_data_blkaddr(blkaddr) && - !f2fs_is_valid_blkaddr(sbi, blkaddr, + if (__is_valid_data_blkaddr(blkaddr)) { + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) - continue; + continue; + if (compressed_cluster) + valid_blocks++; + } - f2fs_invalidate_blocks(sbi, blkaddr); if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page)) clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN); + + f2fs_invalidate_blocks(sbi, blkaddr); nr_free++; } + if (compressed_cluster) + f2fs_i_compr_blocks_update(dn->inode, valid_blocks, false); + if (nr_free) { pgoff_t fofs; /* @@ -587,6 +637,9 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, return 0; } + if (f2fs_compressed_file(inode)) + return 0; + page = f2fs_get_lock_data_page(inode, index, true); if (IS_ERR(page)) return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page); @@ -602,7 +655,7 @@ truncate_out: return 0; } -int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) +static int do_truncate_blocks(struct inode *inode, u64 from, bool lock) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; @@ -667,6 +720,28 @@ free_partial: return err; } +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) +{ + u64 free_from = from; + + /* + * for compressed file, only support cluster size + * aligned truncation. + */ + if (f2fs_compressed_file(inode)) { + size_t cluster_shift = PAGE_SHIFT + + F2FS_I(inode)->i_log_cluster_size; + size_t cluster_mask = (1 << cluster_shift) - 1; + + free_from = from >> cluster_shift; + if (from & cluster_mask) + free_from++; + free_from <<= cluster_shift; + } + + return do_truncate_blocks(inode, free_from, lock); +} + int f2fs_truncate(struct inode *inode) { int err; @@ -754,18 +829,12 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) inode->i_uid = attr->ia_uid; if (ia_valid & ATTR_GID) inode->i_gid = attr->ia_gid; - if (ia_valid & ATTR_ATIME) { - inode->i_atime = timestamp_truncate(attr->ia_atime, - inode); - } - if (ia_valid & ATTR_MTIME) { - inode->i_mtime = timestamp_truncate(attr->ia_mtime, - inode); - } - if (ia_valid & ATTR_CTIME) { - inode->i_ctime = timestamp_truncate(attr->ia_ctime, - inode); - } + if (ia_valid & ATTR_ATIME) + inode->i_atime = attr->ia_atime; + if (ia_valid & ATTR_MTIME) + inode->i_mtime = attr->ia_mtime; + if (ia_valid & ATTR_CTIME) + inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; @@ -786,6 +855,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; + if ((attr->ia_valid & ATTR_SIZE) && + !f2fs_is_compress_backend_ready(inode)) + return -EOPNOTSUPP; + err = setattr_prepare(dentry, attr); if (err) return err; @@ -1026,8 +1099,8 @@ next_dnode: } else if (ret == -ENOENT) { if (dn.max_level == 0) return -ENOENT; - done = min((pgoff_t)ADDRS_PER_BLOCK(inode) - dn.ofs_in_node, - len); + done = min((pgoff_t)ADDRS_PER_BLOCK(inode) - + dn.ofs_in_node, len); blkaddr += done; do_replace += done; goto next; @@ -1190,13 +1263,13 @@ static int __exchange_data_block(struct inode *src_inode, src_blkaddr = f2fs_kvzalloc(F2FS_I_SB(src_inode), array_size(olen, sizeof(block_t)), - GFP_KERNEL); + GFP_NOFS); if (!src_blkaddr) return -ENOMEM; do_replace = f2fs_kvzalloc(F2FS_I_SB(src_inode), array_size(olen, sizeof(int)), - GFP_KERNEL); + GFP_NOFS); if (!do_replace) { kvfree(src_blkaddr); return -ENOMEM; @@ -1563,7 +1636,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, next_alloc: if (has_not_enough_free_secs(sbi, 0, GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); err = f2fs_gc(sbi, true, false, NULL_SEGNO); if (err && err != -ENODATA && err != -EAGAIN) goto out_err; @@ -1621,6 +1694,8 @@ static long f2fs_fallocate(struct file *file, int mode, return -EIO; if (!f2fs_is_checkpoint_ready(F2FS_I_SB(inode))) return -ENOSPC; + if (!f2fs_is_compress_backend_ready(inode)) + return -EOPNOTSUPP; /* f2fs only support ->fallocate for regular file */ if (!S_ISREG(inode->i_mode)) @@ -1630,6 +1705,11 @@ static long f2fs_fallocate(struct file *file, int mode, (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; + if (f2fs_compressed_file(inode) && + (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | + FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE))) + return -EOPNOTSUPP; + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE)) @@ -1719,7 +1799,40 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) return -ENOTEMPTY; } + if (iflags & (F2FS_COMPR_FL | F2FS_NOCOMP_FL)) { + if (!f2fs_sb_has_compression(F2FS_I_SB(inode))) + return -EOPNOTSUPP; + if ((iflags & F2FS_COMPR_FL) && (iflags & F2FS_NOCOMP_FL)) + return -EINVAL; + } + + if ((iflags ^ fi->i_flags) & F2FS_COMPR_FL) { + if (S_ISREG(inode->i_mode) && + (fi->i_flags & F2FS_COMPR_FL || i_size_read(inode) || + F2FS_HAS_BLOCKS(inode))) + return -EINVAL; + if (iflags & F2FS_NOCOMP_FL) + return -EINVAL; + if (iflags & F2FS_COMPR_FL) { + int err = f2fs_convert_inline_inode(inode); + + if (err) + return err; + + if (!f2fs_may_compress(inode)) + return -EINVAL; + + set_compress_context(inode); + } + } + if ((iflags ^ fi->i_flags) & F2FS_NOCOMP_FL) { + if (fi->i_flags & F2FS_COMPR_FL) + return -EINVAL; + } + fi->i_flags = iflags | (fi->i_flags & ~mask); + f2fs_bug_on(F2FS_I_SB(inode), (fi->i_flags & F2FS_COMPR_FL) && + (fi->i_flags & F2FS_NOCOMP_FL)); if (fi->i_flags & F2FS_PROJINHERIT_FL) set_inode_flag(inode, FI_PROJ_INHERIT); @@ -1745,11 +1858,13 @@ static const struct { u32 iflag; u32 fsflag; } f2fs_fsflags_map[] = { + { F2FS_COMPR_FL, FS_COMPR_FL }, { F2FS_SYNC_FL, FS_SYNC_FL }, { F2FS_IMMUTABLE_FL, FS_IMMUTABLE_FL }, { F2FS_APPEND_FL, FS_APPEND_FL }, { F2FS_NODUMP_FL, FS_NODUMP_FL }, { F2FS_NOATIME_FL, FS_NOATIME_FL }, + { F2FS_NOCOMP_FL, FS_NOCOMP_FL }, { F2FS_INDEX_FL, FS_INDEX_FL }, { F2FS_DIRSYNC_FL, FS_DIRSYNC_FL }, { F2FS_PROJINHERIT_FL, FS_PROJINHERIT_FL }, @@ -1757,11 +1872,13 @@ static const struct { }; #define F2FS_GETTABLE_FS_FL ( \ + FS_COMPR_FL | \ FS_SYNC_FL | \ FS_IMMUTABLE_FL | \ FS_APPEND_FL | \ FS_NODUMP_FL | \ FS_NOATIME_FL | \ + FS_NOCOMP_FL | \ FS_INDEX_FL | \ FS_DIRSYNC_FL | \ FS_PROJINHERIT_FL | \ @@ -1772,11 +1889,13 @@ static const struct { FS_CASEFOLD_FL) #define F2FS_SETTABLE_FS_FL ( \ + FS_COMPR_FL | \ FS_SYNC_FL | \ FS_IMMUTABLE_FL | \ FS_APPEND_FL | \ FS_NODUMP_FL | \ FS_NOATIME_FL | \ + FS_NOCOMP_FL | \ FS_DIRSYNC_FL | \ FS_PROJINHERIT_FL | \ FS_CASEFOLD_FL) @@ -1897,6 +2016,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) inode_lock(inode); + f2fs_disable_compressed_file(inode); + if (f2fs_is_atomic_file(inode)) { if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) ret = -EINVAL; @@ -1935,7 +2056,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); F2FS_I(inode)->inmem_task = current; - stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); out: inode_unlock(inode); @@ -2324,12 +2444,12 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) return ret; if (!sync) { - if (!mutex_trylock(&sbi->gc_mutex)) { + if (!down_write_trylock(&sbi->gc_lock)) { ret = -EBUSY; goto out; } } else { - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); } ret = f2fs_gc(sbi, sync, true, NULL_SEGNO); @@ -2367,12 +2487,12 @@ static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) do_more: if (!range.sync) { - if (!mutex_trylock(&sbi->gc_mutex)) { + if (!down_write_trylock(&sbi->gc_lock)) { ret = -EBUSY; goto out; } } else { - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); } ret = f2fs_gc(sbi, range.sync, true, GET_SEGNO(sbi, range.start)); @@ -2803,7 +2923,7 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) end_segno = min(start_segno + range.segments, dev_end_segno); while (start_segno < end_segno) { - if (!mutex_trylock(&sbi->gc_mutex)) { + if (!down_write_trylock(&sbi->gc_lock)) { ret = -EBUSY; goto out; } @@ -3098,10 +3218,16 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) ret = -EAGAIN; goto out; } + ret = f2fs_convert_inline_inode(inode); if (ret) goto out; + if (f2fs_disable_compressed_file(inode)) { + ret = -EOPNOTSUPP; + goto out; + } + set_inode_flag(inode, FI_PIN_FILE); ret = F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN]; done: @@ -3350,6 +3476,17 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } } +static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + + if (!f2fs_is_compress_backend_ready(inode)) + return -EOPNOTSUPP; + + return generic_file_read_iter(iocb, iter); +} + static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; @@ -3361,6 +3498,9 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out; } + if (!f2fs_is_compress_backend_ready(inode)) + return -EOPNOTSUPP; + if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock(inode)) { ret = -EAGAIN; @@ -3389,18 +3529,41 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = -EAGAIN; goto out; } - } else { - preallocated = true; - target_size = iocb->ki_pos + iov_iter_count(from); + goto write; + } - err = f2fs_preallocate_blocks(iocb, from); - if (err) { - clear_inode_flag(inode, FI_NO_PREALLOC); - inode_unlock(inode); - ret = err; - goto out; - } + if (is_inode_flag_set(inode, FI_NO_PREALLOC)) + goto write; + + if (iocb->ki_flags & IOCB_DIRECT) { + /* + * Convert inline data for Direct I/O before entering + * f2fs_direct_IO(). + */ + err = f2fs_convert_inline_inode(inode); + if (err) + goto out_err; + /* + * If force_buffere_io() is true, we have to allocate + * blocks all the time, since f2fs_direct_IO will fall + * back to buffered IO. + */ + if (!f2fs_force_buffered_io(inode, iocb, from) && + allow_outplace_dio(inode, iocb, from)) + goto write; + } + preallocated = true; + target_size = iocb->ki_pos + iov_iter_count(from); + + err = f2fs_preallocate_blocks(iocb, from); + if (err) { +out_err: + clear_inode_flag(inode, FI_NO_PREALLOC); + inode_unlock(inode); + ret = err; + goto out; } +write: ret = __generic_file_write_iter(iocb, from); clear_inode_flag(inode, FI_NO_PREALLOC); @@ -3475,7 +3638,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) const struct file_operations f2fs_file_operations = { .llseek = f2fs_llseek, - .read_iter = generic_file_read_iter, + .read_iter = f2fs_file_read_iter, .write_iter = f2fs_file_write_iter, .open = f2fs_file_open, .release = f2fs_release_file, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b3d399623290..db8725d473b5 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -78,18 +78,18 @@ static int gc_thread_func(void *data) */ if (sbi->gc_mode == GC_URGENT) { wait_ms = gc_th->urgent_sleep_time; - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); goto do_gc; } - if (!mutex_trylock(&sbi->gc_mutex)) { + if (!down_write_trylock(&sbi->gc_lock)) { stat_other_skip_bggc_count(sbi); goto next; } if (!is_idle(sbi, GC_TIME)) { increase_sleep_time(gc_th, &wait_ms); - mutex_unlock(&sbi->gc_mutex); + up_write(&sbi->gc_lock); stat_io_skip_bggc_count(sbi); goto next; } @@ -99,7 +99,7 @@ static int gc_thread_func(void *data) else increase_sleep_time(gc_th, &wait_ms); do_gc: - stat_inc_bggc_count(sbi); + stat_inc_bggc_count(sbi->stat_info); /* if return value is not zero, no victim was selected */ if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO)) @@ -1049,8 +1049,10 @@ next_step: if (phase == 3) { inode = f2fs_iget(sb, dni.ino); - if (IS_ERR(inode) || is_bad_inode(inode)) + if (IS_ERR(inode) || is_bad_inode(inode)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); continue; + } if (!down_write_trylock( &F2FS_I(inode)->i_gc_rwsem[WRITE])) { @@ -1368,7 +1370,7 @@ stop: reserved_segments(sbi), prefree_segments(sbi)); - mutex_unlock(&sbi->gc_mutex); + up_write(&sbi->gc_lock); put_gc_inode(&gc_list); @@ -1407,9 +1409,9 @@ static int free_segment_range(struct f2fs_sb_info *sbi, unsigned int start, .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); do_garbage_collect(sbi, segno, &gc_list, FG_GC); - mutex_unlock(&sbi->gc_mutex); + up_write(&sbi->gc_lock); put_gc_inode(&gc_list); if (get_valid_blocks(sbi, segno, true)) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 896db0416f0e..4167e5408151 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -368,7 +368,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, struct f2fs_dentry_ptr src, dst; int err; - page = f2fs_grab_cache_page(dir->i_mapping, 0, false); + page = f2fs_grab_cache_page(dir->i_mapping, 0, true); if (!page) { f2fs_put_page(ipage, 1); return -ENOMEM; @@ -530,7 +530,7 @@ recover: return err; } -static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, +static int do_convert_inline_dir(struct inode *dir, struct page *ipage, void *inline_dentry) { if (!F2FS_I(dir)->i_dir_level) @@ -539,6 +539,44 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, return f2fs_move_rehashed_dirents(dir, ipage, inline_dentry); } +int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct page *ipage; + struct fscrypt_name fname; + void *inline_dentry = NULL; + int err = 0; + + if (!f2fs_has_inline_dentry(dir)) + return 0; + + f2fs_lock_op(sbi); + + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); + if (err) + goto out; + + ipage = f2fs_get_node_page(sbi, dir->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto out; + } + + if (f2fs_has_enough_room(dir, ipage, &fname)) { + f2fs_put_page(ipage, 1); + goto out; + } + + inline_dentry = inline_data_addr(dir, ipage); + + err = do_convert_inline_dir(dir, ipage, inline_dentry); + if (!err) + f2fs_put_page(ipage, 1); +out: + f2fs_unlock_op(sbi); + return err; +} + int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, const struct qstr *orig_name, struct inode *inode, nid_t ino, umode_t mode) @@ -562,7 +600,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, bit_pos = f2fs_room_for_filename(d.bitmap, slots, d.max); if (bit_pos >= d.max) { - err = f2fs_convert_inline_dir(dir, ipage, inline_dentry); + err = do_convert_inline_dir(dir, ipage, inline_dentry); if (err) return err; err = -EAGAIN; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 502bd491336a..78c3f1d70f1d 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -200,6 +200,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_inode *ri = F2FS_INODE(node_page); unsigned long long iblocks; iblocks = le64_to_cpu(F2FS_INODE(node_page)->i_blocks); @@ -286,6 +287,19 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_compression(sbi) && + fi->i_flags & F2FS_COMPR_FL && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, + i_log_cluster_size)) { + if (ri->i_compress_algorithm >= COMPRESS_MAX) + return false; + if (le64_to_cpu(ri->i_compr_blocks) > inode->i_blocks) + return false; + if (ri->i_log_cluster_size < MIN_COMPRESS_LOG_SIZE || + ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) + return false; + } + return true; } @@ -407,6 +421,18 @@ static int do_read_inode(struct inode *inode) fi->i_crtime.tv_nsec = le32_to_cpu(ri->i_crtime_nsec); } + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_compression(sbi) && + (fi->i_flags & F2FS_COMPR_FL)) { + if (F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, + i_log_cluster_size)) { + fi->i_compr_blocks = le64_to_cpu(ri->i_compr_blocks); + fi->i_compress_algorithm = ri->i_compress_algorithm; + fi->i_log_cluster_size = ri->i_log_cluster_size; + fi->i_cluster_size = 1 << fi->i_log_cluster_size; + set_inode_flag(inode, FI_COMPRESSED_FILE); + } + } + F2FS_I(inode)->i_disk_time[0] = inode->i_atime; F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; @@ -416,6 +442,8 @@ static int do_read_inode(struct inode *inode) stat_inc_inline_xattr(inode); stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); + stat_inc_compr_inode(inode); + stat_add_compr_blocks(inode, F2FS_I(inode)->i_compr_blocks); return 0; } @@ -569,6 +597,17 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) ri->i_crtime_nsec = cpu_to_le32(F2FS_I(inode)->i_crtime.tv_nsec); } + + if (f2fs_sb_has_compression(F2FS_I_SB(inode)) && + F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, + i_log_cluster_size)) { + ri->i_compr_blocks = + cpu_to_le64(F2FS_I(inode)->i_compr_blocks); + ri->i_compress_algorithm = + F2FS_I(inode)->i_compress_algorithm; + ri->i_log_cluster_size = + F2FS_I(inode)->i_log_cluster_size; + } } __set_inode_rdev(inode, ri); @@ -711,6 +750,8 @@ no_delete: stat_dec_inline_xattr(inode); stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); + stat_dec_compr_inode(inode); + stat_sub_compr_blocks(inode, F2FS_I(inode)->i_compr_blocks); if (likely(!f2fs_cp_error(sbi) && !is_sbi_flag_set(sbi, SBI_CP_DISABLED))) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a1c507b0b4ac..2aa035422c0f 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -119,6 +119,13 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL) set_inode_flag(inode, FI_PROJ_INHERIT); + if (f2fs_sb_has_compression(sbi)) { + /* Inherit the compression flag in directory */ + if ((F2FS_I(dir)->i_flags & F2FS_COMPR_FL) && + f2fs_may_compress(inode)) + set_compress_context(inode); + } + f2fs_set_inode_flags(inode); trace_f2fs_new_inode(inode, 0); @@ -149,6 +156,9 @@ static inline int is_extension_exist(const unsigned char *s, const char *sub) size_t sublen = strlen(sub); int i; + if (sublen == 1 && *sub == '*') + return 1; + /* * filename format of multimedia file should be defined as: * "filename + '.' + extension + (optional: '.' + temp extension)". @@ -262,6 +272,45 @@ int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, return 0; } +static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, + const unsigned char *name) +{ + __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; + unsigned char (*ext)[F2FS_EXTENSION_LEN]; + unsigned int ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + int i, cold_count, hot_count; + + if (!f2fs_sb_has_compression(sbi) || + is_inode_flag_set(inode, FI_COMPRESSED_FILE) || + F2FS_I(inode)->i_flags & F2FS_NOCOMP_FL || + !f2fs_may_compress(inode)) + return; + + down_read(&sbi->sb_lock); + + cold_count = le32_to_cpu(sbi->raw_super->extension_count); + hot_count = sbi->raw_super->hot_ext_count; + + for (i = cold_count; i < cold_count + hot_count; i++) { + if (is_extension_exist(name, extlist[i])) { + up_read(&sbi->sb_lock); + return; + } + } + + up_read(&sbi->sb_lock); + + ext = F2FS_OPTION(sbi).extensions; + + for (i = 0; i < ext_cnt; i++) { + if (!is_extension_exist(name, ext[i])) + continue; + + set_compress_context(inode); + return; + } +} + static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { @@ -286,6 +335,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (!test_opt(sbi, DISABLE_EXT_IDENTIFY)) set_file_temperature(sbi, inode, dentry->d_name.name); + set_compress_inode(sbi, inode, dentry->d_name.name); + inode->i_op = &f2fs_file_inode_operations; inode->i_fop = &f2fs_file_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; @@ -797,6 +848,7 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, if (whiteout) { f2fs_i_links_write(inode, false); + inode->i_state |= I_LINKABLE; *whiteout = inode; } else { d_tmpfile(dentry, inode); @@ -849,12 +901,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); struct inode *whiteout = NULL; - struct page *old_dir_page; + struct page *old_dir_page = NULL; struct page *old_page, *new_page = NULL; struct f2fs_dir_entry *old_dir_entry = NULL; struct f2fs_dir_entry *old_entry; struct f2fs_dir_entry *new_entry; - bool is_old_inline = f2fs_has_inline_dentry(old_dir); int err; if (unlikely(f2fs_cp_error(sbi))) @@ -867,6 +918,26 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, F2FS_I(old_dentry->d_inode)->i_projid))) return -EXDEV; + /* + * If new_inode is null, the below renaming flow will + * add a link in old_dir which can conver inline_dir. + * After then, if we failed to get the entry due to other + * reasons like ENOMEM, we had to remove the new entry. + * Instead of adding such the error handling routine, let's + * simply convert first here. + */ + if (old_dir == new_dir && !new_inode) { + err = f2fs_try_convert_inline_dir(old_dir, new_dentry); + if (err) + return err; + } + + if (flags & RENAME_WHITEOUT) { + err = f2fs_create_whiteout(old_dir, &whiteout); + if (err) + return err; + } + err = dquot_initialize(old_dir); if (err) goto out; @@ -898,17 +969,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } } - if (flags & RENAME_WHITEOUT) { - err = f2fs_create_whiteout(old_dir, &whiteout); - if (err) - goto out_dir; - } - if (new_inode) { err = -ENOTEMPTY; if (old_dir_entry && !f2fs_empty_dir(new_inode)) - goto out_whiteout; + goto out_dir; err = -ENOENT; new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, @@ -916,7 +981,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!new_entry) { if (IS_ERR(new_page)) err = PTR_ERR(new_page); - goto out_whiteout; + goto out_dir; } f2fs_balance_fs(sbi, true); @@ -928,6 +993,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto put_out_dir; f2fs_set_link(new_dir, new_entry, new_page, old_inode); + new_page = NULL; new_inode->i_ctime = current_time(new_inode); down_write(&F2FS_I(new_inode)->i_sem); @@ -948,33 +1014,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, err = f2fs_add_link(new_dentry, old_inode); if (err) { f2fs_unlock_op(sbi); - goto out_whiteout; + goto out_dir; } if (old_dir_entry) f2fs_i_links_write(new_dir, true); - - /* - * old entry and new entry can locate in the same inline - * dentry in inode, when attaching new entry in inline dentry, - * it could force inline dentry conversion, after that, - * old_entry and old_page will point to wrong address, in - * order to avoid this, let's do the check and update here. - */ - if (is_old_inline && !f2fs_has_inline_dentry(old_dir)) { - f2fs_put_page(old_page, 0); - old_page = NULL; - - old_entry = f2fs_find_entry(old_dir, - &old_dentry->d_name, &old_page); - if (!old_entry) { - err = -ENOENT; - if (IS_ERR(old_page)) - err = PTR_ERR(old_page); - f2fs_unlock_op(sbi); - goto out_whiteout; - } - } } down_write(&F2FS_I(old_inode)->i_sem); @@ -989,9 +1033,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_mark_inode_dirty_sync(old_inode, false); f2fs_delete_entry(old_entry, old_page, old_dir, NULL); + old_page = NULL; if (whiteout) { - whiteout->i_state |= I_LINKABLE; set_inode_flag(whiteout, FI_INC_LINK); err = f2fs_add_link(old_dentry, whiteout); if (err) @@ -1025,17 +1069,15 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, put_out_dir: f2fs_unlock_op(sbi); - if (new_page) - f2fs_put_page(new_page, 0); -out_whiteout: - if (whiteout) - iput(whiteout); + f2fs_put_page(new_page, 0); out_dir: if (old_dir_entry) f2fs_put_page(old_dir_page, 0); out_old: f2fs_put_page(old_page, 0); out: + if (whiteout) + iput(whiteout); return err; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3314a0f3405e..9d02cdcdbb07 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -875,7 +875,7 @@ static int truncate_dnode(struct dnode_of_data *dn) /* get direct node */ page = f2fs_get_node_page(F2FS_I_SB(dn->inode), dn->nid); - if (IS_ERR(page) && PTR_ERR(page) == -ENOENT) + if (PTR_ERR(page) == -ENOENT) return 1; else if (IS_ERR(page)) return PTR_ERR(page); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 76477f71d4ee..763d5c0951d1 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -723,6 +723,7 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) int ret = 0; unsigned long s_flags = sbi->sb->s_flags; bool need_writecp = false; + bool fix_curseg_write_pointer = false; #ifdef CONFIG_QUOTA int quota_enabled; #endif @@ -774,6 +775,8 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) sbi->sb->s_flags = s_flags; } skip: + fix_curseg_write_pointer = !check_only || list_empty(&inode_list); + destroy_fsync_dnodes(&inode_list, err); destroy_fsync_dnodes(&tmp_inode_list, err); @@ -784,9 +787,22 @@ skip: if (err) { truncate_inode_pages_final(NODE_MAPPING(sbi)); truncate_inode_pages_final(META_MAPPING(sbi)); - } else { - clear_sbi_flag(sbi, SBI_POR_DOING); } + + /* + * If fsync data succeeds or there is no fsync data to recover, + * and the f2fs is not read only, check and fix zoned block devices' + * write pointer consistency. + */ + if (!err && fix_curseg_write_pointer && !f2fs_readonly(sbi->sb) && + f2fs_sb_has_blkzoned(sbi)) { + err = f2fs_fix_curseg_write_pointer(sbi); + ret = err; + } + + if (!err) + clear_sbi_flag(sbi, SBI_POR_DOING); + mutex_unlock(&sbi->cp_mutex); /* let's drop all the directory inodes for clean checkpoint */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 56e81447e2f3..cf0eb002cfd4 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -334,7 +334,6 @@ void f2fs_drop_inmem_pages(struct inode *inode) } fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; - stat_dec_atomic_write(inode); spin_lock(&sbi->inode_lock[ATOMIC_FILE]); if (!list_empty(&fi->inmem_ilist)) @@ -505,7 +504,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) * dir/node pages without enough free segments. */ if (has_not_enough_free_secs(sbi, 0, 0)) { - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); f2fs_gc(sbi, false, false, NULL_SEGNO); } } @@ -2225,7 +2224,7 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) struct sit_info *sit_i = SIT_I(sbi); f2fs_bug_on(sbi, addr == NULL_ADDR); - if (addr == NEW_ADDR) + if (addr == NEW_ADDR || addr == COMPRESS_ADDR) return; invalidate_mapping_pages(META_MAPPING(sbi), addr, addr); @@ -2861,9 +2860,9 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) if (sbi->discard_blks == 0) goto out; - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); err = f2fs_write_checkpoint(sbi, &cpc); - mutex_unlock(&sbi->gc_mutex); + up_write(&sbi->gc_lock); if (err) goto out; @@ -3036,7 +3035,8 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) if (fio->type == DATA) { struct inode *inode = fio->page->mapping->host; - if (is_cold_data(fio->page) || file_is_cold(inode)) + if (is_cold_data(fio->page) || file_is_cold(inode) || + f2fs_compressed_file(inode)) return CURSEG_COLD_DATA; if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || @@ -3289,7 +3289,7 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) stat_inc_inplace_blocks(fio->sbi); - if (fio->bio) + if (fio->bio && !(SM_I(sbi)->ipu_policy & (1 << F2FS_IPU_NOCACHE))) err = f2fs_merge_page_bio(fio); else err = f2fs_submit_page_bio(fio); @@ -4368,6 +4368,263 @@ out: return 0; } +#ifdef CONFIG_BLK_DEV_ZONED + +static int check_zone_write_pointer(struct f2fs_sb_info *sbi, + struct f2fs_dev_info *fdev, + struct blk_zone *zone) +{ + unsigned int wp_segno, wp_blkoff, zone_secno, zone_segno, segno; + block_t zone_block, wp_block, last_valid_block; + unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; + int i, s, b, ret; + struct seg_entry *se; + + if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ) + return 0; + + wp_block = fdev->start_blk + (zone->wp >> log_sectors_per_block); + wp_segno = GET_SEGNO(sbi, wp_block); + wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno); + zone_block = fdev->start_blk + (zone->start >> log_sectors_per_block); + zone_segno = GET_SEGNO(sbi, zone_block); + zone_secno = GET_SEC_FROM_SEG(sbi, zone_segno); + + if (zone_segno >= MAIN_SEGS(sbi)) + return 0; + + /* + * Skip check of zones cursegs point to, since + * fix_curseg_write_pointer() checks them. + */ + for (i = 0; i < NO_CHECK_TYPE; i++) + if (zone_secno == GET_SEC_FROM_SEG(sbi, + CURSEG_I(sbi, i)->segno)) + return 0; + + /* + * Get last valid block of the zone. + */ + last_valid_block = zone_block - 1; + for (s = sbi->segs_per_sec - 1; s >= 0; s--) { + segno = zone_segno + s; + se = get_seg_entry(sbi, segno); + for (b = sbi->blocks_per_seg - 1; b >= 0; b--) + if (f2fs_test_bit(b, se->cur_valid_map)) { + last_valid_block = START_BLOCK(sbi, segno) + b; + break; + } + if (last_valid_block >= zone_block) + break; + } + + /* + * If last valid block is beyond the write pointer, report the + * inconsistency. This inconsistency does not cause write error + * because the zone will not be selected for write operation until + * it get discarded. Just report it. + */ + if (last_valid_block >= wp_block) { + f2fs_notice(sbi, "Valid block beyond write pointer: " + "valid block[0x%x,0x%x] wp[0x%x,0x%x]", + GET_SEGNO(sbi, last_valid_block), + GET_BLKOFF_FROM_SEG0(sbi, last_valid_block), + wp_segno, wp_blkoff); + return 0; + } + + /* + * If there is no valid block in the zone and if write pointer is + * not at zone start, reset the write pointer. + */ + if (last_valid_block + 1 == zone_block && zone->wp != zone->start) { + f2fs_notice(sbi, + "Zone without valid block has non-zero write " + "pointer. Reset the write pointer: wp[0x%x,0x%x]", + wp_segno, wp_blkoff); + ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block, + zone->len >> log_sectors_per_block); + if (ret) { + f2fs_err(sbi, "Discard zone failed: %s (errno=%d)", + fdev->path, ret); + return ret; + } + } + + return 0; +} + +static struct f2fs_dev_info *get_target_zoned_dev(struct f2fs_sb_info *sbi, + block_t zone_blkaddr) +{ + int i; + + for (i = 0; i < sbi->s_ndevs; i++) { + if (!bdev_is_zoned(FDEV(i).bdev)) + continue; + if (sbi->s_ndevs == 1 || (FDEV(i).start_blk <= zone_blkaddr && + zone_blkaddr <= FDEV(i).end_blk)) + return &FDEV(i); + } + + return NULL; +} + +static int report_one_zone_cb(struct blk_zone *zone, unsigned int idx, + void *data) { + memcpy(data, zone, sizeof(struct blk_zone)); + return 0; +} + +static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *cs = CURSEG_I(sbi, type); + struct f2fs_dev_info *zbd; + struct blk_zone zone; + unsigned int cs_section, wp_segno, wp_blkoff, wp_sector_off; + block_t cs_zone_block, wp_block; + unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; + sector_t zone_sector; + int err; + + cs_section = GET_SEC_FROM_SEG(sbi, cs->segno); + cs_zone_block = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, cs_section)); + + zbd = get_target_zoned_dev(sbi, cs_zone_block); + if (!zbd) + return 0; + + /* report zone for the sector the curseg points to */ + zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) + << log_sectors_per_block; + err = blkdev_report_zones(zbd->bdev, zone_sector, 1, + report_one_zone_cb, &zone); + if (err != 1) { + f2fs_err(sbi, "Report zone failed: %s errno=(%d)", + zbd->path, err); + return err; + } + + if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ) + return 0; + + wp_block = zbd->start_blk + (zone.wp >> log_sectors_per_block); + wp_segno = GET_SEGNO(sbi, wp_block); + wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno); + wp_sector_off = zone.wp & GENMASK(log_sectors_per_block - 1, 0); + + if (cs->segno == wp_segno && cs->next_blkoff == wp_blkoff && + wp_sector_off == 0) + return 0; + + f2fs_notice(sbi, "Unaligned curseg[%d] with write pointer: " + "curseg[0x%x,0x%x] wp[0x%x,0x%x]", + type, cs->segno, cs->next_blkoff, wp_segno, wp_blkoff); + + f2fs_notice(sbi, "Assign new section to curseg[%d]: " + "curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff); + allocate_segment_by_default(sbi, type, true); + + /* check consistency of the zone curseg pointed to */ + if (check_zone_write_pointer(sbi, zbd, &zone)) + return -EIO; + + /* check newly assigned zone */ + cs_section = GET_SEC_FROM_SEG(sbi, cs->segno); + cs_zone_block = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, cs_section)); + + zbd = get_target_zoned_dev(sbi, cs_zone_block); + if (!zbd) + return 0; + + zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) + << log_sectors_per_block; + err = blkdev_report_zones(zbd->bdev, zone_sector, 1, + report_one_zone_cb, &zone); + if (err != 1) { + f2fs_err(sbi, "Report zone failed: %s errno=(%d)", + zbd->path, err); + return err; + } + + if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ) + return 0; + + if (zone.wp != zone.start) { + f2fs_notice(sbi, + "New zone for curseg[%d] is not yet discarded. " + "Reset the zone: curseg[0x%x,0x%x]", + type, cs->segno, cs->next_blkoff); + err = __f2fs_issue_discard_zone(sbi, zbd->bdev, + zone_sector >> log_sectors_per_block, + zone.len >> log_sectors_per_block); + if (err) { + f2fs_err(sbi, "Discard zone failed: %s (errno=%d)", + zbd->path, err); + return err; + } + } + + return 0; +} + +int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi) +{ + int i, ret; + + for (i = 0; i < NO_CHECK_TYPE; i++) { + ret = fix_curseg_write_pointer(sbi, i); + if (ret) + return ret; + } + + return 0; +} + +struct check_zone_write_pointer_args { + struct f2fs_sb_info *sbi; + struct f2fs_dev_info *fdev; +}; + +static int check_zone_write_pointer_cb(struct blk_zone *zone, unsigned int idx, + void *data) { + struct check_zone_write_pointer_args *args; + args = (struct check_zone_write_pointer_args *)data; + + return check_zone_write_pointer(args->sbi, args->fdev, zone); +} + +int f2fs_check_write_pointer(struct f2fs_sb_info *sbi) +{ + int i, ret; + struct check_zone_write_pointer_args args; + + for (i = 0; i < sbi->s_ndevs; i++) { + if (!bdev_is_zoned(FDEV(i).bdev)) + continue; + + args.sbi = sbi; + args.fdev = &FDEV(i); + ret = blkdev_report_zones(FDEV(i).bdev, 0, BLK_ALL_ZONES, + check_zone_write_pointer_cb, &args); + if (ret < 0) + return ret; + } + + return 0; +} +#else +int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi) +{ + return 0; +} + +int f2fs_check_write_pointer(struct f2fs_sb_info *sbi) +{ + return 0; +} +#endif + /* * Update min, max modified time for cost-benefit GC algorithm */ diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index a95467b202ea..459dc3901a57 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -200,18 +200,6 @@ struct segment_allocation { void (*allocate_segment)(struct f2fs_sb_info *, int, bool); }; -/* - * this value is set in page as a private data which indicate that - * the page is atomically written, and it is in inmem_pages list. - */ -#define ATOMIC_WRITTEN_PAGE ((unsigned long)-1) -#define DUMMY_WRITTEN_PAGE ((unsigned long)-2) - -#define IS_ATOMIC_WRITTEN_PAGE(page) \ - (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE) -#define IS_DUMMY_WRITTEN_PAGE(page) \ - (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) - #define MAX_SKIP_GC_COUNT 16 struct inmem_pages { @@ -619,8 +607,10 @@ static inline int utilization(struct f2fs_sb_info *sbi) * threashold, * F2FS_IPU_FSYNC - activated in fsync path only for high performance flash * storages. IPU will be triggered only if the # of dirty - * pages over min_fsync_blocks. - * F2FS_IPUT_DISABLE - disable IPU. (=default option) + * pages over min_fsync_blocks. (=default option) + * F2FS_IPU_ASYNC - do IPU given by asynchronous write requests. + * F2FS_IPU_NOCACHE - disable IPU bio cache. + * F2FS_IPUT_DISABLE - disable IPU. (=default option in LFS mode) */ #define DEF_MIN_IPU_UTIL 70 #define DEF_MIN_FSYNC_BLOCKS 8 @@ -635,6 +625,7 @@ enum { F2FS_IPU_SSR_UTIL, F2FS_IPU_FSYNC, F2FS_IPU_ASYNC, + F2FS_IPU_NOCACHE, }; static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 5111e1ffe58a..65a7a432dfee 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -141,6 +141,9 @@ enum { Opt_checkpoint_disable_cap, Opt_checkpoint_disable_cap_perc, Opt_checkpoint_enable, + Opt_compress_algorithm, + Opt_compress_log_size, + Opt_compress_extension, Opt_err, }; @@ -203,6 +206,9 @@ static match_table_t f2fs_tokens = { {Opt_checkpoint_disable_cap, "checkpoint=disable:%u"}, {Opt_checkpoint_disable_cap_perc, "checkpoint=disable:%u%%"}, {Opt_checkpoint_enable, "checkpoint=enable"}, + {Opt_compress_algorithm, "compress_algorithm=%s"}, + {Opt_compress_log_size, "compress_log_size=%u"}, + {Opt_compress_extension, "compress_extension=%s"}, {Opt_err, NULL}, }; @@ -391,8 +397,9 @@ static int parse_options(struct super_block *sb, char *options) { struct f2fs_sb_info *sbi = F2FS_SB(sb); substring_t args[MAX_OPT_ARGS]; + unsigned char (*ext)[F2FS_EXTENSION_LEN]; char *p, *name; - int arg = 0; + int arg = 0, ext_cnt; kuid_t uid; kgid_t gid; #ifdef CONFIG_QUOTA @@ -810,6 +817,66 @@ static int parse_options(struct super_block *sb, char *options) case Opt_checkpoint_enable: clear_opt(sbi, DISABLE_CHECKPOINT); break; + case Opt_compress_algorithm: + if (!f2fs_sb_has_compression(sbi)) { + f2fs_err(sbi, "Compression feature if off"); + return -EINVAL; + } + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (strlen(name) == 3 && !strcmp(name, "lzo")) { + F2FS_OPTION(sbi).compress_algorithm = + COMPRESS_LZO; + } else if (strlen(name) == 3 && + !strcmp(name, "lz4")) { + F2FS_OPTION(sbi).compress_algorithm = + COMPRESS_LZ4; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_compress_log_size: + if (!f2fs_sb_has_compression(sbi)) { + f2fs_err(sbi, "Compression feature is off"); + return -EINVAL; + } + if (args->from && match_int(args, &arg)) + return -EINVAL; + if (arg < MIN_COMPRESS_LOG_SIZE || + arg > MAX_COMPRESS_LOG_SIZE) { + f2fs_err(sbi, + "Compress cluster log size is out of range"); + return -EINVAL; + } + F2FS_OPTION(sbi).compress_log_size = arg; + break; + case Opt_compress_extension: + if (!f2fs_sb_has_compression(sbi)) { + f2fs_err(sbi, "Compression feature is off"); + return -EINVAL; + } + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + + ext = F2FS_OPTION(sbi).extensions; + ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt; + + if (strlen(name) >= F2FS_EXTENSION_LEN || + ext_cnt >= COMPRESS_EXT_NUM) { + f2fs_err(sbi, + "invalid extension length/number"); + kfree(name); + return -EINVAL; + } + + strcpy(ext[ext_cnt], name); + F2FS_OPTION(sbi).compress_ext_cnt++; + kfree(name); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); @@ -1125,6 +1192,8 @@ static void f2fs_put_super(struct super_block *sb) f2fs_destroy_node_manager(sbi); f2fs_destroy_segment_manager(sbi); + f2fs_destroy_post_read_wq(sbi); + kvfree(sbi->ckpt); f2fs_unregister_sysfs(sbi); @@ -1169,9 +1238,9 @@ int f2fs_sync_fs(struct super_block *sb, int sync) cpc.reason = __get_cp_reason(sbi); - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); err = f2fs_write_checkpoint(sbi, &cpc); - mutex_unlock(&sbi->gc_mutex); + up_write(&sbi->gc_lock); } f2fs_trace_ios(NULL, 1); @@ -1213,12 +1282,10 @@ static int f2fs_statfs_project(struct super_block *sb, return PTR_ERR(dquot); spin_lock(&dquot->dq_dqb_lock); - limit = 0; - if (dquot->dq_dqb.dqb_bsoftlimit) - limit = dquot->dq_dqb.dqb_bsoftlimit; - if (dquot->dq_dqb.dqb_bhardlimit && - (!limit || dquot->dq_dqb.dqb_bhardlimit < limit)) - limit = dquot->dq_dqb.dqb_bhardlimit; + limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit, + dquot->dq_dqb.dqb_bhardlimit); + if (limit) + limit >>= sb->s_blocksize_bits; if (limit && buf->f_blocks > limit) { curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits; @@ -1228,12 +1295,8 @@ static int f2fs_statfs_project(struct super_block *sb, (buf->f_blocks - curblock) : 0; } - limit = 0; - if (dquot->dq_dqb.dqb_isoftlimit) - limit = dquot->dq_dqb.dqb_isoftlimit; - if (dquot->dq_dqb.dqb_ihardlimit && - (!limit || dquot->dq_dqb.dqb_ihardlimit < limit)) - limit = dquot->dq_dqb.dqb_ihardlimit; + limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit, + dquot->dq_dqb.dqb_ihardlimit); if (limit && buf->f_files > limit) { buf->f_files = limit; @@ -1340,6 +1403,35 @@ static inline void f2fs_show_quota_options(struct seq_file *seq, #endif } +static inline void f2fs_show_compress_options(struct seq_file *seq, + struct super_block *sb) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + char *algtype = ""; + int i; + + if (!f2fs_sb_has_compression(sbi)) + return; + + switch (F2FS_OPTION(sbi).compress_algorithm) { + case COMPRESS_LZO: + algtype = "lzo"; + break; + case COMPRESS_LZ4: + algtype = "lz4"; + break; + } + seq_printf(seq, ",compress_algorithm=%s", algtype); + + seq_printf(seq, ",compress_log_size=%u", + F2FS_OPTION(sbi).compress_log_size); + + for (i = 0; i < F2FS_OPTION(sbi).compress_ext_cnt; i++) { + seq_printf(seq, ",compress_extension=%s", + F2FS_OPTION(sbi).extensions[i]); + } +} + static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); @@ -1462,6 +1554,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",fsync_mode=%s", "strict"); else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_NOBARRIER) seq_printf(seq, ",fsync_mode=%s", "nobarrier"); + + f2fs_show_compress_options(seq, sbi->sb); return 0; } @@ -1476,6 +1570,9 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).test_dummy_encryption = false; F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); + F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZO; + F2FS_OPTION(sbi).compress_log_size = MIN_COMPRESS_LOG_SIZE; + F2FS_OPTION(sbi).compress_ext_cnt = 0; set_opt(sbi, BG_GC); set_opt(sbi, INLINE_XATTR); @@ -1524,7 +1621,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) f2fs_update_time(sbi, DISABLE_TIME); while (!f2fs_time_over(sbi, DISABLE_TIME)) { - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); err = f2fs_gc(sbi, true, false, NULL_SEGNO); if (err == -ENODATA) { err = 0; @@ -1546,7 +1643,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) goto restore_flag; } - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); cpc.reason = CP_PAUSE; set_sbi_flag(sbi, SBI_CP_DISABLED); err = f2fs_write_checkpoint(sbi, &cpc); @@ -1558,7 +1655,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) spin_unlock(&sbi->stat_lock); out_unlock: - mutex_unlock(&sbi->gc_mutex); + up_write(&sbi->gc_lock); restore_flag: sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ return err; @@ -1566,12 +1663,12 @@ restore_flag: static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) { - mutex_lock(&sbi->gc_mutex); + down_write(&sbi->gc_lock); f2fs_dirty_to_prefree(sbi); clear_sbi_flag(sbi, SBI_CP_DISABLED); set_sbi_flag(sbi, SBI_IS_DIRTY); - mutex_unlock(&sbi->gc_mutex); + up_write(&sbi->gc_lock); f2fs_sync_fs(sbi->sb, 1); } @@ -2158,7 +2255,7 @@ static int f2fs_dquot_commit(struct dquot *dquot) struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb); int ret; - down_read(&sbi->quota_sem); + down_read_nested(&sbi->quota_sem, SINGLE_DEPTH_NESTING); ret = dquot_commit(dquot); if (ret < 0) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); @@ -2182,13 +2279,10 @@ static int f2fs_dquot_acquire(struct dquot *dquot) static int f2fs_dquot_release(struct dquot *dquot) { struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb); - int ret; + int ret = dquot_release(dquot); - down_read(&sbi->quota_sem); - ret = dquot_release(dquot); if (ret < 0) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); - up_read(&sbi->quota_sem); return ret; } @@ -2196,29 +2290,22 @@ static int f2fs_dquot_mark_dquot_dirty(struct dquot *dquot) { struct super_block *sb = dquot->dq_sb; struct f2fs_sb_info *sbi = F2FS_SB(sb); - int ret; - - down_read(&sbi->quota_sem); - ret = dquot_mark_dquot_dirty(dquot); + int ret = dquot_mark_dquot_dirty(dquot); /* if we are using journalled quota */ if (is_journalled_quota(sbi)) set_sbi_flag(sbi, SBI_QUOTA_NEED_FLUSH); - up_read(&sbi->quota_sem); return ret; } static int f2fs_dquot_commit_info(struct super_block *sb, int type) { struct f2fs_sb_info *sbi = F2FS_SB(sb); - int ret; + int ret = dquot_commit_info(sb, type); - down_read(&sbi->quota_sem); - ret = dquot_commit_info(sb, type); if (ret < 0) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); - up_read(&sbi->quota_sem); return ret; } @@ -3311,7 +3398,7 @@ try_onemore: /* init f2fs-specific super block info */ sbi->valid_super_block = valid_super_block; - mutex_init(&sbi->gc_mutex); + init_rwsem(&sbi->gc_lock); mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); mutex_init(&sbi->resize_mutex); @@ -3400,6 +3487,12 @@ try_onemore: goto free_devices; } + err = f2fs_init_post_read_wq(sbi); + if (err) { + f2fs_err(sbi, "Failed to initialize post read workqueue"); + goto free_devices; + } + sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); percpu_counter_set(&sbi->total_valid_inode_count, @@ -3544,6 +3637,17 @@ try_onemore: goto free_meta; } } + + /* + * If the f2fs is not readonly and fsync data recovery succeeds, + * check zoned block devices' write pointer consistency. + */ + if (!err && !f2fs_readonly(sb) && f2fs_sb_has_blkzoned(sbi)) { + err = f2fs_check_write_pointer(sbi); + if (err) + goto free_meta; + } + reset_checkpoint: /* f2fs_recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); @@ -3621,6 +3725,7 @@ free_nm: f2fs_destroy_node_manager(sbi); free_sm: f2fs_destroy_segment_manager(sbi); + f2fs_destroy_post_read_wq(sbi); free_devices: destroy_device_list(sbi); kvfree(sbi->ckpt); @@ -3762,8 +3867,12 @@ static int __init init_f2fs_fs(void) err = f2fs_init_bio_entry_cache(); if (err) goto free_post_read; + err = f2fs_init_bioset(); + if (err) + goto free_bio_enrty_cache; return 0; - +free_bio_enrty_cache: + f2fs_destroy_bio_entry_cache(); free_post_read: f2fs_destroy_post_read_processing(); free_root_stats: @@ -3789,6 +3898,7 @@ fail: static void __exit exit_f2fs_fs(void) { + f2fs_destroy_bioset(); f2fs_destroy_bio_entry_cache(); f2fs_destroy_post_read_processing(); f2fs_destroy_root_stats(); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 70945ceb9c0c..91d649790b1b 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -25,6 +25,9 @@ enum { DCC_INFO, /* struct discard_cmd_control */ NM_INFO, /* struct f2fs_nm_info */ F2FS_SBI, /* struct f2fs_sb_info */ +#ifdef CONFIG_F2FS_STAT_FS + STAT_INFO, /* struct f2fs_stat_info */ +#endif #ifdef CONFIG_F2FS_FAULT_INJECTION FAULT_INFO_RATE, /* struct f2fs_fault_info */ FAULT_INFO_TYPE, /* struct f2fs_fault_info */ @@ -42,6 +45,9 @@ struct f2fs_attr { int id; }; +static ssize_t f2fs_sbi_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf); + static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) { if (struct_type == GC_THREAD) @@ -59,41 +65,25 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) struct_type == FAULT_INFO_TYPE) return (unsigned char *)&F2FS_OPTION(sbi).fault_info; #endif +#ifdef CONFIG_F2FS_STAT_FS + else if (struct_type == STAT_INFO) + return (unsigned char *)F2FS_STAT(sbi); +#endif return NULL; } static ssize_t dirty_segments_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)(dirty_segments(sbi))); + return sprintf(buf, "%llu\n", + (unsigned long long)(dirty_segments(sbi))); } -static ssize_t unusable_show(struct f2fs_attr *a, +static ssize_t free_segments_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - block_t unusable; - - if (test_opt(sbi, DISABLE_CHECKPOINT)) - unusable = sbi->unusable_block_count; - else - unusable = f2fs_get_unusable_blocks(sbi); - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)unusable); -} - -static ssize_t encoding_show(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, char *buf) -{ -#ifdef CONFIG_UNICODE - if (f2fs_sb_has_casefold(sbi)) - return snprintf(buf, PAGE_SIZE, "%s (%d.%d.%d)\n", - sbi->s_encoding->charset, - (sbi->s_encoding->version >> 16) & 0xff, - (sbi->s_encoding->version >> 8) & 0xff, - sbi->s_encoding->version & 0xff); -#endif - return snprintf(buf, PAGE_SIZE, "(none)"); + return sprintf(buf, "%llu\n", + (unsigned long long)(free_segments(sbi))); } static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, @@ -102,10 +92,10 @@ static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, struct super_block *sb = sbi->sb; if (!sb->s_bdev->bd_part) - return snprintf(buf, PAGE_SIZE, "0\n"); + return sprintf(buf, "0\n"); - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)(sbi->kbytes_written + + return sprintf(buf, "%llu\n", + (unsigned long long)(sbi->kbytes_written + BD_PART_WRITTEN(sbi))); } @@ -116,7 +106,7 @@ static ssize_t features_show(struct f2fs_attr *a, int len = 0; if (!sb->s_bdev->bd_part) - return snprintf(buf, PAGE_SIZE, "0\n"); + return sprintf(buf, "0\n"); if (f2fs_sb_has_encrypt(sbi)) len += snprintf(buf, PAGE_SIZE - len, "%s", @@ -154,6 +144,9 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_casefold(sbi)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "casefold"); + if (f2fs_sb_has_compression(sbi)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "compression"); len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "pin_file"); len += snprintf(buf + len, PAGE_SIZE - len, "\n"); @@ -163,9 +156,66 @@ static ssize_t features_show(struct f2fs_attr *a, static ssize_t current_reserved_blocks_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return snprintf(buf, PAGE_SIZE, "%u\n", sbi->current_reserved_blocks); + return sprintf(buf, "%u\n", sbi->current_reserved_blocks); +} + +static ssize_t unusable_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + block_t unusable; + + if (test_opt(sbi, DISABLE_CHECKPOINT)) + unusable = sbi->unusable_block_count; + else + unusable = f2fs_get_unusable_blocks(sbi); + return sprintf(buf, "%llu\n", (unsigned long long)unusable); +} + +static ssize_t encoding_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ +#ifdef CONFIG_UNICODE + if (f2fs_sb_has_casefold(sbi)) + return snprintf(buf, PAGE_SIZE, "%s (%d.%d.%d)\n", + sbi->s_encoding->charset, + (sbi->s_encoding->version >> 16) & 0xff, + (sbi->s_encoding->version >> 8) & 0xff, + sbi->s_encoding->version & 0xff); +#endif + return sprintf(buf, "(none)"); } +#ifdef CONFIG_F2FS_STAT_FS +static ssize_t moved_blocks_foreground_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + + return sprintf(buf, "%llu\n", + (unsigned long long)(si->tot_blks - + (si->bg_data_blks + si->bg_node_blks))); +} + +static ssize_t moved_blocks_background_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + + return sprintf(buf, "%llu\n", + (unsigned long long)(si->bg_data_blks + si->bg_node_blks)); +} + +static ssize_t avg_vblocks_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + + si->dirty_count = dirty_segments(sbi); + f2fs_update_sit_info(sbi); + return sprintf(buf, "%llu\n", (unsigned long long)(si->avg_vblocks)); +} +#endif + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -199,7 +249,7 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, ui = (unsigned int *)(ptr + a->offset); - return snprintf(buf, PAGE_SIZE, "%u\n", *ui); + return sprintf(buf, "%u\n", *ui); } static ssize_t __sbi_store(struct f2fs_attr *a, @@ -389,6 +439,7 @@ enum feat_id { FEAT_VERITY, FEAT_SB_CHECKSUM, FEAT_CASEFOLD, + FEAT_COMPRESSION, }; static ssize_t f2fs_feature_show(struct f2fs_attr *a, @@ -408,7 +459,8 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_VERITY: case FEAT_SB_CHECKSUM: case FEAT_CASEFOLD: - return snprintf(buf, PAGE_SIZE, "supported\n"); + case FEAT_COMPRESSION: + return sprintf(buf, "supported\n"); } return 0; } @@ -437,6 +489,14 @@ static struct f2fs_attr f2fs_attr_##_name = { \ .id = _id, \ } +#define F2FS_STAT_ATTR(_struct_type, _struct_name, _name, _elname) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = f2fs_sbi_show, \ + .struct_type = _struct_type, \ + .offset = offsetof(struct _struct_name, _elname), \ +} + F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent_sleep_time, urgent_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); @@ -478,11 +538,21 @@ F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif F2FS_GENERAL_RO_ATTR(dirty_segments); +F2FS_GENERAL_RO_ATTR(free_segments); F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); F2FS_GENERAL_RO_ATTR(features); F2FS_GENERAL_RO_ATTR(current_reserved_blocks); F2FS_GENERAL_RO_ATTR(unusable); F2FS_GENERAL_RO_ATTR(encoding); +#ifdef CONFIG_F2FS_STAT_FS +F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count); +F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count); +F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, gc_foreground_calls, call_count); +F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, gc_background_calls, bg_gc); +F2FS_GENERAL_RO_ATTR(moved_blocks_background); +F2FS_GENERAL_RO_ATTR(moved_blocks_foreground); +F2FS_GENERAL_RO_ATTR(avg_vblocks); +#endif #ifdef CONFIG_FS_ENCRYPTION F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); @@ -503,6 +573,7 @@ F2FS_FEATURE_RO_ATTR(verity, FEAT_VERITY); #endif F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM); F2FS_FEATURE_RO_ATTR(casefold, FEAT_CASEFOLD); +F2FS_FEATURE_RO_ATTR(compression, FEAT_COMPRESSION); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -543,12 +614,22 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(inject_type), #endif ATTR_LIST(dirty_segments), + ATTR_LIST(free_segments), ATTR_LIST(unusable), ATTR_LIST(lifetime_write_kbytes), ATTR_LIST(features), ATTR_LIST(reserved_blocks), ATTR_LIST(current_reserved_blocks), ATTR_LIST(encoding), +#ifdef CONFIG_F2FS_STAT_FS + ATTR_LIST(cp_foreground_calls), + ATTR_LIST(cp_background_calls), + ATTR_LIST(gc_foreground_calls), + ATTR_LIST(gc_background_calls), + ATTR_LIST(moved_blocks_foreground), + ATTR_LIST(moved_blocks_background), + ATTR_LIST(avg_vblocks), +#endif NULL, }; ATTRIBUTE_GROUPS(f2fs); @@ -573,6 +654,7 @@ static struct attribute *f2fs_feat_attrs[] = { #endif ATTR_LIST(sb_checksum), ATTR_LIST(casefold), + ATTR_LIST(compression), NULL, }; ATTRIBUTE_GROUPS(f2fs_feat); @@ -733,10 +815,12 @@ int __init f2fs_init_sysfs(void) ret = kobject_init_and_add(&f2fs_feat, &f2fs_feat_ktype, NULL, "features"); - if (ret) + if (ret) { + kobject_put(&f2fs_feat); kset_unregister(&f2fs_kset); - else + } else { f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + } return ret; } @@ -757,8 +841,11 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) init_completion(&sbi->s_kobj_unregister); err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL, "%s", sb->s_id); - if (err) + if (err) { + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); return err; + } if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); @@ -786,4 +873,5 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); } kobject_del(&sbi->s_kobj); + kobject_put(&sbi->s_kobj); } diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 1e08bd54c5fb..f1b2a1fc2a6a 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -271,6 +271,14 @@ static inline struct timespec64 fat_timespec64_trunc_2secs(struct timespec64 ts) { return (struct timespec64){ ts.tv_sec & ~1ULL, 0 }; } + +static inline struct timespec64 fat_timespec64_trunc_10ms(struct timespec64 ts) +{ + if (ts.tv_nsec) + ts.tv_nsec -= ts.tv_nsec % 10000000UL; + return ts; +} + /* * truncate the various times with appropriate granularity: * root inode: @@ -308,7 +316,7 @@ int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags) } if (flags & S_CTIME) { if (sbi->options.isvfat) - inode->i_ctime = timespec64_trunc(*now, 10000000); + inode->i_ctime = fat_timespec64_trunc_10ms(*now); else inode->i_ctime = fat_timespec64_trunc_2secs(*now); } diff --git a/fs/file.c b/fs/file.c index 3da91a112bab..a364e1a9b7e8 100644 --- a/fs/file.c +++ b/fs/file.c @@ -642,7 +642,9 @@ out_unlock: EXPORT_SYMBOL(__close_fd); /* for ksys_close() */ /* - * variant of __close_fd that gets a ref on the file for later fput + * variant of __close_fd that gets a ref on the file for later fput. + * The caller must ensure that filp_close() called on the file, and then + * an fput(). */ int __close_fd_get_file(unsigned int fd, struct file **res) { @@ -662,7 +664,7 @@ int __close_fd_get_file(unsigned int fd, struct file **res) spin_unlock(&files->file_lock); get_file(file); *res = file; - return filp_close(file, files); + return 0; out_unlock: spin_unlock(&files->file_lock); @@ -706,9 +708,9 @@ void do_close_on_exec(struct files_struct *files) spin_unlock(&files->file_lock); } -static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs) +static struct file *__fget_files(struct files_struct *files, unsigned int fd, + fmode_t mask, unsigned int refs) { - struct files_struct *files = current->files; struct file *file; rcu_read_lock(); @@ -729,6 +731,12 @@ loop: return file; } +static inline struct file *__fget(unsigned int fd, fmode_t mask, + unsigned int refs) +{ + return __fget_files(current->files, fd, mask, refs); +} + struct file *fget_many(unsigned int fd, unsigned int refs) { return __fget(fd, FMODE_PATH, refs); @@ -746,6 +754,18 @@ struct file *fget_raw(unsigned int fd) } EXPORT_SYMBOL(fget_raw); +struct file *fget_task(struct task_struct *task, unsigned int fd) +{ + struct file *file = NULL; + + task_lock(task); + if (task->files) + file = __fget_files(task->files, fd, 0, 1); + task_unlock(task); + + return file; +} + /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. * diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 335607b8c5c0..76ac9c7d32ec 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2063,7 +2063,7 @@ void wb_workfn(struct work_struct *work) struct bdi_writeback, dwork); long pages_written; - set_worker_desc("flush-%s", dev_name(wb->bdi->dev)); + set_worker_desc("flush-%s", bdi_dev_name(wb->bdi)); current->flags |= PF_SWAPWRITE; if (likely(!current_is_workqueue_rescuer() || diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 9616af3768e1..08e91efbce53 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -111,7 +111,7 @@ extern void fscache_enqueue_object(struct fscache_object *); * object-list.c */ #ifdef CONFIG_FSCACHE_OBJECT_LIST -extern const struct file_operations fscache_objlist_fops; +extern const struct proc_ops fscache_objlist_proc_ops; extern void fscache_objlist_add(struct fscache_object *); extern void fscache_objlist_remove(struct fscache_object *); diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index 72ebfe578f40..e106a1a1600d 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -7,6 +7,7 @@ #define FSCACHE_DEBUG_LEVEL COOKIE #include <linux/module.h> +#include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/key.h> @@ -405,9 +406,9 @@ static int fscache_objlist_release(struct inode *inode, struct file *file) return seq_release(inode, file); } -const struct file_operations fscache_objlist_fops = { - .open = fscache_objlist_open, - .read = seq_read, - .llseek = seq_lseek, - .release = fscache_objlist_release, +const struct proc_ops fscache_objlist_proc_ops = { + .proc_open = fscache_objlist_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = fscache_objlist_release, }; diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c index 5523446e2952..90a7bc22f7e1 100644 --- a/fs/fscache/proc.c +++ b/fs/fscache/proc.c @@ -35,7 +35,7 @@ int __init fscache_proc_init(void) #ifdef CONFIG_FSCACHE_OBJECT_LIST if (!proc_create("fs/fscache/objects", S_IFREG | 0444, NULL, - &fscache_objlist_fops)) + &fscache_objlist_proc_ops)) goto error_objects; #endif diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 9c6df721321a..ba83b49ce18c 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -183,14 +183,12 @@ static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc struct inode *inode = page->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - int ret; if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) goto out; if (PageChecked(page) || current->journal_info) goto out_ignore; - ret = __gfs2_jdata_writepage(page, wbc); - return ret; + return __gfs2_jdata_writepage(page, wbc); out_ignore: redirty_page_for_writepage(wbc, page); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index eb9c0578978f..c8b62577e2f2 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -73,9 +73,6 @@ #include "bmap.h" #include "util.h" -#define IS_LEAF 1 /* Hashed (leaf) directory */ -#define IS_DINODE 2 /* Linear (stuffed dinode block) directory */ - #define MAX_RA_BLOCKS 32 /* max read-ahead blocks */ #define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index b7123de7c180..d0eceaff3cea 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -826,7 +826,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb)); if (glops->go_flags & GLOF_LVB) { - gl->gl_lksb.sb_lvbptr = kzalloc(GFS2_MIN_LVB_SIZE, GFP_NOFS); + gl->gl_lksb.sb_lvbptr = kzalloc(GDLM_LVB_SIZE, GFP_NOFS); if (!gl->gl_lksb.sb_lvbptr) { kmem_cache_free(cachep, gl); return -ENOMEM; diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 4ede1f18de85..061d22e1ceb6 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -95,7 +95,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) /* A shortened, inline version of gfs2_trans_begin() * tr->alloced is not set since the transaction structure is * on the stack */ - tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64)); + tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes); tr.tr_ip = _RET_IP_; if (gfs2_log_reserve(sdp, tr.tr_reserved) < 0) return; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 5f89c515f5bb..9fd88ed18807 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -387,8 +387,6 @@ struct gfs2_glock { struct rhash_head gl_node; }; -#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */ - enum { GIF_INVALID = 0, GIF_QD_LOCKED = 1, @@ -505,6 +503,7 @@ struct gfs2_trans { unsigned int tr_num_buf_rm; unsigned int tr_num_databuf_rm; unsigned int tr_num_revoke; + unsigned int tr_num_revoke_rm; struct list_head tr_list; struct list_head tr_databuf; @@ -703,6 +702,7 @@ struct gfs2_sbd { u32 sd_fsb2bb_shift; u32 sd_diptrs; /* Number of pointers in a dinode */ u32 sd_inptrs; /* Number of pointers in a indirect block */ + u32 sd_ldptrs; /* Number of pointers in a log descriptor block */ u32 sd_jbsize; /* Size of a journaled data block */ u32 sd_hash_bsize; /* sizeof(exhash block) */ u32 sd_hash_bsize_shift; @@ -803,7 +803,7 @@ struct gfs2_sbd { struct gfs2_trans *sd_log_tr; unsigned int sd_log_blks_reserved; - int sd_log_commited_revoke; + int sd_log_committed_revoke; atomic_t sd_log_pinned; unsigned int sd_log_num_revoke; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index dafef10b91f1..2716d56ed0a0 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -136,7 +136,6 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, if (inode->i_state & I_NEW) { struct gfs2_sbd *sdp = GFS2_SB(inode); - ip->i_no_formal_ino = no_formal_ino; error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); if (unlikely(error)) @@ -175,21 +174,22 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, gfs2_glock_put(io_gl); io_gl = NULL; + /* Lowest possible timestamp; will be overwritten in gfs2_dinode_in. */ + inode->i_atime.tv_sec = 1LL << (8 * sizeof(inode->i_atime.tv_sec) - 1); + inode->i_atime.tv_nsec = 0; + if (type == DT_UNKNOWN) { /* Inode glock must be locked already */ error = gfs2_inode_refresh(GFS2_I(inode)); if (error) goto fail_refresh; } else { + ip->i_no_formal_ino = no_formal_ino; inode->i_mode = DT2IF(type); } gfs2_set_iop(inode); - /* Lowest possible timestamp; will be overwritten in gfs2_dinode_in. */ - inode->i_atime.tv_sec = 1LL << (8 * sizeof(inode->i_atime.tv_sec) - 1); - inode->i_atime.tv_nsec = 0; - unlock_new_inode(inode); } diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index eb3f2e7b8085..00a2e721a374 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -37,7 +37,6 @@ static void gfs2_log_shutdown(struct gfs2_sbd *sdp); * gfs2_struct2blk - compute stuff * @sdp: the filesystem * @nstruct: the number of structures - * @ssize: the size of the structures * * Compute the number of log descriptor blocks needed to hold a certain number * of structures of a certain size. @@ -45,18 +44,16 @@ static void gfs2_log_shutdown(struct gfs2_sbd *sdp); * Returns: the number of blocks needed (minimum is always 1) */ -unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, - unsigned int ssize) +unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct) { unsigned int blks; unsigned int first, second; blks = 1; - first = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / ssize; + first = sdp->sd_ldptrs; if (nstruct > first) { - second = (sdp->sd_sb.sb_bsize - - sizeof(struct gfs2_meta_header)) / ssize; + second = sdp->sd_inptrs; blks += DIV_ROUND_UP(nstruct - first, second); } @@ -472,9 +469,8 @@ static unsigned int calc_reserved(struct gfs2_sbd *sdp) reserved += DIV_ROUND_UP(dbuf, databuf_limit(sdp)); } - if (sdp->sd_log_commited_revoke > 0) - reserved += gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke, - sizeof(u64)); + if (sdp->sd_log_committed_revoke > 0) + reserved += gfs2_struct2blk(sdp, sdp->sd_log_committed_revoke); /* One for the overall header */ if (reserved) reserved++; @@ -829,7 +825,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) if (unlikely(state == SFS_FROZEN)) gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); gfs2_assert_withdraw(sdp, - sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke); + sdp->sd_log_num_revoke == sdp->sd_log_committed_revoke); gfs2_ordered_write(sdp); lops_before_commit(sdp, tr); @@ -848,7 +844,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) gfs2_log_lock(sdp); sdp->sd_log_head = sdp->sd_log_flush_head; sdp->sd_log_blks_reserved = 0; - sdp->sd_log_commited_revoke = 0; + sdp->sd_log_committed_revoke = 0; spin_lock(&sdp->sd_ail_lock); if (tr && !list_empty(&tr->tr_ail1_list)) { @@ -899,6 +895,7 @@ static void gfs2_merge_trans(struct gfs2_trans *old, struct gfs2_trans *new) old->tr_num_buf_rm += new->tr_num_buf_rm; old->tr_num_databuf_rm += new->tr_num_databuf_rm; old->tr_num_revoke += new->tr_num_revoke; + old->tr_num_revoke_rm += new->tr_num_revoke_rm; list_splice_tail_init(&new->tr_databuf, &old->tr_databuf); list_splice_tail_init(&new->tr_buf, &old->tr_buf); @@ -920,7 +917,7 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) set_bit(TR_ATTACHED, &tr->tr_flags); } - sdp->sd_log_commited_revoke += tr->tr_num_revoke; + sdp->sd_log_committed_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; reserved = calc_reserved(sdp); maxres = sdp->sd_log_blks_reserved + tr->tr_reserved; gfs2_assert_withdraw(sdp, maxres >= reserved); diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 2ff163a8dce1..c0a65e5a126b 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -60,9 +60,9 @@ static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) spin_unlock(&sdp->sd_ordered_lock); } } + extern void gfs2_ordered_del_inode(struct gfs2_inode *ip); -extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, - unsigned int ssize); +extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct); extern void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 55fed7daf2b1..d9431724b788 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -259,7 +259,7 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno, struct super_block *sb = sdp->sd_vfs; struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); - bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9); + bio->bi_iter.bi_sector = blkno << (sb->s_blocksize_bits - 9); bio_set_dev(bio, sb->s_bdev); bio->bi_end_io = end_io; bio->bi_private = sdp; @@ -472,6 +472,20 @@ static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index, put_page(page); /* Once more for find_or_create_page */ } +static struct bio *gfs2_chain_bio(struct bio *prev, unsigned int nr_iovecs) +{ + struct bio *new; + + new = bio_alloc(GFP_NOIO, nr_iovecs); + bio_copy_dev(new, prev); + new->bi_iter.bi_sector = bio_end_sector(prev); + new->bi_opf = prev->bi_opf; + new->bi_write_hint = prev->bi_write_hint; + bio_chain(new, prev); + submit_bio(prev); + return new; +} + /** * gfs2_find_jhead - find the head of a log * @jd: The journal descriptor @@ -488,15 +502,15 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct address_space *mapping = jd->jd_inode->i_mapping; unsigned int block = 0, blocks_submitted = 0, blocks_read = 0; - unsigned int bsize = sdp->sd_sb.sb_bsize; + unsigned int bsize = sdp->sd_sb.sb_bsize, off; unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift; unsigned int shift = PAGE_SHIFT - bsize_shift; - unsigned int readhead_blocks = BIO_MAX_PAGES << shift; + unsigned int readahead_blocks = BIO_MAX_PAGES << shift; struct gfs2_journal_extent *je; int sz, ret = 0; struct bio *bio = NULL; struct page *page = NULL; - bool done = false; + bool bio_chained = false, done = false; errseq_t since; memset(head, 0, sizeof(*head)); @@ -505,9 +519,9 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, since = filemap_sample_wb_err(mapping); list_for_each_entry(je, &jd->extent_list, list) { - for (; block < je->lblock + je->blocks; block++) { - u64 dblock; + u64 dblock = je->dblock; + for (; block < je->lblock + je->blocks; block++, dblock++) { if (!page) { page = find_or_create_page(mapping, block >> shift, GFP_NOFS); @@ -516,35 +530,41 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, done = true; goto out; } + off = 0; } - if (bio) { - unsigned int off; - - off = (block << bsize_shift) & ~PAGE_MASK; + if (!bio || (bio_chained && !off)) { + /* start new bio */ + } else { sz = bio_add_page(bio, page, bsize, off); - if (sz == bsize) { /* block added */ - if (off + bsize == PAGE_SIZE) { - page = NULL; - goto page_added; - } - continue; + if (sz == bsize) + goto block_added; + if (off) { + unsigned int blocks = + (PAGE_SIZE - off) >> bsize_shift; + + bio = gfs2_chain_bio(bio, blocks); + bio_chained = true; + goto add_block_to_new_bio; } + } + + if (bio) { blocks_submitted = block + 1; submit_bio(bio); - bio = NULL; } - dblock = je->dblock + (block - je->lblock); bio = gfs2_log_alloc_bio(sdp, dblock, gfs2_end_log_read); bio->bi_opf = REQ_OP_READ; - sz = bio_add_page(bio, page, bsize, 0); - gfs2_assert_warn(sdp, sz == bsize); - if (bsize == PAGE_SIZE) + bio_chained = false; +add_block_to_new_bio: + sz = bio_add_page(bio, page, bsize, off); + BUG_ON(sz != bsize); +block_added: + off += bsize; + if (off == PAGE_SIZE) page = NULL; - -page_added: - if (blocks_submitted < blocks_read + readhead_blocks) { + if (blocks_submitted < blocks_read + readahead_blocks) { /* Keep at least one bio in flight */ continue; } @@ -846,7 +866,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) if (!sdp->sd_log_num_revoke) return; - length = gfs2_struct2blk(sdp, sdp->sd_log_num_revoke, sizeof(u64)); + length = gfs2_struct2blk(sdp, sdp->sd_log_num_revoke); page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_REVOKE, length, sdp->sd_log_num_revoke); offset = sizeof(struct gfs2_log_descriptor); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index e8b7b0ce8404..b3e904bcc02c 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -298,6 +298,8 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent) sizeof(struct gfs2_dinode)) / sizeof(u64); sdp->sd_inptrs = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64); + sdp->sd_ldptrs = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_log_descriptor)) / sizeof(u64); sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header); sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2; sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 2466bb44a23c..e7bf91ec231c 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -36,16 +36,6 @@ #define BFITNOENT ((u32)~0) #define NO_BLOCK ((u64)~0) -#if BITS_PER_LONG == 32 -#define LBITMASK (0x55555555UL) -#define LBITSKIP55 (0x55555555UL) -#define LBITSKIP00 (0x00000000UL) -#else -#define LBITMASK (0x5555555555555555UL) -#define LBITSKIP55 (0x5555555555555555UL) -#define LBITSKIP00 (0x0000000000000000UL) -#endif - /* * These routines are used by the resource group routines (rgrp.c) * to keep track of block allocation. Each block is represented by two diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 9d4227330de4..a685637a5b55 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -49,8 +49,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, if (blocks) tr->tr_reserved += 6 + blocks; if (revokes) - tr->tr_reserved += gfs2_struct2blk(sdp, revokes, - sizeof(u64)); + tr->tr_reserved += gfs2_struct2blk(sdp, revokes); INIT_LIST_HEAD(&tr->tr_databuf); INIT_LIST_HEAD(&tr->tr_buf); @@ -77,10 +76,10 @@ static void gfs2_print_trans(struct gfs2_sbd *sdp, const struct gfs2_trans *tr) fs_warn(sdp, "blocks=%u revokes=%u reserved=%u touched=%u\n", tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, test_bit(TR_TOUCHED, &tr->tr_flags)); - fs_warn(sdp, "Buf %u/%u Databuf %u/%u Revoke %u\n", + fs_warn(sdp, "Buf %u/%u Databuf %u/%u Revoke %u/%u\n", tr->tr_num_buf_new, tr->tr_num_buf_rm, tr->tr_num_databuf_new, tr->tr_num_databuf_rm, - tr->tr_num_revoke); + tr->tr_num_revoke, tr->tr_num_revoke_rm); } void gfs2_trans_end(struct gfs2_sbd *sdp) @@ -265,7 +264,7 @@ void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) if (bd->bd_gl) gfs2_glock_remove_revoke(bd->bd_gl); kmem_cache_free(gfs2_bufdata_cachep, bd); - tr->tr_num_revoke--; + tr->tr_num_revoke_rm++; if (--n == 0) break; } diff --git a/fs/inode.c b/fs/inode.c index ea15c6d9f274..c7418b0b4168 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1683,12 +1683,9 @@ EXPORT_SYMBOL(generic_update_time); */ static int update_time(struct inode *inode, struct timespec64 *time, int flags) { - int (*update_time)(struct inode *, struct timespec64 *, int); - - update_time = inode->i_op->update_time ? inode->i_op->update_time : - generic_update_time; - - return update_time(inode, time, flags); + if (inode->i_op->update_time) + return inode->i_op->update_time(inode, time, flags); + return generic_update_time(inode, time, flags); } /** @@ -2154,30 +2151,6 @@ void inode_nohighmem(struct inode *inode) EXPORT_SYMBOL(inode_nohighmem); /** - * timespec64_trunc - Truncate timespec64 to a granularity - * @t: Timespec64 - * @gran: Granularity in ns. - * - * Truncate a timespec64 to a granularity. Always rounds down. gran must - * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns). - */ -struct timespec64 timespec64_trunc(struct timespec64 t, unsigned gran) -{ - /* Avoid division in the common cases 1 ns and 1 s. */ - if (gran == 1) { - /* nothing */ - } else if (gran == NSEC_PER_SEC) { - t.tv_nsec = 0; - } else if (gran > 1 && gran < NSEC_PER_SEC) { - t.tv_nsec -= t.tv_nsec % gran; - } else { - WARN(1, "illegal file time granularity: %u", gran); - } - return t; -} -EXPORT_SYMBOL(timespec64_trunc); - -/** * timestamp_truncate - Truncate timespec to a granularity * @t: Timespec * @inode: inode being updated diff --git a/fs/internal.h b/fs/internal.h index e3fa69544b66..f3f280b952a3 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -124,6 +124,8 @@ extern struct file *do_filp_open(int dfd, struct filename *pathname, const struct open_flags *op); extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, const char *, const struct open_flags *); +extern struct open_how build_open_how(int flags, umode_t mode); +extern int build_open_flags(const struct open_how *how, struct open_flags *op); long do_sys_ftruncate(unsigned int fd, loff_t length, int small); long do_faccessat(int dfd, const char __user *filename, int mode); @@ -180,11 +182,11 @@ extern void mnt_pin_kill(struct mount *m); */ extern const struct dentry_operations ns_dentry_operations; -/* - * fs/ioctl.c - */ -extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd, - unsigned long arg); - /* direct-io.c: */ int sb_init_dio_done_wq(struct super_block *sb); + +/* + * fs/stat.c: + */ +unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, int flags); +int cp_statx(const struct kstat *stat, struct statx __user *buffer); diff --git a/fs/io-wq.c b/fs/io-wq.c index 5147d2213b01..cb60a42b9fdf 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -56,7 +56,8 @@ struct io_worker { struct rcu_head rcu; struct mm_struct *mm; - const struct cred *creds; + const struct cred *cur_creds; + const struct cred *saved_creds; struct files_struct *restore_files; }; @@ -109,10 +110,10 @@ struct io_wq { struct task_struct *manager; struct user_struct *user; - const struct cred *creds; - struct mm_struct *mm; refcount_t refs; struct completion done; + + refcount_t use_refs; }; static bool io_worker_get(struct io_worker *worker) @@ -135,9 +136,9 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) { bool dropped_lock = false; - if (worker->creds) { - revert_creds(worker->creds); - worker->creds = NULL; + if (worker->saved_creds) { + revert_creds(worker->saved_creds); + worker->cur_creds = worker->saved_creds = NULL; } if (current->files != worker->restore_files) { @@ -396,6 +397,43 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash) return NULL; } +static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work) +{ + if (worker->mm) { + unuse_mm(worker->mm); + mmput(worker->mm); + worker->mm = NULL; + } + if (!work->mm) { + set_fs(KERNEL_DS); + return; + } + if (mmget_not_zero(work->mm)) { + use_mm(work->mm); + if (!worker->mm) + set_fs(USER_DS); + worker->mm = work->mm; + /* hang on to this mm */ + work->mm = NULL; + return; + } + + /* failed grabbing mm, ensure work gets cancelled */ + work->flags |= IO_WQ_WORK_CANCEL; +} + +static void io_wq_switch_creds(struct io_worker *worker, + struct io_wq_work *work) +{ + const struct cred *old_creds = override_creds(work->creds); + + worker->cur_creds = work->creds; + if (worker->saved_creds) + put_cred(old_creds); /* creds set by previous switch */ + else + worker->saved_creds = old_creds; +} + static void io_worker_handle_work(struct io_worker *worker) __releases(wqe->lock) { @@ -438,24 +476,19 @@ next: if (work->flags & IO_WQ_WORK_CB) work->func(&work); - if ((work->flags & IO_WQ_WORK_NEEDS_FILES) && - current->files != work->files) { + if (work->files && current->files != work->files) { task_lock(current); current->files = work->files; task_unlock(current); } - if ((work->flags & IO_WQ_WORK_NEEDS_USER) && !worker->mm && - wq->mm) { - if (mmget_not_zero(wq->mm)) { - use_mm(wq->mm); - set_fs(USER_DS); - worker->mm = wq->mm; - } else { - work->flags |= IO_WQ_WORK_CANCEL; - } - } - if (!worker->creds) - worker->creds = override_creds(wq->creds); + if (work->mm != worker->mm) + io_wq_switch_mm(worker, work); + if (worker->cur_creds != work->creds) + io_wq_switch_creds(worker, work); + /* + * OK to set IO_WQ_WORK_CANCEL even for uncancellable work, + * the worker function will do the right thing. + */ if (test_bit(IO_WQ_BIT_CANCEL, &wq->state)) work->flags |= IO_WQ_WORK_CANCEL; if (worker->mm) @@ -720,6 +753,7 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct, static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { struct io_wqe_acct *acct = io_work_get_acct(wqe, work); + int work_flags; unsigned long flags; /* @@ -734,12 +768,14 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) return; } + work_flags = work->flags; spin_lock_irqsave(&wqe->lock, flags); wq_list_add_tail(&work->list, &wqe->work_list); wqe->flags &= ~IO_WQE_FLAG_STALLED; spin_unlock_irqrestore(&wqe->lock, flags); - if (!atomic_read(&acct->nr_running)) + if ((work_flags & IO_WQ_WORK_CONCURRENT) || + !atomic_read(&acct->nr_running)) io_wqe_wake_worker(wqe, acct); } @@ -828,6 +864,7 @@ static bool io_work_cancel(struct io_worker *worker, void *cancel_data) */ spin_lock_irqsave(&worker->lock, flags); if (worker->cur_work && + !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) && data->cancel(worker->cur_work, data->caller_data)) { send_sig(SIGINT, worker->task, 1); ret = true; @@ -902,7 +939,8 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) return false; spin_lock_irqsave(&worker->lock, flags); - if (worker->cur_work == work) { + if (worker->cur_work == work && + !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL)) { send_sig(SIGINT, worker->task, 1); ret = true; } @@ -1026,7 +1064,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) /* caller must already hold a reference to this */ wq->user = data->user; - wq->creds = data->creds; for_each_node(node) { struct io_wqe *wqe; @@ -1053,9 +1090,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) init_completion(&wq->done); - /* caller must have already done mmgrab() on this mm */ - wq->mm = data->mm; - wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager"); if (!IS_ERR(wq->manager)) { wake_up_process(wq->manager); @@ -1064,6 +1098,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) ret = -ENOMEM; goto err; } + refcount_set(&wq->use_refs, 1); reinit_completion(&wq->done); return wq; } @@ -1078,13 +1113,21 @@ err: return ERR_PTR(ret); } +bool io_wq_get(struct io_wq *wq, struct io_wq_data *data) +{ + if (data->get_work != wq->get_work || data->put_work != wq->put_work) + return false; + + return refcount_inc_not_zero(&wq->use_refs); +} + static bool io_wq_worker_wake(struct io_worker *worker, void *data) { wake_up_process(worker->task); return false; } -void io_wq_destroy(struct io_wq *wq) +static void __io_wq_destroy(struct io_wq *wq) { int node; @@ -1104,3 +1147,9 @@ void io_wq_destroy(struct io_wq *wq) kfree(wq->wqes); kfree(wq); } + +void io_wq_destroy(struct io_wq *wq) +{ + if (refcount_dec_and_test(&wq->use_refs)) + __io_wq_destroy(wq); +} diff --git a/fs/io-wq.h b/fs/io-wq.h index 3f5e356de980..50b3378febf2 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -7,11 +7,11 @@ enum { IO_WQ_WORK_CANCEL = 1, IO_WQ_WORK_HAS_MM = 2, IO_WQ_WORK_HASHED = 4, - IO_WQ_WORK_NEEDS_USER = 8, - IO_WQ_WORK_NEEDS_FILES = 16, IO_WQ_WORK_UNBOUND = 32, IO_WQ_WORK_INTERNAL = 64, IO_WQ_WORK_CB = 128, + IO_WQ_WORK_NO_CANCEL = 256, + IO_WQ_WORK_CONCURRENT = 512, IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */ }; @@ -72,6 +72,8 @@ struct io_wq_work { }; void (*func)(struct io_wq_work **); struct files_struct *files; + struct mm_struct *mm; + const struct cred *creds; unsigned flags; }; @@ -81,21 +83,22 @@ struct io_wq_work { (work)->func = _func; \ (work)->flags = 0; \ (work)->files = NULL; \ + (work)->mm = NULL; \ + (work)->creds = NULL; \ } while (0) \ typedef void (get_work_fn)(struct io_wq_work *); typedef void (put_work_fn)(struct io_wq_work *); struct io_wq_data { - struct mm_struct *mm; struct user_struct *user; - const struct cred *creds; get_work_fn *get_work; put_work_fn *put_work; }; struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data); +bool io_wq_get(struct io_wq *wq, struct io_wq_data *data); void io_wq_destroy(struct io_wq *wq); void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); diff --git a/fs/io_uring.c b/fs/io_uring.c index e54556b0fcc6..1806afddfea5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -46,6 +46,7 @@ #include <linux/compat.h> #include <linux/refcount.h> #include <linux/uio.h> +#include <linux/bits.h> #include <linux/sched/signal.h> #include <linux/fs.h> @@ -70,6 +71,10 @@ #include <linux/sizes.h> #include <linux/hugetlb.h> #include <linux/highmem.h> +#include <linux/namei.h> +#include <linux/fsnotify.h> +#include <linux/fadvise.h> +#include <linux/eventpoll.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> @@ -177,6 +182,21 @@ struct fixed_file_table { struct file **files; }; +enum { + FFD_F_ATOMIC, +}; + +struct fixed_file_data { + struct fixed_file_table *table; + struct io_ring_ctx *ctx; + + struct percpu_ref refs; + struct llist_head put_llist; + unsigned long state; + struct work_struct ref_work; + struct completion done; +}; + struct io_ring_ctx { struct { struct percpu_ref refs; @@ -184,10 +204,11 @@ struct io_ring_ctx { struct { unsigned int flags; - bool compat; - bool account_mem; - bool cq_overflow_flushed; - bool drain_next; + int compat: 1; + int account_mem: 1; + int cq_overflow_flushed: 1; + int drain_next: 1; + int eventfd_async: 1; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -207,13 +228,14 @@ struct io_ring_ctx { unsigned sq_thread_idle; unsigned cached_sq_dropped; atomic_t cached_cq_overflow; - struct io_uring_sqe *sq_sqes; + unsigned long sq_check_overflow; struct list_head defer_list; struct list_head timeout_list; struct list_head cq_overflow_list; wait_queue_head_t inflight_wait; + struct io_uring_sqe *sq_sqes; } ____cacheline_aligned_in_smp; struct io_rings *rings; @@ -229,8 +251,10 @@ struct io_ring_ctx { * readers must ensure that ->refs is alive as long as the file* is * used. Only updated through io_uring_register(2). */ - struct fixed_file_table *file_table; + struct fixed_file_data *file_data; unsigned nr_user_files; + int ring_fd; + struct file *ring_file; /* if used, fixed mapped user buffers */ unsigned nr_user_bufs; @@ -250,11 +274,14 @@ struct io_ring_ctx { struct socket *ring_sock; #endif + struct idr personality_idr; + struct { unsigned cached_cq_tail; unsigned cq_entries; unsigned cq_mask; atomic_t cq_timeouts; + unsigned long cq_check_overflow; struct wait_queue_head cq_wait; struct fasync_struct *cq_fasync; struct eventfd_ctx *cq_ev_fd; @@ -267,7 +294,8 @@ struct io_ring_ctx { struct { spinlock_t completion_lock; - bool poll_multi_file; + struct llist_head poll_llist; + /* * ->poll_list is protected by the ctx->uring_lock for * io_uring instances that don't use IORING_SETUP_SQPOLL. @@ -277,6 +305,7 @@ struct io_ring_ctx { struct list_head poll_list; struct hlist_head *cancel_hash; unsigned cancel_hash_bits; + bool poll_multi_file; spinlock_t inflight_lock; struct list_head inflight_list; @@ -299,6 +328,12 @@ struct io_poll_iocb { struct wait_queue_entry wait; }; +struct io_close { + struct file *file; + struct file *put_file; + int fd; +}; + struct io_timeout_data { struct io_kiocb *req; struct hrtimer timer; @@ -319,6 +354,7 @@ struct io_sync { loff_t len; loff_t off; int flags; + int mode; }; struct io_cancel { @@ -348,8 +384,52 @@ struct io_connect { struct io_sr_msg { struct file *file; - struct user_msghdr __user *msg; + union { + struct user_msghdr __user *msg; + void __user *buf; + }; int msg_flags; + size_t len; +}; + +struct io_open { + struct file *file; + int dfd; + union { + unsigned mask; + }; + struct filename *filename; + struct statx __user *buffer; + struct open_how how; +}; + +struct io_files_update { + struct file *file; + u64 arg; + u32 nr_args; + u32 offset; +}; + +struct io_fadvise { + struct file *file; + u64 offset; + u32 len; + u32 advice; +}; + +struct io_madvise { + struct file *file; + u64 addr; + u32 len; + u32 advice; +}; + +struct io_epoll { + struct file *file; + int epfd; + int op; + int fd; + struct epoll_event event; }; struct io_async_connect { @@ -370,15 +450,79 @@ struct io_async_rw { ssize_t size; }; +struct io_async_open { + struct filename *filename; +}; + struct io_async_ctx { union { struct io_async_rw rw; struct io_async_msghdr msg; struct io_async_connect connect; struct io_timeout_data timeout; + struct io_async_open open; }; }; +enum { + REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, + REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, + REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, + REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, + REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, + + REQ_F_LINK_NEXT_BIT, + REQ_F_FAIL_LINK_BIT, + REQ_F_INFLIGHT_BIT, + REQ_F_CUR_POS_BIT, + REQ_F_NOWAIT_BIT, + REQ_F_IOPOLL_COMPLETED_BIT, + REQ_F_LINK_TIMEOUT_BIT, + REQ_F_TIMEOUT_BIT, + REQ_F_ISREG_BIT, + REQ_F_MUST_PUNT_BIT, + REQ_F_TIMEOUT_NOSEQ_BIT, + REQ_F_COMP_LOCKED_BIT, +}; + +enum { + /* ctx owns file */ + REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT), + /* drain existing IO first */ + REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT), + /* linked sqes */ + REQ_F_LINK = BIT(REQ_F_LINK_BIT), + /* doesn't sever on completion < 0 */ + REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), + /* IOSQE_ASYNC */ + REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), + + /* already grabbed next link */ + REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT), + /* fail rest of links */ + REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT), + /* on inflight list */ + REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), + /* read/write uses file position */ + REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), + /* must not punt to workers */ + REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), + /* polled IO has completed */ + REQ_F_IOPOLL_COMPLETED = BIT(REQ_F_IOPOLL_COMPLETED_BIT), + /* has linked timeout */ + REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), + /* timeout request */ + REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT), + /* regular file */ + REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), + /* must be punted even for NONBLOCK */ + REQ_F_MUST_PUNT = BIT(REQ_F_MUST_PUNT_BIT), + /* no timeout sequence */ + REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT), + /* completion under lock */ + REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT), +}; + /* * NOTE! Each of the iocb union members has the file pointer * as the first entry in their struct definition. So you can @@ -396,11 +540,19 @@ struct io_kiocb { struct io_timeout timeout; struct io_connect connect; struct io_sr_msg sr_msg; + struct io_open open; + struct io_close close; + struct io_files_update files_update; + struct io_fadvise fadvise; + struct io_madvise madvise; + struct io_epoll epoll; }; struct io_async_ctx *io; - struct file *ring_file; - int ring_fd; + /* + * llist_node is only used for poll deferred completions + */ + struct llist_node llist_node; bool has_user; bool in_async; bool needs_fixed_file; @@ -414,23 +566,6 @@ struct io_kiocb { struct list_head link_list; unsigned int flags; refcount_t refs; -#define REQ_F_NOWAIT 1 /* must not punt to workers */ -#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ -#define REQ_F_FIXED_FILE 4 /* ctx owns file */ -#define REQ_F_LINK_NEXT 8 /* already grabbed next link */ -#define REQ_F_IO_DRAIN 16 /* drain existing IO first */ -#define REQ_F_IO_DRAINED 32 /* drain done */ -#define REQ_F_LINK 64 /* linked sqes */ -#define REQ_F_LINK_TIMEOUT 128 /* has linked timeout */ -#define REQ_F_FAIL_LINK 256 /* fail rest of links */ -#define REQ_F_DRAIN_LINK 512 /* link should be fully drained */ -#define REQ_F_TIMEOUT 1024 /* timeout request */ -#define REQ_F_ISREG 2048 /* regular file */ -#define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */ -#define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */ -#define REQ_F_INFLIGHT 16384 /* on inflight list */ -#define REQ_F_COMP_LOCKED 32768 /* completion under lock */ -#define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */ u64 user_data; u32 result; u32 sequence; @@ -463,14 +598,162 @@ struct io_submit_state { unsigned int ios_left; }; +struct io_op_def { + /* needs req->io allocated for deferral/async */ + unsigned async_ctx : 1; + /* needs current->mm setup, does mm access */ + unsigned needs_mm : 1; + /* needs req->file assigned */ + unsigned needs_file : 1; + /* needs req->file assigned IFF fd is >= 0 */ + unsigned fd_non_neg : 1; + /* hash wq insertion if file is a regular file */ + unsigned hash_reg_file : 1; + /* unbound wq insertion if file is a non-regular file */ + unsigned unbound_nonreg_file : 1; + /* opcode is not supported by this kernel */ + unsigned not_supported : 1; + /* needs file table */ + unsigned file_table : 1; +}; + +static const struct io_op_def io_op_defs[] = { + [IORING_OP_NOP] = {}, + [IORING_OP_READV] = { + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_WRITEV] = { + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_FSYNC] = { + .needs_file = 1, + }, + [IORING_OP_READ_FIXED] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_WRITE_FIXED] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_POLL_ADD] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_POLL_REMOVE] = {}, + [IORING_OP_SYNC_FILE_RANGE] = { + .needs_file = 1, + }, + [IORING_OP_SENDMSG] = { + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_RECVMSG] = { + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_TIMEOUT] = { + .async_ctx = 1, + .needs_mm = 1, + }, + [IORING_OP_TIMEOUT_REMOVE] = {}, + [IORING_OP_ACCEPT] = { + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + .file_table = 1, + }, + [IORING_OP_ASYNC_CANCEL] = {}, + [IORING_OP_LINK_TIMEOUT] = { + .async_ctx = 1, + .needs_mm = 1, + }, + [IORING_OP_CONNECT] = { + .async_ctx = 1, + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_FALLOCATE] = { + .needs_file = 1, + }, + [IORING_OP_OPENAT] = { + .needs_file = 1, + .fd_non_neg = 1, + .file_table = 1, + }, + [IORING_OP_CLOSE] = { + .needs_file = 1, + .file_table = 1, + }, + [IORING_OP_FILES_UPDATE] = { + .needs_mm = 1, + .file_table = 1, + }, + [IORING_OP_STATX] = { + .needs_mm = 1, + .needs_file = 1, + .fd_non_neg = 1, + }, + [IORING_OP_READ] = { + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_WRITE] = { + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_FADVISE] = { + .needs_file = 1, + }, + [IORING_OP_MADVISE] = { + .needs_mm = 1, + }, + [IORING_OP_SEND] = { + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_RECV] = { + .needs_mm = 1, + .needs_file = 1, + .unbound_nonreg_file = 1, + }, + [IORING_OP_OPENAT2] = { + .needs_file = 1, + .fd_non_neg = 1, + .file_table = 1, + }, + [IORING_OP_EPOLL_CTL] = { + .unbound_nonreg_file = 1, + .file_table = 1, + }, +}; + static void io_wq_submit_work(struct io_wq_work **workptr); static void io_cqring_fill_event(struct io_kiocb *req, long res); -static void __io_free_req(struct io_kiocb *req); static void io_put_req(struct io_kiocb *req); -static void io_double_put_req(struct io_kiocb *req); static void __io_double_put_req(struct io_kiocb *req); static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); +static int __io_sqe_files_update(struct io_ring_ctx *ctx, + struct io_uring_files_update *ip, + unsigned nr_args); +static int io_grab_files(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -537,9 +820,11 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->cq_overflow_list); init_completion(&ctx->completions[0]); init_completion(&ctx->completions[1]); + idr_init(&ctx->personality_idr); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); + init_llist_head(&ctx->poll_llist); INIT_LIST_HEAD(&ctx->poll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); @@ -566,7 +851,7 @@ static inline bool __req_need_defer(struct io_kiocb *req) static inline bool req_need_defer(struct io_kiocb *req) { - if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) == REQ_F_IO_DRAIN) + if (unlikely(req->flags & REQ_F_IO_DRAIN)) return __req_need_defer(req); return false; @@ -606,53 +891,53 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; - if (ctx->cached_cq_tail != READ_ONCE(rings->cq.tail)) { - /* order cqe stores with ring update */ - smp_store_release(&rings->cq.tail, ctx->cached_cq_tail); + /* order cqe stores with ring update */ + smp_store_release(&rings->cq.tail, ctx->cached_cq_tail); - if (wq_has_sleeper(&ctx->cq_wait)) { - wake_up_interruptible(&ctx->cq_wait); - kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); - } + if (wq_has_sleeper(&ctx->cq_wait)) { + wake_up_interruptible(&ctx->cq_wait); + kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); + } +} + +static inline void io_req_work_grab_env(struct io_kiocb *req, + const struct io_op_def *def) +{ + if (!req->work.mm && def->needs_mm) { + mmgrab(current->mm); + req->work.mm = current->mm; } + if (!req->work.creds) + req->work.creds = get_current_cred(); } -static inline bool io_req_needs_user(struct io_kiocb *req) +static inline void io_req_work_drop_env(struct io_kiocb *req) { - return !(req->opcode == IORING_OP_READ_FIXED || - req->opcode == IORING_OP_WRITE_FIXED); + if (req->work.mm) { + mmdrop(req->work.mm); + req->work.mm = NULL; + } + if (req->work.creds) { + put_cred(req->work.creds); + req->work.creds = NULL; + } } static inline bool io_prep_async_work(struct io_kiocb *req, struct io_kiocb **link) { + const struct io_op_def *def = &io_op_defs[req->opcode]; bool do_hashed = false; - switch (req->opcode) { - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - /* only regular files should be hashed for writes */ - if (req->flags & REQ_F_ISREG) + if (req->flags & REQ_F_ISREG) { + if (def->hash_reg_file) do_hashed = true; - /* fall-through */ - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_SENDMSG: - case IORING_OP_RECVMSG: - case IORING_OP_ACCEPT: - case IORING_OP_POLL_ADD: - case IORING_OP_CONNECT: - /* - * We know REQ_F_ISREG is not set on some of these - * opcodes, but this enables us to keep the check in - * just one place. - */ - if (!(req->flags & REQ_F_ISREG)) + } else { + if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; - break; } - if (io_req_needs_user(req)) - req->work.flags |= IO_WQ_WORK_NEEDS_USER; + + io_req_work_grab_env(req, def); *link = io_prep_linked_timeout(req); return do_hashed; @@ -711,10 +996,8 @@ static void io_commit_cqring(struct io_ring_ctx *ctx) __io_commit_cqring(ctx); - while ((req = io_get_deferred_req(ctx)) != NULL) { - req->flags |= REQ_F_IO_DRAINED; + while ((req = io_get_deferred_req(ctx)) != NULL) io_queue_async_work(req); - } } static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) @@ -735,13 +1018,20 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) return &rings->cqes[tail & ctx->cq_mask]; } +static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) +{ + if (!ctx->eventfd_async) + return true; + return io_wq_current_is_worker() || in_interrupt(); +} + static void io_cqring_ev_posted(struct io_ring_ctx *ctx) { if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); if (waitqueue_active(&ctx->sqo_wait)) wake_up(&ctx->sqo_wait); - if (ctx->cq_ev_fd) + if (ctx->cq_ev_fd && io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); } @@ -766,7 +1056,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) /* if force is set, the ring is going away. always drop after that */ if (force) - ctx->cq_overflow_flushed = true; + ctx->cq_overflow_flushed = 1; cqe = NULL; while (!list_empty(&ctx->cq_overflow_list)) { @@ -788,6 +1078,10 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) } io_commit_cqring(ctx); + if (cqe) { + clear_bit(0, &ctx->sq_check_overflow); + clear_bit(0, &ctx->cq_check_overflow); + } spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); @@ -821,6 +1115,10 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res) WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); } else { + if (list_empty(&ctx->cq_overflow_list)) { + set_bit(0, &ctx->sq_check_overflow); + set_bit(0, &ctx->cq_check_overflow); + } refcount_inc(&req->refs); req->result = res; list_add_tail(&req->list, &ctx->cq_overflow_list); @@ -863,9 +1161,6 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct io_kiocb *req; - if (!percpu_ref_tryget(&ctx->refs)) - return NULL; - if (!state) { req = kmem_cache_alloc(req_cachep, gfp); if (unlikely(!req)) @@ -898,7 +1193,6 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, got_it: req->io = NULL; - req->ring_file = NULL; req->file = NULL; req->ctx = ctx; req->flags = 0; @@ -915,24 +1209,35 @@ fallback: return NULL; } -static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr) +static void __io_req_do_free(struct io_kiocb *req) { - if (*nr) { - kmem_cache_free_bulk(req_cachep, *nr, reqs); - percpu_ref_put_many(&ctx->refs, *nr); - *nr = 0; + if (likely(!io_is_fallback_req(req))) + kmem_cache_free(req_cachep, req); + else + clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req); +} + +static void __io_req_aux_free(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + kfree(req->io); + if (req->file) { + if (req->flags & REQ_F_FIXED_FILE) + percpu_ref_put(&ctx->file_data->refs); + else + fput(req->file); } + + io_req_work_drop_env(req); } static void __io_free_req(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; + __io_req_aux_free(req); - if (req->io) - kfree(req->io); - if (req->file && !(req->flags & REQ_F_FIXED_FILE)) - fput(req->file); if (req->flags & REQ_F_INFLIGHT) { + struct io_ring_ctx *ctx = req->ctx; unsigned long flags; spin_lock_irqsave(&ctx->inflight_lock, flags); @@ -941,11 +1246,63 @@ static void __io_free_req(struct io_kiocb *req) wake_up(&ctx->inflight_wait); spin_unlock_irqrestore(&ctx->inflight_lock, flags); } - percpu_ref_put(&ctx->refs); - if (likely(!io_is_fallback_req(req))) - kmem_cache_free(req_cachep, req); - else - clear_bit_unlock(0, (unsigned long *) ctx->fallback_req); + + percpu_ref_put(&req->ctx->refs); + __io_req_do_free(req); +} + +struct req_batch { + void *reqs[IO_IOPOLL_BATCH]; + int to_free; + int need_iter; +}; + +static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) +{ + int fixed_refs = rb->to_free; + + if (!rb->to_free) + return; + if (rb->need_iter) { + int i, inflight = 0; + unsigned long flags; + + fixed_refs = 0; + for (i = 0; i < rb->to_free; i++) { + struct io_kiocb *req = rb->reqs[i]; + + if (req->flags & REQ_F_FIXED_FILE) { + req->file = NULL; + fixed_refs++; + } + if (req->flags & REQ_F_INFLIGHT) + inflight++; + __io_req_aux_free(req); + } + if (!inflight) + goto do_free; + + spin_lock_irqsave(&ctx->inflight_lock, flags); + for (i = 0; i < rb->to_free; i++) { + struct io_kiocb *req = rb->reqs[i]; + + if (req->flags & REQ_F_INFLIGHT) { + list_del(&req->inflight_entry); + if (!--inflight) + break; + } + } + spin_unlock_irqrestore(&ctx->inflight_lock, flags); + + if (waitqueue_active(&ctx->inflight_wait)) + wake_up(&ctx->inflight_wait); + } +do_free: + kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); + if (fixed_refs) + percpu_ref_put_many(&ctx->file_data->refs, fixed_refs); + percpu_ref_put_many(&ctx->refs, rb->to_free); + rb->to_free = rb->need_iter = 0; } static bool io_link_cancel_timeout(struct io_kiocb *req) @@ -1118,19 +1475,21 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush) { struct io_rings *rings = ctx->rings; - /* - * noflush == true is from the waitqueue handler, just ensure we wake - * up the task, and the next invocation will flush the entries. We - * cannot safely to it from here. - */ - if (noflush && !list_empty(&ctx->cq_overflow_list)) - return -1U; + if (test_bit(0, &ctx->cq_check_overflow)) { + /* + * noflush == true is from the waitqueue handler, just ensure + * we wake up the task, and the next invocation will flush the + * entries. We cannot safely to it from here. + */ + if (noflush && !list_empty(&ctx->cq_overflow_list)) + return -1U; - io_cqring_overflow_flush(ctx, false); + io_cqring_overflow_flush(ctx, false); + } /* See comment at the top of this file */ smp_rmb(); - return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head); + return ctx->cached_cq_tail - READ_ONCE(rings->cq.head); } static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) @@ -1141,17 +1500,30 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; } +static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req) +{ + if ((req->flags & REQ_F_LINK) || io_is_fallback_req(req)) + return false; + + if (!(req->flags & REQ_F_FIXED_FILE) || req->io) + rb->need_iter++; + + rb->reqs[rb->to_free++] = req; + if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs))) + io_free_req_many(req->ctx, rb); + return true; +} + /* * Find and free completed poll iocbs */ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, struct list_head *done) { - void *reqs[IO_IOPOLL_BATCH]; + struct req_batch rb; struct io_kiocb *req; - int to_free; - to_free = 0; + rb.to_free = rb.need_iter = 0; while (!list_empty(done)) { req = list_first_entry(done, struct io_kiocb, list); list_del(&req->list); @@ -1159,26 +1531,13 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, io_cqring_fill_event(req, req->result); (*nr_events)++; - if (refcount_dec_and_test(&req->refs)) { - /* If we're not using fixed files, we have to pair the - * completion part with the file put. Use regular - * completions for those, only batch free for fixed - * file and non-linked commands. - */ - if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == - REQ_F_FIXED_FILE) && !io_is_fallback_req(req) && - !req->io) { - reqs[to_free++] = req; - if (to_free == ARRAY_SIZE(reqs)) - io_free_req_many(ctx, reqs, &to_free); - } else { - io_free_req(req); - } - } + if (refcount_dec_and_test(&req->refs) && + !io_req_multi_free(&rb, req)) + io_free_req(req); } io_commit_cqring(ctx); - io_free_req_many(ctx, reqs, &to_free); + io_free_req_many(ctx, &rb); } static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, @@ -1503,6 +1862,10 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, req->flags |= REQ_F_ISREG; kiocb->ki_pos = READ_ONCE(sqe->off); + if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) { + req->flags |= REQ_F_CUR_POS; + kiocb->ki_pos = req->file->f_pos; + } kiocb->ki_flags = iocb_flags(kiocb->ki_filp); kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); @@ -1574,6 +1937,10 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt, bool in_async) { + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); + + if (req->flags & REQ_F_CUR_POS) + req->file->f_pos = kiocb->ki_pos; if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw) *nxt = __io_complete_rw(kiocb, ret); else @@ -1671,6 +2038,13 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, if (req->rw.kiocb.private) return -EINVAL; + if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { + ssize_t ret; + ret = import_single_range(rw, buf, sqe_len, *iovec, iter); + *iovec = NULL; + return ret; + } + if (req->io) { struct io_async_rw *iorw = &req->io->rw; @@ -1767,6 +2141,8 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size, static int io_alloc_async_ctx(struct io_kiocb *req) { + if (!io_op_defs[req->opcode].async_ctx) + return 0; req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); return req->io == NULL; } @@ -1786,8 +2162,7 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, struct iovec *iovec, struct iovec *fast_iov, struct iov_iter *iter) { - if (req->opcode == IORING_OP_READ_FIXED || - req->opcode == IORING_OP_WRITE_FIXED) + if (!io_op_defs[req->opcode].async_ctx) return 0; if (!req->io && io_alloc_async_ctx(req)) return -ENOMEM; @@ -2101,6 +2476,421 @@ static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt, return 0; } +static void io_fallocate_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; + int ret; + + ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, + req->sync.len); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, &nxt); + if (nxt) + io_wq_assign_next(workptr, nxt); +} + +static int io_fallocate_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (sqe->ioprio || sqe->buf_index || sqe->rw_flags) + return -EINVAL; + + req->sync.off = READ_ONCE(sqe->off); + req->sync.len = READ_ONCE(sqe->addr); + req->sync.mode = READ_ONCE(sqe->len); + return 0; +} + +static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_wq_work *work, *old_work; + + /* fallocate always requiring blocking context */ + if (force_nonblock) { + io_put_req(req); + req->work.func = io_fallocate_finish; + return -EAGAIN; + } + + work = old_work = &req->work; + io_fallocate_finish(&work); + if (work && work != old_work) + *nxt = container_of(work, struct io_kiocb, work); + + return 0; +} + +static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + const char __user *fname; + int ret; + + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + + req->open.dfd = READ_ONCE(sqe->fd); + req->open.how.mode = READ_ONCE(sqe->len); + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + req->open.how.flags = READ_ONCE(sqe->open_flags); + + req->open.filename = getname(fname); + if (IS_ERR(req->open.filename)) { + ret = PTR_ERR(req->open.filename); + req->open.filename = NULL; + return ret; + } + + return 0; +} + +static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct open_how __user *how; + const char __user *fname; + size_t len; + int ret; + + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + + req->open.dfd = READ_ONCE(sqe->fd); + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + len = READ_ONCE(sqe->len); + + if (len < OPEN_HOW_SIZE_VER0) + return -EINVAL; + + ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how, + len); + if (ret) + return ret; + + if (!(req->open.how.flags & O_PATH) && force_o_largefile()) + req->open.how.flags |= O_LARGEFILE; + + req->open.filename = getname(fname); + if (IS_ERR(req->open.filename)) { + ret = PTR_ERR(req->open.filename); + req->open.filename = NULL; + return ret; + } + + return 0; +} + +static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct open_flags op; + struct file *file; + int ret; + + if (force_nonblock) + return -EAGAIN; + + ret = build_open_flags(&req->open.how, &op); + if (ret) + goto err; + + ret = get_unused_fd_flags(req->open.how.flags); + if (ret < 0) + goto err; + + file = do_filp_open(req->open.dfd, req->open.filename, &op); + if (IS_ERR(file)) { + put_unused_fd(ret); + ret = PTR_ERR(file); + } else { + fsnotify_open(file); + fd_install(ret, file); + } +err: + putname(req->open.filename); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +} + +static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + req->open.how = build_open_how(req->open.how.flags, req->open.how.mode); + return io_openat2(req, nxt, force_nonblock); +} + +static int io_epoll_ctl_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ +#if defined(CONFIG_EPOLL) + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + + req->epoll.epfd = READ_ONCE(sqe->fd); + req->epoll.op = READ_ONCE(sqe->len); + req->epoll.fd = READ_ONCE(sqe->off); + + if (ep_op_has_event(req->epoll.op)) { + struct epoll_event __user *ev; + + ev = u64_to_user_ptr(READ_ONCE(sqe->addr)); + if (copy_from_user(&req->epoll.event, ev, sizeof(*ev))) + return -EFAULT; + } + + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int io_epoll_ctl(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_EPOLL) + struct io_epoll *ie = &req->epoll; + int ret; + + ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ +#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) + if (sqe->ioprio || sqe->buf_index || sqe->off) + return -EINVAL; + + req->madvise.addr = READ_ONCE(sqe->addr); + req->madvise.len = READ_ONCE(sqe->len); + req->madvise.advice = READ_ONCE(sqe->fadvise_advice); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) + struct io_madvise *ma = &req->madvise; + int ret; + + if (force_nonblock) + return -EAGAIN; + + ret = do_madvise(ma->addr, ma->len, ma->advice); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + if (sqe->ioprio || sqe->buf_index || sqe->addr) + return -EINVAL; + + req->fadvise.offset = READ_ONCE(sqe->off); + req->fadvise.len = READ_ONCE(sqe->len); + req->fadvise.advice = READ_ONCE(sqe->fadvise_advice); + return 0; +} + +static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_fadvise *fa = &req->fadvise; + int ret; + + /* DONTNEED may block, others _should_ not */ + if (fa->advice == POSIX_FADV_DONTNEED && force_nonblock) + return -EAGAIN; + + ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +} + +static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + const char __user *fname; + unsigned lookup_flags; + int ret; + + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; + + req->open.dfd = READ_ONCE(sqe->fd); + req->open.mask = READ_ONCE(sqe->len); + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + req->open.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + req->open.how.flags = READ_ONCE(sqe->statx_flags); + + if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.how.flags)) + return -EINVAL; + + req->open.filename = getname_flags(fname, lookup_flags, NULL); + if (IS_ERR(req->open.filename)) { + ret = PTR_ERR(req->open.filename); + req->open.filename = NULL; + return ret; + } + + return 0; +} + +static int io_statx(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_open *ctx = &req->open; + unsigned lookup_flags; + struct path path; + struct kstat stat; + int ret; + + if (force_nonblock) + return -EAGAIN; + + if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->how.flags)) + return -EINVAL; + +retry: + /* filename_lookup() drops it, keep a reference */ + ctx->filename->refcnt++; + + ret = filename_lookup(ctx->dfd, ctx->filename, lookup_flags, &path, + NULL); + if (ret) + goto err; + + ret = vfs_getattr(&path, &stat, ctx->mask, ctx->how.flags); + path_put(&path); + if (retry_estale(ret, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + if (!ret) + ret = cp_statx(&stat, ctx->buffer); +err: + putname(ctx->filename); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, nxt); + return 0; +} + +static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + /* + * If we queue this for async, it must not be cancellable. That would + * leave the 'file' in an undeterminate state. + */ + req->work.flags |= IO_WQ_WORK_NO_CANCEL; + + if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || + sqe->rw_flags || sqe->buf_index) + return -EINVAL; + if (sqe->flags & IOSQE_FIXED_FILE) + return -EINVAL; + + req->close.fd = READ_ONCE(sqe->fd); + if (req->file->f_op == &io_uring_fops || + req->close.fd == req->ctx->ring_fd) + return -EBADF; + + return 0; +} + +static void io_close_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; + + /* Invoked with files, we need to do the close */ + if (req->work.files) { + int ret; + + ret = filp_close(req->close.put_file, req->work.files); + if (ret < 0) { + req_set_fail_links(req); + } + io_cqring_add_event(req, ret); + } + + fput(req->close.put_file); + + /* we bypassed the re-issue, drop the submission reference */ + io_put_req(req); + io_put_req_find_next(req, &nxt); + if (nxt) + io_wq_assign_next(workptr, nxt); +} + +static int io_close(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + int ret; + + req->close.put_file = NULL; + ret = __close_fd_get_file(req->close.fd, &req->close.put_file); + if (ret < 0) + return ret; + + /* if the file has a flush method, be safe and punt to async */ + if (req->close.put_file->f_op->flush && !io_wq_current_is_worker()) + goto eagain; + + /* + * No ->flush(), safely close from here and just punt the + * fput() to async context. + */ + ret = filp_close(req->close.put_file, current->files); + + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + + if (io_wq_current_is_worker()) { + struct io_wq_work *old_work, *work; + + old_work = work = &req->work; + io_close_finish(&work); + if (work && work != old_work) + *nxt = container_of(work, struct io_kiocb, work); + return 0; + } + +eagain: + req->work.func = io_close_finish; + return -EAGAIN; +} + static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; @@ -2178,8 +2968,9 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->len = READ_ONCE(sqe->len); - if (!io) + if (!io || req->opcode == IORING_OP_SEND) return 0; io->msg.iov = io->msg.fast_iov; @@ -2259,6 +3050,56 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, #endif } +static int io_send(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_NET) + struct socket *sock; + int ret; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + sock = sock_from_file(req->file, &ret); + if (sock) { + struct io_sr_msg *sr = &req->sr_msg; + struct msghdr msg; + struct iovec iov; + unsigned flags; + + ret = import_single_range(WRITE, sr->buf, sr->len, &iov, + &msg.msg_iter); + if (ret) + return ret; + + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = __sys_sendmsg_sock(sock, &msg, flags); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + } + + io_cqring_add_event(req, ret); + if (ret < 0) + req_set_fail_links(req); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -2269,7 +3110,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); - if (!io) + if (!io || req->opcode == IORING_OP_RECV) return 0; io->msg.iov = io->msg.fast_iov; @@ -2351,6 +3192,59 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, #endif } +static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_NET) + struct socket *sock; + int ret; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + sock = sock_from_file(req->file, &ret); + if (sock) { + struct io_sr_msg *sr = &req->sr_msg; + struct msghdr msg; + struct iovec iov; + unsigned flags; + + ret = import_single_range(READ, sr->buf, sr->len, &iov, + &msg.msg_iter); + if (ret) + return ret; + + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + msg.msg_iocb = NULL; + msg.msg_flags = 0; + + flags = req->sr_msg.msg_flags; + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + ret = __sys_recvmsg_sock(sock, &msg, NULL, NULL, flags); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + } + + io_cqring_add_event(req, ret); + if (ret < 0) + req_set_fail_links(req); + io_put_req_find_next(req, nxt); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + + static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_NET) @@ -2414,7 +3308,6 @@ static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt, ret = __io_accept(req, nxt, force_nonblock); if (ret == -EAGAIN && force_nonblock) { req->work.func = io_accept_finish; - req->work.flags |= IO_WQ_WORK_NEEDS_FILES; io_put_req(req); return -EAGAIN; } @@ -2635,6 +3528,39 @@ static void io_poll_complete_work(struct io_wq_work **workptr) io_wq_assign_next(workptr, nxt); } +static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes) +{ + struct io_kiocb *req, *tmp; + struct req_batch rb; + + rb.to_free = rb.need_iter = 0; + spin_lock_irq(&ctx->completion_lock); + llist_for_each_entry_safe(req, tmp, nodes, llist_node) { + hash_del(&req->hash_node); + io_poll_complete(req, req->result, 0); + + if (refcount_dec_and_test(&req->refs) && + !io_req_multi_free(&rb, req)) { + req->flags |= REQ_F_COMP_LOCKED; + io_free_req(req); + } + } + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); + io_free_req_many(ctx, &rb); +} + +static void io_poll_flush(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct llist_node *nodes; + + nodes = llist_del_all(&req->ctx->poll_llist); + if (nodes) + __io_poll_flush(req->ctx, nodes); +} + static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { @@ -2642,7 +3568,6 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, struct io_kiocb *req = container_of(poll, struct io_kiocb, poll); struct io_ring_ctx *ctx = req->ctx; __poll_t mask = key_to_poll(key); - unsigned long flags; /* for instances that support it check for an event match first: */ if (mask && !(mask & poll->events)) @@ -2656,17 +3581,31 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, * If we have a link timeout we're going to need the completion_lock * for finalizing the request, mark us as having grabbed that already. */ - if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) { - hash_del(&req->hash_node); - io_poll_complete(req, mask, 0); - req->flags |= REQ_F_COMP_LOCKED; - io_put_req(req); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + if (mask) { + unsigned long flags; - io_cqring_ev_posted(ctx); - } else { - io_queue_async_work(req); + if (llist_empty(&ctx->poll_llist) && + spin_trylock_irqsave(&ctx->completion_lock, flags)) { + hash_del(&req->hash_node); + io_poll_complete(req, mask, 0); + req->flags |= REQ_F_COMP_LOCKED; + io_put_req(req); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + + io_cqring_ev_posted(ctx); + req = NULL; + } else { + req->result = mask; + req->llist_node.next = NULL; + /* if the list wasn't empty, we're done */ + if (!llist_add(&req->llist_node, &ctx->poll_llist)) + req = NULL; + else + req->work.func = io_poll_flush; + } } + if (req) + io_queue_async_work(req); return 1; } @@ -3071,20 +4010,67 @@ static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt) return 0; } +static int io_files_update_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (sqe->flags || sqe->ioprio || sqe->rw_flags) + return -EINVAL; + + req->files_update.offset = READ_ONCE(sqe->off); + req->files_update.nr_args = READ_ONCE(sqe->len); + if (!req->files_update.nr_args) + return -EINVAL; + req->files_update.arg = READ_ONCE(sqe->addr); + return 0; +} + +static int io_files_update(struct io_kiocb *req, bool force_nonblock) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_uring_files_update up; + int ret; + + if (force_nonblock) + return -EAGAIN; + + up.offset = req->files_update.offset; + up.fds = req->files_update.arg; + + mutex_lock(&ctx->uring_lock); + ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args); + mutex_unlock(&ctx->uring_lock); + + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req(req); + return 0; +} + static int io_req_defer_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { ssize_t ret = 0; + if (io_op_defs[req->opcode].file_table) { + ret = io_grab_files(req); + if (unlikely(ret)) + return ret; + } + + io_req_work_grab_env(req, &io_op_defs[req->opcode]); + switch (req->opcode) { case IORING_OP_NOP: break; case IORING_OP_READV: case IORING_OP_READ_FIXED: + case IORING_OP_READ: ret = io_read_prep(req, sqe, true); break; case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: ret = io_write_prep(req, sqe, true); break; case IORING_OP_POLL_ADD: @@ -3100,9 +4086,11 @@ static int io_req_defer_prep(struct io_kiocb *req, ret = io_prep_sfr(req, sqe); break; case IORING_OP_SENDMSG: + case IORING_OP_SEND: ret = io_sendmsg_prep(req, sqe); break; case IORING_OP_RECVMSG: + case IORING_OP_RECV: ret = io_recvmsg_prep(req, sqe); break; case IORING_OP_CONNECT: @@ -3123,6 +4111,33 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_ACCEPT: ret = io_accept_prep(req, sqe); break; + case IORING_OP_FALLOCATE: + ret = io_fallocate_prep(req, sqe); + break; + case IORING_OP_OPENAT: + ret = io_openat_prep(req, sqe); + break; + case IORING_OP_CLOSE: + ret = io_close_prep(req, sqe); + break; + case IORING_OP_FILES_UPDATE: + ret = io_files_update_prep(req, sqe); + break; + case IORING_OP_STATX: + ret = io_statx_prep(req, sqe); + break; + case IORING_OP_FADVISE: + ret = io_fadvise_prep(req, sqe); + break; + case IORING_OP_MADVISE: + ret = io_madvise_prep(req, sqe); + break; + case IORING_OP_OPENAT2: + ret = io_openat2_prep(req, sqe); + break; + case IORING_OP_EPOLL_CTL: + ret = io_epoll_ctl_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -3173,6 +4188,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, break; case IORING_OP_READV: case IORING_OP_READ_FIXED: + case IORING_OP_READ: if (sqe) { ret = io_read_prep(req, sqe, force_nonblock); if (ret < 0) @@ -3182,6 +4198,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, break; case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: if (sqe) { ret = io_write_prep(req, sqe, force_nonblock); if (ret < 0) @@ -3222,20 +4239,28 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ret = io_sync_file_range(req, nxt, force_nonblock); break; case IORING_OP_SENDMSG: + case IORING_OP_SEND: if (sqe) { ret = io_sendmsg_prep(req, sqe); if (ret < 0) break; } - ret = io_sendmsg(req, nxt, force_nonblock); + if (req->opcode == IORING_OP_SENDMSG) + ret = io_sendmsg(req, nxt, force_nonblock); + else + ret = io_send(req, nxt, force_nonblock); break; case IORING_OP_RECVMSG: + case IORING_OP_RECV: if (sqe) { ret = io_recvmsg_prep(req, sqe); if (ret) break; } - ret = io_recvmsg(req, nxt, force_nonblock); + if (req->opcode == IORING_OP_RECVMSG) + ret = io_recvmsg(req, nxt, force_nonblock); + else + ret = io_recv(req, nxt, force_nonblock); break; case IORING_OP_TIMEOUT: if (sqe) { @@ -3277,6 +4302,78 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_async_cancel(req, nxt); break; + case IORING_OP_FALLOCATE: + if (sqe) { + ret = io_fallocate_prep(req, sqe); + if (ret) + break; + } + ret = io_fallocate(req, nxt, force_nonblock); + break; + case IORING_OP_OPENAT: + if (sqe) { + ret = io_openat_prep(req, sqe); + if (ret) + break; + } + ret = io_openat(req, nxt, force_nonblock); + break; + case IORING_OP_CLOSE: + if (sqe) { + ret = io_close_prep(req, sqe); + if (ret) + break; + } + ret = io_close(req, nxt, force_nonblock); + break; + case IORING_OP_FILES_UPDATE: + if (sqe) { + ret = io_files_update_prep(req, sqe); + if (ret) + break; + } + ret = io_files_update(req, force_nonblock); + break; + case IORING_OP_STATX: + if (sqe) { + ret = io_statx_prep(req, sqe); + if (ret) + break; + } + ret = io_statx(req, nxt, force_nonblock); + break; + case IORING_OP_FADVISE: + if (sqe) { + ret = io_fadvise_prep(req, sqe); + if (ret) + break; + } + ret = io_fadvise(req, nxt, force_nonblock); + break; + case IORING_OP_MADVISE: + if (sqe) { + ret = io_madvise_prep(req, sqe); + if (ret) + break; + } + ret = io_madvise(req, nxt, force_nonblock); + break; + case IORING_OP_OPENAT2: + if (sqe) { + ret = io_openat2_prep(req, sqe); + if (ret) + break; + } + ret = io_openat2(req, nxt, force_nonblock); + break; + case IORING_OP_EPOLL_CTL: + if (sqe) { + ret = io_epoll_ctl_prep(req, sqe); + if (ret) + break; + } + ret = io_epoll_ctl(req, nxt, force_nonblock); + break; default: ret = -EINVAL; break; @@ -3311,8 +4408,11 @@ static void io_wq_submit_work(struct io_wq_work **workptr) struct io_kiocb *nxt = NULL; int ret = 0; - if (work->flags & IO_WQ_WORK_CANCEL) + /* if NO_CANCEL is set, we must still run the work */ + if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) == + IO_WQ_WORK_CANCEL) { ret = -ECANCELED; + } if (!ret) { req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0; @@ -3344,26 +4444,13 @@ static void io_wq_submit_work(struct io_wq_work **workptr) io_wq_assign_next(workptr, nxt); } -static bool io_req_op_valid(int op) -{ - return op >= IORING_OP_NOP && op < IORING_OP_LAST; -} - -static int io_req_needs_file(struct io_kiocb *req) +static int io_req_needs_file(struct io_kiocb *req, int fd) { - switch (req->opcode) { - case IORING_OP_NOP: - case IORING_OP_POLL_REMOVE: - case IORING_OP_TIMEOUT: - case IORING_OP_TIMEOUT_REMOVE: - case IORING_OP_ASYNC_CANCEL: - case IORING_OP_LINK_TIMEOUT: + if (!io_op_defs[req->opcode].needs_file) return 0; - default: - if (io_req_op_valid(req->opcode)) - return 1; - return -EINVAL; - } + if (fd == -1 && io_op_defs[req->opcode].fd_non_neg) + return 0; + return 1; } static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, @@ -3371,8 +4458,8 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, { struct fixed_file_table *table; - table = &ctx->file_table[index >> IORING_FILE_TABLE_SHIFT]; - return table->files[index & IORING_FILE_TABLE_MASK]; + table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT]; + return table->files[index & IORING_FILE_TABLE_MASK];; } static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, @@ -3380,20 +4467,16 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, { struct io_ring_ctx *ctx = req->ctx; unsigned flags; - int fd, ret; + int fd; flags = READ_ONCE(sqe->flags); fd = READ_ONCE(sqe->fd); - if (flags & IOSQE_IO_DRAIN) - req->flags |= REQ_F_IO_DRAIN; - - ret = io_req_needs_file(req); - if (ret <= 0) - return ret; + if (!io_req_needs_file(req, fd)) + return 0; if (flags & IOSQE_FIXED_FILE) { - if (unlikely(!ctx->file_table || + if (unlikely(!ctx->file_data || (unsigned) fd >= ctx->nr_user_files)) return -EBADF; fd = array_index_nospec(fd, ctx->nr_user_files); @@ -3401,6 +4484,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, if (!req->file) return -EBADF; req->flags |= REQ_F_FIXED_FILE; + percpu_ref_get(&ctx->file_data->refs); } else { if (req->needs_fixed_file) return -EBADF; @@ -3418,6 +4502,11 @@ static int io_grab_files(struct io_kiocb *req) int ret = -EBADF; struct io_ring_ctx *ctx = req->ctx; + if (req->work.files) + return 0; + if (!ctx->ring_file) + return -EBADF; + rcu_read_lock(); spin_lock_irq(&ctx->inflight_lock); /* @@ -3426,7 +4515,7 @@ static int io_grab_files(struct io_kiocb *req) * the fd has changed since we started down this path, and disallow * this operation if it has. */ - if (fcheck(req->ring_fd) == req->ring_file) { + if (fcheck(ctx->ring_fd) == ctx->ring_file) { list_add(&req->inflight_entry, &ctx->inflight_list); req->flags |= REQ_F_INFLIGHT; req->work.files = current->files; @@ -3532,7 +4621,8 @@ again: */ if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || (req->flags & REQ_F_MUST_PUNT))) { - if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) { +punt: + if (io_op_defs[req->opcode].file_table) { ret = io_grab_files(req); if (ret) goto err; @@ -3567,6 +4657,9 @@ done_req: if (nxt) { req = nxt; nxt = NULL; + + if (req->flags & REQ_F_FORCE_ASYNC) + goto punt; goto again; } } @@ -3575,21 +4668,27 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) { int ret; - if (unlikely(req->ctx->drain_next)) { - req->flags |= REQ_F_IO_DRAIN; - req->ctx->drain_next = false; - } - req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK); - ret = io_req_defer(req, sqe); if (ret) { if (ret != -EIOCBQUEUED) { +fail_req: io_cqring_add_event(req, ret); req_set_fail_links(req); io_double_put_req(req); } - } else + } else if (req->flags & REQ_F_FORCE_ASYNC) { + ret = io_req_defer_prep(req, sqe); + if (unlikely(ret < 0)) + goto fail_req; + /* + * Never try inline submit of IOSQE_ASYNC is set, go straight + * to async execution. + */ + req->work.flags |= IO_WQ_WORK_CONCURRENT; + io_queue_async_work(req); + } else { __io_queue_sqe(req, sqe); + } } static inline void io_queue_link_head(struct io_kiocb *req) @@ -3602,25 +4701,47 @@ static inline void io_queue_link_head(struct io_kiocb *req) } #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ - IOSQE_IO_HARDLINK) + IOSQE_IO_HARDLINK | IOSQE_ASYNC) static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, struct io_submit_state *state, struct io_kiocb **link) { + const struct cred *old_creds = NULL; struct io_ring_ctx *ctx = req->ctx; - int ret; + unsigned int sqe_flags; + int ret, id; + + sqe_flags = READ_ONCE(sqe->flags); /* enforce forwards compatibility on users */ - if (unlikely(sqe->flags & ~SQE_VALID_FLAGS)) { + if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) { ret = -EINVAL; goto err_req; } + id = READ_ONCE(sqe->personality); + if (id) { + const struct cred *personality_creds; + + personality_creds = idr_find(&ctx->personality_idr, id); + if (unlikely(!personality_creds)) { + ret = -EINVAL; + goto err_req; + } + old_creds = override_creds(personality_creds); + } + + /* same numerical values with corresponding REQ_F_*, safe to copy */ + req->flags |= sqe_flags & (IOSQE_IO_DRAIN|IOSQE_IO_HARDLINK| + IOSQE_ASYNC); + ret = io_req_set_file(state, req, sqe); if (unlikely(ret)) { err_req: io_cqring_add_event(req, ret); io_double_put_req(req); + if (old_creds) + revert_creds(old_creds); return false; } @@ -3632,14 +4753,19 @@ err_req: * conditions are true (normal request), then just queue it. */ if (*link) { - struct io_kiocb *prev = *link; - - if (sqe->flags & IOSQE_IO_DRAIN) - (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN; - - if (sqe->flags & IOSQE_IO_HARDLINK) - req->flags |= REQ_F_HARDLINK; + struct io_kiocb *head = *link; + /* + * Taking sequential execution of a link, draining both sides + * of the link also fullfils IOSQE_IO_DRAIN semantics for all + * requests in the link. So, it drains the head and the + * next after the link request. The last one is done via + * drain_next flag to persist the effect across calls. + */ + if (sqe_flags & IOSQE_IO_DRAIN) { + head->flags |= REQ_F_IO_DRAIN; + ctx->drain_next = 1; + } if (io_alloc_async_ctx(req)) { ret = -EAGAIN; goto err_req; @@ -3648,25 +4774,36 @@ err_req: ret = io_req_defer_prep(req, sqe); if (ret) { /* fail even hard links since we don't submit */ - prev->flags |= REQ_F_FAIL_LINK; + head->flags |= REQ_F_FAIL_LINK; goto err_req; } - trace_io_uring_link(ctx, req, prev); - list_add_tail(&req->link_list, &prev->link_list); - } else if (sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { - req->flags |= REQ_F_LINK; - if (sqe->flags & IOSQE_IO_HARDLINK) - req->flags |= REQ_F_HARDLINK; - - INIT_LIST_HEAD(&req->link_list); - ret = io_req_defer_prep(req, sqe); - if (ret) - req->flags |= REQ_F_FAIL_LINK; - *link = req; + trace_io_uring_link(ctx, req, head); + list_add_tail(&req->link_list, &head->link_list); + + /* last request of a link, enqueue the link */ + if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK))) { + io_queue_link_head(head); + *link = NULL; + } } else { - io_queue_sqe(req, sqe); + if (unlikely(ctx->drain_next)) { + req->flags |= REQ_F_IO_DRAIN; + req->ctx->drain_next = 0; + } + if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { + req->flags |= REQ_F_LINK; + INIT_LIST_HEAD(&req->link_list); + ret = io_req_defer_prep(req, sqe); + if (ret) + req->flags |= REQ_F_FAIL_LINK; + *link = req; + } else { + io_queue_sqe(req, sqe); + } } + if (old_creds) + revert_creds(old_creds); return true; } @@ -3698,14 +4835,12 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; - if (ctx->cached_sq_head != READ_ONCE(rings->sq.head)) { - /* - * Ensure any loads from the SQEs are done at this point, - * since once we write the new head, the application could - * write new data to them. - */ - smp_store_release(&rings->sq.head, ctx->cached_sq_head); - } + /* + * Ensure any loads from the SQEs are done at this point, + * since once we write the new head, the application could + * write new data to them. + */ + smp_store_release(&rings->sq.head, ctx->cached_sq_head); } /* @@ -3719,7 +4854,6 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe **sqe_ptr) { - struct io_rings *rings = ctx->rings; u32 *sq_array = ctx->sq_array; unsigned head; @@ -3731,12 +4865,7 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req, * 2) allows the kernel side to track the head on its own, even * though the application is the one updating it. */ - head = ctx->cached_sq_head; - /* make sure SQ entry isn't read before tail */ - if (unlikely(head == smp_load_acquire(&rings->sq.tail))) - return false; - - head = READ_ONCE(sq_array[head & ctx->sq_mask]); + head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]); if (likely(head < ctx->sq_entries)) { /* * All io need record the previous position, if LINK vs DARIN, @@ -3754,7 +4883,7 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req, /* drop invalid entries */ ctx->cached_sq_head++; ctx->cached_sq_dropped++; - WRITE_ONCE(rings->sq_dropped, ctx->cached_sq_dropped); + WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped); return false; } @@ -3768,19 +4897,29 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, bool mm_fault = false; /* if we have a backlog and couldn't flush it all, return BUSY */ - if (!list_empty(&ctx->cq_overflow_list) && - !io_cqring_overflow_flush(ctx, false)) - return -EBUSY; + if (test_bit(0, &ctx->sq_check_overflow)) { + if (!list_empty(&ctx->cq_overflow_list) && + !io_cqring_overflow_flush(ctx, false)) + return -EBUSY; + } + + /* make sure SQ entry isn't read before tail */ + nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx)); + + if (!percpu_ref_tryget_many(&ctx->refs, nr)) + return -EAGAIN; if (nr > IO_PLUG_THRESHOLD) { io_submit_state_start(&state, nr); statep = &state; } + ctx->ring_fd = ring_fd; + ctx->ring_file = ring_file; + for (i = 0; i < nr; i++) { const struct io_uring_sqe *sqe; struct io_kiocb *req; - unsigned int sqe_flags; req = io_get_req(ctx, statep); if (unlikely(!req)) { @@ -3789,11 +4928,20 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, break; } if (!io_get_sqring(ctx, req, &sqe)) { - __io_free_req(req); + __io_req_do_free(req); break; } - if (io_req_needs_user(req) && !*mm) { + /* will complete beyond this point, count as submitted */ + submitted++; + + if (unlikely(req->opcode >= IORING_OP_LAST)) { + io_cqring_add_event(req, -EINVAL); + io_double_put_req(req); + break; + } + + if (io_op_defs[req->opcode].needs_mm && !*mm) { mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm); if (!mm_fault) { use_mm(ctx->sqo_mm); @@ -3801,27 +4949,20 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, } } - submitted++; - sqe_flags = sqe->flags; - - req->ring_file = ring_file; - req->ring_fd = ring_fd; req->has_user = *mm != NULL; req->in_async = async; req->needs_fixed_file = async; - trace_io_uring_submit_sqe(ctx, req->user_data, true, async); + trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, + true, async); if (!io_submit_sqe(req, sqe, statep, &link)) break; - /* - * If previous wasn't linked and we have a linked command, - * that's the end of the chain. Submit the previous link. - */ - if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) && link) { - io_queue_link_head(link); - link = NULL; - } } + if (unlikely(submitted != nr)) { + int ref_used = (submitted == -EAGAIN) ? 0 : submitted; + + percpu_ref_put_many(&ctx->refs, nr - ref_used); + } if (link) io_queue_link_head(link); if (statep) @@ -3944,7 +5085,6 @@ static int io_sq_thread(void *data) ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; } - to_submit = min(to_submit, ctx->sq_entries); mutex_lock(&ctx->uring_lock); ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true); mutex_unlock(&ctx->uring_lock); @@ -4075,19 +5215,40 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) #endif } +static void io_file_ref_kill(struct percpu_ref *ref) +{ + struct fixed_file_data *data; + + data = container_of(ref, struct fixed_file_data, refs); + complete(&data->done); +} + static int io_sqe_files_unregister(struct io_ring_ctx *ctx) { + struct fixed_file_data *data = ctx->file_data; unsigned nr_tables, i; - if (!ctx->file_table) + if (!data) return -ENXIO; + /* protect against inflight atomic switch, which drops the ref */ + percpu_ref_get(&data->refs); + /* wait for existing switches */ + flush_work(&data->ref_work); + percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill); + wait_for_completion(&data->done); + percpu_ref_put(&data->refs); + /* flush potential new switch */ + flush_work(&data->ref_work); + percpu_ref_exit(&data->refs); + __io_sqe_files_unregister(ctx); nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE); for (i = 0; i < nr_tables; i++) - kfree(ctx->file_table[i].files); - kfree(ctx->file_table); - ctx->file_table = NULL; + kfree(data->table[i].files); + kfree(data->table); + kfree(data); + ctx->file_data = NULL; ctx->nr_user_files = 0; return 0; } @@ -4118,16 +5279,6 @@ static void io_finish_async(struct io_ring_ctx *ctx) } #if defined(CONFIG_UNIX) -static void io_destruct_skb(struct sk_buff *skb) -{ - struct io_ring_ctx *ctx = skb->sk->sk_user_data; - - if (ctx->io_wq) - io_wq_flush(ctx->io_wq); - - unix_destruct_scm(skb); -} - /* * Ensure the UNIX gc is aware of our file set, so we are certain that * the io_uring can be safely unregistered on process exit, even if we have @@ -4175,7 +5326,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) fpl->max = SCM_MAX_FD; fpl->count = nr_files; UNIXCB(skb).fp = fpl; - skb->destructor = io_destruct_skb; + skb->destructor = unix_destruct_scm; refcount_add(skb->truesize, &sk->sk_wmem_alloc); skb_queue_head(&sk->sk_receive_queue, skb); @@ -4237,7 +5388,7 @@ static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables, int i; for (i = 0; i < nr_tables; i++) { - struct fixed_file_table *table = &ctx->file_table[i]; + struct fixed_file_table *table = &ctx->file_data->table[i]; unsigned this_files; this_files = min(nr_files, IORING_MAX_FILES_TABLE); @@ -4252,36 +5403,159 @@ static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables, return 0; for (i = 0; i < nr_tables; i++) { - struct fixed_file_table *table = &ctx->file_table[i]; + struct fixed_file_table *table = &ctx->file_data->table[i]; kfree(table->files); } return 1; } +static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file) +{ +#if defined(CONFIG_UNIX) + struct sock *sock = ctx->ring_sock->sk; + struct sk_buff_head list, *head = &sock->sk_receive_queue; + struct sk_buff *skb; + int i; + + __skb_queue_head_init(&list); + + /* + * Find the skb that holds this file in its SCM_RIGHTS. When found, + * remove this entry and rearrange the file array. + */ + skb = skb_dequeue(head); + while (skb) { + struct scm_fp_list *fp; + + fp = UNIXCB(skb).fp; + for (i = 0; i < fp->count; i++) { + int left; + + if (fp->fp[i] != file) + continue; + + unix_notinflight(fp->user, fp->fp[i]); + left = fp->count - 1 - i; + if (left) { + memmove(&fp->fp[i], &fp->fp[i + 1], + left * sizeof(struct file *)); + } + fp->count--; + if (!fp->count) { + kfree_skb(skb); + skb = NULL; + } else { + __skb_queue_tail(&list, skb); + } + fput(file); + file = NULL; + break; + } + + if (!file) + break; + + __skb_queue_tail(&list, skb); + + skb = skb_dequeue(head); + } + + if (skb_peek(&list)) { + spin_lock_irq(&head->lock); + while ((skb = __skb_dequeue(&list)) != NULL) + __skb_queue_tail(head, skb); + spin_unlock_irq(&head->lock); + } +#else + fput(file); +#endif +} + +struct io_file_put { + struct llist_node llist; + struct file *file; + struct completion *done; +}; + +static void io_ring_file_ref_switch(struct work_struct *work) +{ + struct io_file_put *pfile, *tmp; + struct fixed_file_data *data; + struct llist_node *node; + + data = container_of(work, struct fixed_file_data, ref_work); + + while ((node = llist_del_all(&data->put_llist)) != NULL) { + llist_for_each_entry_safe(pfile, tmp, node, llist) { + io_ring_file_put(data->ctx, pfile->file); + if (pfile->done) + complete(pfile->done); + else + kfree(pfile); + } + } + + percpu_ref_get(&data->refs); + percpu_ref_switch_to_percpu(&data->refs); +} + +static void io_file_data_ref_zero(struct percpu_ref *ref) +{ + struct fixed_file_data *data; + + data = container_of(ref, struct fixed_file_data, refs); + + /* we can't safely switch from inside this context, punt to wq */ + queue_work(system_wq, &data->ref_work); +} + static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { __s32 __user *fds = (__s32 __user *) arg; unsigned nr_tables; + struct file *file; int fd, ret = 0; unsigned i; - if (ctx->file_table) + if (ctx->file_data) return -EBUSY; if (!nr_args) return -EINVAL; if (nr_args > IORING_MAX_FIXED_FILES) return -EMFILE; + ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL); + if (!ctx->file_data) + return -ENOMEM; + ctx->file_data->ctx = ctx; + init_completion(&ctx->file_data->done); + nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE); - ctx->file_table = kcalloc(nr_tables, sizeof(struct fixed_file_table), + ctx->file_data->table = kcalloc(nr_tables, + sizeof(struct fixed_file_table), GFP_KERNEL); - if (!ctx->file_table) + if (!ctx->file_data->table) { + kfree(ctx->file_data); + ctx->file_data = NULL; return -ENOMEM; + } + + if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { + kfree(ctx->file_data->table); + kfree(ctx->file_data); + ctx->file_data = NULL; + return -ENOMEM; + } + ctx->file_data->put_llist.first = NULL; + INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch); if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) { - kfree(ctx->file_table); - ctx->file_table = NULL; + percpu_ref_exit(&ctx->file_data->refs); + kfree(ctx->file_data->table); + kfree(ctx->file_data); + ctx->file_data = NULL; return -ENOMEM; } @@ -4298,13 +5572,14 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, continue; } - table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT]; + table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT]; index = i & IORING_FILE_TABLE_MASK; - table->files[index] = fget(fd); + file = fget(fd); ret = -EBADF; - if (!table->files[index]) + if (!file) break; + /* * Don't allow io_uring instances to be registered. If UNIX * isn't enabled, then this causes a reference cycle and this @@ -4312,26 +5587,26 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, * handle it just fine, but there's still no point in allowing * a ring fd as it doesn't support regular read/write anyway. */ - if (table->files[index]->f_op == &io_uring_fops) { - fput(table->files[index]); + if (file->f_op == &io_uring_fops) { + fput(file); break; } ret = 0; + table->files[index] = file; } if (ret) { for (i = 0; i < ctx->nr_user_files; i++) { - struct file *file; - file = io_file_from_index(ctx, i); if (file) fput(file); } for (i = 0; i < nr_tables; i++) - kfree(ctx->file_table[i].files); + kfree(ctx->file_data->table[i].files); - kfree(ctx->file_table); - ctx->file_table = NULL; + kfree(ctx->file_data->table); + kfree(ctx->file_data); + ctx->file_data = NULL; ctx->nr_user_files = 0; return ret; } @@ -4343,69 +5618,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } -static void io_sqe_file_unregister(struct io_ring_ctx *ctx, int index) -{ -#if defined(CONFIG_UNIX) - struct file *file = io_file_from_index(ctx, index); - struct sock *sock = ctx->ring_sock->sk; - struct sk_buff_head list, *head = &sock->sk_receive_queue; - struct sk_buff *skb; - int i; - - __skb_queue_head_init(&list); - - /* - * Find the skb that holds this file in its SCM_RIGHTS. When found, - * remove this entry and rearrange the file array. - */ - skb = skb_dequeue(head); - while (skb) { - struct scm_fp_list *fp; - - fp = UNIXCB(skb).fp; - for (i = 0; i < fp->count; i++) { - int left; - - if (fp->fp[i] != file) - continue; - - unix_notinflight(fp->user, fp->fp[i]); - left = fp->count - 1 - i; - if (left) { - memmove(&fp->fp[i], &fp->fp[i + 1], - left * sizeof(struct file *)); - } - fp->count--; - if (!fp->count) { - kfree_skb(skb); - skb = NULL; - } else { - __skb_queue_tail(&list, skb); - } - fput(file); - file = NULL; - break; - } - - if (!file) - break; - - __skb_queue_tail(&list, skb); - - skb = skb_dequeue(head); - } - - if (skb_peek(&list)) { - spin_lock_irq(&head->lock); - while ((skb = __skb_dequeue(&list)) != NULL) - __skb_queue_tail(head, skb); - spin_unlock_irq(&head->lock); - } -#else - fput(io_file_from_index(ctx, index)); -#endif -} - static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, int index) { @@ -4449,29 +5661,65 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, #endif } -static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, - unsigned nr_args) +static void io_atomic_switch(struct percpu_ref *ref) { - struct io_uring_files_update up; + struct fixed_file_data *data; + + data = container_of(ref, struct fixed_file_data, refs); + clear_bit(FFD_F_ATOMIC, &data->state); +} + +static bool io_queue_file_removal(struct fixed_file_data *data, + struct file *file) +{ + struct io_file_put *pfile, pfile_stack; + DECLARE_COMPLETION_ONSTACK(done); + + /* + * If we fail allocating the struct we need for doing async reomval + * of this file, just punt to sync and wait for it. + */ + pfile = kzalloc(sizeof(*pfile), GFP_KERNEL); + if (!pfile) { + pfile = &pfile_stack; + pfile->done = &done; + } + + pfile->file = file; + llist_add(&pfile->llist, &data->put_llist); + + if (pfile == &pfile_stack) { + if (!test_and_set_bit(FFD_F_ATOMIC, &data->state)) { + percpu_ref_put(&data->refs); + percpu_ref_switch_to_atomic(&data->refs, + io_atomic_switch); + } + wait_for_completion(&done); + flush_work(&data->ref_work); + return false; + } + + return true; +} + +static int __io_sqe_files_update(struct io_ring_ctx *ctx, + struct io_uring_files_update *up, + unsigned nr_args) +{ + struct fixed_file_data *data = ctx->file_data; + bool ref_switch = false; + struct file *file; __s32 __user *fds; int fd, i, err; __u32 done; - if (!ctx->file_table) - return -ENXIO; - if (!nr_args) - return -EINVAL; - if (copy_from_user(&up, arg, sizeof(up))) - return -EFAULT; - if (up.resv) - return -EINVAL; - if (check_add_overflow(up.offset, nr_args, &done)) + if (check_add_overflow(up->offset, nr_args, &done)) return -EOVERFLOW; if (done > ctx->nr_user_files) return -EINVAL; done = 0; - fds = u64_to_user_ptr(up.fds); + fds = u64_to_user_ptr(up->fds); while (nr_args) { struct fixed_file_table *table; unsigned index; @@ -4481,16 +5729,16 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, err = -EFAULT; break; } - i = array_index_nospec(up.offset, ctx->nr_user_files); - table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT]; + i = array_index_nospec(up->offset, ctx->nr_user_files); + table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT]; index = i & IORING_FILE_TABLE_MASK; if (table->files[index]) { - io_sqe_file_unregister(ctx, i); + file = io_file_from_index(ctx, index); table->files[index] = NULL; + if (io_queue_file_removal(data, file)) + ref_switch = true; } if (fd != -1) { - struct file *file; - file = fget(fd); if (!file) { err = -EBADF; @@ -4516,11 +5764,32 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, } nr_args--; done++; - up.offset++; + up->offset++; + } + + if (ref_switch && !test_and_set_bit(FFD_F_ATOMIC, &data->state)) { + percpu_ref_put(&data->refs); + percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch); } return done ? done : err; } +static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args) +{ + struct io_uring_files_update up; + + if (!ctx->file_data) + return -ENXIO; + if (!nr_args) + return -EINVAL; + if (copy_from_user(&up, arg, sizeof(up))) + return -EFAULT; + if (up.resv) + return -EINVAL; + + return __io_sqe_files_update(ctx, &up, nr_args); +} static void io_put_work(struct io_wq_work *work) { @@ -4536,11 +5805,56 @@ static void io_get_work(struct io_wq_work *work) refcount_inc(&req->refs); } +static int io_init_wq_offload(struct io_ring_ctx *ctx, + struct io_uring_params *p) +{ + struct io_wq_data data; + struct fd f; + struct io_ring_ctx *ctx_attach; + unsigned int concurrency; + int ret = 0; + + data.user = ctx->user; + data.get_work = io_get_work; + data.put_work = io_put_work; + + if (!(p->flags & IORING_SETUP_ATTACH_WQ)) { + /* Do QD, or 4 * CPUS, whatever is smallest */ + concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); + + ctx->io_wq = io_wq_create(concurrency, &data); + if (IS_ERR(ctx->io_wq)) { + ret = PTR_ERR(ctx->io_wq); + ctx->io_wq = NULL; + } + return ret; + } + + f = fdget(p->wq_fd); + if (!f.file) + return -EBADF; + + if (f.file->f_op != &io_uring_fops) { + ret = -EINVAL; + goto out_fput; + } + + ctx_attach = f.file->private_data; + /* @io_wq is protected by holding the fd */ + if (!io_wq_get(ctx_attach->io_wq, &data)) { + ret = -EINVAL; + goto out_fput; + } + + ctx->io_wq = ctx_attach->io_wq; +out_fput: + fdput(f); + return ret; +} + static int io_sq_offload_start(struct io_ring_ctx *ctx, struct io_uring_params *p) { - struct io_wq_data data; - unsigned concurrency; int ret; init_waitqueue_head(&ctx->sqo_wait); @@ -4584,20 +5898,9 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, goto err; } - data.mm = ctx->sqo_mm; - data.user = ctx->user; - data.creds = ctx->creds; - data.get_work = io_get_work; - data.put_work = io_put_work; - - /* Do QD, or 4 * CPUS, whatever is smallest */ - concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); - ctx->io_wq = io_wq_create(concurrency, &data); - if (IS_ERR(ctx->io_wq)) { - ret = PTR_ERR(ctx->io_wq); - ctx->io_wq = NULL; + ret = io_init_wq_offload(ctx, p); + if (ret) goto err; - } return 0; err: @@ -4702,7 +6005,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) struct io_mapped_ubuf *imu = &ctx->user_bufs[i]; for (j = 0; j < imu->nr_bvecs; j++) - put_user_page(imu->bvec[j].bv_page); + unpin_user_page(imu->bvec[j].bv_page); if (ctx->account_mem) io_unaccount_mem(ctx->user, imu->nr_bvecs); @@ -4823,7 +6126,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, ret = 0; down_read(¤t->mm->mmap_sem); - pret = get_user_pages(ubuf, nr_pages, + pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages, vmas); if (pret == nr_pages) { @@ -4847,7 +6150,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, * release any pages we did get */ if (pret > 0) - put_user_pages(pages, pret); + unpin_user_pages(pages, pret); if (ctx->account_mem) io_unaccount_mem(ctx->user, nr_pages); kvfree(imu->bvec); @@ -4975,6 +6278,17 @@ static int io_uring_fasync(int fd, struct file *file, int on) return fasync_helper(fd, file, on, &ctx->cq_fasync); } +static int io_remove_personalities(int id, void *p, void *data) +{ + struct io_ring_ctx *ctx = data; + const struct cred *cred; + + cred = idr_remove(&ctx->personality_idr, id); + if (cred) + put_cred(cred); + return 0; +} + static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { mutex_lock(&ctx->uring_lock); @@ -4991,6 +6305,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) /* if we failed setting up the ctx, we might not have any rings */ if (ctx->rings) io_cqring_overflow_flush(ctx, true); + idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); wait_for_completion(&ctx->completions[0]); io_ring_ctx_free(ctx); } @@ -5157,7 +6472,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, } else if (to_submit) { struct mm_struct *cur_mm; - to_submit = min(to_submit, ctx->sq_entries); mutex_lock(&ctx->uring_lock); /* already have mm, so io_submit_sqes() won't try to grab it */ cur_mm = ctx->sqo_mm; @@ -5273,7 +6587,6 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx) #if defined(CONFIG_UNIX) ctx->ring_sock->file = file; - ctx->ring_sock->sk->sk_user_data = ctx; #endif fd_install(ret, file); return ret; @@ -5292,8 +6605,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) bool account_mem; int ret; - if (!entries || entries > IORING_MAX_ENTRIES) + if (!entries) return -EINVAL; + if (entries > IORING_MAX_ENTRIES) { + if (!(p->flags & IORING_SETUP_CLAMP)) + return -EINVAL; + entries = IORING_MAX_ENTRIES; + } /* * Use twice as many entries for the CQ ring. It's possible for the @@ -5310,8 +6628,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) * to a power-of-two, if it isn't already. We do NOT impose * any cq vs sq ring sizing. */ - if (p->cq_entries < p->sq_entries || p->cq_entries > IORING_MAX_CQ_ENTRIES) + if (p->cq_entries < p->sq_entries) return -EINVAL; + if (p->cq_entries > IORING_MAX_CQ_ENTRIES) { + if (!(p->flags & IORING_SETUP_CLAMP)) + return -EINVAL; + p->cq_entries = IORING_MAX_CQ_ENTRIES; + } p->cq_entries = roundup_pow_of_two(p->cq_entries); } else { p->cq_entries = 2 * p->sq_entries; @@ -5376,7 +6699,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) goto err; p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | - IORING_FEAT_SUBMIT_STABLE; + IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | + IORING_FEAT_CUR_PERSONALITY; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: @@ -5403,7 +6727,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) } if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | - IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE)) + IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | + IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ)) return -EINVAL; ret = io_uring_create(entries, &p); @@ -5422,6 +6747,84 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries, return io_uring_setup(entries, params); } +static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) +{ + struct io_uring_probe *p; + size_t size; + int i, ret; + + size = struct_size(p, ops, nr_args); + if (size == SIZE_MAX) + return -EOVERFLOW; + p = kzalloc(size, GFP_KERNEL); + if (!p) + return -ENOMEM; + + ret = -EFAULT; + if (copy_from_user(p, arg, size)) + goto out; + ret = -EINVAL; + if (memchr_inv(p, 0, size)) + goto out; + + p->last_op = IORING_OP_LAST - 1; + if (nr_args > IORING_OP_LAST) + nr_args = IORING_OP_LAST; + + for (i = 0; i < nr_args; i++) { + p->ops[i].op = i; + if (!io_op_defs[i].not_supported) + p->ops[i].flags = IO_URING_OP_SUPPORTED; + } + p->ops_len = i; + + ret = 0; + if (copy_to_user(arg, p, size)) + ret = -EFAULT; +out: + kfree(p); + return ret; +} + +static int io_register_personality(struct io_ring_ctx *ctx) +{ + const struct cred *creds = get_current_cred(); + int id; + + id = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1, + USHRT_MAX, GFP_KERNEL); + if (id < 0) + put_cred(creds); + return id; +} + +static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) +{ + const struct cred *old_creds; + + old_creds = idr_remove(&ctx->personality_idr, id); + if (old_creds) { + put_cred(old_creds); + return 0; + } + + return -EINVAL; +} + +static bool io_register_op_must_quiesce(int op) +{ + switch (op) { + case IORING_UNREGISTER_FILES: + case IORING_REGISTER_FILES_UPDATE: + case IORING_REGISTER_PROBE: + case IORING_REGISTER_PERSONALITY: + case IORING_UNREGISTER_PERSONALITY: + return false; + default: + return true; + } +} + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -5437,18 +6840,26 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (percpu_ref_is_dying(&ctx->refs)) return -ENXIO; - percpu_ref_kill(&ctx->refs); + if (io_register_op_must_quiesce(opcode)) { + percpu_ref_kill(&ctx->refs); - /* - * Drop uring mutex before waiting for references to exit. If another - * thread is currently inside io_uring_enter() it might need to grab - * the uring_lock to make progress. If we hold it here across the drain - * wait, then we can deadlock. It's safe to drop the mutex here, since - * no new references will come in after we've killed the percpu ref. - */ - mutex_unlock(&ctx->uring_lock); - wait_for_completion(&ctx->completions[0]); - mutex_lock(&ctx->uring_lock); + /* + * Drop uring mutex before waiting for references to exit. If + * another thread is currently inside io_uring_enter() it might + * need to grab the uring_lock to make progress. If we hold it + * here across the drain wait, then we can deadlock. It's safe + * to drop the mutex here, since no new references will come in + * after we've killed the percpu ref. + */ + mutex_unlock(&ctx->uring_lock); + ret = wait_for_completion_interruptible(&ctx->completions[0]); + mutex_lock(&ctx->uring_lock); + if (ret) { + percpu_ref_resurrect(&ctx->refs); + ret = -EINTR; + goto out; + } + } switch (opcode) { case IORING_REGISTER_BUFFERS: @@ -5473,10 +6884,17 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ret = io_sqe_files_update(ctx, arg, nr_args); break; case IORING_REGISTER_EVENTFD: + case IORING_REGISTER_EVENTFD_ASYNC: ret = -EINVAL; if (nr_args != 1) break; ret = io_eventfd_register(ctx, arg); + if (ret) + break; + if (opcode == IORING_REGISTER_EVENTFD_ASYNC) + ctx->eventfd_async = 1; + else + ctx->eventfd_async = 0; break; case IORING_UNREGISTER_EVENTFD: ret = -EINVAL; @@ -5484,14 +6902,35 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_eventfd_unregister(ctx); break; + case IORING_REGISTER_PROBE: + ret = -EINVAL; + if (!arg || nr_args > 256) + break; + ret = io_probe(ctx, arg, nr_args); + break; + case IORING_REGISTER_PERSONALITY: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_register_personality(ctx); + break; + case IORING_UNREGISTER_PERSONALITY: + ret = -EINVAL; + if (arg) + break; + ret = io_unregister_personality(ctx, nr_args); + break; default: ret = -EINVAL; break; } - /* bring the ctx back to life */ - reinit_completion(&ctx->completions[0]); - percpu_ref_reinit(&ctx->refs); + if (io_register_op_must_quiesce(opcode)) { + /* bring the ctx back to life */ + percpu_ref_reinit(&ctx->refs); +out: + reinit_completion(&ctx->completions[0]); + } return ret; } @@ -5524,6 +6963,7 @@ out_fput: static int __init io_uring_init(void) { + BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC); return 0; }; diff --git a/fs/ioctl.c b/fs/ioctl.c index 2f5e4e5b97e1..7c9a5df5a597 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -467,7 +467,7 @@ EXPORT_SYMBOL(generic_block_fiemap); * Only the l_start, l_len and l_whence fields of the 'struct space_resv' * are used here, rest are ignored. */ -int ioctl_preallocate(struct file *filp, int mode, void __user *argp) +static int ioctl_preallocate(struct file *filp, int mode, void __user *argp) { struct inode *inode = file_inode(filp); struct space_resv sr; @@ -495,8 +495,8 @@ int ioctl_preallocate(struct file *filp, int mode, void __user *argp) /* on ia32 l_start is on a 32-bit boundary */ #if defined CONFIG_COMPAT && defined(CONFIG_X86_64) /* just account for different alignment */ -int compat_ioctl_preallocate(struct file *file, int mode, - struct space_resv_32 __user *argp) +static int compat_ioctl_preallocate(struct file *file, int mode, + struct space_resv_32 __user *argp) { struct inode *inode = file_inode(file); struct space_resv_32 sr; @@ -521,11 +521,9 @@ int compat_ioctl_preallocate(struct file *file, int mode, } #endif -static int file_ioctl(struct file *filp, unsigned int cmd, - unsigned long arg) +static int file_ioctl(struct file *filp, unsigned int cmd, int __user *p) { struct inode *inode = file_inode(filp); - int __user *p = (int __user *)arg; switch (cmd) { case FIBMAP: @@ -542,7 +540,7 @@ static int file_ioctl(struct file *filp, unsigned int cmd, return ioctl_preallocate(filp, FALLOC_FL_ZERO_RANGE, p); } - return vfs_ioctl(filp, cmd, arg); + return -ENOIOCTLCMD; } static int ioctl_fionbio(struct file *filp, int __user *argp) @@ -661,53 +659,48 @@ out: } /* - * When you add any new common ioctls to the switches above and below - * please update compat_sys_ioctl() too. - * * do_vfs_ioctl() is not for drivers and not intended to be EXPORT_SYMBOL()'d. * It's just a simple helper for sys_ioctl and compat_sys_ioctl. + * + * When you add any new common ioctls to the switches above and below, + * please ensure they have compatible arguments in compat mode. */ -int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, - unsigned long arg) +static int do_vfs_ioctl(struct file *filp, unsigned int fd, + unsigned int cmd, unsigned long arg) { - int error = 0; void __user *argp = (void __user *)arg; struct inode *inode = file_inode(filp); switch (cmd) { case FIOCLEX: set_close_on_exec(fd, 1); - break; + return 0; case FIONCLEX: set_close_on_exec(fd, 0); - break; + return 0; case FIONBIO: - error = ioctl_fionbio(filp, argp); - break; + return ioctl_fionbio(filp, argp); case FIOASYNC: - error = ioctl_fioasync(fd, filp, argp); - break; + return ioctl_fioasync(fd, filp, argp); case FIOQSIZE: if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) { loff_t res = inode_get_bytes(inode); - error = copy_to_user(argp, &res, sizeof(res)) ? - -EFAULT : 0; - } else - error = -ENOTTY; - break; + return copy_to_user(argp, &res, sizeof(res)) ? + -EFAULT : 0; + } + + return -ENOTTY; case FIFREEZE: - error = ioctl_fsfreeze(filp); - break; + return ioctl_fsfreeze(filp); case FITHAW: - error = ioctl_fsthaw(filp); - break; + return ioctl_fsthaw(filp); case FS_IOC_FIEMAP: return ioctl_fiemap(filp, argp); @@ -716,6 +709,7 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, /* anon_bdev filesystems may not have a block size */ if (!inode->i_sb->s_blocksize) return -EINVAL; + return put_user(inode->i_sb->s_blocksize, (int __user *)argp); case FICLONE: @@ -729,24 +723,30 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, default: if (S_ISREG(inode->i_mode)) - error = file_ioctl(filp, cmd, arg); - else - error = vfs_ioctl(filp, cmd, arg); + return file_ioctl(filp, cmd, argp); break; } - return error; + + return -ENOIOCTLCMD; } int ksys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) { - int error; struct fd f = fdget(fd); + int error; if (!f.file) return -EBADF; + error = security_file_ioctl(f.file, cmd, arg); - if (!error) - error = do_vfs_ioctl(f.file, fd, cmd, arg); + if (error) + goto out; + + error = do_vfs_ioctl(f.file, fd, cmd, arg); + if (error == -ENOIOCTLCMD) + error = vfs_ioctl(f.file, cmd, arg); + +out: fdput(f); return error; } @@ -788,4 +788,65 @@ long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return file->f_op->unlocked_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); } EXPORT_SYMBOL(compat_ptr_ioctl); + +COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, + compat_ulong_t, arg) +{ + struct fd f = fdget(fd); + int error; + + if (!f.file) + return -EBADF; + + /* RED-PEN how should LSM module know it's handling 32bit? */ + error = security_file_ioctl(f.file, cmd, arg); + if (error) + goto out; + + switch (cmd) { + /* FICLONE takes an int argument, so don't use compat_ptr() */ + case FICLONE: + error = ioctl_file_clone(f.file, arg, 0, 0, 0); + break; + +#if defined(CONFIG_X86_64) + /* these get messy on amd64 due to alignment differences */ + case FS_IOC_RESVSP_32: + case FS_IOC_RESVSP64_32: + error = compat_ioctl_preallocate(f.file, 0, compat_ptr(arg)); + break; + case FS_IOC_UNRESVSP_32: + case FS_IOC_UNRESVSP64_32: + error = compat_ioctl_preallocate(f.file, FALLOC_FL_PUNCH_HOLE, + compat_ptr(arg)); + break; + case FS_IOC_ZERO_RANGE_32: + error = compat_ioctl_preallocate(f.file, FALLOC_FL_ZERO_RANGE, + compat_ptr(arg)); + break; +#endif + + /* + * everything else in do_vfs_ioctl() takes either a compatible + * pointer argument or no argument -- call it with a modified + * argument. + */ + default: + error = do_vfs_ioctl(f.file, fd, cmd, + (unsigned long)compat_ptr(arg)); + if (error != -ENOIOCTLCMD) + break; + + if (f.file->f_op->compat_ioctl) + error = f.file->f_op->compat_ioctl(f.file, cmd, arg); + if (error == -ENOIOCTLCMD) + error = -ENOTTY; + break; + } + + out: + fdput(f); + + return error; +} #endif diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 828444e14d09..7c84c4c027c4 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1077,24 +1077,16 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); unsigned long length; - loff_t offset, size; + loff_t offset; ssize_t ret; lock_page(page); - size = i_size_read(inode); - offset = page_offset(page); - if (page->mapping != inode->i_mapping || offset > size) { - /* We overload EFAULT to mean page got truncated */ - ret = -EFAULT; + ret = page_mkwrite_check_truncate(page, inode); + if (ret < 0) goto out_unlock; - } - - /* page is wholly or partially inside EOF */ - if (offset > size - PAGE_SIZE) - length = offset_in_page(size); - else - length = PAGE_SIZE; + length = ret; + offset = page_offset(page); while (length > 0) { ret = iomap_apply(inode, offset, length, IOMAP_WRITE | IOMAP_FAULT, ops, page, diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 8fff6677a5da..96bf33986d03 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -164,7 +164,7 @@ void __jbd2_log_wait_for_space(journal_t *journal) "journal space in %s\n", __func__, journal->j_devname); WARN_ON(1); - jbd2_journal_abort(journal, 0); + jbd2_journal_abort(journal, -EIO); } write_lock(&journal->j_state_lock); } else { diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 7f0b362b3842..2494095e0340 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -782,7 +782,7 @@ start_journal_io: err = journal_submit_commit_record(journal, commit_transaction, &cbh, crc32_sum); if (err) - __jbd2_journal_abort_hard(journal); + jbd2_journal_abort(journal, err); } blk_finish_plug(&plug); @@ -875,7 +875,7 @@ start_journal_io: err = journal_submit_commit_record(journal, commit_transaction, &cbh, crc32_sum); if (err) - __jbd2_journal_abort_hard(journal); + jbd2_journal_abort(journal, err); } if (cbh) err = journal_wait_on_commit_record(journal, cbh); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 5e408ee24a1a..eb8ca446d1ab 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -96,7 +96,6 @@ EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); EXPORT_SYMBOL(jbd2_inode_cache); -static void __journal_abort_soft (journal_t *journal, int errno); static int jbd2_journal_create_slab(size_t slab_size); #ifdef CONFIG_JBD2_DEBUG @@ -805,7 +804,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, "at offset %lu on %s\n", __func__, blocknr, journal->j_devname); err = -EIO; - __journal_abort_soft(journal, err); + jbd2_journal_abort(journal, err); } } else { *retp = blocknr; /* +journal->j_blk_offset */ @@ -982,6 +981,7 @@ static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos) static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos) { + (*pos)++; return NULL; } @@ -1074,12 +1074,11 @@ static int jbd2_seq_info_release(struct inode *inode, struct file *file) return seq_release(inode, file); } -static const struct file_operations jbd2_seq_info_fops = { - .owner = THIS_MODULE, - .open = jbd2_seq_info_open, - .read = seq_read, - .llseek = seq_lseek, - .release = jbd2_seq_info_release, +static const struct proc_ops jbd2_info_proc_ops = { + .proc_open = jbd2_seq_info_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = jbd2_seq_info_release, }; static struct proc_dir_entry *proc_jbd2_stats; @@ -1089,7 +1088,7 @@ static void jbd2_stats_proc_init(journal_t *journal) journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats); if (journal->j_proc_entry) { proc_create_data("info", S_IRUGO, journal->j_proc_entry, - &jbd2_seq_info_fops, journal); + &jbd2_info_proc_ops, journal); } } @@ -1710,6 +1709,11 @@ int jbd2_journal_load(journal_t *journal) journal->j_devname); return -EFSCORRUPTED; } + /* + * clear JBD2_ABORT flag initialized in journal_init_common + * here to update log tail information with the newest seq. + */ + journal->j_flags &= ~JBD2_ABORT; /* OK, we've finished with the dynamic journal bits: * reinitialise the dynamic contents of the superblock in memory @@ -1717,7 +1721,6 @@ int jbd2_journal_load(journal_t *journal) if (journal_reset(journal)) goto recovery_error; - journal->j_flags &= ~JBD2_ABORT; journal->j_flags |= JBD2_LOADED; return 0; @@ -2098,67 +2101,6 @@ int jbd2_journal_wipe(journal_t *journal, int write) return err; } -/* - * Journal abort has very specific semantics, which we describe - * for journal abort. - * - * Two internal functions, which provide abort to the jbd layer - * itself are here. - */ - -/* - * Quick version for internal journal use (doesn't lock the journal). - * Aborts hard --- we mark the abort as occurred, but do _nothing_ else, - * and don't attempt to make any other journal updates. - */ -void __jbd2_journal_abort_hard(journal_t *journal) -{ - transaction_t *transaction; - - if (journal->j_flags & JBD2_ABORT) - return; - - printk(KERN_ERR "Aborting journal on device %s.\n", - journal->j_devname); - - write_lock(&journal->j_state_lock); - journal->j_flags |= JBD2_ABORT; - transaction = journal->j_running_transaction; - if (transaction) - __jbd2_log_start_commit(journal, transaction->t_tid); - write_unlock(&journal->j_state_lock); -} - -/* Soft abort: record the abort error status in the journal superblock, - * but don't do any other IO. */ -static void __journal_abort_soft (journal_t *journal, int errno) -{ - int old_errno; - - write_lock(&journal->j_state_lock); - old_errno = journal->j_errno; - if (!journal->j_errno || errno == -ESHUTDOWN) - journal->j_errno = errno; - - if (journal->j_flags & JBD2_ABORT) { - write_unlock(&journal->j_state_lock); - if (!old_errno && old_errno != -ESHUTDOWN && - errno == -ESHUTDOWN) - jbd2_journal_update_sb_errno(journal); - return; - } - write_unlock(&journal->j_state_lock); - - __jbd2_journal_abort_hard(journal); - - if (errno) { - jbd2_journal_update_sb_errno(journal); - write_lock(&journal->j_state_lock); - journal->j_flags |= JBD2_REC_ERR; - write_unlock(&journal->j_state_lock); - } -} - /** * void jbd2_journal_abort () - Shutdown the journal immediately. * @journal: the journal to shutdown. @@ -2198,16 +2140,51 @@ static void __journal_abort_soft (journal_t *journal, int errno) * failure to disk. ext3_error, for example, now uses this * functionality. * - * Errors which originate from within the journaling layer will NOT - * supply an errno; a null errno implies that absolutely no further - * writes are done to the journal (unless there are any already in - * progress). - * */ void jbd2_journal_abort(journal_t *journal, int errno) { - __journal_abort_soft(journal, errno); + transaction_t *transaction; + + /* + * ESHUTDOWN always takes precedence because a file system check + * caused by any other journal abort error is not required after + * a shutdown triggered. + */ + write_lock(&journal->j_state_lock); + if (journal->j_flags & JBD2_ABORT) { + int old_errno = journal->j_errno; + + write_unlock(&journal->j_state_lock); + if (old_errno != -ESHUTDOWN && errno == -ESHUTDOWN) { + journal->j_errno = errno; + jbd2_journal_update_sb_errno(journal); + } + return; + } + + /* + * Mark the abort as occurred and start current running transaction + * to release all journaled buffer. + */ + pr_err("Aborting journal on device %s.\n", journal->j_devname); + + journal->j_flags |= JBD2_ABORT; + journal->j_errno = errno; + transaction = journal->j_running_transaction; + if (transaction) + __jbd2_log_start_commit(journal, transaction->t_tid); + write_unlock(&journal->j_state_lock); + + /* + * Record errno to the journal super block, so that fsck and jbd2 + * layer could realise that a filesystem check is needed. + */ + jbd2_journal_update_sb_errno(journal); + + write_lock(&journal->j_state_lock); + journal->j_flags |= JBD2_REC_ERR; + write_unlock(&journal->j_state_lock); } /** @@ -2556,7 +2533,6 @@ static void __journal_remove_journal_head(struct buffer_head *bh) { struct journal_head *jh = bh2jh(bh); - J_ASSERT_JH(jh, jh->b_jcount >= 0); J_ASSERT_JH(jh, jh->b_transaction == NULL); J_ASSERT_JH(jh, jh->b_next_transaction == NULL); J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 27b9f9dee434..e77a5a0b4e46 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -525,7 +525,7 @@ EXPORT_SYMBOL(jbd2__journal_start); * modified buffers in the log. We block until the log can guarantee * that much space. Additionally, if rsv_blocks > 0, we also create another * handle with rsv_blocks reserved blocks in the journal. This handle is - * is stored in h_rsv_handle. It is not attached to any particular transaction + * stored in h_rsv_handle. It is not attached to any particular transaction * and thus doesn't block transaction commit. If the caller uses this reserved * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop() * on the parent handle will dispose the reserved one. Reserved handle has to @@ -1595,7 +1595,7 @@ out: * Allow this call even if the handle has aborted --- it may be part of * the caller's cleanup after an abort. */ -int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) +int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; journal_t *journal; diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c index 888cdd685a1e..44b62b3c322e 100644 --- a/fs/jfs/jfs_debug.c +++ b/fs/jfs/jfs_debug.c @@ -43,12 +43,12 @@ static ssize_t jfs_loglevel_proc_write(struct file *file, return count; } -static const struct file_operations jfs_loglevel_proc_fops = { - .open = jfs_loglevel_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = jfs_loglevel_proc_write, +static const struct proc_ops jfs_loglevel_proc_ops = { + .proc_open = jfs_loglevel_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = jfs_loglevel_proc_write, }; #endif @@ -68,7 +68,7 @@ void jfs_proc_init(void) #endif #ifdef CONFIG_JFS_DEBUG proc_create_single("TxAnchor", 0, base, jfs_txanchor_proc_show); - proc_create("loglevel", 0, base, &jfs_loglevel_proc_fops); + proc_create("loglevel", 0, base, &jfs_loglevel_proc_ops); #endif } diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index caade185e568..7dfcab2a2da6 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -4027,7 +4027,6 @@ static int dbGetL2AGSize(s64 nblocks) */ #define MAXL0PAGES (1 + LPERCTL) #define MAXL1PAGES (1 + LPERCTL * MAXL0PAGES) -#define MAXL2PAGES (1 + LPERCTL * MAXL1PAGES) /* * convert number of map pages to the zero origin top dmapctl level diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index eac277c63d42..d0f7a5abd9a9 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -160,9 +160,9 @@ static inline void set_inode_attr(struct inode *inode, { inode->i_uid = attrs->ia_uid; inode->i_gid = attrs->ia_gid; - inode->i_atime = timestamp_truncate(attrs->ia_atime, inode); - inode->i_mtime = timestamp_truncate(attrs->ia_mtime, inode); - inode->i_ctime = timestamp_truncate(attrs->ia_ctime, inode); + inode->i_atime = attrs->ia_atime; + inode->i_mtime = attrs->ia_mtime; + inode->i_ctime = attrs->ia_ctime; } static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) diff --git a/fs/libfs.c b/fs/libfs.c index 1463b038ffc4..c686bd9caac6 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -19,6 +19,7 @@ #include <linux/buffer_head.h> /* sync_mapping_buffers */ #include <linux/fs_context.h> #include <linux/pseudo_fs.h> +#include <linux/fsnotify.h> #include <linux/uaccess.h> @@ -239,6 +240,75 @@ const struct inode_operations simple_dir_inode_operations = { }; EXPORT_SYMBOL(simple_dir_inode_operations); +static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev) +{ + struct dentry *child = NULL; + struct list_head *p = prev ? &prev->d_child : &parent->d_subdirs; + + spin_lock(&parent->d_lock); + while ((p = p->next) != &parent->d_subdirs) { + struct dentry *d = container_of(p, struct dentry, d_child); + if (simple_positive(d)) { + spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); + if (simple_positive(d)) + child = dget_dlock(d); + spin_unlock(&d->d_lock); + if (likely(child)) + break; + } + } + spin_unlock(&parent->d_lock); + dput(prev); + return child; +} + +void simple_recursive_removal(struct dentry *dentry, + void (*callback)(struct dentry *)) +{ + struct dentry *this = dget(dentry); + while (true) { + struct dentry *victim = NULL, *child; + struct inode *inode = this->d_inode; + + inode_lock(inode); + if (d_is_dir(this)) + inode->i_flags |= S_DEAD; + while ((child = find_next_child(this, victim)) == NULL) { + // kill and ascend + // update metadata while it's still locked + inode->i_ctime = current_time(inode); + clear_nlink(inode); + inode_unlock(inode); + victim = this; + this = this->d_parent; + inode = this->d_inode; + inode_lock(inode); + if (simple_positive(victim)) { + d_invalidate(victim); // avoid lost mounts + if (d_is_dir(victim)) + fsnotify_rmdir(inode, victim); + else + fsnotify_unlink(inode, victim); + if (callback) + callback(victim); + dput(victim); // unpin it + } + if (victim == dentry) { + inode->i_ctime = inode->i_mtime = + current_time(inode); + if (d_is_dir(dentry)) + drop_nlink(inode); + inode_unlock(inode); + dput(dentry); + return; + } + } + inode_unlock(inode); + this = child; + } +} +EXPORT_SYMBOL(simple_recursive_removal); + static const struct super_operations simple_super_operations = { .statfs = simple_statfs, }; diff --git a/fs/lockd/procfs.c b/fs/lockd/procfs.c index ca9228a56d65..a01f08c8c2f3 100644 --- a/fs/lockd/procfs.c +++ b/fs/lockd/procfs.c @@ -60,11 +60,11 @@ nlm_end_grace_read(struct file *file, char __user *buf, size_t size, return simple_read_from_buffer(buf, size, pos, resp, sizeof(resp)); } -static const struct file_operations lockd_end_grace_operations = { - .write = nlm_end_grace_write, - .read = nlm_end_grace_read, - .llseek = default_llseek, - .release = simple_transaction_release, +static const struct proc_ops lockd_end_grace_proc_ops = { + .proc_write = nlm_end_grace_write, + .proc_read = nlm_end_grace_read, + .proc_lseek = default_llseek, + .proc_release = simple_transaction_release, }; int __init @@ -76,7 +76,7 @@ lockd_create_procfs(void) if (!entry) return -ENOMEM; entry = proc_create("nlm_end_grace", S_IRUGO|S_IWUSR, entry, - &lockd_end_grace_operations); + &lockd_end_grace_proc_ops); if (!entry) { remove_proc_entry("fs/lockd", NULL); return -ENOMEM; diff --git a/fs/namei.c b/fs/namei.c index 4167109297e0..db6565c99825 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3333,8 +3333,8 @@ static int do_last(struct nameidata *nd, struct file *file, const struct open_flags *op) { struct dentry *dir = nd->path.dentry; - kuid_t dir_uid = dir->d_inode->i_uid; - umode_t dir_mode = dir->d_inode->i_mode; + kuid_t dir_uid = nd->inode->i_uid; + umode_t dir_mode = nd->inode->i_mode; int open_flag = op->open_flag; bool will_truncate = (open_flag & O_TRUNC) != 0; bool got_write = false; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 11b42c523f04..7eb919f1b13f 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -157,11 +157,11 @@ static int exports_proc_open(struct inode *inode, struct file *file) return exports_net_open(current->nsproxy->net_ns, file); } -static const struct file_operations exports_proc_operations = { - .open = exports_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, +static const struct proc_ops exports_proc_ops = { + .proc_open = exports_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, }; static int exports_nfsd_open(struct inode *inode, struct file *file) @@ -1431,8 +1431,7 @@ static int create_proc_exports_entry(void) entry = proc_mkdir("fs/nfs", NULL); if (!entry) return -ENOMEM; - entry = proc_create("exports", 0, entry, - &exports_proc_operations); + entry = proc_create("exports", 0, entry, &exports_proc_ops); if (!entry) { remove_proc_entry("fs/nfs", NULL); return -ENOMEM; diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 9bce3b913189..b1bc582b0493 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -84,17 +84,17 @@ static int nfsd_proc_open(struct inode *inode, struct file *file) return single_open(file, nfsd_proc_show, NULL); } -static const struct file_operations nfsd_proc_fops = { - .open = nfsd_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, +static const struct proc_ops nfsd_proc_ops = { + .proc_open = nfsd_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, }; void nfsd_stat_init(void) { - svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_fops); + svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_ops); } void diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 6c7388430ad3..d4359a1df3d5 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -2899,18 +2899,12 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr) ia_valid |= ATTR_MTIME | ATTR_CTIME; } } - if (ia_valid & ATTR_ATIME) { - vi->i_atime = timestamp_truncate(attr->ia_atime, - vi); - } - if (ia_valid & ATTR_MTIME) { - vi->i_mtime = timestamp_truncate(attr->ia_mtime, - vi); - } - if (ia_valid & ATTR_CTIME) { - vi->i_ctime = timestamp_truncate(attr->ia_ctime, - vi); - } + if (ia_valid & ATTR_ATIME) + vi->i_atime = attr->ia_atime; + if (ia_valid & ATTR_MTIME) + vi->i_mtime = attr->ia_mtime; + if (ia_valid & ATTR_CTIME) + vi->i_ctime = attr->ia_ctime; mark_inode_dirty(vi); out: return err; diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 5c424a099280..1ef24574f481 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -73,7 +73,7 @@ static void o2quo_fence_self(void) "system by restarting ***\n"); emergency_restart(); break; - }; + } } /* Indicate that a timeout occurred on a heartbeat region write. The diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile index 38b224372776..5e700b45d32d 100644 --- a/fs/ocfs2/dlm/Makefile +++ b/fs/ocfs2/dlm/Makefile @@ -1,6 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only -ccflags-y := -I $(srctree)/$(src)/.. - obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index 4de89af96abf..6abaded3ff6b 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -23,15 +23,15 @@ #include <linux/spinlock.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" #define MLOG_MASK_PREFIX ML_DLM -#include "cluster/masklog.h" +#include "../cluster/masklog.h" static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, struct dlm_lock *lock); diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index aaf24548b02a..0463dce65bb2 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -688,10 +688,6 @@ struct dlm_begin_reco __be32 pad2; }; - -#define BITS_PER_BYTE 8 -#define BITS_TO_BYTES(bits) (((bits)+BITS_PER_BYTE-1)/BITS_PER_BYTE) - struct dlm_query_join_request { u8 node_idx; diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index 965f45dbe17b..6051edc33aef 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -23,9 +23,9 @@ #include <linux/spinlock.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" @@ -33,7 +33,7 @@ #include "dlmconvert.h" #define MLOG_MASK_PREFIX ML_DLM -#include "cluster/masklog.h" +#include "../cluster/masklog.h" /* NOTE: __dlmconvert_master is the only function in here that * needs a spinlock held on entry (res->spinlock) and it is the diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 4d0b452012b2..c5c6efba7b5e 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -17,9 +17,9 @@ #include <linux/debugfs.h> #include <linux/export.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" @@ -27,7 +27,7 @@ #include "dlmdebug.h" #define MLOG_MASK_PREFIX ML_DLM -#include "cluster/masklog.h" +#include "../cluster/masklog.h" static int stringify_lockname(const char *lockname, int locklen, char *buf, int len); diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index ee6f459f9770..357cfc702ce3 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -20,9 +20,9 @@ #include <linux/debugfs.h> #include <linux/sched/signal.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" @@ -30,7 +30,7 @@ #include "dlmdebug.h" #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) -#include "cluster/masklog.h" +#include "../cluster/masklog.h" /* * ocfs2 node maps are array of long int, which limits to send them freely diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index baff087f3863..83f0760e4fba 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -25,9 +25,9 @@ #include <linux/delay.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" @@ -35,7 +35,7 @@ #include "dlmconvert.h" #define MLOG_MASK_PREFIX ML_DLM -#include "cluster/masklog.h" +#include "../cluster/masklog.h" static struct kmem_cache *dlm_lock_cache; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 74b768ca1cd8..900f7e466d11 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -25,9 +25,9 @@ #include <linux/delay.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" @@ -35,7 +35,7 @@ #include "dlmdebug.h" #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) -#include "cluster/masklog.h" +#include "../cluster/masklog.h" static void dlm_mle_node_down(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle, @@ -2554,8 +2554,6 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, if (!dlm_grab(dlm)) return -EINVAL; - BUG_ON(target == O2NM_MAX_NODES); - name = res->lockname.name; namelen = res->lockname.len; diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 064ce5bbc3f6..4b566e88582f 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -26,16 +26,16 @@ #include <linux/delay.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" #include "dlmdomain.h" #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_RECOVERY) -#include "cluster/masklog.h" +#include "../cluster/masklog.h" static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node); @@ -1668,7 +1668,7 @@ static int dlm_lockres_master_requery(struct dlm_ctxt *dlm, int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, u8 nodenum, u8 *real_master) { - int ret = -EINVAL; + int ret; struct dlm_master_requery req; int status = DLM_LOCK_RES_OWNER_UNKNOWN; diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 61c51c268460..fd40c17cd022 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -25,16 +25,16 @@ #include <linux/delay.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" #include "dlmdomain.h" #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_THREAD) -#include "cluster/masklog.h" +#include "../cluster/masklog.h" static int dlm_thread(void *data); static void dlm_flush_asts(struct dlm_ctxt *dlm); diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 3883633e82eb..dcb17ca8ae74 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -23,15 +23,15 @@ #include <linux/spinlock.h> #include <linux/delay.h> -#include "cluster/heartbeat.h" -#include "cluster/nodemanager.h" -#include "cluster/tcp.h" +#include "../cluster/heartbeat.h" +#include "../cluster/nodemanager.h" +#include "../cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" #define MLOG_MASK_PREFIX ML_DLM -#include "cluster/masklog.h" +#include "../cluster/masklog.h" #define DLM_UNLOCK_FREE_LOCK 0x00000001 #define DLM_UNLOCK_CALL_AST 0x00000002 diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile index a9874e441bd4..c7895f65be0e 100644 --- a/fs/ocfs2/dlmfs/Makefile +++ b/fs/ocfs2/dlmfs/Makefile @@ -1,6 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only -ccflags-y := -I $(srctree)/$(src)/.. - obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o ocfs2_dlmfs-objs := userdlm.o dlmfs.o diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 4f1668c81e1f..8e4f1ace467c 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -33,11 +33,11 @@ #include <linux/uaccess.h> -#include "stackglue.h" +#include "../stackglue.h" #include "userdlm.h" #define MLOG_MASK_PREFIX ML_DLMFS -#include "cluster/masklog.h" +#include "../cluster/masklog.h" static const struct super_operations dlmfs_ops; diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c index 525b14ddfba5..3df5be25bfb1 100644 --- a/fs/ocfs2/dlmfs/userdlm.c +++ b/fs/ocfs2/dlmfs/userdlm.c @@ -21,12 +21,12 @@ #include <linux/types.h> #include <linux/crc32.h> -#include "ocfs2_lockingver.h" -#include "stackglue.h" +#include "../ocfs2_lockingver.h" +#include "../stackglue.h" #include "userdlm.h" #define MLOG_MASK_PREFIX ML_DLMFS -#include "cluster/masklog.h" +#include "../cluster/masklog.h" static inline struct user_lock_res *user_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index cda1027d0819..cb9e6a73bea9 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -570,7 +570,7 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, mlog_bug_on_msg(1, "type: %d\n", type); ops = NULL; /* thanks, gcc */ break; - }; + } ocfs2_build_lock_name(type, OCFS2_I(inode)->ip_blkno, generation, res->l_name); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 9876db52913a..6cd5e4924e4d 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2101,17 +2101,15 @@ static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos) static int ocfs2_inode_lock_for_extent_tree(struct inode *inode, struct buffer_head **di_bh, int meta_level, - int overwrite_io, int write_sem, int wait) { int ret = 0; if (wait) - ret = ocfs2_inode_lock(inode, NULL, meta_level); + ret = ocfs2_inode_lock(inode, di_bh, meta_level); else - ret = ocfs2_try_inode_lock(inode, - overwrite_io ? NULL : di_bh, meta_level); + ret = ocfs2_try_inode_lock(inode, di_bh, meta_level); if (ret < 0) goto out; @@ -2136,6 +2134,7 @@ static int ocfs2_inode_lock_for_extent_tree(struct inode *inode, out_unlock: brelse(*di_bh); + *di_bh = NULL; ocfs2_inode_unlock(inode, meta_level); out: return ret; @@ -2177,7 +2176,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file, ret = ocfs2_inode_lock_for_extent_tree(inode, &di_bh, meta_level, - overwrite_io, write_sem, wait); if (ret < 0) { @@ -2233,13 +2231,13 @@ static int ocfs2_prepare_inode_for_write(struct file *file, &di_bh, meta_level, write_sem); + meta_level = 1; + write_sem = 1; ret = ocfs2_inode_lock_for_extent_tree(inode, &di_bh, meta_level, - overwrite_io, - 1, + write_sem, wait); - write_sem = 1; if (ret < 0) { if (ret != -EAGAIN) mlog_errno(ret); diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 3103ba7f97a2..bfe611ed1b1d 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -597,9 +597,11 @@ static inline void ocfs2_update_inode_fsync_trans(handle_t *handle, { struct ocfs2_inode_info *oi = OCFS2_I(inode); - oi->i_sync_tid = handle->h_transaction->t_tid; - if (datasync) - oi->i_datasync_tid = handle->h_transaction->t_tid; + if (!is_handle_aborted(handle)) { + oi->i_sync_tid = handle->h_transaction->t_tid; + if (datasync) + oi->i_datasync_tid = handle->h_transaction->t_tid; + } } #endif /* OCFS2_JOURNAL_H */ diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 8ea51cf27b97..da65251ef815 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -586,8 +586,7 @@ static int __ocfs2_mknod_locked(struct inode *dir, mlog_errno(status); } - oi->i_sync_tid = handle->h_transaction->t_tid; - oi->i_datasync_tid = handle->h_transaction->t_tid; + ocfs2_update_inode_fsync_trans(handle, inode, 1); leave: if (status < 0) { diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 4180c3ef0a68..939df99d2dec 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -696,7 +696,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode, ac, cl); - if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC)) + if (PTR_ERR(bg_bh) == -ENOSPC) bg_bh = ocfs2_block_group_alloc_discontig(handle, alloc_inode, ac, cl); diff --git a/fs/open.c b/fs/open.c index 8cdb2b675867..0788b3715731 100644 --- a/fs/open.c +++ b/fs/open.c @@ -958,7 +958,7 @@ EXPORT_SYMBOL(open_with_fake_path); #define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE)) #define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC) -static inline struct open_how build_open_how(int flags, umode_t mode) +inline struct open_how build_open_how(int flags, umode_t mode) { struct open_how how = { .flags = flags & VALID_OPEN_FLAGS, @@ -974,8 +974,7 @@ static inline struct open_how build_open_how(int flags, umode_t mode) return how; } -static inline int build_open_flags(const struct open_how *how, - struct open_flags *op) +inline int build_open_flags(const struct open_how *how, struct open_flags *op) { int flags = how->flags; int lookup_flags = 0; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 6220642fe113..9fc47c2e078d 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -24,7 +24,7 @@ static int ovl_ccup_set(const char *buf, const struct kernel_param *param) { - pr_warn("overlayfs: \"check_copy_up\" module option is obsolete\n"); + pr_warn("\"check_copy_up\" module option is obsolete\n"); return 0; } @@ -123,6 +123,9 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) loff_t old_pos = 0; loff_t new_pos = 0; loff_t cloned; + loff_t data_pos = -1; + loff_t hole_len; + bool skip_hole = false; int error = 0; if (len == 0) @@ -144,7 +147,11 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) goto out; /* Couldn't clone, so now we try to copy the data */ - /* FIXME: copy up sparse files efficiently */ + /* Check if lower fs supports seek operation */ + if (old_file->f_mode & FMODE_LSEEK && + old_file->f_op->llseek) + skip_hole = true; + while (len) { size_t this_len = OVL_COPY_UP_CHUNK_SIZE; long bytes; @@ -157,6 +164,36 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) break; } + /* + * Fill zero for hole will cost unnecessary disk space + * and meanwhile slow down the copy-up speed, so we do + * an optimization for hole during copy-up, it relies + * on SEEK_DATA implementation in lower fs so if lower + * fs does not support it, copy-up will behave as before. + * + * Detail logic of hole detection as below: + * When we detect next data position is larger than current + * position we will skip that hole, otherwise we copy + * data in the size of OVL_COPY_UP_CHUNK_SIZE. Actually, + * it may not recognize all kind of holes and sometimes + * only skips partial of hole area. However, it will be + * enough for most of the use cases. + */ + + if (skip_hole && data_pos < old_pos) { + data_pos = vfs_llseek(old_file, old_pos, SEEK_DATA); + if (data_pos > old_pos) { + hole_len = data_pos - old_pos; + len -= hole_len; + old_pos = new_pos = data_pos; + continue; + } else if (data_pos == -ENXIO) { + break; + } else if (data_pos < 0) { + skip_hole = false; + } + } + bytes = do_splice_direct(old_file, &old_pos, new_file, &new_pos, this_len, SPLICE_F_MOVE); @@ -480,7 +517,7 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) } inode_lock(temp->d_inode); - if (c->metacopy) + if (S_ISREG(c->stat.mode)) err = ovl_set_size(temp, &c->stat); if (!err) err = ovl_set_attr(temp, &c->stat); diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 29abdb1d3b5c..8e57d5372b8f 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -35,7 +35,7 @@ int ovl_cleanup(struct inode *wdir, struct dentry *wdentry) dput(wdentry); if (err) { - pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n", + pr_err("cleanup of '%pd2' failed (%i)\n", wdentry, err); } @@ -53,7 +53,7 @@ static struct dentry *ovl_lookup_temp(struct dentry *workdir) temp = lookup_one_len(name, workdir, strlen(name)); if (!IS_ERR(temp) && temp->d_inode) { - pr_err("overlayfs: workdir/%s already exists\n", name); + pr_err("workdir/%s already exists\n", name); dput(temp); temp = ERR_PTR(-EIO); } @@ -134,7 +134,7 @@ static int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, d = lookup_one_len(dentry->d_name.name, dentry->d_parent, dentry->d_name.len); if (IS_ERR(d)) { - pr_warn("overlayfs: failed lookup after mkdir (%pd2, err=%i).\n", + pr_warn("failed lookup after mkdir (%pd2, err=%i).\n", dentry, err); return PTR_ERR(d); } @@ -267,7 +267,7 @@ static int ovl_instantiate(struct dentry *dentry, struct inode *inode, d_instantiate(dentry, inode); if (inode != oip.newinode) { - pr_warn_ratelimited("overlayfs: newly created inode found in cache (%pd2)\n", + pr_warn_ratelimited("newly created inode found in cache (%pd2)\n", dentry); } @@ -1009,7 +1009,7 @@ static int ovl_set_redirect(struct dentry *dentry, bool samedir) spin_unlock(&dentry->d_lock); } else { kfree(redirect); - pr_warn_ratelimited("overlayfs: failed to set redirect (%i)\n", + pr_warn_ratelimited("failed to set redirect (%i)\n", err); /* Fall back to userspace copy-up */ err = -EXDEV; diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 70e55588aedc..6f54d70cef27 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -30,7 +30,7 @@ static int ovl_encode_maybe_copy_up(struct dentry *dentry) } if (err) { - pr_warn_ratelimited("overlayfs: failed to copy up on encode (%pd2, err=%i)\n", + pr_warn_ratelimited("failed to copy up on encode (%pd2, err=%i)\n", dentry, err); } @@ -244,7 +244,7 @@ out: return err; fail: - pr_warn_ratelimited("overlayfs: failed to encode file handle (%pd2, err=%i, buflen=%d, len=%d, type=%d)\n", + pr_warn_ratelimited("failed to encode file handle (%pd2, err=%i, buflen=%d, len=%d, type=%d)\n", dentry, err, buflen, fh ? (int)fh->fb.len : 0, fh ? fh->fb.type : 0); goto out; @@ -358,7 +358,7 @@ static struct dentry *ovl_dentry_real_at(struct dentry *dentry, int idx) */ static struct dentry *ovl_lookup_real_one(struct dentry *connected, struct dentry *real, - struct ovl_layer *layer) + const struct ovl_layer *layer) { struct inode *dir = d_inode(connected); struct dentry *this, *parent = NULL; @@ -406,7 +406,7 @@ out: return this; fail: - pr_warn_ratelimited("overlayfs: failed to lookup one by real (%pd2, layer=%d, connected=%pd2, err=%i)\n", + pr_warn_ratelimited("failed to lookup one by real (%pd2, layer=%d, connected=%pd2, err=%i)\n", real, layer->idx, connected, err); this = ERR_PTR(err); goto out; @@ -414,17 +414,16 @@ fail: static struct dentry *ovl_lookup_real(struct super_block *sb, struct dentry *real, - struct ovl_layer *layer); + const struct ovl_layer *layer); /* * Lookup an indexed or hashed overlay dentry by real inode. */ static struct dentry *ovl_lookup_real_inode(struct super_block *sb, struct dentry *real, - struct ovl_layer *layer) + const struct ovl_layer *layer) { struct ovl_fs *ofs = sb->s_fs_info; - struct ovl_layer upper_layer = { .mnt = ofs->upper_mnt }; struct dentry *index = NULL; struct dentry *this = NULL; struct inode *inode; @@ -466,7 +465,7 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb, * recursive call walks back from indexed upper to the topmost * connected/hashed upper parent (or up to root). */ - this = ovl_lookup_real(sb, upper, &upper_layer); + this = ovl_lookup_real(sb, upper, &ofs->layers[0]); dput(upper); } @@ -487,7 +486,7 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb, */ static struct dentry *ovl_lookup_real_ancestor(struct super_block *sb, struct dentry *real, - struct ovl_layer *layer) + const struct ovl_layer *layer) { struct dentry *next, *parent = NULL; struct dentry *ancestor = ERR_PTR(-EIO); @@ -540,7 +539,7 @@ static struct dentry *ovl_lookup_real_ancestor(struct super_block *sb, */ static struct dentry *ovl_lookup_real(struct super_block *sb, struct dentry *real, - struct ovl_layer *layer) + const struct ovl_layer *layer) { struct dentry *connected; int err = 0; @@ -631,7 +630,7 @@ static struct dentry *ovl_lookup_real(struct super_block *sb, return connected; fail: - pr_warn_ratelimited("overlayfs: failed to lookup by real (%pd2, layer=%d, connected=%pd2, err=%i)\n", + pr_warn_ratelimited("failed to lookup by real (%pd2, layer=%d, connected=%pd2, err=%i)\n", real, layer->idx, connected, err); dput(connected); return ERR_PTR(err); @@ -646,8 +645,7 @@ static struct dentry *ovl_get_dentry(struct super_block *sb, struct dentry *index) { struct ovl_fs *ofs = sb->s_fs_info; - struct ovl_layer upper_layer = { .mnt = ofs->upper_mnt }; - struct ovl_layer *layer = upper ? &upper_layer : lowerpath->layer; + const struct ovl_layer *layer = upper ? &ofs->layers[0] : lowerpath->layer; struct dentry *real = upper ?: (index ?: lowerpath->dentry); /* @@ -822,7 +820,7 @@ out: return dentry; out_err: - pr_warn_ratelimited("overlayfs: failed to decode file handle (len=%d, type=%d, flags=%x, err=%i)\n", + pr_warn_ratelimited("failed to decode file handle (len=%d, type=%d, flags=%x, err=%i)\n", fh_len, fh_type, flags, err); dentry = ERR_PTR(err); goto out; @@ -831,7 +829,7 @@ out_err: static struct dentry *ovl_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { - pr_warn_ratelimited("overlayfs: connectable file handles not supported; use 'no_subtree_check' exportfs option.\n"); + pr_warn_ratelimited("connectable file handles not supported; use 'no_subtree_check' exportfs option.\n"); return ERR_PTR(-EACCES); } diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index e235a635d9ec..a5317216de73 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -9,8 +9,19 @@ #include <linux/xattr.h> #include <linux/uio.h> #include <linux/uaccess.h> +#include <linux/splice.h> +#include <linux/mm.h> +#include <linux/fs.h> #include "overlayfs.h" +struct ovl_aio_req { + struct kiocb iocb; + struct kiocb *orig_iocb; + struct fd fd; +}; + +static struct kmem_cache *ovl_aio_request_cachep; + static char ovl_whatisit(struct inode *inode, struct inode *realinode) { if (realinode != ovl_inode_upper(inode)) @@ -146,7 +157,7 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file_inode(file); struct fd real; const struct cred *old_cred; - ssize_t ret; + loff_t ret; /* * The two special cases below do not need to involve real fs, @@ -171,7 +182,7 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence) * limitations that are more strict than ->s_maxbytes for specific * files, so we use the real file to perform seeks. */ - inode_lock(inode); + ovl_inode_lock(inode); real.file->f_pos = file->f_pos; old_cred = ovl_override_creds(inode->i_sb); @@ -179,7 +190,7 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence) revert_creds(old_cred); file->f_pos = real.file->f_pos; - inode_unlock(inode); + ovl_inode_unlock(inode); fdput(real); @@ -225,6 +236,33 @@ static rwf_t ovl_iocb_to_rwf(struct kiocb *iocb) return flags; } +static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) +{ + struct kiocb *iocb = &aio_req->iocb; + struct kiocb *orig_iocb = aio_req->orig_iocb; + + if (iocb->ki_flags & IOCB_WRITE) { + struct inode *inode = file_inode(orig_iocb->ki_filp); + + file_end_write(iocb->ki_filp); + ovl_copyattr(ovl_inode_real(inode), inode); + } + + orig_iocb->ki_pos = iocb->ki_pos; + fdput(aio_req->fd); + kmem_cache_free(ovl_aio_request_cachep, aio_req); +} + +static void ovl_aio_rw_complete(struct kiocb *iocb, long res, long res2) +{ + struct ovl_aio_req *aio_req = container_of(iocb, + struct ovl_aio_req, iocb); + struct kiocb *orig_iocb = aio_req->orig_iocb; + + ovl_aio_cleanup_handler(aio_req); + orig_iocb->ki_complete(orig_iocb, res, res2); +} + static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; @@ -240,10 +278,28 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) return ret; old_cred = ovl_override_creds(file_inode(file)->i_sb); - ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, - ovl_iocb_to_rwf(iocb)); + if (is_sync_kiocb(iocb)) { + ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, + ovl_iocb_to_rwf(iocb)); + } else { + struct ovl_aio_req *aio_req; + + ret = -ENOMEM; + aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL); + if (!aio_req) + goto out; + + aio_req->fd = real; + real.flags = 0; + aio_req->orig_iocb = iocb; + kiocb_clone(&aio_req->iocb, iocb, real.file); + aio_req->iocb.ki_complete = ovl_aio_rw_complete; + ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter); + if (ret != -EIOCBQUEUED) + ovl_aio_cleanup_handler(aio_req); + } +out: revert_creds(old_cred); - ovl_file_accessed(file); fdput(real); @@ -274,15 +330,33 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) goto out_unlock; old_cred = ovl_override_creds(file_inode(file)->i_sb); - file_start_write(real.file); - ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, - ovl_iocb_to_rwf(iocb)); - file_end_write(real.file); + if (is_sync_kiocb(iocb)) { + file_start_write(real.file); + ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, + ovl_iocb_to_rwf(iocb)); + file_end_write(real.file); + /* Update size */ + ovl_copyattr(ovl_inode_real(inode), inode); + } else { + struct ovl_aio_req *aio_req; + + ret = -ENOMEM; + aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL); + if (!aio_req) + goto out; + + file_start_write(real.file); + aio_req->fd = real; + real.flags = 0; + aio_req->orig_iocb = iocb; + kiocb_clone(&aio_req->iocb, iocb, real.file); + aio_req->iocb.ki_complete = ovl_aio_rw_complete; + ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter); + if (ret != -EIOCBQUEUED) + ovl_aio_cleanup_handler(aio_req); + } +out: revert_creds(old_cred); - - /* Update size */ - ovl_copyattr(ovl_inode_real(inode), inode); - fdput(real); out_unlock: @@ -291,6 +365,48 @@ out_unlock: return ret; } +static ssize_t ovl_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + ssize_t ret; + struct fd real; + const struct cred *old_cred; + + ret = ovl_real_fdget(in, &real); + if (ret) + return ret; + + old_cred = ovl_override_creds(file_inode(in)->i_sb); + ret = generic_file_splice_read(real.file, ppos, pipe, len, flags); + revert_creds(old_cred); + + ovl_file_accessed(in); + fdput(real); + return ret; +} + +static ssize_t +ovl_splice_write(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) +{ + struct fd real; + const struct cred *old_cred; + ssize_t ret; + + ret = ovl_real_fdget(out, &real); + if (ret) + return ret; + + old_cred = ovl_override_creds(file_inode(out)->i_sb); + ret = iter_file_splice_write(pipe, real.file, ppos, len, flags); + revert_creds(old_cred); + + ovl_file_accessed(out); + fdput(real); + return ret; +} + static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct fd real; @@ -647,7 +763,25 @@ const struct file_operations ovl_file_operations = { .fadvise = ovl_fadvise, .unlocked_ioctl = ovl_ioctl, .compat_ioctl = ovl_compat_ioctl, + .splice_read = ovl_splice_read, + .splice_write = ovl_splice_write, .copy_file_range = ovl_copy_file_range, .remap_file_range = ovl_remap_file_range, }; + +int __init ovl_aio_request_cache_init(void) +{ + ovl_aio_request_cachep = kmem_cache_create("ovl_aio_req", + sizeof(struct ovl_aio_req), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!ovl_aio_request_cachep) + return -ENOMEM; + + return 0; +} + +void ovl_aio_request_cache_destroy(void) +{ + kmem_cache_destroy(ovl_aio_request_cachep); +} diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index b045cf1826fc..79e8994e3bc1 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -75,10 +75,9 @@ out: return err; } -static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, - struct ovl_layer *lower_layer) +static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid) { - bool samefs = ovl_same_sb(dentry->d_sb); + bool samefs = ovl_same_fs(dentry->d_sb); unsigned int xinobits = ovl_xino_bits(dentry->d_sb); if (samefs) { @@ -100,12 +99,10 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, * persistent for a given layer configuration. */ if (stat->ino >> shift) { - pr_warn_ratelimited("overlayfs: inode number too big (%pd2, ino=%llu, xinobits=%d)\n", + pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n", dentry, stat->ino, xinobits); } else { - if (lower_layer) - stat->ino |= ((u64)lower_layer->fsid) << shift; - + stat->ino |= ((u64)fsid) << shift; stat->dev = dentry->d_sb->s_dev; return 0; } @@ -124,15 +121,14 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, */ stat->dev = dentry->d_sb->s_dev; stat->ino = dentry->d_inode->i_ino; - } else if (lower_layer && lower_layer->fsid) { + } else { /* * For non-samefs setup, if we cannot map all layers st_ino * to a unified address space, we need to make sure that st_dev - * is unique per lower fs. Upper layer uses real st_dev and - * lower layers use the unique anonymous bdev assigned to the - * lower fs. + * is unique per underlying fs, so we use the unique anonymous + * bdev assigned to the underlying fs. */ - stat->dev = lower_layer->fs->pseudo_dev; + stat->dev = OVL_FS(dentry->d_sb)->fs[fsid].pseudo_dev; } return 0; @@ -146,8 +142,7 @@ int ovl_getattr(const struct path *path, struct kstat *stat, struct path realpath; const struct cred *old_cred; bool is_dir = S_ISDIR(dentry->d_inode->i_mode); - bool samefs = ovl_same_sb(dentry->d_sb); - struct ovl_layer *lower_layer = NULL; + int fsid = 0; int err; bool metacopy_blocks = false; @@ -168,9 +163,9 @@ int ovl_getattr(const struct path *path, struct kstat *stat, * If lower filesystem supports NFS file handles, this also guaranties * persistent st_ino across mount cycle. */ - if (!is_dir || samefs || ovl_xino_bits(dentry->d_sb)) { + if (!is_dir || ovl_same_dev(dentry->d_sb)) { if (!OVL_TYPE_UPPER(type)) { - lower_layer = ovl_layer_lower(dentry); + fsid = ovl_layer_lower(dentry)->fsid; } else if (OVL_TYPE_ORIGIN(type)) { struct kstat lowerstat; u32 lowermask = STATX_INO | STATX_BLOCKS | @@ -200,14 +195,8 @@ int ovl_getattr(const struct path *path, struct kstat *stat, if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) || (!ovl_verify_lower(dentry->d_sb) && (is_dir || lowerstat.nlink == 1))) { - lower_layer = ovl_layer_lower(dentry); - /* - * Cannot use origin st_dev;st_ino because - * origin inode content may differ from overlay - * inode content. - */ - if (samefs || lower_layer->fsid) - stat->ino = lowerstat.ino; + fsid = ovl_layer_lower(dentry)->fsid; + stat->ino = lowerstat.ino; } /* @@ -241,7 +230,7 @@ int ovl_getattr(const struct path *path, struct kstat *stat, } } - err = ovl_map_dev_ino(dentry, stat, lower_layer); + err = ovl_map_dev_ino(dentry, stat, fsid); if (err) goto out; @@ -527,6 +516,27 @@ static const struct address_space_operations ovl_aops = { * [...] &ovl_i_mutex_dir_key[depth] (stack_depth=2) * [...] &ovl_i_mutex_dir_key[depth]#2 (stack_depth=1) * [...] &type->i_mutex_dir_key (stack_depth=0) + * + * Locking order w.r.t ovl_want_write() is important for nested overlayfs. + * + * This chain is valid: + * - inode->i_rwsem (inode_lock[2]) + * - upper_mnt->mnt_sb->s_writers (ovl_want_write[0]) + * - OVL_I(inode)->lock (ovl_inode_lock[2]) + * - OVL_I(lowerinode)->lock (ovl_inode_lock[1]) + * + * And this chain is valid: + * - inode->i_rwsem (inode_lock[2]) + * - OVL_I(inode)->lock (ovl_inode_lock[2]) + * - lowerinode->i_rwsem (inode_lock[1]) + * - OVL_I(lowerinode)->lock (ovl_inode_lock[1]) + * + * But lowerinode->i_rwsem SHOULD NOT be acquired while ovl_want_write() is + * held, because it is in reverse order of the non-nested case using the same + * upper fs: + * - inode->i_rwsem (inode_lock[1]) + * - upper_mnt->mnt_sb->s_writers (ovl_want_write[0]) + * - OVL_I(inode)->lock (ovl_inode_lock[1]) */ #define OVL_MAX_NESTING FILESYSTEM_MAX_STACK_DEPTH @@ -565,7 +575,7 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev, * ovl_new_inode(), ino arg is 0, so i_ino will be updated to real * upper inode i_ino on ovl_inode_init() or ovl_inode_update(). */ - if (ovl_same_sb(inode->i_sb) || xinobits) { + if (ovl_same_dev(inode->i_sb)) { inode->i_ino = ino; if (xinobits && fsid && !(ino >> (64 - xinobits))) inode->i_ino |= (unsigned long)fsid << (64 - xinobits); @@ -698,7 +708,7 @@ unsigned int ovl_get_nlink(struct dentry *lowerdentry, return nlink; fail: - pr_warn_ratelimited("overlayfs: failed to get index nlink (%pd2, err=%i)\n", + pr_warn_ratelimited("failed to get index nlink (%pd2, err=%i)\n", upperdentry, err); return fallback; } @@ -969,7 +979,7 @@ out: return inode; out_err: - pr_warn_ratelimited("overlayfs: failed to get inode (%i)\n", err); + pr_warn_ratelimited("failed to get inode (%i)\n", err); inode = ERR_PTR(err); goto out; } diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 76ff66339173..ed9e129fae04 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -141,10 +141,10 @@ out: return NULL; fail: - pr_warn_ratelimited("overlayfs: failed to get origin (%i)\n", res); + pr_warn_ratelimited("failed to get origin (%i)\n", res); goto out; invalid: - pr_warn_ratelimited("overlayfs: invalid origin (%*phN)\n", res, fh); + pr_warn_ratelimited("invalid origin (%*phN)\n", res, fh); goto out; } @@ -322,16 +322,16 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, struct dentry *origin = NULL; int i; - for (i = 0; i < ofs->numlower; i++) { + for (i = 1; i < ofs->numlayer; i++) { /* * If lower fs uuid is not unique among lower fs we cannot match * fh->uuid to layer. */ - if (ofs->lower_layers[i].fsid && - ofs->lower_layers[i].fs->bad_uuid) + if (ofs->layers[i].fsid && + ofs->layers[i].fs->bad_uuid) continue; - origin = ovl_decode_real_fh(fh, ofs->lower_layers[i].mnt, + origin = ovl_decode_real_fh(fh, ofs->layers[i].mnt, connected); if (origin) break; @@ -354,13 +354,13 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, } **stackp = (struct ovl_path){ .dentry = origin, - .layer = &ofs->lower_layers[i] + .layer = &ofs->layers[i] }; return 0; invalid: - pr_warn_ratelimited("overlayfs: invalid origin (%pd2, ftype=%x, origin ftype=%x).\n", + pr_warn_ratelimited("invalid origin (%pd2, ftype=%x, origin ftype=%x).\n", upperdentry, d_inode(upperdentry)->i_mode & S_IFMT, d_inode(origin)->i_mode & S_IFMT); dput(origin); @@ -449,7 +449,7 @@ out: fail: inode = d_inode(real); - pr_warn_ratelimited("overlayfs: failed to verify %s (%pd2, ino=%lu, err=%i)\n", + pr_warn_ratelimited("failed to verify %s (%pd2, ino=%lu, err=%i)\n", is_upper ? "upper" : "origin", real, inode ? inode->i_ino : 0, err); goto out; @@ -475,7 +475,7 @@ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index) return upper ?: ERR_PTR(-ESTALE); if (!d_is_dir(upper)) { - pr_warn_ratelimited("overlayfs: invalid index upper (%pd2, upper=%pd2).\n", + pr_warn_ratelimited("invalid index upper (%pd2, upper=%pd2).\n", index, upper); dput(upper); return ERR_PTR(-EIO); @@ -589,12 +589,12 @@ out: return err; fail: - pr_warn_ratelimited("overlayfs: failed to verify index (%pd2, ftype=%x, err=%i)\n", + pr_warn_ratelimited("failed to verify index (%pd2, ftype=%x, err=%i)\n", index, d_inode(index)->i_mode & S_IFMT, err); goto out; orphan: - pr_warn_ratelimited("overlayfs: orphan index entry (%pd2, ftype=%x, nlink=%u)\n", + pr_warn_ratelimited("orphan index entry (%pd2, ftype=%x, nlink=%u)\n", index, d_inode(index)->i_mode & S_IFMT, d_inode(index)->i_nlink); err = -ENOENT; @@ -696,7 +696,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, index = NULL; goto out; } - pr_warn_ratelimited("overlayfs: failed inode index lookup (ino=%lu, key=%.*s, err=%i);\n" + pr_warn_ratelimited("failed inode index lookup (ino=%lu, key=%.*s, err=%i);\n" "overlayfs: mount with '-o index=off' to disable inodes index.\n", d_inode(origin)->i_ino, name.len, name.name, err); @@ -723,13 +723,13 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, * unlinked, which means that finding a lower origin on lookup * whose index is a whiteout should be treated as an error. */ - pr_warn_ratelimited("overlayfs: bad index found (index=%pd2, ftype=%x, origin ftype=%x).\n", + pr_warn_ratelimited("bad index found (index=%pd2, ftype=%x, origin ftype=%x).\n", index, d_inode(index)->i_mode & S_IFMT, d_inode(origin)->i_mode & S_IFMT); goto fail; } else if (is_dir && verify) { if (!upper) { - pr_warn_ratelimited("overlayfs: suspected uncovered redirected dir found (origin=%pd2, index=%pd2).\n", + pr_warn_ratelimited("suspected uncovered redirected dir found (origin=%pd2, index=%pd2).\n", origin, index); goto fail; } @@ -738,7 +738,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, err = ovl_verify_upper(index, upper, false); if (err) { if (err == -ESTALE) { - pr_warn_ratelimited("overlayfs: suspected multiply redirected dir found (upper=%pd2, origin=%pd2, index=%pd2).\n", + pr_warn_ratelimited("suspected multiply redirected dir found (upper=%pd2, origin=%pd2, index=%pd2).\n", upper, origin, index); } goto fail; @@ -885,7 +885,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (!d.stop && poe->numlower) { err = -ENOMEM; - stack = kcalloc(ofs->numlower, sizeof(struct ovl_path), + stack = kcalloc(ofs->numlayer - 1, sizeof(struct ovl_path), GFP_KERNEL); if (!stack) goto out_put_upper; @@ -967,7 +967,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, */ err = -EPERM; if (d.redirect && !ofs->config.redirect_follow) { - pr_warn_ratelimited("overlayfs: refusing to follow redirect for (%pd2)\n", + pr_warn_ratelimited("refusing to follow redirect for (%pd2)\n", dentry); goto out_put; } @@ -994,7 +994,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, err = -EPERM; if (!ofs->config.metacopy) { - pr_warn_ratelimited("overlay: refusing to follow metacopy origin for (%pd2)\n", + pr_warn_ratelimited("refusing to follow metacopy origin for (%pd2)\n", dentry); goto out_put; } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index f283b1d69a9e..3623d28aa4fa 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -9,6 +9,9 @@ #include <linux/fs.h> #include "ovl_entry.h" +#undef pr_fmt +#define pr_fmt(fmt) "overlayfs: " fmt + enum ovl_path_type { __OVL_PATH_UPPER = (1 << 0), __OVL_PATH_MERGE = (1 << 1), @@ -221,7 +224,6 @@ int ovl_want_write(struct dentry *dentry); void ovl_drop_write(struct dentry *dentry); struct dentry *ovl_workdir(struct dentry *dentry); const struct cred *ovl_override_creds(struct super_block *sb); -struct super_block *ovl_same_sb(struct super_block *sb); int ovl_can_decode_fh(struct super_block *sb); struct dentry *ovl_indexdir(struct super_block *sb); bool ovl_index_all(struct super_block *sb); @@ -237,7 +239,7 @@ enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); struct dentry *ovl_dentry_upper(struct dentry *dentry); struct dentry *ovl_dentry_lower(struct dentry *dentry); struct dentry *ovl_dentry_lowerdata(struct dentry *dentry); -struct ovl_layer *ovl_layer_lower(struct dentry *dentry); +const struct ovl_layer *ovl_layer_lower(struct dentry *dentry); struct dentry *ovl_dentry_real(struct dentry *dentry); struct dentry *ovl_i_dentry_upper(struct inode *inode); struct inode *ovl_inode_upper(struct inode *inode); @@ -299,11 +301,21 @@ static inline bool ovl_is_impuredir(struct dentry *dentry) return ovl_check_dir_xattr(dentry, OVL_XATTR_IMPURE); } -static inline unsigned int ovl_xino_bits(struct super_block *sb) +/* All layers on same fs? */ +static inline bool ovl_same_fs(struct super_block *sb) +{ + return OVL_FS(sb)->xino_mode == 0; +} + +/* All overlay inodes have same st_dev? */ +static inline bool ovl_same_dev(struct super_block *sb) { - struct ovl_fs *ofs = sb->s_fs_info; + return OVL_FS(sb)->xino_mode >= 0; +} - return ofs->xino_bits; +static inline unsigned int ovl_xino_bits(struct super_block *sb) +{ + return ovl_same_dev(sb) ? OVL_FS(sb)->xino_mode : 0; } static inline int ovl_inode_lock(struct inode *inode) @@ -438,6 +450,8 @@ struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr); /* file.c */ extern const struct file_operations ovl_file_operations; +int __init ovl_aio_request_cache_init(void); +void ovl_aio_request_cache_destroy(void); /* copy_up.c */ int ovl_copy_up(struct dentry *dentry); diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 28348c44ea5b..89015ea822e7 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -24,6 +24,8 @@ struct ovl_sb { dev_t pseudo_dev; /* Unusable (conflicting) uuid */ bool bad_uuid; + /* Used as a lower layer (but maybe also as upper) */ + bool is_lower; }; struct ovl_layer { @@ -38,18 +40,18 @@ struct ovl_layer { }; struct ovl_path { - struct ovl_layer *layer; + const struct ovl_layer *layer; struct dentry *dentry; }; /* private information held for overlayfs's superblock */ struct ovl_fs { struct vfsmount *upper_mnt; - unsigned int numlower; - /* Number of unique lower sb that differ from upper sb */ - unsigned int numlowerfs; - struct ovl_layer *lower_layers; - struct ovl_sb *lower_fs; + unsigned int numlayer; + /* Number of unique fs among layers including upper fs */ + unsigned int numfs; + const struct ovl_layer *layers; + struct ovl_sb *fs; /* workbasedir is the path at workdir= mount option */ struct dentry *workbasedir; /* workdir is the 'work' directory under workbasedir */ @@ -71,10 +73,15 @@ struct ovl_fs { struct inode *workbasedir_trap; struct inode *workdir_trap; struct inode *indexdir_trap; - /* Inode numbers in all layers do not use the high xino_bits */ - unsigned int xino_bits; + /* -1: disabled, 0: same fs, 1..32: number of unused ino bits */ + int xino_mode; }; +static inline struct ovl_fs *OVL_FS(struct super_block *sb) +{ + return (struct ovl_fs *)sb->s_fs_info; +} + /* private information held for every overlayfs dentry */ struct ovl_entry { union { diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 47a91c9733a5..40ac9ce2465a 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -441,7 +441,7 @@ static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid, const char *name, int namelen) { if (ino >> (64 - xinobits)) { - pr_warn_ratelimited("overlayfs: d_ino too big (%.*s, ino=%llu, xinobits=%d)\n", + pr_warn_ratelimited("d_ino too big (%.*s, ino=%llu, xinobits=%d)\n", namelen, name, ino, xinobits); return ino; } @@ -469,7 +469,7 @@ static int ovl_cache_update_ino(struct path *path, struct ovl_cache_entry *p) int xinobits = ovl_xino_bits(dir->d_sb); int err = 0; - if (!ovl_same_sb(dir->d_sb) && !xinobits) + if (!ovl_same_dev(dir->d_sb)) goto out; if (p->name[0] == '.') { @@ -504,7 +504,13 @@ get: if (err) goto fail; - WARN_ON_ONCE(dir->d_sb->s_dev != stat.dev); + /* + * Directory inode is always on overlay st_dev. + * Non-dir with ovl_same_dev() could be on pseudo st_dev in case + * of xino bits overflow. + */ + WARN_ON_ONCE(S_ISDIR(stat.mode) && + dir->d_sb->s_dev != stat.dev); ino = stat.ino; } else if (xinobits && !OVL_TYPE_UPPER(type)) { ino = ovl_remap_lower_ino(ino, xinobits, @@ -518,7 +524,7 @@ out: return err; fail: - pr_warn_ratelimited("overlayfs: failed to look up (%s) for ino (%i)\n", + pr_warn_ratelimited("failed to look up (%s) for ino (%i)\n", p->name, err); goto out; } @@ -685,7 +691,7 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx) int err; struct ovl_dir_file *od = file->private_data; struct dentry *dir = file->f_path.dentry; - struct ovl_layer *lower_layer = ovl_layer_lower(dir); + const struct ovl_layer *lower_layer = ovl_layer_lower(dir); struct ovl_readdir_translate rdt = { .ctx.actor = ovl_fill_real, .orig_ctx = ctx, @@ -738,7 +744,7 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) * entries. */ if (ovl_xino_bits(dentry->d_sb) || - (ovl_same_sb(dentry->d_sb) && + (ovl_same_fs(dentry->d_sb) && (ovl_is_impure_dir(file) || OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent))))) { return ovl_iterate_real(file, ctx); @@ -965,7 +971,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) dentry = lookup_one_len(p->name, upper, p->len); if (IS_ERR(dentry)) { - pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n", + pr_err("lookup '%s/%.*s' failed (%i)\n", upper->d_name.name, p->len, p->name, (int) PTR_ERR(dentry)); continue; @@ -1147,6 +1153,6 @@ next: out: ovl_cache_free(&list); if (err) - pr_err("overlayfs: failed index dir cleanup (%i)\n", err); + pr_err("failed index dir cleanup (%i)\n", err); return err; } diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 7621ff176d15..319fe0d355b0 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -224,14 +224,14 @@ static void ovl_free_fs(struct ovl_fs *ofs) if (ofs->upperdir_locked) ovl_inuse_unlock(ofs->upper_mnt->mnt_root); mntput(ofs->upper_mnt); - for (i = 0; i < ofs->numlower; i++) { - iput(ofs->lower_layers[i].trap); - mntput(ofs->lower_layers[i].mnt); + for (i = 1; i < ofs->numlayer; i++) { + iput(ofs->layers[i].trap); + mntput(ofs->layers[i].mnt); } - for (i = 0; i < ofs->numlowerfs; i++) - free_anon_bdev(ofs->lower_fs[i].pseudo_dev); - kfree(ofs->lower_layers); - kfree(ofs->lower_fs); + kfree(ofs->layers); + for (i = 0; i < ofs->numfs; i++) + free_anon_bdev(ofs->fs[i].pseudo_dev); + kfree(ofs->fs); kfree(ofs->config.lowerdir); kfree(ofs->config.upperdir); @@ -358,7 +358,7 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) if (ofs->config.nfs_export != ovl_nfs_export_def) seq_printf(m, ",nfs_export=%s", ofs->config.nfs_export ? "on" : "off"); - if (ofs->config.xino != ovl_xino_def()) + if (ofs->config.xino != ovl_xino_def() && !ovl_same_fs(sb)) seq_printf(m, ",xino=%s", ovl_xino_str[ofs->config.xino]); if (ofs->config.metacopy != ovl_metacopy_def) seq_printf(m, ",metacopy=%s", @@ -462,7 +462,7 @@ static int ovl_parse_redirect_mode(struct ovl_config *config, const char *mode) if (ovl_redirect_always_follow) config->redirect_follow = true; } else if (strcmp(mode, "nofollow") != 0) { - pr_err("overlayfs: bad mount option \"redirect_dir=%s\"\n", + pr_err("bad mount option \"redirect_dir=%s\"\n", mode); return -EINVAL; } @@ -560,14 +560,15 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) break; default: - pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p); + pr_err("unrecognized mount option \"%s\" or missing value\n", + p); return -EINVAL; } } /* Workdir is useless in non-upper mount */ if (!config->upperdir && config->workdir) { - pr_info("overlayfs: option \"workdir=%s\" is useless in a non-upper mount, ignore\n", + pr_info("option \"workdir=%s\" is useless in a non-upper mount, ignore\n", config->workdir); kfree(config->workdir); config->workdir = NULL; @@ -587,7 +588,7 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) /* Resolve metacopy -> redirect_dir dependency */ if (config->metacopy && !config->redirect_dir) { if (metacopy_opt && redirect_opt) { - pr_err("overlayfs: conflicting options: metacopy=on,redirect_dir=%s\n", + pr_err("conflicting options: metacopy=on,redirect_dir=%s\n", config->redirect_mode); return -EINVAL; } @@ -596,7 +597,7 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) * There was an explicit redirect_dir=... that resulted * in this conflict. */ - pr_info("overlayfs: disabling metacopy due to redirect_dir=%s\n", + pr_info("disabling metacopy due to redirect_dir=%s\n", config->redirect_mode); config->metacopy = false; } else { @@ -692,7 +693,7 @@ out_unlock: out_dput: dput(work); out_err: - pr_warn("overlayfs: failed to create directory %s/%s (errno: %i); mounting read-only\n", + pr_warn("failed to create directory %s/%s (errno: %i); mounting read-only\n", ofs->config.workdir, name, -err); work = NULL; goto out_unlock; @@ -716,21 +717,21 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path) int err = -EINVAL; if (!*name) { - pr_err("overlayfs: empty lowerdir\n"); + pr_err("empty lowerdir\n"); goto out; } err = kern_path(name, LOOKUP_FOLLOW, path); if (err) { - pr_err("overlayfs: failed to resolve '%s': %i\n", name, err); + pr_err("failed to resolve '%s': %i\n", name, err); goto out; } err = -EINVAL; if (ovl_dentry_weird(path->dentry)) { - pr_err("overlayfs: filesystem on '%s' not supported\n", name); + pr_err("filesystem on '%s' not supported\n", name); goto out_put; } if (!d_is_dir(path->dentry)) { - pr_err("overlayfs: '%s' not a directory\n", name); + pr_err("'%s' not a directory\n", name); goto out_put; } return 0; @@ -752,7 +753,7 @@ static int ovl_mount_dir(const char *name, struct path *path) if (!err) if (ovl_dentry_remote(path->dentry)) { - pr_err("overlayfs: filesystem on '%s' not supported as upperdir\n", + pr_err("filesystem on '%s' not supported as upperdir\n", tmp); path_put_init(path); err = -EINVAL; @@ -769,7 +770,7 @@ static int ovl_check_namelen(struct path *path, struct ovl_fs *ofs, int err = vfs_statfs(path, &statfs); if (err) - pr_err("overlayfs: statfs failed on '%s'\n", name); + pr_err("statfs failed on '%s'\n", name); else ofs->namelen = max(ofs->namelen, statfs.f_namelen); @@ -804,13 +805,13 @@ static int ovl_lower_dir(const char *name, struct path *path, (ofs->config.index && ofs->config.upperdir)) && !fh_type) { ofs->config.index = false; ofs->config.nfs_export = false; - pr_warn("overlayfs: fs on '%s' does not support file handles, falling back to index=off,nfs_export=off.\n", + pr_warn("fs on '%s' does not support file handles, falling back to index=off,nfs_export=off.\n", name); } /* Check if lower fs has 32bit inode numbers */ if (fh_type != FILEID_INO32_GEN) - ofs->xino_bits = 0; + ofs->xino_mode = -1; return 0; @@ -996,7 +997,7 @@ static int ovl_setup_trap(struct super_block *sb, struct dentry *dir, err = PTR_ERR_OR_ZERO(trap); if (err) { if (err == -ELOOP) - pr_err("overlayfs: conflicting %s path\n", name); + pr_err("conflicting %s path\n", name); return err; } @@ -1013,11 +1014,11 @@ static int ovl_setup_trap(struct super_block *sb, struct dentry *dir, static int ovl_report_in_use(struct ovl_fs *ofs, const char *name) { if (ofs->config.index) { - pr_err("overlayfs: %s is in-use as upperdir/workdir of another mount, mount with '-o index=off' to override exclusive upperdir protection.\n", + pr_err("%s is in-use as upperdir/workdir of another mount, mount with '-o index=off' to override exclusive upperdir protection.\n", name); return -EBUSY; } else { - pr_warn("overlayfs: %s is in-use as upperdir/workdir of another mount, accessing files from both mounts will result in undefined behavior.\n", + pr_warn("%s is in-use as upperdir/workdir of another mount, accessing files from both mounts will result in undefined behavior.\n", name); return 0; } @@ -1035,7 +1036,7 @@ static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs, /* Upper fs should not be r/o */ if (sb_rdonly(upperpath->mnt->mnt_sb)) { - pr_err("overlayfs: upper fs is r/o, try multi-lower layers mount\n"); + pr_err("upper fs is r/o, try multi-lower layers mount\n"); err = -EINVAL; goto out; } @@ -1052,7 +1053,7 @@ static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs, upper_mnt = clone_private_mount(upperpath); err = PTR_ERR(upper_mnt); if (IS_ERR(upper_mnt)) { - pr_err("overlayfs: failed to clone upperpath\n"); + pr_err("failed to clone upperpath\n"); goto out; } @@ -1108,7 +1109,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, * kernel upgrade. So warn instead of erroring out. */ if (!err) - pr_warn("overlayfs: upper fs needs to support d_type.\n"); + pr_warn("upper fs needs to support d_type.\n"); /* Check if upper/work fs supports O_TMPFILE */ temp = ovl_do_tmpfile(ofs->workdir, S_IFREG | 0); @@ -1116,7 +1117,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, if (ofs->tmpfile) dput(temp); else - pr_warn("overlayfs: upper fs does not support tmpfile.\n"); + pr_warn("upper fs does not support tmpfile.\n"); /* * Check if upper/work fs supports trusted.overlay.* xattr @@ -1126,7 +1127,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, ofs->noxattr = true; ofs->config.index = false; ofs->config.metacopy = false; - pr_warn("overlayfs: upper fs does not support xattr, falling back to index=off and metacopy=off.\n"); + pr_warn("upper fs does not support xattr, falling back to index=off and metacopy=off.\n"); err = 0; } else { vfs_removexattr(ofs->workdir, OVL_XATTR_OPAQUE); @@ -1136,16 +1137,16 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, fh_type = ovl_can_decode_fh(ofs->workdir->d_sb); if (ofs->config.index && !fh_type) { ofs->config.index = false; - pr_warn("overlayfs: upper fs does not support file handles, falling back to index=off.\n"); + pr_warn("upper fs does not support file handles, falling back to index=off.\n"); } /* Check if upper fs has 32bit inode numbers */ if (fh_type != FILEID_INO32_GEN) - ofs->xino_bits = 0; + ofs->xino_mode = -1; /* NFS export of r/w mount depends on index */ if (ofs->config.nfs_export && !ofs->config.index) { - pr_warn("overlayfs: NFS export requires \"index=on\", falling back to nfs_export=off.\n"); + pr_warn("NFS export requires \"index=on\", falling back to nfs_export=off.\n"); ofs->config.nfs_export = false; } out: @@ -1165,11 +1166,11 @@ static int ovl_get_workdir(struct super_block *sb, struct ovl_fs *ofs, err = -EINVAL; if (upperpath->mnt != workpath.mnt) { - pr_err("overlayfs: workdir and upperdir must reside under the same mount\n"); + pr_err("workdir and upperdir must reside under the same mount\n"); goto out; } if (!ovl_workdir_ok(workpath.dentry, upperpath->dentry)) { - pr_err("overlayfs: workdir and upperdir must be separate subtrees\n"); + pr_err("workdir and upperdir must be separate subtrees\n"); goto out; } @@ -1210,7 +1211,7 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, err = ovl_verify_origin(upperpath->dentry, oe->lowerstack[0].dentry, true); if (err) { - pr_err("overlayfs: failed to verify upper root origin\n"); + pr_err("failed to verify upper root origin\n"); goto out; } @@ -1233,18 +1234,18 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs, err = ovl_verify_set_fh(ofs->indexdir, OVL_XATTR_ORIGIN, upperpath->dentry, true, false); if (err) - pr_err("overlayfs: failed to verify index dir 'origin' xattr\n"); + pr_err("failed to verify index dir 'origin' xattr\n"); } err = ovl_verify_upper(ofs->indexdir, upperpath->dentry, true); if (err) - pr_err("overlayfs: failed to verify index dir 'upper' xattr\n"); + pr_err("failed to verify index dir 'upper' xattr\n"); /* Cleanup bad/stale/orphan index entries */ if (!err) err = ovl_indexdir_cleanup(ofs); } if (err || !ofs->indexdir) - pr_warn("overlayfs: try deleting index dir or mounting with '-o index=off' to disable inodes index.\n"); + pr_warn("try deleting index dir or mounting with '-o index=off' to disable inodes index.\n"); out: mnt_drop_write(mnt); @@ -1258,7 +1259,7 @@ static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid) if (!ofs->config.nfs_export && !ofs->upper_mnt) return true; - for (i = 0; i < ofs->numlowerfs; i++) { + for (i = 0; i < ofs->numfs; i++) { /* * We use uuid to associate an overlay lower file handle with a * lower layer, so we can accept lower fs with null uuid as long @@ -1266,8 +1267,9 @@ static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid) * if we detect multiple lower fs with the same uuid, we * disable lower file handle decoding on all of them. */ - if (uuid_equal(&ofs->lower_fs[i].sb->s_uuid, uuid)) { - ofs->lower_fs[i].bad_uuid = true; + if (ofs->fs[i].is_lower && + uuid_equal(&ofs->fs[i].sb->s_uuid, uuid)) { + ofs->fs[i].bad_uuid = true; return false; } } @@ -1283,13 +1285,9 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path) int err; bool bad_uuid = false; - /* fsid 0 is reserved for upper fs even with non upper overlay */ - if (ofs->upper_mnt && ofs->upper_mnt->mnt_sb == sb) - return 0; - - for (i = 0; i < ofs->numlowerfs; i++) { - if (ofs->lower_fs[i].sb == sb) - return i + 1; + for (i = 0; i < ofs->numfs; i++) { + if (ofs->fs[i].sb == sb) + return i; } if (!ovl_lower_uuid_ok(ofs, &sb->s_uuid)) { @@ -1297,7 +1295,7 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path) if (ofs->config.index || ofs->config.nfs_export) { ofs->config.index = false; ofs->config.nfs_export = false; - pr_warn("overlayfs: %s uuid detected in lower fs '%pd2', falling back to index=off,nfs_export=off.\n", + pr_warn("%s uuid detected in lower fs '%pd2', falling back to index=off,nfs_export=off.\n", uuid_is_null(&sb->s_uuid) ? "null" : "conflicting", path->dentry); @@ -1306,35 +1304,59 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path) err = get_anon_bdev(&dev); if (err) { - pr_err("overlayfs: failed to get anonymous bdev for lowerpath\n"); + pr_err("failed to get anonymous bdev for lowerpath\n"); return err; } - ofs->lower_fs[ofs->numlowerfs].sb = sb; - ofs->lower_fs[ofs->numlowerfs].pseudo_dev = dev; - ofs->lower_fs[ofs->numlowerfs].bad_uuid = bad_uuid; - ofs->numlowerfs++; + ofs->fs[ofs->numfs].sb = sb; + ofs->fs[ofs->numfs].pseudo_dev = dev; + ofs->fs[ofs->numfs].bad_uuid = bad_uuid; - return ofs->numlowerfs; + return ofs->numfs++; } -static int ovl_get_lower_layers(struct super_block *sb, struct ovl_fs *ofs, - struct path *stack, unsigned int numlower) +static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, + struct path *stack, unsigned int numlower) { int err; unsigned int i; + struct ovl_layer *layers; err = -ENOMEM; - ofs->lower_layers = kcalloc(numlower, sizeof(struct ovl_layer), - GFP_KERNEL); - if (ofs->lower_layers == NULL) + layers = kcalloc(numlower + 1, sizeof(struct ovl_layer), GFP_KERNEL); + if (!layers) goto out; + ofs->layers = layers; - ofs->lower_fs = kcalloc(numlower, sizeof(struct ovl_sb), - GFP_KERNEL); - if (ofs->lower_fs == NULL) + ofs->fs = kcalloc(numlower + 1, sizeof(struct ovl_sb), GFP_KERNEL); + if (ofs->fs == NULL) goto out; + /* idx/fsid 0 are reserved for upper fs even with lower only overlay */ + ofs->numfs++; + + layers[0].mnt = ofs->upper_mnt; + layers[0].idx = 0; + layers[0].fsid = 0; + ofs->numlayer = 1; + + /* + * All lower layers that share the same fs as upper layer, use the same + * pseudo_dev as upper layer. Allocate fs[0].pseudo_dev even for lower + * only overlay to simplify ovl_fs_free(). + * is_lower will be set if upper fs is shared with a lower layer. + */ + err = get_anon_bdev(&ofs->fs[0].pseudo_dev); + if (err) { + pr_err("failed to get anonymous bdev for upper fs\n"); + goto out; + } + + if (ofs->upper_mnt) { + ofs->fs[0].sb = ofs->upper_mnt->mnt_sb; + ofs->fs[0].is_lower = false; + } + for (i = 0; i < numlower; i++) { struct vfsmount *mnt; struct inode *trap; @@ -1357,7 +1379,7 @@ static int ovl_get_lower_layers(struct super_block *sb, struct ovl_fs *ofs, mnt = clone_private_mount(&stack[i]); err = PTR_ERR(mnt); if (IS_ERR(mnt)) { - pr_err("overlayfs: failed to clone lowerpath\n"); + pr_err("failed to clone lowerpath\n"); iput(trap); goto out; } @@ -1368,15 +1390,13 @@ static int ovl_get_lower_layers(struct super_block *sb, struct ovl_fs *ofs, */ mnt->mnt_flags |= MNT_READONLY | MNT_NOATIME; - ofs->lower_layers[ofs->numlower].trap = trap; - ofs->lower_layers[ofs->numlower].mnt = mnt; - ofs->lower_layers[ofs->numlower].idx = i + 1; - ofs->lower_layers[ofs->numlower].fsid = fsid; - if (fsid) { - ofs->lower_layers[ofs->numlower].fs = - &ofs->lower_fs[fsid - 1]; - } - ofs->numlower++; + layers[ofs->numlayer].trap = trap; + layers[ofs->numlayer].mnt = mnt; + layers[ofs->numlayer].idx = ofs->numlayer; + layers[ofs->numlayer].fsid = fsid; + layers[ofs->numlayer].fs = &ofs->fs[fsid]; + ofs->numlayer++; + ofs->fs[fsid].is_lower = true; } /* @@ -1387,22 +1407,23 @@ static int ovl_get_lower_layers(struct super_block *sb, struct ovl_fs *ofs, * bits reserved for fsid, it emits a warning and uses the original * inode number. */ - if (!ofs->numlowerfs || (ofs->numlowerfs == 1 && !ofs->upper_mnt)) { - ofs->xino_bits = 0; - ofs->config.xino = OVL_XINO_OFF; - } else if (ofs->config.xino == OVL_XINO_ON && !ofs->xino_bits) { + if (ofs->numfs - !ofs->upper_mnt == 1) { + if (ofs->config.xino == OVL_XINO_ON) + pr_info("\"xino=on\" is useless with all layers on same fs, ignore.\n"); + ofs->xino_mode = 0; + } else if (ofs->config.xino == OVL_XINO_ON && ofs->xino_mode < 0) { /* - * This is a roundup of number of bits needed for numlowerfs+1 - * (i.e. ilog2(numlowerfs+1 - 1) + 1). fsid 0 is reserved for - * upper fs even with non upper overlay. + * This is a roundup of number of bits needed for encoding + * fsid, where fsid 0 is reserved for upper fs even with + * lower only overlay. */ BUILD_BUG_ON(ilog2(OVL_MAX_STACK) > 31); - ofs->xino_bits = ilog2(ofs->numlowerfs) + 1; + ofs->xino_mode = ilog2(ofs->numfs - 1) + 1; } - if (ofs->xino_bits) { - pr_info("overlayfs: \"xino\" feature enabled using %d upper inode bits.\n", - ofs->xino_bits); + if (ofs->xino_mode > 0) { + pr_info("\"xino\" feature enabled using %d upper inode bits.\n", + ofs->xino_mode); } err = 0; @@ -1428,15 +1449,15 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, err = -EINVAL; stacklen = ovl_split_lowerdirs(lowertmp); if (stacklen > OVL_MAX_STACK) { - pr_err("overlayfs: too many lower directories, limit is %d\n", + pr_err("too many lower directories, limit is %d\n", OVL_MAX_STACK); goto out_err; } else if (!ofs->config.upperdir && stacklen == 1) { - pr_err("overlayfs: at least 2 lowerdir are needed while upperdir nonexistent\n"); + pr_err("at least 2 lowerdir are needed while upperdir nonexistent\n"); goto out_err; } else if (!ofs->config.upperdir && ofs->config.nfs_export && ofs->config.redirect_follow) { - pr_warn("overlayfs: NFS export requires \"redirect_dir=nofollow\" on non-upper mount, falling back to nfs_export=off.\n"); + pr_warn("NFS export requires \"redirect_dir=nofollow\" on non-upper mount, falling back to nfs_export=off.\n"); ofs->config.nfs_export = false; } @@ -1459,11 +1480,11 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, err = -EINVAL; sb->s_stack_depth++; if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { - pr_err("overlayfs: maximum fs stacking depth exceeded\n"); + pr_err("maximum fs stacking depth exceeded\n"); goto out_err; } - err = ovl_get_lower_layers(sb, ofs, stack, numlower); + err = ovl_get_layers(sb, ofs, stack, numlower); if (err) goto out_err; @@ -1474,7 +1495,7 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, for (i = 0; i < numlower; i++) { oe->lowerstack[i].dentry = dget(stack[i].dentry); - oe->lowerstack[i].layer = &ofs->lower_layers[i]; + oe->lowerstack[i].layer = &ofs->layers[i+1]; } if (remote) @@ -1515,7 +1536,7 @@ static int ovl_check_layer(struct super_block *sb, struct ovl_fs *ofs, while (!err && parent != next) { if (ovl_lookup_trap_inode(sb, parent)) { err = -ELOOP; - pr_err("overlayfs: overlapping %s path\n", name); + pr_err("overlapping %s path\n", name); } else if (ovl_is_inuse(parent)) { err = ovl_report_in_use(ofs, name); } @@ -1555,9 +1576,9 @@ static int ovl_check_overlapping_layers(struct super_block *sb, return err; } - for (i = 0; i < ofs->numlower; i++) { + for (i = 1; i < ofs->numlayer; i++) { err = ovl_check_layer(sb, ofs, - ofs->lower_layers[i].mnt->mnt_root, + ofs->layers[i].mnt->mnt_root, "lowerdir"); if (err) return err; @@ -1595,7 +1616,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) err = -EINVAL; if (!ofs->config.lowerdir) { if (!silent) - pr_err("overlayfs: missing 'lowerdir'\n"); + pr_err("missing 'lowerdir'\n"); goto out_err; } @@ -1603,14 +1624,14 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_maxbytes = MAX_LFS_FILESIZE; /* Assume underlaying fs uses 32bit inodes unless proven otherwise */ if (ofs->config.xino != OVL_XINO_OFF) - ofs->xino_bits = BITS_PER_LONG - 32; + ofs->xino_mode = BITS_PER_LONG - 32; /* alloc/destroy_inode needed for setting up traps in inode cache */ sb->s_op = &ovl_super_operations; if (ofs->config.upperdir) { if (!ofs->config.workdir) { - pr_err("overlayfs: missing 'workdir'\n"); + pr_err("missing 'workdir'\n"); goto out_err; } @@ -1660,13 +1681,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (!ofs->indexdir) { ofs->config.index = false; if (ofs->upper_mnt && ofs->config.nfs_export) { - pr_warn("overlayfs: NFS export requires an index dir, falling back to nfs_export=off.\n"); + pr_warn("NFS export requires an index dir, falling back to nfs_export=off.\n"); ofs->config.nfs_export = false; } } if (ofs->config.metacopy && ofs->config.nfs_export) { - pr_warn("overlayfs: NFS export is not supported with metadata only copy up, falling back to nfs_export=off.\n"); + pr_warn("NFS export is not supported with metadata only copy up, falling back to nfs_export=off.\n"); ofs->config.nfs_export = false; } @@ -1749,9 +1770,15 @@ static int __init ovl_init(void) if (ovl_inode_cachep == NULL) return -ENOMEM; - err = register_filesystem(&ovl_fs_type); - if (err) - kmem_cache_destroy(ovl_inode_cachep); + err = ovl_aio_request_cache_init(); + if (!err) { + err = register_filesystem(&ovl_fs_type); + if (!err) + return 0; + + ovl_aio_request_cache_destroy(); + } + kmem_cache_destroy(ovl_inode_cachep); return err; } @@ -1766,7 +1793,7 @@ static void __exit ovl_exit(void) */ rcu_barrier(); kmem_cache_destroy(ovl_inode_cachep); - + ovl_aio_request_cache_destroy(); } module_init(ovl_init); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index f5678a3f8350..ea005085803f 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -40,18 +40,6 @@ const struct cred *ovl_override_creds(struct super_block *sb) return override_creds(ofs->creator_cred); } -struct super_block *ovl_same_sb(struct super_block *sb) -{ - struct ovl_fs *ofs = sb->s_fs_info; - - if (!ofs->numlowerfs) - return ofs->upper_mnt->mnt_sb; - else if (ofs->numlowerfs == 1 && !ofs->upper_mnt) - return ofs->lower_fs[0].sb; - else - return NULL; -} - /* * Check if underlying fs supports file handles and try to determine encoding * type, in order to deduce maximum inode number used by fs. @@ -198,7 +186,7 @@ struct dentry *ovl_dentry_lower(struct dentry *dentry) return oe->numlower ? oe->lowerstack[0].dentry : NULL; } -struct ovl_layer *ovl_layer_lower(struct dentry *dentry) +const struct ovl_layer *ovl_layer_lower(struct dentry *dentry) { struct ovl_entry *oe = dentry->d_fsdata; @@ -576,7 +564,7 @@ int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, err = ovl_do_setxattr(upperdentry, name, value, size, 0); if (err == -EOPNOTSUPP) { - pr_warn("overlayfs: cannot set %s xattr on upper\n", name); + pr_warn("cannot set %s xattr on upper\n", name); ofs->noxattr = true; return xerr; } @@ -700,7 +688,7 @@ static void ovl_cleanup_index(struct dentry *dentry) inode = d_inode(upperdentry); if (!S_ISDIR(inode->i_mode) && inode->i_nlink != 1) { - pr_warn_ratelimited("overlayfs: cleanup linked index (%pd2, ino=%lu, nlink=%u)\n", + pr_warn_ratelimited("cleanup linked index (%pd2, ino=%lu, nlink=%u)\n", upperdentry, inode->i_ino, inode->i_nlink); /* * We either have a bug with persistent union nlink or a lower @@ -739,7 +727,7 @@ out: return; fail: - pr_err("overlayfs: cleanup index of '%pd2' failed (%i)\n", dentry, err); + pr_err("cleanup index of '%pd2' failed (%i)\n", dentry, err); goto out; } @@ -830,7 +818,7 @@ int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir) err_unlock: unlock_rename(workdir, upperdir); err: - pr_err("overlayfs: failed to lock workdir+upperdir\n"); + pr_err("failed to lock workdir+upperdir\n"); return -EIO; } @@ -852,7 +840,7 @@ int ovl_check_metacopy_xattr(struct dentry *dentry) return 1; out: - pr_warn_ratelimited("overlayfs: failed to get metacopy (%i)\n", res); + pr_warn_ratelimited("failed to get metacopy (%i)\n", res); return res; } @@ -899,7 +887,7 @@ ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value, return res; fail: - pr_warn_ratelimited("overlayfs: failed to get xattr %s: err=%zi)\n", + pr_warn_ratelimited("failed to get xattr %s: err=%zi)\n", name, res); kfree(buf); return res; @@ -931,7 +919,7 @@ char *ovl_get_redirect_xattr(struct dentry *dentry, int padding) return buf; invalid: - pr_warn_ratelimited("overlayfs: invalid redirect (%s)\n", buf); + pr_warn_ratelimited("invalid redirect (%s)\n", buf); res = -EINVAL; kfree(buf); return ERR_PTR(res); diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c index 96f1087e372c..c1dea9b8222e 100644 --- a/fs/proc/cpuinfo.c +++ b/fs/proc/cpuinfo.c @@ -16,16 +16,16 @@ static int cpuinfo_open(struct inode *inode, struct file *file) return seq_open(file, &cpuinfo_op); } -static const struct file_operations proc_cpuinfo_operations = { - .open = cpuinfo_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, +static const struct proc_ops cpuinfo_proc_ops = { + .proc_open = cpuinfo_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, }; static int __init proc_cpuinfo_init(void) { - proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations); + proc_create("cpuinfo", 0, NULL, &cpuinfo_proc_ops); return 0; } fs_initcall(proc_cpuinfo_init); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 074e9585c699..3faed94e4b65 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -473,7 +473,7 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, ent = __proc_create(&parent, name, S_IFDIR | mode, 2); if (ent) { ent->data = data; - ent->proc_fops = &proc_dir_operations; + ent->proc_dir_ops = &proc_dir_operations; ent->proc_iops = &proc_dir_inode_operations; ent = proc_register(parent, ent); } @@ -503,7 +503,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name) ent = __proc_create(&parent, name, mode, 2); if (ent) { ent->data = NULL; - ent->proc_fops = NULL; + ent->proc_dir_ops = NULL; ent->proc_iops = NULL; ent = proc_register(parent, ent); } @@ -533,25 +533,23 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, - const struct file_operations *proc_fops, void *data) + const struct proc_ops *proc_ops, void *data) { struct proc_dir_entry *p; - BUG_ON(proc_fops == NULL); - p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; - p->proc_fops = proc_fops; + p->proc_ops = proc_ops; return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_data); struct proc_dir_entry *proc_create(const char *name, umode_t mode, struct proc_dir_entry *parent, - const struct file_operations *proc_fops) + const struct proc_ops *proc_ops) { - return proc_create_data(name, mode, parent, proc_fops, NULL); + return proc_create_data(name, mode, parent, proc_ops, NULL); } EXPORT_SYMBOL(proc_create); @@ -573,11 +571,11 @@ static int proc_seq_release(struct inode *inode, struct file *file) return seq_release(inode, file); } -static const struct file_operations proc_seq_fops = { - .open = proc_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = proc_seq_release, +static const struct proc_ops proc_seq_ops = { + .proc_open = proc_seq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = proc_seq_release, }; struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode, @@ -589,7 +587,7 @@ struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode, p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; - p->proc_fops = &proc_seq_fops; + p->proc_ops = &proc_seq_ops; p->seq_ops = ops; p->state_size = state_size; return proc_register(parent, p); @@ -603,11 +601,11 @@ static int proc_single_open(struct inode *inode, struct file *file) return single_open(file, de->single_show, de->data); } -static const struct file_operations proc_single_fops = { - .open = proc_single_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, +static const struct proc_ops proc_single_ops = { + .proc_open = proc_single_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, }; struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode, @@ -619,7 +617,7 @@ struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode, p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; - p->proc_fops = &proc_single_fops; + p->proc_ops = &proc_single_ops; p->single_show = show; return proc_register(parent, p); } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index dbe43a50caf2..6da18316d209 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -163,7 +163,7 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) pdeo->closing = true; spin_unlock(&pde->pde_unload_lock); file = pdeo->file; - pde->proc_fops->release(file_inode(file), file); + pde->proc_ops->proc_release(file_inode(file), file); spin_lock(&pde->pde_unload_lock); /* After ->release. */ list_del(&pdeo->lh); @@ -200,12 +200,12 @@ static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) struct proc_dir_entry *pde = PDE(file_inode(file)); loff_t rv = -EINVAL; if (use_pde(pde)) { - typeof_member(struct file_operations, llseek) llseek; + typeof_member(struct proc_ops, proc_lseek) lseek; - llseek = pde->proc_fops->llseek; - if (!llseek) - llseek = default_llseek; - rv = llseek(file, offset, whence); + lseek = pde->proc_ops->proc_lseek; + if (!lseek) + lseek = default_llseek; + rv = lseek(file, offset, whence); unuse_pde(pde); } return rv; @@ -216,9 +216,9 @@ static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; if (use_pde(pde)) { - typeof_member(struct file_operations, read) read; + typeof_member(struct proc_ops, proc_read) read; - read = pde->proc_fops->read; + read = pde->proc_ops->proc_read; if (read) rv = read(file, buf, count, ppos); unuse_pde(pde); @@ -231,9 +231,9 @@ static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; if (use_pde(pde)) { - typeof_member(struct file_operations, write) write; + typeof_member(struct proc_ops, proc_write) write; - write = pde->proc_fops->write; + write = pde->proc_ops->proc_write; if (write) rv = write(file, buf, count, ppos); unuse_pde(pde); @@ -246,9 +246,9 @@ static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts) struct proc_dir_entry *pde = PDE(file_inode(file)); __poll_t rv = DEFAULT_POLLMASK; if (use_pde(pde)) { - typeof_member(struct file_operations, poll) poll; + typeof_member(struct proc_ops, proc_poll) poll; - poll = pde->proc_fops->poll; + poll = pde->proc_ops->proc_poll; if (poll) rv = poll(file, pts); unuse_pde(pde); @@ -261,9 +261,9 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; if (use_pde(pde)) { - typeof_member(struct file_operations, unlocked_ioctl) ioctl; + typeof_member(struct proc_ops, proc_ioctl) ioctl; - ioctl = pde->proc_fops->unlocked_ioctl; + ioctl = pde->proc_ops->proc_ioctl; if (ioctl) rv = ioctl(file, cmd, arg); unuse_pde(pde); @@ -277,9 +277,9 @@ static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; if (use_pde(pde)) { - typeof_member(struct file_operations, compat_ioctl) compat_ioctl; + typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl; - compat_ioctl = pde->proc_fops->compat_ioctl; + compat_ioctl = pde->proc_ops->proc_compat_ioctl; if (compat_ioctl) rv = compat_ioctl(file, cmd, arg); unuse_pde(pde); @@ -293,9 +293,9 @@ static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) struct proc_dir_entry *pde = PDE(file_inode(file)); int rv = -EIO; if (use_pde(pde)) { - typeof_member(struct file_operations, mmap) mmap; + typeof_member(struct proc_ops, proc_mmap) mmap; - mmap = pde->proc_fops->mmap; + mmap = pde->proc_ops->proc_mmap; if (mmap) rv = mmap(file, vma); unuse_pde(pde); @@ -312,9 +312,9 @@ proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, unsigned long rv = -EIO; if (use_pde(pde)) { - typeof_member(struct file_operations, get_unmapped_area) get_area; + typeof_member(struct proc_ops, proc_get_unmapped_area) get_area; - get_area = pde->proc_fops->get_unmapped_area; + get_area = pde->proc_ops->proc_get_unmapped_area; #ifdef CONFIG_MMU if (!get_area) get_area = current->mm->get_unmapped_area; @@ -333,8 +333,8 @@ static int proc_reg_open(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); int rv = 0; - typeof_member(struct file_operations, open) open; - typeof_member(struct file_operations, release) release; + typeof_member(struct proc_ops, proc_open) open; + typeof_member(struct proc_ops, proc_release) release; struct pde_opener *pdeo; /* @@ -351,7 +351,7 @@ static int proc_reg_open(struct inode *inode, struct file *file) if (!use_pde(pde)) return -ENOENT; - release = pde->proc_fops->release; + release = pde->proc_ops->proc_release; if (release) { pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL); if (!pdeo) { @@ -360,7 +360,7 @@ static int proc_reg_open(struct inode *inode, struct file *file) } } - open = pde->proc_fops->open; + open = pde->proc_ops->proc_open; if (open) rv = open(inode, file); @@ -468,21 +468,23 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) inode->i_size = de->size; if (de->nlink) set_nlink(inode, de->nlink); - WARN_ON(!de->proc_iops); - inode->i_op = de->proc_iops; - if (de->proc_fops) { - if (S_ISREG(inode->i_mode)) { + + if (S_ISREG(inode->i_mode)) { + inode->i_op = de->proc_iops; + inode->i_fop = &proc_reg_file_ops; #ifdef CONFIG_COMPAT - if (!de->proc_fops->compat_ioctl) - inode->i_fop = - &proc_reg_file_ops_no_compat; - else -#endif - inode->i_fop = &proc_reg_file_ops; - } else { - inode->i_fop = de->proc_fops; + if (!de->proc_ops->proc_compat_ioctl) { + inode->i_fop = &proc_reg_file_ops_no_compat; } - } +#endif + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = de->proc_iops; + inode->i_fop = de->proc_dir_ops; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = de->proc_iops; + inode->i_fop = NULL; + } else + BUG(); } else pde_put(de); return inode; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 0f3b557c9b77..41587276798e 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -39,7 +39,10 @@ struct proc_dir_entry { spinlock_t pde_unload_lock; struct completion *pde_unload_completion; const struct inode_operations *proc_iops; - const struct file_operations *proc_fops; + union { + const struct proc_ops *proc_ops; + const struct file_operations *proc_dir_ops; + }; const struct dentry_operations *proc_dops; union { const struct seq_operations *seq_ops; diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index e2ed8e08cc7a..8ba492d44e68 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -574,11 +574,11 @@ static int release_kcore(struct inode *inode, struct file *file) return 0; } -static const struct file_operations proc_kcore_operations = { - .read = read_kcore, - .open = open_kcore, - .release = release_kcore, - .llseek = default_llseek, +static const struct proc_ops kcore_proc_ops = { + .proc_read = read_kcore, + .proc_open = open_kcore, + .proc_release = release_kcore, + .proc_lseek = default_llseek, }; /* just remember that we have to update kcore */ @@ -637,8 +637,7 @@ static void __init add_modules_range(void) static int __init proc_kcore_init(void) { - proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, - &proc_kcore_operations); + proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &kcore_proc_ops); if (!proc_root_kcore) { pr_err("couldn't create /proc/kcore\n"); return 0; /* Always returns 0. */ diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index 4f4a2abb225e..ec1b7d2fb773 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -49,17 +49,17 @@ static __poll_t kmsg_poll(struct file *file, poll_table *wait) } -static const struct file_operations proc_kmsg_operations = { - .read = kmsg_read, - .poll = kmsg_poll, - .open = kmsg_open, - .release = kmsg_release, - .llseek = generic_file_llseek, +static const struct proc_ops kmsg_proc_ops = { + .proc_read = kmsg_read, + .proc_poll = kmsg_poll, + .proc_open = kmsg_open, + .proc_release = kmsg_release, + .proc_lseek = generic_file_llseek, }; static int __init proc_kmsg_init(void) { - proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations); + proc_create("kmsg", S_IRUSR, NULL, &kmsg_proc_ops); return 0; } fs_initcall(proc_kmsg_init); diff --git a/fs/proc/page.c b/fs/proc/page.c index 7c952ee732e6..f909243d4a66 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -21,6 +21,21 @@ #define KPMMASK (KPMSIZE - 1) #define KPMBITS (KPMSIZE * BITS_PER_BYTE) +static inline unsigned long get_max_dump_pfn(void) +{ +#ifdef CONFIG_SPARSEMEM + /* + * The memmap of early sections is completely populated and marked + * online even if max_pfn does not fall on a section boundary - + * pfn_to_online_page() will succeed on all pages. Allow inspecting + * these memmaps. + */ + return round_up(max_pfn, PAGES_PER_SECTION); +#else + return max_pfn; +#endif +} + /* /proc/kpagecount - an array exposing page counts * * Each entry is a u64 representing the corresponding @@ -29,6 +44,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { + const unsigned long max_dump_pfn = get_max_dump_pfn(); u64 __user *out = (u64 __user *)buf; struct page *ppage; unsigned long src = *ppos; @@ -37,9 +53,11 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, u64 pcount; pfn = src / KPMSIZE; - count = min_t(size_t, count, (max_pfn * KPMSIZE) - src); if (src & KPMMASK || count & KPMMASK) return -EINVAL; + if (src >= max_dump_pfn * KPMSIZE) + return 0; + count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src); while (count > 0) { /* @@ -71,9 +89,9 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, return ret; } -static const struct file_operations proc_kpagecount_operations = { - .llseek = mem_lseek, - .read = kpagecount_read, +static const struct proc_ops kpagecount_proc_ops = { + .proc_lseek = mem_lseek, + .proc_read = kpagecount_read, }; /* /proc/kpageflags - an array exposing page flags @@ -206,6 +224,7 @@ u64 stable_page_flags(struct page *page) static ssize_t kpageflags_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { + const unsigned long max_dump_pfn = get_max_dump_pfn(); u64 __user *out = (u64 __user *)buf; struct page *ppage; unsigned long src = *ppos; @@ -213,9 +232,11 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, ssize_t ret = 0; pfn = src / KPMSIZE; - count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); if (src & KPMMASK || count & KPMMASK) return -EINVAL; + if (src >= max_dump_pfn * KPMSIZE) + return 0; + count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src); while (count > 0) { /* @@ -242,15 +263,16 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, return ret; } -static const struct file_operations proc_kpageflags_operations = { - .llseek = mem_lseek, - .read = kpageflags_read, +static const struct proc_ops kpageflags_proc_ops = { + .proc_lseek = mem_lseek, + .proc_read = kpageflags_read, }; #ifdef CONFIG_MEMCG static ssize_t kpagecgroup_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { + const unsigned long max_dump_pfn = get_max_dump_pfn(); u64 __user *out = (u64 __user *)buf; struct page *ppage; unsigned long src = *ppos; @@ -259,9 +281,11 @@ static ssize_t kpagecgroup_read(struct file *file, char __user *buf, u64 ino; pfn = src / KPMSIZE; - count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); if (src & KPMMASK || count & KPMMASK) return -EINVAL; + if (src >= max_dump_pfn * KPMSIZE) + return 0; + count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src); while (count > 0) { /* @@ -293,18 +317,18 @@ static ssize_t kpagecgroup_read(struct file *file, char __user *buf, return ret; } -static const struct file_operations proc_kpagecgroup_operations = { - .llseek = mem_lseek, - .read = kpagecgroup_read, +static const struct proc_ops kpagecgroup_proc_ops = { + .proc_lseek = mem_lseek, + .proc_read = kpagecgroup_read, }; #endif /* CONFIG_MEMCG */ static int __init proc_page_init(void) { - proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations); - proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations); + proc_create("kpagecount", S_IRUSR, NULL, &kpagecount_proc_ops); + proc_create("kpageflags", S_IRUSR, NULL, &kpageflags_proc_ops); #ifdef CONFIG_MEMCG - proc_create("kpagecgroup", S_IRUSR, NULL, &proc_kpagecgroup_operations); + proc_create("kpagecgroup", S_IRUSR, NULL, &kpagecgroup_proc_ops); #endif return 0; } diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 76ae278df1c4..4888c5224442 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -90,12 +90,12 @@ static int seq_release_net(struct inode *ino, struct file *f) return 0; } -static const struct file_operations proc_net_seq_fops = { - .open = seq_open_net, - .read = seq_read, - .write = proc_simple_write, - .llseek = seq_lseek, - .release = seq_release_net, +static const struct proc_ops proc_net_seq_ops = { + .proc_open = seq_open_net, + .proc_read = seq_read, + .proc_write = proc_simple_write, + .proc_lseek = seq_lseek, + .proc_release = seq_release_net, }; struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, @@ -108,7 +108,7 @@ struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, if (!p) return NULL; pde_force_lookup(p); - p->proc_fops = &proc_net_seq_fops; + p->proc_ops = &proc_net_seq_ops; p->seq_ops = ops; p->state_size = state_size; return proc_register(parent, p); @@ -152,7 +152,7 @@ struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode if (!p) return NULL; pde_force_lookup(p); - p->proc_fops = &proc_net_seq_fops; + p->proc_ops = &proc_net_seq_ops; p->seq_ops = ops; p->state_size = state_size; p->write = write; @@ -183,12 +183,12 @@ static int single_release_net(struct inode *ino, struct file *f) return single_release(ino, f); } -static const struct file_operations proc_net_single_fops = { - .open = single_open_net, - .read = seq_read, - .write = proc_simple_write, - .llseek = seq_lseek, - .release = single_release_net, +static const struct proc_ops proc_net_single_ops = { + .proc_open = single_open_net, + .proc_read = seq_read, + .proc_write = proc_simple_write, + .proc_lseek = seq_lseek, + .proc_release = single_release_net, }; struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode, @@ -201,7 +201,7 @@ struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode, if (!p) return NULL; pde_force_lookup(p); - p->proc_fops = &proc_net_single_fops; + p->proc_ops = &proc_net_single_ops; p->single_show = show; return proc_register(parent, p); } @@ -244,7 +244,7 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo if (!p) return NULL; pde_force_lookup(p); - p->proc_fops = &proc_net_single_fops; + p->proc_ops = &proc_net_single_ops; p->single_show = show; p->write = write; return proc_register(parent, p); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index d80989b6c344..c75bb4632ed1 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1720,7 +1720,7 @@ int __init proc_sys_init(void) proc_sys_root = proc_mkdir("sys", NULL); proc_sys_root->proc_iops = &proc_sys_dir_operations; - proc_sys_root->proc_fops = &proc_sys_dir_file_operations; + proc_sys_root->proc_dir_ops = &proc_sys_dir_file_operations; proc_sys_root->nlink = 0; return sysctl_init(); diff --git a/fs/proc/root.c b/fs/proc/root.c index 0b7c8dffc9ae..72c07a34cff0 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -292,7 +292,7 @@ struct proc_dir_entry proc_root = { .nlink = 2, .refcnt = REFCOUNT_INIT(1), .proc_iops = &proc_root_inode_operations, - .proc_fops = &proc_root_operations, + .proc_dir_ops = &proc_root_operations, .parent = &proc_root, .subdir = RB_ROOT, .name = "/proc", diff --git a/fs/proc/stat.c b/fs/proc/stat.c index fd931d3e77be..0449edf460f5 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -223,16 +223,16 @@ static int stat_open(struct inode *inode, struct file *file) return single_open_size(file, show_stat, NULL, size); } -static const struct file_operations proc_stat_operations = { - .open = stat_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, +static const struct proc_ops stat_proc_ops = { + .proc_open = stat_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, }; static int __init proc_stat_init(void) { - proc_create("stat", 0, NULL, &proc_stat_operations); + proc_create("stat", 0, NULL, &stat_proc_ops); return 0; } fs_initcall(proc_stat_init); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 9442631fd4af..3ba9ae83bff5 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -505,7 +505,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, #ifdef CONFIG_SHMEM static int smaps_pte_hole(unsigned long addr, unsigned long end, - struct mm_walk *walk) + __always_unused int depth, struct mm_walk *walk) { struct mem_size_stats *mss = walk->private; @@ -1282,7 +1282,7 @@ static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, } static int pagemap_pte_hole(unsigned long start, unsigned long end, - struct mm_walk *walk) + __always_unused int depth, struct mm_walk *walk) { struct pagemapread *pm = walk->private; unsigned long addr = start; diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 7b13988796e1..7dc800cce354 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -667,10 +667,10 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) } #endif -static const struct file_operations proc_vmcore_operations = { - .read = read_vmcore, - .llseek = default_llseek, - .mmap = mmap_vmcore, +static const struct proc_ops vmcore_proc_ops = { + .proc_read = read_vmcore, + .proc_lseek = default_llseek, + .proc_mmap = mmap_vmcore, }; static struct vmcore* __init get_new_element(void) @@ -1555,7 +1555,7 @@ static int __init vmcore_init(void) elfcorehdr_free(elfcorehdr_addr); elfcorehdr_addr = ELFCORE_ADDR_ERR; - proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations); + proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &vmcore_proc_ops); if (proc_vmcore) proc_vmcore->size = vmcore_size; return 0; diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index 53429c29c784..58fc2a7c7fd1 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -22,8 +22,6 @@ MODULE_AUTHOR("Jan Kara"); MODULE_DESCRIPTION("Quota format v2 support"); MODULE_LICENSE("GPL"); -#define __QUOTA_V2_PARANOIA - static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot); static void v2r0_disk2memdqb(struct dquot *dquot, void *dp); static int v2r0_is_id(void *dp, struct dquot *dquot); diff --git a/fs/quota/quotaio_v1.h b/fs/quota/quotaio_v1.h index bd11e2c08119..31dca9a89176 100644 --- a/fs/quota/quotaio_v1.h +++ b/fs/quota/quotaio_v1.h @@ -25,8 +25,10 @@ struct v1_disk_dqblk { __u32 dqb_ihardlimit; /* absolute limit on allocated inodes */ __u32 dqb_isoftlimit; /* preferred inode limit */ __u32 dqb_curinodes; /* current # allocated inodes */ - time_t dqb_btime; /* time limit for excessive disk use */ - time_t dqb_itime; /* time limit for excessive inode use */ + + /* below fields differ in length on 32-bit vs 64-bit architectures */ + unsigned long dqb_btime; /* time limit for excessive disk use */ + unsigned long dqb_itime; /* time limit for excessive inode use */ }; #define v1_dqoff(UID) ((loff_t)((UID) * sizeof (struct v1_disk_dqblk))) diff --git a/fs/read_write.c b/fs/read_write.c index 7458fccc59e1..59d819c5b92e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -939,6 +939,34 @@ out: return ret; } +ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, + struct iov_iter *iter) +{ + size_t tot_len; + ssize_t ret = 0; + + if (!file->f_op->read_iter) + return -EINVAL; + if (!(file->f_mode & FMODE_READ)) + return -EBADF; + if (!(file->f_mode & FMODE_CAN_READ)) + return -EINVAL; + + tot_len = iov_iter_count(iter); + if (!tot_len) + goto out; + ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len); + if (ret < 0) + return ret; + + ret = call_read_iter(file, iocb, iter); +out: + if (ret >= 0) + fsnotify_access(file); + return ret; +} +EXPORT_SYMBOL(vfs_iocb_iter_read); + ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { @@ -975,6 +1003,34 @@ static ssize_t do_iter_write(struct file *file, struct iov_iter *iter, return ret; } +ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, + struct iov_iter *iter) +{ + size_t tot_len; + ssize_t ret = 0; + + if (!file->f_op->write_iter) + return -EINVAL; + if (!(file->f_mode & FMODE_WRITE)) + return -EBADF; + if (!(file->f_mode & FMODE_CAN_WRITE)) + return -EINVAL; + + tot_len = iov_iter_count(iter); + if (!tot_len) + return 0; + ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len); + if (ret < 0) + return ret; + + ret = call_write_iter(file, iocb, iter); + if (ret > 0) + fsnotify_modify(file); + + return ret; +} +EXPORT_SYMBOL(vfs_iocb_iter_write); + ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags) { diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 4b3e3e73b512..072156c4f895 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -56,8 +56,6 @@ /* gets a struct reiserfs_journal_list * from a list head */ #define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ j_list)) -#define JOURNAL_WORK_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ - j_working_list)) /* must be correct to keep the desc and commit structs at 4k */ #define JOURNAL_TRANS_HALF 1018 diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index f2cf3441fdfc..ff336513c254 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -63,7 +63,6 @@ static int show_version(struct seq_file *m, void *unused) #define MAP( i ) D4C( objectid_map( sb, rs )[ i ] ) #define DJF( x ) le32_to_cpu( rs -> x ) -#define DJV( x ) le32_to_cpu( s_v1 -> x ) #define DJP( x ) le32_to_cpu( jp -> x ) #define JF( x ) ( r -> s_journal -> x ) diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index da9ebe33882b..8bf88d690729 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -918,12 +918,6 @@ int comp_items(const struct item_head *stored_ih, const struct treepath *path) return memcmp(stored_ih, ih, IH_SIZE); } -/* unformatted nodes are not logged anymore, ever. This is safe now */ -#define held_by_others(bh) (atomic_read(&(bh)->b_count) > 1) - -/* block can not be forgotten as it is in I/O or held by someone */ -#define block_in_use(bh) (buffer_locked(bh) || (held_by_others(bh))) - /* prepare for delete or cut of direct item */ static inline int prepare_for_direct_item(struct treepath *path, struct item_head *le_ih, @@ -2246,7 +2240,8 @@ error_out: /* also releases the path */ unfix_nodes(&s_ins_balance); #ifdef REISERQUOTA_DEBUG - reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE, + if (inode) + reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE, "reiserquota insert_item(): freeing %u id=%u type=%c", quota_bytes, inode->i_uid, head2type(ih)); #endif diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 3244037b1286..a6bce5b1fb1d 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -629,6 +629,7 @@ static void reiserfs_put_super(struct super_block *s) reiserfs_write_unlock(s); mutex_destroy(&REISERFS_SB(s)->lock); destroy_workqueue(REISERFS_SB(s)->commit_wq); + kfree(REISERFS_SB(s)->s_jdev); kfree(s->s_fs_info); s->s_fs_info = NULL; } @@ -1947,7 +1948,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) if (!sbi->s_jdev) { SWARN(silent, s, "", "Cannot allocate memory for " "journal device name"); - goto error; + goto error_unlocked; } } #ifdef CONFIG_QUOTA @@ -2240,6 +2241,7 @@ error_unlocked: kfree(qf_names[j]); } #endif + kfree(sbi->s_jdev); kfree(sbi); s->s_fs_info = NULL; diff --git a/fs/stat.c b/fs/stat.c index c38e4c2e1221..030008796479 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -21,6 +21,8 @@ #include <linux/uaccess.h> #include <asm/unistd.h> +#include "internal.h" + /** * generic_fillattr - Fill in the basic attributes from the inode struct * @inode: Inode to use as the source @@ -150,6 +152,23 @@ int vfs_statx_fd(unsigned int fd, struct kstat *stat, } EXPORT_SYMBOL(vfs_statx_fd); +inline unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, int flags) +{ + if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | + AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0) + return -EINVAL; + + *lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT; + if (flags & AT_SYMLINK_NOFOLLOW) + *lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_NO_AUTOMOUNT) + *lookup_flags &= ~LOOKUP_AUTOMOUNT; + if (flags & AT_EMPTY_PATH) + *lookup_flags |= LOOKUP_EMPTY; + + return 0; +} + /** * vfs_statx - Get basic and extra attributes by filename * @dfd: A file descriptor representing the base dir for a relative filename @@ -170,19 +189,10 @@ int vfs_statx(int dfd, const char __user *filename, int flags, { struct path path; int error = -EINVAL; - unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT; + unsigned lookup_flags; - if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | - AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0) + if (vfs_stat_set_lookup_flags(&lookup_flags, flags)) return -EINVAL; - - if (flags & AT_SYMLINK_NOFOLLOW) - lookup_flags &= ~LOOKUP_FOLLOW; - if (flags & AT_NO_AUTOMOUNT) - lookup_flags &= ~LOOKUP_AUTOMOUNT; - if (flags & AT_EMPTY_PATH) - lookup_flags |= LOOKUP_EMPTY; - retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (error) @@ -523,7 +533,7 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename, } #endif /* __ARCH_WANT_STAT64 || __ARCH_WANT_COMPAT_STAT64 */ -static noinline_for_stack int +noinline_for_stack int cp_statx(const struct kstat *stat, struct statx __user *buffer) { struct statx tmp; diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index d41c21fef138..c4ab045926b7 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -449,7 +449,7 @@ int __compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, } link = kernfs_create_link(kobj->sd, target_name, entry); - if (IS_ERR(link) && PTR_ERR(link) == -EEXIST) + if (PTR_ERR(link) == -EEXIST) sysfs_warn_dup(kobj->sd, target_name); kernfs_put(entry); diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 0caa151cae4e..0ee8c6dfb036 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -330,7 +330,10 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) parent = tracefs_mount->mnt_root; inode_lock(parent->d_inode); - dentry = lookup_one_len(name, parent, strlen(name)); + if (unlikely(IS_DEADDIR(parent->d_inode))) + dentry = ERR_PTR(-ENOENT); + else + dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry) && dentry->d_inode) { dput(dentry); dentry = ERR_PTR(-EEXIST); @@ -499,122 +502,27 @@ __init struct dentry *tracefs_create_instance_dir(const char *name, return dentry; } -static int __tracefs_remove(struct dentry *dentry, struct dentry *parent) +static void remove_one(struct dentry *victim) { - int ret = 0; - - if (simple_positive(dentry)) { - if (dentry->d_inode) { - dget(dentry); - switch (dentry->d_inode->i_mode & S_IFMT) { - case S_IFDIR: - ret = simple_rmdir(parent->d_inode, dentry); - if (!ret) - fsnotify_rmdir(parent->d_inode, dentry); - break; - default: - simple_unlink(parent->d_inode, dentry); - fsnotify_unlink(parent->d_inode, dentry); - break; - } - if (!ret) - d_delete(dentry); - dput(dentry); - } - } - return ret; -} - -/** - * tracefs_remove - removes a file or directory from the tracefs filesystem - * @dentry: a pointer to a the dentry of the file or directory to be - * removed. - * - * This function removes a file or directory in tracefs that was previously - * created with a call to another tracefs function (like - * tracefs_create_file() or variants thereof.) - */ -void tracefs_remove(struct dentry *dentry) -{ - struct dentry *parent; - int ret; - - if (IS_ERR_OR_NULL(dentry)) - return; - - parent = dentry->d_parent; - inode_lock(parent->d_inode); - ret = __tracefs_remove(dentry, parent); - inode_unlock(parent->d_inode); - if (!ret) - simple_release_fs(&tracefs_mount, &tracefs_mount_count); + simple_release_fs(&tracefs_mount, &tracefs_mount_count); } /** - * tracefs_remove_recursive - recursively removes a directory + * tracefs_remove - recursively removes a directory * @dentry: a pointer to a the dentry of the directory to be removed. * * This function recursively removes a directory tree in tracefs that * was previously created with a call to another tracefs function * (like tracefs_create_file() or variants thereof.) */ -void tracefs_remove_recursive(struct dentry *dentry) +void tracefs_remove(struct dentry *dentry) { - struct dentry *child, *parent; - if (IS_ERR_OR_NULL(dentry)) return; - parent = dentry; - down: - inode_lock(parent->d_inode); - loop: - /* - * The parent->d_subdirs is protected by the d_lock. Outside that - * lock, the child can be unlinked and set to be freed which can - * use the d_u.d_child as the rcu head and corrupt this list. - */ - spin_lock(&parent->d_lock); - list_for_each_entry(child, &parent->d_subdirs, d_child) { - if (!simple_positive(child)) - continue; - - /* perhaps simple_empty(child) makes more sense */ - if (!list_empty(&child->d_subdirs)) { - spin_unlock(&parent->d_lock); - inode_unlock(parent->d_inode); - parent = child; - goto down; - } - - spin_unlock(&parent->d_lock); - - if (!__tracefs_remove(child, parent)) - simple_release_fs(&tracefs_mount, &tracefs_mount_count); - - /* - * The parent->d_lock protects agaist child from unlinking - * from d_subdirs. When releasing the parent->d_lock we can - * no longer trust that the next pointer is valid. - * Restart the loop. We'll skip this one with the - * simple_positive() check. - */ - goto loop; - } - spin_unlock(&parent->d_lock); - - inode_unlock(parent->d_inode); - child = parent; - parent = parent->d_parent; - inode_lock(parent->d_inode); - - if (child != dentry) - /* go up */ - goto loop; - - if (!__tracefs_remove(child, parent)) - simple_release_fs(&tracefs_mount, &tracefs_mount_count); - inode_unlock(parent->d_inode); + simple_pin_fs(&trace_fs_type, &tracefs_mount, &tracefs_mount_count); + simple_recursive_removal(dentry, remove_one); + simple_release_fs(&tracefs_mount, &tracefs_mount_count); } /** diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index c8e8f50c6054..743928efffc1 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -786,7 +786,9 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, if (page_offset > end_index) break; - page = find_or_create_page(mapping, page_offset, ra_gfp_mask); + page = pagecache_get_page(mapping, page_offset, + FGP_LOCK|FGP_ACCESSED|FGP_CREAT|FGP_NOWAIT, + ra_gfp_mask); if (!page) break; if (!PageUptodate(page)) @@ -1078,18 +1080,12 @@ static void do_attr_changes(struct inode *inode, const struct iattr *attr) inode->i_uid = attr->ia_uid; if (attr->ia_valid & ATTR_GID) inode->i_gid = attr->ia_gid; - if (attr->ia_valid & ATTR_ATIME) { - inode->i_atime = timestamp_truncate(attr->ia_atime, - inode); - } - if (attr->ia_valid & ATTR_MTIME) { - inode->i_mtime = timestamp_truncate(attr->ia_mtime, - inode); - } - if (attr->ia_valid & ATTR_CTIME) { - inode->i_ctime = timestamp_truncate(attr->ia_ctime, - inode); - } + if (attr->ia_valid & ATTR_ATIME) + inode->i_atime = attr->ia_atime; + if (attr->ia_valid & ATTR_MTIME) + inode->i_mtime = attr->ia_mtime; + if (attr->ia_valid & ATTR_CTIME) + inode->i_ctime = attr->ia_ctime; if (attr->ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 5dc5abca11c7..d49fc04f2d7d 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -17,10 +17,14 @@ #include "ubifs.h" /* Need to be kept consistent with checked flags in ioctl2ubifs() */ -#define UBIFS_SUPPORTED_IOCTL_FLAGS \ +#define UBIFS_SETTABLE_IOCTL_FLAGS \ (FS_COMPR_FL | FS_SYNC_FL | FS_APPEND_FL | \ FS_IMMUTABLE_FL | FS_DIRSYNC_FL) +/* Need to be kept consistent with checked flags in ubifs2ioctl() */ +#define UBIFS_GETTABLE_IOCTL_FLAGS \ + (UBIFS_SETTABLE_IOCTL_FLAGS | FS_ENCRYPT_FL) + /** * ubifs_set_inode_flags - set VFS inode flags. * @inode: VFS inode to set flags for @@ -91,6 +95,8 @@ static int ubifs2ioctl(int ubifs_flags) ioctl_flags |= FS_IMMUTABLE_FL; if (ubifs_flags & UBIFS_DIRSYNC_FL) ioctl_flags |= FS_DIRSYNC_FL; + if (ubifs_flags & UBIFS_CRYPT_FL) + ioctl_flags |= FS_ENCRYPT_FL; return ioctl_flags; } @@ -113,7 +119,8 @@ static int setflags(struct inode *inode, int flags) if (err) goto out_unlock; - ui->flags = ioctl2ubifs(flags); + ui->flags &= ~ioctl2ubifs(UBIFS_SETTABLE_IOCTL_FLAGS); + ui->flags |= ioctl2ubifs(flags); ubifs_set_inode_flags(inode); inode->i_ctime = current_time(inode); release = ui->dirty; @@ -155,8 +162,9 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (get_user(flags, (int __user *) arg)) return -EFAULT; - if (flags & ~UBIFS_SUPPORTED_IOCTL_FLAGS) + if (flags & ~UBIFS_GETTABLE_IOCTL_FLAGS) return -EOPNOTSUPP; + flags &= UBIFS_SETTABLE_IOCTL_FLAGS; if (!S_ISDIR(inode->i_mode)) flags &= ~FS_DIRSYNC_FL; diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 54d6db61106f..edf43ddd7dce 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -129,7 +129,7 @@ static void __orphan_drop(struct ubifs_info *c, struct ubifs_orphan *o) static void orphan_delete(struct ubifs_info *c, struct ubifs_orphan *orph) { if (orph->del) { - dbg_gen("deleted twice ino %lu", orph->inum); + dbg_gen("deleted twice ino %lu", (unsigned long)orph->inum); return; } @@ -137,7 +137,7 @@ static void orphan_delete(struct ubifs_info *c, struct ubifs_orphan *orph) orph->del = 1; orph->dnext = c->orph_dnext; c->orph_dnext = orph; - dbg_gen("delete later ino %lu", orph->inum); + dbg_gen("delete later ino %lu", (unsigned long)orph->inum); return; } diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 2b7c04bf8983..4b4b65b48c57 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -84,7 +84,6 @@ static int create_default_filesystem(struct ubifs_info *c) int idx_node_size; long long tmp64, main_bytes; __le64 tmp_le64; - __le32 tmp_le32; struct timespec64 ts; u8 hash[UBIFS_HASH_ARR_SZ]; u8 hash_lpt[UBIFS_HASH_ARR_SZ]; @@ -161,7 +160,7 @@ static int create_default_filesystem(struct ubifs_info *c) sup = kzalloc(ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size), GFP_KERNEL); mst = kzalloc(c->mst_node_alsz, GFP_KERNEL); idx_node_size = ubifs_idx_node_sz(c, 1); - idx = kzalloc(ALIGN(tmp, c->min_io_size), GFP_KERNEL); + idx = kzalloc(ALIGN(idx_node_size, c->min_io_size), GFP_KERNEL); ino = kzalloc(ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size), GFP_KERNEL); cs = kzalloc(ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size), GFP_KERNEL); @@ -291,16 +290,14 @@ static int create_default_filesystem(struct ubifs_info *c) ino->creat_sqnum = cpu_to_le64(++c->max_sqnum); ino->nlink = cpu_to_le32(2); - ktime_get_real_ts64(&ts); - ts = timespec64_trunc(ts, DEFAULT_TIME_GRAN); + ktime_get_coarse_real_ts64(&ts); tmp_le64 = cpu_to_le64(ts.tv_sec); ino->atime_sec = tmp_le64; ino->ctime_sec = tmp_le64; ino->mtime_sec = tmp_le64; - tmp_le32 = cpu_to_le32(ts.tv_nsec); - ino->atime_nsec = tmp_le32; - ino->ctime_nsec = tmp_le32; - ino->mtime_nsec = tmp_le32; + ino->atime_nsec = 0; + ino->ctime_nsec = 0; + ino->mtime_nsec = 0; ino->mode = cpu_to_le32(S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO); ino->size = cpu_to_le64(UBIFS_INO_NODE_SZ); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 5e1e8ec0589e..7fc2f3f07c16 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1599,6 +1599,7 @@ out_free: vfree(c->ileb_buf); vfree(c->sbuf); kfree(c->bottom_up_buf); + kfree(c->sup_node); ubifs_debugging_exit(c); return err; } @@ -1641,6 +1642,7 @@ static void ubifs_umount(struct ubifs_info *c) vfree(c->ileb_buf); vfree(c->sbuf); kfree(c->bottom_up_buf); + kfree(c->sup_node); ubifs_debugging_exit(c); } diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h index fb7f2c7bec9c..3fd85464abd5 100644 --- a/fs/udf/ecma_167.h +++ b/fs/udf/ecma_167.h @@ -4,7 +4,8 @@ * This file is based on ECMA-167 3rd edition (June 1997) * http://www.ecma.ch * - * Copyright (c) 2001-2002 Ben Fennema <bfennema@falcon.csc.calpoly.edu> + * Copyright (c) 2001-2002 Ben Fennema + * Copyright (c) 2017-2019 Pali Rohár <pali.rohar@gmail.com> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -32,11 +33,19 @@ * SUCH DAMAGE. */ +/** + * @file + * ECMA-167r3 defines and structure definitions + */ + #include <linux/types.h> #ifndef _ECMA_167_H #define _ECMA_167_H 1 +/* Character sets and coding - d-characters (ECMA 167r3 1/7.2) */ +typedef uint8_t dchars; + /* Character set specification (ECMA 167r3 1/7.2.1) */ struct charspec { uint8_t charSetType; @@ -54,6 +63,7 @@ struct charspec { #define CHARSPEC_TYPE_CS7 0x07 /* (1/7.2.9) */ #define CHARSPEC_TYPE_CS8 0x08 /* (1/7.2.10) */ +/* Fixed-length character fields - d-string (EMCA 167r3 1/7.2.12) */ typedef uint8_t dstring; /* Timestamp (ECMA 167r3 1/7.3) */ @@ -85,22 +95,8 @@ struct regid { } __packed; /* Flags (ECMA 167r3 1/7.4.1) */ -#define ENTITYID_FLAGS_DIRTY 0x00 -#define ENTITYID_FLAGS_PROTECTED 0x01 - -/* OSTA UDF 2.1.5.2 */ -#define UDF_ID_COMPLIANT "*OSTA UDF Compliant" - -/* OSTA UDF 2.1.5.3 */ -struct domainEntityIDSuffix { - uint16_t revision; - uint8_t flags; - uint8_t reserved[5]; -}; - -/* OSTA UDF 2.1.5.3 */ -#define ENTITYIDSUFFIX_FLAGS_HARDWRITEPROTECT 0 -#define ENTITYIDSUFFIX_FLAGS_SOFTWRITEPROTECT 1 +#define ENTITYID_FLAGS_DIRTY 0x01 +#define ENTITYID_FLAGS_PROTECTED 0x02 /* Volume Structure Descriptor (ECMA 167r3 2/9.1) */ #define VSD_STD_ID_LEN 5 @@ -202,6 +198,13 @@ struct NSRDesc { uint8_t structData[2040]; } __packed; +/* Generic Descriptor */ +struct genericDesc { + struct tag descTag; + __le32 volDescSeqNum; + uint8_t reserved[492]; +} __packed; + /* Primary Volume Descriptor (ECMA 167r3 3/10.1) */ struct primaryVolDesc { struct tag descTag; @@ -316,7 +319,7 @@ struct genericPartitionMap { /* Partition Map Type (ECMA 167r3 3/10.7.1.1) */ #define GP_PARTITION_MAP_TYPE_UNDEF 0x00 -#define GP_PARTIITON_MAP_TYPE_1 0x01 +#define GP_PARTITION_MAP_TYPE_1 0x01 #define GP_PARTITION_MAP_TYPE_2 0x02 /* Type 1 Partition Map (ECMA 167r3 3/10.7.2) */ @@ -723,6 +726,7 @@ struct appUseExtAttr { #define EXTATTR_DEV_SPEC 12 #define EXTATTR_IMP_USE 2048 #define EXTATTR_APP_USE 65536 +#define EXTATTR_SUBTYPE 1 /* Unallocated Space Entry (ECMA 167r3 4/14.11) */ struct unallocSpaceEntry { @@ -754,10 +758,12 @@ struct partitionIntegrityEntry { /* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */ /* Extent Length (ECMA 167r3 4/14.14.1.1) */ +#define EXT_LENGTH_MASK 0x3FFFFFFF +#define EXT_TYPE_MASK 0xC0000000 #define EXT_RECORDED_ALLOCATED 0x00000000 #define EXT_NOT_RECORDED_ALLOCATED 0x40000000 #define EXT_NOT_RECORDED_NOT_ALLOCATED 0x80000000 -#define EXT_NEXT_EXTENT_ALLOCDECS 0xC0000000 +#define EXT_NEXT_EXTENT_ALLOCDESCS 0xC0000000 /* Long Allocation Descriptor (ECMA 167r3 4/14.14.2) */ @@ -774,7 +780,7 @@ struct pathComponent { uint8_t componentType; uint8_t lengthComponentIdent; __le16 componentFileVersionNum; - dstring componentIdent[0]; + dchars componentIdent[0]; } __packed; /* File Entry (ECMA 167r3 4/14.17) */ diff --git a/fs/udf/inode.c b/fs/udf/inode.c index ea80036d7897..e875bc5668ee 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1981,10 +1981,10 @@ int udf_setup_indirect_aext(struct inode *inode, udf_pblk_t block, __udf_add_aext(inode, &nepos, &cp_loc, cp_len, 1); udf_write_aext(inode, epos, &nepos.block, - sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDECS, 0); + sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDESCS, 0); } else { __udf_add_aext(inode, epos, &nepos.block, - sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDECS, 0); + sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDESCS, 0); } brelse(epos->bh); @@ -2143,7 +2143,7 @@ int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, unsigned int indirections = 0; while ((etype = udf_current_aext(inode, epos, eloc, elen, inc)) == - (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) { + (EXT_NEXT_EXTENT_ALLOCDESCS >> 30)) { udf_pblk_t block; if (++indirections > UDF_MAX_INDIR_EXTS) { diff --git a/fs/udf/osta_udf.h b/fs/udf/osta_udf.h index a4da59e38b7f..35e61b2cacfe 100644 --- a/fs/udf/osta_udf.h +++ b/fs/udf/osta_udf.h @@ -1,10 +1,11 @@ /* * osta_udf.h * - * This file is based on OSTA UDF(tm) 2.50 (April 30, 2003) + * This file is based on OSTA UDF(tm) 2.60 (March 1, 2005) * http://www.osta.org * - * Copyright (c) 2001-2004 Ben Fennema <bfennema@falcon.csc.calpoly.edu> + * Copyright (c) 2001-2004 Ben Fennema + * Copyright (c) 2017-2019 Pali Rohár <pali.rohar@gmail.com> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -32,38 +33,57 @@ * SUCH DAMAGE. */ +/** + * @file + * OSTA-UDF defines and structure definitions + */ + #include "ecma_167.h" #ifndef _OSTA_UDF_H #define _OSTA_UDF_H 1 -/* OSTA CS0 Charspec (UDF 2.50 2.1.2) */ +/* OSTA CS0 Charspec (UDF 2.60 2.1.2) */ #define UDF_CHAR_SET_TYPE 0 #define UDF_CHAR_SET_INFO "OSTA Compressed Unicode" -/* Entity Identifier (UDF 2.50 2.1.5) */ -/* Identifiers (UDF 2.50 2.1.5.2) */ +/* Entity Identifier (UDF 2.60 2.1.5) */ +/* Identifiers (UDF 2.60 2.1.5.2) */ +/* Implementation Use Extended Attribute (UDF 2.60 3.3.4.5) */ +/* Virtual Allocation Table (UDF 1.50 2.2.10) */ +/* Logical Volume Extended Information (UDF 1.50 Errata, DCN 5003, 3.3.4.5.1.3) */ +/* OS2EA (UDF 1.50 3.3.4.5.3.1) */ +/* MacUniqueIDTable (UDF 1.50 3.3.4.5.4.3) */ +/* MacResourceFork (UDF 1.50 3.3.4.5.4.4) */ #define UDF_ID_DEVELOPER "*Linux UDFFS" #define UDF_ID_COMPLIANT "*OSTA UDF Compliant" #define UDF_ID_LV_INFO "*UDF LV Info" #define UDF_ID_FREE_EA "*UDF FreeEASpace" #define UDF_ID_FREE_APP_EA "*UDF FreeAppEASpace" #define UDF_ID_DVD_CGMS "*UDF DVD CGMS Info" +#define UDF_ID_VAT_LVEXTENSION "*UDF VAT LVExtension" #define UDF_ID_OS2_EA "*UDF OS/2 EA" #define UDF_ID_OS2_EA_LENGTH "*UDF OS/2 EALength" #define UDF_ID_MAC_VOLUME "*UDF Mac VolumeInfo" #define UDF_ID_MAC_FINDER "*UDF Mac FinderInfo" #define UDF_ID_MAC_UNIQUE "*UDF Mac UniqueIDTable" #define UDF_ID_MAC_RESOURCE "*UDF Mac ResourceFork" +#define UDF_ID_OS400_DIRINFO "*UDF OS/400 DirInfo" #define UDF_ID_VIRTUAL "*UDF Virtual Partition" #define UDF_ID_SPARABLE "*UDF Sparable Partition" #define UDF_ID_ALLOC "*UDF Virtual Alloc Tbl" #define UDF_ID_SPARING "*UDF Sparing Table" #define UDF_ID_METADATA "*UDF Metadata Partition" -/* Identifier Suffix (UDF 2.50 2.1.5.3) */ -#define IS_DF_HARD_WRITE_PROTECT 0x01 -#define IS_DF_SOFT_WRITE_PROTECT 0x02 +/* Identifier Suffix (UDF 2.60 2.1.5.3) */ +#define DOMAIN_FLAGS_HARD_WRITE_PROTECT 0x01 +#define DOMAIN_FLAGS_SOFT_WRITE_PROTECT 0x02 + +struct domainIdentSuffix { + __le16 UDFRevision; + uint8_t domainFlags; + uint8_t reserved[5]; +} __packed; struct UDFIdentSuffix { __le16 UDFRevision; @@ -75,15 +95,15 @@ struct UDFIdentSuffix { struct impIdentSuffix { uint8_t OSClass; uint8_t OSIdentifier; - uint8_t reserved[6]; + uint8_t impUse[6]; } __packed; struct appIdentSuffix { uint8_t impUse[8]; } __packed; -/* Logical Volume Integrity Descriptor (UDF 2.50 2.2.6) */ -/* Implementation Use (UDF 2.50 2.2.6.4) */ +/* Logical Volume Integrity Descriptor (UDF 2.60 2.2.6) */ +/* Implementation Use (UDF 2.60 2.2.6.4) */ struct logicalVolIntegrityDescImpUse { struct regid impIdent; __le32 numFiles; @@ -94,8 +114,8 @@ struct logicalVolIntegrityDescImpUse { uint8_t impUse[0]; } __packed; -/* Implementation Use Volume Descriptor (UDF 2.50 2.2.7) */ -/* Implementation Use (UDF 2.50 2.2.7.2) */ +/* Implementation Use Volume Descriptor (UDF 2.60 2.2.7) */ +/* Implementation Use (UDF 2.60 2.2.7.2) */ struct impUseVolDescImpUse { struct charspec LVICharset; dstring logicalVolIdent[128]; @@ -115,7 +135,7 @@ struct udfPartitionMap2 { __le16 partitionNum; } __packed; -/* Virtual Partition Map (UDF 2.50 2.2.8) */ +/* Virtual Partition Map (UDF 2.60 2.2.8) */ struct virtualPartitionMap { uint8_t partitionMapType; uint8_t partitionMapLength; @@ -126,7 +146,7 @@ struct virtualPartitionMap { uint8_t reserved2[24]; } __packed; -/* Sparable Partition Map (UDF 2.50 2.2.9) */ +/* Sparable Partition Map (UDF 2.60 2.2.9) */ struct sparablePartitionMap { uint8_t partitionMapType; uint8_t partitionMapLength; @@ -141,7 +161,7 @@ struct sparablePartitionMap { __le32 locSparingTable[4]; } __packed; -/* Metadata Partition Map (UDF 2.4.0 2.2.10) */ +/* Metadata Partition Map (UDF 2.60 2.2.10) */ struct metadataPartitionMap { uint8_t partitionMapType; uint8_t partitionMapLength; @@ -160,14 +180,14 @@ struct metadataPartitionMap { /* Virtual Allocation Table (UDF 1.5 2.2.10) */ struct virtualAllocationTable15 { - __le32 VirtualSector[0]; + __le32 vatEntry[0]; struct regid vatIdent; __le32 previousVATICBLoc; } __packed; #define ICBTAG_FILE_TYPE_VAT15 0x00U -/* Virtual Allocation Table (UDF 2.50 2.2.11) */ +/* Virtual Allocation Table (UDF 2.60 2.2.11) */ struct virtualAllocationTable20 { __le16 lengthHeader; __le16 lengthImpUse; @@ -175,9 +195,9 @@ struct virtualAllocationTable20 { __le32 previousVATICBLoc; __le32 numFiles; __le32 numDirs; - __le16 minReadRevision; - __le16 minWriteRevision; - __le16 maxWriteRevision; + __le16 minUDFReadRev; + __le16 minUDFWriteRev; + __le16 maxUDFWriteRev; __le16 reserved; uint8_t impUse[0]; __le32 vatEntry[0]; @@ -185,7 +205,7 @@ struct virtualAllocationTable20 { #define ICBTAG_FILE_TYPE_VAT20 0xF8U -/* Sparing Table (UDF 2.50 2.2.12) */ +/* Sparing Table (UDF 2.60 2.2.12) */ struct sparingEntry { __le32 origLocation; __le32 mappedLocation; @@ -201,12 +221,12 @@ struct sparingTable { mapEntry[0]; } __packed; -/* Metadata File (and Metadata Mirror File) (UDF 2.50 2.2.13.1) */ +/* Metadata File (and Metadata Mirror File) (UDF 2.60 2.2.13.1) */ #define ICBTAG_FILE_TYPE_MAIN 0xFA #define ICBTAG_FILE_TYPE_MIRROR 0xFB #define ICBTAG_FILE_TYPE_BITMAP 0xFC -/* struct struct long_ad ICB - ADImpUse (UDF 2.50 2.2.4.3) */ +/* struct struct long_ad ICB - ADImpUse (UDF 2.60 2.2.4.3) */ struct allocDescImpUse { __le16 flags; uint8_t impUse[4]; @@ -214,17 +234,17 @@ struct allocDescImpUse { #define AD_IU_EXT_ERASED 0x0001 -/* Real-Time Files (UDF 2.50 6.11) */ +/* Real-Time Files (UDF 2.60 6.11) */ #define ICBTAG_FILE_TYPE_REALTIME 0xF9U -/* Implementation Use Extended Attribute (UDF 2.50 3.3.4.5) */ -/* FreeEASpace (UDF 2.50 3.3.4.5.1.1) */ +/* Implementation Use Extended Attribute (UDF 2.60 3.3.4.5) */ +/* FreeEASpace (UDF 2.60 3.3.4.5.1.1) */ struct freeEaSpace { __le16 headerChecksum; uint8_t freeEASpace[0]; } __packed; -/* DVD Copyright Management Information (UDF 2.50 3.3.4.5.1.2) */ +/* DVD Copyright Management Information (UDF 2.60 3.3.4.5.1.2) */ struct DVDCopyrightImpUse { __le16 headerChecksum; uint8_t CGMSInfo; @@ -232,20 +252,35 @@ struct DVDCopyrightImpUse { uint8_t protectionSystemInfo[4]; } __packed; -/* Application Use Extended Attribute (UDF 2.50 3.3.4.6) */ -/* FreeAppEASpace (UDF 2.50 3.3.4.6.1) */ +/* Logical Volume Extended Information (UDF 1.50 Errata, DCN 5003, 3.3.4.5.1.3) */ +struct LVExtensionEA { + __le16 headerChecksum; + __le64 verificationID; + __le32 numFiles; + __le32 numDirs; + dstring logicalVolIdent[128]; +} __packed; + +/* Application Use Extended Attribute (UDF 2.60 3.3.4.6) */ +/* FreeAppEASpace (UDF 2.60 3.3.4.6.1) */ struct freeAppEASpace { __le16 headerChecksum; uint8_t freeEASpace[0]; } __packed; -/* UDF Defined System Stream (UDF 2.50 3.3.7) */ +/* UDF Defined System Stream (UDF 2.60 3.3.7) */ #define UDF_ID_UNIQUE_ID "*UDF Unique ID Mapping Data" #define UDF_ID_NON_ALLOC "*UDF Non-Allocatable Space" #define UDF_ID_POWER_CAL "*UDF Power Cal Table" #define UDF_ID_BACKUP "*UDF Backup" -/* Operating System Identifiers (UDF 2.50 6.3) */ +/* UDF Defined Non-System Streams (UDF 2.60 3.3.8) */ +#define UDF_ID_MAC_RESOURCE_FORK_STREAM "*UDF Macintosh Resource Fork" +/* #define UDF_ID_OS2_EA "*UDF OS/2 EA" */ +#define UDF_ID_NT_ACL "*UDF NT ACL" +#define UDF_ID_UNIX_ACL "*UDF UNIX ACL" + +/* Operating System Identifiers (UDF 2.60 6.3) */ #define UDF_OS_CLASS_UNDEF 0x00U #define UDF_OS_CLASS_DOS 0x01U #define UDF_OS_CLASS_OS2 0x02U @@ -270,6 +305,7 @@ struct freeAppEASpace { #define UDF_OS_ID_LINUX 0x05U #define UDF_OS_ID_MKLINUX 0x06U #define UDF_OS_ID_FREEBSD 0x07U +#define UDF_OS_ID_NETBSD 0x08U #define UDF_OS_ID_WIN9X 0x00U #define UDF_OS_ID_WINNT 0x00U #define UDF_OS_ID_OS400 0x00U diff --git a/fs/udf/super.c b/fs/udf/super.c index 8c28e93e9b73..f747bf72edbe 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -767,20 +767,20 @@ static int udf_check_vsd(struct super_block *sb) static int udf_verify_domain_identifier(struct super_block *sb, struct regid *ident, char *dname) { - struct domainEntityIDSuffix *suffix; + struct domainIdentSuffix *suffix; if (memcmp(ident->ident, UDF_ID_COMPLIANT, strlen(UDF_ID_COMPLIANT))) { udf_warn(sb, "Not OSTA UDF compliant %s descriptor.\n", dname); goto force_ro; } - if (ident->flags & (1 << ENTITYID_FLAGS_DIRTY)) { + if (ident->flags & ENTITYID_FLAGS_DIRTY) { udf_warn(sb, "Possibly not OSTA UDF compliant %s descriptor.\n", dname); goto force_ro; } - suffix = (struct domainEntityIDSuffix *)ident->identSuffix; - if (suffix->flags & (1 << ENTITYIDSUFFIX_FLAGS_HARDWRITEPROTECT) || - suffix->flags & (1 << ENTITYIDSUFFIX_FLAGS_SOFTWRITEPROTECT)) { + suffix = (struct domainIdentSuffix *)ident->identSuffix; + if ((suffix->domainFlags & DOMAIN_FLAGS_HARD_WRITE_PROTECT) || + (suffix->domainFlags & DOMAIN_FLAGS_SOFT_WRITE_PROTECT)) { if (!sb_rdonly(sb)) { udf_warn(sb, "Descriptor for %s marked write protected." " Forcing read only mount.\n", dname); @@ -1035,7 +1035,6 @@ static int check_partition_desc(struct super_block *sb, switch (le32_to_cpu(p->accessType)) { case PD_ACCESS_TYPE_READ_ONLY: case PD_ACCESS_TYPE_WRITE_ONCE: - case PD_ACCESS_TYPE_REWRITABLE: case PD_ACCESS_TYPE_NONE: goto force_ro; } @@ -1063,7 +1062,8 @@ static int check_partition_desc(struct super_block *sb, goto force_ro; if (map->s_partition_type == UDF_VIRTUAL_MAP15 || - map->s_partition_type == UDF_VIRTUAL_MAP20) + map->s_partition_type == UDF_VIRTUAL_MAP20 || + map->s_partition_type == UDF_METADATA_MAP25) goto force_ro; return 0; @@ -2402,6 +2402,10 @@ static int udf_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = sbi->s_partmaps[sbi->s_partition].s_partition_len; buf->f_bfree = udf_count_free(sb); buf->f_bavail = buf->f_bfree; + /* + * Let's pretend each free block is also a free 'inode' since UDF does + * not have separate preallocated table of inodes. + */ buf->f_files = (lvidiu != NULL ? (le32_to_cpu(lvidiu->numFiles) + le32_to_cpu(lvidiu->numDirs)) : 0) + buf->f_bfree; @@ -2492,17 +2496,29 @@ static unsigned int udf_count_free_table(struct super_block *sb, static unsigned int udf_count_free(struct super_block *sb) { unsigned int accum = 0; - struct udf_sb_info *sbi; + struct udf_sb_info *sbi = UDF_SB(sb); struct udf_part_map *map; + unsigned int part = sbi->s_partition; + int ptype = sbi->s_partmaps[part].s_partition_type; + + if (ptype == UDF_METADATA_MAP25) { + part = sbi->s_partmaps[part].s_type_specific.s_metadata. + s_phys_partition_ref; + } else if (ptype == UDF_VIRTUAL_MAP15 || ptype == UDF_VIRTUAL_MAP20) { + /* + * Filesystems with VAT are append-only and we cannot write to + * them. Let's just report 0 here. + */ + return 0; + } - sbi = UDF_SB(sb); if (sbi->s_lvid_bh) { struct logicalVolIntegrityDesc *lvid = (struct logicalVolIntegrityDesc *) sbi->s_lvid_bh->b_data; - if (le32_to_cpu(lvid->numOfPartitions) > sbi->s_partition) { + if (le32_to_cpu(lvid->numOfPartitions) > part) { accum = le32_to_cpu( - lvid->freeSpaceTable[sbi->s_partition]); + lvid->freeSpaceTable[part]); if (accum == 0xFFFFFFFF) accum = 0; } @@ -2511,7 +2527,7 @@ static unsigned int udf_count_free(struct super_block *sb) if (accum) return accum; - map = &sbi->s_partmaps[sbi->s_partition]; + map = &sbi->s_partmaps[part]; if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) { accum += udf_count_free_bitmap(sb, map->s_uspace.s_bitmap); diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 63a47f1e1d52..532cda99644e 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -241,7 +241,7 @@ int udf_truncate_extents(struct inode *inode) while ((etype = udf_current_aext(inode, &epos, &eloc, &elen, 0)) != -1) { - if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) { + if (etype == (EXT_NEXT_EXTENT_ALLOCDESCS >> 30)) { udf_write_aext(inode, &epos, &neloc, nelen, 0); if (indirect_ext_len) { /* We managed to free all extents in the diff --git a/fs/utimes.c b/fs/utimes.c index c952b6b3d8a0..1d17ce98cb80 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -36,14 +36,14 @@ static int utimes_common(const struct path *path, struct timespec64 *times) if (times[0].tv_nsec == UTIME_OMIT) newattrs.ia_valid &= ~ATTR_ATIME; else if (times[0].tv_nsec != UTIME_NOW) { - newattrs.ia_atime = timestamp_truncate(times[0], inode); + newattrs.ia_atime = times[0]; newattrs.ia_valid |= ATTR_ATIME_SET; } if (times[1].tv_nsec == UTIME_OMIT) newattrs.ia_valid &= ~ATTR_MTIME; else if (times[1].tv_nsec != UTIME_NOW) { - newattrs.ia_mtime = timestamp_truncate(times[1], inode); + newattrs.ia_mtime = times[1]; newattrs.ia_valid |= ATTR_MTIME_SET; } /* diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 0d7fcc983b3d..e6149720ce02 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -62,6 +62,7 @@ xfs_attr_args_init( struct xfs_da_args *args, struct xfs_inode *dp, const unsigned char *name, + size_t namelen, int flags) { @@ -74,7 +75,7 @@ xfs_attr_args_init( args->dp = dp; args->flags = flags; args->name = name; - args->namelen = strlen((const char *)name); + args->namelen = namelen; if (args->namelen >= MAXNAMELEN) return -EFAULT; /* match IRIX behaviour */ @@ -139,6 +140,7 @@ int xfs_attr_get( struct xfs_inode *ip, const unsigned char *name, + size_t namelen, unsigned char **value, int *valuelenp, int flags) @@ -154,7 +156,7 @@ xfs_attr_get( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - error = xfs_attr_args_init(&args, ip, name, flags); + error = xfs_attr_args_init(&args, ip, name, namelen, flags); if (error) return error; @@ -338,6 +340,7 @@ int xfs_attr_set( struct xfs_inode *dp, const unsigned char *name, + size_t namelen, unsigned char *value, int valuelen, int flags) @@ -353,7 +356,7 @@ xfs_attr_set( if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; - error = xfs_attr_args_init(&args, dp, name, flags); + error = xfs_attr_args_init(&args, dp, name, namelen, flags); if (error) return error; @@ -442,6 +445,7 @@ int xfs_attr_remove( struct xfs_inode *dp, const unsigned char *name, + size_t namelen, int flags) { struct xfs_mount *mp = dp->i_mount; @@ -453,7 +457,7 @@ xfs_attr_remove( if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; - error = xfs_attr_args_init(&args, dp, name, flags); + error = xfs_attr_args_init(&args, dp, name, namelen, flags); if (error) return error; @@ -1007,7 +1011,7 @@ restart: * The INCOMPLETE flag means that we will find the "old" * attr, not the "new" one. */ - args->flags |= XFS_ATTR_INCOMPLETE; + args->op_flags |= XFS_DA_OP_INCOMPLETE; state = xfs_da_state_alloc(); state->args = args; state->mp = mp; diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 94badfa1743e..4243b2272642 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -26,7 +26,7 @@ struct xfs_attr_list_context; *========================================================================*/ -#define ATTR_DONTFOLLOW 0x0001 /* -- unused, from IRIX -- */ +#define ATTR_DONTFOLLOW 0x0001 /* -- ignored, from IRIX -- */ #define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */ #define ATTR_TRUST 0x0004 /* -- unused, from IRIX -- */ #define ATTR_SECURE 0x0008 /* use attrs in security namespace */ @@ -37,7 +37,10 @@ struct xfs_attr_list_context; #define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ #define ATTR_INCOMPLETE 0x4000 /* [kernel] return INCOMPLETE attr keys */ -#define ATTR_ALLOC 0x8000 /* allocate xattr buffer on demand */ +#define ATTR_ALLOC 0x8000 /* [kernel] allocate xattr buffer on demand */ + +#define ATTR_KERNEL_FLAGS \ + (ATTR_KERNOTIME | ATTR_KERNOVAL | ATTR_INCOMPLETE | ATTR_ALLOC) #define XFS_ATTR_FLAGS \ { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \ @@ -145,11 +148,13 @@ int xfs_attr_list_int(struct xfs_attr_list_context *); int xfs_inode_hasattr(struct xfs_inode *ip); int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args); int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, - unsigned char **value, int *valuelenp, int flags); + size_t namelen, unsigned char **value, int *valuelenp, + int flags); int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, - unsigned char *value, int valuelen, int flags); + size_t namelen, unsigned char *value, int valuelen, int flags); int xfs_attr_set_args(struct xfs_da_args *args); -int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); +int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, + size_t namelen, int flags); int xfs_attr_remove_args(struct xfs_da_args *args); int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 08d4b10ae2d5..fed537a4353d 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -2403,8 +2403,8 @@ xfs_attr3_leaf_lookup_int( * If we are looking for INCOMPLETE entries, show only those. * If we are looking for complete entries, show only those. */ - if ((args->flags & XFS_ATTR_INCOMPLETE) != - (entry->flags & XFS_ATTR_INCOMPLETE)) { + if (!!(args->op_flags & XFS_DA_OP_INCOMPLETE) != + !!(entry->flags & XFS_ATTR_INCOMPLETE)) { continue; } if (entry->flags & XFS_ATTR_LOCAL) { diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index f4a188e28b7b..73615b1dd1a8 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -39,15 +39,6 @@ struct xfs_attr3_icleaf_hdr { } freemap[XFS_ATTR_LEAF_MAPSIZE]; }; -/* - * Used to keep a list of "remote value" extents when unlinking an inode. - */ -typedef struct xfs_attr_inactive_list { - xfs_dablk_t valueblk; /* block number of value bytes */ - int valuelen; /* number of bytes in value */ -} xfs_attr_inactive_list_t; - - /*======================================================================== * Function prototypes for the kernel. *========================================================================*/ diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index a6ef5df42669..a266d05df146 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -26,6 +26,23 @@ #define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ /* + * Remote Attribute Values + * ======================= + * + * Remote extended attribute values are conceptually simple -- they're written + * to data blocks mapped by an inode's attribute fork, and they have an upper + * size limit of 64k. Setting a value does not involve the XFS log. + * + * However, on a v5 filesystem, maximally sized remote attr values require one + * block more than 64k worth of space to hold both the remote attribute value + * header (64 bytes). On a 4k block filesystem this results in a 68k buffer; + * on a 64k block filesystem, this would be a 128k buffer. Note that the log + * format can only handle a dirty buffer of XFS_MAX_BLOCKSIZE length (64k). + * Therefore, we /must/ ensure that remote attribute value buffers never touch + * the logging system and therefore never have a log item. + */ + +/* * Each contiguous block has a header, so it is not just a simple attribute * length to FSB conversion. */ @@ -401,17 +418,25 @@ xfs_attr_rmtval_get( (map[i].br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); - error = xfs_trans_read_buf(mp, args->trans, - mp->m_ddev_targp, - dblkno, dblkcnt, 0, &bp, - &xfs_attr3_rmt_buf_ops); - if (error) + bp = xfs_buf_read(mp->m_ddev_targp, dblkno, dblkcnt, 0, + &xfs_attr3_rmt_buf_ops); + if (!bp) + return -ENOMEM; + error = bp->b_error; + if (error) { + xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_relse(bp); + + /* bad CRC means corrupted metadata */ + if (error == -EFSBADCRC) + error = -EFSCORRUPTED; return error; + } error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino, &offset, &valuelen, &dst); - xfs_trans_brelse(args->trans, bp); + xfs_buf_relse(bp); if (error) return error; @@ -552,6 +577,33 @@ xfs_attr_rmtval_set( return 0; } +/* Mark stale any incore buffers for the remote value. */ +int +xfs_attr_rmtval_stale( + struct xfs_inode *ip, + struct xfs_bmbt_irec *map, + xfs_buf_flags_t incore_flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_buf *bp; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (XFS_IS_CORRUPT(mp, map->br_startblock == DELAYSTARTBLOCK) || + XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK)) + return -EFSCORRUPTED; + + bp = xfs_buf_incore(mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, map->br_startblock), + XFS_FSB_TO_BB(mp, map->br_blockcount), incore_flags); + if (bp) { + xfs_buf_stale(bp); + xfs_buf_relse(bp); + } + + return 0; +} + /* * Remove the value associated with an attribute by deleting the * out-of-line buffer that it is stored on. @@ -560,7 +612,6 @@ int xfs_attr_rmtval_remove( struct xfs_da_args *args) { - struct xfs_mount *mp = args->dp->i_mount; xfs_dablk_t lblkno; int blkcnt; int error; @@ -575,9 +626,6 @@ xfs_attr_rmtval_remove( blkcnt = args->rmtblkcnt; while (blkcnt > 0) { struct xfs_bmbt_irec map; - struct xfs_buf *bp; - xfs_daddr_t dblkno; - int dblkcnt; int nmap; /* @@ -588,22 +636,11 @@ xfs_attr_rmtval_remove( blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) return error; - ASSERT(nmap == 1); - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && - (map.br_startblock != HOLESTARTBLOCK)); - - dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), - dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - - /* - * If the "remote" value is in the cache, remove it. - */ - bp = xfs_buf_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK); - if (bp) { - xfs_buf_stale(bp); - xfs_buf_relse(bp); - bp = NULL; - } + if (XFS_IS_CORRUPT(args->dp->i_mount, nmap != 1)) + return -EFSCORRUPTED; + error = xfs_attr_rmtval_stale(args->dp, &map, XBF_TRYLOCK); + if (error) + return error; lblkno += map.br_blockcount; blkcnt -= map.br_blockcount; diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index 9d20b66ad379..6fb4572845ce 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -11,5 +11,7 @@ int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen); int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_rmtval_set(struct xfs_da_args *args); int xfs_attr_rmtval_remove(struct xfs_da_args *args); +int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, + xfs_buf_flags_t incore_flags); #endif /* __XFS_ATTR_REMOTE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index e2cc98931552..b22c7e928eb1 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -2389,8 +2389,6 @@ xfs_btree_lshift( XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1); if (level > 0) { /* It's a nonleaf. operate on keys and ptrs */ - int i; /* loop index */ - for (i = 0; i < rrecs; i++) { error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level); if (error) diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index e16610d1c14f..0f4fbb0889ff 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -89,6 +89,7 @@ typedef struct xfs_da_args { #define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ #define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ #define XFS_DA_OP_ALLOCVAL 0x0020 /* lookup to alloc buffer if found */ +#define XFS_DA_OP_INCOMPLETE 0x0040 /* lookup INCOMPLETE attr keys */ #define XFS_DA_OP_FLAGS \ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ @@ -96,7 +97,8 @@ typedef struct xfs_da_args { { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \ - { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" } + { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" }, \ + { XFS_DA_OP_INCOMPLETE, "INCOMPLETE" } /* * Storage for holding state during Btree searches and split/join ops. diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 3dee33043e09..734837a9b51a 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -217,7 +217,7 @@ typedef struct xfs_dir2_sf_entry { * A 64-bit or 32-bit inode number follows here, at a variable offset * after the name. */ -} xfs_dir2_sf_entry_t; +} __packed xfs_dir2_sf_entry_t; static inline int xfs_dir2_sf_hdr_size(int i8count) { @@ -683,8 +683,6 @@ struct xfs_attr3_leafblock { /* * Flags used in the leaf_entry[i].flags field. - * NOTE: the INCOMPLETE bit must not collide with the flags bits specified - * on the system call, they are "or"ed together for various operations. */ #define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */ #define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 1b7dcbae051c..77e9fa385980 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1540,6 +1540,13 @@ typedef struct xfs_bmdr_block { #define BMBT_BLOCKCOUNT_BITLEN 21 #define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1) +#define BMBT_BLOCKCOUNT_MASK ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1) + +/* + * bmbt records have a file offset (block) field that is 54 bits wide, so this + * is the largest xfs_fileoff_t that we ever expect to see. + */ +#define XFS_MAX_FILEOFF (BMBT_STARTOFF_MASK + BMBT_BLOCKCOUNT_MASK) typedef struct xfs_bmbt_rec { __be64 l0, l1; diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 8ef31d71a9c7..9bac0d2e56dc 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -462,11 +462,20 @@ static inline uint xfs_log_dinode_size(int version) #define XFS_BLF_GDQUOT_BUF (1<<4) /* - * This is the structure used to lay out a buf log item in the - * log. The data map describes which 128 byte chunks of the buffer - * have been logged. - */ -#define XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) + * This is the structure used to lay out a buf log item in the log. The data + * map describes which 128 byte chunks of the buffer have been logged. + * + * The placement of blf_map_size causes blf_data_map to start at an odd + * multiple of sizeof(unsigned int) offset within the struct. Because the data + * bitmap size will always be an even number, the end of the data_map (and + * therefore the structure) will also be at an odd multiple of sizeof(unsigned + * int). Some 64-bit compilers will insert padding at the end of the struct to + * ensure 64-bit alignment of blf_blkno, but 32-bit ones will not. Therefore, + * XFS_BLF_DATAMAP_SIZE must be an odd number to make the padding explicit and + * keep the structure size consistent between 32-bit and 64-bit platforms. + */ +#define __XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) +#define XFS_BLF_DATAMAP_SIZE (__XFS_BLF_DATAMAP_SIZE + 1) typedef struct xfs_buf_log_format { unsigned short blf_type; /* buf log item type indicator */ diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 60c61d7052a8..c3422403b169 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -75,7 +75,6 @@ static inline xfs_extlen_t xrep_calc_ag_resblks( struct xfs_scrub *sc) { - ASSERT(!(sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)); return 0; } diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 91693fce34a8..cd743fad8478 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -145,7 +145,8 @@ xfs_get_acl(struct inode *inode, int type) * go out to the disk. */ len = XFS_ACL_MAX_SIZE(ip->i_mount); - error = xfs_attr_get(ip, ea_name, (unsigned char **)&xfs_acl, &len, + error = xfs_attr_get(ip, ea_name, strlen(ea_name), + (unsigned char **)&xfs_acl, &len, ATTR_ALLOC | ATTR_ROOT); if (error) { /* @@ -196,15 +197,17 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) len -= sizeof(struct xfs_acl_entry) * (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count); - error = xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl, - len, ATTR_ROOT); + error = xfs_attr_set(ip, ea_name, strlen(ea_name), + (unsigned char *)xfs_acl, len, ATTR_ROOT); kmem_free(xfs_acl); } else { /* * A NULL ACL argument means we want to remove the ACL. */ - error = xfs_attr_remove(ip, ea_name, ATTR_ROOT); + error = xfs_attr_remove(ip, ea_name, + strlen(ea_name), + ATTR_ROOT); /* * If the attribute didn't exist to start with that's fine. diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 5ff49523d8ea..8fbb841cd6fe 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -25,22 +25,18 @@ #include "xfs_error.h" /* - * Look at all the extents for this logical region, - * invalidate any buffers that are incore/in transactions. + * Invalidate any incore buffers associated with this remote attribute value + * extent. We never log remote attribute value buffers, which means that they + * won't be attached to a transaction and are therefore safe to mark stale. + * The actual bunmapi will be taken care of later. */ STATIC int -xfs_attr3_leaf_freextent( - struct xfs_trans **trans, +xfs_attr3_rmt_stale( struct xfs_inode *dp, xfs_dablk_t blkno, int blkcnt) { struct xfs_bmbt_irec map; - struct xfs_buf *bp; - xfs_dablk_t tblkno; - xfs_daddr_t dblkno; - int tblkcnt; - int dblkcnt; int nmap; int error; @@ -48,47 +44,29 @@ xfs_attr3_leaf_freextent( * Roll through the "value", invalidating the attribute value's * blocks. */ - tblkno = blkno; - tblkcnt = blkcnt; - while (tblkcnt > 0) { + while (blkcnt > 0) { /* * Try to remember where we decided to put the value. */ nmap = 1; - error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt, + error = xfs_bmapi_read(dp, (xfs_fileoff_t)blkno, blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); - if (error) { + if (error) return error; - } - ASSERT(nmap == 1); - ASSERT(map.br_startblock != DELAYSTARTBLOCK); + if (XFS_IS_CORRUPT(dp->i_mount, nmap != 1)) + return -EFSCORRUPTED; /* - * If it's a hole, these are already unmapped - * so there's nothing to invalidate. + * Mark any incore buffers for the remote value as stale. We + * never log remote attr value buffers, so the buffer should be + * easy to kill. */ - if (map.br_startblock != HOLESTARTBLOCK) { - - dblkno = XFS_FSB_TO_DADDR(dp->i_mount, - map.br_startblock); - dblkcnt = XFS_FSB_TO_BB(dp->i_mount, - map.br_blockcount); - bp = xfs_trans_get_buf(*trans, - dp->i_mount->m_ddev_targp, - dblkno, dblkcnt, 0); - if (!bp) - return -ENOMEM; - xfs_trans_binval(*trans, bp); - /* - * Roll to next transaction. - */ - error = xfs_trans_roll_inode(trans, dp); - if (error) - return error; - } + error = xfs_attr_rmtval_stale(dp, &map, 0); + if (error) + return error; - tblkno += map.br_blockcount; - tblkcnt -= map.br_blockcount; + blkno += map.br_blockcount; + blkcnt -= map.br_blockcount; } return 0; @@ -102,86 +80,45 @@ xfs_attr3_leaf_freextent( */ STATIC int xfs_attr3_leaf_inactive( - struct xfs_trans **trans, - struct xfs_inode *dp, - struct xfs_buf *bp) + struct xfs_trans **trans, + struct xfs_inode *dp, + struct xfs_buf *bp) { - struct xfs_attr_leafblock *leaf; - struct xfs_attr3_icleaf_hdr ichdr; - struct xfs_attr_leaf_entry *entry; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_mount *mp = bp->b_mount; + struct xfs_attr_leafblock *leaf = bp->b_addr; + struct xfs_attr_leaf_entry *entry; struct xfs_attr_leaf_name_remote *name_rmt; - struct xfs_attr_inactive_list *list; - struct xfs_attr_inactive_list *lp; - int error; - int count; - int size; - int tmp; - int i; - struct xfs_mount *mp = bp->b_mount; + int error = 0; + int i; - leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); /* - * Count the number of "remote" value extents. + * Find the remote value extents for this leaf and invalidate their + * incore buffers. */ - count = 0; entry = xfs_attr3_leaf_entryp(leaf); for (i = 0; i < ichdr.count; entry++, i++) { - if (be16_to_cpu(entry->nameidx) && - ((entry->flags & XFS_ATTR_LOCAL) == 0)) { - name_rmt = xfs_attr3_leaf_name_remote(leaf, i); - if (name_rmt->valueblk) - count++; - } - } + int blkcnt; - /* - * If there are no "remote" values, we're done. - */ - if (count == 0) { - xfs_trans_brelse(*trans, bp); - return 0; - } - - /* - * Allocate storage for a list of all the "remote" value extents. - */ - size = count * sizeof(xfs_attr_inactive_list_t); - list = kmem_alloc(size, 0); - - /* - * Identify each of the "remote" value extents. - */ - lp = list; - entry = xfs_attr3_leaf_entryp(leaf); - for (i = 0; i < ichdr.count; entry++, i++) { - if (be16_to_cpu(entry->nameidx) && - ((entry->flags & XFS_ATTR_LOCAL) == 0)) { - name_rmt = xfs_attr3_leaf_name_remote(leaf, i); - if (name_rmt->valueblk) { - lp->valueblk = be32_to_cpu(name_rmt->valueblk); - lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount, - be32_to_cpu(name_rmt->valuelen)); - lp++; - } - } - } - xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */ + if (!entry->nameidx || (entry->flags & XFS_ATTR_LOCAL)) + continue; - /* - * Invalidate each of the "remote" value extents. - */ - error = 0; - for (lp = list, i = 0; i < count; i++, lp++) { - tmp = xfs_attr3_leaf_freextent(trans, dp, - lp->valueblk, lp->valuelen); + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); + if (!name_rmt->valueblk) + continue; - if (error == 0) - error = tmp; /* save only the 1st errno */ + blkcnt = xfs_attr3_rmt_blocks(dp->i_mount, + be32_to_cpu(name_rmt->valuelen)); + error = xfs_attr3_rmt_stale(dp, + be32_to_cpu(name_rmt->valueblk), blkcnt); + if (error) + goto err; } - kmem_free(list); + xfs_trans_brelse(*trans, bp); +err: return error; } diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 3984779e5911..5be8973a452c 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -27,6 +27,23 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); +/* Is this log iovec plausibly large enough to contain the buffer log format? */ +bool +xfs_buf_log_check_iovec( + struct xfs_log_iovec *iovec) +{ + struct xfs_buf_log_format *blfp = iovec->i_addr; + char *bmp_end; + char *item_end; + + if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->i_len) + return false; + + item_end = (char *)iovec->i_addr + iovec->i_len; + bmp_end = (char *)&blfp->blf_data_map[blfp->blf_map_size]; + return bmp_end <= item_end; +} + static inline int xfs_buf_log_format_size( struct xfs_buf_log_format *blfp) @@ -688,7 +705,7 @@ static const struct xfs_item_ops xfs_buf_item_ops = { .iop_push = xfs_buf_item_push, }; -STATIC int +STATIC void xfs_buf_item_get_format( struct xfs_buf_log_item *bip, int count) @@ -698,14 +715,11 @@ xfs_buf_item_get_format( if (count == 1) { bip->bli_formats = &bip->__bli_format; - return 0; + return; } bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format), 0); - if (!bip->bli_formats) - return -ENOMEM; - return 0; } STATIC void @@ -731,7 +745,6 @@ xfs_buf_item_init( struct xfs_buf_log_item *bip = bp->b_log_item; int chunks; int map_size; - int error; int i; /* @@ -760,19 +773,22 @@ xfs_buf_item_init( * Discontiguous buffer support follows the layout of the underlying * buffer. This makes the implementation as simple as possible. */ - error = xfs_buf_item_get_format(bip, bp->b_map_count); - ASSERT(error == 0); - if (error) { /* to stop gcc throwing set-but-unused warnings */ - kmem_cache_free(xfs_buf_item_zone, bip); - return error; - } - + xfs_buf_item_get_format(bip, bp->b_map_count); for (i = 0; i < bip->bli_format_count; i++) { chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len), XFS_BLF_CHUNK); map_size = DIV_ROUND_UP(chunks, NBWORD); + if (map_size > XFS_BLF_DATAMAP_SIZE) { + kmem_cache_free(xfs_buf_item_zone, bip); + xfs_err(mp, + "buffer item dirty bitmap (%u uints) too small to reflect %u bytes!", + map_size, + BBTOB(bp->b_maps[i].bm_len)); + return -EFSCORRUPTED; + } + bip->bli_formats[i].blf_type = XFS_LI_BUF; bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn; bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len; @@ -805,6 +821,9 @@ xfs_buf_item_log_segment( uint end_bit; uint mask; + ASSERT(first < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD); + ASSERT(last < XFS_BLF_DATAMAP_SIZE * XFS_BLF_CHUNK * NBWORD); + /* * Convert byte offsets to bit numbers. */ diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 4a054b11011a..30114b510332 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -61,6 +61,7 @@ void xfs_buf_iodone_callbacks(struct xfs_buf *); void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *, struct list_head *); +bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec); extern kmem_zone_t *xfs_buf_item_zone; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 2bff21ca9d78..9cfd3209f52b 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -137,7 +137,7 @@ xfs_qm_adjust_dqtimers( (d->d_blk_hardlimit && (be64_to_cpu(d->d_bcount) > be64_to_cpu(d->d_blk_hardlimit)))) { - d->d_btimer = cpu_to_be32(get_seconds() + + d->d_btimer = cpu_to_be32(ktime_get_real_seconds() + mp->m_quotainfo->qi_btimelimit); } else { d->d_bwarns = 0; @@ -160,7 +160,7 @@ xfs_qm_adjust_dqtimers( (d->d_ino_hardlimit && (be64_to_cpu(d->d_icount) > be64_to_cpu(d->d_ino_hardlimit)))) { - d->d_itimer = cpu_to_be32(get_seconds() + + d->d_itimer = cpu_to_be32(ktime_get_real_seconds() + mp->m_quotainfo->qi_itimelimit); } else { d->d_iwarns = 0; @@ -183,7 +183,7 @@ xfs_qm_adjust_dqtimers( (d->d_rtb_hardlimit && (be64_to_cpu(d->d_rtbcount) > be64_to_cpu(d->d_rtb_hardlimit)))) { - d->d_rtbtimer = cpu_to_be32(get_seconds() + + d->d_rtbtimer = cpu_to_be32(ktime_get_real_seconds() + mp->m_quotainfo->qi_rtbtimelimit); } else { d->d_rtbwarns = 0; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index c93250108952..b8a4a3f29b36 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -187,7 +187,12 @@ xfs_file_dio_aio_read( file_accessed(iocb->ki_filp); - xfs_ilock(ip, XFS_IOLOCK_SHARED); + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) + return -EAGAIN; + } else { + xfs_ilock(ip, XFS_IOLOCK_SHARED); + } ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, is_sync_kiocb(iocb)); xfs_iunlock(ip, XFS_IOLOCK_SHARED); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 401da197f012..1979a0055763 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1518,10 +1518,8 @@ xfs_itruncate_extents_flags( struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp = *tpp; xfs_fileoff_t first_unmap_block; - xfs_fileoff_t last_block; xfs_filblks_t unmap_len; int error = 0; - int done = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(!atomic_read(&VFS_I(ip)->i_count) || @@ -1541,21 +1539,22 @@ xfs_itruncate_extents_flags( * the end of the file (in a crash where the space is allocated * but the inode size is not yet updated), simply remove any * blocks which show up between the new EOF and the maximum - * possible file size. If the first block to be removed is - * beyond the maximum file size (ie it is the same as last_block), - * then there is nothing to do. + * possible file size. + * + * We have to free all the blocks to the bmbt maximum offset, even if + * the page cache can't scale that far. */ first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); - last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); - if (first_unmap_block == last_block) + if (first_unmap_block >= XFS_MAX_FILEOFF) { + WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF); return 0; + } - ASSERT(first_unmap_block < last_block); - unmap_len = last_block - first_unmap_block + 1; - while (!done) { + unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1; + while (unmap_len > 0) { ASSERT(tp->t_firstblock == NULLFSBLOCK); - error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags, - XFS_ITRUNC_MAX_EXTENTS, &done); + error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len, + flags, XFS_ITRUNC_MAX_EXTENTS); if (error) goto out; @@ -1575,7 +1574,7 @@ xfs_itruncate_extents_flags( if (whichfork == XFS_DATA_FORK) { /* Remove all pending CoW reservations. */ error = xfs_reflink_cancel_cow_blocks(ip, &tp, - first_unmap_block, last_block, true); + first_unmap_block, XFS_MAX_FILEOFF, true); if (error) goto out; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 7b35d62ede9f..d42de92cb283 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -357,6 +357,7 @@ xfs_attrmulti_attr_get( { unsigned char *kbuf; int error = -EFAULT; + size_t namelen; if (*len > XFS_XATTR_SIZE_MAX) return -EINVAL; @@ -364,7 +365,9 @@ xfs_attrmulti_attr_get( if (!kbuf) return -ENOMEM; - error = xfs_attr_get(XFS_I(inode), name, &kbuf, (int *)len, flags); + namelen = strlen(name); + error = xfs_attr_get(XFS_I(inode), name, namelen, &kbuf, (int *)len, + flags); if (error) goto out_kfree; @@ -386,6 +389,7 @@ xfs_attrmulti_attr_set( { unsigned char *kbuf; int error; + size_t namelen; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; @@ -396,7 +400,8 @@ xfs_attrmulti_attr_set( if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags); + namelen = strlen(name); + error = xfs_attr_set(XFS_I(inode), name, namelen, kbuf, len, flags); if (!error) xfs_forget_acl(inode, name, flags); kfree(kbuf); @@ -410,10 +415,12 @@ xfs_attrmulti_attr_remove( uint32_t flags) { int error; + size_t namelen; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; - error = xfs_attr_remove(XFS_I(inode), name, flags); + namelen = strlen(name); + error = xfs_attr_remove(XFS_I(inode), name, namelen, flags); if (!error) xfs_forget_acl(inode, name, flags); return error; @@ -462,6 +469,13 @@ xfs_attrmulti_by_handle( error = 0; for (i = 0; i < am_hreq.opcount; i++) { + if ((ops[i].am_flags & ATTR_ROOT) && + (ops[i].am_flags & ATTR_SECURE)) { + ops[i].am_error = -EINVAL; + continue; + } + ops[i].am_flags &= ~ATTR_KERNEL_FLAGS; + ops[i].am_error = strncpy_from_user((char *)attr_name, ops[i].am_attrname, MAXNAMELEN); if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index c4c4f09113d3..769581a79c58 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -107,7 +107,7 @@ xfs_ioctl32_bstime_copyin( xfs_bstime_t *bstime, compat_xfs_bstime_t __user *bstime32) { - compat_time_t sec32; /* tv_sec differs on 64 vs. 32 */ + old_time32_t sec32; /* tv_sec differs on 64 vs. 32 */ if (get_user(sec32, &bstime32->tv_sec) || get_user(bstime->tv_nsec, &bstime32->tv_nsec)) @@ -450,6 +450,13 @@ xfs_compat_attrmulti_by_handle( error = 0; for (i = 0; i < am_hreq.opcount; i++) { + if ((ops[i].am_flags & ATTR_ROOT) && + (ops[i].am_flags & ATTR_SECURE)) { + ops[i].am_error = -EINVAL; + continue; + } + ops[i].am_flags &= ~ATTR_KERNEL_FLAGS; + ops[i].am_error = strncpy_from_user((char *)attr_name, compat_ptr(ops[i].am_attrname), MAXNAMELEN); diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h index 8c7743cd490e..053de7d894cd 100644 --- a/fs/xfs/xfs_ioctl32.h +++ b/fs/xfs/xfs_ioctl32.h @@ -32,7 +32,7 @@ #endif typedef struct compat_xfs_bstime { - compat_time_t tv_sec; /* seconds */ + old_time32_t tv_sec; /* seconds */ __s32 tv_nsec; /* and nanoseconds */ } compat_xfs_bstime_t; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 28e2d1f37267..bb590a267a7f 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -923,7 +923,7 @@ xfs_buffered_write_iomap_begin( xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb); /* Trim the mapping to the nearest shared extent boundary. */ - error = xfs_inode_need_cow(ip, &imap, &shared); + error = xfs_bmap_trim_cow(ip, &imap, &shared); if (error) goto out_unlock; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 8afe69ca188b..81f2f93caec0 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -50,8 +50,10 @@ xfs_initxattrs( int error = 0; for (xattr = xattr_array; xattr->name != NULL; xattr++) { - error = xfs_attr_set(ip, xattr->name, xattr->value, - xattr->value_len, ATTR_SECURE); + error = xfs_attr_set(ip, xattr->name, + strlen(xattr->name), + xattr->value, xattr->value_len, + ATTR_SECURE); if (error < 0) break; } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 99ec3fba4548..0d683fb96396 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1934,6 +1934,12 @@ xlog_recover_buffer_pass1( struct list_head *bucket; struct xfs_buf_cancel *bcp; + if (!xfs_buf_log_check_iovec(&item->ri_buf[0])) { + xfs_err(log->l_mp, "bad buffer log item size (%d)", + item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + /* * If this isn't a cancel buffer item, then just return. */ diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index b6701b4f59a9..5f04d8a5ab2a 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -111,6 +111,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10); /* log structures */ + XFS_CHECK_STRUCT_SIZE(struct xfs_buf_log_format, 88); XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24); XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_32, 28); XFS_CHECK_STRUCT_SIZE(struct xfs_efd_log_format_64, 32); diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 7823af39008b..4e57edca8bce 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -64,9 +64,9 @@ struct xfs_quotainfo { struct xfs_inode *qi_pquotaip; /* project quota inode */ struct list_lru qi_lru; int qi_dquots; - time_t qi_btimelimit; /* limit for blks timer */ - time_t qi_itimelimit; /* limit for inodes timer */ - time_t qi_rtbtimelimit;/* limit for rt blks timer */ + time64_t qi_btimelimit; /* limit for blks timer */ + time64_t qi_itimelimit; /* limit for inodes timer */ + time64_t qi_rtbtimelimit;/* limit for rt blks timer */ xfs_qwarncnt_t qi_bwarnlimit; /* limit for blks warnings */ xfs_qwarncnt_t qi_iwarnlimit; /* limit for inodes warnings */ xfs_qwarncnt_t qi_rtbwarnlimit;/* limit for rt blks warnings */ diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index c7de17deeae6..38669e827206 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -37,9 +37,9 @@ xfs_qm_fill_state( tstate->flags |= QCI_SYSFILE; tstate->blocks = ip->i_d.di_nblocks; tstate->nextents = ip->i_d.di_nextents; - tstate->spc_timelimit = q->qi_btimelimit; - tstate->ino_timelimit = q->qi_itimelimit; - tstate->rt_spc_timelimit = q->qi_rtbtimelimit; + tstate->spc_timelimit = (u32)q->qi_btimelimit; + tstate->ino_timelimit = (u32)q->qi_itimelimit; + tstate->rt_spc_timelimit = (u32)q->qi_rtbtimelimit; tstate->spc_warnlimit = q->qi_bwarnlimit; tstate->ino_warnlimit = q->qi_iwarnlimit; tstate->rt_spc_warnlimit = q->qi_rtbwarnlimit; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index de451235c4ee..e723b267a247 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -223,8 +223,8 @@ xfs_reflink_trim_around_shared( } } -bool -xfs_inode_need_cow( +int +xfs_bmap_trim_cow( struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared) @@ -327,7 +327,7 @@ xfs_find_trim_cow_extent( if (cmap->br_startoff > offset_fsb) { xfs_trim_extent(imap, imap->br_startoff, cmap->br_startoff - imap->br_startoff); - return xfs_inode_need_cow(ip, imap, shared); + return xfs_bmap_trim_cow(ip, imap, shared); } *shared = true; @@ -1457,7 +1457,8 @@ xfs_reflink_clear_inode_flag( * We didn't find any shared blocks so turn off the reflink flag. * First, get rid of any leftover CoW mappings. */ - error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true); + error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, XFS_MAX_FILEOFF, + true); if (error) return error; diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index d18ad7f4fb64..3e4fd46373ab 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -22,7 +22,7 @@ extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal); extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, struct xfs_bmbt_irec *irec, bool *shared); -bool xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, +int xfs_bmap_trim_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared); int xfs_reflink_allocate_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index d9ae27ddf253..760901783944 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -193,32 +193,6 @@ xfs_fs_show_options( return 0; } -static uint64_t -xfs_max_file_offset( - unsigned int blockshift) -{ - unsigned int pagefactor = 1; - unsigned int bitshift = BITS_PER_LONG - 1; - - /* Figure out maximum filesize, on Linux this can depend on - * the filesystem blocksize (on 32 bit platforms). - * __block_write_begin does this in an [unsigned] long long... - * page->index << (PAGE_SHIFT - bbits) - * So, for page sized blocks (4K on 32 bit platforms), - * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is - * (((u64)PAGE_SIZE << (BITS_PER_LONG-1))-1) - * but for smaller blocksizes it is less (bbits = log2 bsize). - */ - -#if BITS_PER_LONG == 32 - ASSERT(sizeof(sector_t) == 8); - pagefactor = PAGE_SIZE; - bitshift = BITS_PER_LONG; -#endif - - return (((uint64_t)pagefactor) << bitshift) - 1; -} - /* * Set parameters for inode allocation heuristics, taking into account * filesystem size and inode32/inode64 mount options; i.e. specifically @@ -1424,6 +1398,26 @@ xfs_fc_fill_super( if (error) goto out_free_sb; + /* + * XFS block mappings use 54 bits to store the logical block offset. + * This should suffice to handle the maximum file size that the VFS + * supports (currently 2^63 bytes on 64-bit and ULONG_MAX << PAGE_SHIFT + * bytes on 32-bit), but as XFS and VFS have gotten the s_maxbytes + * calculation wrong on 32-bit kernels in the past, we'll add a WARN_ON + * to check this assertion. + * + * Avoid integer overflow by comparing the maximum bmbt offset to the + * maximum pagecache offset in units of fs blocks. + */ + if (XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE) > XFS_MAX_FILEOFF) { + xfs_warn(mp, +"MAX_LFS_FILESIZE block offset (%llu) exceeds extent map maximum (%llu)!", + XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE), + XFS_MAX_FILEOFF); + error = -EINVAL; + goto out_free_sb; + } + error = xfs_filestream_mount(mp); if (error) goto out_free_sb; @@ -1435,7 +1429,7 @@ xfs_fc_fill_super( sb->s_magic = XFS_SUPER_MAGIC; sb->s_blocksize = mp->m_sb.sb_blocksize; sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; - sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits); + sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_max_links = XFS_MAXLINK; sb->s_time_gran = 1; sb->s_time_min = S32_MIN; diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index a6fe2d8dc40f..d1b9869bc5fa 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -580,7 +580,7 @@ xfs_trans_dqresv( { xfs_qcnt_t hardlimit; xfs_qcnt_t softlimit; - time_t timer; + time64_t timer; xfs_qwarncnt_t warns; xfs_qwarncnt_t warnlimit; xfs_qcnt_t total_count; @@ -635,7 +635,8 @@ xfs_trans_dqresv( goto error_return; } if (softlimit && total_count > softlimit) { - if ((timer != 0 && get_seconds() > timer) || + if ((timer != 0 && + ktime_get_real_seconds() > timer) || (warns != 0 && warns >= warnlimit)) { xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTLONGWARN); @@ -662,7 +663,8 @@ xfs_trans_dqresv( goto error_return; } if (softlimit && total_count > softlimit) { - if ((timer != 0 && get_seconds() > timer) || + if ((timer != 0 && + ktime_get_real_seconds() > timer) || (warns != 0 && warns >= warnlimit)) { xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTLONGWARN); diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 383f0203d103..b0fedb543f97 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -24,6 +24,7 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, int xflags = handler->flags; struct xfs_inode *ip = XFS_I(inode); int error, asize = size; + size_t namelen = strlen(name); /* Convert Linux syscall to XFS internal ATTR flags */ if (!size) { @@ -31,7 +32,8 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, value = NULL; } - error = xfs_attr_get(ip, name, (unsigned char **)&value, &asize, xflags); + error = xfs_attr_get(ip, name, namelen, (unsigned char **)&value, + &asize, xflags); if (error) return error; return asize; @@ -67,6 +69,7 @@ xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused, int xflags = handler->flags; struct xfs_inode *ip = XFS_I(inode); int error; + size_t namelen = strlen(name); /* Convert Linux syscall to XFS internal ATTR flags */ if (flags & XATTR_CREATE) @@ -74,10 +77,11 @@ xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused, if (flags & XATTR_REPLACE) xflags |= ATTR_REPLACE; - if (!value) - return xfs_attr_remove(ip, (unsigned char *)name, xflags); - error = xfs_attr_set(ip, (unsigned char *)name, - (void *)value, size, xflags); + if (value) + error = xfs_attr_set(ip, name, namelen, (void *)value, size, + xflags); + else + error = xfs_attr_remove(ip, name, namelen, xflags); if (!error) xfs_forget_acl(inode, name, xflags); |